/**
 * {@inheritdoc}
 *
 * Forwards the words to the attached document's highlightExtended(), handing
 * it this object's wrapWords() method as the callback and an empty array as
 * the third argument.
 */
public function highlight($words)
{
    $wrapCallback = [$this, 'wrapWords'];
    $emptyArgs = [];

    $this->_doc->highlightExtended($words, $wrapCallback, $emptyArgs);
}
/**
 * Highlight matches inside an HTML fragment and return the fragment again
 * (without the HTML header and body tag).
 *
 * The fragment is converted from $encoding to UTF-8 (iconv with //IGNORE),
 * wrapped into a minimal HTML page declaring a UTF-8 charset, loaded as an
 * HTML document and handed to the highlighter. The body of that document is
 * the return value.
 *
 * @param string $inputHTMLFragment
 * @param string $encoding Input HTML string encoding
 * @param Highlighter|null $highlighter a DefaultHighlighter is created when null is given
 * @return string
 */
public function htmlFragmentHighlightMatches($inputHTMLFragment, $encoding = 'UTF-8', $highlighter = null)
{
    if (null === $highlighter) {
        $highlighter = new DefaultHighlighter();
    }

    $pageHead = '<html><head><META HTTP-EQUIV="Content-type" CONTENT="text/html; charset=UTF-8"/></head><body>';
    $pageTail = '</body></html>';
    $utf8Fragment = iconv($encoding, 'UTF-8//IGNORE', $inputHTMLFragment);

    $fragmentDoc = Document\HTML::loadHTML($pageHead . $utf8Fragment . $pageTail);

    $highlighter->setDocument($fragmentDoc);
    $this->_highlightMatches($highlighter);

    return $fragmentDoc->getHTMLBody();
}
/**
 * Index a single file.
 *
 * The mime type and the lower-cased file extension decide how the content is
 * extracted. Content is only read when the file's storage reports itself as
 * local; for any other storage the document carries just the metadata fields
 * added at the end (fileId, path, mtime, size, mimetype).
 *
 * @param File $file the file to be indexed
 * @param bool $commit passed through to the index's updateFile()
 *
 * @return bool true after the document was handed to the index; no code path in this method returns false
 * @throws NotIndexedException when a local file matches none of the handled mime types or extensions
 */
public function indexFile(File $file, $commit = true)
{
    // the mime type or the file extension decides how to index
    $mime = $file->getMimeType();
    $extension = strtolower(pathinfo($file->getName(), PATHINFO_EXTENSION));

    // start with a plain lucene document; the loaders below may replace it
    $luceneDoc = new Document();

    // content is indexed for local files only
    $fileStorage = $file->getStorage();
    if ($fileStorage->isLocal()) {
        $localPath = $fileStorage->getLocalFile($file->getInternalPath());

        // try to use the special lucene document types, first match wins
        if ('text/html' === $mime) {
            //TODO could be indexed, even if not local
            $luceneDoc = HTML::loadHTML($file->getContent());
        } elseif ('text/' === substr($mime, 0, 5) || 'application/x-tex' === $mime) {
            $text = $file->getContent();
            if ($text != '') {
                $luceneDoc->addField(Document\Field::UnStored('body', $text));
            }
        } elseif ('application/pdf' === $mime) {
            $luceneDoc = Pdf::loadPdf($file->getContent());
        } elseif ('docx' === $extension) {
            // the zend classes only understand docx and not doc files
            $luceneDoc = Document\Docx::loadDocxFile($localPath);
        } elseif ('xlsx' === $extension) {
            $luceneDoc = Document\Xlsx::loadXlsxFile($localPath);
        } elseif ('pptx' === $extension) {
            $luceneDoc = Document\Pptx::loadPptxFile($localPath);
        } elseif ('odt' === $extension) {
            $luceneDoc = Odt::loadOdtFile($localPath);
        } elseif ('ods' === $extension) {
            $luceneDoc = Ods::loadOdsFile($localPath);
        } else {
            throw new NotIndexedException();
        }
    }

    // Store filecache id as unique id to lookup by when deleting
    $luceneDoc->addField(Document\Field::Keyword('fileId', $file->getId()));

    // Store document path for the search results
    $luceneDoc->addField(Document\Field::Text('path', $file->getPath(), 'UTF-8'));

    // Remaining metadata is added through the unIndexed field factory
    $luceneDoc->addField(Document\Field::unIndexed('mtime', $file->getMTime()));
    $luceneDoc->addField(Document\Field::unIndexed('size', $file->getSize()));
    $luceneDoc->addField(Document\Field::unIndexed('mimetype', $mime));

    $this->index->updateFile($luceneDoc, $file->getId(), $commit);

    return true;
}
/**
 * Loads the "document split.xhtml" fixture and checks, case-insensitively,
 * that the given field of the parsed document contains the given term.
 * Data sets flagged as skipped are marked as skipped instead.
 *
 * @dataProvider dataProviderXHTML
 */
function testParseXHTMLSplit($term, $field, $descriptiveLocation, $skipped)
{
    if ($skipped) {
        $this->markTestSkipped('TODO search ' . $descriptiveLocation . ' in ' . $field);
    }

    $fixturePath = __DIR__ . '/data/libreoffice/document split.xhtml';
    $parsedDoc = HTML::loadHTML(file_get_contents($fixturePath), true);
    $fieldValue = $parsedDoc->getFieldValue($field);

    // stristr() yields a string on a hit and false otherwise
    $termFound = false !== stristr($fieldValue, $term);
    $failureMessage = $field . '/' . $descriptiveLocation . ' does not contain "' . $term . '" in ' . $fieldValue;

    $this->assertTrue($termFound, $failureMessage);
}
/**
 * Verifies that link extraction honours the "exclude nofollow links" switch.
 *
 * The same markup is loaded twice: with exclusion disabled both links are
 * expected, with exclusion enabled only the link without rel="nofollow".
 *
 * The switch is set through static calls on Document\HTML, so the value found
 * on entry is put back in a finally block. That way the setting is restored
 * even when an assertion fails and does not leak into tests that run later.
 */
public function testHtmlNoFollowLinks()
{
    $html = '<HTML>'
        . '<HEAD><TITLE>Page title</TITLE></HEAD>'
        . '<BODY>'
        . 'Document body.'
        . '<a href="link1.html">Link 1</a>.'
        . '<a href="link2.html" rel="nofollow">Link 1</a>.'
        . '</BODY>'
        . '</HTML>';

    $oldNoFollowValue = Document\HTML::getExcludeNoFollowLinks();

    try {
        // nofollow links are kept
        Document\HTML::setExcludeNoFollowLinks(false);
        $doc1 = Document\HTML::loadHTML($html);
        $this->assertInstanceOf(Document\HTML::class, $doc1);
        $this->assertEquals(array('link1.html', 'link2.html'), array_values($doc1->getLinks()));

        // nofollow links are dropped
        Document\HTML::setExcludeNoFollowLinks(true);
        $doc2 = Document\HTML::loadHTML($html);
        $this->assertInstanceOf(Document\HTML::class, $doc2);
        $this->assertEquals(array('link1.html'), array_values($doc2->getLinks()));
    } finally {
        // restore the setting captured on entry
        Document\HTML::setExcludeNoFollowLinks($oldNoFollowValue);
    }
}
/**
 * Highlight specified words
 *
 * Takes the color at the current position of the color list, moves the
 * position one step forward (wrapping around to the start after the last
 * entry) and asks the document to highlight the words with that color.
 *
 * @param string|array $words Words to highlight. They could be organized using the array or string.
 */
public function highlight($words)
{
    $palette = $this->_highlightColors;
    $position = $this->_currentColorIndex;

    $selectedColor = $palette[$position];

    // advance cyclically so the next call uses the following color
    $this->_currentColorIndex = ($position + 1) % count($palette);

    $this->_doc->highlight($words, $selectedColor);
}
/**
 * {@inheritdoc}
 *
 * Forwards the words to the attached document's highlightExtended(), handing
 * it this object's applyColour() method as the callback and an empty array
 * as the third argument.
 */
public function highlight($words)
{
    $colourCallback = array($this, 'applyColour');
    $emptyArgs = array();

    $this->doc->highlightExtended($words, $colourCallback, $emptyArgs);
}