Load HTML document from a string
public static loadHTML ( string $data, boolean $storeContent = false, string $defaultEncoding = '' ) : Zend_Search_Lucene_Document_Html | ||
$data | string | |
$storeContent | boolean | |
$defaultEncoding | string | HTML encoding; used only if the encoding is not specified by a Content-Type HTTP-EQUIV meta tag. |
return | Zend_Search_Lucene_Document_Html |
/**
 * Highlight matches inside an HTML string.
 *
 * Loads the HTML into a Lucene HTML document, lets highlightMatchesDOM()
 * work on it, and returns the resulting markup.
 *
 * @param string $inputHTML HTML source to process
 * @return string HTML of the document after highlighting
 */
public function highlightMatches($inputHTML)
{
    $htmlDocument = Zend_Search_Lucene_Document_Html::loadHTML($inputHTML);

    // Passed as a variable rather than a literal (presumably taken by reference - confirm).
    $nextColorIndex = 0;
    $this->highlightMatchesDOM($htmlDocument, $nextColorIndex);

    return $htmlDocument->getHTML();
}
/**
 * Index the page behind $url and recursively follow the links found on it.
 *
 * Relative URLs are prefixed with HTTP_HOST, a trailing slash is dropped,
 * and only URLs containing HTTP_HOST that were not indexed before are processed.
 * If the page contains <!--index--> ... <!--/index--> markers, only the text
 * between them is stored in the "content" field.
 *
 * @param string $url absolute or host-relative URL
 * @return void
 */
protected function _indexate($url)
{
    if (!stristr($url, 'http://')) {
        $url = HTTP_HOST . $url;
    }
    $url = substr($url, -1) == '/' ? substr($url, 0, -1) : $url;

    if (in_array($url, $this->_indexedUrl)) {
        return;
    }
    if (!stristr($url, HTTP_HOST)) {
        return;
    }

    // Remember the URL before fetching so a failing page is not retried via other links.
    array_push($this->_indexedUrl, $url);

    $html = file_get_contents($url);
    if ($html === false) {
        // Nothing to parse; do not hand `false` to the HTML loader.
        Zend_Registry::get('Logger')->warn('Search index skipped, unable to fetch: ' . $url);
        return;
    }

    // Silence libxml warnings for sloppy markup, then restore the caller's setting
    // instead of unconditionally switching internal error handling off.
    $previousLibxmlSetting = libxml_use_internal_errors(true);
    $doc = Zend_Search_Lucene_Document_Html::loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previousLibxmlSetting);

    if (preg_match('/<\\!--index-->(.*)<\\!--\\/index-->/isu', $html, $matches)) {
        $html = $matches[1];
    }
    $html = preg_replace('#<script(.*?)>(.*?)</script>#is', '', $html);
    $html = strip_tags($html);

    $doc->addField(Zend_Search_Lucene_Field::Text('content', $html, 'utf-8'));
    $doc->addField(Zend_Search_Lucene_Field::UnIndexed('body', '', 'utf-8'));
    $doc->addField(Zend_Search_Lucene_Field::Text('url', $url, 'utf-8'));
    $this->_indexHandle->addDocument($doc);
    Zend_Registry::get('Logger')->info('Search index is created: ' . $url, Zend_Log::INFO);

    foreach ($doc->getLinks() as $link) {
        $temp = explode('.', $link);
        $ext = end($temp);
        // Follow links without any dot, or with a known page extension.
        if ($link == $ext || in_array($ext, array('php', 'html', 'txt', 'htm'))) {
            $this->_indexate($link);
        }
    }
}
/**
 * Checks loading HTML from a string (with highlighting) and from a file
 * (header links and body links).
 */
public function testHtml()
{
    // Document built from an inline string
    $inlineSource = '<HTML><HEAD><TITLE>Page title</TITLE></HEAD><BODY>Document body.</BODY></HTML>';
    $doc = Zend_Search_Lucene_Document_Html::loadHTML($inlineSource);
    $this->assertTrue($doc instanceof Zend_Search_Lucene_Document_Html);

    $doc->highlight('document', '#66ffff');
    $highlightedFragment = "<b style=\"color:black;background-color:#66ffff\">Document</b> body.";
    $this->assertTrue(strpos($doc->getHTML(), $highlightedFragment) !== false);

    // Document built from a fixture file
    $fixturePath = dirname(__FILE__) . '/_indexSource/_files/contributing.documentation.html';
    $doc = Zend_Search_Lucene_Document_Html::loadHTMLFile($fixturePath, true);
    $this->assertTrue($doc instanceof Zend_Search_Lucene_Document_Html);

    $expectedHeaderLinks = array('index.html', 'contributing.html', 'contributing.bugs.html', 'contributing.wishlist.html');
    $this->assertTrue(array_values($doc->getHeaderLinks()) == $expectedHeaderLinks);

    $expectedLinks = array(
        'contributing.bugs.html',
        'contributing.wishlist.html',
        'developers.documentation.html',
        'faq.translators-revision-tracking.html',
        'index.html',
        'contributing.html'
    );
    $this->assertTrue(array_values($doc->getLinks()) == $expectedLinks);
}
/**
 * Checks loading HTML from a string (exact highlighted output) and from a
 * fixture file (header links and body links).
 */
public function testHtml()
{
    $doc = Zend_Search_Lucene_Document_Html::loadHTML('<HTML><HEAD><TITLE>Page title</TITLE></HEAD><BODY>Document body.</BODY></HTML>');
    $this->assertTrue($doc instanceof Zend_Search_Lucene_Document_Html);

    $doc->highlight('document', '#66ffff');
    $expectedHtml = "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\" \"http://www.w3.org/TR/REC-html40/loose.dtd\">\n<html>\n<head><title>Page title</title></head>\n<body><p><b style=\"color:black;background-color:#66ffff\">Document</b> body.</p></body>\n</html>\n";
    // PHPUnit expects (expected, actual); the reversed order produced misleading failure output.
    $this->assertEquals($expectedHtml, $doc->getHTML());

    $doc = Zend_Search_Lucene_Document_Html::loadHTMLFile(dirname(__FILE__) . '/_files/_indexSource/contributing.documentation.html', true);
    $this->assertTrue($doc instanceof Zend_Search_Lucene_Document_Html);
    $this->assertTrue(array_values($doc->getHeaderLinks()) == array('index.html', 'contributing.html', 'contributing.bugs.html', 'contributing.wishlist.html'));
    $this->assertTrue(array_values($doc->getLinks()) == array('contributing.bugs.html', 'contributing.wishlist.html', 'developers.documentation.html', 'faq.translators-revision-tracking.html', 'index.html', 'contributing.html'));
}
/**
 * Checks that rel="nofollow" links are reported or skipped by getLinks()
 * depending on the static "exclude nofollow links" switch.
 *
 * The switch is static state, so its previous value is restored on both the
 * success and the failure path to keep other tests unaffected.
 */
public function testHtmlNoFollowLinks()
{
    $html = '<HTML>'
          . '<HEAD><TITLE>Page title</TITLE></HEAD>'
          . '<BODY>'
          . 'Document body.'
          . '<a href="link1.html">Link 1</a>.'
          . '<a href="link2.html" rel="nofollow">Link 1</a>.'
          . '</BODY>'
          . '</HTML>';

    $oldNoFollowValue = Zend_Search_Lucene_Document_Html::getExcludeNoFollowLinks();

    try {
        Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks(false);
        $doc1 = Zend_Search_Lucene_Document_Html::loadHTML($html);
        $this->assertTrue($doc1 instanceof Zend_Search_Lucene_Document_Html);
        $this->assertTrue(array_values($doc1->getLinks()) == array('link1.html', 'link2.html'));

        Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks(true);
        $doc2 = Zend_Search_Lucene_Document_Html::loadHTML($html);
        $this->assertTrue($doc2 instanceof Zend_Search_Lucene_Document_Html);
        $this->assertTrue(array_values($doc2->getLinks()) == array('link1.html'));
    } catch (Exception $e) {
        // catch-and-rethrow instead of "finally" to stay compatible with older PHP versions
        Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks($oldNoFollowValue);
        throw $e;
    }

    Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks($oldNoFollowValue);
}
/**
 * index a file
 *
 * @author Jörn Dreyer <*****@*****.**>
 *
 * @param string      $path the path of the file
 * @param string|null $user owner of the file; when null the current
 *                          filesystem view and the logged-in user are used
 *
 * @return bool true when the document was handed to Lucene::updateFile(),
 *              false when the path is invalid/empty, no view could be
 *              resolved or the mimetype is unknown
 */
public static function indexFile($path = '', $user = null)
{
    if (!Filesystem::isValidPath($path)) {
        // was a bare "return;" (null) although the method is documented to return bool
        return false;
    }
    if ($path === '') {
        //ignore the empty path element
        return false;
    }

    if (is_null($user)) {
        $view = Filesystem::getView();
        $user = \OCP\User::getUser();
    } else {
        $view = new \OC\Files\View('/' . $user . '/files');
    }

    if (!$view) {
        Util::writeLog('search_lucene', 'could not resolve filesystem view', Util::WARN);
        return false;
    }

    $root = $view->getRoot();
    $pk = md5($root . $path);

    // the cache already knows mime and other basic stuff
    $data = $view->getFileInfo($path);
    if (!isset($data['mimetype'])) {
        Util::writeLog('search_lucene', 'need mimetype for content extraction', Util::ERROR);
        return false;
    }
    $mimetype = $data['mimetype'];

    if ('text/html' === $mimetype) {
        $doc = \Zend_Search_Lucene_Document_Html::loadHTML($view->file_get_contents($path));
    } else {
        // FIXME 'application/msword' gets no special treatment yet: the Docx loader
        // uses ZipArchive and is not compatible with OC\Files\Filesystem.
        $doc = new \Zend_Search_Lucene_Document();
    }

    // store fscacheid as unique id to lookup by when deleting
    $doc->addField(\Zend_Search_Lucene_Field::Keyword('pk', $pk));

    // Store document URL to identify it in the search results
    $doc->addField(\Zend_Search_Lucene_Field::Text('path', $path));
    $doc->addField(\Zend_Search_Lucene_Field::unIndexed('size', $data['size']));
    $doc->addField(\Zend_Search_Lucene_Field::unIndexed('mimetype', $mimetype));

    self::extractMetadata($doc, $path, $view, $mimetype);
    Lucene::updateFile($doc, $path, $user);

    return true;
}
/**
 * Rebuilds the Lucene index of one website from all of its pages.
 *
 * An existing index is opened and emptied, otherwise a new one is created.
 * Nothing is touched when the website has no pages.
 *
 * @param string $websiteId
 * @return string path of the index file for the website
 */
public function indexWebsite($websiteId)
{
    $websiteService = new Website('Website');
    if (!$websiteService->existsWebsiteAlready($websiteId)) {
        throw new CmsException('602', __METHOD__, __LINE__);
    }

    // The business layer must be used for rendering
    $renderBusiness = new BusinessRender('Render');
    $modulService = new Modul('Modul');
    $pageService = new Page('Page');

    $pageIds = $pageService->getIdsByWebsiteId($websiteId);
    $indexPath = $this->getIndexFileForWebsite($websiteId);

    if (!is_array($pageIds) || count($pageIds) <= 0) {
        return $indexPath;
    }

    if (file_exists($indexPath)) {
        // Reuse the index but drop every document that is still alive
        $index = \Zend_Search_Lucene::open($indexPath);
        $documentCount = $index->numDocs();
        for ($docId = 0; $docId < $documentCount; ++$docId) {
            if ($index->isDeleted($docId)) {
                continue;
            }
            $index->delete($docId);
        }
    } else {
        $index = \Zend_Search_Lucene::create($indexPath);
    }

    foreach ($pageIds as $pageId) {
        $pageContent = $this->getPageContent($websiteId, $pageId);
        $storeContent = $this->isStoreContentEnabled() ? true : false;

        $document = \Zend_Search_Lucene_Document_Html::loadHTML($pageContent, $storeContent, 'UTF-8');
        $document->addField(\Zend_Search_Lucene_Field::unIndexed('md5', md5($pageContent)));
        $document->addField(\Zend_Search_Lucene_Field::unIndexed('pageId', $pageId));
        $index->addDocument($document);
    }

    $index->commit();
    $index->optimize();
    unset($index);

    return $indexPath;
}
/**
 * Reduces an HTML string to plain text.
 *
 * The markup is first round-tripped through a Lucene HTML document, then
 * script blocks, style blocks and "<! ... -->" sections are cut out, the
 * remaining tags are stripped and whitespace runs are collapsed to one space.
 *
 * @param $html
 * @return mixed|string
 */
protected function getPlainTextFromHtml($html)
{
    $normalizedHtml = Zend_Search_Lucene_Document_Html::loadHTML($html, false, "utf-8")->getHTML();

    // scripts, styles and comment-like sections
    $withoutScripts = preg_replace(
        array(
            '@(<script[^>]*?>.*?</script>)@si',
            '@<style[^>]*?>.*?</style>@siU',
            '@<![\\s\\S]*?--[ \\t\\n\\r]*>@',
        ),
        "",
        $normalizedHtml
    );

    // remaining tags, then surplus whitespace
    $plain = strip_tags($withoutScripts);

    return preg_replace('@[ \\t\\n\\r\\f]+@', " ", $plain);
}
/**
 * (Re)index a single catalog row in the Lucene index held in $this->_index.
 *
 * Documents already indexed under the same guid are deleted first. A new
 * document is then built from the catalog row and its dependent attribute
 * rows. Catalogs with profile "kutu_doc" additionally get the text returned
 * by _extractText() as "content" and are only added when that content is
 * not empty.
 *
 * @param string $catalogGuid guid of the catalog row to index
 * @return void
 */
public function indexCatalog($catalogGuid) {
    $index = $this->_index;
    $tblCatalog = new Kutu_Core_Orm_Table_Catalog();
    $rowsetCatalog = $tblCatalog->find($catalogGuid);
    if (count($rowsetCatalog)) {
        // check if guid exists in the index, then delete every matching document
        $term = new Zend_Search_Lucene_Index_Term($catalogGuid, 'guid');
        $docIds = $index->termDocs($term);
        foreach ($docIds as $id) {
            // NOTE(review): the fetched document is not used before the delete
            $doc = $index->getDocument($id);
            $index->delete($id);
        }
        $rowCatalog = $rowsetCatalog->current();
        $doc = new Zend_Search_Lucene_Document();
        $doc->addField(Zend_Search_Lucene_Field::Keyword('guid', $rowCatalog->guid));
        // kutu_doc: parentGuid is the relatedGuid of the first RELATED_FILE relation (if any);
        // every other profile: parentGuid is the catalog's own guid
        if ($rowCatalog->profileGuid == 'kutu_doc') {
            $tblRelatedItem = new Kutu_Core_Orm_Table_RelatedItem();
            // NOTE(review): the guid is interpolated directly into the WHERE clause - confirm it can never carry quotes
            $rowset = $tblRelatedItem->fetchAll("itemGuid='{$rowCatalog->guid}' AND relateAs='RELATED_FILE'");
            if (count($rowset)) {
                $row = $rowset->current();
                $parentCatalogGuid = $row->relatedGuid;
                $doc->addField(Zend_Search_Lucene_Field::Keyword('parentGuid', $parentCatalogGuid));
            }
        } else {
            $doc->addField(Zend_Search_Lucene_Field::Keyword('parentGuid', $rowCatalog->guid));
        }
        // fixed catalog columns; date columns go through _filterDateTime()
        $doc->addField(Zend_Search_Lucene_Field::Text('profile', $rowCatalog->profileGuid));
        $doc->addField(Zend_Search_Lucene_Field::Keyword('publishedDate', $this->_filterDateTime($rowCatalog->publishedDate)));
        $doc->addField(Zend_Search_Lucene_Field::Keyword('expiredDate', $this->_filterDateTime($rowCatalog->expiredDate)));
        $doc->addField(Zend_Search_Lucene_Field::Keyword('createdBy', $rowCatalog->createdBy));
        $doc->addField(Zend_Search_Lucene_Field::Keyword('modifiedBy', $rowCatalog->modifiedBy));
        $doc->addField(Zend_Search_Lucene_Field::Keyword('createdDate', $this->_filterDateTime($rowCatalog->createdDate)));
        $doc->addField(Zend_Search_Lucene_Field::Keyword('modifiedDate', $this->_filterDateTime($rowCatalog->modifiedDate)));
        $doc->addField(Zend_Search_Lucene_Field::Keyword('status', $rowCatalog->status));
        if ($rowCatalog->profileGuid == 'kutu_doc') {
            $doc->addField(Zend_Search_Lucene_Field::Keyword('objectType', 'file'));
        } else {
            $doc->addField(Zend_Search_Lucene_Field::Keyword('objectType', 'catalog'));
        }
        $rowsetCatalogAttribute = $rowCatalog->findDependentRowsetCatalogAttribute();
        if (count($rowsetCatalogAttribute)) {
            // map each attribute row onto an index field, keyed by its attributeGuid
            foreach ($rowsetCatalogAttribute as $rowCatalogAttribute) {
                switch ($rowCatalogAttribute->attributeGuid) {
                    case 'fixedTitle':
                    case 'title':
                        $doc->addField(Zend_Search_Lucene_Field::Text('title', $rowCatalogAttribute->value));
                        break;
                    case 'fixedSubTitle':
                    case 'subTitle':
                        $doc->addField(Zend_Search_Lucene_Field::Text('subtitle', $rowCatalogAttribute->value));
                        break;
                    case 'fixedContent':
                    case 'content':
                        // the value is parsed as HTML and only the document's "body" field value is indexed
                        $docHtml = Zend_Search_Lucene_Document_Html::loadHTML($rowCatalogAttribute->value);
                        $cleanedText = $docHtml->getFieldValue('body');
                        $doc->addField(Zend_Search_Lucene_Field::UnStored('content', $cleanedText));
                        break;
                    case 'fixedKeywords':
                    case 'keywords':
                        $doc->addField(Zend_Search_Lucene_Field::UnStored('keywords', $rowCatalogAttribute->value));
                        break;
                    case 'fixedDescription':
                    case 'description':
                        $doc->addField(Zend_Search_Lucene_Field::Text('description', $rowCatalogAttribute->value));
                        break;
                    case 'ptsKetua':
                        $doc->addField(Zend_Search_Lucene_Field::Text('judge', $rowCatalogAttribute->value));
                        break;
                    case 'prtNomor':
                    case 'fixedNomor':
                    case 'fixedNumber':
                    case 'nomor':
                    case 'ptsNomor':
                        $doc->addField(Zend_Search_Lucene_Field::UnStored('number', $rowCatalogAttribute->value));
                        break;
                    case 'prtTahun':
                    case 'fixedTahun':
                    case 'fixedYear':
                    case 'tahun':
                    case 'ptsTahun':
                        $doc->addField(Zend_Search_Lucene_Field::UnStored('year', $rowCatalogAttribute->value));
                        break;
                    default:
                        // any other attribute: look up its definition and index it under the
                        // lower-cased attributeGuid; type 4 goes through _filterDateTime()
                        // (datetime field), type 2 is parsed as HTML like "content" above
                        // NOTE(review): $rowAttribute is dereferenced without a null check - confirm find() always yields a row
                        $tblAttribute = new Kutu_Core_Orm_Table_Attribute();
                        $rowAttribute = $tblAttribute->find($rowCatalogAttribute->attributeGuid)->current();
                        if ($rowAttribute->type == 4) {
                            $doc->addField(Zend_Search_Lucene_Field::UnStored(strtolower($rowCatalogAttribute->attributeGuid),
                                $this->_filterDateTime($rowCatalogAttribute->value)));
                        } else {
                            if ($rowAttribute->type == 2) {
                                $docHtml = Zend_Search_Lucene_Document_Html::loadHTML($rowCatalogAttribute->value);
                                $cleanedText = $docHtml->getFieldValue('body');
                                $doc->addField(Zend_Search_Lucene_Field::UnStored(strtolower($rowCatalogAttribute->attributeGuid), $cleanedText));
                            } else {
                                $doc->addField(Zend_Search_Lucene_Field::UnStored(strtolower($rowCatalogAttribute->attributeGuid), $rowCatalogAttribute->value));
                            }
                        }
                        break;
                }
            }
            // if profile=kutu_doc, extract text from its file and put it in the content field
            if ($rowCatalog->profileGuid == 'kutu_doc') {
                $row = $rowsetCatalogAttribute->findByAttributeGuid('docSystemName');
                $systemName = $row->value;
                $row = $rowsetCatalogAttribute->findByAttributeGuid('docMimeType');
                $mimeType = $row->value;
                $extactedText = $this->_extractText($rowCatalog->guid, $systemName, $mimeType);
                $doc->addField(Zend_Search_Lucene_Field::UnStored('content', $extactedText));
            }
        }
        // if catalog is a kutu_doc, and if field content is empty (this means the
        // file can't be read, text can't be extracted, or the file is empty), do not index
        if ($rowCatalog->profileGuid == 'kutu_doc') {
            $tmpS = $doc->getFieldValue('content');
            if (!empty($tmpS)) {
                $index->addDocument($doc);
            } else {
            }
        } else {
            $index->addDocument($doc);
        }
    } else {
        // catalog row not found: do nothing
    }
}
/**
 * Breadth-first crawl starting at $url, indexing every page reached.
 *
 * Each sanitized URL is handled once. The page is read from the local cache
 * file when one exists, otherwise requested through $this->_client. Links of
 * every indexed page are queued until the number of visited URLs reaches
 * $this->_maxLinks, at which point the script exits.
 *
 * @param string $url start URL
 * @return void
 */
protected function _spider($url)
{
    $queue = array($url);
    $visited = array();

    while (!empty($queue)) {
        $doc = null;
        $url = $this->_sanitizeUrl(array_shift($queue));

        if (!$url) {
            continue;
        }
        if (in_array($url, $visited)) {
            continue;
        }
        $visited[] = $url;

        Bbx_Log::write('Spidering url ' . $url, null, Bbx_Search::LOG);

        $cachePath = APPLICATION_PATH . '/../www/cached' . $url . '.html';

        if (file_exists($cachePath)) {
            Bbx_Log::write('Found file in cache', null, Bbx_Search::LOG);
            try {
                $doc = Zend_Search_Lucene_Document_Html::loadHTMLFile($cachePath, false, 'utf-8');
            } catch (Exception $e) {
                Bbx_Log::write('Unable to open file: ' . $cachePath, null, Bbx_Search::LOG);
            }
        } else {
            $this->_client->setUri($this->_getAbsoluteUrl($url));
            try {
                $response = $this->_client->request();
                $status = $response->getStatus();
                Bbx_Log::write('Client response code ' . $status, null, Bbx_Search::LOG);
                if ($status == '200') {
                    $doc = Zend_Search_Lucene_Document_Html::loadHTML($response->getBody(), false, 'utf-8');
                }
            } catch (Exception $e) {
                Bbx_Log::write('Request failed: ' . $e->getMessage(), null, Bbx_Search::LOG);
            }
        }

        if ($doc === null) {
            continue;
        }

        $this->_search()->indexDoc($doc, $url);
        $this->_indexed++;

        // NOTE(review): links are filtered against $this->_visited, not the local
        // $visited list maintained above - confirm this is intended.
        $links = array_diff($doc->getLinks(), $this->_visited);

        if (count($visited) >= $this->_maxLinks) {
            Bbx_Log::write('Reached max number of links (' . $this->_maxLinks . '), exiting', null, Bbx_Search::LOG);
            exit;
        }
        $queue = array_merge($queue, $links);
    }
}
/**
 * Fetch and index every page in $this->pageList, then optimize the index.
 *
 * @return void
 * @access public
 */
public function build()
{
    foreach ($this->pageList as $page) {
        $uri = translateURL("page/{$page['id']}");

        try {
            /**
             * A customised "<controller>_indexable" template takes precedence when present.
             * This is the DEPRECATED way to customise indexable content, use getExcludes() instead;
             * such a template also needs its own controller.
             */
            $customTemplate = ONXSHOP_PROJECT_DIR . "templates/node/page/{$page['node_controller']}_indexable.html";
            $toFetch = file_exists($customTemplate)
                ? "request/sys/html5.node/page/{$page['node_controller']}_indexable~id={$page['id']}~"
                : "request/sys/html5.node~id={$page['id']}~";

            msg("Fetching page {$page['id']}: {$uri} using {$toFetch}");

            $this->client->setUri($this->profile['uri'] . $toFetch);
            $response = $this->client->request();

            $usable = $response->isSuccessful() && !$response->isRedirect() && !$response->isError();
            if ($usable) {
                $response_body = $this->filterHtmlDocument($response->getBody());
                $this->index($uri, Zend_Search_Lucene_Document_Html::loadHTML($response_body, true));
            }
        } catch (Exception $e) {
            msg("HTTP fetch exception: " . $e->getMessage());
        }
    }

    // Optimize index.
    $this->indexOptimize();
}
/**
 * Returns an excerpt for a catalog: the plain text of its "fixedDescription"
 * attribute, or of "fixedContent" when the description is empty.
 *
 * Text with fewer than $iLimit words is returned unchanged; otherwise it is
 * passed to Kutu_Core_Util::getNumberOfWords(). An empty string is returned
 * when both attributes are empty.
 *
 * @param string $catalogGuid
 * @param int    $iLimit
 * @return string
 */
static function splitWordsFromCatalog($catalogGuid, $iLimit)
{
    $descriptionHtml = Kutu_Core_Util::getCatalogAttributeValue($catalogGuid, 'fixedDescription');
    $contentHtml = Kutu_Core_Util::getCatalogAttributeValue($catalogGuid, 'fixedContent');

    // Both values are always parsed, even when the description alone would do.
    $descriptionDoc = Zend_Search_Lucene_Document_Html::loadHTML($descriptionHtml);
    $contentDoc = Zend_Search_Lucene_Document_Html::loadHTML($contentHtml);

    $candidates = array(
        $descriptionDoc->getFieldValue('body'),
        $contentDoc->getFieldValue('body'),
    );

    foreach ($candidates as $text) {
        if (empty($text)) {
            continue;
        }
        return $iLimit > str_word_count($text)
            ? $text
            : Kutu_Core_Util::getNumberOfWords($text, $iLimit);
    }

    return '';
}
/**
 * Index with Zend_Lucene
 *
 * Replaces any document previously stored for $uri with a fresh one built
 * from $htmlString. Nothing is indexed when the index directory does not
 * exist or the index can neither be opened nor created.
 *
 * @param string $uri        URI stored in the "uri" keyword field
 * @param string $htmlString HTML source of the page
 * @return void
 */
function indexContent($uri, $htmlString)
{
    require_once 'Zend/Search/Lucene.php';

    $index_location = ONXSHOP_PROJECT_DIR . 'var/index';

    // Defined up front: previously $index was undefined (and triggered a notice)
    // whenever the index directory did not exist.
    $index = false;

    if (is_dir($index_location)) {
        // Open existing index
        try {
            $index = Zend_Search_Lucene::open($index_location);
        } catch (Exception $e) {
            // Create index
            try {
                $index = Zend_Search_Lucene::create($index_location);
            } catch (Exception $e) {
                $index = false;
            }
        }
    }

    if (!$index) {
        return;
    }

    // find and remove pages with the same URI
    $hits = $index->find("uri:" . $uri);
    foreach ($hits as $hit) {
        $index->delete($hit);
    }

    $doc = Zend_Search_Lucene_Document_Html::loadHTML($htmlString, true);
    $doc->addField(Zend_Search_Lucene_Field::Keyword('uri', $uri));
    $index->addDocument($doc);
    $index->commit();
}
/**
 * Index an article.
 *
 * Articles whose fulltext (or, failing that, introtext) starts with "<?xml"
 * are transformed with the jucene XSLT stylesheet and every AssociationRule
 * element of the result is handed to the "onIndexPmml" content plugins.
 * Any other article is indexed as an HTML document built from its fulltext.
 *
 * @param object $article article with id, introtext and fulltext properties
 * @param bool   $isNew   when false, the existing index entry for the article is removed first
 */
function onIndexContent($article, $isNew = false)
{
    //FIXME move the content type tests and following transformations to the helper
    $pk = $article->id;
    if (!$isNew) {
        JuceneHelper::removeFromIndex('pk:' . $pk);
    }
    $index = JuceneHelper::getIndex();

    $xml_field = substr($article->fulltext, 0, 5) != '<?xml' ? $article->introtext : $article->fulltext;

    if (substr($xml_field, 0, 5) != '<?xml') {
        // Plain article: the encoding must be the string 'UTF-8'. The former bare
        // UTF - 8 was parsed as "constant UTF minus 8".
        $zendDoc = Zend_Search_Lucene_Document_Html::loadHTML($article->fulltext, false, 'UTF-8');
        $index->addDocument($zendDoc);
        return;
    }

    $dom = new DOMDocument();
    $xslt = new DOMDocument();
    $error = false;

    //load xslt stylesheet
    if (!@$xslt->load(JPATH_SITE . DS . 'administrator' . DS . 'components' . DS . 'com_jucene' . DS . 'xslt/jucene.xsl')) {
        $error = true;
        $this->raiseMessage("XSLTLOADERROR", 'error');
    }
    $proc = new XSLTProcessor();
    if (!$proc->importStylesheet($xslt)) {
        $error = true;
        $this->raiseMessage("XSLTIMPORTERROR", 'error');
    }

    // The former unset() calls targeted the undefined variables $artcile and $record
    // and did nothing. They are dropped rather than "fixed", because unsetting
    // properties of $article would modify the caller's object.

    if (!$dom->loadXML($xml_field) || $error) {
        return;
    }

    //simplify the document - prepare it for the indexation process
    $xslOutput = $proc->transformToXml($dom);

    //create new DOM document to preserve output and transform the XML to the indexable one
    $transXml = new DOMDocument();
    // The property is spelled preserveWhiteSpace; the mis-cased name had no effect.
    $transXml->preserveWhiteSpace = false;
    @$transXml->loadXML($xslOutput);

    //unset unnecessary variables
    unset($xslOutput);
    unset($dom);
    unset($xslt);

    //index every assoc rule as document with same credentials
    $rules = $transXml->getElementsByTagName("AssociationRule");
    if ($rules->length == 0) {
        $this->raiseMessage('XMLDOCUMENTNORULES', 'error');
    }

    $rule_doc_position = 0;
    foreach ($rules as $rule) {
        $additional = array(
            'rating' => 0,
            'position' => $rule_doc_position,
        );
        JPluginHelper::importPlugin('content');
        $dispatcher =& JDispatcher::getInstance();
        $dispatcher->trigger('onIndexPmml', array($rule, $additional));
        $rule_doc_position++;
    }
}
/**
 * index a file
 *
 * @author Jörn Dreyer <*****@*****.**>
 *
 * @param string      $path the path of the file
 * @param string|null $user owner of the file; when null the current
 *                          filesystem view and the logged-in user are used
 *
 * @return bool true when the file was indexed or has vanished, false when
 *              the path is invalid/empty, no view could be resolved or the
 *              mimetype is unknown
 */
public static function indexFile($path = '', $user = null)
{
    if (!Filesystem::isValidPath($path)) {
        // was a bare "return;" (null) although the method is documented to return bool
        return false;
    }
    if ($path === '') {
        //ignore the empty path element
        return false;
    }

    if (is_null($user)) {
        $view = Filesystem::getView();
        $user = \OCP\User::getUser();
    } else {
        $view = new \OC\Files\View('/' . $user . '/files');
    }

    if (!$view) {
        Util::writeLog('search_lucene', 'could not resolve filesystem view', Util::WARN);
        return false;
    }

    if (!$view->file_exists($path)) {
        Util::writeLog('search_lucene', 'file vanished, ignoring', Util::DEBUG);
        return true;
    }

    $root = $view->getRoot();
    $pk = md5($root . $path);

    // the cache already knows mime and other basic stuff
    $data = $view->getFileInfo($path);
    if (!isset($data['mimetype'])) {
        Util::writeLog('search_lucene', 'need mimetype for content extraction', Util::ERROR);
        return false;
    }
    $mimeType = $data['mimetype'];

    // initialize plain lucene document
    $doc = new \Zend_Search_Lucene_Document();

    // index content for local files only
    $localFile = $view->getLocalFile($path);

    if ($localFile) {
        //try to use special lucene document types
        // Office formats are detected by file extension, not mimetype: the zend classes
        // only understand docx and not doc files.
        // FIXME distinguish doc and docx, xls and xlsx, ppt and pptx, in oc core mimetype helper ...
        if ('text/plain' === $mimeType) {
            $body = $view->file_get_contents($path);
            if ($body != '') {
                $doc->addField(\Zend_Search_Lucene_Field::UnStored('body', $body));
            }
        } elseif ('text/html' === $mimeType) {
            //TODO could be indexed, even if not local
            $doc = \Zend_Search_Lucene_Document_Html::loadHTML($view->file_get_contents($path));
        } elseif ('application/pdf' === $mimeType) {
            $doc = Pdf::loadPdf($view->file_get_contents($path));
        } elseif (strtolower(substr($data['name'], -5)) === '.docx') {
            $doc = \Zend_Search_Lucene_Document_Docx::loadDocxFile($localFile);
        } elseif (strtolower(substr($data['name'], -5)) === '.xlsx') {
            $doc = \Zend_Search_Lucene_Document_Xlsx::loadXlsxFile($localFile);
        } elseif (strtolower(substr($data['name'], -5)) === '.pptx') {
            $doc = \Zend_Search_Lucene_Document_Pptx::loadPptxFile($localFile);
        } elseif (strtolower(substr($data['name'], -4)) === '.odt') {
            $doc = Odt::loadOdtFile($localFile);
        } elseif (strtolower(substr($data['name'], -4)) === '.ods') {
            $doc = Ods::loadOdsFile($localFile);
        }
    }

    // Store filecache id as unique id to lookup by when deleting
    $doc->addField(\Zend_Search_Lucene_Field::Keyword('pk', $pk));

    // Store filename
    $doc->addField(\Zend_Search_Lucene_Field::Text('filename', $data['name'], 'UTF-8'));

    // Store document path to identify it in the search results
    $doc->addField(\Zend_Search_Lucene_Field::Text('path', $path, 'UTF-8'));
    $doc->addField(\Zend_Search_Lucene_Field::unIndexed('size', $data['size']));
    $doc->addField(\Zend_Search_Lucene_Field::unIndexed('mimetype', $mimeType));

    //self::extractMetadata($doc, $path, $view, $mimeType);
    Lucene::updateFile($doc, $path, $user);

    return true;
}
/**
 * Gets the indexed page content.
 *
 * Fetches the page, reduces it to the text of its body and, for every word
 * of the query, returns the text segments around each occurrence with the
 * word (upper-cased) wrapped in a highlighting span.
 *
 * @param string $pathInfo        url path of the page
 * @param string $Query           The search query index file
 * @param int    $MaxResultByWord maximum number of segments returned per word
 * @param string $class           CSS class for the highlight span; an inline style is used when empty
 * @param int    $MaxLimitCara    length budget per occurrence (0 = no truncation)
 * @return string highlighted segments joined by spaces, '' when nothing matched or on error
 * @access public
 *
 * @author Etienne de Longeaux <*****@*****.**>
 * @since 2012-06-11
 */
public function contentPage($pathInfo, $Query = null, $MaxResultByWord = 5, $class = "", $MaxLimitCara = 0)
{
    $searchWords = explode(' ', strtolower((string) $Query));
    // Must start as an array: implode() on null fails when no word produced a segment.
    $result_search = array();

    try {
        // we get the content of the page.
        $body = file_get_contents($this->container->get('request')->getUriForPath('') . $pathInfo);
        // we delete all contents of tags which are given in params (and all tags which are inside).
        $body = $this->deleteTags($body);
        // we get the only words of the body content of the page.
        $body = \Zend_Search_Lucene_Document_Html::loadHTML($body, false)->getFieldUtf8Value('body');

        // NOTE(review): $word and $class are written into the returned HTML unescaped - confirm callers sanitise them.
        foreach ($searchWords as $word) {
            if ($word === '') {
                // consecutive spaces or an empty query would give an empty pattern
                continue;
            }

            // Quote the word first so characters such as "(", "+" or "#" cannot break the pattern,
            // then widen every e-like letter to a class of its accented variants.
            // "e" is parked in a placeholder so the classes inserted below are not rewritten again.
            $new_word = preg_quote(strtolower($word), '#');
            $new_word = str_replace("e", "#@@@#", $new_word);
            $new_word = str_replace("é", "[ée]{1,2}", $new_word);
            $new_word = str_replace("è", "[èe]{1,2}", $new_word);
            $new_word = str_replace("ê", "[êe]{1,2}", $new_word);
            $new_word = str_replace("ë", "[ëe]{1,2}", $new_word);
            $new_word = str_replace("#@@@#", "[éèeêë]{1,2}", $new_word);

            $matches_word = preg_split("#{$new_word}#i", $body);
            if ($matches_word === false) {
                continue;
            }

            // half of the length budget that remains once the word itself is accounted for
            if (($MaxLimitCara - strlen($word)) % 2 == 0) {
                $maxLimitSegment = ($MaxLimitCara - strlen($word)) / 2;
            } else {
                $maxLimitSegment = ($MaxLimitCara - strlen($word) + 1) / 2;
            }

            foreach ($matches_word as $position => $value) {
                if ($position >= intval($MaxResultByWord)) {
                    continue;
                }

                if ($MaxLimitCara != 0) {
                    // truncate from the left: reverse the word order, truncate, reverse back
                    $words = explode(' ', $value);
                    $words_inverse = array_reverse($words);
                    $inverse_chaine = implode(' ', $words_inverse);
                    $inverse_chaine = $this->container->get('sfynx.tool.string_manager')->truncate($inverse_chaine, $maxLimitSegment, '...');
                    $words_inverse = explode(' ', $inverse_chaine);
                    $words = array_reverse($words_inverse);
                    $Contents = implode(' ', $words);
                } else {
                    $Contents = $value;
                }

                // a following segment means the word occurred right after this one
                if (isset($matches_word[$position + 1])) {
                    if (!empty($class)) {
                        $Contents .= "<span class='{$class}' >" . strtoupper($word) . '</span>';
                    } else {
                        $Contents .= "<span style='color:white;background-color:black;font-size:13;font-weight:bold;' >" . strtoupper($word) . '</span>';
                    }
                    if ($MaxLimitCara == 0) {
                        $Contents .= $matches_word[$position + 1];
                    } else {
                        $Contents .= $this->container->get('sfynx.tool.string_manager')->truncate($matches_word[$position + 1], $maxLimitSegment, '');
                    }
                }

                $result_search[] = $Contents;
            }
        }

        return implode(' ', $result_search);
    } catch (\Exception $e) {
        return '';
    }
}
/**
 * Crawl a URI
 *
 * Marks the URI as processed and, when validateLinkForCrawl() accepts it,
 * fetches it and indexes the response body under the URI's path.
 *
 * @param string $uri
 * @return void
 * @access protected
 */
protected function crawl($uri)
{
    msg("Crawling: {$uri}");

    $this->uriProcessed[] = $uri;

    // validated once; the former extra call discarded its result
    if (!$this->validateLinkForCrawl($uri)) {
        return;
    }

    // A URI without a path component (e.g. "http://host") is indexed as the root path.
    $uri_parts = parse_url($uri);
    $path = (is_array($uri_parts) && isset($uri_parts['path'])) ? $uri_parts['path'] : '/';

    // Retrieve the content
    $this->client->setUri($uri);

    try {
        $response = $this->client->request();
        if ($response->isSuccessful() && !$response->isRedirect() && !$response->isError()) {
            $this->index($path, Zend_Search_Lucene_Document_Html::loadHTML($response->getBody(), true));
        }
    } catch (Exception $e) {
        // keep crawling, but do not hide the failure
        msg("HTTP fetch exception: " . $e->getMessage());
    }
}
/**
 * The title must still be extracted when the root HTML tag carries attributes.
 *
 * @group ZF-10686
 */
public function testLoadHtmlWithAttributesInTagHTML()
{
    $source = '<HTML lang="en_US"><HEAD><TITLE>Page title</TITLE></HEAD><BODY>Document body.</BODY></HTML>';

    $doc = Zend_Search_Lucene_Document_Html::loadHTML($source);

    $this->assertEquals('Page title ', $doc->title);
}
/**
 * Rebuilds the website search index from the indexable files and the
 * configured dynamic URLs.
 *
 * The outcome is reported through the globals $success_msg, $warning_msg
 * (dynamic URLs that did not answer with HTTP 200) and $error_msg.
 *
 * @return void
 */
function rebuild_search_indexes()
{
    global $success_msg;
    global $error_msg;
    global $warning_msg;
    global $all_settings;
    global $indexable_folders;

    $index_folder = get_setting('search_indexes_folder', $all_settings);

    try {
        setlocale(LC_CTYPE, LOCALE);
        Zend_Search_Lucene_Analysis_Analyzer::setDefault(new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8_CaseInsensitive());
        $index = new Zend_Search_Lucene($index_folder, true);

        $files_to_index = get_website_files($indexable_folders);
        foreach ($files_to_index as $html_file => $page_url) {
            if (!can_index_website_file($html_file)) {
                continue;
            }

            // Reset per file: otherwise a skipped file (unknown extension or a PHP page
            // not answering 200) re-indexed the previous file's content under this URL.
            $file_content = null;

            $f1 = strtolower($html_file);
            if (end_with($f1, 'html') || end_with($f1, 'htm')) {
                $file_content = file_get_contents($html_file);
            } elseif (end_with($f1, 'php')) {
                if (is_http_code_200($page_url)) {
                    $file_content = get_url_content($page_url);
                }
            }

            if (isset($file_content)) {
                // drop everything in front of the <head> tag
                $file_content = '<html>' . strstr($file_content, '<head');
                $doc = Zend_Search_Lucene_Document_Html::loadHTML($file_content, true, 'UTF-8');
                $doc->addField(Zend_Search_Lucene_Field::Text('url', $page_url, 'UTF-8'));
                $index->addDocument($doc);
                flush();
            }
        }

        $broken_urls = array();
        foreach (get_dynamic_urls(get_setting('search_dynamic_pages', $all_settings)) as $url) {
            if (is_http_code_200($url)) {
                $content = get_url_content($url);
                $content = '<html>' . strstr($content, '<head');
                $doc = Zend_Search_Lucene_Document_Html::loadHTML($content, true, 'UTF-8');
                $doc->addField(Zend_Search_Lucene_Field::Text('url', $url, 'UTF-8'));
                $index->addDocument($doc);
                flush();
            } else {
                array_push($broken_urls, $url);
            }
        }

        if (file_exists($index_folder)) {
            if (count($broken_urls) > 0) {
                $warning_msg = '<p>The website was successfully indexed, but the following URL\'s were skipped because they are broken:</p>';
                $warning_msg .= '<ul class="disc">';
                foreach ($broken_urls as $broken_url) {
                    $warning_msg .= '<li><a href="' . $broken_url . '">' . $broken_url . '</a></li>';
                }
                $warning_msg .= '</ul>';
                $warning_msg .= '<p>Please remove them from the "List of dynamic pages" field.</p>';
            } else {
                $success_msg = 'The website was successfully indexed.';
            }
        } else {
            $error_msg = 'An error occurred during the website indexing. The error message is: the folder that stores the website indexes couldn\'t be created';
        }
    } catch (Exception $e) {
        $error_msg = 'An error occurred during the website indexing. The error message is: ' . $e->getMessage();
    }
}
/**
 * Returns the response body as HTML normalised through a Lucene HTML document.
 *
 * @access private
 * @param Zend_Response $response
 * @return String|false the document HTML, or false when the status is not 200
 */
private function _getHTMLResponse($response)
{
    if (200 !== $response->getStatus()) {
        return false;
    }

    $body = $response->getBody();

    return Zend_Search_Lucene_Document_Html::loadHTML($body)->getHTML();
}
/**
 * Rebuilds the website search index.
 *
 * Indexes every file returned by get_all_html_files() for this script's
 * directory that passes can_index_html_file(), then every URL listed in the
 * 'search_dynamic_pages' setting whose status line reports 200.
 *
 * The outcome is reported through the globals $success_msg, $warning_msg
 * and $error_msg; nothing is returned.
 *
 * @return void
 */
function rebuild_search_indexes()
{
    global $success_msg;
    global $error_msg;
    global $warning_msg;
    global $all_settings;

    $index_folder = get_setting('search_indexes_folder', $all_settings);

    try {
        $index = new Zend_Search_Lucene($index_folder, true);
        setlocale(LC_CTYPE, 'en_US');

        foreach (get_all_html_files(dirname(__FILE__)) as $html_file => $html_url) {
            if (!can_index_html_file($html_file)) {
                continue;
            }

            $file_content = file_get_contents($html_file);
            if ($file_content === false) {
                // Unreadable file: skip it rather than index an empty document.
                continue;
            }

            // Drop everything before the first "<head" and re-open the
            // document with a bare <html> tag.
            $file_content = '<html>' . strstr($file_content, '<head');
            $doc = Zend_Search_Lucene_Document_Html::loadHTML($file_content);
            $doc->addField(Zend_Search_Lucene_Field::Text('url', $html_url, 'UTF-8'));
            $index->addDocument($doc);
            flush();
        }

        $broken_urls = array();
        foreach (get_dynamic_urls(get_setting('search_dynamic_pages', $all_settings)) as $url) {
            // get_headers() returns false when the request fails, so never
            // index into its result blindly.
            $headers = get_headers($url);
            $status_line = (is_array($headers) && isset($headers[0])) ? $headers[0] : '';

            // Match the status code itself instead of any "200" substring.
            $is_ok = preg_match('#^HTTP/\S+\s+200\b#', $status_line) === 1;

            $content = $is_ok ? file_get_contents($url) : false;
            if ($content === false) {
                array_push($broken_urls, $url);
                continue;
            }

            $content = '<html>' . strstr($content, '<head');
            $doc = Zend_Search_Lucene_Document_Html::loadHTML($content);
            $doc->addField(Zend_Search_Lucene_Field::Text('url', $url, 'UTF-8'));
            $index->addDocument($doc);
            flush();
        }

        if (file_exists($index_folder)) {
            if (count($broken_urls) > 0) {
                $warning_msg = '<p>The website was successfully indexed, but the following URL\'s were skipped because they are broken:</p>';
                $warning_msg .= '<ul class="disc">';
                foreach ($broken_urls as $broken_url) {
                    // The URLs come from a settings field and end up inside
                    // HTML markup, so escape them for attribute and text use.
                    $safe_url = htmlspecialchars($broken_url, ENT_QUOTES, 'UTF-8');
                    $warning_msg .= '<li><a href="' . $safe_url . '">' . $safe_url . '</a></li>';
                }
                $warning_msg .= '</ul>';
                $warning_msg .= '<p>Please remove them from the "List of dynamic pages" field.</p>';
            } else {
                $success_msg = 'The website was successfully indexed.';
            }
        } else {
            $error_msg = 'An error occurred during the website indexing. The error message is: the folder that stores the website indexes couldn\'t be created';
        }
    } catch (Exception $e) {
        $error_msg = 'An error occurred during the website indexing. The error message is: ' . $e->getMessage();
    }
}
/**
 * Highlights query matches inside an HTML fragment.
 *
 * The fragment is converted to UTF-8, wrapped in a minimal HTML document
 * that declares a UTF-8 charset, loaded as a Lucene HTML document and handed
 * to the highlighter. Only the body part of the resulting document is
 * returned, i.e. without the wrapper's header and body tags.
 *
 * @param string $inputHtmlFragment
 * @param string $encoding Input HTML string encoding
 * @param Zend_Search_Lucene_Search_Highlighter_Interface|null $highlighter
 *        Defaults to a new Zend_Search_Lucene_Search_Highlighter_Default
 * @return string
 */
public function htmlFragmentHighlightMatches($inputHtmlFragment, $encoding = 'UTF-8', $highlighter = null)
{
    if (null === $highlighter) {
        $highlighter = new Zend_Search_Lucene_Search_Highlighter_Default();
    }

    // Build the wrapper document around the UTF-8 version of the fragment.
    $documentHead = '<html><head><META HTTP-EQUIV="Content-type" CONTENT="text/html; charset=UTF-8"/></head><body>';
    $utf8Fragment = iconv($encoding, 'UTF-8//IGNORE', $inputHtmlFragment);
    $documentTail = '</body></html>';

    $document = Zend_Search_Lucene_Document_Html::loadHTML($documentHead . $utf8Fragment . $documentTail);

    $highlighter->setDocument($document);
    $this->_highlightMatches($highlighter);

    return $document->getHtmlBody();
}
/**
 * Loads a document of the given type, normalises its text content and adds
 * it to the Lucene index.
 *
 * File-based types ("html", "docx", "xsls", "pptx") are loaded from
 * $pathFile; their 'Key' value becomes the file's modification time. The
 * "page" type is loaded from the HTML string in $indexValues['Contents'].
 * In every handled case 'Contents' is replaced by the document's 'body'
 * field, then passed through the PiStringManager helpers and stripped of
 * the stop words for $locale. Any other $type leaves the index untouched.
 *
 * @param \Zend_Search_Lucene_Proxy $Index       The Lucene index object.
 * @param string                    $type        ['html', 'docx', 'xsls', 'pptx', 'page']
 * @param array                     $indexValues Field values; 'Contents' and 'Key' are read/written here.
 * @param string                    $locale      Locale used for stop words and setlocale().
 * @param object                    $obj         Not used by this method.
 * @param string                    $pathFile    Path of the file for file-based types.
 *
 * @return \Zend_Search_Lucene_Proxy
 * @access public
 * @static
 * @author Etienne de Longeaux <*****@*****.**>
 * @since 2012-06-11
 */
public static function index(\Zend_Search_Lucene_Proxy $Index, $type, $indexValues = null, $locale = '', $obj = null, $pathFile = '')
{
    // Use UTF-8 for query parsing and a case-insensitive UTF-8 analyzer.
    \Zend_Search_Lucene_Search_QueryParser::setDefaultEncoding('utf-8');
    \Zend_Search_Lucene_Analysis_Analyzer::setDefault(new \Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8_CaseInsensitive());

    self::$_index = $Index;
    self::$_doc   = null;

    // Where the document came from: 'file', 'page', or null when $type is not handled.
    $origin = null;
    switch ($type) {
        case "html":
            self::$_doc = \Zend_Search_Lucene_Document_Html::loadHtmlFile($pathFile, false);
            $origin = 'file';
            break;
        case "docx":
            self::$_doc = \Zend_Search_Lucene_Document_Docx::loadDocxFile($pathFile, false);
            $origin = 'file';
            break;
        case "xsls":
            self::$_doc = \Zend_Search_Lucene_Document_Xlsx::loadXlsxFile($pathFile, false);
            $origin = 'file';
            break;
        case "pptx":
            self::$_doc = \Zend_Search_Lucene_Document_Pptx::loadPptxFile($pathFile, false);
            $origin = 'file';
            break;
        case "page":
            self::$_doc = \Zend_Search_Lucene_Document_Html::loadHTML($indexValues['Contents'], false);
            $origin = 'page';
            break;
    }

    // File-based documents are keyed by their modification time.
    if ('file' === $origin) {
        $indexValues['Key'] = filemtime($pathFile);
    }
    // Every handled type takes its content from the document's body field.
    if (null !== $origin) {
        $indexValues['Contents'] = self::$_doc->getFieldUtf8Value('body');
    }

    // Nothing usable was loaded: hand the index back unchanged.
    if (!(self::$_doc instanceof \Zend_Search_Lucene_Document)) {
        return self::$_index;
    }

    // Normalise the text: lower-case without accents, de-duplicate words, clean up.
    $text = \Sfynx\ToolBundle\Util\PiStringManager::minusculesSansAccents($indexValues['Contents']);
    $text = \Sfynx\ToolBundle\Util\PiStringManager::uniqueWord($text);
    $text = \Sfynx\ToolBundle\Util\PiStringManager::cleanContent($text);

    // Remove the stop words of the requested locale, when a list exists.
    $stopWord = \Sfynx\ToolBundle\Util\PiStringManager::stopWord(strtolower($locale));
    if ($stopWord) {
        $keptWords = array_diff(explode(' ', $text), $stopWord);
        $text = implode(' ', $keptWords);
    }
    $indexValues['Contents'] = $text;

    // Add the document under the requested locale; on failure retry once
    // under 'fr_FR' and ignore a second failure.
    try {
        setlocale(LC_ALL, $locale);
        self::defaultAddFields($indexValues);
        self::addDocument();
    } catch (\Exception $firstFailure) {
        setlocale(LC_ALL, 'fr_FR');
        self::defaultAddFields($indexValues);
        try {
            self::addDocument();
        } catch (\Exception $retryFailure) {
        }
    }

    // Return the Lucene index object.
    return self::$_index;
}