loadHTML() public static method

Load HTML document from a string
public static loadHTML ( string $data, boolean $storeContent = false, string $defaultEncoding = '' ) : Zend_Search_Lucene_Document_Html
$data string
$storeContent boolean
$defaultEncoding string Default HTML encoding; used when the document does not specify one via a Content-type HTTP-EQUIV meta tag.
return Zend_Search_Lucene_Document_Html
Ejemplo n.º 1
0
 /**
  * Run the match highlighter over an HTML string.
  *
  * The input is parsed into a Zend_Search_Lucene_Document_Html, handed to
  * highlightMatchesDOM() together with a colour index starting at 0, and the
  * document's HTML is returned afterwards.
  *
  * @param string $inputHTML HTML source to process
  * @return string result of the document's getHTML()
  */
 public function highlightMatches($inputHTML)
 {
     $htmlDocument = Zend_Search_Lucene_Document_Html::loadHTML($inputHTML);

     // Kept in a named variable: highlightMatchesDOM() presumably takes the
     // colour index by reference - TODO confirm against its signature.
     $paletteIndex = 0;
     $this->highlightMatchesDOM($htmlDocument, $paletteIndex);

     return $htmlDocument->getHTML();
 }
Ejemplo n.º 2
0
 /**
  * Index one page and recurse into the links found on it.
  *
  * URLs that do not contain "http://" are prefixed with HTTP_HOST and a single
  * trailing slash is removed. A URL is processed only once (tracked in
  * $this->_indexedUrl) and only if it contains HTTP_HOST.
  *
  * The indexed text is the part between <!--index--> and <!--/index--> when
  * those markers are present, otherwise the whole page; script elements and
  * tags are stripped before the text is stored in the "content" field.
  *
  * @param string $url absolute or HTTP_HOST-relative URL
  * @return void
  */
 protected function _indexate($url)
 {
     if (!stristr($url, 'http://')) {
         $url = HTTP_HOST . $url;
     }
     $url = substr($url, -1) == '/' ? substr($url, 0, -1) : $url;

     // Skip URLs that were already handled or that point outside HTTP_HOST.
     if (in_array($url, $this->_indexedUrl, true) || !stristr($url, HTTP_HOST)) {
         return;
     }
     array_push($this->_indexedUrl, $url);

     $html = file_get_contents($url);
     if ($html === false) {
         // The fetch failed (PHP already raised a warning); there is nothing
         // to parse or index for this URL.
         return;
     }

     // Silence libxml parse warnings while loading, then restore whatever
     // setting the caller had instead of forcing it to false.
     $previousLibxmlSetting = libxml_use_internal_errors(true);
     $doc = Zend_Search_Lucene_Document_Html::loadHTML($html);
     libxml_use_internal_errors($previousLibxmlSetting);

     if (preg_match('/<\\!--index-->(.*)<\\!--\\/index-->/isu', $html, $matches)) {
         $html = $matches[1];
     }
     $html = preg_replace('#<script(.*?)>(.*?)</script>#is', '', $html);
     $html = strip_tags($html);

     $doc->addField(Zend_Search_Lucene_Field::Text('content', $html, 'utf-8'));
     // The "body" field is set to an empty, unindexed value; the searchable
     // text lives in "content".
     $doc->addField(Zend_Search_Lucene_Field::UnIndexed('body', '', 'utf-8'));
     $doc->addField(Zend_Search_Lucene_Field::Text('url', $url, 'utf-8'));
     $this->_indexHandle->addDocument($doc);
     Zend_Registry::get('Logger')->info('Search index is created: ' . $url, Zend_Log::INFO);

     foreach ($doc->getLinks() as $link) {
         $temp = explode('.', $link);
         $ext = end($temp);
         // Follow links without any dot (extension-less) or with a known
         // text/page extension.
         if ($link == $ext || in_array($ext, array('php', 'html', 'txt', 'htm'))) {
             $this->_indexate($link);
         }
     }
 }
Ejemplo n.º 3
0
 /**
  * Checks loading HTML from a string and from a file: the returned type,
  * term highlighting, and extraction of header links and body links.
  */
 public function testHtml()
 {
     $markup = '<HTML><HEAD><TITLE>Page title</TITLE></HEAD><BODY>Document body.</BODY></HTML>';
     $fromString = Zend_Search_Lucene_Document_Html::loadHTML($markup);
     $this->assertTrue($fromString instanceof Zend_Search_Lucene_Document_Html);

     // Highlighting wraps the matched word in a styled <b> element.
     $fromString->highlight('document', '#66ffff');
     $highlighted = "<b style=\"color:black;background-color:#66ffff\">Document</b> body.";
     $this->assertTrue(strpos($fromString->getHTML(), $highlighted) !== false);

     $fixture = dirname(__FILE__) . '/_indexSource/_files/contributing.documentation.html';
     $fromFile = Zend_Search_Lucene_Document_Html::loadHTMLFile($fixture, true);
     $this->assertTrue($fromFile instanceof Zend_Search_Lucene_Document_Html);

     $expectedHeaderLinks = array('index.html', 'contributing.html', 'contributing.bugs.html', 'contributing.wishlist.html');
     $this->assertTrue(array_values($fromFile->getHeaderLinks()) == $expectedHeaderLinks);

     $expectedLinks = array('contributing.bugs.html', 'contributing.wishlist.html', 'developers.documentation.html', 'faq.translators-revision-tracking.html', 'index.html', 'contributing.html');
     $this->assertTrue(array_values($fromFile->getLinks()) == $expectedLinks);
 }
Ejemplo n.º 4
0
 /**
  * Checks loading HTML from a string and from a file: the returned type,
  * the exact highlighted markup, and extraction of header and body links.
  */
 public function testHtml()
 {
     $doc = Zend_Search_Lucene_Document_Html::loadHTML('<HTML><HEAD><TITLE>Page title</TITLE></HEAD><BODY>Document body.</BODY></HTML>');
     $this->assertTrue($doc instanceof Zend_Search_Lucene_Document_Html);
     $doc->highlight('document', '#66ffff');
     // assertEquals() takes the expected value first; with the arguments in
     // this order a failure report labels expected and actual correctly.
     $expectedHtml = "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\" \"http://www.w3.org/TR/REC-html40/loose.dtd\">\n<html>\n<head><title>Page title</title></head>\n<body><p><b style=\"color:black;background-color:#66ffff\">Document</b> body.</p></body>\n</html>\n";
     $this->assertEquals($expectedHtml, $doc->getHTML());
     $doc = Zend_Search_Lucene_Document_Html::loadHTMLFile(dirname(__FILE__) . '/_files/_indexSource/contributing.documentation.html', true);
     $this->assertTrue($doc instanceof Zend_Search_Lucene_Document_Html);
     $this->assertTrue(array_values($doc->getHeaderLinks()) == array('index.html', 'contributing.html', 'contributing.bugs.html', 'contributing.wishlist.html'));
     $this->assertTrue(array_values($doc->getLinks()) == array('contributing.bugs.html', 'contributing.wishlist.html', 'developers.documentation.html', 'faq.translators-revision-tracking.html', 'index.html', 'contributing.html'));
 }
Ejemplo n.º 5
0
 /**
  * Checks that links marked rel="nofollow" are returned by getLinks() only
  * while the static "exclude nofollow links" switch is off.
  *
  * The switch is class-level static state, so its original value is put back
  * when the test finishes, whether it passes or fails.
  */
 public function testHtmlNoFollowLinks()
 {
     $html = '<HTML>' . '<HEAD><TITLE>Page title</TITLE></HEAD>' . '<BODY>' . 'Document body.' . '<a href="link1.html">Link 1</a>.' . '<a href="link2.html" rel="nofollow">Link 1</a>.' . '</BODY>' . '</HTML>';
     $oldNoFollowValue = Zend_Search_Lucene_Document_Html::getExcludeNoFollowLinks();
     try {
         Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks(false);
         $doc1 = Zend_Search_Lucene_Document_Html::loadHTML($html);
         $this->assertTrue($doc1 instanceof Zend_Search_Lucene_Document_Html);
         $this->assertTrue(array_values($doc1->getLinks()) == array('link1.html', 'link2.html'));
         Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks(true);
         $doc2 = Zend_Search_Lucene_Document_Html::loadHTML($html);
         $this->assertTrue($doc2 instanceof Zend_Search_Lucene_Document_Html);
         $this->assertTrue(array_values($doc2->getLinks()) == array('link1.html'));
     } catch (Exception $e) {
         // catch-and-rethrow instead of finally, so this also runs on
         // PHP versions older than 5.5.
         Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks($oldNoFollowValue);
         throw $e;
     }
     Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks($oldNoFollowValue);
 }
Ejemplo n.º 6
0
 /**
  * Index a file.
  *
  * Builds a Lucene document for the file (an HTML document for text/html,
  * a plain document otherwise), attaches the pk, path, size and mimetype
  * fields, lets extractMetadata() add to it and hands it to
  * Lucene::updateFile().
  *
  * @author Jörn Dreyer <*****@*****.**>
  *
  * @param string      $path the path of the file
  * @param string|null $user owner whose files view is used; when null the
  *                          current filesystem view and current user are used
  *
  * @return bool true when the document was handed to the index, false when
  *              the path is invalid or empty, no view could be resolved, or
  *              the file info carries no mimetype
  */
 public static function indexFile($path = '', $user = null)
 {
     if (!Filesystem::isValidPath($path)) {
         // was a bare "return;" (null) although the contract is bool
         return false;
     }
     if ($path === '') {
         //ignore the empty path element
         return false;
     }
     if (is_null($user)) {
         $view = Filesystem::getView();
         $user = \OCP\User::getUser();
     } else {
         $view = new \OC\Files\View('/' . $user . '/files');
     }
     if (!$view) {
         Util::writeLog('search_lucene', 'could not resolve filesystem view', Util::WARN);
         return false;
     }
     $root = $view->getRoot();
     $pk = md5($root . $path);
     // the cache already knows mime and other basic stuff
     $data = $view->getFileInfo($path);
     if (!isset($data['mimetype'])) {
         Util::writeLog('search_lucene', 'need mimetype for content extraction', Util::ERROR);
         return false;
     }
     $mimetype = $data['mimetype'];
     if ('text/html' === $mimetype) {
         $doc = \Zend_Search_Lucene_Document_Html::loadHTML($view->file_get_contents($path));
     } else {
         // FIXME application/msword: the Docx loader uses ZipArchive ... make
         // compatible with OC\Files\Filesystem. Until then msword gets no
         // special treatment and is indexed like any other type.
         $doc = new \Zend_Search_Lucene_Document();
     }
     // store fscacheid as unique id to lookup by when deleting
     $doc->addField(\Zend_Search_Lucene_Field::Keyword('pk', $pk));
     // Store document URL to identify it in the search results
     $doc->addField(\Zend_Search_Lucene_Field::Text('path', $path));
     $doc->addField(\Zend_Search_Lucene_Field::unIndexed('size', $data['size']));
     $doc->addField(\Zend_Search_Lucene_Field::unIndexed('mimetype', $mimetype));
     self::extractMetadata($doc, $path, $view, $mimetype);
     Lucene::updateFile($doc, $path, $user);
     return true;
 }
Ejemplo n.º 7
0
 /**
  * Rebuild the Lucene index of a website from the content of all its pages.
  *
  * An existing index is opened and every live document in it is deleted;
  * otherwise a new index is created. Each page is then added with unindexed
  * "md5" (hash of the page content) and "pageId" fields. When the website has
  * no pages the index is left untouched.
  *
  * @param  string $websiteId
  * @return string the index file location for the website
  * @throws CmsException (code 602) when the website does not exist
  */
 public function indexWebsite($websiteId)
 {
     $websites = new Website('Website');
     if (!$websites->existsWebsiteAlready($websiteId)) {
         throw new CmsException('602', __METHOD__, __LINE__);
     }

     // The business layer must be used for rendering
     $renderBusiness = new BusinessRender('Render');
     $modulService = new Modul('Modul');
     $pages = new Page('Page');
     $pageIds = $pages->getIdsByWebsiteId($websiteId);
     $indexLocation = $this->getIndexFileForWebsite($websiteId);

     if (!is_array($pageIds) || count($pageIds) == 0) {
         return $indexLocation;
     }

     if (file_exists($indexLocation)) {
         // Reuse the existing index, but clear out everything still in it.
         $index = \Zend_Search_Lucene::open($indexLocation);
         $total = $index->numDocs();
         for ($docId = 0; $docId < $total; ++$docId) {
             if ($index->isDeleted($docId)) {
                 continue;
             }
             $index->delete($docId);
         }
     } else {
         $index = \Zend_Search_Lucene::create($indexLocation);
     }

     foreach ($pageIds as $pageId) {
         $content = $this->getPageContent($websiteId, $pageId);
         // Normalised to a strict boolean, exactly like the former if/else.
         $storeContent = (bool) $this->isStoreContentEnabled();
         $pageDocument = \Zend_Search_Lucene_Document_Html::loadHTML($content, $storeContent, 'UTF-8');
         $pageDocument->addField(\Zend_Search_Lucene_Field::unIndexed('md5', md5($content)));
         $pageDocument->addField(\Zend_Search_Lucene_Field::unIndexed('pageId', $pageId));
         $index->addDocument($pageDocument);
     }

     $index->commit();
     $index->optimize();
     unset($index);

     return $indexLocation;
 }
Ejemplo n.º 8
0
 /**
  * Reduce an HTML string to plain text.
  *
  * The markup is first round-tripped through Zend_Search_Lucene_Document_Html,
  * then script elements, style elements and "<! ... -->" constructs are
  * removed, the remaining tags are stripped and runs of whitespace are
  * collapsed into single spaces.
  *
  * @param  string $html
  * @return mixed|string
  */
 protected function getPlainTextFromHtml($html)
 {
     $markup = Zend_Search_Lucene_Document_Html::loadHTML($html, false, "utf-8")->getHTML();

     // scripts, styles and "<! ... -->" blocks go away together with their content
     $blockPatterns = array(
         '@(<script[^>]*?>.*?</script>)@si',
         '@<style[^>]*?>.*?</style>@siU',
         '@<![\\s\\S]*?--[ \\t\\n\\r]*>@',
     );
     $withoutBlocks = preg_replace($blockPatterns, "", $markup);

     // drop the remaining tags, then squeeze whitespace runs
     $plain = strip_tags($withoutBlocks);

     return preg_replace('@[ \\t\\n\\r\\f]+@', " ", $plain);
 }
Ejemplo n.º 9
0
 /**
  * (Re)index one catalog entry.
  *
  * Any document already stored under the catalog's guid is deleted first.
  * A new document is then built from the catalog row and its attributes and
  * added to the index. Catalogs with profile "kutu_doc" are treated as files:
  * their text is extracted via _extractText() and they are only indexed when
  * the resulting "content" field is non-empty.
  *
  * Nothing happens when no catalog row matches the guid.
  *
  * @param string $catalogGuid guid of the catalog to index
  * @return void
  */
 public function indexCatalog($catalogGuid)
 {
     $index = $this->_index;
     $tblCatalog = new Kutu_Core_Orm_Table_Catalog();
     $rowsetCatalog = $tblCatalog->find($catalogGuid);
     if (!count($rowsetCatalog)) {
         // unknown catalog: nothing to index
         return;
     }

     // Remove whatever is already indexed under this guid. Only the ids are
     // needed for that, so the stored documents are not loaded.
     $term = new Zend_Search_Lucene_Index_Term($catalogGuid, 'guid');
     foreach ($index->termDocs($term) as $id) {
         $index->delete($id);
     }

     $rowCatalog = $rowsetCatalog->current();
     $isFile = $rowCatalog->profileGuid == 'kutu_doc';

     $doc = new Zend_Search_Lucene_Document();
     $doc->addField(Zend_Search_Lucene_Field::Keyword('guid', $rowCatalog->guid));

     // A file's parentGuid is the catalog it is related to (if any); a
     // regular catalog is its own parent.
     if ($isFile) {
         $tblRelatedItem = new Kutu_Core_Orm_Table_RelatedItem();
         // NOTE(review): the guid is interpolated into the WHERE clause; it
         // comes from the catalog row, but quoting through the adapter would
         // be safer - confirm the table class exposes one.
         $rowset = $tblRelatedItem->fetchAll("itemGuid='{$rowCatalog->guid}' AND relateAs='RELATED_FILE'");
         if (count($rowset)) {
             $row = $rowset->current();
             $doc->addField(Zend_Search_Lucene_Field::Keyword('parentGuid', $row->relatedGuid));
         }
     } else {
         $doc->addField(Zend_Search_Lucene_Field::Keyword('parentGuid', $rowCatalog->guid));
     }

     $doc->addField(Zend_Search_Lucene_Field::Text('profile', $rowCatalog->profileGuid));
     $doc->addField(Zend_Search_Lucene_Field::Keyword('publishedDate', $this->_filterDateTime($rowCatalog->publishedDate)));
     $doc->addField(Zend_Search_Lucene_Field::Keyword('expiredDate', $this->_filterDateTime($rowCatalog->expiredDate)));
     $doc->addField(Zend_Search_Lucene_Field::Keyword('createdBy', $rowCatalog->createdBy));
     $doc->addField(Zend_Search_Lucene_Field::Keyword('modifiedBy', $rowCatalog->modifiedBy));
     $doc->addField(Zend_Search_Lucene_Field::Keyword('createdDate', $this->_filterDateTime($rowCatalog->createdDate)));
     $doc->addField(Zend_Search_Lucene_Field::Keyword('modifiedDate', $this->_filterDateTime($rowCatalog->modifiedDate)));
     $doc->addField(Zend_Search_Lucene_Field::Keyword('status', $rowCatalog->status));
     $doc->addField(Zend_Search_Lucene_Field::Keyword('objectType', $isFile ? 'file' : 'catalog'));

     $rowsetCatalogAttribute = $rowCatalog->findDependentRowsetCatalogAttribute();
     if (count($rowsetCatalogAttribute)) {
         foreach ($rowsetCatalogAttribute as $rowCatalogAttribute) {
             switch ($rowCatalogAttribute->attributeGuid) {
                 case 'fixedTitle':
                 case 'title':
                     $doc->addField(Zend_Search_Lucene_Field::Text('title', $rowCatalogAttribute->value));
                     break;
                 case 'fixedSubTitle':
                 case 'subTitle':
                     $doc->addField(Zend_Search_Lucene_Field::Text('subtitle', $rowCatalogAttribute->value));
                     break;
                 case 'fixedContent':
                 case 'content':
                     // index only the text of the HTML body
                     $docHtml = Zend_Search_Lucene_Document_Html::loadHTML($rowCatalogAttribute->value);
                     $cleanedText = $docHtml->getFieldValue('body');
                     $doc->addField(Zend_Search_Lucene_Field::UnStored('content', $cleanedText));
                     break;
                 case 'fixedKeywords':
                 case 'keywords':
                     $doc->addField(Zend_Search_Lucene_Field::UnStored('keywords', $rowCatalogAttribute->value));
                     break;
                 case 'fixedDescription':
                 case 'description':
                     $doc->addField(Zend_Search_Lucene_Field::Text('description', $rowCatalogAttribute->value));
                     break;
                 case 'ptsKetua':
                     $doc->addField(Zend_Search_Lucene_Field::Text('judge', $rowCatalogAttribute->value));
                     break;
                 case 'prtNomor':
                 case 'fixedNomor':
                 case 'fixedNumber':
                 case 'nomor':
                 case 'ptsNomor':
                     $doc->addField(Zend_Search_Lucene_Field::UnStored('number', $rowCatalogAttribute->value));
                     break;
                 case 'prtTahun':
                 case 'fixedTahun':
                 case 'fixedYear':
                 case 'tahun':
                 case 'ptsTahun':
                     $doc->addField(Zend_Search_Lucene_Field::UnStored('year', $rowCatalogAttribute->value));
                     break;
                 default:
                     // Any other attribute is indexed under its lower-cased
                     // guid; how the value is prepared depends on the
                     // attribute type (4 is handled as a datetime, 2 as HTML).
                     $tblAttribute = new Kutu_Core_Orm_Table_Attribute();
                     $rowAttribute = $tblAttribute->find($rowCatalogAttribute->attributeGuid)->current();
                     // The attribute definition may be missing; the value is
                     // then indexed as-is instead of dereferencing null.
                     $attributeType = $rowAttribute ? $rowAttribute->type : null;
                     $fieldName = strtolower($rowCatalogAttribute->attributeGuid);
                     if ($attributeType == 4) {
                         $doc->addField(Zend_Search_Lucene_Field::UnStored($fieldName, $this->_filterDateTime($rowCatalogAttribute->value)));
                     } elseif ($attributeType == 2) {
                         $docHtml = Zend_Search_Lucene_Document_Html::loadHTML($rowCatalogAttribute->value);
                         $cleanedText = $docHtml->getFieldValue('body');
                         $doc->addField(Zend_Search_Lucene_Field::UnStored($fieldName, $cleanedText));
                     } else {
                         $doc->addField(Zend_Search_Lucene_Field::UnStored($fieldName, $rowCatalogAttribute->value));
                     }
                     break;
             }
         }
         //if profile=kutu_doc, extract text from its file and put it in content field
         if ($isFile) {
             $row = $rowsetCatalogAttribute->findByAttributeGuid('docSystemName');
             $systemName = $row->value;
             $row = $rowsetCatalogAttribute->findByAttributeGuid('docMimeType');
             $mimeType = $row->value;
             $extractedText = $this->_extractText($rowCatalog->guid, $systemName, $mimeType);
             $doc->addField(Zend_Search_Lucene_Field::UnStored('content', $extractedText));
         }
     }

     // if catalog is a kutu_doc, and if field content empty (this means
     // file can't be read, text can't be extracted, or file empty), do not index
     if ($isFile) {
         $content = $doc->getFieldValue('content');
         if (empty($content)) {
             return;
         }
     }
     $index->addDocument($doc);
 }
Ejemplo n.º 10
0
 /**
  * Breadth-first crawl starting at $url, indexing every page that loads.
  *
  * Each dequeued URL is sanitised and processed at most once. The page is
  * read from the on-disk cache under www/cached when a cached file exists,
  * otherwise it is requested through $this->_client; only 200 responses are
  * parsed. Links of each indexed page are appended to the queue.
  *
  * NOTE: once the number of visited URLs reaches $this->_maxLinks the whole
  * script is terminated with exit, not just this method.
  *
  * @param string $url start URL
  * @return void
  */
 protected function _spider($url)
 {
     $queue = array($url);
     $visited = array();

     while (!empty($queue)) {
         $doc = null;
         $url = $this->_sanitizeUrl(array_shift($queue));
         if (!$url || in_array($url, $visited)) {
             // unusable URL or already processed
             continue;
         }
         $visited[] = $url;
         Bbx_Log::write('Spidering url ' . $url, null, Bbx_Search::LOG);

         $cachePath = APPLICATION_PATH . '/../www/cached' . $url . '.html';
         if (file_exists($cachePath)) {
             Bbx_Log::write('Found file in cache', null, Bbx_Search::LOG);
             try {
                 $doc = Zend_Search_Lucene_Document_Html::loadHTMLFile($cachePath, false, 'utf-8');
             } catch (Exception $e) {
                 Bbx_Log::write('Unable to open file: ' . $cachePath, null, Bbx_Search::LOG);
             }
         } else {
             $this->_client->setUri($this->_getAbsoluteUrl($url));
             try {
                 $response = $this->_client->request();
                 $status = $response->getStatus();
                 Bbx_Log::write('Client response code ' . $status, null, Bbx_Search::LOG);
                 if ($status == '200') {
                     $doc = Zend_Search_Lucene_Document_Html::loadHTML($response->getBody(), false, 'utf-8');
                 }
             } catch (Exception $e) {
                 Bbx_Log::write('Request failed: ' . $e->getMessage(), null, Bbx_Search::LOG);
             }
         }

         if ($doc === null) {
             // page could not be loaded; move on to the next queued URL
             continue;
         }

         $this->_search()->indexDoc($doc, $url);
         $this->_indexed++;

         // Filter against the list maintained by this crawl. The previous
         // code diffed against $this->_visited, which this method never fills.
         $links = array_diff($doc->getLinks(), $visited);
         if (count($visited) < $this->_maxLinks) {
             $queue = array_merge($queue, $links);
         } else {
             Bbx_Log::write('Reached max number of links (' . $this->_maxLinks . '), exiting', null, Bbx_Search::LOG);
             exit;
         }
     }
 }
Ejemplo n.º 11
0
 /**
  * Fetch every page in $this->pageList over HTTP and index the response,
  * then optimise the index.
  *
  * A page whose fetch throws is reported through msg() and skipped; only
  * successful, non-redirect, non-error responses are indexed.
  *
  * @return void
  * @access public
  */
 public function build()
 {
     foreach ($this->pageList as $page) {
         $uri = translateURL("page/{$page['id']}");
         try {
             /**
              * A customised "<controller>_indexable" template takes precedence when it exists.
              * This is a DEPRECATED way to customise indexable content, use getExcludes() instead;
              * such a template also needs its own controller.
              */
             $customTemplate = ONXSHOP_PROJECT_DIR . "templates/node/page/{$page['node_controller']}_indexable.html";
             $toFetch = file_exists($customTemplate)
                 ? "request/sys/html5.node/page/{$page['node_controller']}_indexable~id={$page['id']}~"
                 : "request/sys/html5.node~id={$page['id']}~";

             msg("Fetching page {$page['id']}: {$uri} using {$toFetch}");
             $this->client->setUri($this->profile['uri'] . $toFetch);
             $response = $this->client->request();

             if (!$response->isSuccessful() || $response->isRedirect() || $response->isError()) {
                 continue;
             }

             $filteredBody = $this->filterHtmlDocument($response->getBody());
             $this->index($uri, Zend_Search_Lucene_Document_Html::loadHTML($filteredBody, true));
         } catch (Exception $e) {
             msg("HTTP fetch exception: " . $e->getMessage());
         }
     }

     // Optimize index.
     $this->indexOptimize();
 }
Ejemplo n.º 12
0
 /**
  * Return a short text excerpt for a catalog.
  *
  * The body text of the catalog's 'fixedDescription' attribute is preferred;
  * when that is empty the body text of 'fixedContent' is used. A text with
  * fewer words than $iLimit is returned whole, otherwise it is shortened via
  * Kutu_Core_Util::getNumberOfWords().
  *
  * @param string $catalogGuid catalog to read the attributes from
  * @param int    $iLimit      word limit
  * @return string the excerpt, or '' when both texts are empty
  */
 static function splitWordsFromCatalog($catalogGuid, $iLimit)
 {
     $rawDescription = Kutu_Core_Util::getCatalogAttributeValue($catalogGuid, 'fixedDescription');
     $rawContent = Kutu_Core_Util::getCatalogAttributeValue($catalogGuid, 'fixedContent');

     $descriptionDoc = Zend_Search_Lucene_Document_Html::loadHTML($rawDescription);
     $contentDoc = Zend_Search_Lucene_Document_Html::loadHTML($rawContent);

     // candidates in order of preference
     $candidates = array(
         $descriptionDoc->getFieldValue('body'),
         $contentDoc->getFieldValue('body'),
     );

     foreach ($candidates as $text) {
         if (empty($text)) {
             continue;
         }
         return $iLimit > str_word_count($text)
             ? $text
             : Kutu_Core_Util::getNumberOfWords($text, $iLimit);
     }

     return '';
 }
Ejemplo n.º 13
0
 /**
  * Index an HTML string with Zend_Lucene under the given URI.
  *
  * The index in var/index is opened, or created when opening fails. Existing
  * hits for the same URI are deleted before the new document is added and
  * committed. Nothing is indexed when the index directory does not exist or
  * the index can neither be opened nor created.
  *
  * @param string $uri        value stored in the document's "uri" keyword field
  * @param string $htmlString HTML source to index
  * @return void
  */
 function indexContent($uri, $htmlString)
 {
     require_once 'Zend/Search/Lucene.php';
     $index_location = ONXSHOP_PROJECT_DIR . 'var/index';

     // Defined up front so the check below is valid even when the index
     // directory is missing (it used to read an undefined variable).
     $index = false;
     if (is_dir($index_location)) {
         // Open existing index
         try {
             $index = Zend_Search_Lucene::open($index_location);
         } catch (Exception $openException) {
             // Create index
             try {
                 $index = Zend_Search_Lucene::create($index_location);
             } catch (Exception $createException) {
                 $index = false;
             }
         }
     }

     if (!$index) {
         return;
     }

     // find and remove pages with the same URI
     $hits = $index->find("uri:" . $uri);
     foreach ($hits as $hit) {
         $index->delete($hit);
     }
     $doc = Zend_Search_Lucene_Document_Html::loadHTML($htmlString, true);
     $doc->addField(Zend_Search_Lucene_Field::Keyword('uri', $uri));
     $index->addDocument($doc);
     $index->commit();
 }
Ejemplo n.º 14
0
 /**
  * Index an article.
  *
  * When the article's fulltext (or, failing that, its introtext) starts with
  * an XML declaration, the XML is transformed with the jucene.xsl stylesheet
  * and every AssociationRule element of the result is passed to the
  * 'onIndexPmml' content plugins. Otherwise the fulltext is indexed as an
  * HTML document.
  *
  * @param object $article article with id, fulltext and introtext properties
  * @param bool   $isNew   when false, documents stored under the article's
  *                        pk are removed from the index first
  * @return void
  */
 function onIndexContent($article, $isNew = false)
 {
     //FIXME move the content type tests and following transformations to the helper
     $pk = $article->id;
     if (!$isNew) {
         JuceneHelper::removeFromIndex('pk:' . $pk);
     }
     $index = JuceneHelper::getIndex();
     $xml_field = substr($article->fulltext, 0, 5) != '<?xml' ? $article->introtext : $article->fulltext;

     if (substr($xml_field, 0, 5) != '<?xml') {
         // Plain article. The encoding argument used to be the expression
         // UTF - 8 (an undefined constant minus eight), not the string.
         $zendDoc = Zend_Search_Lucene_Document_Html::loadHTML($article->fulltext, false, 'UTF-8');
         $index->addDocument($zendDoc);
         return;
     }

     $dom = new DOMDocument();
     $xslt = new DOMDocument();
     $error = false;
     //load xslt stylesheet
     if (!@$xslt->load(JPATH_SITE . DS . 'administrator' . DS . 'components' . DS . 'com_jucene' . DS . 'xslt/jucene.xsl')) {
         $error = true;
         $this->raiseMessage("XSLTLOADERROR", 'error');
     }
     $proc = new XSLTProcessor();
     if (!$proc->importStylesheet($xslt)) {
         $error = true;
         $this->raiseMessage("XSLTIMPORTERROR", 'error');
     }
     // The two unset() calls that stood here named variables that do not
     // exist in this scope ($artcile, $record) and so did nothing; they
     // were dropped rather than "fixed", because unsetting properties of
     // $article would alter the caller's object.

     if (!$dom->loadXML($xml_field) || $error) {
         return;
     }

     //simplify the document - prepare it for the indexation process
     $xslOutput = $proc->transformToXml($dom);
     //create new DOM document to preserve output and transform the XML to the indexable one
     $transXml = new DOMDocument();
     // NOTE(review): DOMDocument's property is spelled preserveWhiteSpace;
     // with this spelling the assignment has no effect on parsing. Left
     // unchanged because the plugins receiving the nodes may depend on the
     // current output - confirm before correcting.
     $transXml->preserveWhitespace = false;
     @$transXml->loadXML($xslOutput);
     //unset unnecessary variables
     unset($xslOutput);
     unset($dom);
     unset($xslt);

     //index every assoc rule as document with same credentials
     $rules = $transXml->getElementsByTagName("AssociationRule");
     if ($rules->length == 0) {
         $this->raiseMessage('XMLDOCUMENTNORULES', 'error');
     }
     $rule_doc_position = 0;
     foreach ($rules as $rule) {
         $additional = array(
             'rating' => 0,
             'position' => $rule_doc_position,
         );
         JPluginHelper::importPlugin('content');
         $dispatcher =& JDispatcher::getInstance();
         $dispatcher->trigger('onIndexPmml', array($rule, $additional));
         $rule_doc_position++;
     }
 }
Ejemplo n.º 15
0
 /**
  * Index a file.
  *
  * Starts from a plain Lucene document and, for files available locally,
  * replaces or fills it according to the file type: text/plain, text/html
  * and application/pdf are recognised by mimetype; docx, xlsx, pptx, odt and
  * ods by file-name extension. The pk, filename, path, size and mimetype
  * fields are then attached and the document is handed to
  * Lucene::updateFile().
  *
  * @author Jörn Dreyer <*****@*****.**>
  *
  * @param string      $path the path of the file
  * @param string|null $user owner whose files view is used; when null the
  *                          current filesystem view and current user are used
  *
  * @return bool true when the file was indexed or has vanished, false when
  *              the path is invalid or empty, no view could be resolved, or
  *              the file info carries no mimetype
  */
 public static function indexFile($path = '', $user = null)
 {
     if (!Filesystem::isValidPath($path)) {
         // was a bare "return;" (null) although the contract is bool
         return false;
     }
     if ($path === '') {
         //ignore the empty path element
         return false;
     }
     if (is_null($user)) {
         $view = Filesystem::getView();
         $user = \OCP\User::getUser();
     } else {
         $view = new \OC\Files\View('/' . $user . '/files');
     }
     if (!$view) {
         Util::writeLog('search_lucene', 'could not resolve filesystem view', Util::WARN);
         return false;
     }
     if (!$view->file_exists($path)) {
         Util::writeLog('search_lucene', 'file vanished, ignoring', Util::DEBUG);
         return true;
     }
     $root = $view->getRoot();
     $pk = md5($root . $path);
     // the cache already knows mime and other basic stuff
     $data = $view->getFileInfo($path);
     if (!isset($data['mimetype'])) {
         Util::writeLog('search_lucene', 'need mimetype for content extraction', Util::ERROR);
         return false;
     }
     $mimeType = $data['mimetype'];
     // initialize plain lucene document
     $doc = new \Zend_Search_Lucene_Document();
     // index content for local files only
     $localFile = $view->getLocalFile($path);
     if ($localFile) {
         // The office formats are matched on the file-name extension: the
         // zend classes only understand docx and not doc files, and the
         // mimetype does not tell them apart.
         // FIXME distinguish doc and docx, xls and xlsx, ppt and pptx, in oc core mimetype helper ...
         $lowerName = strtolower($data['name']);
         //try to use special lucene document types
         if ('text/plain' === $mimeType) {
             $body = $view->file_get_contents($path);
             if ($body != '') {
                 $doc->addField(\Zend_Search_Lucene_Field::UnStored('body', $body));
             }
         } elseif ('text/html' === $mimeType) {
             //TODO could be indexed, even if not local
             $doc = \Zend_Search_Lucene_Document_Html::loadHTML($view->file_get_contents($path));
         } elseif ('application/pdf' === $mimeType) {
             $doc = Pdf::loadPdf($view->file_get_contents($path));
         } elseif (substr($lowerName, -5) === '.docx') {
             $doc = \Zend_Search_Lucene_Document_Docx::loadDocxFile($localFile);
         } elseif (substr($lowerName, -5) === '.xlsx') {
             $doc = \Zend_Search_Lucene_Document_Xlsx::loadXlsxFile($localFile);
         } elseif (substr($lowerName, -5) === '.pptx') {
             $doc = \Zend_Search_Lucene_Document_Pptx::loadPptxFile($localFile);
         } elseif (substr($lowerName, -4) === '.odt') {
             $doc = Odt::loadOdtFile($localFile);
         } elseif (substr($lowerName, -4) === '.ods') {
             $doc = Ods::loadOdsFile($localFile);
         }
     }
     // Store filecache id as unique id to lookup by when deleting
     $doc->addField(\Zend_Search_Lucene_Field::Keyword('pk', $pk));
     // Store filename
     $doc->addField(\Zend_Search_Lucene_Field::Text('filename', $data['name'], 'UTF-8'));
     // Store document path to identify it in the search results
     $doc->addField(\Zend_Search_Lucene_Field::Text('path', $path, 'UTF-8'));
     $doc->addField(\Zend_Search_Lucene_Field::unIndexed('size', $data['size']));
     $doc->addField(\Zend_Search_Lucene_Field::unIndexed('mimetype', $mimeType));
     //self::extractMetadata($doc, $path, $view, $mimeType);
     Lucene::updateFile($doc, $path, $user);
     return true;
 }
Ejemplo n.º 16
0
 /**
  * Builds highlighted excerpts of a page around each word of a search query.
  *
  * The page is fetched over HTTP, passed through deleteTags(), reduced to the
  * text of its "body" field by Zend_Search_Lucene_Document_Html and then split
  * around every occurrence of each query word. The letter "e" and its accented
  * variants are matched interchangeably.
  *
  * @param string $pathInfo        url path of the page
  * @param string $Query           search query; words are separated by single spaces
  * @param int    $MaxResultByWord maximum number of excerpts kept for each word
  * @param string $class           CSS class of the highlight span; an inline style is used when empty
  * @param int    $MaxLimitCara    length budget handed (halved) to the string manager's truncate(); 0 disables truncation
  * @return string the excerpts joined by a space, or '' when nothing matched, the page could not be read, or an exception was thrown
  * @access    public
  *
  * @author Etienne de Longeaux <*****@*****.**>
  * @since 2012-06-11
  */
 public function contentPage($pathInfo, $Query = null, $MaxResultByWord = 5, $class = "", $MaxLimitCara = 0)
 {
     // Must be an array from the start: implode() on null fails when nothing matches.
     $result_search = array();
     $searchWords = explode(' ', strtolower((string) $Query));
     // Single-pass replacement map: every "e"-like letter matches any of its accented variants.
     $accentClasses = array(
         'e' => '[éèeêë]{1,2}',
         'é' => '[ée]{1,2}',
         'è' => '[èe]{1,2}',
         'ê' => '[êe]{1,2}',
         'ë' => '[ëe]{1,2}',
     );
     try {
         // we get the content of the page.
         $body = file_get_contents($this->container->get('request')->getUriForPath('') . $pathInfo);
         if ($body === false) {
             return '';
         }
         // we delete all contents of tags which are given in params (and all tags which are inside).
         $body = $this->deleteTags($body);
         // we get the only words of the body content of the page.
         $body = \Zend_Search_Lucene_Document_Html::loadHTML($body, false)->getFieldUtf8Value('body');
         foreach ($searchWords as $word) {
             // An empty word would produce an empty pattern that splits the body at every position.
             if ($word === '') {
                 continue;
             }
             // Quote the user input first so regex metacharacters and the "#" delimiter are literal,
             // then widen the "e" letters; strtr() never re-scans the text it has just inserted.
             $pattern = strtr(preg_quote($word, '#'), $accentClasses);
             // "u" makes the accent classes match whole UTF-8 characters instead of single bytes.
             $matches_word = preg_split("#{$pattern}#iu", $body);
             if ($matches_word === false) {
                 // Invalid UTF-8 in the subject or pattern: fall back to byte-wise matching.
                 $matches_word = preg_split("#{$pattern}#i", $body);
             }
             if ($matches_word === false) {
                 continue;
             }
             if (($MaxLimitCara - strlen($word)) % 2 == 0) {
                 $maxLimitSegment = ($MaxLimitCara - strlen($word)) / 2;
             } else {
                 $maxLimitSegment = ($MaxLimitCara - strlen($word) + 1) / 2;
             }
             // The query is user input and ends up inside HTML markup, so escape it (and the class name).
             $highlight = htmlspecialchars(strtoupper($word), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
             if (!empty($class)) {
                 $safeClass = htmlspecialchars($class, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
                 $highlightSpan = "<span class='{$safeClass}' >" . $highlight . '</span>';
             } else {
                 $highlightSpan = "<span style='color:white;background-color:black;font-size:13;font-weight:bold;' >" . $highlight . '</span>';
             }
             foreach ($matches_word as $index => $value) {
                 if ($index < intval($MaxResultByWord)) {
                     if ($MaxLimitCara != 0) {
                         // Keep the END of the text preceding the match:
                         // reverse the word order, truncate, then restore the order.
                         $words = explode(' ', $value);
                         $words_inverse = array_reverse($words);
                         $inverse_chaine = implode(' ', $words_inverse);
                         $inverse_chaine = $this->container->get('sfynx.tool.string_manager')->truncate($inverse_chaine, $maxLimitSegment, '...');
                         $words_inverse = explode(' ', $inverse_chaine);
                         $words = array_reverse($words_inverse);
                         $Contents = implode(' ', $words);
                     } else {
                         $Contents = $value;
                     }
                     // A following segment exists only when the word was found after this segment.
                     if (isset($matches_word[$index + 1])) {
                         $Contents .= $highlightSpan;
                         if ($MaxLimitCara == 0) {
                             $Contents .= $matches_word[$index + 1];
                         } else {
                             $Contents .= $this->container->get('sfynx.tool.string_manager')->truncate($matches_word[$index + 1], $maxLimitSegment, '');
                         }
                     }
                     $result_search[] = $Contents;
                 }
             }
         }
         return implode(' ', $result_search);
     } catch (\Exception $e) {
         return '';
     }
 }
Ejemplo n.º 17
0
 /**
  * Crawl a URI
  *
  * Records the URI as processed, and, when validateLinkForCrawl() accepts it,
  * requests it with the HTTP client and indexes the response body under the
  * URI's path. Only successful, non-redirect, non-error responses are indexed.
  *
  * @param string $uri
  * @return void
  * @access protected
  */
 protected function crawl($uri)
 {
     msg("Crawling: {$uri}");
     $this->uriProcessed[] = $uri;
     // Validate once; the result decides whether anything is fetched at all.
     if (!$this->validateLinkForCrawl($uri)) {
         return;
     }
     // parse_url() returns false on malformed input and omits 'path' for
     // host-only URIs such as "http://example.com"; index those as the root.
     $uri_parts = parse_url($uri);
     $path = is_array($uri_parts) && isset($uri_parts['path']) ? $uri_parts['path'] : '/';
     // Retrieve the content
     $this->client->setUri($uri);
     try {
         $response = $this->client->request();
         if ($response->isSuccessful() && !$response->isRedirect() && !$response->isError()) {
             $this->index($path, Zend_Search_Lucene_Document_Html::loadHTML($response->getBody(), true));
         }
     } catch (Exception $e) {
         // Keep crawling the remaining URIs, but do not hide the failure.
         msg("Crawl failed for {$uri}: " . $e->getMessage());
     }
 }
 /**
  * The title must still be extracted when the opening HTML tag carries attributes.
  *
  * @group ZF-10686
  */
 public function testLoadHtmlWithAttributesInTagHTML()
 {
     $markup = '<HTML lang="en_US">'
         . '<HEAD><TITLE>Page title</TITLE></HEAD>'
         . '<BODY>Document body.</BODY>'
         . '</HTML>';
     $document = Zend_Search_Lucene_Document_Html::loadHTML($markup);
     $this->assertEquals('Page title ', $document->title);
 }
Ejemplo n.º 19
0
/**
 * Rebuilds the Lucene search index from the website files and the configured dynamic URLs.
 *
 * Static .html/.htm files are read from disk; .php files and dynamic URLs are
 * fetched through get_url_content() when they answer with HTTP 200. Everything
 * before the first "<head" of each page is replaced by a bare "<html>" tag
 * before the page is handed to Zend_Search_Lucene_Document_Html.
 *
 * The outcome is reported through the globals $success_msg, $warning_msg
 * (some dynamic URLs were broken) or $error_msg.
 *
 * @return void
 */
function rebuild_search_indexes()
{
    global $success_msg;
    global $error_msg;
    global $warning_msg;
    global $all_settings;
    global $indexable_folders;
    $index_folder = get_setting('search_indexes_folder', $all_settings);
    try {
        setlocale(LC_CTYPE, LOCALE);
        Zend_Search_Lucene_Analysis_Analyzer::setDefault(new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8_CaseInsensitive());
        $index = new Zend_Search_Lucene($index_folder, true);
        $files_to_index = get_website_files($indexable_folders);
        foreach ($files_to_index as $html_file => $page_url) {
            if (!can_index_website_file($html_file)) {
                continue;
            }
            // Reset on every iteration: otherwise a file that is skipped below
            // would be indexed with the content left over from the previous file.
            $file_content = null;
            $f1 = strtolower($html_file);
            if (end_with($f1, 'html') || end_with($f1, 'htm')) {
                $file_content = file_get_contents($html_file);
            } elseif (end_with($f1, 'php')) {
                if (is_http_code_200($page_url)) {
                    $file_content = get_url_content($page_url);
                }
            }
            // Nothing was read (unsupported extension, non-200 page or read failure).
            if ($file_content === null || $file_content === false) {
                continue;
            }
            $file_content = '<html>' . strstr($file_content, '<head');
            $doc = Zend_Search_Lucene_Document_Html::loadHTML($file_content, true, 'UTF-8');
            $doc->addField(Zend_Search_Lucene_Field::Text('url', $page_url, 'UTF-8'));
            $index->addDocument($doc);
            flush();
        }
        $broken_urls = array();
        foreach (get_dynamic_urls(get_setting('search_dynamic_pages', $all_settings)) as $url) {
            if (is_http_code_200($url)) {
                $content = get_url_content($url);
                $content = '<html>' . strstr($content, '<head');
                $doc = Zend_Search_Lucene_Document_Html::loadHTML($content, true, 'UTF-8');
                $doc->addField(Zend_Search_Lucene_Field::Text('url', $url, 'UTF-8'));
                $index->addDocument($doc);
                flush();
            } else {
                array_push($broken_urls, $url);
            }
        }
        if (file_exists($index_folder)) {
            if (count($broken_urls) > 0) {
                $warning_msg = '<p>The website was successfully indexed, but the following URL\'s were skipped because they are broken:</p>';
                $warning_msg .= '<ul class="disc">';
                foreach ($broken_urls as $broken_url) {
                    // The URL is placed in an attribute and in text: escape it for HTML.
                    $safe_url = htmlspecialchars($broken_url, ENT_QUOTES, 'UTF-8');
                    $warning_msg .= '<li><a href="' . $safe_url . '">' . $safe_url . '</a></li>';
                }
                $warning_msg .= '</ul>';
                $warning_msg .= '<p>Please remove them from the "List of dynamic pages" field.</p>';
            } else {
                $success_msg = 'The website was successfully indexed.';
            }
        } else {
            $error_msg = 'An error occurred during the website indexing. The error message is: the folder that stores the website indexes couldn\'t be created';
        }
    } catch (Exception $e) {
        $error_msg = 'An error occurred during the website indexing. The error message is: ' . $e->getMessage();
    }
}
Ejemplo n.º 20
0
 /**
  * Returns the response body re-serialised by Zend_Search_Lucene_Document_Html.
  *
  * @access private
  * @param Zend_Response $response
  * @return string|false the document HTML, or false when the status is not 200
  */
 private function _getHTMLResponse($response)
 {
     // Anything other than a plain 200 is reported as a failure.
     if ($response->getStatus() !== 200) {
         return false;
     }
     $document = Zend_Search_Lucene_Document_Html::loadHTML($response->getBody());

     return $document->getHTML();
 }
Ejemplo n.º 21
0
/**
 * Rebuilds the Lucene search index from the site's HTML files and the configured dynamic URLs.
 *
 * HTML files found under this script's directory are read from disk; dynamic
 * URLs are fetched when their first response header line mentions status 200.
 * Everything before the first "<head" of each page is replaced by a bare
 * "<html>" tag before the page is handed to Zend_Search_Lucene_Document_Html.
 *
 * The outcome is reported through the globals $success_msg, $warning_msg
 * (some dynamic URLs were broken) or $error_msg.
 *
 * @return void
 */
function rebuild_search_indexes()
{
    global $success_msg;
    global $error_msg;
    global $warning_msg;
    global $all_settings;
    $index_folder = get_setting('search_indexes_folder', $all_settings);
    try {
        $index = new Zend_Search_Lucene($index_folder, true);
        setlocale(LC_CTYPE, 'en_US');
        foreach (get_all_html_files(dirname(__FILE__)) as $html_file => $html_url) {
            if (!can_index_html_file($html_file)) {
                continue;
            }
            $file_content = file_get_contents($html_file);
            // An unreadable file would otherwise be indexed as an empty document.
            if ($file_content === false) {
                continue;
            }
            $file_content = '<html>' . strstr($file_content, '<head');
            $doc = Zend_Search_Lucene_Document_Html::loadHTML($file_content);
            $doc->addField(Zend_Search_Lucene_Field::Text('url', $html_url, 'UTF-8'));
            $index->addDocument($doc);
            flush();
        }
        $broken_urls = array();
        foreach (get_dynamic_urls(get_setting('search_dynamic_pages', $all_settings)) as $url) {
            // get_headers() returns false when the request fails; treat that as a broken URL.
            $headers = get_headers($url);
            $is_ok = is_array($headers) && isset($headers[0]) && strpos($headers[0], '200') !== false;
            $content = $is_ok ? file_get_contents($url) : false;
            if ($content === false) {
                array_push($broken_urls, $url);
                continue;
            }
            $content = '<html>' . strstr($content, '<head');
            $doc = Zend_Search_Lucene_Document_Html::loadHTML($content);
            $doc->addField(Zend_Search_Lucene_Field::Text('url', $url, 'UTF-8'));
            $index->addDocument($doc);
            flush();
        }
        if (file_exists($index_folder)) {
            if (count($broken_urls) > 0) {
                $warning_msg = '<p>The website was successfully indexed, but the following URL\'s were skipped because they are broken:</p>';
                $warning_msg .= '<ul class="disc">';
                foreach ($broken_urls as $broken_url) {
                    // The URL is placed in an attribute and in text: escape it for HTML.
                    $safe_url = htmlspecialchars($broken_url, ENT_QUOTES, 'UTF-8');
                    $warning_msg .= '<li><a href="' . $safe_url . '">' . $safe_url . '</a></li>';
                }
                $warning_msg .= '</ul>';
                $warning_msg .= '<p>Please remove them from the "List of dynamic pages" field.</p>';
            } else {
                $success_msg = 'The website was successfully indexed.';
            }
        } else {
            $error_msg = 'An error occurred during the website indexing. The error message is: the folder that stores the website indexes couldn\'t be created';
        }
    } catch (Exception $e) {
        $error_msg = 'An error occurred during the website indexing. The error message is: ' . $e->getMessage();
    }
}
Ejemplo n.º 22
0
 /**
  * Highlights the matches inside an HTML fragment and returns the fragment
  * alone, without the wrapping HTML header and body tags.
  *
  * The fragment is converted to UTF-8 and wrapped in a minimal document that
  * declares a UTF-8 charset before it is parsed.
  *
  * @param string $inputHtmlFragment
  * @param string  $encoding   Input HTML string encoding
  * @param Zend_Search_Lucene_Search_Highlighter_Interface|null $highlighter  falls back to the default highlighter when null
  * @return string
  */
 public function htmlFragmentHighlightMatches($inputHtmlFragment, $encoding = 'UTF-8', $highlighter = null)
 {
     $activeHighlighter = $highlighter === null
         ? new Zend_Search_Lucene_Search_Highlighter_Default()
         : $highlighter;

     // Wrap the converted fragment so the parser sees a complete UTF-8 document.
     $documentStart = '<html><head><META HTTP-EQUIV="Content-type" CONTENT="text/html; charset=UTF-8"/></head><body>';
     $documentEnd = '</body></html>';
     $utf8Fragment = iconv($encoding, 'UTF-8//IGNORE', $inputHtmlFragment);

     $document = Zend_Search_Lucene_Document_Html::loadHTML($documentStart . $utf8Fragment . $documentEnd);
     $activeHighlighter->setDocument($document);
     $this->_highlightMatches($activeHighlighter);

     return $document->getHtmlBody();
 }
Ejemplo n.º 23
0
 /**
  * Loads a document of the given type, normalises its text and adds it to the Lucene index.
  *
  * For the file based types the document is loaded from $pathFile and the
  * file's modification time becomes the 'Key' value. For the "page" type the
  * document is built from the HTML held in $indexValues['Contents']. In every
  * case 'Contents' is replaced by the document's "body" field, then passed
  * through the PiStringManager helpers and stripped of the locale's stop words
  * before the fields are added.
  *
  * Any other $type loads nothing and the index is returned untouched.
  *
  * @param \Zend_Search_Lucene_Proxy $Index       The Lucene index object.
  * @param string                    $type        one of 'html', 'docx', 'xsls', 'pptx', 'page'
  * @param array                     $indexValues values handed to defaultAddFields(); 'Key' and 'Contents' are overwritten as described above
  * @param string                    $locale      locale used for the stop words and for setlocale()
  * @param object                    $obj         not used by this method
  * @param string                    $pathFile    path of the file to load (file based types only)
  *
  * @return \Zend_Search_Lucene_Proxy
  * @access    public
  * @static
  * @author Etienne de Longeaux <*****@*****.**>
  * @since 2012-06-11
  */
 public static function index(\Zend_Search_Lucene_Proxy $Index, $type, $indexValues = null, $locale = '', $obj = null, $pathFile = '')
 {
     // ignore invalid characters for lucene text search
     \Zend_Search_Lucene_Search_QueryParser::setDefaultEncoding('utf-8');
     \Zend_Search_Lucene_Analysis_Analyzer::setDefault(new \Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8_CaseInsensitive());
     self::$_index = $Index;
     self::$_doc = null;

     // Step 1: build the document with the loader matching the requested type.
     $typeHandled = false;
     $loadedFromFile = false;
     switch ($type) {
         case "html":
             self::$_doc = \Zend_Search_Lucene_Document_Html::loadHtmlFile($pathFile, false);
             $typeHandled = $loadedFromFile = true;
             break;
         case "docx":
             self::$_doc = \Zend_Search_Lucene_Document_Docx::loadDocxFile($pathFile, false);
             $typeHandled = $loadedFromFile = true;
             break;
         case "xsls":
             self::$_doc = \Zend_Search_Lucene_Document_Xlsx::loadXlsxFile($pathFile, false);
             $typeHandled = $loadedFromFile = true;
             break;
         case "pptx":
             self::$_doc = \Zend_Search_Lucene_Document_Pptx::loadPptxFile($pathFile, false);
             $typeHandled = $loadedFromFile = true;
             break;
         case "page":
             self::$_doc = \Zend_Search_Lucene_Document_Html::loadHTML($indexValues['Contents'], false);
             $typeHandled = true;
             break;
     }

     // Step 2: file based documents are keyed by modification time; every
     // handled type takes its contents from the document's body field.
     if ($loadedFromFile) {
         $indexValues['Key'] = filemtime($pathFile);
     }
     if ($typeHandled) {
         $indexValues['Contents'] = self::$_doc->getFieldUtf8Value('body');
     }

     // Nothing to index unless a real Lucene document was produced.
     if (!(self::$_doc instanceof \Zend_Search_Lucene_Document)) {
         return self::$_index;
     }

     // Step 3: normalise the text (accents, duplicate words, clean-up).
     $text = $indexValues['Contents'];
     $text = \Sfynx\ToolBundle\Util\PiStringManager::minusculesSansAccents($text);
     $text = \Sfynx\ToolBundle\Util\PiStringManager::uniqueWord($text);
     $text = \Sfynx\ToolBundle\Util\PiStringManager::cleanContent($text);

     // Step 4: drop the stop words of the locale, when a list is available.
     $stopWord = \Sfynx\ToolBundle\Util\PiStringManager::stopWord(strtolower($locale));
     if ($stopWord) {
         $text = implode(' ', array_diff(explode(' ', $text), $stopWord));
     }
     $indexValues['Contents'] = $text;

     // Step 5: add the document, retrying once under the fr_FR locale on failure.
     try {
         setlocale(LC_ALL, $locale);
         self::defaultAddFields($indexValues);
         self::addDocument();
     } catch (\Exception $firstFailure) {
         setlocale(LC_ALL, 'fr_FR');
         self::defaultAddFields($indexValues);
         try {
             self::addDocument();
         } catch (\Exception $retryFailure) {
         }
     }

     // Return the Lucene index object.
     return self::$_index;
 }