/**
 * Recursively crawls a page of the current site and adds it to the index.
 *
 * A URL without "http://" is prefixed with HTTP_HOST and a single trailing
 * slash is dropped. URLs already listed in $this->_indexedUrl, or that do not
 * contain HTTP_HOST, are skipped. The page text (restricted to the
 * <!--index-->...<!--/index--> region when present, with scripts and tags
 * removed) is stored in the "content" field, then every link whose last
 * dot-separated segment is php/html/txt/htm, or which contains no dot at
 * all, is crawled in turn.
 *
 * @param string $url Absolute or site-relative URL to index.
 * @return void
 */
protected function _indexate($url)
{
    if (!stristr($url, 'http://')) {
        $url = HTTP_HOST . $url;
    }
    $url = substr($url, -1) == '/' ? substr($url, 0, -1) : $url;

    if (in_array($url, $this->_indexedUrl, true) || !stristr($url, HTTP_HOST)) {
        return;
    }

    // Record the URL before fetching so cyclic links cannot recurse forever.
    array_push($this->_indexedUrl, $url);

    $html = file_get_contents($url);
    if ($html === false) {
        // Fetch failed (PHP has already raised a warning); there is nothing to index.
        return;
    }

    // Silence libxml parse warnings for sloppy markup, then restore the
    // caller's setting instead of forcing it to false.
    $previousLibxmlState = libxml_use_internal_errors(true);
    $doc = Zend_Search_Lucene_Document_Html::loadHTML($html);
    libxml_use_internal_errors($previousLibxmlState);

    if (preg_match('/<\\!--index-->(.*)<\\!--\\/index-->/isu', $html, $matches)) {
        $html = $matches[1];
    }
    $html = preg_replace('#<script(.*?)>(.*?)</script>#is', '', $html);
    $html = strip_tags($html);

    $doc->addField(Zend_Search_Lucene_Field::Text('content', $html, 'utf-8'));
    $doc->addField(Zend_Search_Lucene_Field::UnIndexed('body', '', 'utf-8'));
    $doc->addField(Zend_Search_Lucene_Field::Text('url', $url, 'utf-8'));
    $this->_indexHandle->addDocument($doc);
    Zend_Registry::get('Logger')->info('Search index is created: ' . $url, Zend_Log::INFO);

    foreach ($doc->getLinks() as $link) {
        $temp = explode('.', $link);
        $ext = end($temp);
        // $link == $ext means the link contains no dot, i.e. has no extension.
        if ($link == $ext || in_array($ext, array('php', 'html', 'txt', 'htm'))) {
            $this->_indexate($link);
        }
    }
}
/**
 * Inserts the provided action.
 *
 * As written this method is a stub: it returns null when shouldIndex() is
 * false and otherwise always throws sfException. Every statement after the
 * throw is unreachable and is kept only as a sketch of the intended
 * implementation.
 *
 * @throws sfException whenever shouldIndex() returns true
 */
public function insert()
{
    if (!$this->shouldIndex()) {
        return;
    }

    // Not implemented yet; nothing below this line is executed.
    throw new sfException(__CLASS__ . ' not implemented');

    // NOTE(review): $params used below is never assigned in this method;
    // presumably extract() is expected to define it - confirm before enabling.
    extract($this->getActionProperties());
    $output = $this->executeAction($params);
    $content = $output->getContent();
    $doc = Zend_Search_Lucene_Document_Html::loadHtml($content);
    $doc->addField('sfl_title', $output->getLastTitle(), 2);
    $doc->addField('sfl_uri', $this->getUri($params));
    $doc->addField('sfl_description', $content);
    $doc->addField('sfl_type', 'action');
    $categories = $this->getActionCategories();
    if (count($categories)) {
        foreach ($categories as $category) {
            $this->addCategory($category);
        }
        $doc->addField('sfl_category', implode(', ', $categories));
    }
    // The serialized list is stored even when there are no categories.
    $doc->addField('sfl_categories_cache', serialize($categories));
    $guid = $this->getGuid($params);
    $this->addDocument($doc, $guid, 'action');
    $this->getSearch()->getEventDispatcher()->notify(new sfEvent($this, 'indexer.log', array('Inserted action "%s" of module "%s" to index', $this->getAction(), $this->getModule())));
    return $this;
}
/**
 * Highlights the given words in the attached document.
 *
 * The color at the current palette position is used, and the position then
 * advances by one, wrapping around at the end of $this->_highlightColors.
 *
 * @param string|array $words Words to highlight, as a string or an array.
 */
public function highlight($words)
{
    $paletteSize = count($this->_highlightColors);
    $activeColor = $this->_highlightColors[$this->_currentColorIndex];

    // Move to the next color for the following call.
    $this->_currentColorIndex = ($this->_currentColorIndex + 1) % $paletteSize;

    $this->_doc->highlight($words, $activeColor);
}
/**
 * Highlights the matches of this query in an HTML string.
 *
 * @param string $inputHTML HTML to parse and highlight.
 * @return string The HTML produced by the parsed document after highlighting.
 */
public function highlightMatches($inputHTML)
{
    // Color numbering starts at 0; highlightMatchesDOM() receives it by reference.
    $colorIndex = 0;
    $htmlDocument = Zend_Search_Lucene_Document_Html::loadHTML($inputHTML);

    $this->highlightMatchesDOM($htmlDocument, $colorIndex);

    return $htmlDocument->getHTML();
}
/**
 * Checks HTML loading from a string (with highlighting and exact
 * serialized output) and from a file (header links and body links).
 */
public function testHtml()
{
    $sourceHtml = '<HTML><HEAD><TITLE>Page title</TITLE></HEAD><BODY>Document body.</BODY></HTML>';
    $expectedHtml = "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\" \"http://www.w3.org/TR/REC-html40/loose.dtd\">\n<html>\n<head><title>Page title</title></head>\n<body><p><b style=\"color:black;background-color:#66ffff\">Document</b> body.</p></body>\n</html>\n";

    // Document built from an in-memory string.
    $stringDoc = Zend_Search_Lucene_Document_Html::loadHTML($sourceHtml);
    $this->assertTrue($stringDoc instanceof Zend_Search_Lucene_Document_Html);

    $stringDoc->highlight('document', '#66ffff');
    $this->assertEquals($stringDoc->getHTML(), $expectedHtml);

    // Document built from a fixture file, with content storing enabled.
    $fixturePath = dirname(__FILE__) . '/_files/_indexSource/contributing.documentation.html';
    $fileDoc = Zend_Search_Lucene_Document_Html::loadHTMLFile($fixturePath, true);
    $this->assertTrue($fileDoc instanceof Zend_Search_Lucene_Document_Html);

    $expectedHeaderLinks = array('index.html', 'contributing.html', 'contributing.bugs.html', 'contributing.wishlist.html');
    $this->assertTrue(array_values($fileDoc->getHeaderLinks()) == $expectedHeaderLinks);

    $expectedLinks = array('contributing.bugs.html', 'contributing.wishlist.html', 'developers.documentation.html', 'faq.translators-revision-tracking.html', 'index.html', 'contributing.html');
    $this->assertTrue(array_values($fileDoc->getLinks()) == $expectedLinks);
}
/**
 * Checks HTML loading from a string (the highlighted fragment must appear
 * in the output) and from a file (header links and body links).
 */
public function testHtml()
{
    $sourceHtml = '<HTML><HEAD><TITLE>Page title</TITLE></HEAD><BODY>Document body.</BODY></HTML>';
    $highlightedFragment = "<b style=\"color:black;background-color:#66ffff\">Document</b> body.";

    // Document built from an in-memory string.
    $stringDoc = Zend_Search_Lucene_Document_Html::loadHTML($sourceHtml);
    $this->assertTrue($stringDoc instanceof Zend_Search_Lucene_Document_Html);

    $stringDoc->highlight('document', '#66ffff');
    $this->assertTrue(strpos($stringDoc->getHTML(), $highlightedFragment) !== false);

    // Document built from a fixture file, with content storing enabled.
    $fixturePath = dirname(__FILE__) . '/_indexSource/_files/contributing.documentation.html';
    $fileDoc = Zend_Search_Lucene_Document_Html::loadHTMLFile($fixturePath, true);
    $this->assertTrue($fileDoc instanceof Zend_Search_Lucene_Document_Html);

    $expectedHeaderLinks = array('index.html', 'contributing.html', 'contributing.bugs.html', 'contributing.wishlist.html');
    $this->assertTrue(array_values($fileDoc->getHeaderLinks()) == $expectedHeaderLinks);

    $expectedLinks = array('contributing.bugs.html', 'contributing.wishlist.html', 'developers.documentation.html', 'faq.translators-revision-tracking.html', 'index.html', 'contributing.html');
    $this->assertTrue(array_values($fileDoc->getLinks()) == $expectedLinks);
}
/**
 * Extracts the body text of an HTML file.
 *
 * An empty string is returned when the file does not exist or when loading
 * it throws.
 *
 * @param String $filename Full filesystem path to the file to process.
 * @return String Value of the loaded document's "body" field.
 */
public static function extract($filename)
{
    $htmlDocument = null;

    if (file_exists($filename)) {
        try {
            // Second argument true: keep the content stored in the document.
            $htmlDocument = Zend_Search_Lucene_Document_Html::loadHTMLFile($filename, true);
        } catch (Exception $loadError) {
            return '';
        }
    } else {
        return '';
    }

    return $htmlDocument->body;
}
/**
 * Demo action: adds the bundled Kohana home page to the search index and
 * re-renders the index view with a confirmation message.
 */
public function addurl()
{
    // The demo indexes a local example file shipped with the module.
    $exampleFile = MODPATH . implode(DIRECTORY_SEPARATOR, array('kosearch', 'examples', 'kohana_home.html'));

    // The Search class loads the Zend libraries. They must be available
    // before Zend_Search_Lucene_Document_Html is instantiated, so load them
    // explicitly first.
    Search::instance()->load_search_libs();

    $htmlDocument = Zend_Search_Lucene_Document_Html::loadHTMLFile($exampleFile, true, "utf-8");
    Search::instance()->addDocument($htmlDocument);

    $this->index('Kohana page successfully added ↓ <a href="#form2" title="scroll down">scroll down</a> ↓');
}
/**
 * Checks that rel="nofollow" links are reported by getLinks() only while the
 * "exclude nofollow links" flag is off.
 *
 * The flag is static, so its original value is restored afterwards (also
 * when an assertion fails) to avoid leaking state into other tests.
 */
public function testHtmlNoFollowLinks()
{
    $html = '<HTML>'
          . '<HEAD><TITLE>Page title</TITLE></HEAD>'
          . '<BODY>'
          . 'Document body.'
          . '<a href="link1.html">Link 1</a>.'
          . '<a href="link2.html" rel="nofollow">Link 1</a>.'
          . '</BODY>'
          . '</HTML>';

    $oldNoFollowValue = Zend_Search_Lucene_Document_Html::getExcludeNoFollowLinks();

    try {
        // Flag off: both links are collected.
        Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks(false);
        $doc1 = Zend_Search_Lucene_Document_Html::loadHTML($html);
        $this->assertTrue($doc1 instanceof Zend_Search_Lucene_Document_Html);
        $this->assertTrue(array_values($doc1->getLinks()) == array('link1.html', 'link2.html'));

        // Flag on: the nofollow link is dropped.
        Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks(true);
        $doc2 = Zend_Search_Lucene_Document_Html::loadHTML($html);
        $this->assertTrue($doc2 instanceof Zend_Search_Lucene_Document_Html);
        $this->assertTrue(array_values($doc2->getLinks()) == array('link1.html'));
    } catch (Exception $e) {
        // catch + rethrow instead of finally keeps this compatible with old PHP versions.
        Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks($oldNoFollowValue);
        throw $e;
    }

    Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks($oldNoFollowValue);
}
/**
 * Index a file.
 *
 * HTML files are parsed into an HTML Lucene document; every other mime type
 * gets a plain document. The fields pk, path, size and mimetype are then
 * added before the document is handed to Lucene::updateFile().
 *
 * @author Jörn Dreyer <*****@*****.**>
 *
 * @param string      $path the path of the file
 * @param string|null $user owner of the file; when null the current
 *                          filesystem view and logged-in user are used
 *
 * @return bool true when the file was handed to the index, false otherwise
 */
public static function indexFile($path = '', $user = null)
{
    if (!Filesystem::isValidPath($path)) {
        // Return an explicit false so the result always matches the documented bool.
        return false;
    }
    if ($path === '') {
        //ignore the empty path element
        return false;
    }

    if (is_null($user)) {
        $view = Filesystem::getView();
        $user = \OCP\User::getUser();
    } else {
        $view = new \OC\Files\View('/' . $user . '/files');
    }
    if (!$view) {
        Util::writeLog('search_lucene', 'could not resolve filesystem view', Util::WARN);
        return false;
    }

    $root = $view->getRoot();
    $pk = md5($root . $path);

    // the cache already knows mime and other basic stuff
    $data = $view->getFileInfo($path);
    if (!isset($data['mimetype'])) {
        Util::writeLog('search_lucene', 'need mimetype for content extraction', Util::ERROR);
        return false;
    }
    $mimetype = $data['mimetype'];

    if ('text/html' === $mimetype) {
        $doc = \Zend_Search_Lucene_Document_Html::loadHTML($view->file_get_contents($path));
    } else {
        // FIXME 'application/msword' gets no special treatment yet: the Docx
        // loader uses ZipArchive and is not compatible with OC\Files\Filesystem.
        $doc = new \Zend_Search_Lucene_Document();
    }

    // store fscacheid as unique id to lookup by when deleting
    $doc->addField(\Zend_Search_Lucene_Field::Keyword('pk', $pk));
    // Store document URL to identify it in the search results
    $doc->addField(\Zend_Search_Lucene_Field::Text('path', $path));
    $doc->addField(\Zend_Search_Lucene_Field::unIndexed('size', $data['size']));
    $doc->addField(\Zend_Search_Lucene_Field::unIndexed('mimetype', $mimetype));

    self::extractMetadata($doc, $path, $view, $mimetype);
    Lucene::updateFile($doc, $path, $user);

    return true;
}
/**
 * (Re)builds the Lucene index of a website from the content of all its pages.
 *
 * An existing index is emptied first, otherwise a new one is created. Each
 * page becomes an HTML document carrying the unindexed fields "md5" (hash of
 * the page content) and "pageId". Nothing is touched when the website has no
 * pages.
 *
 * @param string $websiteId
 * @return string location of the website's index as returned by
 *                getIndexFileForWebsite()
 * @throws CmsException (code 602) when the website does not exist
 */
public function indexWebsite($websiteId)
{
    $websiteService = new Website('Website');
    if (!$websiteService->existsWebsiteAlready($websiteId)) {
        throw new CmsException('602', __METHOD__, __LINE__);
    }

    // Rendering has to go through the business layer.
    // NOTE(review): $renderBusiness and $modulService are not used below;
    // kept in case their constructors have side effects - confirm.
    $renderBusiness = new BusinessRender('Render');
    $modulService = new Modul('Modul');
    $pageService = new Page('Page');

    $allPageIds = $pageService->getIdsByWebsiteId($websiteId);
    $indexFileOfWebsite = $this->getIndexFileForWebsite($websiteId);

    if (!is_array($allPageIds) || count($allPageIds) === 0) {
        return $indexFileOfWebsite;
    }

    if (file_exists($indexFileOfWebsite)) {
        $index = \Zend_Search_Lucene::open($indexFileOfWebsite);
        // Document ids run from 0 to maxDoc() - 1 and include already deleted
        // documents. numDocs() only counts live documents, so bounding the
        // loop with it can miss trailing ids once the index has gaps.
        $highestId = $index->maxDoc();
        for ($id = 0; $id < $highestId; ++$id) {
            if (!$index->isDeleted($id)) {
                $index->delete($id);
            }
        }
    } else {
        $index = \Zend_Search_Lucene::create($indexFileOfWebsite);
    }

    foreach ($allPageIds as $pageId) {
        $pageContent = $this->getPageContent($websiteId, $pageId);
        // Second argument decides whether the page body is stored in the index.
        $document = \Zend_Search_Lucene_Document_Html::loadHTML(
            $pageContent,
            (bool) $this->isStoreContentEnabled(),
            'UTF-8'
        );
        $document->addField(\Zend_Search_Lucene_Field::unIndexed('md5', md5($pageContent)));
        $document->addField(\Zend_Search_Lucene_Field::unIndexed('pageId', $pageId));
        $index->addDocument($document);
    }

    $index->commit();
    $index->optimize();
    unset($index);

    return $indexFileOfWebsite;
}
/**
 * Highlights every token of the document body that matches this query's
 * wildcard pattern.
 *
 * The pattern is turned into an anchored regular expression in which "?"
 * matches one character and "*" matches any run of characters.
 *
 * @param Zend_Search_Lucene_Document_Html $doc
 * @param integer &$colorIndex
 */
public function highlightMatchesDOM(Zend_Search_Lucene_Document_Html $doc, &$colorIndex)
{
    /** @todo implementation */
    $quotedPattern = preg_quote($this->_pattern->text, '/');
    $matchExpression = '/^' . str_replace(array('\\?', '\\*'), array('.', '.*'), $quotedPattern) . '$/';

    // Add the Unicode modifier only when PCRE was built with Unicode support.
    if (@preg_match('/\\pL/u', 'a') == 1) {
        $matchExpression .= 'u';
    }

    $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
    $tokens = $analyzer->tokenize($doc->getFieldUtf8Value('body'), 'UTF-8');

    $words = array();
    foreach ($tokens as $token) {
        $termText = $token->getTermText();
        if (preg_match($matchExpression, $termText) === 1) {
            $words[] = $termText;
        }
    }

    $doc->highlight($words, $this->_getHighlightColor($colorIndex));
}
/**
 * The title must still be extracted when the root HTML tag has attributes.
 *
 * @group ZF-10686
 */
public function testLoadHtmlWithAttributesInTagHTML()
{
    $markup = '<HTML lang="en_US"><HEAD><TITLE>Page title</TITLE></HEAD><BODY>Document body.</BODY></HTML>';

    $htmlDocument = Zend_Search_Lucene_Document_Html::loadHTML($markup);

    // The expected title carries a trailing space.
    $this->assertEquals('Page title ', $htmlDocument->title);
}
/**
 * Index a file.
 *
 * Content is extracted only when the view can provide a local copy of the
 * file; the loader is chosen by mime type (plain text, HTML, PDF) or by file
 * extension (docx, xlsx, pptx, odt, ods). The fields pk, filename, path,
 * size and mimetype are always added before the document is handed to
 * Lucene::updateFile().
 *
 * @author Jörn Dreyer <*****@*****.**>
 *
 * @param string      $path the path of the file
 * @param string|null $user owner of the file; when null the current
 *                          filesystem view and logged-in user are used
 *
 * @return bool true when the file was indexed or has vanished, false otherwise
 */
public static function indexFile($path = '', $user = null)
{
    if (!Filesystem::isValidPath($path)) {
        // Return an explicit false so the result always matches the documented bool.
        return false;
    }
    if ($path === '') {
        //ignore the empty path element
        return false;
    }

    if (is_null($user)) {
        $view = Filesystem::getView();
        $user = \OCP\User::getUser();
    } else {
        $view = new \OC\Files\View('/' . $user . '/files');
    }
    if (!$view) {
        Util::writeLog('search_lucene', 'could not resolve filesystem view', Util::WARN);
        return false;
    }
    if (!$view->file_exists($path)) {
        Util::writeLog('search_lucene', 'file vanished, ignoring', Util::DEBUG);
        return true;
    }

    $root = $view->getRoot();
    $pk = md5($root . $path);

    // the cache already knows mime and other basic stuff
    $data = $view->getFileInfo($path);
    if (!isset($data['mimetype'])) {
        Util::writeLog('search_lucene', 'need mimetype for content extraction', Util::ERROR);
        return false;
    }
    $mimeType = $data['mimetype'];

    // initialize plain lucene document
    $doc = new \Zend_Search_Lucene_Document();

    // index content for local files only
    $localFile = $view->getLocalFile($path);
    if ($localFile) {
        $lowerName = strtolower($data['name']);

        //try to use special lucene document types
        // The office mime types are not checked because the zend classes only
        // understand docx and not doc files.
        // FIXME distinguish doc and docx, xls and xlsx, ppt and pptx, in oc core mimetype helper ...
        if ('text/plain' === $mimeType) {
            $body = $view->file_get_contents($path);
            if ($body != '') {
                $doc->addField(\Zend_Search_Lucene_Field::UnStored('body', $body));
            }
        } elseif ('text/html' === $mimeType) {
            //TODO could be indexed, even if not local
            $doc = \Zend_Search_Lucene_Document_Html::loadHTML($view->file_get_contents($path));
        } elseif ('application/pdf' === $mimeType) {
            $doc = Pdf::loadPdf($view->file_get_contents($path));
        } elseif (substr($lowerName, -5) === '.docx') {
            $doc = \Zend_Search_Lucene_Document_Docx::loadDocxFile($localFile);
        } elseif (substr($lowerName, -5) === '.xlsx') {
            $doc = \Zend_Search_Lucene_Document_Xlsx::loadXlsxFile($localFile);
        } elseif (substr($lowerName, -5) === '.pptx') {
            $doc = \Zend_Search_Lucene_Document_Pptx::loadPptxFile($localFile);
        } elseif (substr($lowerName, -4) === '.odt') {
            $doc = Odt::loadOdtFile($localFile);
        } elseif (substr($lowerName, -4) === '.ods') {
            $doc = Ods::loadOdsFile($localFile);
        }
    }

    // Store filecache id as unique id to lookup by when deleting
    $doc->addField(\Zend_Search_Lucene_Field::Keyword('pk', $pk));
    // Store filename
    $doc->addField(\Zend_Search_Lucene_Field::Text('filename', $data['name'], 'UTF-8'));
    // Store document path to identify it in the search results
    $doc->addField(\Zend_Search_Lucene_Field::Text('path', $path, 'UTF-8'));
    $doc->addField(\Zend_Search_Lucene_Field::unIndexed('size', $data['size']));
    $doc->addField(\Zend_Search_Lucene_Field::unIndexed('mimetype', $mimeType));

    //self::extractMetadata($doc, $path, $view, $mimeType);
    Lucene::updateFile($doc, $path, $user);

    return true;
}
/**
 * Extracts the plain text of an uploaded file so it can be indexed.
 *
 * The file is looked up under ROOT_DIR/uploads/files, either directly or
 * inside a sub-directory named after the related catalog GUID. PDF and Word
 * files are converted by the external tools configured in
 * $this->_pdfExtractor / $this->_wordExtractor, which write to a temporary
 * "<file>.txt" that is read and deleted again; HTML and plain text are
 * parsed with Zend_Search_Lucene_Document_Html.
 *
 * @param string $guid       GUID of the item the file is related to.
 * @param string $systemName Stored file name; takes precedence over $fileName when not empty.
 * @param string $fileName   Original file name.
 * @param string $mimeType   Mime type used to select the extractor.
 * @param string $lang       Language key passed to getDbHandler().
 * @return string|null Extracted text, '' when nothing could be extracted, or
 *                     null when no related row or file exists (or the Word
 *                     extractor exits with an unexpected status).
 */
private function _extractText($guid, $systemName, $fileName, $mimeType, $lang = 'id')
{
    // NOTE(review): $guid is interpolated into the SQL text; a bound
    // parameter would be safer if it can ever hold untrusted input.
    $query = "SELECT * FROM KutuRelatedItem where itemGuid='{$guid}' AND relateAs='RELATED_FILE'";
    $results = $this->getDbHandler($lang)->query($query);
    $rowset = $results->fetchAll(PDO::FETCH_OBJ);
    if (!count($rowset)) {
        return;
    }

    $parentCatalogGuid = $rowset[0]->relatedGuid;
    if (!empty($systemName)) {
        $fileName = $systemName;
    }

    $uploadRoot = ROOT_DIR . DIRECTORY_SEPARATOR . 'uploads' . DIRECTORY_SEPARATOR . 'files' . DIRECTORY_SEPARATOR;
    $sDir1 = $uploadRoot . $fileName;
    $sDir2 = $uploadRoot . $parentCatalogGuid . DIRECTORY_SEPARATOR . $fileName;

    $sDir = '';
    if (file_exists($sDir1)) {
        $sDir = $sDir1;
    } elseif (file_exists($sDir2)) {
        $sDir = $sDir2;
    }
    if (empty($sDir)) {
        return;
    }

    $outpath = $sDir . '.txt';

    switch ($mimeType) {
        case 'application/pdf':
            // Paths are shell-escaped so names with spaces or shell
            // metacharacters cannot break (or inject into) the command.
            $pdfExtractor = $this->_pdfExtractor;
            system("{$pdfExtractor} " . escapeshellarg($sDir) . ' ' . escapeshellarg($outpath), $ret);
            if ($ret == 0) {
                $value = file_get_contents($outpath);
                unlink($outpath);
                echo 'content PDF: ' . $sDir . ' ' . strlen($value) . "\n";
                // Anything of 20 bytes or less is treated as an empty extraction.
                if (strlen($value) > 20) {
                    return (new Pandamp_Utility_Posts())->sanitize_post_content($value);
                }
                echo "content file kosong\n";
                return '';
            }
            if ($ret == 127) {
                print "Could not find pdftotext tool.\n";
            }
            if ($ret == 1) {
                // This check used to sit behind a return and could never run.
                print "Could not find pdf file.\n";
            }
            return '';

        case 'text/html':
        case 'text/plain':
            $docHtml = Zend_Search_Lucene_Document_Html::loadHTMLFile($sDir);
            return $docHtml->getFieldValue('body');

        case 'application/x-javascript':
        case 'application/octet-stream':
        case 'application/msword':
            // Compare against false so a name starting with ".doc" also counts.
            if (strpos(strtolower($fileName), '.doc') === false) {
                return '';
            }
            $extractor = $this->_wordExtractor;
            system("{$extractor} -m cp850.txt " . escapeshellarg($sDir) . ' > ' . escapeshellarg($outpath), $ret);
            if ($ret == 0) {
                $value = file_get_contents($outpath);
                unlink($outpath);
                return $value;
            }
            if ($ret == 127 || $ret == 1) {
                // 127: extractor not found, 1: input file not found.
                return '';
            }
            // Any other exit status falls through to the null return below.
            break;

        default:
            return '';
    }

    return;
}
/**
 * Highlights the query matches inside an HTML fragment.
 *
 * The fragment is converted to UTF-8, wrapped into a minimal HTML page so it
 * can be parsed, highlighted, and only the body content is handed back
 * (without HTML header and body tag).
 *
 * @param string $inputHtmlFragment
 * @param string $encoding Input HTML string encoding
 * @param Zend_Search_Lucene_Search_Highlighter_Interface|null $highlighter
 *        The default highlighter is used when null.
 * @return string
 */
public function htmlFragmentHighlightMatches($inputHtmlFragment, $encoding = 'UTF-8', $highlighter = null)
{
    if ($highlighter === null) {
        // require_once 'Zend/Search/Lucene/Search/Highlighter/Default.php';
        $highlighter = new Zend_Search_Lucene_Search_Highlighter_Default();
    }

    // //IGNORE drops characters that cannot be represented in UTF-8.
    $utf8Fragment = iconv($encoding, 'UTF-8//IGNORE', $inputHtmlFragment);
    $inputHTML = '<html><head><META HTTP-EQUIV="Content-type" CONTENT="text/html; charset=UTF-8"/></head><body>'
               . $utf8Fragment
               . '</body></html>';

    /** Zend_Search_Lucene_Document_Html */
    // require_once 'Zend/Search/Lucene/Document/Html.php';
    $htmlDocument = Zend_Search_Lucene_Document_Html::loadHTML($inputHTML);

    $highlighter->setDocument($htmlDocument);
    $this->_highlightMatches($highlighter);

    return $htmlDocument->getHtmlBody();
}
/**
 * Highlights this query's single term in the document.
 *
 * @param Zend_Search_Lucene_Document_Html $doc
 * @param integer &$colorIndex passed on to _getHighlightColor()
 */
public function highlightMatchesDOM(Zend_Search_Lucene_Document_Html $doc, &$colorIndex)
{
    $termText = $this->_term->text;
    $highlightColor = $this->_getHighlightColor($colorIndex);

    $doc->highlight($termText, $highlightColor);
}
/**
 * Returns the HTML of a successful response.
 *
 * The response body is parsed with Zend_Search_Lucene_Document_Html and
 * serialized again, so the result is the parser's rendering of the body.
 *
 * @access private
 * @param Zend_Response $response
 * @return String|false the HTML, or false when the status is not 200
 */
private function _getHTMLResponse($response)
{
    if (200 !== $response->getStatus()) {
        return false;
    }

    $parsedBody = Zend_Search_Lucene_Document_Html::loadHTML($response->getBody());

    return $parsedBody->getHTML();
}
/**
 * Indexes an article.
 *
 * When the article's fulltext (or, failing that, its introtext) starts with
 * an XML declaration, the XML is simplified with the jucene.xsl stylesheet
 * and every AssociationRule element of the result is passed to the
 * "onIndexPmml" content plugins. Any other article is indexed as an HTML
 * document built from its fulltext.
 *
 * @param object $article Article exposing id, fulltext and introtext.
 * @param bool   $isNew   When false, the entry stored under the article's
 *                        primary key is removed from the index first.
 */
function onIndexContent($article, $isNew = false)
{
    //FIXME move the content type tests and following transformations to the helper
    $pk = $article->id;
    if (!$isNew) {
        JuceneHelper::removeFromIndex('pk:' . $pk);
    }
    $index = JuceneHelper::getIndex();

    $xml_field = substr($article->fulltext, 0, 5) != '<?xml' ? $article->introtext : $article->fulltext;

    if (substr($xml_field, 0, 5) != '<?xml') {
        // Plain article: index the HTML. The encoding has to be the string
        // 'UTF-8'; the previous bare UTF - 8 was an arithmetic expression.
        $zendDoc = Zend_Search_Lucene_Document_Html::loadHTML($article->fulltext, false, 'UTF-8');
        $index->addDocument($zendDoc);
        return;
    }

    $dom = new DOMDocument();
    $xslt = new DOMDocument();
    $error = false;

    //load xslt stylesheet
    if (!@$xslt->load(JPATH_SITE . DS . 'administrator' . DS . 'components' . DS . 'com_jucene' . DS . 'xslt/jucene.xsl')) {
        $error = true;
        $this->raiseMessage("XSLTLOADERROR", 'error');
    }
    $proc = new XSLTProcessor();
    if (!$proc->importStylesheet($xslt)) {
        $error = true;
        $this->raiseMessage("XSLTIMPORTERROR", 'error');
    }

    // Two unset() calls on misspelled, undefined variables used to sit here.
    // They had no effect and were dropped; the article is left untouched.

    if (!$dom->loadXML($xml_field) || $error) {
        return;
    }

    //simplify the document - prepare it for the indexation process
    $xslOutput = $proc->transformToXml($dom);

    //create new DOM document to preserve output and transform the XML to the indexable one
    $transXml = new DOMDocument();
    // NOTE(review): DOMDocument's property is spelled preserveWhiteSpace, so
    // this assignment probably has no effect - confirm before changing it.
    $transXml->preserveWhitespace = false;
    @$transXml->loadXML($xslOutput);

    //unset unneccessary variables
    unset($xslOutput);
    unset($dom);
    unset($xslt);

    //index every assoc rule as document with same credentials
    $rules = $transXml->getElementsByTagName("AssociationRule");
    if ($rules->length == 0) {
        $this->raiseMessage('XMLDOCUMENTNORULES', 'error');
    }

    $rule_doc_position = 0;
    foreach ($rules as $rule) {
        $additional['rating'] = 0;
        $additional['position'] = $rule_doc_position;
        JPluginHelper::importPlugin('content');
        $dispatcher =& JDispatcher::getInstance();
        $dispatcher->trigger('onIndexPmml', array($rule, $additional));
        $rule_doc_position++;
    }
}
/**
 * Builds the Lucene document(s) for a node and adds them to the index.
 *
 * The shared document carries the node's url, path, basename, modification
 * date, size, mime, serialized metadata and indexable meta fields. When the
 * node has per-user indexable keys and a user is logged in, a second,
 * user-scoped document is added as well. File content is parsed only when
 * content indexing is enabled and the node does not exceed
 * PARSE_CONTENT_MAX_SIZE.
 *
 * @param AJXP_Node $ajxpNode
 * @param Zend_Search_Lucene_Interface $index
 * @throws Exception when the content loader returns no document
 * @return Zend_Search_Lucene_Document the shared document
 */
public function createIndexedDocument($ajxpNode, &$index)
{
    if (!empty($this->metaFields)) {
        $ajxpNode->loadNodeInfo(false, false, "all");
    } else {
        $ajxpNode->loadNodeInfo();
    }

    $ext = strtolower(pathinfo($ajxpNode->getLabel(), PATHINFO_EXTENSION));
    $parseContent = $this->indexContent;
    if ($parseContent && $ajxpNode->bytesize > $this->getFilteredOption("PARSE_CONTENT_MAX_SIZE")) {
        $parseContent = false;
    }

    if ($parseContent && in_array($ext, explode(",", $this->getFilteredOption("PARSE_CONTENT_HTML")))) {
        $doc = @Zend_Search_Lucene_Document_Html::loadHTMLFile($ajxpNode->getUrl());
    } elseif ($parseContent && $ext == "docx" && class_exists("Zend_Search_Lucene_Document_Docx")) {
        $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
        $doc = @Zend_Search_Lucene_Document_Docx::loadDocxFile($realFile);
    } elseif ($parseContent && $ext == "pptx" && class_exists("Zend_Search_Lucene_Document_Pptx")) {
        // This branch used to test for "docx" again and could never run.
        $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
        $doc = @Zend_Search_Lucene_Document_Pptx::loadPptxFile($realFile);
    } elseif ($parseContent && $ext == "xlsx" && class_exists("Zend_Search_Lucene_Document_Xlsx")) {
        $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
        $doc = @Zend_Search_Lucene_Document_Xlsx::loadXlsxFile($realFile);
    } else {
        $doc = new Zend_Search_Lucene_Document();
    }
    if ($doc == null) {
        throw new Exception("Could not load document");
    }

    // The encoding belongs to the field factory. It used to be passed as a
    // second argument to addField(), where it was silently ignored.
    $encoding = SystemTextEncoding::getEncoding();

    $doc->addField(Zend_Search_Lucene_Field::Keyword("node_url", $ajxpNode->getUrl(), $encoding));
    $doc->addField(Zend_Search_Lucene_Field::Keyword("node_path", str_replace("/", "AJXPFAKESEP", $ajxpNode->getPath()), $encoding));
    $doc->addField(Zend_Search_Lucene_Field::Text("basename", basename($ajxpNode->getPath()), $encoding));
    $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_node", "yes", $encoding));
    $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_scope", "shared"));
    $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_modiftime", date("Ymd", $ajxpNode->ajxp_modiftime)));
    $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_bytesize", $ajxpNode->bytesize));

    // Fall back to the file extension when the node has no mime information.
    $ajxpMime = $ajxpNode->ajxp_mime;
    if (empty($ajxpMime)) {
        $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_mime", pathinfo($ajxpNode->getLabel(), PATHINFO_EXTENSION)));
    } else {
        $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_mime", $ajxpNode->ajxp_mime));
    }

    // Store a cached copy of the metadata
    $serializedMeta = base64_encode(serialize($ajxpNode->metadata));
    $doc->addField(Zend_Search_Lucene_Field::Binary("serialized_metadata", $serializedMeta));

    if (isset($ajxpNode->indexableMetaKeys["shared"])) {
        foreach ($ajxpNode->indexableMetaKeys["shared"] as $sharedField) {
            if ($ajxpNode->{$sharedField}) {
                $doc->addField(Zend_Search_Lucene_Field::Keyword($sharedField, $ajxpNode->{$sharedField}));
            }
        }
    }
    foreach ($this->metaFields as $field) {
        if ($ajxpNode->{$field} != null) {
            $doc->addField(Zend_Search_Lucene_Field::Text("ajxp_meta_{$field}", $ajxpNode->{$field}, $encoding));
        }
    }

    // Per-user metadata goes into its own document, scoped to the logged-in user.
    if (isset($ajxpNode->indexableMetaKeys["user"]) && count($ajxpNode->indexableMetaKeys["user"]) && AuthService::usersEnabled() && AuthService::getLoggedUser() != null) {
        $privateDoc = new Zend_Search_Lucene_Document();
        $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("node_url", $ajxpNode->getUrl(), $encoding));
        $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("node_path", str_replace("/", "AJXPFAKESEP", $ajxpNode->getPath()), $encoding));
        $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_scope", "user"));
        $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_user", AuthService::getLoggedUser()->getId()));
        foreach ($ajxpNode->indexableMetaKeys["user"] as $userField) {
            if ($ajxpNode->{$userField}) {
                $privateDoc->addField(Zend_Search_Lucene_Field::Keyword($userField, $ajxpNode->{$userField}));
            }
        }
        $privateDoc->addField(Zend_Search_Lucene_Field::Binary("serialized_metadata", $serializedMeta));
        $index->addDocument($privateDoc);
    }

    if ($parseContent) {
        $body = $this->extractIndexableContent($ajxpNode);
        if (!empty($body)) {
            $doc->addField(Zend_Search_Lucene_Field::unStored("body", $body));
        }
    }

    $index->addDocument($doc);
    return $doc;
}
/**
 * Rebuilds the website search index from scratch.
 *
 * Indexes every indexable HTML file found below this file's directory, then
 * every URL configured in the "search_dynamic_pages" setting. The outcome is
 * reported through the globals $success_msg, $warning_msg (dynamic URLs that
 * could not be fetched) and $error_msg.
 */
function rebuild_search_indexes()
{
    global $success_msg;
    global $error_msg;
    global $warning_msg;
    global $all_settings;

    $index_folder = get_setting('search_indexes_folder', $all_settings);

    try {
        // Second argument true: create a fresh index.
        $index = new Zend_Search_Lucene($index_folder, true);
        setlocale(LC_CTYPE, 'en_US');

        foreach (get_all_html_files(dirname(__FILE__)) as $html_file => $html_url) {
            if (!can_index_html_file($html_file)) {
                continue;
            }
            $file_content = file_get_contents($html_file);
            if ($file_content === false) {
                // Unreadable file: skip it instead of indexing an empty page.
                continue;
            }
            // Drop everything in front of the <head tag (doctype, <html ...>).
            $file_content = '<html>' . strstr($file_content, '<head');
            $doc = Zend_Search_Lucene_Document_Html::loadHTML($file_content);
            $doc->addField(Zend_Search_Lucene_Field::Text('url', $html_url, 'UTF-8'));
            $index->addDocument($doc);
            flush();
        }

        $broken_urls = array();
        foreach (get_dynamic_urls(get_setting('search_dynamic_pages', $all_settings)) as $url) {
            // get_headers() returns false on failure; only a status line that
            // contains 200 counts as reachable.
            $headers = get_headers($url);
            $is_reachable = is_array($headers)
                && isset($headers[0])
                && strrpos($headers[0], '200') !== false;

            $content = $is_reachable ? file_get_contents($url) : false;
            if ($content === false) {
                array_push($broken_urls, $url);
                continue;
            }

            $content = '<html>' . strstr($content, '<head');
            $doc = Zend_Search_Lucene_Document_Html::loadHTML($content);
            $doc->addField(Zend_Search_Lucene_Field::Text('url', $url, 'UTF-8'));
            $index->addDocument($doc);
            flush();
        }

        if (!file_exists($index_folder)) {
            $error_msg = 'An error occurred during the website indexing. The error message is: the folder that stores the website indexes couldn\'t be created';
        } elseif (count($broken_urls) > 0) {
            $warning_msg = '<p>The website was successfully indexed, but the following URL\'s were skipped because they are broken:</p>';
            $warning_msg .= '<ul class="disc">';
            foreach ($broken_urls as $broken_url) {
                $warning_msg .= '<li><a href="' . $broken_url . '">' . $broken_url . '</a></li>';
            }
            $warning_msg .= '</ul>';
            $warning_msg .= '<p>Please remove them from the "List of dynamic pages" field.</p>';
        } else {
            $success_msg = 'The website was successfully indexed.';
        }
    } catch (Exception $e) {
        $error_msg = 'An error occurred during the website indexing. The error message is: ' . $e->getMessage();
    }
}
/**
 * Rebuilds the website search index from scratch.
 *
 * Static .html/.htm files are read from disk; .php files are fetched through
 * their page URL when it answers with HTTP 200. Afterwards every URL from
 * the "search_dynamic_pages" setting is indexed. The outcome is reported
 * through the globals $success_msg, $warning_msg (unreachable dynamic URLs)
 * and $error_msg.
 */
function rebuild_search_indexes()
{
    global $success_msg;
    global $error_msg;
    global $warning_msg;
    global $all_settings;
    global $indexable_folders;

    $index_folder = get_setting('search_indexes_folder', $all_settings);

    try {
        setlocale(LC_CTYPE, LOCALE);
        Zend_Search_Lucene_Analysis_Analyzer::setDefault(new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8_CaseInsensitive());
        // Second argument true: create a fresh index.
        $index = new Zend_Search_Lucene($index_folder, true);

        $files_to_index = get_website_files($indexable_folders);
        foreach ($files_to_index as $html_file => $page_url) {
            if (!can_index_website_file($html_file)) {
                continue;
            }

            // Reset on every iteration. Without this, a PHP page that does
            // not answer with 200 (or a file of another type) would be
            // indexed with the content left over from the previous file.
            $file_content = null;

            $f1 = strtolower($html_file);
            if (end_with($f1, 'html') || end_with($f1, 'htm')) {
                $file_content = file_get_contents($html_file);
            } elseif (end_with($f1, 'php')) {
                if (is_http_code_200($page_url)) {
                    $file_content = get_url_content($page_url);
                }
            }

            if (isset($file_content)) {
                // Drop everything in front of the <head tag (doctype, <html ...>).
                $file_content = '<html>' . strstr($file_content, '<head');
                $doc = Zend_Search_Lucene_Document_Html::loadHTML($file_content, true, 'UTF-8');
                $doc->addField(Zend_Search_Lucene_Field::Text('url', $page_url, 'UTF-8'));
                $index->addDocument($doc);
                flush();
            }
        }

        $broken_urls = array();
        foreach (get_dynamic_urls(get_setting('search_dynamic_pages', $all_settings)) as $url) {
            if (!is_http_code_200($url)) {
                array_push($broken_urls, $url);
                continue;
            }
            $content = get_url_content($url);
            $content = '<html>' . strstr($content, '<head');
            $doc = Zend_Search_Lucene_Document_Html::loadHTML($content, true, 'UTF-8');
            $doc->addField(Zend_Search_Lucene_Field::Text('url', $url, 'UTF-8'));
            $index->addDocument($doc);
            flush();
        }

        if (!file_exists($index_folder)) {
            $error_msg = 'An error occurred during the website indexing. The error message is: the folder that stores the website indexes couldn\'t be created';
        } elseif (count($broken_urls) > 0) {
            $warning_msg = '<p>The website was successfully indexed, but the following URL\'s were skipped because they are broken:</p>';
            $warning_msg .= '<ul class="disc">';
            foreach ($broken_urls as $broken_url) {
                $warning_msg .= '<li><a href="' . $broken_url . '">' . $broken_url . '</a></li>';
            }
            $warning_msg .= '</ul>';
            $warning_msg .= '<p>Please remove them from the "List of dynamic pages" field.</p>';
        } else {
            $success_msg = 'The website was successfully indexed.';
        }
    } catch (Exception $e) {
        $error_msg = 'An error occurred during the website indexing. The error message is: ' . $e->getMessage();
    }
}
/**
 * Stores the class-wide "exclude nofollow links" flag.
 *
 * The value is kept in a static property, so it stays in effect until this
 * setter is called again.
 *
 * @param boolean $newValue
 */
public static function setExcludeNoFollowLinks($newValue)
{
    self::$_excludeNoFollowLinks = $newValue;
}
/**
 * Crawls pages breadth-first starting at $url and indexes each one found.
 *
 * Every URL is passed through _sanitizeUrl() and processed at most once. A
 * page is loaded from its cached HTML file when that file exists, otherwise
 * it is requested through $this->_client and used only on a 200 response.
 * Links of each indexed page are appended to the queue until $this->_maxLinks
 * URLs have been visited.
 *
 * @param string $url URL to start crawling from
 * @return void
 */
protected function _spider($url)
{
    $queue = array($url);
    $visited = array();

    while (!empty($queue)) {
        $doc = null;
        $url = $this->_sanitizeUrl(array_shift($queue));
        if (!$url || in_array($url, $visited)) {
            continue;
        }

        $visited[] = $url;
        Bbx_Log::write('Spidering url ' . $url, null, Bbx_Search::LOG);

        $cachePath = APPLICATION_PATH . '/../www/cached' . $url . '.html';
        if (file_exists($cachePath)) {
            Bbx_Log::write('Found file in cache', null, Bbx_Search::LOG);
            try {
                $doc = Zend_Search_Lucene_Document_Html::loadHTMLFile($cachePath, false, 'utf-8');
            } catch (Exception $e) {
                Bbx_Log::write('Unable to open file: ' . $cachePath, null, Bbx_Search::LOG);
            }
        } else {
            $this->_client->setUri($this->_getAbsoluteUrl($url));
            try {
                $response = $this->_client->request();
                $status = $response->getStatus();
                Bbx_Log::write('Client response code ' . $status, null, Bbx_Search::LOG);
                if ($status == '200') {
                    $doc = Zend_Search_Lucene_Document_Html::loadHTML($response->getBody(), false, 'utf-8');
                }
            } catch (Exception $e) {
                Bbx_Log::write('Request failed: ' . $e->getMessage(), null, Bbx_Search::LOG);
            }
        }

        if ($doc === null) {
            continue;
        }

        $this->_search()->indexDoc($doc, $url);
        $this->_indexed++;

        // Filter links against the list this method actually maintains. The
        // original diffed against $this->_visited only, which is never filled
        // here; when that property is not an array, array_diff() fails and the
        // queue is lost after the first page.
        // NOTE(review): $this->_visited is still honoured when it is an array,
        // in case it is populated elsewhere in the class — confirm.
        $seen = $visited;
        if (isset($this->_visited) && is_array($this->_visited)) {
            $seen = array_merge($seen, $this->_visited);
        }
        $links = array_diff($doc->getLinks(), $seen);

        if (count($visited) < $this->_maxLinks) {
            $queue = array_merge($queue, $links);
        } else {
            Bbx_Log::write('Reached max number of links (' . $this->_maxLinks . '), exiting', null, Bbx_Search::LOG);
            // NOTE(review): exit terminates the whole script, not just the crawl;
            // kept because callers may rely on it — consider returning instead.
            exit;
        }
    }
}
/**
 * Index an HTML page with Zend_Search_Lucene.
 *
 * Any document already stored under the same URI is removed first, so each
 * URI is indexed at most once. Nothing is indexed when the index directory
 * does not exist or the index can be neither opened nor created.
 *
 * @param string $uri        URI of the page, stored in the 'uri' keyword field
 * @param string $htmlString HTML source of the page
 * @return void
 */
function indexContent($uri, $htmlString)
{
    require_once 'Zend/Search/Lucene.php';
    require_once 'Zend/Search/Lucene/Index/Term.php';
    require_once 'Zend/Search/Lucene/Search/Query/Term.php';

    $index_location = ONXSHOP_PROJECT_DIR . 'var/index';

    // Defined up front: the original left $index undefined when the
    // directory was missing and then tested it anyway.
    $index = false;

    if (is_dir($index_location)) {
        try {
            // Open existing index
            $index = Zend_Search_Lucene::open($index_location);
        } catch (Exception $e) {
            try {
                // The directory holds no usable index yet: create one
                $index = Zend_Search_Lucene::create($index_location);
            } catch (Exception $e) {
                $index = false;
            }
        }
    }

    if (!$index) {
        return;
    }

    // Find and remove pages with the same URI. 'uri' is a Keyword field and
    // is stored untokenized, so it is matched with an exact term query; a
    // query string ("uri:" . $uri) would be run through the query parser and
    // analyzer, which split or reject typical URI characters.
    $sameUri = new Zend_Search_Lucene_Search_Query_Term(
        new Zend_Search_Lucene_Index_Term($uri, 'uri')
    );
    foreach ($index->find($sameUri) as $hit) {
        $index->delete($hit->id);
    }

    $doc = Zend_Search_Lucene_Document_Html::loadHTML($htmlString, true);
    $doc->addField(Zend_Search_Lucene_Field::Keyword('uri', $uri));
    $index->addDocument($doc);
    $index->commit();
}
/**
 * Build a Lucene document from an HTML file.
 *
 * The file is loaded through Zend_Search_Lucene_Document_Html and a text
 * field named 'filename', holding the base name of $path, is added to it.
 *
 * @param string $path Path to the HTML file
 * @return Zend_Search_Lucene_Document
 */
function createHTMLDocument($path)
{
    $htmlDocument = Zend_Search_Lucene_Document_Html::loadHTMLFile($path);

    $fileNameField = Zend_Search_Lucene_Field::Text('filename', basename($path));
    $htmlDocument->addField($fileNameField);

    return $htmlDocument;
}
/**
 * Highlight the matched terms of this query in an HTML document.
 *
 * Collects the text of every entry in $this->_matches and hands the list to
 * the document's highlight() method together with the colour obtained from
 * _getHighlightColor().
 *
 * @param Zend_Search_Lucene_Document_Html $doc Document to highlight in
 * @param integer &$colorIndex                  Colour index, passed by reference to _getHighlightColor()
 */
public function highlightMatchesDOM(Zend_Search_Lucene_Document_Html $doc, &$colorIndex)
{
    $termTexts = array();
    foreach ($this->_matches as $matchedTerm) {
        array_push($termTexts, $matchedTerm->text);
    }

    $highlightColor = $this->_getHighlightColor($colorIndex);
    $doc->highlight($termTexts, $highlightColor);
}
/**
 * Removes HTML, JavaScript, CSS, comments and surplus whitespace from a string.
 *
 * The input is first normalised by loading it into a
 * Zend_Search_Lucene_Document_Html and reading the HTML back.
 *
 * @param string $html HTML source (treated as UTF-8)
 * @return mixed|string plain text with whitespace runs collapsed to one space
 */
protected function getPlainTextFromHtml($html)
{
    $doc = Zend_Search_Lucene_Document_Html::loadHTML($html, false, "utf-8");
    $html = $doc->getHTML();

    // Remove scripts, style blocks and comments together with their content.
    $search = array(
        '@(<script[^>]*?>.*?</script>)@si',
        // No "U" modifier here: it inverts greediness, turning ".*?" greedy,
        // so everything between the first <style> and the last </style>
        // (page text included) would be removed.
        '@<style[^>]*?>.*?</style>@si',
        // Anchored on "<!--": starting at any "<!" would match from the
        // DOCTYPE up to the end of the first comment and drop the text in
        // between. The DOCTYPE itself is left to strip_tags() below.
        '@<!--[\\s\\S]*?--[ \\t\\n\\r]*>@',
    );
    $text = preg_replace($search, "", $html);

    // Remove the remaining HTML tags
    $text = strip_tags($text);

    // Collapse runs of whitespace into a single space
    $text = preg_replace('@[ \\t\\n\\r\\f]+@', " ", $text);

    return $text;
}
/**
 * Highlight the terms of this query in an HTML document.
 *
 * When $this->_signs is null every entry of $this->_terms is highlighted.
 * Otherwise only the terms whose sign is not exactly false are highlighted.
 *
 * @param Zend_Search_Lucene_Document_Html $doc Document to highlight in
 * @param integer &$colorIndex                  Colour index, passed by reference to _getHighlightColor()
 */
public function highlightMatchesDOM(Zend_Search_Lucene_Document_Html $doc, &$colorIndex)
{
    $highlightTerms = array();

    if ($this->_signs === null) {
        foreach ($this->_terms as $queryTerm) {
            array_push($highlightTerms, $queryTerm->text);
        }
    } else {
        foreach ($this->_signs as $termId => $termSign) {
            if ($termSign === false) {
                continue;
            }
            array_push($highlightTerms, $this->_terms[$termId]->text);
        }
    }

    $highlightColor = $this->_getHighlightColor($colorIndex);
    $doc->highlight($highlightTerms, $highlightColor);
}
/**
 * Extracts the text of an uploaded file that is related to a catalog item.
 *
 * The file is looked up under uploads/files/ directly and then under the
 * sub-folder named after the related catalog GUID. PDF and Word files are
 * converted with the configured external extractors; HTML and plain-text
 * files are read through Zend_Search_Lucene_Document_Html.
 *
 * @param string $guid       GUID of the item whose RELATED_FILE relation is looked up
 * @param string $systemName Stored file name; used instead of $fileName when not empty
 * @param string $fileName   Original file name
 * @param string $mimeType   MIME type that selects the extraction method
 * @return string extracted text, or '' when the file is missing, unsupported or extraction fails
 */
private function _extractText_ZendDb($guid, $systemName, $fileName, $mimeType)
{
    $tblRelatedItem = new Kutu_Core_Orm_Table_RelatedItem();
    // NOTE(review): $guid is interpolated into the WHERE clause unescaped.
    // If it can come from user input this is injectable; quote it through the
    // table's DB adapter once the table API is confirmed.
    $rowset = $tblRelatedItem->fetchAll("itemGuid='{$guid}' AND relateAs='RELATED_FILE'");
    if (!count($rowset)) {
        return '';
    }

    $row = $rowset->current();
    $parentCatalogGuid = $row->relatedGuid;

    if (!empty($systemName)) {
        $fileName = $systemName;
    }

    $filesRoot = KUTU_ROOT_DIR . DIRECTORY_SEPARATOR . 'uploads' . DIRECTORY_SEPARATOR . 'files' . DIRECTORY_SEPARATOR;
    $sDir1 = $filesRoot . $fileName;
    $sDir2 = $filesRoot . $parentCatalogGuid . DIRECTORY_SEPARATOR . $fileName;

    if (file_exists($sDir1)) {
        $sDir = $sDir1;
    } elseif (file_exists($sDir2)) {
        $sDir = $sDir2;
    } else {
        return '';
    }

    // The extractors write their result next to the source file.
    $outpath = $sDir . '.txt';

    switch ($mimeType) {
        case 'application/pdf':
            // Paths are escaped: file names may contain spaces or shell metacharacters.
            system($this->_pdfExtractor . ' ' . escapeshellarg($sDir) . ' ' . escapeshellarg($outpath), $ret);
            if ($ret == 0) {
                $value = file_get_contents($outpath);
                unlink($outpath);
                // 20 bytes or fewer is treated as an empty document.
                if (strlen($value) > 20) {
                    return $this->clean_string_input($value);
                }
            }
            // Any non-zero exit status (e.g. 127 or 1) yields no text.
            return '';

        case 'text/html':
        case 'text/plain':
            $docHtml = Zend_Search_Lucene_Document_Html::loadHTMLFile($sDir);
            return $docHtml->getFieldValue('body');

        case 'application/x-javascript':
        case 'application/octet-stream':
        case 'application/msword':
            // Compare against false explicitly: strpos() returns 0, which is
            // falsy, for a match at the start of the name.
            if (strpos(strtolower($fileName), '.doc') === false) {
                return '';
            }
            system($this->_wordExtractor . ' -m cp850.txt ' . escapeshellarg($sDir) . ' > ' . escapeshellarg($outpath), $ret);
            if ($ret == 0) {
                $value = file_get_contents($outpath);
                unlink($outpath);
                return $value;
            }
            return '';

        default:
            return '';
    }
}