Inheritance: extends Zend_Search_Lucene_Document
 /**
  * Crawls a page, adds it to the search index and recurses into its links.
  *
  * The URL is normalised (made absolute against HTTP_HOST, trailing slash
  * removed), skipped when it was already visited or does not contain
  * HTTP_HOST, then fetched, parsed and indexed. When the markup contains an
  * <!--index--> ... <!--/index--> region, only that region is used as the
  * searchable "content" field.
  *
  * @param string $url Absolute http(s) URL, or a path that is appended to HTTP_HOST.
  * @return void
  */
 protected function _indexate($url)
 {
     // Anchor the scheme test at the start of the string and accept https as
     // well; otherwise "https://..." links get HTTP_HOST glued in front of them.
     if (!preg_match('#^https?://#i', $url)) {
         $url = HTTP_HOST . $url;
     }
     $url = substr($url, -1) == '/' ? substr($url, 0, -1) : $url;

     if (in_array($url, $this->_indexedUrl, true) || !stristr($url, HTTP_HOST)) {
         return;
     }

     // Mark as visited before fetching so a failing page is not retried by
     // every other page that links to it.
     $this->_indexedUrl[] = $url;

     $html = file_get_contents($url);
     if ($html === false || $html === '') {
         // Nothing to parse; feeding false/'' to the HTML loader only produces warnings.
         Zend_Registry::get('Logger')->log('Search index skipped (page could not be read): ' . $url, Zend_Log::WARN);
         return;
     }

     // Silence parser warnings for sloppy markup, then restore the caller's setting.
     $previousErrorMode = libxml_use_internal_errors(true);
     $doc = Zend_Search_Lucene_Document_Html::loadHTML($html);
     libxml_clear_errors();
     libxml_use_internal_errors($previousErrorMode);

     if (preg_match('/<\\!--index-->(.*)<\\!--\\/index-->/isu', $html, $matches)) {
         $html = $matches[1];
     }
     $html = preg_replace('#<script(.*?)>(.*?)</script>#is', '', $html);
     $html = strip_tags($html);

     $doc->addField(Zend_Search_Lucene_Field::Text('content', $html, 'utf-8'));
     $doc->addField(Zend_Search_Lucene_Field::UnIndexed('body', '', 'utf-8'));
     $doc->addField(Zend_Search_Lucene_Field::Text('url', $url, 'utf-8'));
     $this->_indexHandle->addDocument($doc);
     Zend_Registry::get('Logger')->info('Search index is created: ' . $url, Zend_Log::INFO);

     foreach ($doc->getLinks() as $link) {
         // Follow links without an extension, or with a known page extension.
         $temp = explode('.', $link);
         $ext = end($temp);
         if ($link == $ext || in_array($ext, array('php', 'html', 'txt', 'htm'))) {
             $this->_indexate($link);
         }
     }
 }
 /**
  * Inserts the provided action into the index.
  *
  * Currently a stub: returns null when shouldIndex() is false and otherwise
  * always throws. The statements after the throw are unreachable and appear
  * to be a draft of the intended implementation.
  *
  * @throws sfException whenever shouldIndex() returns a truthy value
  */
 public function insert()
 {
     if (!$this->shouldIndex()) {
         return;
     }
     throw new sfException(__CLASS__ . ' not implemented');
     // ---- Unreachable from here on (kept as a draft) ----
     // NOTE(review): extract() presumably defines $params used below - confirm
     // against getActionProperties() before enabling this code.
     extract($this->getActionProperties());
     $output = $this->executeAction($params);
     $content = $output->getContent();
     $doc = Zend_Search_Lucene_Document_Html::loadHtml($content);
     $doc->addField('sfl_title', $output->getLastTitle(), 2);
     $doc->addField('sfl_uri', $this->getUri($params));
     $doc->addField('sfl_description', $content);
     $doc->addField('sfl_type', 'action');
     $categories = $this->getActionCategories();
     if (count($categories)) {
         foreach ($categories as $category) {
             $this->addCategory($category);
         }
         $doc->addField('sfl_category', implode(', ', $categories));
     }
     // Serialized copy of the category list, stored whether or not it is empty.
     $doc->addField('sfl_categories_cache', serialize($categories));
     $guid = $this->getGuid($params);
     $this->addDocument($doc, $guid, 'action');
     $this->getSearch()->getEventDispatcher()->notify(new sfEvent($this, 'indexer.log', array('Inserted action "%s" of module "%s" to index', $this->getAction(), $this->getModule())));
     return $this;
 }
Example #3
0
    /**
     * Highlights the given words using the next colour of the palette.
     *
     * The palette position advances by one on every call and wraps around
     * at the end of the colour list.
     *
     * @param string|array $words  Words to highlight, as a string or an array.
     */
    public function highlight($words)
    {
        $palette = $this->_highlightColors;
        $position = $this->_currentColorIndex;

        $pickedColor = $palette[$position];
        $this->_currentColorIndex = ($position + 1) % count($palette);

        $this->_doc->highlight($words, $pickedColor);
    }
Example #4
0
 /**
  * Parses the given HTML, highlights the query matches in it and returns
  * the resulting markup.
  *
  * @param string $inputHTML HTML source to highlight.
  * @return string Value returned by the document's getHTML().
  */
 public function highlightMatches($inputHTML)
 {
     // Kept in a variable because it is handed to highlightMatchesDOM() as an
     // argument (presumably by reference - confirm against its signature).
     $paletteIndex = 0;
     $document = Zend_Search_Lucene_Document_Html::loadHTML($inputHTML);
     $this->highlightMatchesDOM($document, $paletteIndex);
     $highlightedHtml = $document->getHTML();
     return $highlightedHtml;
 }
 /**
  * Covers loading an HTML string, highlighting a term in it, and extracting
  * header links and body links from an HTML file.
  */
 public function testHtml()
 {
     $doc = Zend_Search_Lucene_Document_Html::loadHTML('<HTML><HEAD><TITLE>Page title</TITLE></HEAD><BODY>Document body.</BODY></HTML>');
     $this->assertTrue($doc instanceof Zend_Search_Lucene_Document_Html);
     $doc->highlight('document', '#66ffff');
     // Expected value first, actual second, so a failure reports the diff the right way round.
     $this->assertEquals("<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\" \"http://www.w3.org/TR/REC-html40/loose.dtd\">\n<html>\n<head><title>Page title</title></head>\n<body><p><b style=\"color:black;background-color:#66ffff\">Document</b> body.</p></body>\n</html>\n", $doc->getHTML());
     $doc = Zend_Search_Lucene_Document_Html::loadHTMLFile(dirname(__FILE__) . '/_files/_indexSource/contributing.documentation.html', true);
     $this->assertTrue($doc instanceof Zend_Search_Lucene_Document_Html);
     // assertEquals instead of assertTrue(a == b): same comparison, but a failure shows which links differ.
     $this->assertEquals(array('index.html', 'contributing.html', 'contributing.bugs.html', 'contributing.wishlist.html'), array_values($doc->getHeaderLinks()));
     $this->assertEquals(array('contributing.bugs.html', 'contributing.wishlist.html', 'developers.documentation.html', 'faq.translators-revision-tracking.html', 'index.html', 'contributing.html'), array_values($doc->getLinks()));
 }
Example #6
0
 /**
  * Covers loading an HTML string, highlighting a term in it, and extracting
  * header links and body links from an HTML file.
  */
 public function testHtml()
 {
     $doc = Zend_Search_Lucene_Document_Html::loadHTML('<HTML><HEAD><TITLE>Page title</TITLE></HEAD><BODY>Document body.</BODY></HTML>');
     $this->assertTrue($doc instanceof Zend_Search_Lucene_Document_Html);
     $doc->highlight('document', '#66ffff');
     // Only the highlighted fragment is checked; the surrounding markup is not compared.
     $this->assertTrue(
         strpos($doc->getHTML(), "<b style=\"color:black;background-color:#66ffff\">Document</b> body.") !== false,
         'Highlighted fragment not found in generated HTML'
     );
     $doc = Zend_Search_Lucene_Document_Html::loadHTMLFile(dirname(__FILE__) . '/_indexSource/_files/contributing.documentation.html', true);
     $this->assertTrue($doc instanceof Zend_Search_Lucene_Document_Html);
     // assertEquals instead of assertTrue(a == b): same comparison, but a failure shows which links differ.
     $this->assertEquals(array('index.html', 'contributing.html', 'contributing.bugs.html', 'contributing.wishlist.html'), array_values($doc->getHeaderLinks()));
     $this->assertEquals(array('contributing.bugs.html', 'contributing.wishlist.html', 'developers.documentation.html', 'faq.translators-revision-tracking.html', 'index.html', 'contributing.html'), array_values($doc->getLinks()));
 }
 /**
  * Extracts the body text of an HTML file.
  *
  * An empty string is returned when the file does not exist or when the
  * HTML loader throws while reading it.
  *
  * @param   String  $filename   Full filesystem path to the file to process.
  * @return  String  The document's "body" field, or '' on failure.
  */
 public static function extract($filename)
 {
     $parsed = null;
     $wasLoaded = false;

     if (file_exists($filename)) {
         try {
             $parsed = Zend_Search_Lucene_Document_Html::loadHTMLFile($filename, true);
             $wasLoaded = true;
         } catch (Exception $e) {
             $wasLoaded = false;
         }
     }

     // The field is read outside the try block, so an error raised while
     // reading it is not swallowed.
     return $wasLoaded ? $parsed->body : '';
 }
Example #8
0
 /**
  * Demo action: indexes a bundled HTML file and re-renders the index page
  * with a confirmation message.
  */
 public function addurl()
 {
     // A local example file stands in for a remote page in this demo.
     $relativeParts = array("kosearch", "examples", "kohana_home.html");
     $filename = MODPATH . implode(DIRECTORY_SEPARATOR, $relativeParts);

     // The Zend classes are loaded by the Search class, so the libraries must
     // be loaded explicitly before Zend_Search_Lucene_Document_Html is used.
     Search::instance()->load_search_libs();

     $htmlDocument = Zend_Search_Lucene_Document_Html::loadHTMLFile($filename, TRUE, "utf-8");
     Search::instance()->addDocument($htmlDocument);

     $confirmation = 'Kohana page successfully added &darr;&nbsp;<a href="#form2" title="scroll down">scroll down</a>&nbsp;&darr;';
     $this->index($confirmation);
 }
Example #9
0
 /**
  * Checks that rel="nofollow" links are returned by getLinks() only while
  * the "exclude nofollow links" switch is off.
  *
  * The switch is static, so the value found on entry is restored on exit
  * (also when an assertion fails) to keep other tests unaffected.
  */
 public function testHtmlNoFollowLinks()
 {
     $html = '<HTML>' . '<HEAD><TITLE>Page title</TITLE></HEAD>' . '<BODY>' . 'Document body.' . '<a href="link1.html">Link 1</a>.' . '<a href="link2.html" rel="nofollow">Link 1</a>.' . '</BODY>' . '</HTML>';
     $oldNoFollowValue = Zend_Search_Lucene_Document_Html::getExcludeNoFollowLinks();
     try {
         Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks(false);
         $doc1 = Zend_Search_Lucene_Document_Html::loadHTML($html);
         $this->assertTrue($doc1 instanceof Zend_Search_Lucene_Document_Html);
         $this->assertTrue(array_values($doc1->getLinks()) == array('link1.html', 'link2.html'));
         Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks(true);
         $doc2 = Zend_Search_Lucene_Document_Html::loadHTML($html);
         $this->assertTrue($doc2 instanceof Zend_Search_Lucene_Document_Html);
         $this->assertTrue(array_values($doc2->getLinks()) == array('link1.html'));
     } catch (Exception $e) {
         // catch + rethrow rather than "finally" so the test still runs on PHP < 5.5.
         Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks($oldNoFollowValue);
         throw $e;
     }
     Zend_Search_Lucene_Document_Html::setExcludeNoFollowLinks($oldNoFollowValue);
 }
Example #10
0
 /**
  * Index a file.
  *
  * Builds a Lucene document for the file (an HTML document for text/html,
  * a plain document for everything else), attaches the lookup fields and
  * hands it to Lucene::updateFile().
  *
  * @author Jörn Dreyer <*****@*****.**>
  *
  * @param string      $path the path of the file
  * @param string|null $user user whose files view is used; when null the
  *                          current filesystem view and user are used
  *
  * @return bool true when the document was handed to the index, false when
  *              the path is invalid or empty, no view is available, or the
  *              file info carries no mimetype
  */
 public static function indexFile($path = '', $user = null)
 {
     if (!Filesystem::isValidPath($path)) {
         // Was a bare "return;" (null); false matches the documented bool contract.
         return false;
     }
     if ($path === '') {
         //ignore the empty path element
         return false;
     }
     if (is_null($user)) {
         $view = Filesystem::getView();
         $user = \OCP\User::getUser();
     } else {
         $view = new \OC\Files\View('/' . $user . '/files');
     }
     if (!$view) {
         Util::writeLog('search_lucene', 'could not resolve filesystem view', Util::WARN);
         return false;
     }
     $root = $view->getRoot();
     $pk = md5($root . $path);
     // the cache already knows mime and other basic stuff
     $data = $view->getFileInfo($path);
     if (!isset($data['mimetype'])) {
         Util::writeLog('search_lucene', 'need mimetype for content extraction', Util::ERROR);
         return false;
     }
     $mimetype = $data['mimetype'];
     if ('text/html' === $mimetype) {
         $doc = \Zend_Search_Lucene_Document_Html::loadHTML($view->file_get_contents($path));
     } else {
         // FIXME application/msword gets no special treatment yet: the Docx
         // loader uses ZipArchive and is not compatible with OC\Files\Filesystem.
         $doc = new \Zend_Search_Lucene_Document();
     }
     // store fscacheid as unique id to lookup by when deleting
     $doc->addField(\Zend_Search_Lucene_Field::Keyword('pk', $pk));
     // Store document URL to identify it in the search results
     $doc->addField(\Zend_Search_Lucene_Field::Text('path', $path));
     $doc->addField(\Zend_Search_Lucene_Field::unIndexed('size', $data['size']));
     $doc->addField(\Zend_Search_Lucene_Field::unIndexed('mimetype', $mimetype));
     self::extractMetadata($doc, $path, $view, $mimetype);
     Lucene::updateFile($doc, $path, $user);
     return true;
 }
Example #11
0
 /**
  * Rebuilds the search index of a website from the content of all its pages.
  *
  * An existing index is opened and emptied first; otherwise a new one is
  * created. Nothing is touched when the website has no pages.
  *
  * @param  string $websiteId
  * @return string Index location as returned by getIndexFileForWebsite()
  * @throws CmsException when the website does not exist
  */
 public function indexWebsite($websiteId)
 {
     $websiteService = new Website('Website');
     if (!$websiteService->existsWebsiteAlready($websiteId)) {
         throw new CmsException('602', __METHOD__, __LINE__);
     }
     // The business layer must be used for rendering.
     // NOTE(review): the next two objects are never used in this method; they
     // are kept in case their constructors have side effects - confirm and remove.
     $renderBusiness = new BusinessRender('Render');
     $modulService = new Modul('Modul');
     $pageService = new Page('Page');
     $allPageIds = $pageService->getIdsByWebsiteId($websiteId);
     $indexFileOfWebsite = $this->getIndexFileForWebsite($websiteId);
     if (is_array($allPageIds) && count($allPageIds) > 0) {
         if (file_exists($indexFileOfWebsite)) {
             $index = \Zend_Search_Lucene::open($indexFileOfWebsite);
             // Document ids run from 0 to count() - 1. numDocs() excludes
             // already-deleted documents, so using it as the upper bound would
             // leave the highest ids in the index whenever deletions exist.
             $indexSize = $index->count();
             for ($id = 0; $id < $indexSize; ++$id) {
                 if (!$index->isDeleted($id)) {
                     $index->delete($id);
                 }
             }
         } else {
             $index = \Zend_Search_Lucene::create($indexFileOfWebsite);
         }
         // Evaluated once: decides whether the page body is stored in the index.
         $storeContent = $this->isStoreContentEnabled() ? true : false;
         foreach ($allPageIds as $pageId) {
             $pageContent = $this->getPageContent($websiteId, $pageId);
             $document = \Zend_Search_Lucene_Document_Html::loadHTML($pageContent, $storeContent, 'UTF-8');
             $document->addField(\Zend_Search_Lucene_Field::unIndexed('md5', md5($pageContent)));
             $document->addField(\Zend_Search_Lucene_Field::unIndexed('pageId', $pageId));
             $index->addDocument($document);
         }
         $index->commit();
         $index->optimize();
         unset($index);
     }
     return $indexFileOfWebsite;
 }
Example #12
0
 /**
  * Highlights every body term that matches this query's wildcard pattern.
  *
  * The pattern text is turned into an anchored regular expression in which
  * "?" becomes "." and "*" becomes ".*"; each token of the document's
  * "body" field is tested against it and the matching terms are highlighted.
  *
  * @param Zend_Search_Lucene_Document_Html $doc
  * @param integer &$colorIndex
  */
 public function highlightMatchesDOM(Zend_Search_Lucene_Document_Html $doc, &$colorIndex)
 {
     /** @todo implementation */
     $quotedPattern = preg_quote($this->_pattern->text, '/');
     $regexBody = str_replace(array('\\?', '\\*'), array('.', '.*'), $quotedPattern);
     $matchExpression = '/^' . $regexBody . '$/';

     // Append the Unicode modifier only when this PCRE build supports it.
     $unicodeAvailable = @preg_match('/\\pL/u', 'a') == 1;
     if ($unicodeAvailable) {
         $matchExpression .= 'u';
     }

     $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
     $bodyTokens = $analyzer->tokenize($doc->getFieldUtf8Value('body'), 'UTF-8');

     $matchedTerms = array();
     foreach ($bodyTokens as $bodyToken) {
         $termText = $bodyToken->getTermText();
         if (preg_match($matchExpression, $termText) === 1) {
             $matchedTerms[] = $termText;
         }
     }

     $doc->highlight($matchedTerms, $this->_getHighlightColor($colorIndex));
 }
 /**
  * The title must still be extracted when the root HTML tag carries attributes.
  *
  * @group ZF-10686
  */
 public function testLoadHtmlWithAttributesInTagHTML()
 {
     $markup = '<HTML lang="en_US"><HEAD><TITLE>Page title</TITLE></HEAD><BODY>Document body.</BODY></HTML>';
     $loaded = Zend_Search_Lucene_Document_Html::loadHTML($markup);

     $actualTitle = $loaded->title;
     $this->assertEquals('Page title ', $actualTitle);
 }
Example #14
0
 /**
  * Index a file.
  *
  * Content is extracted only when the view can provide a local copy of the
  * file; otherwise just the lookup fields (pk, filename, path, size,
  * mimetype) are indexed. The extractor is chosen by mime type first
  * (text/plain, text/html, application/pdf) and by file extension second
  * (.docx, .xlsx, .pptx, .odt, .ods).
  *
  * @author Jörn Dreyer <*****@*****.**>
  *
  * @param string      $path the path of the file
  * @param string|null $user user whose files view is used; when null the
  *                          current filesystem view and user are used
  *
  * @return bool true when the file was handed to the index or has vanished,
  *              false when the path is invalid or empty, no view is
  *              available, or the file info carries no mimetype
  */
 public static function indexFile($path = '', $user = null)
 {
     if (!Filesystem::isValidPath($path)) {
         // Was a bare "return;" (null); false matches the documented bool contract.
         return false;
     }
     if ($path === '') {
         //ignore the empty path element
         return false;
     }
     if (is_null($user)) {
         $view = Filesystem::getView();
         $user = \OCP\User::getUser();
     } else {
         $view = new \OC\Files\View('/' . $user . '/files');
     }
     if (!$view) {
         Util::writeLog('search_lucene', 'could not resolve filesystem view', Util::WARN);
         return false;
     }
     if (!$view->file_exists($path)) {
         Util::writeLog('search_lucene', 'file vanished, ignoring', Util::DEBUG);
         return true;
     }
     $root = $view->getRoot();
     $pk = md5($root . $path);
     // the cache already knows mime and other basic stuff
     $data = $view->getFileInfo($path);
     if (!isset($data['mimetype'])) {
         Util::writeLog('search_lucene', 'need mimetype for content extraction', Util::ERROR);
         return false;
     }
     $mimeType = $data['mimetype'];
     // initialize plain lucene document
     $doc = new \Zend_Search_Lucene_Document();
     // index content for local files only
     $localFile = $view->getLocalFile($path);
     if ($localFile) {
         $lowerName = strtolower($data['name']);
         //try to use special lucene document types
         if ('text/plain' === $mimeType) {
             $body = $view->file_get_contents($path);
             // Loose comparison on purpose: skips both '' and a failed read (false).
             if ($body != '') {
                 $doc->addField(\Zend_Search_Lucene_Field::UnStored('body', $body));
             }
         } elseif ('text/html' === $mimeType) {
             //TODO could be indexed, even if not local
             $doc = \Zend_Search_Lucene_Document_Html::loadHTML($view->file_get_contents($path));
         } elseif ('application/pdf' === $mimeType) {
             $doc = Pdf::loadPdf($view->file_get_contents($path));
         // Office formats are detected by extension rather than mime type, as
         // the zend classes only understand docx and not doc files.
         // FIXME distinguish doc and docx, xls and xlsx, ppt and pptx, in oc core mimetype helper ...
         } elseif (substr($lowerName, -5) === '.docx') {
             $doc = \Zend_Search_Lucene_Document_Docx::loadDocxFile($localFile);
         } elseif (substr($lowerName, -5) === '.xlsx') {
             $doc = \Zend_Search_Lucene_Document_Xlsx::loadXlsxFile($localFile);
         } elseif (substr($lowerName, -5) === '.pptx') {
             $doc = \Zend_Search_Lucene_Document_Pptx::loadPptxFile($localFile);
         } elseif (substr($lowerName, -4) === '.odt') {
             $doc = Odt::loadOdtFile($localFile);
         } elseif (substr($lowerName, -4) === '.ods') {
             $doc = Ods::loadOdsFile($localFile);
         }
     }
     // Store filecache id as unique id to lookup by when deleting
     $doc->addField(\Zend_Search_Lucene_Field::Keyword('pk', $pk));
     // Store filename
     $doc->addField(\Zend_Search_Lucene_Field::Text('filename', $data['name'], 'UTF-8'));
     // Store document path to identify it in the search results
     $doc->addField(\Zend_Search_Lucene_Field::Text('path', $path, 'UTF-8'));
     $doc->addField(\Zend_Search_Lucene_Field::unIndexed('size', $data['size']));
     $doc->addField(\Zend_Search_Lucene_Field::unIndexed('mimetype', $mimeType));
     //self::extractMetadata($doc, $path, $view, $mimeType);
     Lucene::updateFile($doc, $path, $user);
     return true;
 }
Example #15
0
 /**
  * Extracts the plain text of an uploaded file attached to a catalog item.
  *
  * The file is looked up under ROOT_DIR/uploads/files/, first directly and
  * then inside the directory named after the related catalog GUID. PDF and
  * Word files are converted by the configured external extractors; HTML and
  * plain-text files are read through the Lucene HTML document loader.
  *
  * @param string $guid       GUID of the item the file belongs to
  * @param string $systemName stored file name; used instead of $fileName when not empty
  * @param string $fileName   original file name
  * @param string $mimeType   mime type deciding which extractor is used
  * @param string $lang       passed to getDbHandler()
  * @return string|null extracted text, '' when nothing could be extracted, or
  *                     null when no related row or no file on disk was found
  */
 private function _extractText($guid, $systemName, $fileName, $mimeType, $lang = 'id')
 {
     // NOTE(review): $guid is interpolated into the SQL text. If it can ever
     // carry untrusted input, switch to the DB handler's quoting/binding API.
     $query = "SELECT * FROM KutuRelatedItem where itemGuid='{$guid}' AND relateAs='RELATED_FILE'";
     $results = $this->getDbHandler($lang)->query($query);
     $rowset = $results->fetchAll(PDO::FETCH_OBJ);
     if (!count($rowset)) {
         return;
     }

     $parentCatalogGuid = $rowset[0]->relatedGuid;
     if (!empty($systemName)) {
         $fileName = $systemName;
     }

     $uploadDir = ROOT_DIR . DIRECTORY_SEPARATOR . 'uploads' . DIRECTORY_SEPARATOR . 'files' . DIRECTORY_SEPARATOR;
     $sDir1 = $uploadDir . $fileName;
     $sDir2 = $uploadDir . $parentCatalogGuid . DIRECTORY_SEPARATOR . $fileName;
     if (file_exists($sDir1)) {
         $sDir = $sDir1;
     } elseif (file_exists($sDir2)) {
         $sDir = $sDir2;
     } else {
         return;
     }

     // The external tools write their output next to the source file.
     $outpath = $sDir . '.txt';
     // Paths are escaped so that spaces or shell metacharacters in an uploaded
     // file name cannot break (or inject into) the command line.
     $sourceArg = escapeshellarg($sDir);
     $outputArg = escapeshellarg($outpath);

     switch ($mimeType) {
         case 'application/pdf':
             $pdfExtractor = $this->_pdfExtractor;
             system("{$pdfExtractor} " . $sourceArg . ' ' . $outputArg, $ret);
             if ($ret == 0) {
                 $value = file_get_contents($outpath);
                 unlink($outpath);
                 echo 'content PDF: ' . $sDir . ' ' . strlen($value) . "\n";
                 if (strlen($value) > 20) {
                     return (new Pandamp_Utility_Posts())->sanitize_post_content($value);
                 }
                 echo "content file kosong\n";
                 return '';
             }
             if ($ret == 127) {
                 print "Could not find pdftotext tool.\n";
             } elseif ($ret == 1) {
                 // Previously unreachable: it sat behind an unconditional return.
                 print "Could not find pdf file.\n";
             }
             return '';

         case 'text/html':
         case 'text/plain':
             $docHtml = Zend_Search_Lucene_Document_Html::loadHTMLFile($sDir);
             return $docHtml->getFieldValue('body');

         case 'application/x-javascript':
         case 'application/octet-stream':
         case 'application/msword':
             // Truthiness test kept as-is: a name that starts with ".doc"
             // (position 0) is treated like a name without ".doc".
             if (!strpos(strtolower($fileName), '.doc')) {
                 return '';
             }
             $extractor = $this->_wordExtractor;
             system("{$extractor} -m cp850.txt " . $sourceArg . ' > ' . $outputArg, $ret);
             if ($ret == 0) {
                 $value = file_get_contents($outpath);
                 unlink($outpath);
                 return $value === false ? '' : $value;
             }
             // The shell redirect creates the output file even when the tool
             // fails; do not leave it behind in the upload directory.
             if (file_exists($outpath)) {
                 unlink($outpath);
             }
             if ($ret == 127 || $ret == 1) {
                 // 127: extractor binary not found, 1: input file not found.
                 return '';
             }
             break;

         default:
             return '';
     }
     return;
 }
Example #16
0
 /**
  * Highlights the query matches inside an HTML fragment.
  *
  * The fragment is converted to UTF-8, wrapped into a minimal HTML page,
  * highlighted, and returned through the document's getHtmlBody().
  *
  * @param string $inputHtmlFragment
  * @param string  $encoding   Input HTML string encoding
  * @param Zend_Search_Lucene_Search_Highlighter_Interface|null $highlighter
  *        Highlighter to use; the default highlighter is created when null.
  * @return string
  */
 public function htmlFragmentHighlightMatches($inputHtmlFragment, $encoding = 'UTF-8', $highlighter = null)
 {
     // require_once 'Zend/Search/Lucene/Search/Highlighter/Default.php';
     $activeHighlighter = $highlighter === null
         ? new Zend_Search_Lucene_Search_Highlighter_Default()
         : $highlighter;

     $pageHead = '<html><head><META HTTP-EQUIV="Content-type" CONTENT="text/html; charset=UTF-8"/></head><body>';
     $pageTail = '</body></html>';
     $utf8Fragment = iconv($encoding, 'UTF-8//IGNORE', $inputHtmlFragment);
     $wrappedHtml = $pageHead . $utf8Fragment . $pageTail;

     /** Zend_Search_Lucene_Document_Html */
     // require_once 'Zend/Search/Lucene/Document/Html.php';
     $document = Zend_Search_Lucene_Document_Html::loadHTML($wrappedHtml);
     $activeHighlighter->setDocument($document);
     $this->_highlightMatches($activeHighlighter);

     return $document->getHtmlBody();
 }
Example #17
0
 /**
  * Highlights this query's term text in the given document.
  *
  * @param Zend_Search_Lucene_Document_Html $doc
  * @param integer &$colorIndex
  */
 public function highlightMatchesDOM(Zend_Search_Lucene_Document_Html $doc, &$colorIndex)
 {
     $termText = $this->_term->text;
     $highlightColor = $this->_getHighlightColor($colorIndex);
     $doc->highlight($termText, $highlightColor);
 }
 /**
  * Returns the HTML of a successful response after passing it through the
  * Lucene HTML document loader.
  *
  * @access private
  * @param Zend_Response $response
  * @return String|false Result of getHTML() for a 200 response, false for
  *                      any other status.
  */
 private function _getHTMLResponse($response)
 {
     if (200 !== $response->getStatus()) {
         return false;
     }

     $parsed = Zend_Search_Lucene_Document_Html::loadHTML($response->getBody());
     return $parsed->getHTML();
 }
Example #19
0
 /**
  * Indexes an article.
  *
  * When the article's fulltext (or, failing that, its introtext) starts
  * with an XML declaration, it is simplified through the jucene.xsl
  * stylesheet and the "onIndexPmml" content-plugin event is triggered once
  * per AssociationRule element of the result. Any other article is indexed
  * as an HTML document built from its fulltext.
  *
  * @param object $article article whose id, fulltext and introtext are read
  * @param bool   $isNew   when false, the existing "pk:<id>" entry is removed first
  */
 function onIndexContent($article, $isNew = false)
 {
     //FIXME move the content type tests and following transformations to the helper
     $pk = $article->id;
     if (!$isNew) {
         JuceneHelper::removeFromIndex('pk:' . $pk);
     }
     $index = JuceneHelper::getIndex();
     $xml_field = substr($article->fulltext, 0, 5) != '<?xml' ? $article->introtext : $article->fulltext;

     if (substr($xml_field, 0, 5) != '<?xml') {
         // The encoding must be the string 'UTF-8'. The former "UTF - 8" was
         // parsed as (constant UTF) minus 8.
         $zendDoc = Zend_Search_Lucene_Document_Html::loadHTML($article->fulltext, false, 'UTF-8');
         $index->addDocument($zendDoc);
         return;
     }

     $dom = new DOMDocument();
     $xslt = new DOMDocument();
     $error = false;
     //load xslt stylesheet
     if (!@$xslt->load(JPATH_SITE . DS . 'administrator' . DS . 'components' . DS . 'com_jucene' . DS . 'xslt/jucene.xsl')) {
         $error = true;
         $this->raiseMessage("XSLTLOADERROR", 'error');
     }
     $proc = new XSLTProcessor();
     if (!$proc->importStylesheet($xslt)) {
         $error = true;
         $this->raiseMessage("XSLTIMPORTERROR", 'error');
     }
     // Two unset() calls used to sit here. They targeted a misspelled and an
     // undefined variable and so never did anything; they were dropped rather
     // than "fixed", because clearing the caller's article would be a behaviour change.

     if (!$dom->loadXML($xml_field) || $error) {
         return;
     }

     //simplify the document - prepare it for the indexation process
     $xslOutput = $proc->transformToXml($dom);
     //create new DOM document to preserve output and transform the XML to the indexable one
     $transXml = new DOMDocument();
     // DOMDocument property names are case-sensitive: "preserveWhitespace"
     // only created an unrelated dynamic property.
     $transXml->preserveWhiteSpace = false;
     @$transXml->loadXML($xslOutput);
     //release the intermediate documents
     unset($xslOutput, $dom, $xslt);

     //index every assoc rule as document with same credentials
     $rules = $transXml->getElementsByTagName("AssociationRule");
     if ($rules->length == 0) {
         $this->raiseMessage('XMLDOCUMENTNORULES', 'error');
         return;
     }

     // Loop-invariant set-up, done once instead of once per rule.
     JPluginHelper::importPlugin('content');
     $dispatcher = JDispatcher::getInstance();
     $rule_doc_position = 0;
     foreach ($rules as $rule) {
         $additional = array(
             'rating' => 0,
             'position' => $rule_doc_position,
         );
         $dispatcher->trigger('onIndexPmml', array($rule, $additional));
         $rule_doc_position++;
     }
 }
 /**
  * Builds the Lucene document(s) for a node and adds them to the index.
  *
  * A "shared" document is always created. When content parsing is enabled
  * and the node is not larger than PARSE_CONTENT_MAX_SIZE, it is loaded
  * through the HTML/Docx/Pptx/Xlsx loader matching the file extension. A
  * second "user"-scoped document is added when the node has user-indexable
  * metadata keys and a user is logged in.
  *
  * @param AJXP_Node $ajxpNode
  * @param Zend_Search_Lucene_Interface $index
  * @throws Exception when the content loader returns no document
  * @return Zend_Search_Lucene_Document the shared document
  */
 public function createIndexedDocument($ajxpNode, &$index)
 {
     if (!empty($this->metaFields)) {
         $ajxpNode->loadNodeInfo(false, false, "all");
     } else {
         $ajxpNode->loadNodeInfo();
     }
     $ext = strtolower(pathinfo($ajxpNode->getLabel(), PATHINFO_EXTENSION));
     $parseContent = $this->indexContent;
     if ($parseContent && $ajxpNode->bytesize > $this->getFilteredOption("PARSE_CONTENT_MAX_SIZE")) {
         $parseContent = false;
     }
     if ($parseContent && in_array($ext, explode(",", $this->getFilteredOption("PARSE_CONTENT_HTML")))) {
         $doc = @Zend_Search_Lucene_Document_Html::loadHTMLFile($ajxpNode->getUrl());
     } elseif ($parseContent && $ext == "docx" && class_exists("Zend_Search_Lucene_Document_Docx")) {
         $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
         $doc = @Zend_Search_Lucene_Document_Docx::loadDocxFile($realFile);
     } elseif ($parseContent && $ext == "pptx" && class_exists("Zend_Search_Lucene_Document_Pptx")) {
         // This branch used to test "docx" again, so .pptx files never reached the Pptx loader.
         $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
         $doc = @Zend_Search_Lucene_Document_Pptx::loadPptxFile($realFile);
     } elseif ($parseContent && $ext == "xlsx" && class_exists("Zend_Search_Lucene_Document_Xlsx")) {
         $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
         $doc = @Zend_Search_Lucene_Document_Xlsx::loadXlsxFile($realFile);
     } else {
         $doc = new Zend_Search_Lucene_Document();
     }
     // Loose comparison on purpose: also catches a loader that returned false.
     if ($doc == null) {
         throw new Exception("Could not load document");
     }

     // The encoding belongs to the field factory. It used to be passed as an
     // extra argument to addField(), where it was silently ignored (the
     // private document below already passed it correctly).
     $encoding = SystemTextEncoding::getEncoding();
     $doc->addField(Zend_Search_Lucene_Field::Keyword("node_url", $ajxpNode->getUrl(), $encoding));
     $doc->addField(Zend_Search_Lucene_Field::Keyword("node_path", str_replace("/", "AJXPFAKESEP", $ajxpNode->getPath()), $encoding));
     $doc->addField(Zend_Search_Lucene_Field::Text("basename", basename($ajxpNode->getPath()), $encoding));
     $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_node", "yes", $encoding));
     $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_scope", "shared"));
     $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_modiftime", date("Ymd", $ajxpNode->ajxp_modiftime)));
     $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_bytesize", $ajxpNode->bytesize));
     $ajxpMime = $ajxpNode->ajxp_mime;
     if (empty($ajxpMime)) {
         // No mime information on the node: fall back to the file extension.
         $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_mime", pathinfo($ajxpNode->getLabel(), PATHINFO_EXTENSION)));
     } else {
         $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_mime", $ajxpNode->ajxp_mime));
     }
     // Store a cached copy of the metadata
     $serializedMeta = base64_encode(serialize($ajxpNode->metadata));
     $doc->addField(Zend_Search_Lucene_Field::Binary("serialized_metadata", $serializedMeta));
     if (isset($ajxpNode->indexableMetaKeys["shared"])) {
         foreach ($ajxpNode->indexableMetaKeys["shared"] as $sharedField) {
             if ($ajxpNode->{$sharedField}) {
                 $doc->addField(Zend_Search_Lucene_Field::Keyword($sharedField, $ajxpNode->{$sharedField}));
             }
         }
     }
     foreach ($this->metaFields as $field) {
         if ($ajxpNode->{$field} != null) {
             $doc->addField(Zend_Search_Lucene_Field::Text("ajxp_meta_{$field}", $ajxpNode->{$field}, $encoding));
         }
     }
     if (isset($ajxpNode->indexableMetaKeys["user"]) && count($ajxpNode->indexableMetaKeys["user"]) && AuthService::usersEnabled() && AuthService::getLoggedUser() != null) {
         // Per-user metadata goes into its own document, tagged with the user id.
         $privateDoc = new Zend_Search_Lucene_Document();
         $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("node_url", $ajxpNode->getUrl(), $encoding));
         $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("node_path", str_replace("/", "AJXPFAKESEP", $ajxpNode->getPath()), $encoding));
         $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_scope", "user"));
         $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_user", AuthService::getLoggedUser()->getId()));
         foreach ($ajxpNode->indexableMetaKeys["user"] as $userField) {
             if ($ajxpNode->{$userField}) {
                 $privateDoc->addField(Zend_Search_Lucene_Field::Keyword($userField, $ajxpNode->{$userField}));
             }
         }
         $privateDoc->addField(Zend_Search_Lucene_Field::Binary("serialized_metadata", $serializedMeta));
         $index->addDocument($privateDoc);
     }
     if ($parseContent) {
         $body = $this->extractIndexableContent($ajxpNode);
         if (!empty($body)) {
             $doc->addField(Zend_Search_Lucene_Field::unStored("body", $body));
         }
     }
     $index->addDocument($doc);
     return $doc;
 }
Example #21
0
/**
 * Rebuilds the website search index.
 *
 * A Zend_Search_Lucene index is instantiated on the folder named by the
 * 'search_indexes_folder' setting (second constructor argument true). Then:
 *  - every file returned by get_all_html_files() that passes
 *    can_index_html_file() is loaded as an HTML document and added, and
 *  - every URL returned by get_dynamic_urls() whose first response status
 *    line contains "200" is fetched and added.
 *
 * The result is reported through globals: $success_msg on success,
 * $warning_msg (an HTML list of skipped URLs) when some dynamic URLs could
 * not be fetched, or $error_msg on failure.
 *
 * @return void
 */
function rebuild_search_indexes()
{
    global $success_msg;
    global $error_msg;
    global $warning_msg;
    global $all_settings;
    $index_folder = get_setting('search_indexes_folder', $all_settings);
    try {
        $index = new Zend_Search_Lucene($index_folder, true);
        setlocale(LC_CTYPE, 'en_US');
        foreach (get_all_html_files(dirname(__FILE__)) as $html_file => $html_url) {
            if (!can_index_html_file($html_file)) {
                continue;
            }
            $file_content = file_get_contents($html_file);
            if ($file_content === false) {
                // Unreadable file: skip it rather than index an empty document.
                continue;
            }
            // Drop everything before the <head> tag and re-open the document.
            $file_content = '<html>' . strstr($file_content, '<head');
            $doc = Zend_Search_Lucene_Document_Html::loadHTML($file_content);
            $doc->addField(Zend_Search_Lucene_Field::Text('url', $html_url, 'UTF-8'));
            $index->addDocument($doc);
            flush();
        }
        $broken_urls = array();
        foreach (get_dynamic_urls(get_setting('search_dynamic_pages', $all_settings)) as $url) {
            // get_headers() returns false when the request fails; such a URL is
            // reported as broken instead of being dereferenced as an array.
            $headers = get_headers($url);
            $is_ok = is_array($headers) && isset($headers[0]) && strrpos($headers[0], '200');
            $content = $is_ok ? file_get_contents($url) : false;
            if ($content !== false) {
                $content = '<html>' . strstr($content, '<head');
                $doc = Zend_Search_Lucene_Document_Html::loadHTML($content);
                $doc->addField(Zend_Search_Lucene_Field::Text('url', $url, 'UTF-8'));
                $index->addDocument($doc);
                flush();
            } else {
                array_push($broken_urls, $url);
            }
        }
        if (file_exists($index_folder)) {
            if (count($broken_urls) > 0) {
                $warning_msg = '<p>The website was successfully indexed, but the following URL\'s were skipped because they are broken:</p>';
                $warning_msg .= '<ul class="disc">';
                foreach ($broken_urls as $broken_url) {
                    // The URL comes from a settings field and is emitted as HTML,
                    // so it must be escaped for both attribute and text context.
                    $safe_url = htmlspecialchars($broken_url, ENT_QUOTES, 'UTF-8');
                    $warning_msg .= '<li><a href="' . $safe_url . '">' . $safe_url . '</a></li>';
                }
                $warning_msg .= '</ul>';
                $warning_msg .= '<p>Please remove them from the "List of dynamic pages" field.</p>';
            } else {
                $success_msg = 'The website was successfully indexed.';
            }
        } else {
            $error_msg = 'An error occurred during the website indexing. The error message is: the folder that stores the website indexes couldn\'t be created';
        }
    } catch (Exception $e) {
        $error_msg = 'An error occurred during the website indexing. The error message is: ' . $e->getMessage();
    }
}
Example #22
0
/**
 * Rebuilds the website search index using the UTF-8 case-insensitive analyzer.
 *
 * A Zend_Search_Lucene index is instantiated on the folder named by the
 * 'search_indexes_folder' setting (second constructor argument true). Then:
 *  - files returned by get_website_files($indexable_folders) that pass
 *    can_index_website_file() are indexed: .html/.htm files are read from
 *    disk, .php files are fetched through their page URL when
 *    is_http_code_200() accepts it; any other file is skipped, and
 *  - every URL returned by get_dynamic_urls() that is_http_code_200()
 *    accepts is fetched and indexed.
 *
 * The result is reported through globals: $success_msg on success,
 * $warning_msg (an HTML list of skipped URLs) when some dynamic URLs could
 * not be fetched, or $error_msg on failure.
 *
 * @return void
 */
function rebuild_search_indexes()
{
    global $success_msg;
    global $error_msg;
    global $warning_msg;
    global $all_settings;
    global $indexable_folders;
    $index_folder = get_setting('search_indexes_folder', $all_settings);
    try {
        setlocale(LC_CTYPE, LOCALE);
        Zend_Search_Lucene_Analysis_Analyzer::setDefault(new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8_CaseInsensitive());
        $index = new Zend_Search_Lucene($index_folder, true);
        $files_to_index = get_website_files($indexable_folders);
        foreach ($files_to_index as $html_file => $page_url) {
            // Reset for every file. Without this, a file that yields no content
            // (PHP page that is not HTTP 200, or an unsupported extension) would
            // be indexed with the previous file's content under its own URL.
            $file_content = null;
            if (can_index_website_file($html_file)) {
                $f1 = strtolower($html_file);
                if (end_with($f1, 'html') || end_with($f1, 'htm')) {
                    $file_content = file_get_contents($html_file);
                } elseif (end_with($f1, 'php')) {
                    if (is_http_code_200($page_url)) {
                        $file_content = get_url_content($page_url);
                    }
                }
                // false means the read failed; do not index an empty document.
                if ($file_content !== null && $file_content !== false) {
                    // Drop everything before the <head> tag and re-open the document.
                    $file_content = '<html>' . strstr($file_content, '<head');
                    $doc = Zend_Search_Lucene_Document_Html::loadHTML($file_content, true, 'UTF-8');
                    $doc->addField(Zend_Search_Lucene_Field::Text('url', $page_url, 'UTF-8'));
                    $index->addDocument($doc);
                    flush();
                }
            }
        }
        $broken_urls = array();
        foreach (get_dynamic_urls(get_setting('search_dynamic_pages', $all_settings)) as $url) {
            if (is_http_code_200($url)) {
                $content = get_url_content($url);
                $content = '<html>' . strstr($content, '<head');
                $doc = Zend_Search_Lucene_Document_Html::loadHTML($content, true, 'UTF-8');
                $doc->addField(Zend_Search_Lucene_Field::Text('url', $url, 'UTF-8'));
                $index->addDocument($doc);
                flush();
            } else {
                array_push($broken_urls, $url);
            }
        }
        if (file_exists($index_folder)) {
            if (count($broken_urls) > 0) {
                $warning_msg = '<p>The website was successfully indexed, but the following URL\'s were skipped because they are broken:</p>';
                $warning_msg .= '<ul class="disc">';
                foreach ($broken_urls as $broken_url) {
                    // The URL comes from a settings field and is emitted as HTML,
                    // so it must be escaped for both attribute and text context.
                    $safe_url = htmlspecialchars($broken_url, ENT_QUOTES, 'UTF-8');
                    $warning_msg .= '<li><a href="' . $safe_url . '">' . $safe_url . '</a></li>';
                }
                $warning_msg .= '</ul>';
                $warning_msg .= '<p>Please remove them from the "List of dynamic pages" field.</p>';
            } else {
                $success_msg = 'The website was successfully indexed.';
            }
        } else {
            $error_msg = 'An error occurred during the website indexing. The error message is: the folder that stores the website indexes couldn\'t be created';
        }
    } catch (Exception $e) {
        $error_msg = 'An error occurred during the website indexing. The error message is: ' . $e->getMessage();
    }
}
Example #23
0
 /**
  * Set the "exclude nofollow links" flag.
  *
  * Stores the given value in the class-level static property
  * $_excludeNoFollowLinks, so it applies to the class as a whole rather than
  * to a single instance. How the flag is consumed is not visible here;
  * presumably it controls whether rel="nofollow" links are skipped when links
  * are collected from a document (to be confirmed against the reader code).
  *
  * @param boolean $newValue New value of the flag; stored as given, without casting.
  */
 public static function setExcludeNoFollowLinks($newValue)
 {
     self::$_excludeNoFollowLinks = $newValue;
 }
Example #24
0
 /**
  * Crawls pages breadth-first starting at $url and hands each loaded page to the indexer.
  *
  * Each URL taken from the queue is passed through _sanitizeUrl(); empty results
  * and URLs already visited during this call are skipped. A page is loaded from
  * the cached HTML file under www/cached when one exists, otherwise it is
  * requested through the HTTP client and used only on a 200 response. Loaded
  * pages are indexed and their links appended to the queue.
  *
  * NOTE(review): once the number of visited URLs reaches $this->_maxLinks the
  * method calls exit, terminating the whole script rather than returning.
  * NOTE(review): extracted links are filtered against $this->_visited, while the
  * visited list maintained here is the local $visited; confirm which is intended.
  *
  * @param string $url URL the crawl starts from
  */
 protected function _spider($url)
 {
     $pending = array($url);
     $visited = array();
     while (!empty($pending)) {
         $doc = null;
         $url = $this->_sanitizeUrl(array_shift($pending));
         if (!$url || in_array($url, $visited)) {
             // Nothing usable after sanitizing, or already handled in this run.
             continue;
         }
         $visited[] = $url;
         Bbx_Log::write('Spidering url ' . $url, null, Bbx_Search::LOG);
         $cachedFile = APPLICATION_PATH . '/../www/cached' . $url . '.html';
         if (file_exists($cachedFile)) {
             Bbx_Log::write('Found file in cache', null, Bbx_Search::LOG);
             try {
                 $doc = Zend_Search_Lucene_Document_Html::loadHTMLFile($cachedFile, false, 'utf-8');
             } catch (Exception $e) {
                 Bbx_Log::write('Unable to open file: ' . $cachedFile, null, Bbx_Search::LOG);
             }
         } else {
             $this->_client->setUri($this->_getAbsoluteUrl($url));
             try {
                 $response = $this->_client->request();
                 $statusCode = $response->getStatus();
                 Bbx_Log::write('Client response code ' . $statusCode, null, Bbx_Search::LOG);
                 if ($statusCode == '200') {
                     $doc = Zend_Search_Lucene_Document_Html::loadHTML($response->getBody(), false, 'utf-8');
                 }
             } catch (Exception $e) {
                 Bbx_Log::write('Request failed: ' . $e->getMessage(), null, Bbx_Search::LOG);
             }
         }
         if ($doc === null) {
             // Page could not be loaded; move on to the next queued URL.
             continue;
         }
         $this->_search()->indexDoc($doc, $url);
         $this->_indexed++;
         $newLinks = array_diff($doc->getLinks(), $this->_visited);
         if (!(count($visited) < $this->_maxLinks)) {
             Bbx_Log::write('Reached max number of links (' . $this->_maxLinks . '), exiting', null, Bbx_Search::LOG);
             exit;
         }
         $pending = array_merge($pending, $newLinks);
     }
 }
Example #25
0
 /**
  * Index a page with Zend_Search_Lucene.
  *
  * Works on the index stored in ONXSHOP_PROJECT_DIR . 'var/index'. Nothing is
  * indexed when that directory does not exist or when the index can neither be
  * opened nor created. Documents already stored under the same URI are deleted
  * before the new document is added, so each URI is indexed at most once.
  *
  * @param string $uri        URI of the page; stored in the 'uri' Keyword field
  * @param string $htmlString HTML source of the page
  * @return void
  */
 function indexContent($uri, $htmlString)
 {
     require_once 'Zend/Search/Lucene.php';
     $index_location = ONXSHOP_PROJECT_DIR . 'var/index';
     // Initialise up front: when the directory is missing the block below is
     // skipped, and the later check would otherwise read an undefined variable.
     $index = false;
     if (is_dir($index_location)) {
         // Open existing index
         try {
             $index = Zend_Search_Lucene::open($index_location);
         } catch (Exception $e) {
             // Create index
             try {
                 $index = Zend_Search_Lucene::create($index_location);
             } catch (Exception $e) {
                 $index = false;
             }
         }
     }
     if (!$index) {
         return;
     }
     // find and remove pages with the same URI
     $hits = $index->find("uri:" . $uri);
     foreach ($hits as $hit) {
         $index->delete($hit);
     }
     $doc = Zend_Search_Lucene_Document_Html::loadHTML($htmlString, true);
     $doc->addField(Zend_Search_Lucene_Field::Keyword('uri', $uri));
     $index->addDocument($doc);
     $index->commit();
 }
Example #26
0
/**
 * Builds a Lucene document from an HTML file on disk.
 *
 * The file is parsed with Zend_Search_Lucene_Document_Html::loadHTMLFile() and
 * a 'filename' Text field holding the base name of $path is added to the result.
 *
 * @param string $path Path to the HTML file
 * @return Zend_Search_Lucene_Document The parsed document including the 'filename' field
 */
function createHTMLDocument($path)
{
    $htmlDocument = Zend_Search_Lucene_Document_Html::loadHTMLFile($path);
    $nameField = Zend_Search_Lucene_Field::Text('filename', basename($path));
    $htmlDocument->addField($nameField);
    return $htmlDocument;
}
Example #27
0
 /**
  * Highlight the query's matched terms in an HTML document.
  *
  * Collects the text of every term in $this->_matches and highlights them all
  * in $doc with the colour returned by _getHighlightColor().
  *
  * @param Zend_Search_Lucene_Document_Html $doc Document to highlight in
  * @param integer &$colorIndex Colour index, forwarded to _getHighlightColor()
  *                             (presumably advanced there; confirm against that method)
  */
 public function highlightMatchesDOM(Zend_Search_Lucene_Document_Html $doc, &$colorIndex)
 {
     $matchedTexts = array();
     foreach ($this->_matches as $matchedTerm) {
         $matchedTexts[] = $matchedTerm->text;
     }
     $color = $this->_getHighlightColor($colorIndex);
     $doc->highlight($matchedTexts, $color);
 }
Example #28
0
 /**
  * Removes HTML markup, scripts, style sheets and redundant whitespace from a string.
  *
  * The input is first loaded into a Zend_Search_Lucene_Document_Html (as UTF-8)
  * and serialised again, then <script> blocks, <style> blocks and <!...-->
  * declarations/comments are removed, the remaining tags are stripped and every
  * run of whitespace is collapsed to a single space.
  *
  * @param  string $html HTML source
  * @return mixed|string Plain text
  */
 protected function getPlainTextFromHtml($html)
 {
     $doc = Zend_Search_Lucene_Document_Html::loadHTML($html, false, "utf-8");
     $html = $doc->getHTML();
     // Remove scripts, style sheets and comments. The <style> pattern must not
     // carry the U modifier: U inverts greediness, turning the lazy ".*?" into a
     // greedy match that spans from the first <style> to the last </style> and
     // swallows all page text in between.
     $search = array('@(<script[^>]*?>.*?</script>)@si', '@<style[^>]*?>.*?</style>@si', '@<![\\s\\S]*?--[ \\t\\n\\r]*>@');
     $text = preg_replace($search, "", $html);
     // remove html tags
     $text = strip_tags($text);
     // collapse runs of whitespace into a single space
     $text = preg_replace('@[ \\t\\n\\r\\f]+@', " ", $text);
     return $text;
 }
Example #29
0
 /**
  * Highlight the query's terms in an HTML document.
  *
  * When no signs are defined ($this->_signs === null) every term in
  * $this->_terms is highlighted. Otherwise only the terms whose sign is not
  * strictly false are highlighted; the sign's key is used to look up the term.
  *
  * @param Zend_Search_Lucene_Document_Html $doc Document to highlight in
  * @param integer &$colorIndex Colour index, forwarded to _getHighlightColor()
  *                             (presumably advanced there; confirm against that method)
  */
 public function highlightMatchesDOM(Zend_Search_Lucene_Document_Html $doc, &$colorIndex)
 {
     $highlightTexts = array();
     if ($this->_signs !== null) {
         foreach ($this->_signs as $termId => $sign) {
             if ($sign === false) {
                 // Terms with a false sign are left out of the highlighting.
                 continue;
             }
             $highlightTexts[] = $this->_terms[$termId]->text;
         }
     } else {
         foreach ($this->_terms as $queryTerm) {
             $highlightTexts[] = $queryTerm->text;
         }
     }
     $color = $this->_getHighlightColor($colorIndex);
     $doc->highlight($highlightTexts, $color);
 }
Example #30
0
 /**
  * Extracts the text content of an uploaded file attached to a catalog item.
  *
  * The item's 'RELATED_FILE' relation is looked up to find the parent catalog
  * GUID. The file is then searched for in uploads/files/<name> and
  * uploads/files/<parentGuid>/<name>, where <name> is $systemName when given,
  * otherwise $fileName. Depending on $mimeType the text is obtained by:
  *  - application/pdf: running $this->_pdfExtractor into a temporary
  *    "<file>.txt"; output of 20 bytes or fewer is treated as empty,
  *  - text/html, text/plain: loading the file as a Lucene HTML document and
  *    returning its 'body' field,
  *  - application/msword, application/octet-stream, application/x-javascript:
  *    running $this->_wordExtractor, but only when the name contains ".doc".
  *
  * @param string $guid       GUID of the item the file belongs to
  * @param string $systemName Stored file name; takes precedence over $fileName when not empty
  * @param string $fileName   Original file name
  * @param string $mimeType   MIME type used to pick the extraction method
  * @return string Extracted text, or '' when the relation or file is missing,
  *                the type is unsupported or the extractor fails
  */
 private function _extractText_ZendDb($guid, $systemName, $fileName, $mimeType)
 {
     $tblRelatedItem = new Kutu_Core_Orm_Table_RelatedItem();
     // Quote the GUID through the DB adapter instead of interpolating it raw
     // into the WHERE clause.
     $where = $tblRelatedItem->getAdapter()->quoteInto("itemGuid = ? AND relateAs='RELATED_FILE'", $guid);
     $rowset = $tblRelatedItem->fetchAll($where);
     if (!count($rowset)) {
         return '';
     }
     $parentCatalogGuid = $rowset->current()->relatedGuid;
     if (!empty($systemName)) {
         $fileName = $systemName;
     }
     $filesDir = KUTU_ROOT_DIR . DIRECTORY_SEPARATOR . 'uploads' . DIRECTORY_SEPARATOR . 'files' . DIRECTORY_SEPARATOR;
     $candidates = array($filesDir . $fileName, $filesDir . $parentCatalogGuid . DIRECTORY_SEPARATOR . $fileName);
     $sDir = '';
     foreach ($candidates as $candidate) {
         // is_file() rather than file_exists(): an empty file name would
         // otherwise resolve to the (existing) directory itself.
         if (is_file($candidate)) {
             $sDir = $candidate;
             break;
         }
     }
     if ($sDir === '') {
         return '';
     }
     $outpath = $sDir . '.txt';
     switch ($mimeType) {
         case 'application/pdf':
             // Paths are shell-escaped so that spaces or shell metacharacters in
             // a file name can neither break nor alter the command.
             system($this->_pdfExtractor . ' ' . escapeshellarg($sDir) . ' ' . escapeshellarg($outpath), $ret);
             if ($ret != 0 || !is_file($outpath)) {
                 // Extractor failed or produced no output file.
                 return '';
             }
             $value = file_get_contents($outpath);
             unlink($outpath);
             if ($value === false || strlen($value) <= 20) {
                 return '';
             }
             return $this->clean_string_input($value);
         case 'text/html':
         case 'text/plain':
             $docHtml = Zend_Search_Lucene_Document_Html::loadHTMLFile($sDir);
             return $docHtml->getFieldValue('body');
         case 'application/x-javascript':
         case 'application/octet-stream':
         case 'application/msword':
             // Explicit false comparison: a match at offset 0 is still a match.
             if (strpos(strtolower($fileName), '.doc') === false) {
                 return '';
             }
             system($this->_wordExtractor . ' -m cp850.txt ' . escapeshellarg($sDir) . ' > ' . escapeshellarg($outpath), $ret);
             if ($ret != 0 || !is_file($outpath)) {
                 return '';
             }
             $value = file_get_contents($outpath);
             unlink($outpath);
             return $value === false ? '' : $value;
         default:
             return '';
     }
 }