loadHTMLFile() публичный статический метод

Load HTML document from a file
public static loadHTMLFile ( string $file, boolean $storeContent = false, string $defaultEncoding = '' ) : Zend_Search_Lucene_Document_Html
$file string
$storeContent boolean
$defaultEncoding string HTML encoding; used when the encoding is not specified by a Content-type HTTP-EQUIV meta tag.
Результат Zend_Search_Lucene_Document_Html
Пример #1
0
 /**
  * Exercises Zend_Search_Lucene_Document_Html in two ways:
  *  - loading markup from a string, highlighting a word and comparing the
  *    regenerated HTML;
  *  - loading a fixture file and checking the header links and body links
  *    it reports.
  */
 public function testHtml()
 {
     $doc = Zend_Search_Lucene_Document_Html::loadHTML('<HTML><HEAD><TITLE>Page title</TITLE></HEAD><BODY>Document body.</BODY></HTML>');
     $this->assertTrue($doc instanceof Zend_Search_Lucene_Document_Html);

     $doc->highlight('document', '#66ffff');
     // Expected value first, actual second, so a failure message labels them correctly.
     $this->assertEquals(
         "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\" \"http://www.w3.org/TR/REC-html40/loose.dtd\">\n<html>\n<head><title>Page title</title></head>\n<body><p><b style=\"color:black;background-color:#66ffff\">Document</b> body.</p></body>\n</html>\n",
         $doc->getHTML()
     );

     $doc = Zend_Search_Lucene_Document_Html::loadHTMLFile(dirname(__FILE__) . '/_files/_indexSource/contributing.documentation.html', true);
     $this->assertTrue($doc instanceof Zend_Search_Lucene_Document_Html);

     // assertEquals (instead of assertTrue on ==) shows the differing lists when the check fails.
     $this->assertEquals(
         array('index.html', 'contributing.html', 'contributing.bugs.html', 'contributing.wishlist.html'),
         array_values($doc->getHeaderLinks())
     );
     $this->assertEquals(
         array('contributing.bugs.html', 'contributing.wishlist.html', 'developers.documentation.html', 'faq.translators-revision-tracking.html', 'index.html', 'contributing.html'),
         array_values($doc->getLinks())
     );
 }
Пример #2
0
 /**
  * Loads HTML from a string and from a fixture file, then verifies that
  * highlighting leaves the expected markup in the output and that the
  * fixture's header links and body links are reported as expected.
  */
 public function testHtml()
 {
     $markup = '<HTML><HEAD><TITLE>Page title</TITLE></HEAD><BODY>Document body.</BODY></HTML>';
     $highlighted = Zend_Search_Lucene_Document_Html::loadHTML($markup);
     $this->assertTrue($highlighted instanceof Zend_Search_Lucene_Document_Html);

     $highlighted->highlight('document', '#66ffff');
     $expectedFragment = "<b style=\"color:black;background-color:#66ffff\">Document</b> body.";
     $this->assertTrue(false !== strpos($highlighted->getHTML(), $expectedFragment));

     $fixture = dirname(__FILE__) . '/_indexSource/_files/contributing.documentation.html';
     $fromFile = Zend_Search_Lucene_Document_Html::loadHTMLFile($fixture, true);
     $this->assertTrue($fromFile instanceof Zend_Search_Lucene_Document_Html);

     $expectedHeaderLinks = array('index.html', 'contributing.html', 'contributing.bugs.html', 'contributing.wishlist.html');
     $this->assertTrue(array_values($fromFile->getHeaderLinks()) == $expectedHeaderLinks);

     $expectedLinks = array('contributing.bugs.html', 'contributing.wishlist.html', 'developers.documentation.html', 'faq.translators-revision-tracking.html', 'index.html', 'contributing.html');
     $this->assertTrue(array_values($fromFile->getLinks()) == $expectedLinks);
 }
 /**
  * Reads an HTML file through Zend_Search_Lucene_Document_Html and returns
  * the value of the resulting document's "body" property.
  *
  * @param   String  $filename   Full filesystem path to the file to process.
  * @return  String  The document body, or an empty string when the file does
  *                  not exist or loading it throws.
  */
 public static function extract($filename)
 {
     $htmlDocument = null;
     $loaded = false;

     if (file_exists($filename)) {
         try {
             $htmlDocument = Zend_Search_Lucene_Document_Html::loadHTMLFile($filename, true);
             $loaded = true;
         } catch (Exception $e) {
             // Loading failed: fall through to the empty result.
             $loaded = false;
         }
     }

     // The property read stays outside the try block so its errors are not swallowed.
     return $loaded ? $htmlDocument->body : '';
 }
Пример #4
0
 /**
  * Demo action: loads the bundled kohana_home.html example as a Lucene HTML
  * document, adds it through the Search instance and passes a confirmation
  * message to $this->index().
  */
 public function addurl()
 {
     // A local example file is used here for demonstration purposes.
     $examplePath = implode(DIRECTORY_SEPARATOR, array("kosearch", "examples", "kohana_home.html"));
     $filename = MODPATH . $examplePath;

     // The Search class loads the Zend libraries. Zend_Search_Lucene_Document_Html
     // is needed before the singleton is used, so the libraries are loaded
     // explicitly first.
     Search::instance()->load_search_libs();

     $htmlDocument = Zend_Search_Lucene_Document_Html::loadHTMLFile($filename, TRUE, "utf-8");
     Search::instance()->addDocument($htmlDocument);

     $message = 'Kohana page successfully added &darr;&nbsp;<a href="#form2" title="scroll down">scroll down</a>&nbsp;&darr;';
     $this->index($message);
 }
 /**
  * Loads the contributing.documentation.html fixture and checks the header
  * links and body links reported by the resulting document.
  */
 public function testHtmlLinksProcessing()
 {
     $fixture = dirname(__FILE__) . '/_indexSource/_files/contributing.documentation.html';
     $document = Zend_Search_Lucene_Document_Html::loadHTMLFile($fixture, true);
     $this->assertTrue($document instanceof Zend_Search_Lucene_Document_Html);

     $headerLinks = array_values($document->getHeaderLinks());
     $expectedHeaderLinks = array('index.html', 'contributing.html', 'contributing.bugs.html', 'contributing.wishlist.html');
     $this->assertTrue($headerLinks == $expectedHeaderLinks);

     $bodyLinks = array_values($document->getLinks());
     $expectedBodyLinks = array('contributing.bugs.html', 'contributing.wishlist.html', 'developers.documentation.html', 'faq.translators-revision-tracking.html', 'index.html', 'contributing.html');
     $this->assertTrue($bodyLinks == $expectedBodyLinks);
 }
Пример #6
0
 /**
  * Extracts plain text from the file related to a catalog item.
  *
  * The item's RELATED_FILE row is looked up in KutuRelatedItem, the file is
  * searched for under ROOT_DIR/uploads/files (first directly, then inside a
  * folder named after the related GUID) and its text is extracted according
  * to $mimeType:
  *  - application/pdf: runs the command in $this->_pdfExtractor;
  *  - text/html, text/plain: reads the "body" field of a Lucene HTML document;
  *  - msword / octet-stream / x-javascript: runs the command in
  *    $this->_wordExtractor when the file name contains ".doc".
  *
  * @param string $guid       Item GUID used to find the RELATED_FILE row.
  * @param string $systemName Stored file name; replaces $fileName when not empty.
  * @param string $fileName   File name used when $systemName is empty.
  * @param string $mimeType   MIME type selecting the extraction strategy.
  * @param string $lang       Passed to getDbHandler() to obtain the DB handle.
  * @return string Extracted text, or '' when nothing could be extracted.
  */
 private function _extractText($guid, $systemName, $fileName, $mimeType, $lang = 'id')
 {
     // Bind the GUID rather than interpolating it into the SQL text.
     // NOTE(review): assumes getDbHandler() returns a PDO-style handle that
     // supports prepare()/execute() — confirm against its implementation.
     $statement = $this->getDbHandler($lang)->prepare("SELECT * FROM KutuRelatedItem where itemGuid = ? AND relateAs='RELATED_FILE'");
     $statement->execute(array($guid));
     $rowset = $statement->fetchAll(PDO::FETCH_OBJ);
     if (!count($rowset)) {
         return '';
     }

     $parentCatalogGuid = $rowset[0]->relatedGuid;
     if (!empty($systemName)) {
         $fileName = $systemName;
     }

     // Candidate locations, in lookup order.
     $filesRoot = ROOT_DIR . DIRECTORY_SEPARATOR . 'uploads' . DIRECTORY_SEPARATOR . 'files' . DIRECTORY_SEPARATOR;
     $candidates = array(
         $filesRoot . $fileName,
         $filesRoot . $parentCatalogGuid . DIRECTORY_SEPARATOR . $fileName,
     );
     $sDir = '';
     foreach ($candidates as $candidate) {
         if (file_exists($candidate)) {
             $sDir = $candidate;
             break;
         }
     }
     if ($sDir === '') {
         return '';
     }

     $outpath = $sDir . '.txt';
     switch ($mimeType) {
         case 'application/pdf':
             $pdfExtractor = $this->_pdfExtractor;
             // Paths are escaped so names with spaces or shell metacharacters are passed as one argument.
             system("{$pdfExtractor} " . escapeshellarg($sDir) . ' ' . escapeshellarg($outpath), $ret);
             if ($ret === 0) {
                 $value = (string) file_get_contents($outpath);
                 unlink($outpath);
                 echo 'content PDF: ' . $sDir . ' ' . strlen($value) . "\n";
                 if (strlen($value) > 20) {
                     return (new Pandamp_Utility_Posts())->sanitize_post_content($value);
                 }
                 echo "content file kosong\n";
                 return '';
             }
             // Both diagnostics are reachable now; previously the second one
             // sat behind an unconditional return.
             if ($ret === 127) {
                 print "Could not find pdftotext tool.\n";
             } elseif ($ret === 1) {
                 print "Could not find pdf file.\n";
             }
             return '';

         case 'text/html':
         case 'text/plain':
             $docHtml = Zend_Search_Lucene_Document_Html::loadHTMLFile($sDir);
             return $docHtml->getFieldValue('body');

         case 'application/x-javascript':
         case 'application/octet-stream':
         case 'application/msword':
             // strpos() returns 0 for a match at the start, so compare against false explicitly.
             if (strpos(strtolower($fileName), '.doc') === false) {
                 return '';
             }
             $extractor = $this->_wordExtractor;
             system("{$extractor} -m cp850.txt " . escapeshellarg($sDir) . ' > ' . escapeshellarg($outpath), $ret);
             if ($ret === 0) {
                 $value = file_get_contents($outpath);
                 unlink($outpath);
                 return $value === false ? '' : $value;
             }
             // The shell redirect creates the output file even when the tool fails; do not leave it behind.
             if (file_exists($outpath)) {
                 unlink($outpath);
             }
             return '';

         default:
             return '';
     }
 }
Пример #7
0
 /**
  * Extracts plain text from the file related to a catalog item, using the
  * Kutu_Core_Orm_Table_RelatedItem table class for the lookup.
  *
  * The file is searched for under KUTU_ROOT_DIR/uploads/files (first
  * directly, then inside a folder named after the related GUID) and its text
  * is extracted according to $mimeType:
  *  - application/pdf: runs the command in $this->_pdfExtractor;
  *  - text/html, text/plain: reads the "body" field of a Lucene HTML document;
  *  - msword / octet-stream / x-javascript: runs the command in
  *    $this->_wordExtractor when the file name contains ".doc".
  *
  * @param string $guid       Item GUID used to find the RELATED_FILE row.
  * @param string $systemName Stored file name; replaces $fileName when not empty.
  * @param string $fileName   File name used when $systemName is empty.
  * @param string $mimeType   MIME type selecting the extraction strategy.
  * @return string Extracted text, or '' when nothing could be extracted.
  */
 private function _extractText_ZendDb($guid, $systemName, $fileName, $mimeType)
 {
     $tblRelatedItem = new Kutu_Core_Orm_Table_RelatedItem();
     // Quote the GUID through the adapter instead of interpolating it raw.
     // NOTE(review): assumes the table class extends Zend_Db_Table_Abstract
     // (and so provides getAdapter()) — confirm.
     $where = $tblRelatedItem->getAdapter()->quoteInto("itemGuid=? AND relateAs='RELATED_FILE'", $guid);
     $rowset = $tblRelatedItem->fetchAll($where);
     if (!count($rowset)) {
         return '';
     }

     $parentCatalogGuid = $rowset->current()->relatedGuid;
     if (!empty($systemName)) {
         $fileName = $systemName;
     }

     // Candidate locations, in lookup order.
     $filesRoot = KUTU_ROOT_DIR . DIRECTORY_SEPARATOR . 'uploads' . DIRECTORY_SEPARATOR . 'files' . DIRECTORY_SEPARATOR;
     $candidates = array(
         $filesRoot . $fileName,
         $filesRoot . $parentCatalogGuid . DIRECTORY_SEPARATOR . $fileName,
     );
     $sDir = '';
     foreach ($candidates as $candidate) {
         if (file_exists($candidate)) {
             $sDir = $candidate;
             break;
         }
     }
     if ($sDir === '') {
         return '';
     }

     $outpath = $sDir . '.txt';
     switch ($mimeType) {
         case 'application/pdf':
             $pdfExtractor = $this->_pdfExtractor;
             // Paths are escaped so names with spaces or shell metacharacters are passed as one argument.
             system("{$pdfExtractor} " . escapeshellarg($sDir) . ' ' . escapeshellarg($outpath), $ret);
             if ($ret === 0) {
                 $value = (string) file_get_contents($outpath);
                 unlink($outpath);
                 // Very short output is treated as "no content".
                 if (strlen($value) > 20) {
                     return $this->clean_string_input($value);
                 }
             }
             // Any other exit status (tool missing, file missing, ...) yields no text.
             return '';

         case 'text/html':
         case 'text/plain':
             $docHtml = Zend_Search_Lucene_Document_Html::loadHTMLFile($sDir);
             return $docHtml->getFieldValue('body');

         case 'application/x-javascript':
         case 'application/octet-stream':
         case 'application/msword':
             // strpos() returns 0 for a match at the start, so compare against false explicitly.
             if (strpos(strtolower($fileName), '.doc') === false) {
                 return '';
             }
             $extractor = $this->_wordExtractor;
             system("{$extractor} -m cp850.txt " . escapeshellarg($sDir) . ' > ' . escapeshellarg($outpath), $ret);
             if ($ret === 0) {
                 $value = file_get_contents($outpath);
                 unlink($outpath);
                 return $value === false ? '' : $value;
             }
             // The shell redirect creates the output file even when the tool fails; do not leave it behind.
             if (file_exists($outpath)) {
                 unlink($outpath);
             }
             return '';

         default:
             return '';
     }
 }
Пример #8
0
/**
 * Loads the HTML file at $path as a Lucene document and adds a "filename"
 * text field holding the file's base name.
 *
 * @param string $path Path of the HTML file to load.
 * @return Zend_Search_Lucene_Document The loaded document with the extra field.
 */
function createHTMLDocument($path)
{
    $htmlDocument = Zend_Search_Lucene_Document_Html::loadHTMLFile($path);

    $filenameField = Zend_Search_Lucene_Field::Text('filename', basename($path));
    $htmlDocument->addField($filenameField);

    return $htmlDocument;
}
Пример #9
0
 /**
  * Crawls pages starting from $url using a FIFO queue.
  *
  * Each sanitized, not-yet-visited URL is loaded either from a cached .html
  * file under APPLICATION_PATH/../www/cached or over HTTP via $this->_client.
  * Successfully loaded documents are indexed and their links are appended to
  * the queue. Once the number of visited URLs reaches $this->_maxLinks the
  * script terminates via exit.
  *
  * @param string $url First URL to visit.
  */
 protected function _spider($url)
 {
     $queue = array($url);
     $visited = array();

     while (!empty($queue)) {
         $doc = null;
         $url = $this->_sanitizeUrl(array_shift($queue));

         // Skip URLs rejected by the sanitizer and those already processed.
         if (!$url || in_array($url, $visited)) {
             continue;
         }
         $visited[] = $url;
         Bbx_Log::write('Spidering url ' . $url, null, Bbx_Search::LOG);

         $cachePath = APPLICATION_PATH . '/../www/cached' . $url . '.html';
         if (file_exists($cachePath)) {
             Bbx_Log::write('Found file in cache', null, Bbx_Search::LOG);
             try {
                 $doc = Zend_Search_Lucene_Document_Html::loadHTMLFile($cachePath, false, 'utf-8');
             } catch (Exception $e) {
                 Bbx_Log::write('Unable to open file: ' . $cachePath, null, Bbx_Search::LOG);
             }
         } else {
             $this->_client->setUri($this->_getAbsoluteUrl($url));
             try {
                 $response = $this->_client->request();
                 $status = $response->getStatus();
                 Bbx_Log::write('Client response code ' . $status, null, Bbx_Search::LOG);
                 if ($status == '200') {
                     $doc = Zend_Search_Lucene_Document_Html::loadHTML($response->getBody(), false, 'utf-8');
                 }
             } catch (Exception $e) {
                 Bbx_Log::write('Request failed: ' . $e->getMessage(), null, Bbx_Search::LOG);
             }
         }

         if ($doc === null) {
             continue;
         }

         $this->_search()->indexDoc($doc, $url);
         $this->_indexed++;

         // NOTE(review): this filters against the $this->_visited property, not
         // the local $visited list maintained above — confirm which is intended.
         $links = array_diff($doc->getLinks(), $this->_visited);

         if (count($visited) >= $this->_maxLinks) {
             Bbx_Log::write('Reached max number of links (' . $this->_maxLinks . '), exiting', null, Bbx_Search::LOG);
             exit;
         }
         $queue = array_merge($queue, $links);
     }
 }
 /**
  * Builds the Lucene document(s) for a node and adds them to the index.
  *
  * A "shared" document is always created. When content parsing is enabled
  * and the node is not larger than PARSE_CONTENT_MAX_SIZE, HTML/docx/pptx/xlsx
  * files are loaded through the matching Zend loader and the text returned by
  * extractIndexableContent() is added as an unstored "body" field. When the
  * node has user-scoped indexable meta keys and a user is logged in, a second
  * "user"-scoped document is added as well.
  *
  * @param AJXP_Node $ajxpNode
  * @param Zend_Search_Lucene_Interface $index
  * @throws Exception When the loader did not produce a document.
  * @return Zend_Search_Lucene_Document The shared document that was indexed.
  */
 public function createIndexedDocument($ajxpNode, &$index)
 {
     if (!empty($this->metaFields)) {
         $ajxpNode->loadNodeInfo(false, false, "all");
     } else {
         $ajxpNode->loadNodeInfo();
     }
     $ext = strtolower(pathinfo($ajxpNode->getLabel(), PATHINFO_EXTENSION));
     $parseContent = $this->indexContent;
     if ($parseContent && $ajxpNode->bytesize > $this->getFilteredOption("PARSE_CONTENT_MAX_SIZE")) {
         $parseContent = false;
     }
     if ($parseContent && in_array($ext, explode(",", $this->getFilteredOption("PARSE_CONTENT_HTML")))) {
         $doc = @Zend_Search_Lucene_Document_Html::loadHTMLFile($ajxpNode->getUrl());
     } elseif ($parseContent && $ext == "docx" && class_exists("Zend_Search_Lucene_Document_Docx")) {
         $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
         $doc = @Zend_Search_Lucene_Document_Docx::loadDocxFile($realFile);
     } elseif ($parseContent && $ext == "pptx" && class_exists("Zend_Search_Lucene_Document_Pptx")) {
         // Previously tested for "docx" here, so .pptx files never reached the Pptx loader.
         $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
         $doc = @Zend_Search_Lucene_Document_Pptx::loadPptxFile($realFile);
     } elseif ($parseContent && $ext == "xlsx" && class_exists("Zend_Search_Lucene_Document_Xlsx")) {
         $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
         $doc = @Zend_Search_Lucene_Document_Xlsx::loadXlsxFile($realFile);
     } else {
         $doc = new Zend_Search_Lucene_Document();
     }
     if ($doc == null) {
         throw new Exception("Could not load document");
     }

     // The encoding belongs to the field factory (as done for the private
     // document below); passed to addField() it was silently ignored.
     $encoding = SystemTextEncoding::getEncoding();
     $doc->addField(Zend_Search_Lucene_Field::Keyword("node_url", $ajxpNode->getUrl(), $encoding));
     $doc->addField(Zend_Search_Lucene_Field::Keyword("node_path", str_replace("/", "AJXPFAKESEP", $ajxpNode->getPath()), $encoding));
     $doc->addField(Zend_Search_Lucene_Field::Text("basename", basename($ajxpNode->getPath()), $encoding));
     $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_node", "yes", $encoding));
     $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_scope", "shared"));
     $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_modiftime", date("Ymd", $ajxpNode->ajxp_modiftime)));
     $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_bytesize", $ajxpNode->bytesize));

     // Fall back to the file extension when the node carries no mime value.
     $ajxpMime = $ajxpNode->ajxp_mime;
     if (empty($ajxpMime)) {
         $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_mime", pathinfo($ajxpNode->getLabel(), PATHINFO_EXTENSION)));
     } else {
         $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_mime", $ajxpNode->ajxp_mime));
     }

     // Store a cached copy of the metadata
     $serializedMeta = base64_encode(serialize($ajxpNode->metadata));
     $doc->addField(Zend_Search_Lucene_Field::Binary("serialized_metadata", $serializedMeta));
     if (isset($ajxpNode->indexableMetaKeys["shared"])) {
         foreach ($ajxpNode->indexableMetaKeys["shared"] as $sharedField) {
             if ($ajxpNode->{$sharedField}) {
                 $doc->addField(Zend_Search_Lucene_Field::Keyword($sharedField, $ajxpNode->{$sharedField}));
             }
         }
     }
     foreach ($this->metaFields as $field) {
         if ($ajxpNode->{$field} != null) {
             $doc->addField(Zend_Search_Lucene_Field::Text("ajxp_meta_{$field}", $ajxpNode->{$field}, $encoding));
         }
     }

     // User-scoped metadata goes into its own document tagged with the current user's id.
     if (isset($ajxpNode->indexableMetaKeys["user"]) && count($ajxpNode->indexableMetaKeys["user"]) && AuthService::usersEnabled() && AuthService::getLoggedUser() != null) {
         $privateDoc = new Zend_Search_Lucene_Document();
         $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("node_url", $ajxpNode->getUrl(), $encoding));
         $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("node_path", str_replace("/", "AJXPFAKESEP", $ajxpNode->getPath()), $encoding));
         $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_scope", "user"));
         $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_user", AuthService::getLoggedUser()->getId()));
         foreach ($ajxpNode->indexableMetaKeys["user"] as $userField) {
             if ($ajxpNode->{$userField}) {
                 $privateDoc->addField(Zend_Search_Lucene_Field::Keyword($userField, $ajxpNode->{$userField}));
             }
         }
         $privateDoc->addField(Zend_Search_Lucene_Field::Binary("serialized_metadata", $serializedMeta));
         $index->addDocument($privateDoc);
     }

     if ($parseContent) {
         $body = $this->extractIndexableContent($ajxpNode);
         if (!empty($body)) {
             $doc->addField(Zend_Search_Lucene_Field::unStored("body", $body));
         }
     }
     $index->addDocument($doc);
     return $doc;
 }
Пример #11
0
 /**
  * Builds the Lucene document(s) for a node and adds them to the index.
  *
  * A "shared" document is always created. When content parsing is enabled
  * and the node is not larger than PARSE_CONTENT_MAX_SIZE, an unstored "body"
  * field is filled from one of these sources, depending on the extension:
  *  - extensions listed in PARSE_CONTENT_TXT: the raw file contents;
  *  - doc/odt/xls/ods, when UNOCONV is configured: output of that command;
  *  - pdf, when PDFTOTEXT is configured: output of that command.
  * HTML/docx/pptx/xlsx files are loaded through the matching Zend loader.
  * When the node has user-scoped indexable meta keys and a user is logged in,
  * a second "user"-scoped document is added as well.
  *
  * @param AJXP_Node $ajxpNode
  * @param Zend_Search_Lucene_Interface $index
  * @throws Exception When the loader did not produce a document.
  * @return Zend_Search_Lucene_Document The shared document that was indexed.
  */
 public function createIndexedDocument($ajxpNode, &$index)
 {
     $ajxpNode->loadNodeInfo();
     $ext = strtolower(pathinfo($ajxpNode->getLabel(), PATHINFO_EXTENSION));
     $parseContent = $this->indexContent;
     if ($parseContent && $ajxpNode->bytesize > $this->getFilteredOption("PARSE_CONTENT_MAX_SIZE")) {
         $parseContent = false;
     }
     if ($parseContent && in_array($ext, explode(",", $this->getFilteredOption("PARSE_CONTENT_HTML")))) {
         $doc = @Zend_Search_Lucene_Document_Html::loadHTMLFile($ajxpNode->getUrl());
     } elseif ($parseContent && $ext == "docx" && class_exists("Zend_Search_Lucene_Document_Docx")) {
         $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
         $doc = @Zend_Search_Lucene_Document_Docx::loadDocxFile($realFile);
     } elseif ($parseContent && $ext == "pptx" && class_exists("Zend_Search_Lucene_Document_Pptx")) {
         // Previously tested for "docx" here, so .pptx files never reached the Pptx loader.
         $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
         $doc = @Zend_Search_Lucene_Document_Pptx::loadPptxFile($realFile);
     } elseif ($parseContent && $ext == "xlsx" && class_exists("Zend_Search_Lucene_Document_Xlsx")) {
         $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
         $doc = @Zend_Search_Lucene_Document_Xlsx::loadXlsxFile($realFile);
     } else {
         $doc = new Zend_Search_Lucene_Document();
     }
     if ($doc == null) {
         throw new Exception("Could not load document");
     }

     // The encoding belongs to the field factory (as done for the private
     // document below); passed to addField() it was silently ignored.
     $encoding = SystemTextEncoding::getEncoding();
     $doc->addField(Zend_Search_Lucene_Field::Keyword("node_url", $ajxpNode->getUrl(), $encoding));
     $doc->addField(Zend_Search_Lucene_Field::Keyword("node_path", str_replace("/", "AJXPFAKESEP", $ajxpNode->getPath()), $encoding));
     $doc->addField(Zend_Search_Lucene_Field::Text("basename", basename($ajxpNode->getPath()), $encoding));
     $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_node", "yes", $encoding));
     $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_scope", "shared"));
     $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_modiftime", date("Ymd", $ajxpNode->ajxp_modiftime)));
     $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_bytesize", $ajxpNode->bytesize));

     // Fall back to the file extension when the node carries no mime value.
     $ajxpMime = $ajxpNode->ajxp_mime;
     if (empty($ajxpMime)) {
         $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_mime", pathinfo($ajxpNode->getLabel(), PATHINFO_EXTENSION)));
     } else {
         $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_mime", $ajxpNode->ajxp_mime));
     }

     // Store a cached copy of the metadata
     $serializedMeta = base64_encode(serialize($ajxpNode->metadata));
     $doc->addField(Zend_Search_Lucene_Field::Binary("serialized_metadata", $serializedMeta));
     if (isset($ajxpNode->indexableMetaKeys["shared"])) {
         foreach ($ajxpNode->indexableMetaKeys["shared"] as $sharedField) {
             if ($ajxpNode->{$sharedField}) {
                 $doc->addField(Zend_Search_Lucene_Field::Keyword($sharedField, $ajxpNode->{$sharedField}));
             }
         }
     }
     foreach ($this->metaFields as $field) {
         if ($ajxpNode->{$field} != null) {
             $doc->addField(Zend_Search_Lucene_Field::Text("ajxp_meta_{$field}", $ajxpNode->{$field}, $encoding));
         }
     }

     // User-scoped metadata goes into its own document tagged with the current user's id.
     if (isset($ajxpNode->indexableMetaKeys["user"]) && count($ajxpNode->indexableMetaKeys["user"]) && AuthService::usersEnabled() && AuthService::getLoggedUser() != null) {
         $privateDoc = new Zend_Search_Lucene_Document();
         $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("node_url", $ajxpNode->getUrl(), $encoding));
         $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("node_path", str_replace("/", "AJXPFAKESEP", $ajxpNode->getPath()), $encoding));
         $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_scope", "user"));
         $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_user", AuthService::getLoggedUser()->getId()));
         foreach ($ajxpNode->indexableMetaKeys["user"] as $userField) {
             if ($ajxpNode->{$userField}) {
                 $privateDoc->addField(Zend_Search_Lucene_Field::Keyword($userField, $ajxpNode->{$userField}));
             }
         }
         $privateDoc->addField(Zend_Search_Lucene_Field::Binary("serialized_metadata", $serializedMeta));
         $index->addDocument($privateDoc);
     }

     // Plain-text extensions: index the raw file contents.
     if ($parseContent && in_array($ext, explode(",", $this->getFilteredOption("PARSE_CONTENT_TXT")))) {
         $doc->addField(Zend_Search_Lucene_Field::unStored("body", file_get_contents($ajxpNode->getUrl())));
     }

     // Office formats: convert through the configured UNOCONV command.
     $unoconv = $this->getFilteredOption("UNOCONV");
     $pipe = false;
     if ($parseContent && !empty($unoconv) && in_array($ext, array("doc", "odt", "xls", "ods"))) {
         $targetExt = "txt";
         if (in_array($ext, array("xls", "ods"))) {
             $targetExt = "csv";
         } else {
             // NOTE(review): "odp"/"ppt" are not in the outer extension list, so
             // this branch (and the $pipe path below) cannot currently be taken.
             if (in_array($ext, array("odp", "ppt"))) {
                 $targetExt = "pdf";
                 $pipe = true;
             }
         }
         $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
         $unoconv = "HOME=" . AJXP_Utils::getAjxpTmpDir() . " " . $unoconv . " --stdout -f {$targetExt} " . escapeshellarg($realFile);
         if ($pipe) {
             $newTarget = str_replace(".{$ext}", ".pdf", $realFile);
             // Escape the redirect target like the input path above.
             $unoconv .= " > " . escapeshellarg($newTarget);
             register_shutdown_function("unlink", $newTarget);
         }
         $output = array();
         exec($unoconv, $output, $return);
         if (!$pipe) {
             $out = implode("\n", $output);
             $enc = 'ISO-8859-1';
             $asciiString = iconv($enc, 'ASCII//TRANSLIT//IGNORE', $out);
             $doc->addField(Zend_Search_Lucene_Field::unStored("body", $asciiString));
         } else {
             // The intermediate PDF is handled by the pdftotext step below.
             $ext = "pdf";
         }
     }

     // PDFs: extract text through the configured PDFTOTEXT command ("-" sends output to stdout).
     $pdftotext = $this->getFilteredOption("PDFTOTEXT");
     if ($parseContent && !empty($pdftotext) && in_array($ext, array("pdf"))) {
         $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
         if ($pipe && isset($newTarget) && is_file($newTarget)) {
             $realFile = $newTarget;
         }
         $cmd = $pdftotext . " " . escapeshellarg($realFile) . " -";
         $output = array();
         exec($cmd, $output, $return);
         $out = implode("\n", $output);
         $enc = 'UTF8';
         $asciiString = iconv($enc, 'ASCII//TRANSLIT//IGNORE', $out);
         $doc->addField(Zend_Search_Lucene_Field::unStored("body", $asciiString));
     }
     $index->addDocument($doc);
     return $doc;
 }
Пример #12
0
 /**
  * Extracts plain text from the file related to a catalog item.
  *
  * The item's RELATED_FILE row is looked up in KutuRelatedItem, the file is
  * searched for under ROOT_DIR/uploads/files (first directly, then inside a
  * folder named after the related GUID) and its text is extracted according
  * to $mimeType:
  *  - application/pdf: runs the command in $this->_pdfExtractor;
  *  - text/html, text/plain: reads the "body" field of a Lucene HTML document;
  *  - msword / octet-stream / x-javascript: runs the command in
  *    $this->_wordExtractor when the file name contains ".doc".
  *
  * @param string $guid       Item GUID used to find the RELATED_FILE row.
  * @param string $systemName Stored file name; replaces $fileName when not empty.
  * @param string $fileName   File name used when $systemName is empty.
  * @param string $mimeType   MIME type selecting the extraction strategy.
  * @param string $lang       Passed to getDbHandler() to obtain the DB handle.
  * @return string Extracted text, or '' when nothing could be extracted.
  */
 private function _extractText($guid, $systemName, $fileName, $mimeType, $lang)
 {
     $db = $this->getDbHandler($lang);
     // Bind the GUID rather than interpolating it into the SQL text.
     // NOTE(review): assumes getDbHandler() returns a PDO-style handle that
     // supports prepare()/execute() — confirm against its implementation.
     $statement = $db->prepare("SELECT * FROM KutuRelatedItem where itemGuid = ? AND relateAs='RELATED_FILE'");
     $statement->execute(array($guid));
     $rowset = $statement->fetchAll(PDO::FETCH_OBJ);
     if (!count($rowset)) {
         return '';
     }

     $parentCatalogGuid = $rowset[0]->relatedGuid;
     if (!empty($systemName)) {
         $fileName = $systemName;
     }

     // Candidate locations, in lookup order.
     $filesRoot = ROOT_DIR . DIRECTORY_SEPARATOR . 'uploads' . DIRECTORY_SEPARATOR . 'files' . DIRECTORY_SEPARATOR;
     $candidates = array(
         $filesRoot . $fileName,
         $filesRoot . $parentCatalogGuid . DIRECTORY_SEPARATOR . $fileName,
     );
     $sDir = '';
     foreach ($candidates as $candidate) {
         if (file_exists($candidate)) {
             $sDir = $candidate;
             break;
         }
     }
     if ($sDir === '') {
         return '';
     }

     $outpath = $sDir . '.txt';
     switch ($mimeType) {
         case 'application/pdf':
             $pdfExtractor = $this->_pdfExtractor;
             // Paths are escaped so names with spaces or shell metacharacters are passed as one argument.
             system("{$pdfExtractor} " . escapeshellarg($sDir) . ' ' . escapeshellarg($outpath), $ret);
             if ($ret === 0) {
                 $value = (string) file_get_contents($outpath);
                 unlink($outpath);
                 echo 'content PDF: ' . $sDir . ' ' . strlen($value) . "\n";
                 if (strlen($value) > 20) {
                     return (new Pandamp_Utility_Posts())->sanitize_post_content($value);
                 }
                 echo "content file kosong\n";
                 return '';
             }
             // Both diagnostics are reachable now; previously the second one
             // sat behind an unconditional return.
             if ($ret === 127) {
                 print "Could not find pdftotext tool.\n";
             } elseif ($ret === 1) {
                 print "Could not find pdf file.\n";
             }
             return '';

         case 'text/html':
         case 'text/plain':
             $docHtml = Zend_Search_Lucene_Document_Html::loadHTMLFile($sDir);
             return $docHtml->getFieldValue('body');

         case 'application/x-javascript':
         case 'application/octet-stream':
         case 'application/msword':
             // strpos() returns 0 for a match at the start, so compare against false explicitly.
             if (strpos(strtolower($fileName), '.doc') === false) {
                 return '';
             }
             $extractor = $this->_wordExtractor;
             system("{$extractor} -m cp850.txt " . escapeshellarg($sDir) . ' > ' . escapeshellarg($outpath), $ret);
             if ($ret === 0) {
                 $value = file_get_contents($outpath);
                 unlink($outpath);
                 return $value === false ? '' : $value;
             }
             // The shell redirect creates the output file even when the tool fails; do not leave it behind.
             if (file_exists($outpath)) {
                 unlink($outpath);
             }
             return '';

         default:
             return '';
     }
 }