Load HTML document from a file
public static loadHTMLFile ( string $file, boolean $storeContent = false, string $defaultEncoding = '' ) : Zend_Search_Lucene_Document_Html | ||
$file | string | |
$storeContent | boolean | |
$defaultEncoding | string | HTML encoding; used when the document does not specify one via a Content-Type HTTP-EQUIV meta tag. |
Returns | Zend_Search_Lucene_Document_Html |
/**
 * Checks HTML loading from a string (with term highlighting) and from a
 * fixture file (with header-link and body-link extraction).
 *
 * Assertions pass the expected value first so that failure messages label
 * "expected" and "actual" correctly, and arrays are compared with
 * assertEquals() so a failure shows the differing elements.
 */
public function testHtml()
{
    $doc = Zend_Search_Lucene_Document_Html::loadHTML('<HTML><HEAD><TITLE>Page title</TITLE></HEAD><BODY>Document body.</BODY></HTML>');
    $this->assertTrue($doc instanceof Zend_Search_Lucene_Document_Html);

    // Highlighting wraps the matched word and the document is re-serialised.
    $doc->highlight('document', '#66ffff');
    $expectedHtml = "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\" \"http://www.w3.org/TR/REC-html40/loose.dtd\">\n<html>\n<head><title>Page title</title></head>\n<body><p><b style=\"color:black;background-color:#66ffff\">Document</b> body.</p></body>\n</html>\n";
    $this->assertEquals($expectedHtml, $doc->getHTML());

    $doc = Zend_Search_Lucene_Document_Html::loadHTMLFile(dirname(__FILE__) . '/_files/_indexSource/contributing.documentation.html', true);
    $this->assertTrue($doc instanceof Zend_Search_Lucene_Document_Html);

    $this->assertEquals(
        array('index.html', 'contributing.html', 'contributing.bugs.html', 'contributing.wishlist.html'),
        array_values($doc->getHeaderLinks())
    );
    $this->assertEquals(
        array('contributing.bugs.html', 'contributing.wishlist.html', 'developers.documentation.html', 'faq.translators-revision-tracking.html', 'index.html', 'contributing.html'),
        array_values($doc->getLinks())
    );
}
/**
 * Verifies that an HTML string can be loaded and highlighted, and that a
 * fixture file yields the expected header links and body links.
 */
public function testHtml()
{
    $markup = '<HTML><HEAD><TITLE>Page title</TITLE></HEAD><BODY>Document body.</BODY></HTML>';
    $htmlDoc = Zend_Search_Lucene_Document_Html::loadHTML($markup);
    $this->assertTrue($htmlDoc instanceof Zend_Search_Lucene_Document_Html);

    // Only the highlighted fragment is checked, not the whole serialised page.
    $htmlDoc->highlight('document', '#66ffff');
    $highlighted = "<b style=\"color:black;background-color:#66ffff\">Document</b> body.";
    $this->assertTrue(strpos($htmlDoc->getHTML(), $highlighted) !== false);

    $fixture = dirname(__FILE__) . '/_indexSource/_files/contributing.documentation.html';
    $fileDoc = Zend_Search_Lucene_Document_Html::loadHTMLFile($fixture, true);
    $this->assertTrue($fileDoc instanceof Zend_Search_Lucene_Document_Html);

    $expectedHeaderLinks = array('index.html', 'contributing.html', 'contributing.bugs.html', 'contributing.wishlist.html');
    $actualHeaderLinks = array_values($fileDoc->getHeaderLinks());
    $this->assertTrue($actualHeaderLinks == $expectedHeaderLinks);

    $expectedLinks = array('contributing.bugs.html', 'contributing.wishlist.html', 'developers.documentation.html', 'faq.translators-revision-tracking.html', 'index.html', 'contributing.html');
    $actualLinks = array_values($fileDoc->getLinks());
    $this->assertTrue($actualLinks == $expectedLinks);
}
/**
 * Extracts the body text of an HTML file.
 *
 * A missing file, or any exception raised while loading it, yields an empty
 * string instead of an error.
 *
 * @param String $filename Full filesystem path to the file to process.
 * @return String Text extracted from the file, or '' when it cannot be loaded.
 */
public static function extract($filename)
{
    if (file_exists($filename)) {
        try {
            $htmlDoc = Zend_Search_Lucene_Document_Html::loadHTMLFile($filename, true);
        } catch (Exception $loadError) {
            // Best effort: an unreadable document contributes no text.
            return '';
        }

        // Read outside the try block so only loading failures are swallowed.
        return $htmlDoc->body;
    }

    return '';
}
/**
 * Demo action: indexes the bundled Kohana home page and re-renders the index
 * view with a confirmation message.
 */
public function addurl()
{
    // A local example file stands in for a remote page in this demo.
    $pathParts = array("kosearch", "examples", "kohana_home.html");
    $filename = MODPATH . implode(DIRECTORY_SEPARATOR, $pathParts);

    // The Search class loads the Zend libraries. They must be available before
    // Zend_Search_Lucene_Document_Html is used, so load them explicitly first.
    Search::instance()->load_search_libs();

    $htmlDoc = Zend_Search_Lucene_Document_Html::loadHTMLFile($filename, TRUE, "utf-8");
    Search::instance()->addDocument($htmlDoc);

    $message = 'Kohana page successfully added ↓ <a href="#form2" title="scroll down">scroll down</a> ↓';
    $this->index($message);
}
/**
 * Verifies that header links and body links are extracted from the
 * contributing.documentation.html fixture in the expected order.
 */
public function testHtmlLinksProcessing()
{
    $fixture = dirname(__FILE__) . '/_indexSource/_files/contributing.documentation.html';
    $htmlDoc = Zend_Search_Lucene_Document_Html::loadHTMLFile($fixture, true);
    $this->assertTrue($htmlDoc instanceof Zend_Search_Lucene_Document_Html);

    $expectedHeaderLinks = array('index.html', 'contributing.html', 'contributing.bugs.html', 'contributing.wishlist.html');
    $actualHeaderLinks = array_values($htmlDoc->getHeaderLinks());
    $this->assertTrue($actualHeaderLinks == $expectedHeaderLinks);

    $expectedLinks = array('contributing.bugs.html', 'contributing.wishlist.html', 'developers.documentation.html', 'faq.translators-revision-tracking.html', 'index.html', 'contributing.html');
    $actualLinks = array_values($htmlDoc->getLinks());
    $this->assertTrue($actualLinks == $expectedLinks);
}
/**
 * Extracts the plain-text content of the file attached to a catalog item.
 *
 * Looks up the RELATED_FILE row for $guid, locates the upload on disk (first
 * directly under uploads/files, then under uploads/files/<relatedGuid>) and
 * converts it to text according to $mimeType. Progress and diagnostic
 * messages for the PDF branch are written to standard output.
 *
 * @param string $guid       Item GUID used to find the RELATED_FILE relation.
 * @param string $systemName Stored file name; when not empty it replaces $fileName.
 * @param string $fileName   Original file name.
 * @param string $mimeType   MIME type that selects the extraction strategy.
 * @param string $lang       Passed to getDbHandler() to select the database handle.
 * @return string Extracted text, or '' when nothing could be extracted.
 */
private function _extractText($guid, $systemName, $fileName, $mimeType, $lang = 'id')
{
    // NOTE(review): $guid is interpolated into the SQL text. If it can ever
    // originate from user input, switch to a bound parameter.
    $query = "SELECT * FROM KutuRelatedItem where itemGuid='{$guid}' AND relateAs='RELATED_FILE'";
    $results = $this->getDbHandler($lang)->query($query);
    $rowset = $results->fetchAll(PDO::FETCH_OBJ);
    if (!count($rowset)) {
        return '';
    }

    $parentCatalogGuid = $rowset[0]->relatedGuid;
    if (!empty($systemName)) {
        $fileName = $systemName;
    }

    // The upload lives either directly in uploads/files or in a sub-directory
    // named after the parent catalog GUID; the first existing candidate wins.
    $uploadDir = ROOT_DIR . DIRECTORY_SEPARATOR . 'uploads' . DIRECTORY_SEPARATOR . 'files' . DIRECTORY_SEPARATOR;
    $candidates = array(
        $uploadDir . $fileName,
        $uploadDir . $parentCatalogGuid . DIRECTORY_SEPARATOR . $fileName,
    );
    $sDir = '';
    foreach ($candidates as $candidate) {
        if (file_exists($candidate)) {
            $sDir = $candidate;
            break;
        }
    }
    if ($sDir === '') {
        return '';
    }

    // Temporary file that receives the external tool's output.
    $outpath = $sDir . '.txt';

    switch ($mimeType) {
        case 'application/pdf':
            // Paths are escaped so names with spaces or shell metacharacters
            // reach the extractor as single arguments.
            system($this->_pdfExtractor . ' ' . escapeshellarg($sDir) . ' ' . escapeshellarg($outpath), $ret);
            if ($ret == 0) {
                $value = file_get_contents($outpath);
                unlink($outpath);
                echo 'content PDF: ' . $sDir . ' ' . strlen($value) . "\n";
                if (strlen($value) > 20) {
                    return (new Pandamp_Utility_Posts())->sanitize_post_content($value);
                }
                // Output of 20 bytes or fewer is treated as an empty document.
                echo "content file kosong\n";
                return '';
            }
            if ($ret == 127) {
                print "Could not find pdftotext tool.\n";
            } elseif ($ret == 1) {
                print "Could not find pdf file.\n";
            }
            return '';

        case 'text/html':
        case 'text/plain':
            $docHtml = Zend_Search_Lucene_Document_Html::loadHTMLFile($sDir);
            return $docHtml->getFieldValue('body');

        case 'application/x-javascript':
        case 'application/octet-stream':
        case 'application/msword':
            // These MIME types are only handled when the name contains ".doc".
            if (strpos(strtolower($fileName), '.doc') === false) {
                return '';
            }
            system($this->_wordExtractor . ' -m cp850.txt ' . escapeshellarg($sDir) . ' > ' . escapeshellarg($outpath), $ret);
            if ($ret == 0) {
                $value = file_get_contents($outpath);
                unlink($outpath);
                return $value === false ? '' : $value;
            }
            return '';

        default:
            return '';
    }
}
/**
 * Extracts the plain-text content of the file attached to a catalog item,
 * using the Kutu_Core_Orm_Table_RelatedItem table class for the lookup.
 *
 * Finds the RELATED_FILE row for $guid, locates the upload on disk (first
 * directly under uploads/files, then under uploads/files/<relatedGuid>) and
 * converts it to text according to $mimeType.
 *
 * @param string $guid       Item GUID used to find the RELATED_FILE relation.
 * @param string $systemName Stored file name; when not empty it replaces $fileName.
 * @param string $fileName   Original file name.
 * @param string $mimeType   MIME type that selects the extraction strategy.
 * @return string Extracted text, or '' when nothing could be extracted.
 */
private function _extractText_ZendDb($guid, $systemName, $fileName, $mimeType)
{
    // NOTE(review): $guid is interpolated into the WHERE clause. If it can ever
    // originate from user input, quote it through the table's adapter.
    $tblRelatedItem = new Kutu_Core_Orm_Table_RelatedItem();
    $rowset = $tblRelatedItem->fetchAll("itemGuid='{$guid}' AND relateAs='RELATED_FILE'");
    if (!count($rowset)) {
        return '';
    }

    $parentCatalogGuid = $rowset->current()->relatedGuid;
    if (!empty($systemName)) {
        $fileName = $systemName;
    }

    // The upload lives either directly in uploads/files or in a sub-directory
    // named after the parent catalog GUID; the first existing candidate wins.
    $uploadDir = KUTU_ROOT_DIR . DIRECTORY_SEPARATOR . 'uploads' . DIRECTORY_SEPARATOR . 'files' . DIRECTORY_SEPARATOR;
    $candidates = array(
        $uploadDir . $fileName,
        $uploadDir . $parentCatalogGuid . DIRECTORY_SEPARATOR . $fileName,
    );
    $sDir = '';
    foreach ($candidates as $candidate) {
        if (file_exists($candidate)) {
            $sDir = $candidate;
            break;
        }
    }
    if ($sDir === '') {
        return '';
    }

    // Temporary file that receives the external tool's output.
    $outpath = $sDir . '.txt';

    switch ($mimeType) {
        case 'application/pdf':
            // Paths are escaped so names with spaces or shell metacharacters
            // reach the extractor as single arguments.
            system($this->_pdfExtractor . ' ' . escapeshellarg($sDir) . ' ' . escapeshellarg($outpath), $ret);
            if ($ret != 0) {
                // Any non-zero exit status (tool missing, file missing, ...) yields no text.
                return '';
            }
            $value = file_get_contents($outpath);
            unlink($outpath);
            // Output of 20 bytes or fewer is treated as an empty document.
            if (strlen($value) > 20) {
                return $this->clean_string_input($value);
            }
            return '';

        case 'text/html':
        case 'text/plain':
            $docHtml = Zend_Search_Lucene_Document_Html::loadHTMLFile($sDir);
            return $docHtml->getFieldValue('body');

        case 'application/x-javascript':
        case 'application/octet-stream':
        case 'application/msword':
            // These MIME types are only handled when the name contains ".doc".
            if (strpos(strtolower($fileName), '.doc') === false) {
                return '';
            }
            system($this->_wordExtractor . ' -m cp850.txt ' . escapeshellarg($sDir) . ' > ' . escapeshellarg($outpath), $ret);
            if ($ret != 0) {
                return '';
            }
            $value = file_get_contents($outpath);
            unlink($outpath);
            return $value === false ? '' : $value;

        default:
            return '';
    }
}
/**
 * Loads the HTML file at the given path as a Lucene document and attaches a
 * "filename" text field holding the file's base name.
 *
 * @param $path Path to the HTML file.
 * @return Zend_Search_Lucene_Document
 */
function createHTMLDocument($path)
{
    $htmlDocument = Zend_Search_Lucene_Document_Html::loadHTMLFile($path);

    $filenameField = Zend_Search_Lucene_Field::Text('filename', basename($path));
    $htmlDocument->addField($filenameField);

    return $htmlDocument;
}
/**
 * Crawls breadth-first from $url, indexing every page that can be loaded.
 *
 * Each URL is sanitised, skipped when already visited in this run, then read
 * from the on-disk cache when a cached copy exists or fetched with the HTTP
 * client otherwise. Links of each indexed page are appended to the queue.
 * When the number of visited URLs reaches $this->_maxLinks the script
 * terminates via exit.
 *
 * @param string $url Start URL.
 */
protected function _spider($url)
{
    $queue = array($url);
    $visited = array();

    while (count($queue) > 0) {
        $doc = null;
        $url = $this->_sanitizeUrl(array_shift($queue));

        // Skip URLs rejected by the sanitiser and URLs already processed.
        if (!$url) {
            continue;
        }
        if (in_array($url, $visited)) {
            continue;
        }
        $visited[] = $url;
        Bbx_Log::write('Spidering url ' . $url, null, Bbx_Search::LOG);

        $cachePath = APPLICATION_PATH . '/../www/cached' . $url . '.html';
        if (file_exists($cachePath)) {
            Bbx_Log::write('Found file in cache', null, Bbx_Search::LOG);
            try {
                $doc = Zend_Search_Lucene_Document_Html::loadHTMLFile($cachePath, false, 'utf-8');
            } catch (Exception $e) {
                Bbx_Log::write('Unable to open file: ' . $cachePath, null, Bbx_Search::LOG);
            }
        } else {
            $this->_client->setUri($this->_getAbsoluteUrl($url));
            try {
                $response = $this->_client->request();
                $status = $response->getStatus();
                Bbx_Log::write('Client response code ' . $status, null, Bbx_Search::LOG);
                if ($status == '200') {
                    $doc = Zend_Search_Lucene_Document_Html::loadHTML($response->getBody(), false, 'utf-8');
                }
            } catch (Exception $e) {
                Bbx_Log::write('Request failed: ' . $e->getMessage(), null, Bbx_Search::LOG);
            }
        }

        // Nothing could be loaded for this URL; move on to the next one.
        if ($doc === null) {
            continue;
        }

        $this->_search()->indexDoc($doc, $url);
        $this->_indexed++;

        // NOTE(review): links are filtered against $this->_visited, while this
        // loop records visited URLs in the local $visited - confirm which is intended.
        $links = array_diff($doc->getLinks(), $this->_visited);

        if (count($visited) >= $this->_maxLinks) {
            Bbx_Log::write('Reached max number of links (' . $this->_maxLinks . '), exiting', null, Bbx_Search::LOG);
            exit;
        }
        $queue = array_merge($queue, $links);
    }
}
/**
 * Builds the Lucene document for a node and adds it to the index.
 *
 * Depending on the file extension and the content-parsing options, the
 * document is created by the HTML, Docx, Pptx or Xlsx loader, or as an empty
 * document. Node fields, a serialised copy of the metadata, the "shared"
 * indexable metadata and the configured meta fields are then attached. When
 * user-scoped metadata exists and a user is logged in, a second, private
 * document is added to the index as well.
 *
 * @param AJXP_Node $ajxpNode
 * @param Zend_Search_Lucene_Interface $index
 * @throws Exception When the selected loader returns null.
 * @return Zend_Search_Lucene_Document The shared document that was indexed.
 */
public function createIndexedDocument($ajxpNode, &$index)
{
    if (!empty($this->metaFields)) {
        $ajxpNode->loadNodeInfo(false, false, "all");
    } else {
        $ajxpNode->loadNodeInfo();
    }
    $ext = strtolower(pathinfo($ajxpNode->getLabel(), PATHINFO_EXTENSION));

    // Content parsing is skipped for files larger than the configured limit.
    $parseContent = $this->indexContent;
    if ($parseContent && $ajxpNode->bytesize > $this->getFilteredOption("PARSE_CONTENT_MAX_SIZE")) {
        $parseContent = false;
    }

    if ($parseContent && in_array($ext, explode(",", $this->getFilteredOption("PARSE_CONTENT_HTML")))) {
        $doc = @Zend_Search_Lucene_Document_Html::loadHTMLFile($ajxpNode->getUrl());
    } elseif ($parseContent && $ext == "docx" && class_exists("Zend_Search_Lucene_Document_Docx")) {
        $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
        $doc = @Zend_Search_Lucene_Document_Docx::loadDocxFile($realFile);
    } elseif ($parseContent && $ext == "pptx" && class_exists("Zend_Search_Lucene_Document_Pptx")) {
        // This branch previously tested "docx" again and so could never run.
        $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
        $doc = @Zend_Search_Lucene_Document_Pptx::loadPptxFile($realFile);
    } elseif ($parseContent && $ext == "xlsx" && class_exists("Zend_Search_Lucene_Document_Xlsx")) {
        $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
        $doc = @Zend_Search_Lucene_Document_Xlsx::loadXlsxFile($realFile);
    } else {
        $doc = new Zend_Search_Lucene_Document();
    }
    if ($doc == null) {
        throw new Exception("Could not load document");
    }

    // The encoding belongs to the field factory, not to addField(), which
    // ignores extra arguments. This matches how the private document below
    // builds the same fields.
    $encoding = SystemTextEncoding::getEncoding();
    $doc->addField(Zend_Search_Lucene_Field::Keyword("node_url", $ajxpNode->getUrl(), $encoding));
    $doc->addField(Zend_Search_Lucene_Field::Keyword("node_path", str_replace("/", "AJXPFAKESEP", $ajxpNode->getPath()), $encoding));
    $doc->addField(Zend_Search_Lucene_Field::Text("basename", basename($ajxpNode->getPath()), $encoding));
    $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_node", "yes", $encoding));
    $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_scope", "shared"));
    $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_modiftime", date("Ymd", $ajxpNode->ajxp_modiftime)));
    $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_bytesize", $ajxpNode->bytesize));

    // Fall back to the file extension when the node carries no mime value.
    $ajxpMime = $ajxpNode->ajxp_mime;
    if (empty($ajxpMime)) {
        $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_mime", pathinfo($ajxpNode->getLabel(), PATHINFO_EXTENSION)));
    } else {
        $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_mime", $ajxpNode->ajxp_mime));
    }

    // Store a cached copy of the metadata
    $serializedMeta = base64_encode(serialize($ajxpNode->metadata));
    $doc->addField(Zend_Search_Lucene_Field::Binary("serialized_metadata", $serializedMeta));

    if (isset($ajxpNode->indexableMetaKeys["shared"])) {
        foreach ($ajxpNode->indexableMetaKeys["shared"] as $sharedField) {
            if ($ajxpNode->{$sharedField}) {
                $doc->addField(Zend_Search_Lucene_Field::Keyword($sharedField, $ajxpNode->{$sharedField}));
            }
        }
    }
    foreach ($this->metaFields as $field) {
        if ($ajxpNode->{$field} != null) {
            $doc->addField(Zend_Search_Lucene_Field::Text("ajxp_meta_{$field}", $ajxpNode->{$field}, $encoding));
        }
    }

    // User-scoped metadata goes into a separate document tagged with the user id.
    if (isset($ajxpNode->indexableMetaKeys["user"]) && count($ajxpNode->indexableMetaKeys["user"]) && AuthService::usersEnabled() && AuthService::getLoggedUser() != null) {
        $privateDoc = new Zend_Search_Lucene_Document();
        $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("node_url", $ajxpNode->getUrl(), $encoding));
        $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("node_path", str_replace("/", "AJXPFAKESEP", $ajxpNode->getPath()), $encoding));
        $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_scope", "user"));
        $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_user", AuthService::getLoggedUser()->getId()));
        foreach ($ajxpNode->indexableMetaKeys["user"] as $userField) {
            if ($ajxpNode->{$userField}) {
                $privateDoc->addField(Zend_Search_Lucene_Field::Keyword($userField, $ajxpNode->{$userField}));
            }
        }
        $privateDoc->addField(Zend_Search_Lucene_Field::Binary("serialized_metadata", $serializedMeta));
        $index->addDocument($privateDoc);
    }

    if ($parseContent) {
        $body = $this->extractIndexableContent($ajxpNode);
        if (!empty($body)) {
            $doc->addField(Zend_Search_Lucene_Field::unStored("body", $body));
        }
    }

    $index->addDocument($doc);
    return $doc;
}
/**
 * Builds the Lucene document for a node and adds it to the index.
 *
 * Depending on the file extension and the content-parsing options, the
 * document is created by the HTML, Docx, Pptx or Xlsx loader, or as an empty
 * document. Node fields, a serialised copy of the metadata, the "shared"
 * indexable metadata and the configured meta fields are then attached. When
 * user-scoped metadata exists and a user is logged in, a second, private
 * document is added to the index as well. Finally a "body" field is filled
 * from plain-text files, from unoconv output or from pdftotext output, when
 * the corresponding options are set.
 *
 * @param AJXP_Node $ajxpNode
 * @param Zend_Search_Lucene_Interface $index
 * @throws Exception When the selected loader returns null.
 * @return Zend_Search_Lucene_Document The shared document that was indexed.
 */
public function createIndexedDocument($ajxpNode, &$index)
{
    $ajxpNode->loadNodeInfo();
    $ext = strtolower(pathinfo($ajxpNode->getLabel(), PATHINFO_EXTENSION));

    // Content parsing is skipped for files larger than the configured limit.
    $parseContent = $this->indexContent;
    if ($parseContent && $ajxpNode->bytesize > $this->getFilteredOption("PARSE_CONTENT_MAX_SIZE")) {
        $parseContent = false;
    }

    if ($parseContent && in_array($ext, explode(",", $this->getFilteredOption("PARSE_CONTENT_HTML")))) {
        $doc = @Zend_Search_Lucene_Document_Html::loadHTMLFile($ajxpNode->getUrl());
    } elseif ($parseContent && $ext == "docx" && class_exists("Zend_Search_Lucene_Document_Docx")) {
        $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
        $doc = @Zend_Search_Lucene_Document_Docx::loadDocxFile($realFile);
    } elseif ($parseContent && $ext == "pptx" && class_exists("Zend_Search_Lucene_Document_Pptx")) {
        // This branch previously tested "docx" again and so could never run.
        $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
        $doc = @Zend_Search_Lucene_Document_Pptx::loadPptxFile($realFile);
    } elseif ($parseContent && $ext == "xlsx" && class_exists("Zend_Search_Lucene_Document_Xlsx")) {
        $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
        $doc = @Zend_Search_Lucene_Document_Xlsx::loadXlsxFile($realFile);
    } else {
        $doc = new Zend_Search_Lucene_Document();
    }
    if ($doc == null) {
        throw new Exception("Could not load document");
    }

    // The encoding belongs to the field factory, not to addField(), which
    // ignores extra arguments. This matches how the private document below
    // builds the same fields.
    $encoding = SystemTextEncoding::getEncoding();
    $doc->addField(Zend_Search_Lucene_Field::Keyword("node_url", $ajxpNode->getUrl(), $encoding));
    $doc->addField(Zend_Search_Lucene_Field::Keyword("node_path", str_replace("/", "AJXPFAKESEP", $ajxpNode->getPath()), $encoding));
    $doc->addField(Zend_Search_Lucene_Field::Text("basename", basename($ajxpNode->getPath()), $encoding));
    $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_node", "yes", $encoding));
    $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_scope", "shared"));
    $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_modiftime", date("Ymd", $ajxpNode->ajxp_modiftime)));
    $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_bytesize", $ajxpNode->bytesize));

    // Fall back to the file extension when the node carries no mime value.
    $ajxpMime = $ajxpNode->ajxp_mime;
    if (empty($ajxpMime)) {
        $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_mime", pathinfo($ajxpNode->getLabel(), PATHINFO_EXTENSION)));
    } else {
        $doc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_mime", $ajxpNode->ajxp_mime));
    }

    // Store a cached copy of the metadata
    $serializedMeta = base64_encode(serialize($ajxpNode->metadata));
    $doc->addField(Zend_Search_Lucene_Field::Binary("serialized_metadata", $serializedMeta));

    if (isset($ajxpNode->indexableMetaKeys["shared"])) {
        foreach ($ajxpNode->indexableMetaKeys["shared"] as $sharedField) {
            if ($ajxpNode->{$sharedField}) {
                $doc->addField(Zend_Search_Lucene_Field::Keyword($sharedField, $ajxpNode->{$sharedField}));
            }
        }
    }
    foreach ($this->metaFields as $field) {
        if ($ajxpNode->{$field} != null) {
            $doc->addField(Zend_Search_Lucene_Field::Text("ajxp_meta_{$field}", $ajxpNode->{$field}, $encoding));
        }
    }

    // User-scoped metadata goes into a separate document tagged with the user id.
    if (isset($ajxpNode->indexableMetaKeys["user"]) && count($ajxpNode->indexableMetaKeys["user"]) && AuthService::usersEnabled() && AuthService::getLoggedUser() != null) {
        $privateDoc = new Zend_Search_Lucene_Document();
        $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("node_url", $ajxpNode->getUrl(), $encoding));
        $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("node_path", str_replace("/", "AJXPFAKESEP", $ajxpNode->getPath()), $encoding));
        $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_scope", "user"));
        $privateDoc->addField(Zend_Search_Lucene_Field::Keyword("ajxp_user", AuthService::getLoggedUser()->getId()));
        foreach ($ajxpNode->indexableMetaKeys["user"] as $userField) {
            if ($ajxpNode->{$userField}) {
                $privateDoc->addField(Zend_Search_Lucene_Field::Keyword($userField, $ajxpNode->{$userField}));
            }
        }
        $privateDoc->addField(Zend_Search_Lucene_Field::Binary("serialized_metadata", $serializedMeta));
        $index->addDocument($privateDoc);
    }

    // Plain-text files are indexed verbatim.
    if ($parseContent && in_array($ext, explode(",", $this->getFilteredOption("PARSE_CONTENT_TXT")))) {
        $doc->addField(Zend_Search_Lucene_Field::unStored("body", file_get_contents($ajxpNode->getUrl())));
    }

    // Office formats are converted with unoconv when it is configured.
    $unoconv = $this->getFilteredOption("UNOCONV");
    $pipe = false;
    if ($parseContent && !empty($unoconv) && in_array($ext, array("doc", "odt", "xls", "ods"))) {
        $targetExt = "txt";
        if (in_array($ext, array("xls", "ods"))) {
            $targetExt = "csv";
        } elseif (in_array($ext, array("odp", "ppt"))) {
            // NOTE(review): the enclosing condition only admits doc/odt/xls/ods,
            // so this branch (and $pipe = true) cannot currently be reached -
            // confirm whether odp/ppt should be added to the outer list.
            $targetExt = "pdf";
            $pipe = true;
        }
        $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
        $unoconv = "HOME=" . AJXP_Utils::getAjxpTmpDir() . " " . $unoconv . " --stdout -f {$targetExt} " . escapeshellarg($realFile);
        if ($pipe) {
            $newTarget = str_replace(".{$ext}", ".pdf", $realFile);
            // Escaped like the input path so names with spaces survive the redirect.
            $unoconv .= " > " . escapeshellarg($newTarget);
            register_shutdown_function("unlink", $newTarget);
        }
        $output = array();
        exec($unoconv, $output, $return);
        if (!$pipe) {
            $out = implode("\n", $output);
            $enc = 'ISO-8859-1';
            $asciiString = iconv($enc, 'ASCII//TRANSLIT//IGNORE', $out);
            $doc->addField(Zend_Search_Lucene_Field::unStored("body", $asciiString));
        } else {
            // The converted PDF is picked up by the pdftotext step below.
            $ext = "pdf";
        }
    }

    // PDFs (original or produced above) are converted with pdftotext when configured.
    $pdftotext = $this->getFilteredOption("PDFTOTEXT");
    if ($parseContent && !empty($pdftotext) && in_array($ext, array("pdf"))) {
        $realFile = call_user_func(array($ajxpNode->wrapperClassName, "getRealFSReference"), $ajxpNode->getUrl());
        if ($pipe && isset($newTarget) && is_file($newTarget)) {
            $realFile = $newTarget;
        }
        $cmd = $pdftotext . " " . escapeshellarg($realFile) . " -";
        $output = array();
        exec($cmd, $output, $return);
        $out = implode("\n", $output);
        $enc = 'UTF8';
        $asciiString = iconv($enc, 'ASCII//TRANSLIT//IGNORE', $out);
        $doc->addField(Zend_Search_Lucene_Field::unStored("body", $asciiString));
    }

    $index->addDocument($doc);
    return $doc;
}
/**
 * Extracts the plain-text content of the file attached to a catalog item.
 *
 * Looks up the RELATED_FILE row for $guid, locates the upload on disk (first
 * directly under uploads/files, then under uploads/files/<relatedGuid>) and
 * converts it to text according to $mimeType. Progress and diagnostic
 * messages for the PDF branch are written to standard output.
 *
 * @param string $guid       Item GUID used to find the RELATED_FILE relation.
 * @param string $systemName Stored file name; when not empty it replaces $fileName.
 * @param string $fileName   Original file name.
 * @param string $mimeType   MIME type that selects the extraction strategy.
 * @param string $lang       Passed to getDbHandler() to select the database handle.
 * @return string Extracted text, or '' when nothing could be extracted.
 */
private function _extractText($guid, $systemName, $fileName, $mimeType, $lang)
{
    $db = $this->getDbHandler($lang);
    // NOTE(review): $guid is interpolated into the SQL text. If it can ever
    // originate from user input, switch to a bound parameter.
    $query = "SELECT * FROM KutuRelatedItem where itemGuid='{$guid}' AND relateAs='RELATED_FILE'";
    $results = $db->query($query);
    $rowset = $results->fetchAll(PDO::FETCH_OBJ);
    if (!count($rowset)) {
        return '';
    }

    $parentCatalogGuid = $rowset[0]->relatedGuid;
    if (!empty($systemName)) {
        $fileName = $systemName;
    }

    // The upload lives either directly in uploads/files or in a sub-directory
    // named after the parent catalog GUID; the first existing candidate wins.
    $uploadDir = ROOT_DIR . DIRECTORY_SEPARATOR . 'uploads' . DIRECTORY_SEPARATOR . 'files' . DIRECTORY_SEPARATOR;
    $candidates = array(
        $uploadDir . $fileName,
        $uploadDir . $parentCatalogGuid . DIRECTORY_SEPARATOR . $fileName,
    );
    $sDir = '';
    foreach ($candidates as $candidate) {
        if (file_exists($candidate)) {
            $sDir = $candidate;
            break;
        }
    }
    if ($sDir === '') {
        return '';
    }

    // Temporary file that receives the external tool's output.
    $outpath = $sDir . '.txt';

    switch ($mimeType) {
        case 'application/pdf':
            // Paths are escaped so names with spaces or shell metacharacters
            // reach the extractor as single arguments.
            system($this->_pdfExtractor . ' ' . escapeshellarg($sDir) . ' ' . escapeshellarg($outpath), $ret);
            if ($ret == 0) {
                $value = file_get_contents($outpath);
                unlink($outpath);
                echo 'content PDF: ' . $sDir . ' ' . strlen($value) . "\n";
                if (strlen($value) > 20) {
                    return (new Pandamp_Utility_Posts())->sanitize_post_content($value);
                }
                // Output of 20 bytes or fewer is treated as an empty document.
                echo "content file kosong\n";
                return '';
            }
            if ($ret == 127) {
                print "Could not find pdftotext tool.\n";
            } elseif ($ret == 1) {
                print "Could not find pdf file.\n";
            }
            return '';

        case 'text/html':
        case 'text/plain':
            $docHtml = Zend_Search_Lucene_Document_Html::loadHTMLFile($sDir);
            return $docHtml->getFieldValue('body');

        case 'application/x-javascript':
        case 'application/octet-stream':
        case 'application/msword':
            // These MIME types are only handled when the name contains ".doc".
            if (strpos(strtolower($fileName), '.doc') === false) {
                return '';
            }
            system($this->_wordExtractor . ' -m cp850.txt ' . escapeshellarg($sDir) . ' > ' . escapeshellarg($outpath), $ret);
            if ($ret == 0) {
                $value = file_get_contents($outpath);
                unlink($outpath);
                return $value === false ? '' : $value;
            }
            return '';

        default:
            return '';
    }
}