/**
 * Object constructor
 *
 * Parses the given PDF data, adds the extracted text as the "body" field and
 * adds one text field per entry returned by the parsed document's getDetails().
 * Detail keys are lower-cased and "author" is renamed to "creator".
 *
 * @param string  $data         PDF content handed to Parser::parseContent()
 * @param boolean $storeContent selects Field::Text (true) or Field::UnStored (false) for the body
 * @throws NotIndexedException wraps any exception raised while parsing or adding fields
 */
private function __construct($data, $storeContent)
{
    //TODO check PDF >1.5 metadata extraction

    // The parser is created outside the try block, so a failure here is not wrapped.
    $parser = new Parser();

    try {
        $pdf  = $parser->parseContent($data);
        $text = $pdf->getText();

        // Body field: stored or unstored depending on the caller's choice
        $bodyField = $storeContent
            ? Document\Field::Text('body', $text, 'UTF-8')
            : Document\Field::UnStored('body', $text, 'UTF-8');
        $this->addField($bodyField);

        // Meta data properties
        foreach ($pdf->getDetails() as $name => $detail) {
            $fieldName = strtolower($name);
            if ($fieldName === 'author') {
                $fieldName = 'creator';
            }
            $this->addField(Document\Field::Text($fieldName, $detail, 'UTF-8'));
        }
    } catch (\Exception $cause) {
        throw new NotIndexedException(null, null, $cause);
    }
}
/**
 * Checks the properties of a field created through Document\Field::UnStored().
 */
public function testUnStored()
{
    $field = Document\Field::UnStored('field', 'value');

    // assertEquals() takes the expected value first and the actual value
    // second; with the arguments in that order a failure message reports
    // "expected"/"actual" the right way round.
    $this->assertEquals(1, $field->boost);
    $this->assertEquals('UTF-8', $field->encoding);
    $this->assertEquals(false, $field->isBinary);
    $this->assertEquals(true, $field->isIndexed);
    $this->assertEquals(false, $field->isStored);
    $this->assertEquals(true, $field->isTokenized);
    $this->assertEquals('field', $field->name);
    $this->assertEquals('value', $field->value);
}
/**
 * Object constructor
 *
 * Reads content.xml from the given archive, collects the text of every
 * text:h and text:p element into the "headlines" and "body" fields, then adds
 * the properties returned by extractMetaData(). The file name is used as the
 * title when the meta data contains none.
 *
 * @param string  $fileName
 * @param boolean $storeContent selects Field::Text (true) or Field::UnStored (false) for headlines and body
 * @throws ExtensionNotLoadedException when the ZipArchive class is not available
 * @throws RuntimeException            when the archive cannot be opened or content.xml is missing or unparsable
 */
private function __construct($fileName, $storeContent)
{
    if (!class_exists('ZipArchive', false)) {
        throw new ExtensionNotLoadedException('Open Document Text processing functionality requires Zip extension to be loaded');
    }

    // Document data holders
    $documentHeadlines  = array();
    $documentParagraphs = array();

    // Open the package; open() returns true on success and an error code otherwise
    $package = new \ZipArchive();
    if ($package->open($fileName) !== true) {
        throw new RuntimeException('Invalid archive or corrupted .odt file.');
    }

    // Read the document content
    $content = $package->getFromName('content.xml');
    if ($content === false) {
        $package->close();
        throw new RuntimeException('Invalid archive or corrupted .odt file.');
    }

    // Prevent php from loading remote resources.
    // libxml_disable_entity_loader() is deprecated as of PHP 8.0, so it is
    // only toggled on older versions.
    $legacyLoader = \PHP_VERSION_ID < 80000;
    $loadEntities = $legacyLoader ? libxml_disable_entity_loader(true) : null;
    $sxe = simplexml_load_string($content, 'SimpleXMLElement', LIBXML_NOBLANKS | LIBXML_COMPACT);
    if ($legacyLoader) {
        // Restore entity loader state
        libxml_disable_entity_loader($loadEntities);
    }

    if ($sxe === false) {
        $package->close();
        throw new RuntimeException('Invalid archive or corrupted .odt file.');
    }

    // xpath() may yield a non-array on failure; treat that as "no matches"
    $headlineNodes = $sxe->xpath('//text:h');
    foreach ($headlineNodes ?: array() as $headline) {
        $documentHeadlines[] = strip_tags($headline->asXML());
    }

    $paragraphNodes = $sxe->xpath('//text:p');
    foreach ($paragraphNodes ?: array() as $paragraph) {
        $documentParagraphs[] = strip_tags($paragraph->asXML());
    }

    // Read core properties
    $coreProperties = $this->extractMetaData($package);

    // Close file
    $package->close();

    // Paragraphs are joined with a space so that the last word of one
    // paragraph and the first word of the next stay separate tokens.
    $headlines = implode(' ', $documentHeadlines);
    $body      = implode(' ', $documentParagraphs);

    // Store contents
    if ($storeContent) {
        $this->addField(Field::Text('headlines', $headlines, 'UTF-8'));
        $this->addField(Field::Text('body', $body, 'UTF-8'));
    } else {
        $this->addField(Field::UnStored('headlines', $headlines, 'UTF-8'));
        $this->addField(Field::UnStored('body', $body, 'UTF-8'));
    }

    // Store meta data properties
    foreach ($coreProperties as $key => $value) {
        $this->addField(Field::Text($key, $value, 'UTF-8'));
    }

    // Store title (if not present in meta data)
    if (!isset($coreProperties['title'])) {
        $this->addField(Field::Text('title', $fileName, 'UTF-8'));
    }
}
/**
 * Generate the index file.
 *
 * Creates a fresh index at CACHE_DIR . self::INDEX_NAME and adds one document
 * per page returned by Listing::pages(), with "title", "url" and "contents"
 * fields. Empty page names and pages that are not readable or are hidden are
 * skipped. The index is optimized once all pages have been processed.
 */
public static function updateIndex()
{
    // The tagger is set up on first use; nothing below reads it at present.
    if (empty(self::$igo)) {
        self::$igo = new Tagger(array('dict_dir' => LIB_DIR . 'ipadic', 'reduce_mode' => true));
    }
    Analyzer::setDefault(new Utf8());

    // Create the index
    $luceneIndex = Lucene::create(CACHE_DIR . self::INDEX_NAME);

    foreach (Listing::pages() as $pageName) {
        if (empty($pageName)) {
            continue;
        }

        $wikiPage = Factory::Wiki($pageName);

        // Skip pages the current user may not read, as well as hidden ones
        $indexable = $wikiPage->isReadable() && !$wikiPage->isHidden();
        if (!$indexable) {
            continue;
        }

        $document = new LuceneDoc();

        $titleField = Field::Text('title', $wikiPage->title());
        $document->addField($titleField);

        // Store document URL to identify it in the search results
        $urlField = Field::Text('url', $wikiPage->uri());
        $document->addField($urlField);

        // Index document contents (rendered page with the markup stripped)
        $plainText = strip_tags($wikiPage->render());
        $document->addField(Field::UnStored('contents', $plainText));

        // Register the document in the index
        $luceneIndex->addDocument($document);
    }

    $luceneIndex->optimize();
}
/**
 * Object constructor
 *
 * Loads the HTML into a DOMDocument, then adds the "title" field, one field
 * per named <meta> tag and the "body" field. Link targets of <a> and <area>
 * elements are collected into $this->_links, those of <link> elements in the
 * head into $this->_headerLinks.
 *
 * When the document encoding is not recognized, the data is converted from
 * $defaultEncoding to UTF-8 and reloaded with a UTF-8 Content-type meta tag.
 *
 * @param string  $data            HTML string (may be an HTML fragment), or the file to read it from when $isFile is true
 * @param boolean $isFile
 * @param boolean $storeContent    selects Field::Text (true) or Field::UnStored (false) for the body
 * @param string  $defaultEncoding HTML encoding, is used if it's not specified using Content-type HTTP-EQUIV meta tag.
 */
private function __construct($data, $isFile, $storeContent, $defaultEncoding = '')
{
    $this->_doc = new \DOMDocument();
    $this->_doc->substituteEntities = true;

    $htmlData = $isFile ? file_get_contents($data) : $data;

    ErrorHandler::start(E_WARNING);
    $this->_doc->loadHTML($htmlData);
    ErrorHandler::stop();

    if ($this->_doc->encoding === null) {
        // Document encoding is not recognized
        $utf8Head = '<head><META HTTP-EQUIV="Content-type" CONTENT="text/html; charset=UTF-8"/></head>';

        /** @todo improve HTML vs HTML fragment recognition */
        // The opening tag is matched with optional attributes, so that
        // documents starting with e.g. <html lang="en"> are recognized as
        // full documents rather than being wrapped as fragments.
        if (preg_match('/<html(?:\s[^>]*)?>/i', $htmlData, $matches, PREG_OFFSET_CAPTURE)) {
            // It's an HTML document
            // Add additional HEAD section and recognize document
            $htmlTagOffset = $matches[0][1] + strlen($matches[0][0]);

            ErrorHandler::start(E_WARNING);
            $this->_doc->loadHTML(
                iconv($defaultEncoding, 'UTF-8//IGNORE', substr($htmlData, 0, $htmlTagOffset))
                . $utf8Head
                . iconv($defaultEncoding, 'UTF-8//IGNORE', substr($htmlData, $htmlTagOffset))
            );
            ErrorHandler::stop();

            // Remove additional HEAD section (item() yields null when nothing matched)
            $xpath = new \DOMXPath($this->_doc);
            $head  = $xpath->query('/html/head')->item(0);
            if ($head !== null) {
                $head->parentNode->removeChild($head);
            }
        } else {
            // It's an HTML fragment
            ErrorHandler::start(E_WARNING);
            $this->_doc->loadHTML(
                '<html>' . $utf8Head . '<body>'
                . iconv($defaultEncoding, 'UTF-8//IGNORE', $htmlData)
                . '</body></html>'
            );
            ErrorHandler::stop();
        }
    }

    /** @todo Add correction of wrong HTML encoding recognition processing
     * The case is:
     * Content-type HTTP-EQUIV meta tag is presented, but ISO-8859-5 encoding is actually used,
     * even $this->_doc->encoding demonstrates another recognized encoding
     */

    $xpath = new \DOMXPath($this->_doc);

    $docTitle = '';
    foreach ($xpath->query('/html/head/title') as $titleNode) {
        // title should always have only one entry, but we process all nodeset entries
        $docTitle .= $titleNode->nodeValue . ' ';
    }
    $this->addField(Field::Text('title', $docTitle, 'UTF-8'));

    foreach ($xpath->query('/html/head/meta[@name]') as $metaNode) {
        $this->addField(Field::Text(
            $metaNode->getAttribute('name'),
            $metaNode->getAttribute('content'),
            'UTF-8'
        ));
    }

    $docBody = '';
    foreach ($xpath->query('/html/body') as $bodyNode) {
        // body should always have only one entry, but we process all nodeset entries
        $this->_retrieveNodeText($bodyNode, $docBody);
    }
    if ($storeContent) {
        $this->addField(Field::Text('body', $docBody, 'UTF-8'));
    } else {
        $this->addField(Field::UnStored('body', $docBody, 'UTF-8'));
    }

    // Collect link targets from <a> elements first, then <area> elements
    foreach (array('a', 'area') as $tagName) {
        foreach ($this->_doc->getElementsByTagName($tagName) as $linkNode) {
            $href = $linkNode->getAttribute('href');
            if ($href == '') {
                continue;
            }
            if (self::$_excludeNoFollowLinks && strtolower($linkNode->getAttribute('rel')) == 'nofollow') {
                continue;
            }
            $this->_links[] = $href;
        }
    }
    $this->_links = array_unique($this->_links);

    foreach ($xpath->query('/html/head/link') as $linkNode) {
        $href = $linkNode->getAttribute('href');
        if ($href != '') {
            $this->_headerLinks[] = $href;
        }
    }
    $this->_headerLinks = array_unique($this->_headerLinks);
}
/**
 * Object constructor
 *
 * Follows the package relationships to the office document, loads every
 * slide and its notes part, and adds the text of all a:t elements (slide
 * first, then its notes, in ascending relationship id order) as the "body"
 * field. Also adds the "filename" field and the properties returned by
 * extractMetaData(); the file name is used as title when none is present.
 *
 * Parts that are missing from the archive or cannot be parsed are skipped.
 *
 * @param string  $fileName
 * @param boolean $storeContent selects Field::Text (true) or Field::UnStored (false) for the body
 * @throws \ZendSearch\Lucene\Exception\ExtensionNotLoadedException
 * @throws \ZendSearch\Lucene\Exception\RuntimeException
 */
private function __construct($fileName, $storeContent)
{
    if (!class_exists('ZipArchive', false)) {
        throw new ExtensionNotLoadedException('MS Office documents processing functionality requires Zip extension to be loaded');
    }

    // Document data holders
    $slides       = array();
    $slideNotes   = array();
    $documentBody = array();

    // Open AbstractOpenXML package; open() returns true on success and an error code otherwise
    $package = new \ZipArchive();
    if ($package->open($fileName) !== true) {
        throw new RuntimeException('Invalid archive or corrupted .pptx file.');
    }

    // Read relations and search for officeDocument
    $relationsXml = $package->getFromName('_rels/.rels');
    if ($relationsXml === false) {
        $package->close();
        throw new RuntimeException('Invalid archive or corrupted .pptx file.');
    }

    // NOTE(review): parts are parsed with plain simplexml_load_string(), as before;
    // consider a hardened XML loader if archives come from untrusted sources.
    $relations = simplexml_load_string($relationsXml);
    if ($relations === false) {
        $package->close();
        throw new RuntimeException('Invalid archive or corrupted .pptx file.');
    }

    // Loads one XML part of the package; null when the part is missing or unparsable
    $loadPart = function ($partName) use ($package) {
        $xml = $package->getFromName($partName);
        if ($xml === false) {
            return null;
        }
        $parsed = simplexml_load_string($xml);

        return $parsed === false ? null : $parsed;
    };

    foreach ($relations->Relationship as $rel) {
        if ($rel["Type"] != AbstractOpenXML::SCHEMA_OFFICEDOCUMENT) {
            continue;
        }

        // Found office document! Search for slides...
        $documentTarget = (string) $rel["Target"];
        $documentDir    = dirname($documentTarget);

        $slideRelations = $loadPart($this->absoluteZipPath($documentDir . "/_rels/" . basename($documentTarget) . ".rels"));
        if ($slideRelations !== null) {
            foreach ($slideRelations->Relationship as $slideRel) {
                if ($slideRel["Type"] != self::SCHEMA_SLIDERELATION) {
                    continue;
                }

                // Found slide!
                $slideTarget = (string) $slideRel["Target"];
                $slideDir    = $documentDir . "/" . dirname($slideTarget);
                $slideId     = str_replace('rId', '', (string) $slideRel["Id"]);

                $slide = $loadPart($this->absoluteZipPath($slideDir . "/" . basename($slideTarget)));
                if ($slide === null) {
                    continue;
                }
                $slides[$slideId] = $slide;

                // Search for slide notes
                $slideNotesRelations = $loadPart($this->absoluteZipPath($slideDir . "/_rels/" . basename($slideTarget) . ".rels"));
                if ($slideNotesRelations === null) {
                    continue;
                }
                foreach ($slideNotesRelations->Relationship as $slideNoteRel) {
                    if ($slideNoteRel["Type"] == self::SCHEMA_SLIDENOTESRELATION) {
                        // Found slide notes!
                        $noteTarget = (string) $slideNoteRel["Target"];
                        $notes = $loadPart($this->absoluteZipPath($slideDir . "/" . dirname($noteTarget) . "/" . basename($noteTarget)));
                        if ($notes !== null) {
                            $slideNotes[$slideId] = $notes;
                        }
                        break;
                    }
                }
            }
        }

        break;
    }

    // Sort slides
    ksort($slides);
    ksort($slideNotes);

    // Extract contents from each slide, followed by its notes when present
    foreach ($slides as $slideKey => $slide) {
        $parts = array($slide);
        if (isset($slideNotes[$slideKey])) {
            $parts[] = $slideNotes[$slideKey];
        }

        foreach ($parts as $part) {
            // Register namespaces
            $part->registerXPathNamespace("p", self::SCHEMA_PRESENTATIONML);
            $part->registerXPathNamespace("a", self::SCHEMA_DRAWINGML);

            // Fetch all text; a failed xpath() is treated as "no text"
            $textElements = $part->xpath('//a:t');
            foreach ($textElements ?: array() as $textElement) {
                $documentBody[] = (string) $textElement;
            }
        }
    }

    // Read core properties
    $coreProperties = $this->extractMetaData($package);

    // Close file
    $package->close();

    // Store filename
    $this->addField(Field::Text('filename', $fileName, 'UTF-8'));

    // Store contents
    if ($storeContent) {
        $this->addField(Field::Text('body', implode(' ', $documentBody), 'UTF-8'));
    } else {
        $this->addField(Field::UnStored('body', implode(' ', $documentBody), 'UTF-8'));
    }

    // Store meta data properties
    foreach ($coreProperties as $key => $value) {
        $this->addField(Field::Text($key, $value, 'UTF-8'));
    }

    // Store title (if not present in meta data)
    if (!isset($coreProperties['title'])) {
        $this->addField(Field::Text('title', $fileName, 'UTF-8'));
    }
}
/**
 * Object constructor
 *
 * Follows the package relationships to the workbook, reads the shared
 * strings table (when the workbook has one) and all worksheets, and adds the
 * cell values of every worksheet, in ascending relationship id order, as the
 * "body" field. Also adds the "filename" field and the properties returned by
 * extractMetaData(); the file name is used as title when none is present.
 *
 * @param string  $fileName
 * @param boolean $storeContent selects Field::Text (true) or Field::UnStored (false) for the body
 * @throws \ZendSearch\Lucene\Exception\ExtensionNotLoadedException
 * @throws \ZendSearch\Lucene\Exception\RuntimeException
 */
private function __construct($fileName, $storeContent)
{
    if (!class_exists('ZipArchive', false)) {
        throw new ExtensionNotLoadedException('MS Office documents processing functionality requires Zip extension to be loaded');
    }

    // Document data holders
    $sharedStrings = array();
    $worksheets    = array();
    $documentBody  = array();

    // Open AbstractOpenXML package; open() returns true on success and an error code otherwise
    $package = new \ZipArchive();
    if ($package->open($fileName) !== true) {
        throw new RuntimeException('Invalid archive or corrupted .xlsx file.');
    }

    // Read relations and search for officeDocument
    $relationsXml = $package->getFromName('_rels/.rels');
    if ($relationsXml === false) {
        $package->close();
        throw new RuntimeException('Invalid archive or corrupted .xlsx file.');
    }

    $relations = XmlSecurity::scan($relationsXml);
    foreach ($relations->Relationship as $rel) {
        if ($rel["Type"] != AbstractOpenXML::SCHEMA_OFFICEDOCUMENT) {
            continue;
        }

        // Found office document! Read relations for workbook...
        $workbookTarget = (string) $rel["Target"];
        $workbookDir    = dirname($workbookTarget);

        $workbookRelationsXml = $package->getFromName($this->absoluteZipPath($workbookDir . "/_rels/" . basename($workbookTarget) . ".rels"));
        if ($workbookRelationsXml === false) {
            $package->close();
            throw new RuntimeException('Invalid archive or corrupted .xlsx file.');
        }
        $workbookRelations = XmlSecurity::scan($workbookRelationsXml);
        $workbookRelations->registerXPathNamespace("rel", AbstractOpenXML::SCHEMA_RELATIONSHIP);

        // Read shared strings. The relationship is looked up defensively:
        // when it is absent there is simply no shared strings table to read.
        $sharedStringsRelation = $workbookRelations->xpath("rel:Relationship[@Type='" . self::SCHEMA_SHAREDSTRINGS . "']");
        if (!empty($sharedStringsRelation)) {
            $sharedStringsPath = (string) $sharedStringsRelation[0]['Target'];
            $sharedStringsXml  = $package->getFromName($this->absoluteZipPath($workbookDir . "/" . $sharedStringsPath));

            if ($sharedStringsXml !== false) {
                $xmlStrings = XmlSecurity::scan($sharedStringsXml);
                if (isset($xmlStrings) && isset($xmlStrings->si)) {
                    foreach ($xmlStrings->si as $val) {
                        if (isset($val->t)) {
                            $sharedStrings[] = (string) $val->t;
                        } elseif (isset($val->r)) {
                            $sharedStrings[] = $this->_parseRichText($val);
                        } else {
                            // Cells refer to shared strings by position, so an
                            // entry without text still has to occupy its slot.
                            $sharedStrings[] = '';
                        }
                    }
                }
            }
        }

        // Loop relations for workbook and extract worksheets...
        foreach ($workbookRelations->Relationship as $workbookRelation) {
            if ($workbookRelation["Type"] != self::SCHEMA_WORKSHEETRELATION) {
                continue;
            }

            $sheetTarget = (string) $workbookRelation["Target"];
            $sheetXml    = $package->getFromName($this->absoluteZipPath($workbookDir . "/" . dirname($sheetTarget) . "/" . basename($sheetTarget)));
            if ($sheetXml === false) {
                // Referenced worksheet part is not in the archive
                continue;
            }

            $worksheets[str_replace('rId', '', (string) $workbookRelation["Id"])] = XmlSecurity::scan($sheetXml);
        }

        break;
    }

    // Sort worksheets
    ksort($worksheets);

    // Extract contents from worksheets
    foreach ($worksheets as $worksheet) {
        foreach ($worksheet->sheetData->row as $row) {
            foreach ($row->c as $c) {
                // Determine data type
                $dataType = (string) $c["t"];
                switch ($dataType) {
                    case "s":
                        // Value is a shared string, referenced by its position in the table
                        $value       = '';
                        $stringIndex = (string) $c->v;
                        if ($stringIndex != '' && isset($sharedStrings[(int) $stringIndex])) {
                            $value = $sharedStrings[(int) $stringIndex];
                        }
                        break;

                    case "b":
                        // Value is boolean
                        $value = (string) $c->v;
                        if ($value == '0') {
                            $value = false;
                        } elseif ($value == '1') {
                            $value = true;
                        } else {
                            $value = (bool) $c->v;
                        }
                        break;

                    case "inlineStr":
                        // Value is rich text inline
                        $value = $this->_parseRichText($c->is);
                        break;

                    case "e":
                        // Value is an error message
                        $value = (string) $c->v;
                        break;

                    default:
                        // Value is a string
                        $value = (string) $c->v;

                        // Check for numeric values
                        if (is_numeric($value)) {
                            if ($value == (int) $value) {
                                $value = (int) $value;
                            } elseif ($value == (float) $value) {
                                $value = (float) $value;
                            }
                        }
                }

                $documentBody[] = $value;
            }
        }
    }

    // Read core properties
    $coreProperties = $this->extractMetaData($package);

    // Close file
    $package->close();

    // Store filename
    $this->addField(Field::Text('filename', $fileName, 'UTF-8'));

    // Store contents
    if ($storeContent) {
        $this->addField(Field::Text('body', implode(' ', $documentBody), 'UTF-8'));
    } else {
        $this->addField(Field::UnStored('body', implode(' ', $documentBody), 'UTF-8'));
    }

    // Store meta data properties
    foreach ($coreProperties as $key => $value) {
        $this->addField(Field::Text($key, $value, 'UTF-8'));
    }

    // Store title (if not present in meta data)
    if (!isset($coreProperties['title'])) {
        $this->addField(Field::Text('title', $fileName, 'UTF-8'));
    }
}
/**
 * Object constructor
 *
 * Follows the package relationships to the office document and adds the text
 * of its w:t elements as the "body" field, with a space for every w:br and
 * after every paragraph. Also adds the "filename" field and the properties
 * returned by extractMetaData(); the file name is used as title when none is
 * present.
 *
 * @param string  $fileName
 * @param boolean $storeContent selects Field::Text (true) or Field::UnStored (false) for the body
 * @throws \ZendSearch\Lucene\Exception\ExtensionNotLoadedException
 * @throws \ZendSearch\Lucene\Exception\RuntimeException
 */
private function __construct($fileName, $storeContent)
{
    if (!class_exists('ZipArchive', false)) {
        throw new ExtensionNotLoadedException('MS Office documents processing functionality requires Zip extension to be loaded');
    }

    // Document data holders
    $documentBody = array();

    // Open AbstractOpenXML package; open() returns true on success and an error code otherwise
    $package = new \ZipArchive();
    if ($package->open($fileName) !== true) {
        throw new RuntimeException('Invalid archive or corrupted .docx file.');
    }

    // Read relations and search for officeDocument
    $relationsXml = $package->getFromName('_rels/.rels');
    if ($relationsXml === false) {
        $package->close();
        throw new RuntimeException('Invalid archive or corrupted .docx file.');
    }

    $relations = XMLSecurity::scan($relationsXml);
    foreach ($relations->Relationship as $rel) {
        if ($rel["Type"] != AbstractOpenXML::SCHEMA_OFFICEDOCUMENT) {
            continue;
        }

        // Found office document! Read in contents...
        $documentTarget = (string) $rel['Target'];
        $documentXml    = $package->getFromName($this->absoluteZipPath(dirname($documentTarget) . '/' . basename($documentTarget)));
        if ($documentXml === false) {
            // The relationship points at a part that is not in the archive
            $package->close();
            throw new RuntimeException('Invalid archive or corrupted .docx file.');
        }

        $contents = XMLSecurity::scan($documentXml);
        $contents->registerXPathNamespace('w', self::SCHEMA_WORDPROCESSINGML);

        $paragraphs = $contents->xpath('//w:body/w:p');
        foreach ($paragraphs ?: array() as $paragraph) {
            $runs = $paragraph->xpath('.//w:r/*[name() = "w:t" or name() = "w:br"]');

            if ($runs === false) {
                // Paragraph doesn't contain any text or breaks
                continue;
            }

            foreach ($runs as $run) {
                // A break element contributes a space, a text element its content
                $documentBody[] = ($run->getName() == 'br') ? ' ' : (string) $run;
            }

            // Add space after each paragraph. So they are not bound together.
            $documentBody[] = ' ';
        }

        break;
    }

    // Read core properties
    $coreProperties = $this->extractMetaData($package);

    // Close file
    $package->close();

    // Store filename
    $this->addField(Field::Text('filename', $fileName, 'UTF-8'));

    // Store contents
    if ($storeContent) {
        $this->addField(Field::Text('body', implode('', $documentBody), 'UTF-8'));
    } else {
        $this->addField(Field::UnStored('body', implode('', $documentBody), 'UTF-8'));
    }

    // Store meta data properties
    foreach ($coreProperties as $key => $value) {
        $this->addField(Field::Text($key, $value, 'UTF-8'));
    }

    // Store title (if not present in meta data)
    if (!isset($coreProperties['title'])) {
        $this->addField(Field::Text('title', $fileName, 'UTF-8'));
    }
}
/**
 * Refreshes this job's entry in the Lucene index.
 *
 * Every hit found for this job's primary key is deleted first. Expired or
 * non-activated jobs are then left out; any other job is added again with its
 * position, company, location and description, and the index is committed.
 *
 * @ORM\PostPersist
 */
public function updateLuceneIndex()
{
    $luceneIndex = self::getLuceneIndex();

    // remove existing entries
    $staleHits = $luceneIndex->find('pk:' . $this->getId());
    foreach ($staleHits as $staleHit) {
        $luceneIndex->delete($staleHit->id);
    }

    // don't index expired and non-activated jobs
    $indexable = !$this->isExpired() && $this->getIsActivated();
    if (!$indexable) {
        return;
    }

    $document = new Document();

    // store job primary key to identify it in the search results
    $primaryKey = Document\Field::Keyword('pk', $this->getId());
    $document->addField($primaryKey);

    // index job fields
    $position = Document\Field::UnStored('position', $this->getPosition(), 'utf-8');
    $document->addField($position);

    $company = Document\Field::UnStored('company', $this->getCompany(), 'utf-8');
    $document->addField($company);

    $location = Document\Field::UnStored('location', $this->getLocation(), 'utf-8');
    $document->addField($location);

    $description = Document\Field::UnStored('description', $this->getDescription(), 'utf-8');
    $document->addField($description);

    // add job to the index
    $luceneIndex->addDocument($document);
    $luceneIndex->commit();
}
/**
 * Index a file.
 *
 * Content is only indexed for files on local storage; the loader is chosen by
 * mime type first (HTML, other text, PDF) and by file extension otherwise
 * (docx, xlsx, pptx, odt, ods). Files on non-local storage get a plain
 * document. In every case the fileId, path, mtime, size and mimetype fields
 * are added before the document is handed to the index.
 *
 * @param File $file   the file to be indexed
 * @param bool $commit passed through to the index's updateFile()
 *
 * @return bool always true when the method returns normally
 * @throws NotIndexedException when a local file matches none of the supported types
 */
public function indexFile(File $file, $commit = true)
{
    // we decide how to index on mime type or file extension
    $mime      = $file->getMimeType();
    $extension = strtolower(pathinfo($file->getName(), PATHINFO_EXTENSION));

    // initialize plain lucene document
    $luceneDoc = new Document();

    // index content for local files only
    $storage = $file->getStorage();
    if ($storage->isLocal()) {
        $localPath = $storage->getLocalFile($file->getInternalPath());

        //try to use special lucene document types
        if ('text/html' === $mime) {
            //TODO could be indexed, even if not local
            $luceneDoc = HTML::loadHTML($file->getContent());
        } elseif ('text/' === substr($mime, 0, 5) || 'application/x-tex' === $mime) {
            $text = $file->getContent();
            if ($text != '') {
                $luceneDoc->addField(Document\Field::UnStored('body', $text));
            }
        } elseif ('application/pdf' === $mime) {
            $luceneDoc = Pdf::loadPdf($file->getContent());
        } else {
            // the zend classes only understand docx and not doc files
            switch ($extension) {
                case 'docx':
                    $luceneDoc = Document\Docx::loadDocxFile($localPath);
                    break;
                case 'xlsx':
                    $luceneDoc = Document\Xlsx::loadXlsxFile($localPath);
                    break;
                case 'pptx':
                    $luceneDoc = Document\Pptx::loadPptxFile($localPath);
                    break;
                case 'odt':
                    $luceneDoc = Odt::loadOdtFile($localPath);
                    break;
                case 'ods':
                    $luceneDoc = Ods::loadOdsFile($localPath);
                    break;
                default:
                    throw new NotIndexedException();
            }
        }
    }

    $fileId = $file->getId();

    // Store filecache id as unique id to lookup by when deleting
    $luceneDoc->addField(Document\Field::Keyword('fileId', $fileId));

    // Store document path for the search results
    $luceneDoc->addField(Document\Field::Text('path', $file->getPath(), 'UTF-8'));

    $luceneDoc->addField(Document\Field::unIndexed('mtime', $file->getMTime()));
    $luceneDoc->addField(Document\Field::unIndexed('size', $file->getSize()));
    $luceneDoc->addField(Document\Field::unIndexed('mimetype', $mime));

    $this->index->updateFile($luceneDoc, $file->getId(), $commit);

    return true;
}
/**
 * Indexes the data found in the JSON files.
 *
 * For every "*.json" file in the data/json directory next to this file, a
 * Lucene index named after the file (without the extension) is created under
 * data/index. Each entry of the decoded JSON becomes one document with a
 * "url" field (taken from the entry's title) and an unstored "contents" field
 * (taken from the entry's text).
 *
 * Files with another extension, undecodable JSON and entries lacking title or
 * text are skipped.
 */
public function index()
{
    $dir      = realpath(dirname(__FILE__)) . DIRECTORY_SEPARATOR . "data" . DIRECTORY_SEPARATOR;
    $jsonDir  = $dir . "json";
    $indexDir = $dir . "index";

    // Read the JSON files; scandir() yields false when the directory cannot be read
    $files = scandir($jsonDir);
    if ($files === false) {
        return;
    }

    foreach ($files as $file) {
        $jsonPath = $jsonDir . DIRECTORY_SEPARATOR . $file;

        // Only regular files are considered (this also rules out "." and "..")
        if (!is_file($jsonPath)) {
            continue;
        }

        // The index name is the file name minus ".json", so anything with a
        // different extension would produce a truncated, meaningless name.
        if (strtolower(pathinfo($file, PATHINFO_EXTENSION)) !== 'json') {
            continue;
        }
        $indexName = substr($file, 0, -5);
        if ($indexName === '') {
            continue;
        }

        $entries = json_decode(file_get_contents($jsonPath));

        // Create the index
        $index = Lucene\Lucene::create($indexDir . DIRECTORY_SEPARATOR . $indexName);

        // json_decode() returns null for invalid JSON and scalars for scalar documents
        if (!is_array($entries) && !is_object($entries)) {
            continue;
        }

        // Create one document per entry and define the fields to index
        foreach ($entries as $entry) {
            if (!is_object($entry) || !isset($entry->title, $entry->text)) {
                continue;
            }

            $doc = new Lucene\Document();
            $doc->addField(Lucene\Document\Field::Text('url', $entry->title));
            $doc->addField(Lucene\Document\Field::UnStored('contents', $entry->text));
            $index->addDocument($doc);
        }
    }
}