/**
 * Execute the console command.
 *
 * Registers inotify watches starting from the root directory, then loops
 * forever reading events and re-indexing whatever each event touched, one
 * level deep. Directories that appear while watching get a watch of their own.
 *
 * @return void This method only stops when the process is terminated.
 */
public function fire()
{
    $watches = array();
    $in = inotify_init();

    // add watches starting from root directory
    $root = Path::fromRelative('');
    $this->addWatches($in, $root, $watches);

    printf("\nReading for events\n");

    while (true) {
        $events = inotify_read($in);
        if ($events === false) {
            // inotify_read() reports "nothing read" as false; there is nothing to iterate
            continue;
        }

        foreach ($events as $event) {
            // Events such as a queue overflow carry a watch descriptor that was
            // never registered here; there is no path to index for those.
            if (!isset($watches[$event['wd']])) {
                continue;
            }
            $path = $watches[$event['wd']];

            // Compare against the empty string so that an entry literally named "0"
            // is still treated as a named event.
            $hasName = isset($event['name']) && $event['name'] !== '';

            if (!$hasName) {
                // event must apply to this directory, so index it, 1 level deep
                Indexer::index($path, 1);
                continue;
            }

            // the event has a name attached, so index that entry
            $newPathName = $path->getPathname() . '/' . $event['name'];
            $newPath = new Path($newPathName);
            Indexer::index($newPath, 1);

            // this may be a new directory, so add a watch to it anyway
            if ($newPath->exists() && $newPath->isDir()) {
                try {
                    $wd = inotify_add_watch($in, $newPath->getPathname(), $this->computedMask);
                    if ($wd === false) {
                        // failure is reported through the return value, not an exception
                        echo 'Could not add watch: ', $newPath->getPathname(), "\n";
                    } else {
                        $watches[$wd] = $newPath;
                    }
                } catch (Exception $e) {
                    // still reachable when an error handler turns warnings into exceptions
                    echo 'Caught exception: ', $e->getMessage(), "\n";
                }
            }
        }
    }
}
/**
 * Execute the console command.
 *
 * Indexes everything below the path configured as 'app.manga_path' and
 * reports the counter that the indexer was given.
 *
 * @return void
 */
public function fire()
{
    // counter handed to the indexer; its final value is what gets reported
    $added = 0;

    $library = new Path(Config::get('app.manga_path'));
    Indexer::index($library, null, $added);

    $message = sprintf('Added %s new paths', $added);
    $this->info($message);
}
/**
 * Toggle the current document's presence in the indexing queue, then go back
 * to the document view.
 *
 * Only system administrators change the queue; every caller is redirected to
 * view.php for the document and the script exits.
 */
function do_main()
{
    $document = $this->oDocument;
    $docid = $document->getId();

    if (Permission::userIsSystemAdministrator()) {
        $fullPath = $document->getFullPath();

        if (Indexer::isDocumentScheduled($docid)) {
            // already queued: take it out again
            Indexer::unqueueDocument($docid);
            $template = _kt("Document '%s' has been removed from the indexing queue.");
        } else {
            // not queued yet: schedule it with the 'A' flag
            Indexer::index($document, 'A');
            $template = _kt("Document '%s' has been added to the indexing queue.");
        }

        $this->addInfoMessage(sprintf($template, $fullPath));
    }

    redirect("view.php?fDocumentId={$docid}");
    exit;
}
/**
 * Process a document - extract text and index it.
 * Refactored from indexDocuments()
 *
 * The document is removed from the indexing queue when it needs no indexing
 * (zero bytes, no extractor and no discussion indexing requested, temporary
 * file name, missing source file) or when indexing succeeded. It is left in
 * the queue when extraction or indexing failed.
 *
 * @param object $document Document being processed; this method reads its file
 *                         size, file name, name and major/minor version numbers.
 * @param array  $docinfo  Queue record. Keys read here: 'document_id',
 *                         'filetypes', 'mimetypes', 'extractor' and 'what'
 *                         ('A' = content and discussion, 'C' = content only,
 *                         'D' = discussion only).
 * @return void
 */
public function processDocument($document, $docinfo)
{
    global $default;
    // extractor instances are reused across calls, keyed by class name
    static $extractorCache = array();

    // increment indexed documents count
    Indexer::incrementCount();

    // if document is a zero byte file, let's just unqueue and return
    if ($document->getFileSize() == 0) {
        Indexer::unqueueDocument($docinfo['document_id'], sprintf(_kt("Zero Byte documents do not need to be indexed: %d"), $docinfo['document_id']));
        return;
    }

    $docId = $docinfo['document_id'];
    $extension = $docinfo['filetypes'];
    $mimeType = $docinfo['mimetypes'];
    $extractorClass = $docinfo['extractor'];
    $indexDocument = in_array($docinfo['what'], array('A', 'C'));
    $indexDiscussion = in_array($docinfo['what'], array('A', 'D'));
    $this->indexingHistory = '';
    $tempPath = $this->tempPath;

    $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Indexing docid: %d extension: '%s' mimetype: '%s' extractor: '%s'"), $docId, $extension, $mimeType, $extractorClass), 'debug');

    if (empty($extractorClass)) {
        /*
           if no extractor is found and we don't need to index discussions, then we can
           remove the item from the queue.
        */
        if ($indexDiscussion) {
            $indexDocument = false;
            $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Not indexing docid: %d content because extractor could not be resolve. Still indexing discussion."), $docId), 'info');
        } else {
            Indexer::unqueueDocument($docId, sprintf(_kt("No extractor for docid: %d"), $docId));
            return;
        }
    } else {
        /*
           If an extractor is available, we must ensure it is enabled.
        */
        if (!$this->isExtractorEnabled($extractorClass)) {
            $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("diagnose: Not indexing docid: %d because extractor '%s' is disabled."), $docId, $extractorClass), 'info');
            return;
        }
    }

    if ($this->debug) {
        $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Processing docid: %d.\n"), $docId), 'info');
    }

    if ($this->restartCurrentBatch) {
        // the batch is being restarted: re-queue the document instead of processing it now
        Indexer::unqueueDocument($docId);
        Indexer::index($docId, 'A');
        return;
    }

    $filename = $document->getFileName();
    if (substr($filename, 0, 1) == '~' || substr($filename, -1) == '~') {
        Indexer::unqueueDocument($docId, sprintf(_kt("indexDocuments: Filename for document id %d starts with a tilde (~). This is assumed to be a temporary file. This is ignored."), $docId), 'error');
        return;
    }

    $removeFromQueue = true;
    if ($indexDocument) {
        if (array_key_exists($extractorClass, $extractorCache)) {
            $extractor = $extractorCache[$extractorClass];
        } else {
            $extractor = $extractorCache[$extractorClass] = $this->getExtractor($extractorClass);
        }

        if (!$extractor instanceof DocumentExtractor) {
            $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("indexDocuments: extractor '%s' is not a document extractor class."), $extractorClass), 'error');
            return;
        }

        $version = $document->getMajorVersionNumber() . '.' . $document->getMinorVersionNumber();
        $sourceFile = $this->storageManager->temporaryFile($document);

        if (empty($sourceFile) || !is_file($sourceFile)) {
            Indexer::unqueueDocument($docId, sprintf(_kt("indexDocuments: source file '%s' for document %d does not exist."), $sourceFile, $docId), 'error');
            // This used to be `continue` from the original loop; outside a loop it must be a return.
            return;
        }

        if ($extractor->needsIntermediateSourceFile()) {
            //$extension = pathinfo($document->getFileName(), PATHINFO_EXTENSION);

            $intermediate = $tempPath . '/' . $docId . '.' . $extension;
            $result = @copy($sourceFile, $intermediate);
            if ($result === false) {
                $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Could not create intermediate file from document %d"), $docId), 'error');
                // problem. lets try again later. probably permission related. log the issue.
                // The document stays queued. (Was `continue` in the original loop.)
                return;
            }
            $sourceFile = $intermediate;
        }

        $extractor->setSourceFile($sourceFile);
        $extractor->setMimeType($mimeType);
        $extractor->setExtension($extension);
        $extractor->setDocument($document);
        $extractor->setIndexingStatus(null);
        $extractor->setExtractionStatus(null);

        $targetFile = tempnam($tempPath, 'ktindexer');
        $extractor->setTargetFile($targetFile);

        $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Extra Info docid: %d Source File: '%s' Target File: '%s'"), $docId, $sourceFile, $targetFile), 'debug');

        $this->executeHook($extractor, 'pre_extract');
        $this->executeHook($extractor, 'pre_extract', $mimeType);

        // from here on the document only leaves the queue if indexing reports success
        $removeFromQueue = false;

        if ($extractor->extractTextContent()) {
            // the extractor may need to create another target file
            $targetFile = $extractor->getTargetFile();

            $extractor->setExtractionStatus(true);
            $this->executeHook($extractor, 'pre_index');
            $this->executeHook($extractor, 'pre_index', $mimeType);

            $title = $document->getName();
            if ($indexDiscussion) {
                if (!$this->filterText($targetFile)) {
                    $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem filtering document %d"), $docId), 'error');
                } else {
                    $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version);
                    $removeFromQueue = $indexStatus;

                    if (!$indexStatus) {
                        $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem indexing document %d - indexDocumentAndDiscussion"), $docId), 'error');
                    }

                    $extractor->setIndexingStatus($indexStatus);
                }
            } else {
                if (!$this->filterText($targetFile)) {
                    $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem filtering document %d"), $docId), 'error');
                } else {
                    $indexStatus = $this->indexDocument($docId, $targetFile, $title, $version);
                    $removeFromQueue = $indexStatus;

                    if (!$indexStatus) {
                        $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem indexing document %d - indexDocument"), $docId), 'error');
                        $this->logPendingDocumentInfoStatus($docId, '<output>' . $extractor->output . '</output>', 'error');
                    }

                    $extractor->setIndexingStatus($indexStatus);
                }
            }

            $this->executeHook($extractor, 'post_index', $mimeType);
            $this->executeHook($extractor, 'post_index');
        } else {
            $extractor->setExtractionStatus(false);
            $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Could not extract contents from document %d"), $docId), 'error');
            $this->logPendingDocumentInfoStatus($docId, '<output>' . $extractor->output . '</output>', 'error');
        }

        $this->executeHook($extractor, 'post_extract', $mimeType);
        $this->executeHook($extractor, 'post_extract');

        // clean up the working files; failures to delete are deliberately ignored
        if ($extractor->needsIntermediateSourceFile()) {
            @unlink($sourceFile);
        }

        @unlink($targetFile);
    } else {
        // content indexing was not requested or not possible: index the discussion only
        $indexStatus = $this->indexDiscussion($docId);
        $removeFromQueue = $indexStatus;
    }

    if ($removeFromQueue) {
        Indexer::unqueueDocument($docId, sprintf(_kt("Done indexing docid: %d"), $docId));
    } else {
        if ($this->debug) {
            $default->log->debug(sprintf(_kt("Document docid: %d was not removed from the queue as it looks like there was a problem with the extraction process"), $docId));
        }
    }
}
/**
 * Build one Lucene index per JSON file found in the "data/json" directory
 * next to this source file.
 *
 * Each index is created under "data/index/<file name without .json>". Every
 * entry of the decoded JSON becomes one document with a 'url' text field and
 * an unstored 'contents' field. Unreadable directories, non-JSON files and
 * files that do not decode to an array or object are skipped.
 *
 * @return void
 */
public function index()
{
    $dir = realpath(dirname(__FILE__)) . DIRECTORY_SEPARATOR . "data" . DIRECTORY_SEPARATOR;
    $jsonDir = $dir . "json";
    $indexDir = $dir . "index";

    // read the JSON files
    $files = scandir($jsonDir);
    if ($files === false) {
        // directory missing or unreadable: nothing to index
        return;
    }

    foreach ($files as $file) {
        $jsonPath = $jsonDir . DIRECTORY_SEPARATOR . $file;

        // only regular files are indexed; this also rejects '.' and '..'
        if (!is_file($jsonPath)) {
            continue;
        }

        // the index name is the file name minus ".json", so other files cannot be named sensibly
        if (strtolower(pathinfo($file, PATHINFO_EXTENSION)) !== 'json') {
            continue;
        }

        $raw = file_get_contents($jsonPath);
        if ($raw === false) {
            continue;
        }

        $json = json_decode($raw);
        if (!is_array($json) && !is_object($json)) {
            // invalid JSON or a scalar document: there are no entries to iterate
            continue;
        }

        $indexName = substr($file, 0, -5);

        // create the index
        $index = Lucene\Lucene::create($indexDir . DIRECTORY_SEPARATOR . $indexName);

        // create one document per entry and define the fields to index
        foreach ($json as $entry) {
            $doc = new Lucene\Document();
            // NOTE(review): the 'url' field is filled from $entry->title - confirm this is intended
            $doc->addField(Lucene\Document\Field::Text('url', $entry->title));
            $doc->addField(Lucene\Document\Field::UnStored('contents', $entry->text));
            $index->addDocument($doc);
        }
    }
}
}

// Build all indexes as soon as this file is executed.
$ix = new Indexer();
$ix->index();
// POST /reindex: rebuild the index. When the request carries a `file`, only that
// file's index entries are deleted and rebuilt; otherwise the whole index is
// deleted and rebuilt.
$server->post('/reindex', function ($request, $response, $next) use($indexer) {
    echo "Starting reindex...\n";
    if ($request->file) {
        // resolve to a canonical path before touching the index
        // NOTE(review): realpath() yields false for a path that does not exist - confirm the indexer copes with that
        $file = realpath($request->file);
        $indexer->delete_file_index($file);
        $indexer->index_file($file);
    } else {
        $indexer->delete_index();
        $indexer->index();
    }
    echo "Indexing completed.\n";
    $next();
});

// POST /index-php: delete and rebuild the index held by $indexer_php.
$server->post('/index-php', function ($request, $response, $next) use($indexer_php) {
    $indexer_php->delete_index();
    $indexer_php->index();
    echo "Indexing Complete\n";
    $next();
});

// POST /updateindex: run the indexer without deleting the existing index first.
$server->post('/updateindex', function ($request, $response, $next) use($indexer) {
    echo "Updating reindex...\n";
    $indexer->index();
    echo "Indexing completed.\n";
    $next();
});

// GET /search: starts building $query from the HTTP query-string parameters.
$server->get('/search', function ($request, $response, $next) use($db) {
    $params = $request->httpRequest->getQuery();
    $query = array();
    // copy the optional 'namespace' parameter into $query
    if (isset($params['namespace'])) {
        $query['namespace'] = $params['namespace'];
    }
/**
 * Rebuild the index for the account that belongs to the given domain.
 *
 * The account id is looked up through getAccountId(); the indexer created for
 * it is cleared first and then asked to index.
 *
 * @param mixed $domain Value passed on to getAccountId().
 */
public function index($domain)
{
    $accountId = $this->getAccountId($domain);

    $accountIndexer = new Indexer($accountId);
    $accountIndexer->clear();
    $accountIndexer->index();
}
/**
 * Hand a document to the indexer.
 *
 * Thin wrapper that forwards $oDocument unchanged to Indexer::index(),
 * without passing any further argument.
 *
 * @param mixed $oDocument Document to pass to Indexer::index().
 */
function reindexDocument($oDocument) {
    Indexer::index($oDocument);
}
<?php
// Command-line entry point: index the directory given as the first argument.
require __DIR__ . '/vendor/autoload.php';
require __DIR__ . '/inc/DB.php';
require __DIR__ . '/inc/Indexer.php';
require __DIR__ . '/inc/IndexerNodeTraverserVisitor.php';

// A missing argument would otherwise reach realpath() as an undefined value.
if (!isset($argv[1])) {
    fwrite(STDERR, 'Usage: php ' . basename(__FILE__) . " <path>\n");
    exit(1);
}

// realpath() returns false for a path that does not exist; do not hand that to the indexer.
$target = realpath($argv[1]);
if ($target === false) {
    fwrite(STDERR, "Path not found: {$argv[1]}\n");
    exit(1);
}

$indexer = new Indexer($target);
$indexer->index();
// Pass 1: collect the ids of all documents the indexer does not know about,
// printing a progress dot every 100 rows and a line break every 4000 rows.
$i = 0;
foreach ($rows as $row) {
    $docId = $row['id'];
    if (!$indexer->isDocumentIndexed($docId)) {
        $notIndexed[] = $docId;
    }
    if ($i % 100 == 0) {
        print '.';
    }
    if ($i++ % 4000 == 0) {
        print "\n";
    }
}

print "\nReporting...\n";
if (empty($notIndexed)) {
    print "All documents are indexed\n";
} else {
    // Pass 2: print a CSV-style report of the missing documents and, when
    // $reindex is set, queue each of them for indexing.
    print "\n-----START-----\n\"Document Id\",\"Title\",\"Full Path\"\n";

    $notIndexed = implode(',', $notIndexed);
    $sql = "select d.id, dm.name as title, d.full_path from documents d inner join document_metadata_version dm on d.metadata_version_id = dm.id where d.id in ({$notIndexed}) ";
    $rows = DBUtil::getResultArray($sql);

    foreach ($rows as $row) {
        print '"' . $row['id'] . '","' . $row['title'] . '","' . $row['full_path'] . '"' . "\n";
        if ($reindex) {
            // Use the id of the row being reported. $docId still holds the last
            // id from the scan loop above and would queue the same document every time.
            Indexer::index($row['id']);
            // empty the global object cache after each document
            $GLOBALS["_OBJECTCACHE"] = array();
        }
    }

    print "-----END-----\n\nDone\n";
}
exit;