/**
 * Commit all pushed documents to the Solr server.
 *
 * Lazily initialises the Solr service, then sends every queued document,
 * commits and optimizes the index, and clears the local queue. Failures are
 * reported via trigger_error() rather than letting the exception escape.
 */
public function commit()
{
    $this->_setSolrService();
    if (count($this->documents)) {
        try {
            $this->_solrService->addDocuments($this->documents);
            $this->_solrService->commit();
            $this->_solrService->optimize();
            $this->_clear();
        } catch (Exception $e) {
            // BUG FIX: trigger_error()'s second argument must be an E_USER_*
            // level constant; passing $e->getCode() (an arbitrary exception /
            // HTTP code) is invalid and makes the call itself fail.
            trigger_error($e->getMessage(), E_USER_WARNING);
        }
    } else {
        // Fixed wording of the notice ("is not document" -> "is no document").
        trigger_error('There is no document for committing');
    }
}
printMessage(' Skipped file "' . $filePath . '". Reason: Mime type "' . $mimeType . '" has no processor.');
break;
}
// NOTE(review): this fragment starts mid-structure; the opening of the
// enclosing switch/if/loop is outside this chunk of the file.
} else {
    // File path was resolved but the file does not exist on disk.
    printMessage(' Skipped file "' . $filePath . '". Reason: File not found.');
}
}
} else {
    // No file base path configured, or the document is unpublished — index
    // only the metadata, not the file contents.
    printMessage(' Skipped indexing of document files. Reason: no base path to files or document is not published.');
}
// Queue the prepared Solr document for the next batched commit.
$documents[] = $solrDocument;
// Flush to Solr every $commitRange documents (count is taken after the
// append, so this fires on exact multiples of the batch size).
if (0 === count($documents) % $commitRange) {
    printMessage(' Committing data set of ' . $commitRange . ' values.');
    $solr->addDocuments($documents);
    $solr->commit();
    $documents = array();
    printMessage(' Committing done.');
}
}
// Flush any remaining partial batch after the loop.
if (count($documents) > 0) {
    printMessage(' Committing data set of ' . count($documents) . ' values.');
    $solr->addDocuments($documents);
    $solr->commit();
    printMessage(' Committing done.');
}
// Optimize once at the end rather than per batch (it is expensive).
printMessage(' Optimizing.');
$solr->optimize();
printMessage(' Optimizing done.');
// Report total wall-clock indexing time as HH:MM:SS.
$stopTime = time();
$time = $stopTime - $startTime;
echo 'Time to index all documents: ' . gmstrftime('%H:%M:%S', $time) . $EOL;
/**
 * optimize() must issue exactly one HTTP POST to the Solr server and accept
 * a 200 response without error.
 */
public function testOptimize()
{
    // Stub transport so the test never touches the network.
    $transport = $this->getMockHttpTransportInterface();

    // Expect a single POST request and answer it with a canned 200 response.
    $response = Apache_Solr_HttpTransport_ResponseTest::get200Response();
    $transport->expects($this->once())
              ->method('performPostRequest')
              ->will($this->returnValue($response));

    // Wire the stubbed transport into a fresh service and exercise it.
    $service = new Apache_Solr_Service();
    $service->setHttpTransport($transport);
    $service->optimize();
}
/**
 * Store an article reference in the database and index it for search.
 *
 * Builds a key/value list from the article's metadata, INSERTs a new
 * rdmp_reference row (or UPDATEs an existing one when $updating is true),
 * pushes a citation document to Solr, records a JSON snapshot in the
 * version table, links authors and BHL pages, and optionally tweets new
 * references.
 *
 * @param object $article  reference metadata (title, authors, year, ...)
 * @param int    $PageID   BHL PageID the article starts on (0 = none)
 * @param bool   $updating whether an existing record may be overwritten
 *
 * @return int reference_id of the stored row, or 0 if the reference was
 *             judged empty (no title) and not stored
 */
function db_store_article($article, $PageID = 0, $updating = false)
{
    global $db;
    global $config;
    $update = false;
    $id = 0;
    // If we are editing an existing reference then we already know its id
    if (isset($article->reference_id)) {
        $id = $article->reference_id;
    } else {
        $id = db_find_article($article);
    }
    if ($id != 0) {
        if ($updating) {
            $update = true;
        } else {
            // Reference already exists and we are not updating: nothing to do.
            return $id;
        }
    }
    // Try and trap empty references
    if ($id == 0) {
        $ok = false;
        if (isset($article->title)) {
            $ok = $article->title != '';
        }
        if (!$ok) {
            return 0;
        }
    }
    // Default genre when the caller did not supply one.
    if (!isset($article->genre)) {
        $article->genre = 'article';
    }
    // Parallel arrays of column names and already-quoted SQL values.
    $keys = array();
    $values = array();
    // Article metadata
    foreach ($article as $k => $v) {
        switch ($k) {
            // Ignore as it's an array (handled later by db_store_authors)
            case 'authors':
                break;
            case 'date':
                $keys[] = 'date';
                $values[] = $db->qstr($v);
                // Derive 'year' from the date if it wasn't supplied directly.
                if (!isset($article->year)) {
                    $keys[] = 'year';
                    $values[] = $db->qstr(year_from_date($v));
                }
                break;
            // Don't store BHL URL here
            case 'url':
                if (preg_match('/^http:\\/\\/(www\\.)?biodiversitylibrary.org\\/page\\/(?<pageid>[0-9]+)/', $v)) {
                    // BHL page URLs are deliberately dropped ($PageID covers them).
                } else {
                    // extract Handle if it exists
                    if (preg_match('/^http:\\/\\/hdl.handle.net\\/(?<hdl>.*)$/', $v, $m)) {
                        $keys[] = 'hdl';
                        $values[] = $db->qstr($m['hdl']);
                    } else {
                        $keys[] = $k;
                        $values[] = $db->qstr($v);
                    }
                }
                break;
            // Things we store as is
            // NOTE(review): 'date' below is unreachable — it duplicates the
            // explicit case 'date' above, which PHP matches first.
            case 'title':
            case 'secondary_title':
            case 'volume':
            case 'series':
            case 'issue':
            case 'spage':
            case 'epage':
            case 'year':
            case 'date':
            case 'issn':
            case 'genre':
            case 'doi':
            case 'hdl':
            case 'lsid':
            case 'oclc':
            case 'pdf':
            case 'abstract':
            case 'pmid':
                $keys[] = $k;
                $values[] = $db->qstr($v);
                break;
            // Things we ignore
            default:
                break;
        }
    }
    // Date: fabricate a year-only date when we have a year but no date.
    if (!isset($article->date) && isset($article->year)) {
        $keys[] = 'date';
        $values[] = $db->qstr($article->year . '-00-00');
    }
    // BHL PageID
    if ($PageID != 0) {
        $keys[] = 'PageID';
        $values[] = $PageID;
    }
    // SICI (Serial Item and Contribution Identifier) derived from metadata.
    $s = new Sici();
    $sici = $s->create($article);
    if ($sici != '') {
        $keys[] = 'sici';
        $values[] = $db->qstr($sici);
    }
    // NOTE(review): all SQL below is built by string concatenation; values
    // pass through $db->qstr() but ids are interpolated raw — assumes $id
    // is always a trusted integer. Parameterised queries would be safer.
    if ($update) {
        // Versioning?
        // Delete links (author, pages, etc)
        // Don't delete page range as we may loose plates, etc. outside range
        /*
        $sql = 'DELETE FROM rdmp_reference_page_joiner WHERE reference_id=' . $id;
        $result = $db->Execute($sql);
        if ($result == false) die("failed [" . __FILE__ . ":" . __LINE__ . "]: " . $sql);
        */
        $sql = 'DELETE FROM rdmp_author_reference_joiner WHERE reference_id = ' . $id;
        $result = $db->Execute($sql);
        if ($result == false) {
            die("failed [" . __FILE__ . ":" . __LINE__ . "]: " . $sql);
        }
        // update (updated timestamp will be automatically updated)
        $sql = 'UPDATE rdmp_reference SET ';
        $num_values = count($keys);
        for ($i = 0; $i < $num_values; $i++) {
            if ($i > 0) {
                $sql .= ', ';
            }
            $sql .= $keys[$i] . '=' . $values[$i];
        }
        $sql .= ' WHERE reference_id=' . $id;
        /*
        $cache_file = @fopen('/tmp/update.sql', "w+") or die("could't open file");
        @fwrite($cache_file, $sql);
        fclose($cache_file);
        */
        $result = $db->Execute($sql);
        if ($result == false) {
            die("failed [" . __FILE__ . ":" . __LINE__ . "]: " . $sql);
        }
    } else {
        // Adding article for first time so add 'created' and 'updated' timestamp
        $keys[] = 'created';
        $values[] = 'NOW()';
        $keys[] = 'updated';
        $values[] = 'NOW()';
        $sql = 'INSERT INTO rdmp_reference (' . implode(",", $keys) . ') VALUES (' . implode(",", $values) . ')';
        $result = $db->Execute($sql);
        if ($result == false) {
            die("failed [" . __FILE__ . ":" . __LINE__ . "]: " . $sql);
        }
        $id = $db->Insert_ID();
        // Store reference_cluster_id which we can use to group duplicates, by default
        // reference_cluster_id = reference_id
        $sql = 'UPDATE rdmp_reference SET reference_cluster_id=' . $id . ' WHERE reference_id=' . $id;
        $result = $db->Execute($sql);
        if ($result == false) {
            die("failed [" . __FILE__ . ":" . __LINE__ . "]: " . $sql);
        }
    }
    // Indexing-------------------------------------------------------------------------------------
    if (1) {
        // solr
        // this code is redundant with code in reference.php but I use different objects
        // here and there (doh!). Also once we've added old stuff to solr this is the only place we
        // should be calling solr
        $solr = new Apache_Solr_Service('localhost', '8983', '/solr');
        if (!$solr->ping()) {
            echo 'Solr service not responding.';
            exit;
        }
        // Assemble the flat field map for the Solr document.
        $item = array();
        $item['id'] = 'reference/' . $id;
        $item['title'] = $article->title;
        $item['publication_outlet'] = $article->secondary_title;
        $item['year'] = $article->year;
        $authors = array();
        foreach ($article->authors as $a) {
            $authors[] = $a->forename . ' ' . $a->surname;
        }
        $item['authors'] = $authors;
        // Build a plain-text citation: year title outlet volume(issue): spage-epage
        $citation = '';
        $citation .= ' ' . $article->year;
        $citation .= ' ' . $article->title;
        $citation .= ' ' . $article->secondary_title;
        $citation .= ' ' . $article->volume;
        if (isset($article->issue)) {
            $citation .= '(' . $article->issue . ')';
        }
        $citation .= ':';
        $citation .= ' ';
        $citation .= $article->spage;
        if (isset($article->epage)) {
            $citation .= '-' . $article->epage;
        }
        $item['citation'] = $citation;
        // Author list for the citation, abbreviated to "et al." for >3 authors.
        $text = '';
        $num_authors = count($article->authors);
        $count = 0;
        if ($num_authors > 0) {
            foreach ($article->authors as $author) {
                // NOTE(review): uses ->lastname here but ->surname in the
                // authors loop above — confirm which property author objects
                // actually carry; one of the two may silently be empty.
                $text .= $author->forename . ' ' . $author->lastname;
                if (isset($author->suffix)) {
                    $text .= ' ' . $author->suffix;
                }
                $count++;
                if ($count == 2 && $num_authors > 3) {
                    $text .= ' et al.';
                    break;
                }
                if ($count < $num_authors - 1) {
                    $text .= ', ';
                } else {
                    if ($count < $num_authors) {
                        $text .= ' and ';
                    }
                }
            }
        }
        // Overwrites the bare citation set above with "authors citation".
        $item['citation'] = $text . ' ' . $citation;
        $parts = array();
        $parts[] = $item;
        //print_r($parts);
        // add to solr: convert each field map into an Apache_Solr_Document,
        // using multi-valued fields for array values (e.g. authors).
        $documents = array();
        foreach ($parts as $item => $fields) {
            $part = new Apache_Solr_Document();
            foreach ($fields as $key => $value) {
                if (is_array($value)) {
                    foreach ($value as $datum) {
                        $part->setMultiValue($key, $datum);
                    }
                } else {
                    $part->{$key} = $value;
                }
            }
            $documents[] = $part;
        }
        //
        // Load the documents into the index
        //
        try {
            $solr->addDocuments($documents);
            $solr->commit();
            $solr->optimize();
        } catch (Exception $e) {
            echo $e->getMessage();
        }
    } else {
        // Legacy (disabled) path: maintain the home-grown rdmp_text_index table.
        $sql = 'DELETE FROM rdmp_text_index WHERE (object_uri=' . $db->qstr($config['web_root'] . 'reference/' . $id) . ')';
        $result = $db->Execute($sql);
        if ($result == false) {
            die("failed [" . __FILE__ . ":" . __LINE__ . "]: " . $sql);
        }
        // Only do this if we have a title, as sometimes we don't (e.g. CrossRef lacks metadata)
        if (isset($article->title)) {
            $sql = 'INSERT INTO rdmp_text_index(object_type, object_id, object_uri, object_text) VALUES ("title"' . ', ' . $id . ', ' . $db->qstr($config['web_root'] . 'reference/' . $id) . ', ' . $db->qstr($article->title) . ')';
            $result = $db->Execute($sql);
            if ($result == false) {
                die("failed [" . __FILE__ . ":" . __LINE__ . "]: " . $sql);
            }
        }
    }
    // Versioning-----------------------------------------------------------------------------------
    // Store this object in version table so we can recover it if we overwrite item
    $ip = getip();
    $sql = 'INSERT INTO rdmp_reference_version(reference_id, ip, json) VALUES(' . $id . ', ' . 'INET_ATON(\'' . $ip . '\')' . ',' . $db->qstr(json_encode($article)) . ')';
    $result = $db->Execute($sql);
    if ($result == false) {
        die("failed [" . __FILE__ . ":" . __LINE__ . "]: " . $sql);
    }
    // Author(s)------------------------------------------------------------------------------------
    // Store author as and link to the article
    if (isset($article->authors)) {
        db_store_authors($id, $article->authors);
    }
    // Store page range (only if not updating, otherwise we may loose plates, etc.
    // that aren't in page range)
    if ($PageID != 0 && !$update) {
        $page_range = array();
        if (isset($article->spage) && isset($article->epage)) {
            $page_range = bhl_page_range($PageID, $article->epage - $article->spage + 1);
        } else {
            // No epage, so just get spage (to do: how do we tell user we don't have page range?)
            $page_range = bhl_page_range($PageID, 0);
        }
        //print_r($page_range);
        $count = 0;
        foreach ($page_range as $page) {
            $sql = 'INSERT INTO rdmp_reference_page_joiner (reference_id, PageID, page_order) VALUES (' . $id . ',' . $page . ',' . $count++ . ')';
            $result = $db->Execute($sql);
            if ($result == false) {
                die("failed [" . __FILE__ . ":" . __LINE__ . "]: " . $sql);
            }
        }
    }
    // Tweet----------------------------------------------------------------------------------------
    // Announce brand-new references on Twitter, truncating the title so the
    // status fits in 140 characters including the URL and hashtag.
    if (!$update) {
        if ($config['twitter']) {
            $url = $config['web_root'] . 'reference/' . $id . ' ' . '#bhlib'; // url + hashtag
            $url_len = strlen($url);
            $status = '';
            if (isset($article->title)) {
                $status = $article->title;
                $status_len = strlen($status);
                $extra = 140 - $status_len - $url_len - 1;
                if ($extra < 0) {
                    // Trim the title and append an ellipsis to fit the limit.
                    // NOTE(review): byte-based strlen/substr — may split a
                    // multi-byte UTF-8 character in titles; mb_* would be safer.
                    $status_len += $extra;
                    $status_len -= 1;
                    $status = substr($status, 0, $status_len);
                    $status .= '…';
                }
            }
            $status .= ' ' . $url;
            tweet($status);
        }
    }
    return $id;
}
/**
 * Delegate index optimization to the wrapped Solr service.
 */
public function optimize()
{
    $solr = $this->service;
    $solr->optimize();
}