/**
 * Store a new reference, or update an existing one, together with its index
 * entry, version snapshot, author links, BHL page range and (optionally) a tweet.
 *
 * Flow:
 *  1. Resolve the reference id: taken from $article->reference_id when present,
 *     otherwise looked up with db_find_article().
 *  2. If the reference already exists and $updating is false, return its id
 *     without touching anything.
 *  3. If the reference is new and has no (non-empty) title, return 0.
 *  4. INSERT or UPDATE the rdmp_reference row.
 *  5. Index the reference, store a JSON snapshot in rdmp_reference_version,
 *     hand the authors to db_store_authors(), and - for new references with a
 *     PageID - store the page range. New references are tweeted when
 *     $config['twitter'] is truthy.
 *
 * Any failed SQL statement terminates the script via die().
 *
 * NOTE(review): if the Solr ping fails the script exits *after* the
 * rdmp_reference row has been written but before the version/author/page
 * rows are stored. That ordering is unchanged from the original code.
 *
 * @param object $article  Reference object; the properties read here include title,
 *                         genre, authors, date, year, url, spage, epage, reference_id.
 * @param int    $PageID   BHL PageID of the first page, or 0 when unknown.
 * @param bool   $updating When true an existing reference is overwritten.
 *
 * @return int Id of the stored/existing reference, or 0 if a new reference had no title.
 */
function db_store_article($article, $PageID = 0, $updating = false)
{
    global $db;
    global $config;

    // Both values are concatenated into SQL statements below, so force integers.
    $PageID = (int) $PageID;

    // If we are editing an existing reference then we already know its id
    if (isset($article->reference_id)) {
        $id = (int) $article->reference_id;
    } else {
        $id = (int) db_find_article($article);
    }

    $update = false;
    if ($id != 0) {
        if (!$updating) {
            return $id;
        }
        $update = true;
    } elseif (!isset($article->title) || $article->title == '') {
        // Try and trap empty references
        return 0;
    }

    if (!isset($article->genre)) {
        $article->genre = 'article';
    }

    // column name => SQL-ready value
    $fields = _db_article_collect_fields($article, $PageID);

    if ($update) {
        // Drop the author links for this reference before rewriting the row.
        // Page links are deliberately kept: deleting the page range could lose
        // plates etc. that lie outside it.
        _db_article_execute(
            'DELETE FROM rdmp_author_reference_joiner WHERE reference_id = ' . $id,
            __LINE__
        );

        // update (updated timestamp will be automatically updated)
        $assignments = array();
        foreach ($fields as $column => $value) {
            $assignments[] = $column . '=' . $value;
        }
        _db_article_execute(
            'UPDATE rdmp_reference SET ' . implode(', ', $assignments) . ' WHERE reference_id=' . $id,
            __LINE__
        );
    } else {
        // Adding article for first time so add 'created' and 'updated' timestamp
        $fields['created'] = 'NOW()';
        $fields['updated'] = 'NOW()';

        _db_article_execute(
            'INSERT INTO rdmp_reference (' . implode(",", array_keys($fields)) . ') VALUES ('
                . implode(",", $fields) . ')',
            __LINE__
        );
        $id = (int) $db->Insert_ID();

        // Store reference_cluster_id which we can use to group duplicates, by default
        // reference_cluster_id = reference_id
        _db_article_execute(
            'UPDATE rdmp_reference SET reference_cluster_id=' . $id . ' WHERE reference_id=' . $id,
            __LINE__
        );
    }

    // Indexing ------------------------------------------------------------------
    // Solr is the active index. The SQL text index path is kept but disabled,
    // exactly as in the original `if (1) ... else ...`.
    $use_solr = true;
    if ($use_solr) {
        _db_article_index_solr($article, $id);
    } else {
        _db_article_index_sql($article, $id);
    }

    // Versioning ----------------------------------------------------------------
    // Store this object in version table so we can recover it if we overwrite item.
    // The IP string goes through qstr() instead of being spliced between quotes.
    _db_article_execute(
        'INSERT INTO rdmp_reference_version(reference_id, ip, json) VALUES(' . $id . ', '
            . 'INET_ATON(' . $db->qstr(getip()) . ')' . ','
            . $db->qstr(json_encode($article)) . ')',
        __LINE__
    );

    // Author(s) -----------------------------------------------------------------
    if (isset($article->authors)) {
        db_store_authors($id, $article->authors);
    }

    // Page range ----------------------------------------------------------------
    // Only when not updating, otherwise we may lose plates, etc. that aren't in
    // the page range.
    if ($PageID != 0 && !$update) {
        // 0 is what the original passed when there is no end page (to do: how do
        // we tell the user we don't have a page range?). It is also used when the
        // page labels are not numeric, because subtracting non-numeric strings
        // warns or throws depending on the PHP version.
        $num_pages = 0;
        if (
            isset($article->spage) && isset($article->epage)
            && is_numeric($article->spage) && is_numeric($article->epage)
        ) {
            $num_pages = $article->epage - $article->spage + 1;
        }

        $page_order = 0;
        foreach (bhl_page_range($PageID, $num_pages) as $page) {
            _db_article_execute(
                'INSERT INTO rdmp_reference_page_joiner (reference_id, PageID, page_order) VALUES ('
                    . $id . ',' . $page . ',' . $page_order . ')',
                __LINE__
            );
            $page_order++;
        }
    }

    // Tweet ---------------------------------------------------------------------
    if (!$update && !empty($config['twitter'])) {
        $url = $config['web_root'] . 'reference/' . $id . ' ' . '#bhlib'; // url + hashtag
        $title = isset($article->title) ? $article->title : null;
        tweet(_db_article_tweet_status($title, $url));
    }

    return $id;
}

/**
 * Run one SQL statement through the global $db handle and die() on failure.
 *
 * @param string $sql  Statement to execute.
 * @param int    $line Caller's __LINE__, reported in the failure message.
 *
 * @return mixed Whatever $db->Execute() returned.
 */
function _db_article_execute($sql, $line)
{
    global $db;

    $result = $db->Execute($sql);
    if ($result == false) {
        die("failed [" . __FILE__ . ":" . $line . "]: " . $sql);
    }

    return $result;
}

/**
 * Return $object->$name when it is set, otherwise $default.
 */
function _db_article_value($object, $name, $default = null)
{
    return isset($object->$name) ? $object->$name : $default;
}

/**
 * Build the column => value map for the rdmp_reference row.
 *
 * Values are already SQL-ready (passed through $db->qstr(), or a bare integer
 * for PageID). A map is used instead of two parallel lists so that a column can
 * never be named twice in the generated statement.
 *
 * @param object $article Reference object.
 * @param int    $PageID  BHL PageID, 0 when unknown.
 *
 * @return array Column name => SQL value.
 */
function _db_article_collect_fields($article, $PageID)
{
    global $db;

    $fields = array();

    foreach ($article as $k => $v) {
        switch ($k) {
            case 'date':
                $fields['date'] = $db->qstr($v);
                // Derive the year from the date when no year was supplied
                if (!isset($article->year)) {
                    $fields['year'] = $db->qstr(year_from_date($v));
                }
                break;

            case 'url':
                // Don't store BHL URL here
                if (!preg_match('/^http:\\/\\/(www\\.)?biodiversitylibrary.org\\/page\\/(?<pageid>[0-9]+)/', $v)) {
                    // extract Handle if it exists
                    if (preg_match('/^http:\\/\\/hdl.handle.net\\/(?<hdl>.*)$/', $v, $m)) {
                        // An explicit hdl property takes precedence over one parsed from the URL
                        if (!isset($article->hdl)) {
                            $fields['hdl'] = $db->qstr($m['hdl']);
                        }
                    } else {
                        $fields['url'] = $db->qstr($v);
                    }
                }
                break;

            // Things we store as is
            case 'title':
            case 'secondary_title':
            case 'volume':
            case 'series':
            case 'issue':
            case 'spage':
            case 'epage':
            case 'year':
            case 'issn':
            case 'genre':
            case 'doi':
            case 'hdl':
            case 'lsid':
            case 'oclc':
            case 'pdf':
            case 'abstract':
            case 'pmid':
                $fields[$k] = $db->qstr($v);
                break;

            // Everything else (including 'authors', which is an array) is ignored
            default:
                break;
        }
    }

    // No date but a year: store "<year>-00-00"
    if (!isset($article->date) && isset($article->year)) {
        $fields['date'] = $db->qstr($article->year . '-00-00');
    }

    // BHL PageID
    if ($PageID != 0) {
        $fields['PageID'] = $PageID;
    }

    // SICI
    $s = new Sici();
    $sici = $s->create($article);
    if ($sici != '') {
        $fields['sici'] = $db->qstr($sici);
    }

    return $fields;
}

/**
 * Format one author as "forename family-name", optionally followed by the suffix.
 *
 * The original code read the family name from ->surname in one place and from
 * ->lastname in another; both are accepted here, ->lastname first.
 *
 * @param object $author      Author object.
 * @param bool   $with_suffix Append " <suffix>" when the author has one.
 *
 * @return string
 */
function _db_article_author_name($author, $with_suffix = false)
{
    if (isset($author->lastname)) {
        $family = $author->lastname;
    } elseif (isset($author->surname)) {
        $family = $author->surname;
    } else {
        $family = '';
    }

    $name = _db_article_value($author, 'forename', '') . ' ' . $family;
    if ($with_suffix && isset($author->suffix)) {
        $name .= ' ' . $author->suffix;
    }

    return $name;
}

/**
 * Build the citation string stored in the Solr document:
 * "<authors>  <year> <title> <journal> <volume>(<issue>): <spage>-<epage>".
 *
 * Author list rules: more than three authors are cut after the second with
 * " et al."; otherwise names are joined with ", " and a final " and ".
 *
 * @param object $article Reference object.
 * @param array  $authors Author objects.
 *
 * @return string
 */
function _db_article_citation($article, $authors)
{
    $citation = ' ' . _db_article_value($article, 'year', '')
        . ' ' . _db_article_value($article, 'title', '')
        . ' ' . _db_article_value($article, 'secondary_title', '')
        . ' ' . _db_article_value($article, 'volume', '');
    if (isset($article->issue)) {
        $citation .= '(' . $article->issue . ')';
    }
    $citation .= ': ' . _db_article_value($article, 'spage', '');
    if (isset($article->epage)) {
        $citation .= '-' . $article->epage;
    }

    $text = '';
    $num_authors = count($authors);
    $count = 0;
    foreach ($authors as $author) {
        $text .= _db_article_author_name($author, true);
        $count++;

        if ($count == 2 && $num_authors > 3) {
            $text .= ' et al.';
            break;
        }
        if ($count < $num_authors - 1) {
            $text .= ', ';
        } elseif ($count < $num_authors) {
            $text .= ' and ';
        }
    }

    return $text . ' ' . $citation;
}

/**
 * Add the reference to the Solr index as document "reference/<id>".
 *
 * Echoes a message and exits the script when Solr does not answer the ping.
 * Exceptions raised while adding/committing/optimizing are echoed, not rethrown.
 *
 * This duplicates code in reference.php, which works on different objects.
 *
 * @param object $article Reference object.
 * @param int    $id      Reference id.
 *
 * @return void
 */
function _db_article_index_solr($article, $id)
{
    $solr = new Apache_Solr_Service('localhost', '8983', '/solr');

    if (!$solr->ping()) {
        echo 'Solr service not responding.';
        exit;
    }

    // A reference without authors is indexed with an empty author list
    $authors = isset($article->authors) ? (array) $article->authors : array();

    $document = new Apache_Solr_Document();
    $document->id = 'reference/' . $id;
    $document->title = _db_article_value($article, 'title');
    $document->publication_outlet = _db_article_value($article, 'secondary_title');
    $document->year = _db_article_value($article, 'year');
    foreach ($authors as $a) {
        $document->setMultiValue('authors', _db_article_author_name($a));
    }
    $document->citation = _db_article_citation($article, $authors);

    // Load the document into the index
    try {
        $solr->addDocuments(array($document));
        $solr->commit();
        $solr->optimize();
    } catch (Exception $e) {
        echo $e->getMessage();
    }
}

/**
 * SQL text index path (currently disabled by the caller): replace the
 * rdmp_text_index row for this reference with its title.
 *
 * @param object $article Reference object.
 * @param int    $id      Reference id.
 *
 * @return void
 */
function _db_article_index_sql($article, $id)
{
    global $db;
    global $config;

    $uri = $db->qstr($config['web_root'] . 'reference/' . $id);

    _db_article_execute('DELETE FROM rdmp_text_index WHERE (object_uri=' . $uri . ')', __LINE__);

    // Only do this if we have a title, as sometimes we don't (e.g. CrossRef lacks metadata)
    if (isset($article->title)) {
        _db_article_execute(
            'INSERT INTO rdmp_text_index(object_type, object_id, object_uri, object_text) VALUES ("title"'
                . ', ' . $id . ', ' . $uri . ', ' . $db->qstr($article->title) . ')',
            __LINE__
        );
    }
}

/**
 * Compose the tweet text "<title> <url>", shortening the title with an
 * ellipsis so the whole status is at most 140 characters.
 *
 * Lengths are counted in characters (UTF-8) when mbstring is available, so a
 * multibyte character is never cut in half; otherwise it falls back to the
 * original byte-wise behaviour.
 *
 * @param string|null $title Reference title, null when there is none.
 * @param string      $url   URL plus hashtag appended after the title.
 *
 * @return string
 */
function _db_article_tweet_status($title, $url)
{
    $status = '';

    if ($title !== null) {
        $status = (string) $title;
        $multibyte = function_exists('mb_strlen') && function_exists('mb_substr');

        $status_len = $multibyte ? mb_strlen($status, 'UTF-8') : strlen($status);
        $url_len = $multibyte ? mb_strlen($url, 'UTF-8') : strlen($url);

        // Characters left over after title + separating space + url
        $extra = 140 - $status_len - $url_len - 1;
        if ($extra < 0) {
            // Drop the overflow plus one more character to make room for the ellipsis;
            // never go below zero (a negative length would trim from the wrong end).
            $keep = max(0, $status_len + $extra - 1);
            $status = $multibyte ? mb_substr($status, 0, $keep, 'UTF-8') : substr($status, 0, $keep);
            $status .= '…';
        }
    }

    return $status . ' ' . $url;
}
// One-off maintenance script: for each reference id in the loop range, print
// (not execute) the SQL that rebuilds its rows in rdmp_reference_page_joiner.
//
// Earlier runs used the ranges 66009-66058, 85454-85560 and 85581-85582, and
// several hand-picked id lists. Only the most recent list is kept in $ids; note
// that the loop below does not read $ids - it walks an explicit id range.
$ids = array(80076, 80077, 80078, 80079, 80080, 80082, 80083, 80084, 80085, 80089, 80090, 80091, 80093, 80095, 80100, 80103, 80104, 80106, 80109, 80112, 80115, 80117);

for ($reference_id = 127989; $reference_id <= 127989; $reference_id++) {
    $article = db_retrieve_reference($reference_id);

    // With both start and end page we ask for the whole span; without an end
    // page just the start page is requested (to do: how do we tell the user we
    // don't have a page range?)
    $page_range = (isset($article->spage) && isset($article->epage))
        ? bhl_page_range($article->PageID, $article->epage - $article->spage + 1)
        : bhl_page_range($article->PageID, 0);

    // Clear the existing links first, then emit one INSERT per page in order
    echo "DELETE FROM rdmp_reference_page_joiner WHERE reference_id={$reference_id};\n";

    $count = 0;
    foreach ($page_range as $page) {
        $sql = 'INSERT INTO rdmp_reference_page_joiner (reference_id, PageID, page_order) VALUES (' . $reference_id . ',' . $page . ',' . $count . ');';
        $count++;

        // Output only; execution against $db is intentionally left out
        echo $sql, "\n";
    }
}