Ejemplo n.º 1
0
function db_store_article($article, $PageID = 0, $updating = false)
{
    global $db;
    global $config;
    $update = false;
    $id = 0;
    // If we are editing an existing reference then we already know its id
    if (isset($article->reference_id)) {
        $id = $article->reference_id;
    } else {
        $id = db_find_article($article);
    }
    if ($id != 0) {
        if ($updating) {
            $update = true;
        } else {
            return $id;
        }
    }
    // Try and trap empty references
    if ($id == 0) {
        $ok = false;
        if (isset($article->title)) {
            $ok = $article->title != '';
        }
        if (!$ok) {
            return 0;
        }
    }
    if (!isset($article->genre)) {
        $article->genre = 'article';
    }
    $keys = array();
    $values = array();
    // Article metadata
    foreach ($article as $k => $v) {
        switch ($k) {
            // Ignore as it's an array
            case 'authors':
                break;
            case 'date':
                $keys[] = 'date';
                $values[] = $db->qstr($v);
                if (!isset($article->year)) {
                    $keys[] = 'year';
                    $values[] = $db->qstr(year_from_date($v));
                }
                break;
                // Don't store BHL URL here
            // Don't store BHL URL here
            case 'url':
                if (preg_match('/^http:\\/\\/(www\\.)?biodiversitylibrary.org\\/page\\/(?<pageid>[0-9]+)/', $v)) {
                } else {
                    // extract Handle if it exists
                    if (preg_match('/^http:\\/\\/hdl.handle.net\\/(?<hdl>.*)$/', $v, $m)) {
                        $keys[] = 'hdl';
                        $values[] = $db->qstr($m['hdl']);
                    } else {
                        $keys[] = $k;
                        $values[] = $db->qstr($v);
                    }
                }
                break;
                // Things we store as is
            // Things we store as is
            case 'title':
            case 'secondary_title':
            case 'volume':
            case 'series':
            case 'issue':
            case 'spage':
            case 'epage':
            case 'year':
            case 'date':
            case 'issn':
            case 'genre':
            case 'doi':
            case 'hdl':
            case 'lsid':
            case 'oclc':
            case 'pdf':
            case 'abstract':
            case 'pmid':
                $keys[] = $k;
                $values[] = $db->qstr($v);
                break;
                // Things we ignore
            // Things we ignore
            default:
                break;
        }
    }
    // Date
    if (!isset($article->date) && isset($article->year)) {
        $keys[] = 'date';
        $values[] = $db->qstr($article->year . '-00-00');
    }
    // BHL PageID
    if ($PageID != 0) {
        $keys[] = 'PageID';
        $values[] = $PageID;
    }
    // SICI
    $s = new Sici();
    $sici = $s->create($article);
    if ($sici != '') {
        $keys[] = 'sici';
        $values[] = $db->qstr($sici);
    }
    if ($update) {
        // Versioning?
        // Delete links	(author, pages, etc)
        // Don't delete page range as we may loose plates, etc. outside range
        /*
        $sql = 'DELETE FROM rdmp_reference_page_joiner WHERE reference_id=' . $id;
        $result = $db->Execute($sql);
        if ($result == false) die("failed [" . __FILE__ . ":" . __LINE__ . "]: " . $sql);
        */
        $sql = 'DELETE FROM rdmp_author_reference_joiner WHERE reference_id = ' . $id;
        $result = $db->Execute($sql);
        if ($result == false) {
            die("failed [" . __FILE__ . ":" . __LINE__ . "]: " . $sql);
        }
        // update (updated timestamp will be automatically updated)
        $sql = 'UPDATE rdmp_reference SET ';
        $num_values = count($keys);
        for ($i = 0; $i < $num_values; $i++) {
            if ($i > 0) {
                $sql .= ', ';
            }
            $sql .= $keys[$i] . '=' . $values[$i];
        }
        $sql .= ' WHERE reference_id=' . $id;
        /*		$cache_file = @fopen('/tmp/update.sql', "w+") or die("could't open file");
        		@fwrite($cache_file, $sql);
        		fclose($cache_file);
        */
        $result = $db->Execute($sql);
        if ($result == false) {
            die("failed [" . __FILE__ . ":" . __LINE__ . "]: " . $sql);
        }
    } else {
        // Adding article for first time so add 'created' and 'updated' timestamp
        $keys[] = 'created';
        $values[] = 'NOW()';
        $keys[] = 'updated';
        $values[] = 'NOW()';
        $sql = 'INSERT INTO rdmp_reference (' . implode(",", $keys) . ') VALUES (' . implode(",", $values) . ')';
        $result = $db->Execute($sql);
        if ($result == false) {
            die("failed [" . __FILE__ . ":" . __LINE__ . "]: " . $sql);
        }
        $id = $db->Insert_ID();
        // Store reference_cluster_id which we can use to group duplicates, by default
        // reference_cluster_id = reference_id
        $sql = 'UPDATE rdmp_reference SET reference_cluster_id=' . $id . ' WHERE reference_id=' . $id;
        $result = $db->Execute($sql);
        if ($result == false) {
            die("failed [" . __FILE__ . ":" . __LINE__ . "]: " . $sql);
        }
    }
    // Indexing-------------------------------------------------------------------------------------
    if (1) {
        // solr
        // this code is redundant with code in reference.php but I use different objects
        // here and there (doh!). Also once we've added old stuff to solr this is the only place we
        // should be calling solr
        $solr = new Apache_Solr_Service('localhost', '8983', '/solr');
        if (!$solr->ping()) {
            echo 'Solr service not responding.';
            exit;
        }
        $item = array();
        $item['id'] = 'reference/' . $id;
        $item['title'] = $article->title;
        $item['publication_outlet'] = $article->secondary_title;
        $item['year'] = $article->year;
        $authors = array();
        foreach ($article->authors as $a) {
            $authors[] = $a->forename . ' ' . $a->surname;
        }
        $item['authors'] = $authors;
        $citation = '';
        $citation .= ' ' . $article->year;
        $citation .= ' ' . $article->title;
        $citation .= ' ' . $article->secondary_title;
        $citation .= ' ' . $article->volume;
        if (isset($article->issue)) {
            $citation .= '(' . $article->issue . ')';
        }
        $citation .= ':';
        $citation .= ' ';
        $citation .= $article->spage;
        if (isset($article->epage)) {
            $citation .= '-' . $article->epage;
        }
        $item['citation'] = $citation;
        $text = '';
        $num_authors = count($article->authors);
        $count = 0;
        if ($num_authors > 0) {
            foreach ($article->authors as $author) {
                $text .= $author->forename . ' ' . $author->lastname;
                if (isset($author->suffix)) {
                    $text .= ' ' . $author->suffix;
                }
                $count++;
                if ($count == 2 && $num_authors > 3) {
                    $text .= ' et al.';
                    break;
                }
                if ($count < $num_authors - 1) {
                    $text .= ', ';
                } else {
                    if ($count < $num_authors) {
                        $text .= ' and ';
                    }
                }
            }
        }
        $item['citation'] = $text . ' ' . $citation;
        $parts = array();
        $parts[] = $item;
        //print_r($parts);
        // add to solr
        $documents = array();
        foreach ($parts as $item => $fields) {
            $part = new Apache_Solr_Document();
            foreach ($fields as $key => $value) {
                if (is_array($value)) {
                    foreach ($value as $datum) {
                        $part->setMultiValue($key, $datum);
                    }
                } else {
                    $part->{$key} = $value;
                }
            }
            $documents[] = $part;
        }
        //
        //
        // Load the documents into the index
        //
        try {
            $solr->addDocuments($documents);
            $solr->commit();
            $solr->optimize();
        } catch (Exception $e) {
            echo $e->getMessage();
        }
    } else {
        $sql = 'DELETE FROM rdmp_text_index WHERE (object_uri=' . $db->qstr($config['web_root'] . 'reference/' . $id) . ')';
        $result = $db->Execute($sql);
        if ($result == false) {
            die("failed [" . __FILE__ . ":" . __LINE__ . "]: " . $sql);
        }
        // Only do this if we have a title, as sometimes we don't (e.g. CrossRef lacks metadata)
        if (isset($article->title)) {
            $sql = 'INSERT INTO rdmp_text_index(object_type, object_id, object_uri, object_text)
			VALUES ("title"' . ', ' . $id . ', ' . $db->qstr($config['web_root'] . 'reference/' . $id) . ', ' . $db->qstr($article->title) . ')';
            $result = $db->Execute($sql);
            if ($result == false) {
                die("failed [" . __FILE__ . ":" . __LINE__ . "]: " . $sql);
            }
        }
    }
    // Versioning-----------------------------------------------------------------------------------
    // Store this object in version table so we can recover it if we overwrite item
    $ip = getip();
    $sql = 'INSERT INTO rdmp_reference_version(reference_id, ip, json) VALUES(' . $id . ', ' . 'INET_ATON(\'' . $ip . '\')' . ',' . $db->qstr(json_encode($article)) . ')';
    $result = $db->Execute($sql);
    if ($result == false) {
        die("failed [" . __FILE__ . ":" . __LINE__ . "]: " . $sql);
    }
    // Author(s)------------------------------------------------------------------------------------
    // Store author as and link to the article
    if (isset($article->authors)) {
        db_store_authors($id, $article->authors);
    }
    // Store page range (only if not updating, otherwise we may loose plates, etc.
    // that aren't in page range)
    if ($PageID != 0 && !$update) {
        $page_range = array();
        if (isset($article->spage) && isset($article->epage)) {
            $page_range = bhl_page_range($PageID, $article->epage - $article->spage + 1);
        } else {
            // No epage, so just get spage (to do: how do we tell user we don't have page range?)
            $page_range = bhl_page_range($PageID, 0);
        }
        //print_r($page_range);
        $count = 0;
        foreach ($page_range as $page) {
            $sql = 'INSERT INTO rdmp_reference_page_joiner (reference_id, PageID, page_order) 
			VALUES (' . $id . ',' . $page . ',' . $count++ . ')';
            $result = $db->Execute($sql);
            if ($result == false) {
                die("failed [" . __FILE__ . ":" . __LINE__ . "]: " . $sql);
            }
        }
    }
    // Tweet----------------------------------------------------------------------------------------
    if (!$update) {
        if ($config['twitter']) {
            $url = $config['web_root'] . 'reference/' . $id . ' ' . '#bhlib';
            // url + hashtag
            $url_len = strlen($url);
            $status = '';
            if (isset($article->title)) {
                $status = $article->title;
                $status_len = strlen($status);
                $extra = 140 - $status_len - $url_len - 1;
                if ($extra < 0) {
                    $status_len += $extra;
                    $status_len -= 1;
                    $status = substr($status, 0, $status_len);
                    $status .= '…';
                }
            }
            $status .= ' ' . $url;
            tweet($status);
        }
    }
    return $id;
}
Ejemplo n.º 2
0
//for ($reference_id=66009;$reference_id<=66058;$reference_id++)
// 85560-85454
//for ($reference_id=85454;$reference_id<=85560;$reference_id++)
//for ($reference_id=85581;$reference_id<=85582;$reference_id++)
$ids = array(45062, 44966, 44952, 44816, 44626, 44625, 47587);
$ids = array(85596);
$ids = array(85604);
$ids = array(51688);
$ids = array(85711, 85717);
//$ids=array(86103);
$ids = array(80076, 80077, 80078, 80079, 80080, 80082, 80083, 80084, 80085, 80089, 80090, 80091, 80093, 80095, 80100, 80103, 80104, 80106, 80109, 80112, 80115, 80117);
//foreach ($ids as $reference_id)
for ($reference_id = 127989; $reference_id <= 127989; $reference_id++) {
    $article = db_retrieve_reference($reference_id);
    $page_range = array();
    if (isset($article->spage) && isset($article->epage)) {
        $page_range = bhl_page_range($article->PageID, $article->epage - $article->spage + 1);
    } else {
        // No epage, so just get spage (to do: how do we tell user we don't have page range?)
        $page_range = bhl_page_range($article->PageID, 0);
    }
    //print_r($page_range);
    echo "DELETE FROM rdmp_reference_page_joiner WHERE reference_id={$reference_id};\n";
    $count = 0;
    foreach ($page_range as $page) {
        $sql = 'INSERT INTO rdmp_reference_page_joiner (reference_id, PageID, page_order) VALUES (' . $reference_id . ',' . $page . ',' . $count++ . ');';
        echo $sql . "\n";
        //$result = $db->Execute($sql);
        //if ($result == false) die("failed [" . __FILE__ . ":" . __LINE__ . "]: " . $sql);
    }
}