示例#1
0
/**
 * @brief Similarity of two strings based on their longest common subsequence
 *
 * The score is 1 - (n1 + n2 - 2 * lcs) / (n1 + n2), where n1 and n2 are the byte
 * lengths of the two strings and lcs is the value read from cell [n1][n2] of the
 * table returned by LCSLength() (defined elsewhere; presumably the length of the
 * longest common subsequence -- confirm against its implementation).
 *
 * @param str1 First string
 * @param str2 Second string
 *
 * @return Similarity score as a float; 1.0 when both strings are empty
 */
function compare($str1, $str2)
{
    $n1 = strlen($str1);
    $n2 = strlen($str2);
    $total_length = $n1 + $n2;
    // Two empty strings are identical. Returning here also avoids dividing by
    // zero below, which throws DivisionByZeroError on PHP 8.
    if ($total_length === 0) {
        return 1.0;
    }
    $C = LCSLength($str1, $str2);
    $subsequence_length = $C[$n1][$n2];
    return 1.0 - ($total_length - 2 * $subsequence_length) / $total_length;
}
示例#2
0
/**
 * @brief Look up a citation string in the CouchDB citation search index
 *
 * The cleaned citation is sent to the "_design/citation/_search/all" index (at most
 * five rows are requested). Each returned row is scored against the query with
 * 1 - (q + h - 2 * lcs) / (q + h), where q and h are the byte lengths of the cleaned,
 * lower-cased query and hit, and lcs is read from the table returned by LCSLength().
 * Only the hit(s) with the highest score above the threshold are kept.
 *
 * On a successful query $result->query_ok is set to true, and $result->results is
 * replaced by the list of best matches (objects with text, hit, match, id, score and
 * symdiff properties) whenever at least one hit exceeds the threshold.
 *
 * @param citation Citation string to search for
 * @param result Object (passed by reference) that receives query_ok and results
 * @param threshold Minimum score a hit must exceed to be accepted (default = 0.8)
 *
 * @return true if more than one match is stored in $result->results, otherwise false
 */
function find_citation($citation, &$result, $threshold = 0.8)
{
    global $config;
    global $couch;
    $q = clean_string($citation);
    $rows_per_page = 5;
    $url = '/_design/citation/_search/all?q=' . urlencode($q) . '&limit=' . $rows_per_page;
    $resp = $couch->send("GET", "/" . $config['couchdb_options']['database'] . "/" . $url);
    $obj = json_decode($resp);
    // A response that does not decode to an object is treated like an explicit
    // error: the query did not succeed, so query_ok is left untouched.
    if (is_object($obj) && !isset($obj->error)) {
        $result->query_ok = true;
        if (isset($obj->total_rows, $obj->rows) && $obj->total_rows > 0) {
            $best_hit = 0;
            $q = strtolower($q);
            $query_length = strlen($q);
            foreach ($obj->rows as $row) {
                $hit_original = $row->fields->default;
                $hit = strtolower(clean_string($hit_original));
                $hit_length = strlen($hit);
                $C = LCSLength($hit, $q);
                $total_length = $query_length + $hit_length;
                if ($total_length === 0) {
                    // Both strings are empty: identical, and nothing to divide by
                    $symdiff = 1.0;
                } else {
                    $symdiff = 1.0 - ($total_length - 2 * $C[$hit_length][$query_length]) / $total_length;
                }
                if ($symdiff > $threshold && $symdiff >= $best_hit) {
                    // A strictly better hit supersedes everything collected so far.
                    // This test must run before $best_hit is updated, otherwise it
                    // can never be true and weaker hits are never discarded.
                    if ($symdiff > $best_hit) {
                        $result->results = array();
                    }
                    $best_hit = $symdiff;
                    $match = new stdclass();
                    $match->text = $citation;
                    $match->hit = $hit_original;
                    $match->match = true;
                    $match->id = $row->id;
                    // First element of the row's "order" array (presumably the
                    // search engine's relevance score -- confirm)
                    $match->score = $row->order[0];
                    $match->symdiff = $symdiff;
                    $result->results[] = $match;
                }
            }
        }
    }
    // NOTE(review): "> 1" means a single unambiguous match returns false; kept as-is
    // because callers may rely on it -- confirm whether "> 0" was intended.
    // isset() guards against count() on an unset property, which throws on PHP 8.
    return isset($result->results) && count($result->results) > 1;
}
示例#3
0
/**
 * @brief Obtain ISSN for a journal
 *
 * @param journal Journal name
 * @param threshold Threshold for matching name (default = 0.75)
 *
 * If exactly one row matches the name exactly, its ISSN is returned. Otherwise we use
 * approximate string matching to find the best match. The journal name is stripped of
 * short words ("of", "for", a leading "The") and punctuation, then a SQL LIKE query
 * finds a candidate list. Each candidate title is scored against the journal name with
 * 1 - (q + h - 2 * lcs) / (q + h), where q and h are the byte lengths of the two
 * strings (full stops removed) and lcs is read from the table returned by LCSLength().
 * The candidate with the best score is accepted if that score reaches the threshold.
 *
 * Side effects: the globals $left and $right are reset for every candidate before
 * printDiff() is called, and $right is stored as the candidate's display string. When
 * the global $debug is set, an HTML table of candidates is echoed. The script is
 * terminated with die() if a query fails.
 *
 * @return ISSN, if it exists, otherwise an empty string
 *
 */
function issn_from_journal_title($journal, $threshold = 0.75)
{
    global $db;
    global $left;
    global $right;
    global $debug;
    $issn = '';
    $journal = trim($journal);
    // First try an exact match
    $sql = 'SELECT * FROM issn WHERE (title = ' . $db->Quote($journal) . ')';
    $result = $db->Execute($sql);
    if ($result == false) {
        die("failed: " . $sql);
    }
    if ($result->NumRows() == 1) {
        return $result->fields['issn'];
    }
    // No (unique) exact match, try an approximate match
    // Clean up
    $query = $journal;
    // short words are likely to cause problems as people may get them wrong (i.e., "of" and "for")
    $query = str_replace(array(' of ', ' for '), ' ', $query);
    $query = preg_replace('/^The /', '', $query);
    $query = str_replace('&', 'and', $query);
    $query = str_replace(array(',', ':', '\'', '.'), '', $query);
    // Allow anything between words, and before and after the whole name
    $query = str_replace(' ', '% ', $query);
    $query = '%' . $query . '%';
    $sql = 'SELECT * FROM issn WHERE (title LIKE ' . $db->Quote($query) . ')';
    $result = $db->Execute($sql);
    if ($result == false) {
        die("failed: " . $sql);
    }
    // Build results list
    $hits = array();
    // The query side of the comparison is the same for every candidate
    $qStr = str_replace('.', '', $journal);
    $qLength = strlen($qStr);
    while (!$result->EOF) {
        // Reset the globals before printDiff() runs; $right is read back below
        $left = $right = '';
        $hStr = str_replace('.', '', $result->fields['title']);
        $hLength = strlen($hStr);
        $C = LCSLength($qStr, $hStr);
        printDiff($C, $qStr, $hStr, $qLength, $hLength);
        $total_length = $qLength + $hLength;
        if ($total_length === 0) {
            // Both strings are empty: identical, and nothing to divide by
            $score = 1.0;
        } else {
            $score = 1.0 - (double) ($total_length - 2 * $C[$qLength][$hLength]) / (double) $total_length;
        }
        $hits[] = array('hit' => $result->fields['title'], 'hitDisplay' => $right, 'score' => $score, 'issn' => $result->fields['issn']);
        $result->MoveNext();
    }
    // Sort candidates by descending score
    $scores = array();
    foreach ($hits as $key => $row) {
        $scores[$key] = $row['score'];
    }
    array_multisort($scores, SORT_NUMERIC, SORT_DESC, $hits);
    if ($debug) {
        echo '<table border="1" cellpadding="2">';
        echo '<tr style="font-family:Arial;font-size:12px;"><th>Journal</th><th>Score</th><th>ISSN</th></tr>';
        foreach ($hits as $hit) {
            echo '<tr style="font-family:Arial;font-size:12px;">';
            echo '<td>';
            echo "<span style=\"background:white;color:black;\">", $hit['hitDisplay'], "</span>";
            echo '</td>';
            echo '<td>';
            echo $hit['score'];
            echo '</td>';
            echo '<td>';
            echo '<a href="http://journalseek.net/cgi-bin/journalseek/journalsearch.cgi?field=issn&query=' . $hit['issn'] . '" target="_blank">' . $hit['issn'] . '</a>';
            echo '</td>';
            echo '</tr>';
        }
        echo '</table>';
    }
    // Do we have a hit (above some threshold)?
    if (count($hits) > 0 && $hits[0]['score'] >= $threshold) {
        $issn = $hits[0]['issn'];
    }
    return $issn;
}
示例#4
0
/**
 * @brief Approximate string search for title
 *
 * Assumes n-gram index available for MySQL, see
 * http://iphylo.blogspot.com/2009/10/n-gram-fulltext-indexing-in-mysql.html for details on installing
 * this.
 *
 * Up to ten candidate titles are fetched with a full-text MATCH ... AGAINST query. A
 * candidate is kept when the value read from the LCSLength() table (presumably the
 * longest common subsequence length) covers at least threshold percent of the cleaned
 * query string and at least 33 percent of the cleaned candidate title. Matches are
 * returned in the order the database produced them. The script is terminated with
 * die() if the query fails.
 *
 * @param str Title string to search for
 * @param threshold Percentage of str that we require to be in longest common subsequence (default is 70%)
 *
 * @return Array of matching titles, together with scores. Each element has the keys
 * TitleID, ShortTitle, score (full-text score from the query), sl (percentage of the
 * query covered), subsequence (raw value from the LCSLength() table), x (cleaned
 * query) and y (cleaned hit).
 */
function bhl_title_lookup($str, $threshold = 70)
{
    global $db;
    $matches = array();
    $str = clean_string($str);
    $str_length = strlen($str);
    // Nothing to search for; this also keeps the percentage below from dividing by zero
    if ($str_length === 0) {
        return $matches;
    }
    $sql = 'SELECT TitleID, ShortTitle, MATCH(ShortTitle) AGAINST(' . $db->qstr($str) . ')
AS score FROM bhl_title
WHERE MATCH(ShortTitle) AGAINST(' . $db->qstr($str) . ') LIMIT 10';
    $result = $db->Execute($sql);
    if ($result == false) {
        die("failed [" . __FILE__ . ":" . __LINE__ . "]: " . $sql);
    }
    while (!$result->EOF) {
        $cleaned_hit = clean_string($result->fields['ShortTitle']);
        $cleaned_hit_length = strlen($cleaned_hit);
        // A title that cleans down to nothing cannot be scored (and would divide by zero)
        if ($cleaned_hit_length > 0) {
            // Get subsequence length
            $C = LCSLength($cleaned_hit, $str);
            $subsequence = $C[$cleaned_hit_length][$str_length];
            // length of subsequence as percentage of query string
            $subsequence_length = round(100.0 * $subsequence / $str_length);
            // length of subsequence as percentage of hit
            $hit_subsequence_length = round(100.0 * $subsequence / $cleaned_hit_length);
            if ($subsequence_length >= $threshold && $hit_subsequence_length >= 33) {
                $matches[] = array('TitleID' => $result->fields['TitleID'], 'ShortTitle' => $result->fields['ShortTitle'], 'score' => $result->fields['score'], 'sl' => $subsequence_length, 'subsequence' => $subsequence, 'x' => $str, 'y' => $cleaned_hit);
            }
        }
        $result->MoveNext();
    }
    return $matches;
}