/**
 * @brief Similarity of two strings based on their longest common subsequence
 *
 * The score is 1 - (n1 + n2 - 2 * lcs) / (n1 + n2), where n1 and n2 are the byte
 * lengths of the two strings and lcs is the value found at [n1][n2] in the table
 * returned by LCSLength() (presumably the length of the longest common subsequence).
 *
 * @param str1 First string
 * @param str2 Second string
 *
 * @return Similarity score as a float; 1.0 when both strings are empty
 */
function compare($str1, $str2)
{
    $n1 = strlen($str1);
    $n2 = strlen($str2);
    $total_length = $n1 + $n2;

    // Two empty strings are trivially identical. Return before the formula below
    // divides by zero (a DivisionByZeroError on PHP 8).
    if ($total_length === 0) {
        return 1.0;
    }

    $C = LCSLength($str1, $str2);
    $subsequence_length = $C[$n1][$n2];

    return 1.0 - ($total_length - 2 * $subsequence_length) / $total_length;
}
/**
 * @brief Search the CouchDB citation index and keep the best-scoring hit(s)
 *
 * The citation is cleaned with clean_string() and sent to the "citation" search index.
 * Each returned row is cleaned, lower-cased and compared with the query using
 * 1 - (q + h - 2 * lcs) / (q + h), where q and h are the byte lengths of query and hit
 * and lcs comes from LCSLength(). Hits scoring above the threshold are candidates; only
 * those sharing the best score are kept in $result->results.
 *
 * @param citation Citation string to look up
 * @param result Object updated in place: query_ok is set to true when the search
 *               returned a decodable, error-free response; results receives the
 *               best-scoring match object(s)
 * @param threshold Minimum score (exclusive) for a hit to be accepted (default = 0.8)
 *
 * @return true if $result->results holds more than one entry, otherwise false
 */
function find_citation($citation, &$result, $threshold = 0.8)
{
    global $config;
    global $couch;

    $q = clean_string($citation);
    $rows_per_page = 5;

    // NOTE(review): the path is joined with "/" and $url itself starts with "/", so the
    // request contains "//" between the database name and "_design" — left unchanged.
    $url = '/_design/citation/_search/all?q=' . urlencode($q) . '&limit=' . $rows_per_page;
    $resp = $couch->send("GET", "/" . $config['couchdb_options']['database'] . "/" . $url);
    $obj = json_decode($resp);

    // json_decode() yields null for an undecodable response; treat that like an error
    // response instead of flagging the query as successful.
    if (is_object($obj) && !isset($obj->error)) {
        $result->query_ok = true;

        if (isset($obj->total_rows) && $obj->total_rows > 0) {
            $best_hit = 0;

            $q = strtolower($q);
            $query_length = strlen($q);

            foreach ($obj->rows as $row) {
                $hit_original = $row->fields->default;
                $hit = strtolower(clean_string($hit_original));
                $hit_length = strlen($hit);

                // Nothing to compare when both strings are empty; skipping the row also
                // avoids a division by zero in the score below.
                $total_length = $query_length + $hit_length;
                if ($total_length === 0) {
                    continue;
                }

                $C = LCSLength($hit, $q);
                $common_length = $C[$hit_length][$query_length];
                $symdiff = 1.0 - ($total_length - 2 * $common_length) / $total_length;

                if ($symdiff > $threshold && $symdiff >= $best_hit) {
                    // A strictly better score supersedes everything collected so far.
                    // This test must run BEFORE $best_hit is updated, otherwise it can
                    // never be true and weaker hits are never discarded.
                    if ($symdiff > $best_hit) {
                        $result->results = array();
                        $best_hit = $symdiff;
                    }

                    $match = new stdclass();
                    $match->text = $citation;
                    $match->hit = $hit_original;
                    $match->match = true;
                    $match->id = $row->id;
                    $match->score = $row->order[0];
                    $match->symdiff = $symdiff;

                    $result->results[] = $match;
                }
            }
        }
    }

    // NOTE(review): "> 1" means a single match returns false — confirm with callers
    // whether "> 0" was intended. The isset() guard covers the case where no results
    // property exists (count() on null is a TypeError on PHP 8).
    return isset($result->results) && count($result->results) > 1;
}
/**
 * @brief Obtain ISSN for a journal
 *
 * @param journal Journal name
 * @param threshold Minimum score for accepting an approximate match (default = 0.75)
 *
 * An exact title match is tried first and used only if exactly one row matches.
 * Otherwise the journal name is stripped of short words ("of", "for", a leading "The")
 * and punctuation, and a SQL LIKE query finds a candidate list. Each candidate is scored
 * against the journal name (periods removed from both) with
 * 1 - (q + h - 2 * lcs) / (q + h), where q and h are byte lengths and lcs comes from
 * LCSLength(); the best-scoring candidate is used if it reaches the threshold.
 *
 * When the global $debug is set, an HTML table of the candidates is echoed.
 * Terminates the script with die() if either SQL query fails.
 *
 * @return ISSN, if it exists, otherwise an empty string
 *
 */
function issn_from_journal_title($journal, $threshold = 0.75)
{
    global $db;
    global $left;
    global $right;
    global $debug;

    $issn = '';
    $journal = trim($journal);

    // First try an exact match
    $sql = 'SELECT * FROM issn WHERE (title = ' . $db->Quote($journal) . ')';
    $result = $db->Execute($sql);
    if ($result == false) {
        die("failed: " . $sql);
    }

    if ($result->NumRows() == 1) {
        return $result->fields['issn'];
    }

    // No unique exact match, try an approximate match

    // Short words are likely to cause problems as people may get them wrong
    // (i.e., "of" and "for"), so drop them along with punctuation.
    $query = $journal;
    $query = str_replace(' of ', ' ', $query);
    $query = str_replace(' for ', ' ', $query);
    $query = preg_replace('/^The /', '', $query);
    $query = str_replace('&', 'and', $query);
    $query = str_replace(array(',', ':', '\'', '.'), '', $query);

    // Put a wildcard after every word so abbreviated words still match
    $query = '%' . str_replace(' ', '% ', $query) . '%';

    $sql = 'SELECT * FROM issn WHERE (title LIKE ' . $db->Quote($query) . ')';
    $result = $db->Execute($sql);
    if ($result == false) {
        die("failed: " . $sql);
    }

    // The query side of the comparison is the same for every row, so prepare it once
    $qStr = str_replace('.', '', $journal);
    $qLen = strlen($qStr);

    // Build results list, collecting the scores alongside for sorting
    $hits = array();
    $scores = array();
    while (!$result->EOF) {
        // Reset the globals before each comparison; $right is read back below as the
        // display form of the hit (presumably filled in by printDiff() — confirm).
        $left = $right = '';

        $hStr = str_replace('.', '', $result->fields['title']);
        $hLen = strlen($hStr);

        $C = LCSLength($qStr, $hStr);
        printDiff($C, $qStr, $hStr, $qLen, $hLen);

        $total = $qLen + $hLen;
        if ($total > 0) {
            $score = 1.0 - ($total - 2 * $C[$qLen][$hLen]) / $total;
        } else {
            // Both strings are empty once periods are removed: there is nothing to
            // compare, so never report a match (and avoid dividing by zero).
            $score = 0.0;
        }

        $hits[] = array(
            'hit' => $result->fields['title'],
            'hitDisplay' => $right,
            'score' => $score,
            'issn' => $result->fields['issn']
        );
        $scores[] = $score;

        $result->MoveNext();
    }

    // Sort hits by descending score
    array_multisort($scores, SORT_NUMERIC, SORT_DESC, $hits);

    if ($debug) {
        echo '<table border="1" cellpadding="2">';
        echo '<tr style="font-family:Arial;font-size:12px;"><th>Journal</th><th>Score</th><th>ISSN</th></tr>';
        foreach ($hits as $hit) {
            echo '<tr style="font-family:Arial;font-size:12px;">';
            echo '<td>';
            echo "<span style=\"background:white;color:black;\">", $hit['hitDisplay'], "</span>";
            echo '</td>';
            echo '<td>';
            echo $hit['score'];
            echo '</td>';
            echo '<td>';
            echo '<a href="http://journalseek.net/cgi-bin/journalseek/journalsearch.cgi?field=issn&query=' . $hit['issn'] . '" target="_blank">' . $hit['issn'] . '</a>';
            echo '</td>';
            echo '</tr>';
        }
        echo '</table>';
    }

    // Do we have a hit (above some threshold)?
    if (count($hits) > 0 && $hits[0]['score'] >= $threshold) {
        $issn = $hits[0]['issn'];
    }

    return $issn;
}
/**
 * @brief Approximate string search for title
 *
 * Assumes n-gram index available for MySQL, see
 * http://iphylo.blogspot.com/2009/10/n-gram-fulltext-indexing-in-mysql.html for details on installing
 * this.
 *
 * The cleaned search string is matched against bhl_title.ShortTitle with a full-text
 * query (at most 10 rows). A row is kept when the value LCSLength() reports for the
 * cleaned hit and the query covers at least threshold percent of the query and at
 * least 33 percent of the hit. Rows are returned in the order the database supplied them.
 *
 * Terminates the script with die() if the SQL query fails.
 *
 * @param str Title string to search for
 * @param threshold Percentage of str that we require to be in longest common subsequence (default is 70%)
 *
 * @return Array of matching titles, together with scores. Each entry has the keys
 *         TitleID, ShortTitle, score (full-text score), sl (percentage of query covered),
 *         subsequence (raw LCSLength value), x (cleaned query) and y (cleaned hit).
 */
function bhl_title_lookup($str, $threshold = 70)
{
    global $db;

    $matches = array();

    $str = clean_string($str);
    $str_length = strlen($str);

    // An empty query cannot match anything, and would divide by zero when the
    // coverage percentages are computed below.
    if ($str_length === 0) {
        return $matches;
    }

    $quoted = $db->qstr($str);
    $sql = 'SELECT TitleID, ShortTitle, MATCH(ShortTitle) AGAINST(' . $quoted . ') AS score FROM bhl_title WHERE MATCH(ShortTitle) AGAINST(' . $quoted . ') LIMIT 10';

    $result = $db->Execute($sql);
    if ($result == false) {
        die("failed [" . __FILE__ . ":" . __LINE__ . "]: " . $sql);
    }

    while (!$result->EOF) {
        $cleaned_hit = clean_string($result->fields['ShortTitle']);
        $cleaned_hit_length = strlen($cleaned_hit);

        // A title that cleans down to nothing cannot be scored (division by zero)
        if ($cleaned_hit_length > 0) {
            $C = LCSLength($cleaned_hit, $str);
            $common_length = $C[$cleaned_hit_length][$str_length];

            // length of subsequence as percentage of query string
            $subsequence_length = round(100.0 * $common_length / $str_length);

            // length of subsequence as percentage of hit
            $hit_subsequence_length = round(100.0 * $common_length / $cleaned_hit_length);

            if ($subsequence_length >= $threshold && $hit_subsequence_length >= 33) {
                $matches[] = array(
                    'TitleID' => $result->fields['TitleID'],
                    'ShortTitle' => $result->fields['ShortTitle'],
                    'score' => $result->fields['score'],
                    'sl' => $subsequence_length,
                    'subsequence' => $common_length,
                    'x' => $str,
                    'y' => $cleaned_hit
                );
            }
        }

        $result->MoveNext();
    }

    return $matches;
}