/**
 * Search BioStor for $text and store every sufficiently similar result as a hit.
 *
 * Colons and double quotes are stripped from $text, the cleaned text is sent
 * to the BioStor search API, and at most the first three returned rows are
 * compared with the query. The comparison uses LongestCommonSequence::score()
 * on the finger_print() of the query and of the row's `fields->default`
 * value; the score is divided by the length of each fingerprint and the
 * smaller ratio is kept. Rows whose ratio exceeds 0.8 are handed to
 * $this->StoreHit().
 *
 * @param mixed  $query_key  Passed through unchanged to StoreHit() with each hit.
 * @param string $text       Text to search for.
 * @param int    $limit      Accepted for interface compatibility; not used here
 *                           (up to three rows are always examined).
 * @param mixed  $properties Accepted for interface compatibility; not used here.
 *
 * @return void
 */
function OneQuery($query_key, $text, $limit = 1, $properties = null)
{
    // Number of leading result rows that are compared with the query.
    $max_rows = 3;

    // Clean text
    $text = str_replace(':', '', $text);
    $text = str_replace('"', '', $text);

    // BioStor search API
    $url = 'http://biostor.org/api.php?q=' . urlencode($text);
    $json = get($url);

    if ($json == '') {
        return;
    }

    $obj = json_decode($json);
    if (!isset($obj->rows) || !is_array($obj->rows)) {
        return;
    }

    // The query fingerprint does not depend on the row, so compute it once.
    $v1 = finger_print($text);
    $len1 = strlen($v1);
    if ($len1 == 0) {
        // Nothing to compare against; also avoids dividing by zero below.
        return;
    }

    // Only look at rows that actually exist: the API may return fewer than
    // $max_rows results.
    foreach (array_slice($obj->rows, 0, $max_rows) as $row) {
        if (!isset($row->fields->default)) {
            continue;
        }

        $v2 = finger_print($row->fields->default);
        $len2 = strlen($v2);
        if ($len2 == 0) {
            continue;
        }

        $lcs = new LongestCommonSequence($v1, $v2);
        $d = $lcs->score();

        // Require the common sequence to cover most of BOTH strings.
        $score = min($d / $len1, $d / $len2);

        if ($score > 0.8) {
            $hit = new stdClass();
            $hit->id = str_replace('biostor/', '', $row->id);
            $hit->name = $row->doc->title;
            $hit->score = $score;
            $hit->match = true;

            $this->StoreHit($query_key, $hit);
        }
    }
}
/**
 * Group strings into clusters of approximately matching names.
 *
 * Each string is trimmed and passed through finger_print(). Every pair of
 * cleaned strings is then compared, and matching pairs are linked in an
 * adjacency matrix. The connected components of that matrix, as returned by
 * get_components(), become the clusters.
 *
 * Two strings are linked when either:
 *  - their longest common substring has length >= 4, is not purely numeric
 *    (e.g. just a date), and the LongestCommonSequence score is at least 70%
 *    of the length of either string; or
 *  - their longest common substring is shorter than 4 and one string is
 *    contained in the other (e.g. an abbreviation).
 *
 * @param array $strings Strings to cluster. Keys may be arbitrary; they are
 *                       reported back as the member ids.
 *
 * @return stdClass Object with two properties:
 *                  - graph:    text of the form "graph { ... }" listing one node
 *                              per string and one edge per linked pair;
 *                  - clusters: array of clusters, each an array of objects with
 *                              `id` (original key) and `string` (cleaned string).
 */
function cluster($strings)
{
    $result = new stdClass();

    // Clean every string under its own key, so that arrays with string or
    // non-contiguous keys are handled the same way as plain lists.
    foreach ($strings as $key => $value) {
        $strings[$key] = finger_print(trim($value));
    }

    $n = count($strings);

    // Position <-> key mapping: position $i in the matrix corresponds to
    // original key $keys[$i] and cleaned string $values[$i].
    $keys = array_keys($strings);
    $values = array_values($strings);

    // Create adjacency matrix and fill with 0's
    $X = array();
    for ($i = 0; $i < $n; $i++) {
        $X[$i] = array();
        for ($j = 0; $j < $n; $j++) {
            $X[$i][$j] = 0;
        }
    }

    $nodes = '';
    $edges = '';

    // Compare names using approximate string matching (each unordered pair once)
    for ($i = 0; $i < $n; $i++) {
        $v1 = $values[$i];
        $nodes .= "node {$i} [label=\"" . $v1 . "\"];\n";

        for ($j = $i + 1; $j < $n; $j++) {
            $v2 = $values[$j];

            // Find longest common subsequence for this pair of cleaned names
            $lcs = new LongestCommonSequence($v1, $v2);
            $d = $lcs->score();

            // Filter by longest common substring (to ensure we have a "meaningful"
            // match), that is, so that we avoid subsequences that have little
            // continuity. $str is read back after the call, so the helper
            // presumably fills it by reference with the substring itself.
            $str = '';
            $lcstr = LongestCommonSubstring($v1, $v2, $str);

            $linked = false;

            if ($lcstr >= 4) {
                // Ignore matches just on date, we want more than that
                if (!is_numeric(trim($str))) {
                    // Accept if the longest common subsequence is at least 70%
                    // of the length of either string.
                    $linked = ($d / strlen($v1) >= 0.7 || $d / strlen($v2) >= 0.7);
                }
            } elseif (strlen($v1) > 0 && strlen($v2) > 0) {
                // Just a short match: accept it only if one string occurs
                // inside the other (e.g., an abbreviation). Empty strings are
                // excluded because strpos() with an empty needle reports a
                // match on PHP 8, which would link an empty string to
                // everything.
                if (strlen($v1) == $d) {
                    $linked = (strpos($v2, $v1, 0) !== false);
                } else {
                    $linked = (strpos($v1, $v2, 0) !== false);
                }
            }

            if ($linked) {
                $X[$i][$j] = 1;
                $X[$j][$i] = 1;
                $edges .= $i . " -- " . $j . " [label=\"" . $lcstr . "\"];\n";
            }
        }
    }

    $result->graph = "graph {\n" . $nodes . $edges . "}\n";

    // Get components of adjacency matrix
    $components = get_components($X);

    $result->clusters = array();
    foreach ($components as $component) {
        $cluster = array();
        foreach ($component as $position) {
            $member = new stdClass();
            $member->id = $keys[$position];
            $member->string = $values[$position];
            $cluster[] = $member;
        }
        $result->clusters[] = $cluster;
    }

    return $result;
}