/**
 * Reconcile one query string against the BioStor search API.
 *
 * The query text is stripped of colons and double quotes and sent to the
 * BioStor search endpoint. Each of the top-ranked rows in the response is
 * compared with the query using the longest common subsequence of the two
 * fingerprinted strings; the score is the subsequence length divided by the
 * longer of the two strings. Rows scoring above the threshold are handed to
 * $this->StoreHit().
 *
 * @param mixed  $query_key  Identifier of the query, passed through to StoreHit().
 * @param string $text       Text to search for.
 * @param int    $limit      Accepted for interface compatibility; not used here.
 * @param mixed  $properties Accepted for interface compatibility; not used here.
 *
 * @return void
 */
function OneQuery($query_key, $text, $limit = 1, $properties = null)
{
    // How many of the top-ranked rows are examined, and the minimum score
    // a row needs to be stored as a hit.
    $max_rows  = 3;
    $min_score = 0.8;

    // clean text
    $text = str_replace(array(':', '"'), '', $text);

    // BioStor search API
    $url = 'http://biostor.org/api.php?q=' . urlencode($text);

    $json = get($url);
    if (!is_string($json) || $json === '') {
        return;
    }

    $obj = json_decode($json);
    if (!is_object($obj) || !isset($obj->rows) || !is_array($obj->rows)) {
        return;
    }

    // The query fingerprint is the same for every row, so compute it once.
    $v1   = finger_print($text);
    $len1 = strlen($v1);
    if ($len1 == 0) {
        // Nothing to compare against (and the score would divide by zero).
        return;
    }

    // The response may hold fewer than $max_rows rows; only visit rows that exist.
    foreach (array_slice($obj->rows, 0, $max_rows) as $row) {
        // Skip rows lacking the fields needed to score and identify a hit.
        if (!isset($row->id) || !isset($row->fields->default)) {
            continue;
        }

        $v2   = finger_print($row->fields->default);
        $len2 = strlen($v2);
        if ($len2 == 0) {
            continue;
        }

        $lcs = new LongestCommonSequence($v1, $v2);
        $d   = $lcs->score();

        // Taking the smaller ratio means the common subsequence must cover
        // most of BOTH strings, not just the shorter one.
        $score = min($d / $len1, $d / $len2);

        if ($score > $min_score) {
            $hit        = new stdclass();
            $hit->id    = str_replace('biostor/', '', $row->id);
            $hit->name  = isset($row->doc->title) ? $row->doc->title : '';
            $hit->score = $score;
            $hit->match = true;

            $this->StoreHit($query_key, $hit);
        }
    }
}
/**
 * Group similar strings into clusters.
 *
 * Each string is trimmed and passed through finger_print(). Every pair of
 * cleaned strings is then compared; pairs that match are linked in an
 * adjacency matrix, and the components returned by get_components() for
 * that matrix become the clusters.
 *
 * Two cleaned strings are linked when either:
 *  - their longest common substring has length >= 4, is not purely numeric
 *    (e.g. just a date), and their longest common subsequence covers at
 *    least 70% of the length of either string; or
 *  - their longest common substring is shorter than 4 and one string is
 *    contained in the other (e.g. an abbreviation).
 *
 * @param array $strings Strings to cluster. Keys may be arbitrary; they are
 *                       reported back as member ids.
 *
 * @return stdClass Object with two properties:
 *                  - graph:    DOT source ("graph { ... }") listing one node per
 *                              string and one edge per linked pair;
 *                  - clusters: array of clusters, each an array of objects with
 *                              `id` (key in $strings) and `string` (cleaned string).
 */
function cluster($strings)
{
    $result = new stdclass();

    // clean -- iterate by key so input that is not a 0..n-1 list is handled too
    foreach ($strings as $k => $v) {
        $strings[$k] = finger_print(trim($v));
    }

    $n = count($strings);

    // Position (0..n-1) => original key of each string
    $inv_map = array();
    foreach ($strings as $k => $v) {
        $inv_map[] = $k;
    }

    // Create adjacency matrix and fill with 0's
    $X = array();
    for ($i = 0; $i < $n; $i++) {
        $X[$i] = array();
        for ($j = 0; $j < $n; $j++) {
            $X[$i][$j] = 0;
        }
    }

    $nodes = '';
    $edges = '';

    // Compare names using approximate string matching
    for ($i = 0; $i < $n; $i++) {
        $v1 = $strings[$inv_map[$i]];

        $nodes .= "node {$i} [label=\"" . $v1 . "\"];\n";

        // Only the upper triangle is visited; links are stored symmetrically.
        for ($j = $i + 1; $j < $n; $j++) {
            $v2 = $strings[$inv_map[$j]];

            // Find longest common subsequence for this pair of cleaned names
            $lcs = new LongestCommonSequence($v1, $v2);
            $d   = $lcs->score();

            // Filter by longest common substring (to ensure we have a "meaningful"
            // match), that is, so that we avoid subsequences that have little continuity
            $str   = '';
            $lcstr = LongestCommonSubstring($v1, $v2, $str);

            $linked = false;

            if ($lcstr >= 4) {
                // Ignore matches just on date, we want more than that
                if (!is_numeric(trim($str))) {
                    // Accept if the longest common subsequence is at least 70%
                    // of the length of either string.
                    if ($d / strlen($v1) >= 0.7 || $d / strlen($v2) >= 0.7) {
                        $linked = true;
                    }
                }
            } elseif ($v1 !== '' && $v2 !== '') {
                // Only a short match: accept it if one string is contained in
                // the other (e.g. an abbreviation). Empty strings are excluded
                // because strpos() with an empty needle warns on PHP 7 and
                // matches at offset 0 on PHP 8, which would link an empty
                // string to every other string.
                if (strlen($v1) == $d) {
                    $linked = (strpos($v2, $v1, 0) !== false);
                } else {
                    $linked = (strpos($v1, $v2, 0) !== false);
                }
            }

            if ($linked) {
                $X[$i][$j] = 1;
                $X[$j][$i] = 1;
                $edges .= $i . " -- " . $j . " [label=\"" . $lcstr . "\"];\n";
            }
        }
    }

    $result->graph = "graph {\n" . $nodes . $edges . "}\n";

    // Get components of adjacency matrix
    $components = get_components($X);

    $result->clusters = array();
    foreach ($components as $component) {
        $cluster = array();

        foreach ($component as $v) {
            $member         = new stdclass();
            $member->id     = $inv_map[$v];
            $member->string = $strings[$inv_map[$v]];

            $cluster[] = $member;
        }

        $result->clusters[] = $cluster;
    }

    return $result;
}
/* Tail of a method whose beginning lies before this chunk: it wraps the
   accumulated left/right strings in a <div> and returns the markup. */
        $html = '<div>' . $this->left . '<br/>' . $this->right . '</div>';
        return $html;
    }

    /**
     * Recursively build the two display strings $this->left and $this->right
     * for the sequences $X and $Y.
     *
     * Each call first recurses towards smaller indices and only then appends,
     * so the output is assembled from the start of the sequences to the end.
     *
     * - If $X[$i - 1] equals $Y[$j - 1], the element is appended to both sides
     *   wrapped in a green-highlighted <span>.
     * - Otherwise an element of $Y is appended to the right side only, or an
     *   element of $X to the left side only, depending on the comparison of
     *   $C[$i][$j - 1] with $C[$i - 1][$j].
     *
     * NOTE(review): elements are concatenated into HTML without escaping;
     * confirm the inputs can never contain '<' or '&'.
     *
     * @param array        $C Table indexed as $C[$i][$j]; presumably the
     *                        longest-common-subsequence length table for
     *                        $X and $Y -- confirm against the caller.
     * @param string|array $X First sequence (indexed from 0).
     * @param string|array $Y Second sequence (indexed from 0).
     * @param int          $i Number of leading elements of $X still to process.
     * @param int          $j Number of leading elements of $Y still to process.
     */
    function printDiff($C, $X, $Y, $i, $j) {
        if ($i > 0 and $j > 0 and $X[$i - 1] == $Y[$j - 1]) {
            // Element common to both sequences: emit everything before it
            // first, then highlight it on both sides.
            $this->printDiff($C, $X, $Y, $i - 1, $j - 1);
            $this->left .= "<span style=\"background:rgb(100,255,100);color:black;\">" . $X[$i - 1] . "</span>";
            // $X[$i - 1] and $Y[$j - 1] compare equal in this branch, so
            // using $X on the right side is equivalent.
            $this->right .= "<span style=\"background:rgb(100,255,100);color:black;\">" . $X[$i - 1] . "</span>";
        } else {
            if ($j > 0 and ($i == 0 or $C[$i][$j - 1] >= $C[$i - 1][$j])) {
                // Element present only in $Y: goes to the right side, unhighlighted.
                $this->printDiff($C, $X, $Y, $i, $j - 1);
                $this->right .= $Y[$j - 1];
            } else {
                if ($i > 0 and ($j == 0 or $C[$i][$j - 1] < $C[$i - 1][$j])) {
                    // Element present only in $X: goes to the left side, unhighlighted.
                    $this->printDiff($C, $X, $Y, $i - 1, $j);
                    $this->left .= $X[$i - 1];
                }
                // When both $i and $j are 0 nothing is emitted and the recursion ends.
            }
        }
    }
}

// Test script: compares two sample strings and prints the score followed by
// the output of display().
// NOTE(review): this is top-level code, so it produces output whenever the
// file is loaded -- confirm that is intended.
$s1 = 'hello blue marine';
$s2 = 'yellow blue submarine';
$lcs = new LongestCommonSequence($s1, $s2);
echo $lcs->score();
echo $lcs->display();