<?php require_once "../libs/ontology/onto_utils.php"; ?> <HTML> <BODY> <?php if (empty($_GET['k'])) { echo "<h3>You have to specify a keyword string in the 'k' param</h3>"; } else { echo "<p>Input Kwd string is<br>[" . $_GET['k'] . "]</p>"; echo "<h3>Checking tokenizer</h3>"; $tokens = splitKwdString($_GET['k']); $nTokens = count($tokens); if ($nTokens <= 0) { echo "<p>Found no tokens in string???</p>"; } else { echo "<p>Found " . $nTokens . " tokens in string:</p>"; for ($i = 0; $i < $nTokens; $i++) { echo "<p>[<b>" . $tokens[$i] . "</b>]</p>"; } } // Now check the n-Gram builder echo "<br /><br /><h3>Checking nGram builder</h3>"; $ngrams = buildNGramsFromTokens($tokens); $nNgrams = count($ngrams); if ($nNgrams <= 0) { echo "<p>Found no nGrams in tokens???</p>"; } else { echo "<p>Found " . $nNgrams . " nGrams:</p>"; echo "<table><tr><td>Len</td><td>Start</td><td>nGram</td></tr>";
/**
 * Resolve a free-text keyword string to category IDs by matching its word
 * n-grams against the tokens stored in the `hooks` table.
 *
 * Strategy:
 *  - Tokenize the keyword string (splitKwdString) and build n-grams from the
 *    tokens (buildNGramsFromTokens); a single token becomes one unigram.
 *  - Query every category whose hook token equals one of the n-grams, ordered
 *    longest token first, then by descending object count.
 *  - Walk the rows in that order. The first row whose token is still in the
 *    candidate n-gram list claims that n-gram: its category is recorded, the
 *    tokens it covers are dropped from the keyword list, and every n-gram
 *    overlapping it is removed so that e.g. "West African" does not also
 *    match "African".
 *
 * Ascendant/descendant pruning and facets are not considered here.
 *
 * @param string $kwdStr           Raw keyword string (may contain quoted phrases).
 * @param bool   $countsWithImages When true, rank by the `n_matches_w_img`
 *                                 column; otherwise by `n_matches`.
 *
 * @return array Associative array with keys:
 *               'cats'  => list of matched category IDs,
 *               'kwds'  => tokens not consumed by a category match, passed
 *                          through requoteMultiTermKwdTokens(),
 *               'msg'   => diagnostic text (contains HTML line breaks), or null,
 *               'query' => the SQL that was built (debug only), or null.
 */
function getCategoryIDsForKwds($kwdStr, $countsWithImages) {
    // NOTE(review): $db is assumed to be a PEAR DB/MDB2 connection (the code
    // relies on PEAR::isError(), query(), fetchRow(), numRows() and quote()).
    global $db;

    // The column name is chosen from two fixed literals, never from input.
    $countCol = $countsWithImages ? "n_matches_w_img" : "n_matches";
    $retval = array('cats' => array(), 'kwds' => array(), 'msg' => null, 'query' => null);

    // Split on the quotes, then spaces to get token/words
    $tokens = splitKwdString($kwdStr);

    // Double check error condition - no params in.
    $nTokens = count($tokens);
    if ($nTokens <= 0) {
        $retval['msg'] = 'No tokens, so just returning nothing!';
        return $retval;
    }

    // Each n-gram item carries its text ('ngram'), the index of its first
    // token ('start') and the number of tokens it spans ('len').
    if ($nTokens == 1) {
        // Only one word: build the single unigram item directly.
        $ngrams = array(
            array('len' => 1, 'start' => 0, 'ngram' => strtolower($tokens[0])),
        );
    } else {
        $ngrams = buildNGramsFromTokens($tokens);
    }
    $nNgrams = count($ngrams);
    if ($nNgrams <= 0) {
        // Nothing to match on: hand the keywords back untouched.
        $retval['kwds'] = requoteMultiTermKwdTokens($tokens);
        $retval['msg'] = 'Could not create nGrams from tokens???';
        return $retval;
    }

    // The n-gram text originates from user input, so every value is passed
    // through the driver's quote() (adds the surrounding quotes and escapes
    // the content) instead of being concatenated raw into the SQL.
    $tokenTests = array();
    for ($i = 0; $i < $nNgrams; $i++) {
        $tokenTests[] = "hk.token=" . $db->quote((string) $ngrams[$i]['ngram'], 'text');
    }

    // We sort by token length to pull out longest n-grams.
    // We use a secondary sort on count, so if we get multiple matches, we
    // choose the category with the most associated objects.
    $tqCatsForKwds = "select c.id cid, c.parent_id pid, c.facet_id fid,"
        . " LOWER(hk.token) token, CHAR_LENGTH(hk.token) as tlen,"
        . " c." . $countCol . " count from categories c, hooks hk"
        . " where c.id=hk.cat_id AND ("
        . implode(" OR ", $tokenTests)
        . ") ORDER BY tlen desc, count desc";

    // Only for debug!!!
    $retval['query'] = $tqCatsForKwds;

    // Plain assignment: assigning a function result by reference raises a
    // notice and is unnecessary for object handles.
    $catsresult = $db->query($tqCatsForKwds);
    if (PEAR::isError($catsresult)) {
        error_log("getCategoryIDsForKwds() Query error: " . $catsresult->getMessage());
        error_log("getCategoryIDsForKwds() Query : " . $tqCatsForKwds);
        // Fall back to just returning the keywords as input.
        $retval['kwds'] = requoteMultiTermKwdTokens($tokens);
        $retval['msg'] = "getCategoryIDsForKwds() Query error: " . $catsresult->getMessage();
        return $retval;
    }

    $catsFound = array();
    $retval['msg'] .= "Query returned " . $catsresult->numRows() . " rows...";

    // NOTE(review): rows are read by column name, so the connection is
    // presumably configured for associative fetch mode - confirm.
    while ($row = $catsresult->fetchRow()) {
        // Make sure the row's token is still among the candidate n-grams.
        // This also filters out multiple matches of a given token
        // (preferring the first, i.e. the highest-count category).
        $iMatch = -1;
        for ($i = 0; $i < $nNgrams; $i++) {
            if ((string) $ngrams[$i]['ngram'] === (string) $row['token']) {
                $iMatch = $i;
                break;
            }
        }

        if ($iMatch < 0) {
            $retval['msg'] .= "<br />No nGram match for row with token:" . $row['token'] . " cat:" . $row['cid'];
            continue;
        }

        $retval['msg'] .= "<br />nGram match for row with token:" . $row['token'] . " cat:" . $row['cid'];
        // We found a match. Add the category to the cats list
        $catsFound[] = $row['cid'];

        $matchStart = $ngrams[$iMatch]['start'];
        $matchEnd = $matchStart + $ngrams[$iMatch]['len'] - 1;

        // Trim the tokens covered by this n-gram from the tokens list.
        for ($i = $matchStart; $i <= $matchEnd; $i++) {
            unset($tokens[$i]);
        }
        if (count($tokens) == 0) {
            $retval['msg'] .= "<br />Matched all tokens to cats";
            break; // We're done
        }

        // Keep only the n-grams that share no token with the matched one.
        // Two inclusive ranges intersect exactly when each starts no later
        // than the other ends; this covers the matched n-gram itself,
        // partial overlaps, and n-grams that fully contain the match.
        $remaining = array();
        for ($i = 0; $i < $nNgrams; $i++) {
            $currStart = $ngrams[$i]['start'];
            $currEnd = $currStart + $ngrams[$i]['len'] - 1;
            if ($currStart <= $matchEnd && $currEnd >= $matchStart) {
                continue;
            }
            $remaining[] = $ngrams[$i];
        }
        $ngrams = $remaining;
        $nNgrams = count($ngrams);

        // If we've covered all the n-grams, then there's no point considering
        // more of the query results.
        if ($nNgrams <= 0) {
            break;
        }
    }

    $retval['cats'] = $catsFound;
    // unset() leaves gaps in the keys: collapse them and re-index.
    $retval['kwds'] = requoteMultiTermKwdTokens(array_values($tokens));
    return $retval;
}