Ejemplo n.º 1
0
} else {
    echo "<p>Input Kwd string is<br>[" . $_GET['k'] . "]</p>";
    echo "<h3>Checking tokenizer</h3>";
    $tokens = splitKwdString($_GET['k']);
    $nTokens = count($tokens);
    if ($nTokens <= 0) {
        echo "<p>Found no tokens in string???</p>";
    } else {
        echo "<p>Found " . $nTokens . " tokens in string:</p>";
        for ($i = 0; $i < $nTokens; $i++) {
            echo "<p>[<b>" . $tokens[$i] . "</b>]</p>";
        }
    }
    // Now check the n-Gram builder
    echo "<br /><br /><h3>Checking nGram builder</h3>";
    $ngrams = buildNGramsFromTokens($tokens);
    $nNgrams = count($ngrams);
    if ($nNgrams <= 0) {
        echo "<p>Found no nGrams in tokens???</p>";
    } else {
        echo "<p>Found " . $nNgrams . " nGrams:</p>";
        echo "<table><tr><td>Len</td><td>Start</td><td>nGram</td></tr>";
        for ($i = 0; $i < $nNgrams; $i++) {
            echo "<tr><td>" . $ngrams[$i]['len'] . "</td><td>" . $ngrams[$i]['start'] . "</td><td>" . $ngrams[$i]['ngram'] . "</td></tr>";
        }
        echo "</table>";
    }
    /*
     */
}
echo '<hr><p>Enter Keyword: <form method="get">
Ejemplo n.º 2
0
/**
 * Resolve a keyword string to category IDs by matching word n-grams against
 * the tokens stored in the `hooks` table.
 *
 * Strategy: split the input into tokens, build every n-gram over those tokens
 * (a single token is treated as a lone unigram), and fetch all categories
 * whose hook token equals one of the n-grams. Rows come back longest token
 * first (then by object count, descending), so the longest phrase wins. Each
 * time an n-gram is matched, its tokens are consumed and every n-gram that
 * overlaps it is discarded, so that e.g. matching "West African" prevents a
 * later match on "African".
 *
 * Not handled here (noted by the original author as open questions):
 * pruning ascendant/descendant categories, LIKE-style partial matches, and
 * taking facets into account.
 *
 * Relies on the global PEAR-style connection $db and on the helpers
 * splitKwdString(), buildNGramsFromTokens() and requoteMultiTermKwdTokens()
 * defined elsewhere.
 *
 * @param string $kwdStr           Raw keyword string (treated as untrusted).
 * @param bool   $countsWithImages True to rank by `n_matches_w_img`,
 *                                 false to rank by `n_matches`.
 *
 * @return array Associative array with keys:
 *               'cats'  => list of matched category IDs, in match order;
 *               'kwds'  => result of requoteMultiTermKwdTokens() applied to
 *                          the tokens that were NOT matched to a category
 *                          (or to all tokens on failure);
 *               'msg'   => human-readable trace / error text (contains HTML
 *                          line breaks);
 *               'query' => the SQL that was built (debug aid only), or null
 *                          if no query was built.
 */
function getCategoryIDsForKwds($kwdStr, $countsWithImages)
{
    global $db;
    $countCol = $countsWithImages ? "n_matches_w_img" : "n_matches";
    $retval = array('cats' => array(), 'kwds' => array(), 'msg' => null, 'query' => null);

    // Split on the quotes, then spaces to get token/words
    $tokens = splitKwdString($kwdStr);
    $nTokens = count($tokens);
    if ($nTokens <= 0) {
        $retval['msg'] = 'No tokens, so just returning nothing!';
        return $retval;
    }

    // Every n-gram carries its text, its start index into $tokens and its
    // length in tokens; unigrams are included so one loop handles everything.
    $ngrams = array();
    if ($nTokens == 1) {
        // Only one word: build the single unigram directly.
        $ngrams[] = array('len' => 1, 'start' => 0, 'ngram' => strtolower($tokens[0]));
        $nNgrams = 1;
    } else {
        $ngrams = buildNGramsFromTokens($tokens);
        $nNgrams = count($ngrams);
        if ($nNgrams <= 0) {
            $retval['kwds'] = requoteMultiTermKwdTokens($tokens);
            $retval['msg'] = 'Could not create nGrams from tokens???';
            return $retval;
        }
    }

    // Build one equality test per n-gram. The n-gram text originates from
    // user input, so it must go through the driver's quoting (which both
    // escapes the value and wraps it in quotes) rather than being pasted
    // between hand-written quote characters.
    // A prepared statement would be cleaner still - TODO.
    $tokenClauses = array();
    for ($i = 0; $i < $nNgrams; $i++) {
        $quoted = $db->quote($ngrams[$i]['ngram'], 'text');
        if (PEAR::isError($quoted)) {
            error_log("getCategoryIDsForKwds() Quoting error: " . $quoted->getMessage());
            // Fall back to just returning the keywords as input.
            $retval['kwds'] = requoteMultiTermKwdTokens($tokens);
            $retval['msg'] = "getCategoryIDsForKwds() Quoting error: " . $quoted->getMessage();
            return $retval;
        }
        $tokenClauses[] = "hk.token=" . $quoted;
    }

    // We sort by token length to pull out longest n-grams.
    // We use a secondary sort on count, so if we get multiple matches, we
    // choose the category with the most associated objects.
    $tqCatsForKwds = "select c.id cid, c.parent_id pid, c.facet_id fid,"
        . " LOWER(hk.token) token, CHAR_LENGTH(hk.token) as tlen,"
        . " c." . $countCol . " count from categories c, hooks hk"
        . " where c.id=hk.cat_id AND ("
        . implode(" OR ", $tokenClauses)
        . ") ORDER BY tlen desc, count desc";

    // Only for debug!!!
    $retval['query'] = $tqCatsForKwds;

    $catsresult = $db->query($tqCatsForKwds);
    if (PEAR::isError($catsresult)) {
        error_log("getCategoryIDsForKwds() Query error: " . $catsresult->getMessage());
        error_log("getCategoryIDsForKwds() Query : " . $tqCatsForKwds);
        // Fall back to just returning the keywords as input.
        $retval['kwds'] = requoteMultiTermKwdTokens($tokens);
        $retval['msg'] = "getCategoryIDsForKwds() Query error: " . $catsresult->getMessage();
        return $retval;
    }

    $catsFound = array();
    $retval['msg'] .= "Query returned " . $catsresult->numRows() . " rows...";
    while ($row = $catsresult->fetchRow()) {
        // Make sure the row's token is still among the n-grams under
        // consideration. This rejects sub-phrases of an already matched
        // phrase and repeat rows for an already matched token (first wins).
        $iMatch = -1;
        for ($i = 0; $i < $nNgrams; $i++) {
            if (!strcmp($ngrams[$i]['ngram'], $row['token'])) {
                $iMatch = $i;
                break;
            }
        }
        if ($iMatch < 0) {
            $retval['msg'] .= "<br />No nGram match for row with token:" . $row['token'] . " cat:" . $row['cid'];
            continue;
        }

        $retval['msg'] .= "<br />nGram match for row with token:" . $row['token'] . " cat:" . $row['cid'];
        $catsFound[] = $row['cid'];

        // Token index range [matchStart, matchEnd] covered by the match.
        $matchStart = $ngrams[$iMatch]['start'];
        $matchEnd = $matchStart + $ngrams[$iMatch]['len'] - 1;

        // Consume the matched tokens; whatever is left at the end is
        // returned to the caller as plain keywords.
        for ($i = $matchStart; $i <= $matchEnd; $i++) {
            unset($tokens[$i]);
        }
        if (count($tokens) == 0) {
            $retval['msg'] .= "<br />Matched all tokens to cats";
            break;
        }

        // Keep only the n-grams that share no token with the match. Two
        // index ranges intersect exactly when each starts no later than the
        // other ends; this also catches an n-gram that strictly contains
        // the matched range, which a start-or-end-inside test would miss.
        $newNGrams = array();
        for ($i = 0; $i < $nNgrams; $i++) {
            if ($i == $iMatch) {
                continue;
            }
            $currStart = $ngrams[$i]['start'];
            $currEnd = $currStart + $ngrams[$i]['len'] - 1;
            if ($currStart <= $matchEnd && $currEnd >= $matchStart) {
                continue;
            }
            $newNGrams[] = $ngrams[$i];
        }
        $ngrams = $newNGrams;
        $nNgrams = count($newNGrams);

        // If we've covered all the n-grams, then there's no point
        // considering more of the query results.
        if ($nNgrams <= 0) {
            break;
        }
    }
    // The loop frequently exits early, so release the result set explicitly.
    $catsresult->free();

    $retval['cats'] = $catsFound;
    // unset() left holes in $tokens; collapse them and re-index.
    $retval['kwds'] = requoteMultiTermKwdTokens(array_values($tokens));
    return $retval;
}