PHP splitKwdString Examples

Programming Language: PHP

Method/Function: splitKwdString

Examples at hotexamples.com: 2

PHP splitKwdString - 2 examples found. These are the top-rated real-world PHP examples of splitKwdString, extracted from open source projects. You can rate the examples to help us improve their quality.

Example #1

Show file

File: testOntoUtils.php Project: khadem/delphi-museum-project

<?php

require_once "../libs/ontology/onto_utils.php";
?>
<HTML>
<BODY>
<?php 
if (empty($_GET['k'])) {
    echo "<h3>You have to specify a keyword string in the 'k' param</h3>";
} else {
    echo "<p>Input Kwd string is<br>[" . $_GET['k'] . "]</p>";
    echo "<h3>Checking tokenizer</h3>";
    $tokens = splitKwdString($_GET['k']);
    $nTokens = count($tokens);
    if ($nTokens <= 0) {
        echo "<p>Found no tokens in string???</p>";
    } else {
        echo "<p>Found " . $nTokens . " tokens in string:</p>";
        for ($i = 0; $i < $nTokens; $i++) {
            echo "<p>[<b>" . $tokens[$i] . "</b>]</p>";
        }
    }
    // Now check the n-Gram builder
    echo "<br /><br /><h3>Checking nGram builder</h3>";
    $ngrams = buildNGramsFromTokens($tokens);
    $nNgrams = count($ngrams);
    if ($nNgrams <= 0) {
        echo "<p>Found no nGrams in tokens???</p>";
    } else {
        echo "<p>Found " . $nNgrams . " nGrams:</p>";
        echo "<table><tr><td>Len</td><td>Start</td><td>nGram</td></tr>";

Example #2

Show file

File: ontoServices.php Project: khadem/delphi-museum-project

/**
 * Resolve a free-text keyword string to category IDs via the hooks table.
 *
 * The string is tokenized with splitKwdString(), expanded into n-grams, and
 * every n-gram is looked up against hooks.token. Rows come back longest token
 * first (ties broken by object count), so a multi-word match such as
 * "West African" wins over its sub-phrase "African". Each matched n-gram
 * consumes its tokens; tokens that never matched are handed back as keywords.
 *
 * Ascendant/descendant pruning of the resulting categories and facet handling
 * are not done here.
 *
 * @param string $kwdStr           Raw keyword string (may contain quoted phrases).
 * @param bool   $countsWithImages When true, rank by categories.n_matches_w_img,
 *                                 otherwise by categories.n_matches.
 *
 * @return array Associative array with keys:
 *               'cats'  => list of matched category ids,
 *               'kwds'  => requoteMultiTermKwdTokens() of the unmatched tokens,
 *               'msg'   => debug trace (contains "<br />" markup) or null,
 *               'query' => the SQL text that was run (debug only) or null.
 */
function getCategoryIDsForKwds($kwdStr, $countsWithImages)
{
    global $db;
    $countCol = $countsWithImages ? "n_matches_w_img" : "n_matches";
    $retval = array('cats' => array(), 'kwds' => array(), 'msg' => null, 'query' => null);

    // Split on the quotes, then spaces to get token/words.
    $tokens = splitKwdString($kwdStr);
    $nTokens = count($tokens);
    if ($nTokens <= 0) {
        $retval['msg'] = 'No tokens, so just returning nothing!';
        return $retval;
    }

    // Each n-gram carries its text plus the start index and length (in tokens)
    // so that overlapping n-grams can be discarded once one of them matches.
    if ($nTokens == 1) {
        // A single word is its own (and only) n-gram.
        $ngrams = array(array('len' => 1, 'start' => 0, 'ngram' => strtolower($tokens[0])));
    } else {
        $ngrams = buildNGramsFromTokens($tokens);
    }
    $nNgrams = count($ngrams);
    if ($nNgrams <= 0) {
        $retval['kwds'] = requoteMultiTermKwdTokens($tokens);
        $retval['msg'] = 'Could not create nGrams from tokens???';
        return $retval;
    }

    // The n-grams derive from user input, so every value is quoted by the
    // database layer before it is placed in the SQL text.
    // NOTE(review): assumes $db is a PEAR MDB2 (or DB) connection, both of
    // which expose quote() returning the value wrapped in quotes - confirm.
    $tokenTests = array();
    foreach ($ngrams as $ngram) {
        $quoted = $db->quote($ngram['ngram'], 'text');
        if (PEAR::isError($quoted)) {
            error_log("getCategoryIDsForKwds() Quote error: " . $quoted->getMessage());
            $retval['kwds'] = requoteMultiTermKwdTokens($tokens);
            $retval['msg'] = "getCategoryIDsForKwds() Quote error: " . $quoted->getMessage();
            return $retval;
        }
        $tokenTests[] = "hk.token=" . $quoted;
    }

    // Sort by token length to pull out the longest n-grams first; the secondary
    // sort on count prefers the category with the most associated objects when
    // several categories share a token.
    $tqCatsForKwds = "select c.id cid, c.parent_id pid, c.facet_id fid,"
        . " LOWER(hk.token) token, CHAR_LENGTH(hk.token) as tlen,"
        . " c." . $countCol . " count from categories c, hooks hk"
        . " where c.id=hk.cat_id AND ("
        . implode(" OR ", $tokenTests)
        . ") ORDER BY tlen desc, count desc";

    // Only for debug!!!
    $retval['query'] = $tqCatsForKwds;

    // Plain assignment: taking a reference to a function result raises a
    // notice on PHP 5+ and objects are passed by handle anyway.
    $catsresult = $db->query($tqCatsForKwds);
    if (PEAR::isError($catsresult)) {
        error_log("getCategoryIDsForKwds() Query error: " . $catsresult->getMessage());
        error_log("getCategoryIDsForKwds() Query : " . $tqCatsForKwds);
        // Fall back to just returning the keywords as input.
        $retval['kwds'] = requoteMultiTermKwdTokens($tokens);
        $retval['msg'] = "getCategoryIDsForKwds() Query error: " . $catsresult->getMessage();
        return $retval;
    }

    $catsFound = array();
    $retval['msg'] .= "Query returned " . $catsresult->numRows() . " rows...";
    while ($row = $catsresult->fetchRow()) {
        // Only accept the row if its token is still among the live n-grams.
        // This keeps "African" from matching after "West African" already did,
        // and drops repeat rows for the same token (the first one wins).
        $iMatch = -1;
        for ($i = 0; $i < $nNgrams; $i++) {
            if (strcmp($ngrams[$i]['ngram'], $row['token']) === 0) {
                $iMatch = $i;
                break;
            }
        }
        if ($iMatch < 0) {
            $retval['msg'] .= "<br />No nGram match for row with token:" . $row['token'] . " cat:" . $row['cid'];
            continue;
        }

        $retval['msg'] .= "<br />nGram match for row with token:" . $row['token'] . " cat:" . $row['cid'];
        $catsFound[] = $row['cid'];

        // Token index range (inclusive) covered by the matched n-gram.
        $matchStart = $ngrams[$iMatch]['start'];
        $matchEnd = $matchStart + $ngrams[$iMatch]['len'] - 1;

        // The matched words are now explained by a category, so they are no
        // longer plain keywords.
        for ($i = $matchStart; $i <= $matchEnd; $i++) {
            unset($tokens[$i]);
        }
        if (count($tokens) == 0) {
            $retval['msg'] .= "<br />Matched all tokens to cats";
            break;
        }

        // Keep only n-grams that share no token with the match. Two inclusive
        // ranges intersect exactly when each starts no later than the other
        // ends; this also drops the matched n-gram itself and any longer
        // n-gram that fully contains it.
        $remaining = array();
        foreach ($ngrams as $candidate) {
            $currStart = $candidate['start'];
            $currEnd = $currStart + $candidate['len'] - 1;
            if ($currStart <= $matchEnd && $currEnd >= $matchStart) {
                continue;
            }
            $remaining[] = $candidate;
        }
        $ngrams = $remaining;
        $nNgrams = count($ngrams);

        // Nothing left to match, so the remaining rows are irrelevant.
        if ($nNgrams <= 0) {
            break;
        }
    }

    $retval['cats'] = $catsFound;
    // Collapse the holes left by unset() and re-index before requoting.
    $retval['kwds'] = requoteMultiTermKwdTokens(array_values($tokens));
    return $retval;
}