Example #1
0
/**
 * Pick a low-scoring run of consecutive words from $inputText.
 *
 * The text is normalised with standardizeText() and split on single spaces.
 * Every window of $substringLength consecutive words is scored by summing the
 * per-word values returned by scoreHistogram($histogram); words that have no
 * entry in the scored histogram contribute 0. Windows are then ranked from
 * lowest to highest score and the one at rank $count is returned.
 *
 * If the text contains fewer words than $substringLength there is no full
 * window; the whole standardized text is then treated as the single
 * candidate (rank 0).
 *
 * @param mixed  $histogram       Passed straight to scoreHistogram(); presumably
 *                                a word => count map -- confirm against that helper.
 * @param string $inputText       Text to select the substring from.
 * @param int    $substringLength Number of words per candidate substring.
 * @param int    $count           Zero-based rank of the candidate to return
 *                                (0 = lowest score, 1 = next lowest, ...).
 *                                Values below 0 are treated as 0.
 *
 * @return string The selected words joined by single spaces, or "" when fewer
 *                than $count + 1 candidates exist.
 */
function selectSubstringHistogram($histogram, $inputText, $substringLength, $count)
{
    // Split the standardized text into words.
    $split = explode(" ", standardizeText($inputText));

    // Score the histogram once, outside the loop, rather than once per window.
    $histogramScores = scoreHistogram($histogram);

    // Score every window of $substringLength consecutive words, keyed by the
    // index of the window's first word.
    $substringScores = array();
    $lastStart = count($split) - $substringLength;
    for ($i = 0; $i <= $lastStart; $i++) {
        $score = 0;
        foreach (array_slice($split, $i, $substringLength) as $word) {
            // Unknown words add nothing (and no longer raise an
            // undefined-key notice).
            if (isset($histogramScores[$word])) {
                $score += $histogramScores[$word];
            }
        }
        $substringScores[$i] = $score;
    }

    if (count($substringScores) === 0) {
        // Text is shorter than one window: fall back to the whole text as the
        // only candidate instead of feeding a null offset to array_slice().
        $substringScores[0] = 0;
    }

    // Sort by score, keeping the start offsets as keys.
    asort($substringScores);
    $rankedStarts = array_keys($substringScores);

    // Look the candidate up by rank. Testing the return value of next() would
    // wrongly treat a legitimate score of 0 as "end of array".
    $rank = max(0, (int) $count);
    if (!isset($rankedStarts[$rank])) {
        // Not enough distinct candidates for the requested rank.
        return "";
    }

    return implode(" ", array_slice($split, $rankedStarts[$rank], $substringLength));
}
Example #2
0
<?php

// Manual smoke test for the histogram helpers pulled in from common.php.
include "includes/common.php";

// Fixed sample sentence with several repeated words.
$string = "this is a test this is a second test";
//1872

// Count how often each space-separated word occurs and dump the raw counts.
$words = explode(" ", $string);
$histogram = array_count_values($words);
print_r($histogram);
echo "<br>";

// Replace the raw counts with whatever scoreHistogram() produces and dump that.
$histogram = scoreHistogram($histogram);
print_r($histogram);
echo "<br>";

// Signature: selectSubstringHistogram($histogram, $inputText, $substringLength, $count)
// NOTE(review): $histogram has already been through scoreHistogram() at this
// point, and selectSubstringHistogram() scores its argument again -- confirm
// that double scoring is intended.
$substring = selectSubstringHistogram($histogram, $string, 4, 0);
echo "<br>" . $substring . "<br>";

// Build one '("word", score)' tuple string per histogram entry.
// The result is not used further in this script.
$sql = array();
foreach ($histogram as $word => $count) {
    $sql[] = '("' . $word . '", ' . $count . ')';
}

// Flatten the histogram into a single list: all words first, then all values.
$histogramKeys = array_keys($histogram);
$histogramValues = array_values($histogram);
$blah = array_merge($histogramKeys, $histogramValues);
//echo implode(',', $histogram);
print_r($blah);