/**
 * Return one of the lowest-scored word windows of $inputText.
 *
 * The text is run through standardizeText() and split on single spaces. Every
 * run of $substringLength consecutive words is scored as the sum of its words'
 * entries in scoreHistogram($histogram). Windows are then ordered by ascending
 * score and the window at position $count in that order is returned, so a
 * caller that needs a different candidate (e.g. on recalculation) can ask for
 * the next one.
 *
 * @param array  $histogram       Histogram passed to scoreHistogram(); the scored
 *                                result is indexed by word here (presumably
 *                                word => score; verify against scoreHistogram()).
 * @param string $inputText       Raw text; standardizeText() is applied before splitting.
 * @param int    $substringLength Number of words in each candidate window.
 * @param int    $count           Zero-based rank of the window to return
 *                                (0 = lowest score). Negative values are treated as 0.
 *
 * @return string The selected window's words joined by single spaces, or "" when
 *                there is no window at rank $count. When the text has fewer words
 *                than $substringLength, rank 0 returns the whole standardized text
 *                (unchanged legacy behaviour) and any higher rank returns "".
 */
function selectSubstringHistogram($histogram, $inputText, $substringLength, $count)
{
    $words = explode(" ", standardizeText($inputText)); // split standardized string into words
    $histogramScores = scoreHistogram($histogram);      // score the histogram once, outside the loop

    // Score every window of the requested length, keyed by its starting word index.
    $windowScores = array();
    $lastStart = count($words) - $substringLength;
    for ($start = 0; $start <= $lastStart; $start++) {
        $score = 0;
        foreach (array_slice($words, $start, $substringLength) as $word) {
            // A word missing from the scored histogram contributes nothing,
            // instead of triggering an undefined-index notice.
            $score += isset($histogramScores[$word]) ? $histogramScores[$word] : 0;
        }
        $windowScores[$start] = $score;
    }

    $rank = max(0, (int) $count);

    if (count($windowScores) === 0) {
        // No window of the requested length fits. Keep the historical result
        // explicitly rather than relying on key() returning null as an offset.
        return $rank === 0
            ? implode(" ", array_slice($words, 0, $substringLength))
            : "";
    }

    asort($windowScores); // ascending by score, start indices preserved as keys

    // Look the rank up by position. The previous next()-based walk stopped as
    // soon as it met a score of 0, because next() returns the element value
    // and 0 is indistinguishable from the end-of-array false in a loose test.
    $rankedStarts = array_keys($windowScores);
    if (!isset($rankedStarts[$rank])) {
        return ""; // fewer than $rank + 1 candidate windows exist
    }

    return implode(" ", array_slice($words, $rankedStarts[$rank], $substringLength));
}
<?php
// Scratch script: builds a word histogram for a sample sentence, scores it,
// selects a four-word substring and dumps the intermediate values as HTML.
include "includes/common.php";

$string = "this is a test this is a second test"; // 1872 (annotation kept from the original; its meaning is not evident here)

// Count how often each space-separated word occurs.
$words = explode(" ", $string);
$histogram = array_count_values($words);
print_r($histogram);
echo "<br>";

// Replace the raw counts with whatever scoreHistogram() yields and show that too.
$histogram = scoreHistogram($histogram);
print_r($histogram);
echo "<br>";

// Signature reminder: selectSubstringHistogram($histogram, $inputText, $substringLength, $count)
// NOTE(review): $histogram has already been through scoreHistogram() at this point,
// and selectSubstringHistogram() scores its argument again - confirm that is intended.
$substring = selectSubstringHistogram($histogram, $string, 4, 0);
echo "<br>", $substring, "<br>";

// One '("word", score)' tuple string per histogram entry.
$sql = array();
foreach ($histogram as $word => $count) {
    $sql[] = sprintf('("%s", %s)', $word, $count);
}

// Flat list: every word first, followed by every score.
$blah = array_merge(
    array_keys($histogram),
    array_values($histogram)
);
print_r($blah);