예제 #1
0
/*
 * Make sure every accumulator exists even when the query returns no rows.
 * isset() is used so arrays populated earlier in the script are left intact.
 */
if (!isset($bing)) {
    $bing = array();
}
if (!isset($punctuation)) {
    $punctuation = array();
}
if (!isset($collection)) {
    $collection = array();
}
if (!isset($words)) {
    $words = array();
}

/***** Pull data into array $bing as key => value pairs ($url2 => $description) */
while ($row2 = mysql_fetch_assoc($result_bing)) {
    $bing[$row2['Url']] = $row2['Description'];
}
// var_dump($bing);

/*
 * Strip everything that is not an ASCII letter, a space or an underscore,
 * then pass the result through stopwords() (defined elsewhere; presumably it
 * removes stop words). No stemming is done.
 * preg_replace() replaces ereg_replace(), which was deprecated in PHP 5.3 and
 * removed in PHP 7.
 */
foreach ($bing as $url => $description) {
    $punctuation[$url] = preg_replace('/[^A-Za-z _]/', '', (string) $description);
    $collection[$url] = stopwords($punctuation[$url]);
}

/******************** Start the clustering ***************************/
/*** $words collects every word of every document in $collection, lowercased and trimmed ***/
foreach ($collection as $url => $text) {
    // The text is lowercased once up front, so the individual words only need trimming.
    foreach (explode(" ", strtolower($text)) as $word) {
        $words[] = trim($word);
    }
}

/**** array_unique removes all duplicates from $words (original keys are preserved) ****/
$ind_words = array_unique($words);
$num_ind = count($ind_words);
foreach ($collection as $dat) {
    $trimmed = strtolower(trim($dat));
    $words_in_doc = explode(" ", $trimmed);
예제 #2
0
 /**
  * Expand a search query with the highest-scoring TF-IDF terms found in
  * Blekko's result snippets, then run the expanded query and store its results.
  *
  * Steps:
  *  1. Query the Blekko JSON API and insert every result snippet into the
  *     Blekko_rewrite table.
  *  2. Read the snippets back from Blekko_rewrite (at most 50), strip
  *     everything except letters, digits, spaces and underscores, and pass
  *     each one through stopwords() (defined elsewhere).
  *  3. Score every term as (count / highest count) * log(docs / docs containing term).
  *  4. Append the two best-scoring terms to the query, echo the rewritten
  *     query as HTML, and query Blekko again.
  *  5. Insert each result of the second query into the Blekko table with a
  *     1-based Rank and a Score that counts down from 50.
  *
  * NOTE(review): the mysql_* extension was removed in PHP 7. The calls are kept
  * because no connection handle is visible here to migrate to mysqli/PDO with
  * prepared statements.
  * NOTE(review): Blekko_rewrite is never cleared in this function, so rows
  * from earlier calls are read back too - confirm whether the caller empties it.
  *
  * @param string $search_api The original search query.
  * @return void
  */
 function Blekko_Rewrite($search_api)
 {
     // Upper bound on the snippets analysed; matches the /ps=50 page size requested below.
     $max_docs = 50;
     // Number of expansion terms appended to the query.
     $num_terms = 2;

     /****************     Initial Query Blekko API     *************/
     $get3 = file_get_contents("http://blekko.com/?q=" . urlencode($search_api) . "+/ps=50+/json&auth=b58f6ba2");
     // TRUE for in array format
     $decode3 = ($get3 === false) ? null : json_decode($get3, TRUE);
     // A failed request or an unexpected payload is treated as "no results".
     $results = (is_array($decode3) && isset($decode3['RESULT']) && is_array($decode3['RESULT']))
         ? $decode3['RESULT']
         : array();
     foreach ($results as $res) {
         // Strip markup first and escape last, so the value sent to MySQL is exactly the escaped one.
         $Description = mysql_real_escape_string(strip_tags(isset($res['snippet']) ? $res['snippet'] : ''));
         mysql_query("INSERT INTO Blekko_rewrite (Description)\n                            VALUES ('{$Description}')");
     }

     /****************     Load and pre-process the snippets     *************/
     $data = array();
     $result_blekko = mysql_query("SELECT * FROM Blekko_rewrite");
     // mysql_query() returns false on failure; in that case no documents are analysed.
     if ($result_blekko) {
         while (count($data) < $max_docs && ($row = mysql_fetch_assoc($result_blekko))) {
             // preg_replace() replaces ereg_replace(), which was removed in PHP 7.
             $clean = preg_replace('/[^A-Za-z0-9 _]/', '', (string) $row['Description']);
             $data[] = stopwords($clean);
         }
     }

     /************** TF-IDF   *************************/
     $term_count = array(); // term => occurrences summed over all documents
     $doc_freq = array();   // term => number of documents containing the term
     foreach ($data as $doc) {
         // Lowercase before counting so case variants are one term, and drop
         // the empty tokens that consecutive spaces would otherwise produce.
         $tokens = preg_split('/\s+/', strtolower((string) $doc), -1, PREG_SPLIT_NO_EMPTY);
         foreach (array_count_values($tokens) as $term => $occurrences) {
             if (!isset($term_count[$term])) {
                 $term_count[$term] = 0;
                 $doc_freq[$term] = 0;
             }
             $term_count[$term] += $occurrences;
             $doc_freq[$term]++;
         }
     }

     $tf_idf = array();
     if (count($term_count) > 0) {
         $max_count = max($term_count);
         $total_docs = count($data);
         foreach ($term_count as $term => $occurrences) {
             $tf_idf[$term] = $occurrences / $max_count * log($total_docs / $doc_freq[$term]);
         }
     }
     arsort($tf_idf);
     // preserve_keys = true: purely numeric terms are stored as integer keys,
     // and array_slice() would otherwise renumber them to 0, 1, ...
     $query_rewrite = array_keys(array_slice($tf_idf, 0, $num_terms, true));

     /****************     Report and run the rewritten query     *************/
     $append = trim(implode(" ", $query_rewrite));
     $search_append = $search_api . "  " . $append;
     echo "<br>";
     echo "<b>Rewritten query from Blekko: </b>";
     // The query text is printed into HTML, so it is escaped for that context.
     echo htmlspecialchars($search_append, ENT_QUOTES, 'UTF-8');
     echo "<br>";
     echo "<br>";

     $get3 = file_get_contents("http://blekko.com/?q=" . urlencode($search_append) . "+/ps=50+/json&auth=b58f6ba2");
     // TRUE for in array format
     $decode3 = ($get3 === false) ? null : json_decode($get3, TRUE);
     $results = (is_array($decode3) && isset($decode3['RESULT']) && is_array($decode3['RESULT']))
         ? $decode3['RESULT']
         : array();

     // $Rank counts up from 1; $Score counts down from 50.
     $Rank = 0;
     $Score = 51;
     foreach ($results as $res) {
         $Rank++;
         $Score--;
         // Collect the stored fields, defaulting missing ones to '' and
         // stripping markup before SQL-escaping.
         $field = array();
         foreach (array('url', 'url_title', 'snippet', 'display_url') as $key) {
             $field[$key] = mysql_real_escape_string(strip_tags(isset($res[$key]) ? $res[$key] : ''));
         }
         $Url = $field['url'];
         $Title = $field['url_title'];
         $Description = $field['snippet'];
         $DisplayUrl = $field['display_url'];
         mysql_query("INSERT INTO Blekko ( Rank, Url, Title, Description, DisplayUrl, Score)\n                            VALUES ('{$Rank}', '{$Url}', '{$Title}', '{$Description}', '{$DisplayUrl}', '{$Score}')");
     }
 }