/*
 * Copy every row of $result_bing into $bing as url => description pairs.
 * $result_bing is created outside the visible part of this file.
 */
while ($row2 = mysql_fetch_assoc($result_bing)) {
    $url2 = $row2['Url'];
    $description = $row2['Description'];
    $bing[$url2] = $description;
}
// var_dump($bing);

/*
 * Strip every character that is not an ASCII letter, a space or an underscore
 * from each description (ereg_replace with an empty replacement), then pass
 * the result through stopwords(). Words are not stemmed; the original author
 * hoped that skipping stemming would help with speed.
 *
 * NOTE(review): ereg_replace() is deprecated since PHP 5.3 and removed in
 * PHP 7; preg_replace() is the usual replacement.
 * NOTE(review): stopwords() is defined elsewhere; presumably it removes stop
 * words from the string - confirm against its definition.
 */
foreach ($bing as $url => $string) {
    $punctuation[$url] = ereg_replace("[^A-Za-z _]", "", $bing[$url]);
}
foreach ($punctuation as $url => $string) {
    $collection[$url] = stopwords($punctuation[$url]);
}

/******************** Start the clustering ***************************/

/*
 * $words collects every space-separated token of every entry in $collection,
 * lower-cased and trimmed of surrounding whitespace.
 */
foreach ($collection as $url => $strin) {
    $each_url = explode(" ", strtolower($strin));
    foreach ($each_url as $ul => $wrd) {
        $words[] = strtolower(trim($each_url[$ul]));
    }
}

/* array_unique() removes all duplicate entries from $words. */
$ind_words = array_unique($words);
$num_ind = count($ind_words);

// Split each preprocessed document into lower-cased, space-separated words.
// NOTE(review): this loop is cut off in the visible source - its closing
// brace and the rest of its body are not shown here.
foreach ($collection as $dat) {
    $trimmed = strtolower(trim($dat));
    $words_in_doc = explode(" ", $trimmed);
/**
 * Expand a search query with the two highest-scoring TF-IDF terms taken from
 * Blekko result snippets, then store the results of the expanded query.
 *
 * Steps:
 *  1. Query Blekko with $search_api and insert each result snippet into the
 *     Blekko_rewrite table.
 *  2. Read the snippets back from Blekko_rewrite, strip everything except
 *     ASCII letters, digits, spaces and underscores, and pass each one
 *     through stopwords() (defined elsewhere in the project).
 *  3. Score every term as (count / highest count) * log(docs / doc frequency)
 *     and append the two best terms to the original query.
 *  4. Echo the rewritten query, query Blekko again, and insert every hit into
 *     the Blekko table with a rank (1, 2, ...) and a descending score
 *     (50, 49, ...).
 *
 * Uses the legacy mysql_* API with its implicit default connection, as the
 * original code did.
 *
 * NOTE(review): Blekko_rewrite is not cleared here, so rows left over from an
 * earlier call are read back too - presumably the caller truncates the table;
 * confirm.
 *
 * @param string $search_api The raw search query.
 * @return void
 */
function Blekko_Rewrite($search_api)
{
    /**************** Initial query: store the snippets *************/
    foreach (blekko_rewrite_fetch_results($search_api) as $res) {
        if (!is_array($res)) {
            continue;
        }
        $Description = blekko_rewrite_sql_text($res, 'snippet');
        mysql_query("INSERT INTO Blekko_rewrite (Description)\n VALUES ('{$Description}')");
    }

    /**************** Read the snippets back and preprocess *************/
    // The original always processed a fixed window of 50 documents (the
    // request URL asks for /ps=50), reading undefined offsets when fewer rows
    // existed. Keep the cap of 50 but only use rows that are really there.
    $max_documents = 50;
    $documents = array();
    $result_blekko = mysql_query("SELECT * FROM Blekko_rewrite");
    if ($result_blekko !== false) { // mysql_query() returns false on failure
        while (count($documents) < $max_documents && ($row = mysql_fetch_assoc($result_blekko))) {
            // preg_replace() with this pattern is the drop-in replacement for
            // the removed ereg_replace("[^A-Za-z0-9 _]", "", ...).
            $cleaned = preg_replace('/[^A-Za-z0-9 _]/', '', (string) $row['Description']);
            $documents[] = stopwords($cleaned);
        }
    }

    /**************** TF-IDF: pick the expansion terms *************/
    $query_rewrite = blekko_rewrite_top_terms($documents, 2);
    $append = trim(implode(" ", $query_rewrite));

    echo "<br>";
    echo "<b>Rewritten query from Blekko: </b>";
    $search_append = "" . $search_api . " " . $append . "";
    // The query is user-supplied text going into HTML, so escape it.
    echo htmlspecialchars($search_append, ENT_QUOTES);
    echo "<br>";
    echo "<br>";

    /**************** Second query: store the ranked results *************/
    $Rank = 0;   // 1-based position of the result
    $Score = 51; // decremented before use, so the first result scores 50
    foreach (blekko_rewrite_fetch_results($search_append) as $res) {
        if (!is_array($res)) {
            continue;
        }
        $Rank++;
        $Score--;
        $Url = blekko_rewrite_sql_text($res, 'url');
        $Title = blekko_rewrite_sql_text($res, 'url_title');
        $Description = blekko_rewrite_sql_text($res, 'snippet');
        $DisplayUrl = blekko_rewrite_sql_text($res, 'display_url');
        mysql_query("INSERT INTO Blekko ( Rank, Url, Title, Description, DisplayUrl, Score)\n VALUES ('{$Rank}', '{$Url}', '{$Title}', '{$Description}', '{$DisplayUrl}', '{$Score}')");
    }
}

/**
 * Request one page of Blekko JSON results for $query.
 *
 * @param string $query The search query (URL-encoded here).
 * @return array The decoded 'RESULT' list, or an empty array when the request
 *               fails or the response is not the expected JSON structure.
 */
function blekko_rewrite_fetch_results($query)
{
    $response = file_get_contents("http://blekko.com/?q=" . urlencode($query) . "+/ps=50+/json&auth=b58f6ba2");
    if ($response === false) {
        return array();
    }

    $decoded = json_decode($response, true); // true: decode objects as arrays
    if (!is_array($decoded) || !isset($decoded['RESULT']) || !is_array($decoded['RESULT'])) {
        return array();
    }

    return $decoded['RESULT'];
}

/**
 * Prepare one field of an API result for use inside a quoted SQL literal.
 *
 * Tags are stripped first and escaping is applied last, so nothing can modify
 * the string after it has been escaped.
 *
 * @param array  $result One decoded API result.
 * @param string $field  Key to read; a missing or non-scalar value yields ''.
 * @return string The tag-stripped, SQL-escaped text.
 */
function blekko_rewrite_sql_text(array $result, $field)
{
    $value = (isset($result[$field]) && is_scalar($result[$field])) ? (string) $result[$field] : '';

    return mysql_real_escape_string(strip_tags($value));
}

/**
 * Split a document into lower-cased, space-separated, non-empty tokens.
 *
 * @param string $text The preprocessed document.
 * @return array List of tokens; runs of spaces produce no empty tokens.
 */
function blekko_rewrite_tokenize($text)
{
    $tokens = array();
    foreach (explode(' ', strtolower(trim((string) $text))) as $token) {
        $token = trim($token);
        if ($token !== '') {
            $tokens[] = $token;
        }
    }

    return $tokens;
}

/**
 * Return the $limit terms with the highest TF-IDF score across $documents.
 *
 * score(term) = (occurrences of term in all documents / highest such count)
 *               * log(number of documents / documents containing the term)
 *
 * @param array $documents List of preprocessed document strings.
 * @param int   $limit     Maximum number of terms to return.
 * @return array List of terms as strings, best first; empty when there are no
 *               tokens at all.
 */
function blekko_rewrite_top_terms(array $documents, $limit)
{
    $term_counts = array(); // term => occurrences over all documents
    $doc_freq = array();    // term => number of documents containing the term

    foreach ($documents as $document) {
        $tokens = blekko_rewrite_tokenize($document);
        foreach ($tokens as $token) {
            $term_counts[$token] = isset($term_counts[$token]) ? $term_counts[$token] + 1 : 1;
        }
        // Tokens are already lower-cased, so each term counts once per document.
        foreach (array_unique($tokens) as $token) {
            $doc_freq[$token] = isset($doc_freq[$token]) ? $doc_freq[$token] + 1 : 1;
        }
    }

    if (count($term_counts) === 0) {
        return array(); // nothing to score; also avoids dividing by zero below
    }

    $max_count = max($term_counts);
    $total_docs = count($documents);

    $scores = array();
    foreach ($term_counts as $term => $count) {
        $scores[$term] = ($count / $max_count) * log($total_docs / $doc_freq[$term]);
    }
    arsort($scores);

    // preserve_keys = true: purely numeric terms such as "2013" are stored as
    // integer keys, and array_slice() would otherwise renumber them to 0, 1...
    $best = array_slice($scores, 0, $limit, true);

    return array_map('strval', array_keys($best));
}