/**
 * Loads the relevance judgments ("golden list") from data/relevance_judgments.txt.
 *
 * Every non-blank line is split on single spaces. The first field is converted
 * to an integer and shifted down by 151 so that it is zero-indexed and matches
 * the query values. The second field, when present, is passed through
 * remove_leading(), trim(), remove_trailing_backslash() and strip_tags() so it
 * can be matched against the search result arrays.
 *
 * Terminates the script with "Unable to open file!" when the file cannot be
 * opened.
 *
 * @return array List of rows: [0] => int zero-based query index,
 *               [1] => string normalised second field (only set when the
 *               line actually has a second field).
 */
function populate_goldenlist()
{
    $file = fopen("data/relevance_judgments.txt", "r") or exit("Unable to open file!");
    $goldenlist = array();
    $index = 0;

    // Loop on the result of fgets() rather than on feof(): feof() only turns
    // true after a read has failed, so a file ending in a newline used to
    // yield one extra iteration and a bogus row with query index -151.
    while (($line = fgets($file)) !== false) {
        // Blank lines carry no judgment; skipping them avoids the same bogus row.
        if (trim($line) === '') {
            continue;
        }

        // break up our line by spaces
        $entries = explode(" ", $line);

        // let's zero-index these so they match the query values
        $goldenlist[$index][0] = intval($entries[0]) - 151;

        // trim it so it will match with our search result arrays
        if (isset($entries[1])) {
            $tmpStr = remove_leading($entries[1]);
            // get rid of trailing whitespace (including the line's newline)
            $tmpStr = trim($tmpStr);
            // now lose the backslash
            $tmpStr = remove_trailing_backslash($tmpStr);
            // any other tags
            $goldenlist[$index][1] = strip_tags($tmpStr);
        }

        $index++;
    }

    fclose($file);

    return $goldenlist;
}
/**
 * Fetches a Google custom-search result page for $query and converts every
 * result anchor ("a.l") into a result row.
 *
 * Row layout (numeric keys):
 *   [0] result URL after remove_leading(), remove_trailing_backslash(), trim()
 *   [1] snippet text with tags stripped, cut at the '<br><span class="a">'
 *       marker when that marker is present
 *   [2] 1-based position of the result on the page
 *   [3] 0.0  (initial value; NOTE(review): presumably a score filled in later - confirm)
 *   [4] 0    (initial value; meaning not visible from this function)
 *   [5] "Google" (name of the source engine)
 *   [6] 1    (initial value; meaning not visible from this function)
 *   [7] result title with tags stripped
 *
 * @param string $query      Search terms, concatenated into the request URL
 *                           as-is (deliberately not URL-encoded here).
 * @param int    $numResults Value sent as the "num" request parameter.
 *
 * @return array Rows as described above, passed through check_duplicates();
 *               an empty array when the result page could not be loaded.
 */
function searchGoogle($query, $numResults)
{
    $googleResultArray = array();
    // NOTE(review): credential hard-coded in source - consider moving it to configuration.
    $googleAcctKey = 'hc+TUI3f03XThqcHJk4auJCOdaVHJ0FSSCAZFc8tvAc=';
    $googleRootUri = 'http://www.google.com/custom';

    // using url encode here causes weirdness
    $html = file_get_html($googleRootUri . '?start=0&num=' . $numResults . '&q=' . $query . '&client=google-csbe&cx=' . $googleAcctKey);

    // file_get_html() does not always hand back a DOM object; without this
    // guard the find() calls below would be a fatal error.
    if (!is_object($html)) {
        return $googleResultArray;
    }

    // Query the DOM once instead of on every loop iteration.
    $anchors = $html->find('a.l');
    $snippets = $html->find('div.std');
    $resultCount = count($anchors);

    for ($i = 0; $i < $resultCount; $i++) {
        $tmpStr = remove_leading($anchors[$i]->href);
        $tmpStr = remove_trailing_backslash($tmpStr);
        $googleResultArray[$i][0] = trim($tmpStr);

        // A result may lack a matching snippet node; fall back to an empty snippet.
        $snippetHtml = isset($snippets[$i]) ? $snippets[$i]->innertext : '';
        $cutAt = strpos($snippetHtml, "<br><span class=\"a\">");
        // Only truncate when the marker exists; strpos() returning false used
        // to be coerced to length 0, silently discarding the whole snippet.
        if ($cutAt !== false) {
            $snippetHtml = substr($snippetHtml, 0, $cutAt);
        }
        $googleResultArray[$i][1] = strip_tags($snippetHtml);

        $googleResultArray[$i][2] = $i + 1;
        $googleResultArray[$i][3] = 0.0;
        $googleResultArray[$i][4] = 0;
        $googleResultArray[$i][5] = "Google";
        $googleResultArray[$i][6] = 1;
        $googleResultArray[$i][7] = strip_tags($anchors[$i]->innertext);
    }

    // clear dom to deal with the memory leak issue
    $html->clear();
    unset($html);

    $googleResultArray = check_duplicates($googleResultArray);

    return $googleResultArray;
}