/**
 * Load the relevance judgments from data/relevance_judgments.txt.
 *
 * Every non-blank line is split on single spaces. The first token is
 * converted to an integer and shifted down by 151 so that it is zero-indexed
 * and matches the query values; the second token (when present) is normalised
 * so it can be compared with the search result arrays.
 *
 * Terminates the script with "Unable to open file!" if the file cannot be
 * opened.
 *
 * @return array List of rows. Each row holds:
 *               [0] => int    first token of the line minus 151
 *               [1] => string second token after remove_leading(), trim(),
 *                             remove_trailing_backslash() and strip_tags();
 *                             absent when the line has no second token.
 */
function populate_goldenlist()
{
    $file = fopen("data/relevance_judgments.txt", "r") or exit("Unable to open file!");
    $goldenlist = array();

    // Loop on the fgets() result instead of feof(): feof() only turns true
    // after a read has already failed, so the old loop ran one extra time
    // with $line === false and appended a bogus row whose [0] was -151.
    while (($line = fgets($file)) !== false) {
        // Blank lines carry no judgment; skipping them avoids the same
        // bogus -151 row.
        if (trim($line) === '') {
            continue;
        }

        //break up our line by spaces
        $entries = explode(" ", $line);

        $row = array();
        //let's zero-index these so they match the query values
        $row[0] = intval($entries[0]) - 151;

        //trim it so it will match with our search result arrays
        if (isset($entries[1])) {
            $tmpStr = remove_leading($entries[1]);
            //get rid of trailing whitespace (including the line ending)
            $tmpStr = trim($tmpStr);
            //now lose the backslash
            $tmpStr = remove_trailing_backslash($tmpStr);
            //any other tags
            $row[1] = strip_tags($tmpStr);
        }

        $goldenlist[] = $row;
    }

    fclose($file);
    return $goldenlist;
}
// Example #2
// 0
/**
 * Fetch a Google custom-search result page and turn it into result rows.
 *
 * The page is loaded with file_get_html(); every `a.l` anchor is treated as
 * one result and paired with the `div.std` element at the same position
 * (assumes the two lists are parallel -- TODO confirm against the markup).
 *
 * @param string $query      Search terms. Inserted into the URL as-is (no
 *                           URL encoding is applied here, see note below).
 * @param mixed  $numResults Value sent as the `num` URL parameter.
 *
 * @return array Result rows passed through check_duplicates(), or an empty
 *               array when the page could not be loaded. Each row holds:
 *               [0] => string href after remove_leading(),
 *                             remove_trailing_backslash() and trim()
 *               [1] => string snippet text with tags stripped
 *               [2] => int    1-based position on the page
 *               [3] => float  0.0
 *               [4] => int    0
 *               [5] => string "Google"
 *               [6] => int    1
 *               [7] => string anchor text with tags stripped
 */
function searchGoogle($query, $numResults)
{
    // NOTE(review): credential-looking value hard-coded in source; consider
    // moving it to configuration.
    $googleAcctKey = 'hc+TUI3f03XThqcHJk4auJCOdaVHJ0FSSCAZFc8tvAc=';
    $googleRootUri = 'http://www.google.com/custom';

    //using url encode here causes weirdness
    $html = file_get_html($googleRootUri . '?start=0&num=' . $numResults . '&q=' . $query . '&client=google-csbe&cx=' . $googleAcctKey);

    // file_get_html() yields false when the page cannot be fetched or parsed;
    // calling ->find() on that would be a fatal error.
    if (!$html) {
        return array();
    }

    // Query the DOM once instead of three times per result.
    $links = $html->find('a.l');
    $snippets = $html->find('div.std');
    $total = count($links);

    // Markup that, when present, marks where the snippet text ends.
    $snippetEndMarker = "<br><span class=\"a\">";

    $googleResultArray = array();
    for ($i = 0; $i < $total; $i++) {
        $link = $links[$i];

        $url = remove_leading($link->href);
        $url = remove_trailing_backslash($url);

        // A result may have no matching snippet element; use an empty snippet
        // rather than dereferencing a missing node.
        $snippetHtml = isset($snippets[$i]) ? $snippets[$i]->innertext : '';

        // Cut only when the marker exists. Previously a missing marker made
        // strpos() return false, which substr() treated as length 0 and the
        // whole snippet was lost.
        $cutAt = strpos($snippetHtml, $snippetEndMarker);
        if ($cutAt !== false) {
            $snippetHtml = substr($snippetHtml, 0, $cutAt);
        }

        $googleResultArray[$i][0] = trim($url);
        $googleResultArray[$i][1] = strip_tags($snippetHtml);
        $googleResultArray[$i][2] = $i + 1;
        $googleResultArray[$i][3] = 0.0;
        $googleResultArray[$i][4] = 0;
        $googleResultArray[$i][5] = "Google";
        $googleResultArray[$i][6] = 1;
        $googleResultArray[$i][7] = strip_tags($link->innertext);
    }

    //clear dom to deal with the memory leak issue
    $html->clear();
    unset($html);

    $googleResultArray = check_duplicates($googleResultArray);
    return $googleResultArray;
}