/**
 * Stem every non-stopword entry of the search array.
 *
 * Stopwords are stripped with stopwordRemoval(), each remaining word is run
 * through PorterStemmer::Stem(), and duplicate stems are dropped.
 *
 * Every returned stem carries a trailing "\n"; this is kept from the
 * original implementation because callers may rely on it.
 *
 * @param array $searchArray Words to stem (presumably the tokenised query; confirm against callers).
 * @return array Unique stems, keyed as array_unique() leaves them (first
 *               occurrence wins, keys are not re-indexed). Empty array when
 *               no word survives stopword removal.
 */
function applyPorterStemming(array $searchArray)
{
    // Must start as an array: with no surviving words the loop never runs
    // and array_unique() would otherwise receive an undefined variable.
    $stems = array();

    foreach (stopwordRemoval($searchArray) as $word) {
        $stems[] = PorterStemmer::Stem($word) . "\n";
    }

    // Different words can share a stem; keep each stem once.
    return array_unique($stems);
}
/**
 * Build an HTML fragment of clickable synonym links for the search terms.
 *
 * Stopwords are removed with stopwordRemoval(); for each remaining word the
 * Altervista thesaurus service is queried over HTTP and every synonym it
 * returns becomes an <a> link that re-runs the search for that synonym using
 * the RadioGroup1/RadioGroup2 settings of the current request.
 *
 * Words whose lookup fails (transport error, non-200 status, unparsable or
 * unexpected JSON) are skipped silently.
 *
 * @param array $searchArray Words to look up.
 * @return string Concatenated "<a ...>term</a>" links, each followed by two
 *                spaces; empty string when no synonym was found.
 */
function getSynonyms(array $searchArray)
{
    // NOTE(review): the API key is hard-coded in source; consider moving it to
    // configuration. Left unchanged here so behaviour stays the same.
    $apikey = "yoNJkbNlb7pQEYhSflTz";
    $language = "en_US";
    $endpoint = "http://thesaurus.altervista.org/thesaurus/v1";

    // Request parameters are untrusted and may be absent or arrays.
    $radioGroup1 = (isset($_GET['RadioGroup1']) && is_string($_GET['RadioGroup1'])) ? $_GET['RadioGroup1'] : '';
    $radioGroup2 = (isset($_GET['RadioGroup2']) && is_string($_GET['RadioGroup2'])) ? $_GET['RadioGroup2'] : '';

    $synonyms = '';

    // Stopwords are excluded: synonyms for them are not wanted.
    foreach (stopwordRemoval($searchArray) as $word) {
        // Invoke the remote service.
        $session = curl_init();
        curl_setopt(
            $session,
            CURLOPT_URL,
            "{$endpoint}?word=" . urlencode((string) $word) . "&language={$language}&key={$apikey}&output=json"
        );
        curl_setopt($session, CURLOPT_RETURNTRANSFER, 1);
        $data = curl_exec($session);
        $info = curl_getinfo($session);
        curl_close($session);

        // Only a successful response is parsed.
        if ($data === false || (int) $info['http_code'] !== 200) {
            continue;
        }

        $result = json_decode($data, true);
        if (!is_array($result) || !isset($result['response']) || !is_array($result['response'])) {
            continue;
        }

        // Walk every response entry so that all synonym lists contribute,
        // not just the last one.
        foreach ($result['response'] as $entry) {
            if (!isset($entry['list']['synonyms'])) {
                continue;
            }

            // Drop the "(antonym)" marker and normalise case; the list is
            // pipe-separated.
            $terms = strtolower(str_replace("(antonym)", " ", $entry['list']['synonyms']));

            foreach (explode("|", $terms) as $term) {
                $term = trim($term);
                if ($term === '') {
                    continue;
                }

                // The link repeats the search with the current request's settings.
                // Each query value is URL-encoded, then the whole URL and the
                // visible label are HTML-escaped, because both the $_GET values
                // and the remote synonym text are untrusted.
                $href = 'http://localhost/MetaSearchEngine/search.php?RadioGroup2=' . urlencode($radioGroup2)
                    . '&searchText=' . urlencode($term)
                    . '&submit=+Search+&RadioGroup1=' . urlencode($radioGroup1);

                $synonyms .= "<a href='" . htmlspecialchars($href, ENT_QUOTES, 'UTF-8') . "'>"
                    . htmlspecialchars($term, ENT_QUOTES, 'UTF-8') . "</a>" . "  ";
            }
        }
    }

    return $synonyms;
}
/**
 * Build an inverted index over the snippets of the given result entries.
 *
 * Each entry must provide 'docID' and 'snippet'. Snippets are lower-cased,
 * every run of non-letters becomes a single space, and the text is split on
 * spaces. Stopwords (via stopwordRemoval()) and terms shorter than three
 * characters are left out of the dictionary.
 *
 * @param array $arrayTopK List of arrays, each with 'docID' and 'snippet' keys.
 *                         When a docID repeats, the later snippet replaces the
 *                         earlier one.
 * @return array array(
 *                   'docCount'   => number of distinct docIDs,
 *                   'dictionary' => array keyed by term (sorted by key), each
 *                                   value being array(
 *                                       'docFrequency'  => number of documents containing the term,
 *                                       'termFrequency' => array(docID => occurrences in that document),
 *                                   ),
 *               )
 */
function getIndex(array $arrayTopK)
{
    // Map each docID to its snippet.
    $snippetCollection = array();
    foreach ($arrayTopK as $value) {
        $snippetCollection[$value['docID']] = $value['snippet'];
    }

    // Total number of documents (used by callers for tf-idf).
    $docCount = count($snippetCollection);

    // Initialised up front so that empty input still yields a valid index.
    $dictionary = array();

    foreach ($snippetCollection as $docID => $snippet) {
        $terms = explode(" ", strtolower(preg_replace("/[^a-zA-Z]+/", " ", $snippet)));

        // Occurrences of every term in this document, counted before any
        // filtering or de-duplication.
        $termFrequency = array_count_values($terms);

        // Each term is visited once per document, so docFrequency counts
        // documents rather than occurrences.
        foreach (array_unique(stopwordRemoval($terms)) as $term) {
            // Skip strings shorter than 3 letters (us, ie, au, ...).
            if (strlen($term) < 3) {
                continue;
            }

            if (!isset($dictionary[$term])) {
                $dictionary[$term] = array('docFrequency' => 0, 'termFrequency' => array());
            }

            $dictionary[$term]['docFrequency']++;

            // Posting: docID plus the term's frequency in that document.
            $dictionary[$term]['termFrequency'][$docID] = $termFrequency[$term];
        }
    }

    // Sort the dictionary alphabetically by term.
    ksort($dictionary);

    return array('docCount' => $docCount, 'dictionary' => $dictionary);
}