/**
 * Runs before every test case: stores a freshly constructed Russian
 * ("ru") tokenizer under the 'FILE1' key of the fixture's test objects.
 */
function setUp()
{
    $this->test_objects['FILE1'] = PhraseParser::getTokenizer("ru");
}
/**
 * Given an array of pre_terms returns the characters n-grams for the
 * given terms where n is the length Yioop uses for the language in
 * question. If a stemmer is used for language then n-gramming is not
 * done and this just returns the terms unchanged. This method differs
 * from getCharGramsTerm in that it may do checking of certain words and
 * not char gram them. For example, it won't char gram urls.
 *
 * @param array $pre_terms the terms to make n-grams for
 * @param string $lang locale tag to determine n to be used for n-gramming
 *
 * @return array the n-grams for the terms in question
 */
static function charGramTerms($pre_terms, $lang)
{
    global $CHARGRAMS;
    mb_internal_encoding("UTF-8");
    if ($pre_terms == array()) {
        return array();
    }
    if (!isset($CHARGRAMS[$lang])) {
        /* no char-gram table for this language (a stemmer is used
           instead) -- return the terms untouched. The original code
           did `$terms =& $pre_terms` here, a reference to a local
           parameter copy, which had no effect observable by callers. */
        return $pre_terms;
    }
    $terms = array();
    foreach ($pre_terms as $pre_term) {
        if ($pre_term == "") {
            continue;
        }
        if (substr($pre_term, 0, 4) == 'http') {
            $terms[] = $pre_term; // don't chargram urls
            continue;
        }
        // n-gram one term at a time; merging an empty result is a no-op,
        // so no count() guard is needed
        $ngrams = self::getCharGramsTerm(array($pre_term), $lang);
        $terms = array_merge($terms, $ngrams);
    }
    return $terms;
}
/** * Gets doc summaries of documents containing given words and meeting the * additional provided criteria * @param array $word_structs an array of word_structs. Here a word_struct * is an associative array with at least the following fields * KEYS -- an array of word keys * QUOTE_POSITIONS -- an array of positions of words that appeared in * quotes (so need to be matched exactly) * DISALLOW_PHRASES -- an array of words the document must not contain * WEIGHT -- a weight to multiple scores returned from this iterator by * INDEX_NAME -- an index timestamp to get results from * @param int $limit number of first document in order to return * @param int $num number of documents to return summaries of * @param array& $filter an array of hashes of domains to filter from * results * @param bool $use_cache_if_allowed if true and USE_CACHE is true then * an attempt will be made to look up the results in either * the file cache or memcache. Otherwise, items will be recomputed * and then potentially restored in cache * @param int $raw ($raw == 0) normal grouping, ($raw > 0) * no grouping done on data. if ($raw == 1) no lookups of summaries * done * @param array $queue_servers a list of urls of yioop machines which might * be used during lookup * @param string $original_query if set, the original query that corresponds * to $word_structs * @param string $save_timestamp_name if this timestamp is not empty, then * save iterate position, so can resume on future queries that make * use of the timestamp. If used then $limit ignored and get next $num * docs after $save_timestamp 's previous iterate position. 
// Phase overview (NOTE(review)): (1) decide if proximity scoring is needed;
// (2) try the query cache; (3) iterate posting lists to gather candidate pages;
// (4) optionally persist the iterator save point; (5) fuse sub-scores via
// Reciprocal Rank Fusion; (6) fetch summaries and record query statistics.
 * @param bool $limit_news if true the number of media:news items to * allow in search results is limited to WordIterator::LIMIT_NEWS_COUNT * * @return array document summaries */ function getSummariesByHash($word_structs, $limit, $num, &$filter, $use_cache_if_allowed = true, $raw = 0, $queue_servers = array(), $original_query = "", $save_timestamp_name = "", $limit_news = true) { global $CACHE; $indent = " "; $in2 = $indent . $indent; $in3 = $in2 . $indent; $in4 = $in2 . $in2; if (QUERY_STATISTICS) { $lookup_time = microtime(); } $use_proximity = false; $time = time(); if (count($word_structs) > 1 || isset($word_structs[0]["KEYS"]) && count($word_structs[0]["KEYS"]) > 1 || $word_structs == array() && substr_count($original_query, " ") > 1) { $use_proximity = true; } if (!isset($filter['time'])) { $filter['time'] = 0; } $filter_time = $filter['time']; unset($filter['time']); //iterators don't expect time field $pages = array(); $generation = 0; $to_retrieve = ceil(($limit + $num) / self::NUM_CACHE_PAGES) * self::NUM_CACHE_PAGES; $start_slice = floor($limit / self::NUM_CACHE_PAGES) * self::NUM_CACHE_PAGES; if ($save_timestamp_name != "") { $to_retrieve = $num; $limit = 0; $start_slice = 0; } if (USE_CACHE && $save_timestamp_name == "") { $mem_tmp = serialize($raw) . serialize($word_structs) . $original_query . $this->index_name; $summary_hash = crawlHash($mem_tmp . ":" . $limit . ":" . $num); if ($use_cache_if_allowed) { $cache_success = true; $results = $CACHE->get($summary_hash); if (!isset($results['TIME']) || $filter_time > $results['TIME']) { //if filter has changed since cached, then invalidate cache $results = false; } if (isset($results['TIME'])) { $cached_time = $time - $results['TIME']; } else { $cached_time = $time; } if ($cached_time > MAX_QUERY_CACHE_TIME) { $results = false; } if (isset($results['PAGES'])) { $close_prefix = WORK_DIRECTORY . "/schedules/" . 
// A "closed" marker file exists once a crawl is finished. If any cached page
// came from a crawl with no such marker (still-active crawl or a feed item),
// its results can still change, so a stricter cache lifetime is applied below.
self::index_closed_name; $has_changeable_results = false; $seen_times = array(); foreach ($results['PAGES'] as $page) { if (!isset($page[self::CRAWL_TIME]) || in_array($page[self::CRAWL_TIME], $seen_times)) { continue; } $seen_times[] = $page[self::CRAWL_TIME]; $current_closed = $close_prefix . $page[self::CRAWL_TIME] . ".txt"; if (!file_exists($current_closed)) { //either feed result or from active crawl $has_changeable_results = true; break; } } if ($has_changeable_results) { if ($cached_time > MIN_QUERY_CACHE_TIME) { $results = false; } } } if (QUERY_STATISTICS) { $this->query_info['QUERY'] .= "{$in2}<b>Cache Lookup Time</b>: " . changeInMicrotime($lookup_time) . "<br />"; } if ($results !== false) { return $results; } } } $old_to_retrieve = $to_retrieve; $query_iterator = $this->getQueryIterator($word_structs, $filter, $raw, $to_retrieve, $queue_servers, $original_query, $save_timestamp_name, $limit_news); $num_retrieved = 0; $pages = array(); if (is_object($query_iterator)) { while ($num_retrieved < $to_retrieve && is_array($next_docs = $query_iterator->nextDocsWithWord())) { $pages += $next_docs; $num_retrieved = count($pages); } } if ($save_timestamp_name != "" && ($queue_servers == array() || $this->isSingleLocalhost($queue_servers))) { // used for archive crawls of crawl mixes $save_file = CRAWL_DIR . '/schedules/' . self::save_point . $save_timestamp_name . 
// Persist each sub-iterator's (generation, doc offset) position so a later
// query with the same timestamp can resume iteration where this one stopped.
".txt"; $iterators = $query_iterator->save_iterators; $cnt_iterators = count($iterators); $save_point = array(); for ($i = 0; $i < $cnt_iterators; $i++) { $save_point[$i] = $iterators[$i]->currentGenDocOffsetWithWord(); } $results["SAVE_POINT"] = $save_point; file_put_contents($save_file, serialize($save_point)); $this->db->setWorldPermissionsRecursive($save_file); } $pages = array_values($pages); $result_count = count($pages); $sort_time = 0; if ($raw == 0) { // initialize scores $sort_start = microtime(); $max_user_ranks = 0; for ($i = 0; $i < $result_count; $i++) { $pages[$i]["OUT_SCORE"] = 0; if (isset($pages[$i][self::USER_RANKS])) { $j = count($pages[$i][self::USER_RANKS]); if ($max_user_ranks < $j) { $max_user_ranks = $j; } } } if ($max_user_ranks > 0) { for ($i = 0; $i < $result_count; $i++) { for ($j = 0; $j < $max_user_ranks; $j++) { if (isset($pages[$i][self::USER_RANKS][$j])) { $pages[$i]["USCORE{$j}"] = $pages[$i][self::USER_RANKS][$j]; } else { $pages[$i]["USCORE{$j}"] = 0; } } } } $subscore_fields = array(self::DOC_RANK, self::RELEVANCE); if ($use_proximity) { $subscore_fields[] = self::PROXIMITY; } if ($max_user_ranks > 0) { for ($j = 0; $j < $max_user_ranks; $j++) { $subscore_fields[] = "USCORE{$j}"; } } $num_fields = count($subscore_fields); // Compute Reciprocal Rank Fusion Score $alpha = 600 / $num_fields; if (isset($pages[0])) { foreach ($subscore_fields as $field) { orderCallback($pages[0], $pages[0], $field); usort($pages, "orderCallback"); $score = 0; for ($i = 0; $i < $result_count; $i++) { if ($i > 0) { if ($pages[$i - 1][$field] != $pages[$i][$field]) { $score++; } } $pages[$i]["OUT_SCORE"] += $alpha / (59 + $score); } } orderCallback($pages[0], $pages[0], "OUT_SCORE"); } usort($pages, "orderCallback"); if ($use_proximity) { for ($i = 0; $i < $result_count; $i++) { $pages[$i][self::SCORE] = $pages[$i]["OUT_SCORE"]; } } else { for ($i = 0; $i < $result_count; $i++) { $pages[$i][self::PROXIMITY] = 1; $pages[$i][self::SCORE] = 
// When proximity was not a fusion component, PROXIMITY is pinned to 1 and the
// fused RRF value becomes the final SCORE for every page.
$pages[$i]["OUT_SCORE"]; } } $sort_time = changeInMicrotime($sort_start); } if ($num_retrieved < $to_retrieve) { $results['TOTAL_ROWS'] = $num_retrieved; } else { $results['TOTAL_ROWS'] = $query_iterator->num_docs; //this is only an approximation } if ($raw == 1 && $save_timestamp_name == "") { $pages = array_slice($pages, $start_slice); $pages = array_slice($pages, $limit - $start_slice, $num); $results['PAGES'] =& $pages; if ($old_to_retrieve != $to_retrieve) { $results['HARD_QUERY'] = $old_to_retrieve; } return $results; } if (QUERY_STATISTICS) { $this->query_info['QUERY'] .= "{$in2}<b>Lookup Offsets Time</b>: " . changeInMicrotime($lookup_time) . "<br />"; $machine_times = AnalyticsManager::get("MACHINE_TIMES"); if ($machine_times) { $this->query_info['QUERY'] .= "{$in3}<i>Machine Sub-Times</i>:<br />" . $machine_times . "<br />"; } $net_times = AnalyticsManager::get("NET_TIMES"); $max_machine_times = AnalyticsManager::get("MAX_MACHINE_TIMES"); if ($net_times && $max_machine_times) { $this->query_info['QUERY'] .= "{$in3}<i>Network Overhead Sub-Time</i>: " . ($net_times - $max_machine_times) . "<br />"; } if ($sort_time) { $this->query_info['QUERY'] .= "{$in3}<i>Merge-Rank Sub-Time</i>: " . $sort_time . 
// Summary fetch loop: keep pulling further slices of $pages until $num
// summaries have been successfully looked up (some offsets may fail).
"<br />"; } $summaries_time = microtime(); } $get_pages = array_slice($pages, $limit, $num); $to_get_count = count($get_pages); $groups_with_docs = false; if (preg_match("/\\bsite:doc\\b/", $original_query)) { $groups_with_docs = true; } $out_pages = array(); $cur_limit = $limit; while (count($out_pages) < $to_get_count && $get_pages) { $out_pages = array_merge($out_pages, $this->getSummariesFromOffsets($get_pages, $queue_servers, $raw, $groups_with_docs)); if ($save_timestamp_name != "") { break; } $cur_limit += $num; $get_pages = array_slice($pages, $cur_limit, $num); } $out_pages = array_slice($out_pages, 0, $num); if (QUERY_STATISTICS) { $summary_times_string = AnalyticsManager::get("SUMMARY_TIMES"); if ($summary_times_string) { $round_summary_times = unserialize($summary_times_string); $summary_delta_time = changeInMicrotime($summaries_time); $summary_time_info = "{$summary_delta_time}<br /> {$in4}"; $sum_max_time = 0; foreach ($round_summary_times as $summary_times) { $i = 0; $max_time = 0; foreach ($summary_times as $summary_time) { $summary_time_info .= "ID_{$i}: " . $summary_time . "{$indent}"; $max_time = $summary_time > $max_time ? $summary_time : $max_time; $i++; } $sum_max_time += $max_time; } $net_overhead = $summary_delta_time - $sum_max_time; $summary_time_info .= "<br />{$in3}<i>Network Overhead Sub-Time</i>: " . $net_overhead; } else { $summary_time_info = changeInMicrotime($summaries_time); } $this->query_info['QUERY'] .= "{$in2}<b>Get Summaries Time</b>: " . $summary_time_info . 
// Final assembly: optionally re-rank via thesaurus (only for plain queries
// with no meta words '|' or ':'), then store the result set in the cache.
"<br />"; } $results['PAGES'] =& $out_pages; $results['TIME'] = time(); $lang = guessLocaleFromString($original_query); $tokenizer = PhraseParser::getTokenizer($lang); //only use tokenizer if no meta word or disjuncts in query if (!preg_match('/(\\||\\:)/u', $original_query) && $tokenizer && method_exists($tokenizer, "scoredThesaurusMatches") && method_exists($tokenizer, "tagPartsOfSpeechPhrase") && isset($tokenizer->use_thesaurus)) { $results = $this->sortByThesaurusScore($results, $original_query, $lang); } if (USE_CACHE && $save_timestamp_name == "") { $CACHE->set($summary_hash, $results); } return $results; }
/**
 * Computes suggested related phrases from thesaurus based on part of
 * speech tagging done on each query term.
 *
 * @param string $query query entered by user
 * @param string $lang locale tag for the query
 * @return array $suggestions consisting of phrases suggested to
 *     be similar in meaning to some sense of the query
 */
static function getInitialSuggestions($query, $lang)
{
    $tokenizer = PhraseParser::getTokenizer($lang);
    // tag each query term with its part of speech (terms come back as
    // word~TAG pairs separated by whitespace)
    $pos_query = $tokenizer->tagPartsOfSpeechPhrase($query);
    $max_len = 25; // suggestions at least this long are discarded
    $replacement_phrases = array();
    $suggestions = array();
    $pos_terms = preg_split("/\s+/", trim($pos_query), -1,
        PREG_SPLIT_NO_EMPTY);
    $num_pos_terms = count($pos_terms);
    // only nouns, verbs, adjectives, adverbs get thesaurus lookups
    $known_word_types = array("NN", "VB", "AJ", "AV");
    for ($i = 0; $i < $num_pos_terms; $i++) {
        $pos = strpos($pos_terms[$i], '~');
        $word_type = trim(substr($pos_terms[$i], $pos + 1));
        if (!in_array($word_type, $known_word_types)) {
            $word_type = "NA";
        }
        $current_word = substr($pos_terms[$i], 0, $pos);
        if ($word_type != "NA") {
            $similar_phrases = $tokenizer->scoredThesaurusMatches(
                $current_word, $word_type, $query);
            // scoredThesaurusMatches groups phrases by word sense;
            // keep only the highest scoring sense's phrases
            $highest_scoring_sense_phrases = $similar_phrases ?
                array_shift($similar_phrases) : false;
            if ($highest_scoring_sense_phrases) {
                $replacement_phrases[$current_word] =
                    $highest_scoring_sense_phrases;
            }
        }
    }
    $i = 0;
    foreach ($replacement_phrases as $words => $similar_phrases) {
        foreach ($similar_phrases as $phrase) {
            if (mb_strpos(trim($phrase), ' ') !== false) {
                // strip part-of-speech tags from multi-word phrases
                $phrase = preg_replace('/~[\w]+/', '', $phrase);
            }
            /* Bug fix: the query term used to be interpolated into the
               pattern unescaped, so terms containing regex
               metacharacters (e.g. "c++", "1.5") corrupted the pattern.
               preg_quote() makes the term match literally. */
            $modified_query = preg_replace('/' .
                preg_quote($words, '/') . '/', trim($phrase), $query);
            if (mb_strlen($modified_query) < $max_len &&
                mb_strpos($modified_query, $query) === false) {
                $suggestions[$i] = $modified_query;
                $i++;
            }
        }
    }
    return $suggestions;
}
/** * Generates a centroid with which every sentence is ranked with cosine * ranking method and also generates a word cloud. * @param string $doc complete raw page to generate the summary from. * @param string $lang language of the page to decide which stop words to * call proper tokenizer.php of the specified language. * * @return array array of summary and word cloud */ static function getCentroidSummary($doc, $lang) { $doc = self::pageProcessing($doc); /* Format the document to remove characters other than periods and alphanumerics. */ $formatted_doc = self::formatDoc($doc); $stop_obj = PhraseParser::getTokenizer($lang); /* Splitting into sentences */ $out_sentences = self::getSentences($doc); $n = count($out_sentences); $sentences = array(); if ($stop_obj && method_exists($stop_obj, "stopwordsRemover")) { for ($i = 0; $i < $n; $i++) { $sentences[$i] = $stop_obj->stopwordsRemover(self::formatDoc($out_sentences[$i])); } } else { $sentences = $out_sentences; } /* Splitting into terms */ $terms = array(); foreach ($sentences as $sentence) { $terms = array_merge($terms, PhraseParser::segmentSegment($sentence, $lang)); } $terms = array_filter($terms); $terms_counts = array_count_values($terms); arsort($terms_counts); $terms_counts = array_slice($terms_counts, 0, self::MAX_DISTINCT_TERMS); $terms = array_unique(array_keys($terms_counts)); $t = count($terms); if ($t == 0) { return array("", ""); } /* Initialize Nk array(Number of sentences the term occurs) */ $nk = array(); $nk = array_fill(0, $t, 0); $nt = array(); /* Count TF for each word */ for ($i = 0; $i < $n; $i++) { for ($j = 0; $j < $t; $j++) { if (strpos($sentences[$i], $terms[$j]) !== false) { $nk[$j]++; } } } /* Calculate weights of each term for every sentence */ $w = array(); $idf = array(); $idf_temp = 0; for ($k = 0; $k < $t; $k++) { if ($nk[$k] == 0) { $idf_temp = 0; $tmp = 0; } else { $idf_temp = $n / $nk[$k]; $tmp = log($idf_temp); } $idf[$k] = $tmp; } /* Count TF for finding centroid */ $wc = 
// NOTE(review): $wc[$j] below is (whole-document term frequency) * idf; word
// boundaries \b are dropped for zh-CN/ja/ko since \b is unreliable for CJK
// text. nan/infinite products are zeroed out. The top CENTROID_COMPONENTS
// weights form the centroid; each sentence is then scored by cosine
// similarity against it, and the best sentences make up the summary.
array(); $max_nt = -1; $b = "\\b"; if (in_array($lang, array("zh-CN", "ja", "ko"))) { $b = ""; } for ($j = 0; $j < $t; $j++) { $nt = @preg_match_all("/{$b}{$terms[$j]}{$b}/", $formatted_doc, $matches); //$matches included for backwards compatibility $wc[$j] = $nt * $idf[$j]; if (is_nan($wc[$j]) || is_infinite($wc[$j])) { $wc[$j] = 0; } } /* Calculate centroid */ arsort($wc); $centroid = array_slice($wc, 0, self::CENTROID_COMPONENTS, true); /* Initializing centroid weight array by 0 */ $wc = array_fill(0, $t, 0); /* Word cloud */ $i = 0; $word_cloud = array(); foreach ($centroid as $key => $value) { $wc[$key] = $value; if ($i < self::WORD_CLOUD_LEN) { $word_cloud[$i] = $terms[$key]; } $i++; } if (strlen($formatted_doc) < PageProcessor::$max_description_len || $n == 1) { //if input short only use above to get a word cloud $formatted_doc = substr($formatted_doc, 0, PageProcessor::$max_description_len); return array($formatted_doc, $word_cloud); } ksort($wc); /* Calculate similarity measure between centroid and each sentence */ $sim = array(); for ($i = 0; $i < $n; $i++) { $a = $b1 = $b2 = $c1 = $c2 = $d = 0; for ($k = 0; $k < $t; $k++) { $wck = $wc[$k]; $idfk = $idf[$k]; $tmp = substr_count($sentences[$i], $terms[$k]); $wik = $tmp > 0 ? $idfk * (1 + log($tmp)) : 0; $a += $wik * $wck * $idfk; $b1 += $wik * $wik; $c1 += $wck * $wck; } $b2 = sqrt($b1); $c2 = sqrt($c1); $d = $b2 * $c2; if ($d == 0) { $sim[$i] = 0; } else { $sim[$i] = $a / $d; } } arsort($sim); /* Getting how many sentences should be there in summary */ $top = self::summarySentenceCount($out_sentences, $sim); $sum_array = array(); $sum_array = array_slice($sim, 0, $top - 1, true); ksort($sum_array); /* Printing Summary */ $summary = ''; foreach ($sum_array as $key => $value) { $summary .= "{$out_sentences[$key]}" . ". "; } /* Summary of text summarization */ return array($summary, $word_cloud); }