Example 1
 /**
  * Each test we set up a new Russian Tokenizer object
  */
 function setUp()
 {
     // Fresh Russian tokenizer for every test case
     $russian_tokenizer = PhraseParser::getTokenizer("ru");
     $this->test_objects['FILE1'] = $russian_tokenizer;
 }
Example 2
 /**
  * Given an array of pre_terms returns the characters n-grams for the
  * given terms where n is the length Yioop uses for the language in
  * question. If a stemmer is used for language then n-gramming is not
  * done and this just returns an empty array this method differs from
  * getCharGramsTerm in that it may do checking of certain words and
  * not char gram them. For example, it won't char gram urls.
  *
  * @param array $pre_terms the terms to make n-grams for
  * @param string $lang locale tag to determine n to be used for n-gramming
  *
  * @return array the n-grams for the terms in question
  */
 static function charGramTerms($pre_terms, $lang)
 {
     global $CHARGRAMS;
     mb_internal_encoding("UTF-8");
     if ($pre_terms == array()) {
         return array();
     }
     /* No char-gram length configured for this locale (e.g. a stemmer is
        used instead): return the terms unchanged. NOTE(review): the old
        code also fetched PhraseParser::getTokenizer($lang) into an unused
        local here; that dead call has been removed. */
     if (!isset($CHARGRAMS[$lang])) {
         return $pre_terms;
     }
     $terms = array();
     foreach ($pre_terms as $pre_term) {
         if ($pre_term == "") {
             continue;
         }
         if (substr($pre_term, 0, 4) == 'http') {
             $terms[] = $pre_term;
             // don't chargram urls
             continue;
         }
         $ngrams = self::getCharGramsTerm(array($pre_term), $lang);
         if (count($ngrams) > 0) {
             $terms = array_merge($terms, $ngrams);
         }
     }
     return $terms;
 }
Example 3
 /**
  * Gets doc summaries of documents containing given words and meeting the
  * additional provided criteria
  * @param array $word_structs an array of word_structs. Here a word_struct
  *     is an associative array with at least the following fields
  *     KEYS -- an array of word keys
  *     QUOTE_POSITIONS -- an array of positions of words that appeared in
  *         quotes (so need to be matched exactly)
  *     DISALLOW_PHRASES -- an array of words the document must not contain
  *     WEIGHT -- a weight to multiply scores returned from this iterator by
  *     INDEX_NAME -- an index timestamp to get results from
  * @param int $limit number of first document in order to return
  * @param int $num number of documents to return summaries of
  * @param array& $filter an array of hashes of domains to filter from
  *     results
  * @param bool $use_cache_if_allowed if true and USE_CACHE is true then
  *     an attempt will be made to look up the results in either
  *     the file cache or memcache. Otherwise, items will be recomputed
  *     and then potentially restored in cache
  * @param int $raw ($raw == 0) normal grouping, ($raw > 0)
  *     no grouping done on data. if ($raw == 1) no lookups of summaries
  *     done
  * @param array $queue_servers a list of urls of yioop machines which might
  *     be used during lookup
  * @param string $original_query if set, the original query that corresponds
  *     to $word_structs
  * @param string $save_timestamp_name if this timestamp is not empty, then
  *     save iterate position, so can resume on future queries that make
  *     use of the timestamp. If used then $limit ignored and get next $num
  *     docs after $save_timestamp 's previous iterate position.
  * @param bool $limit_news if true the number of media:news items to
  *     allow in search results is limited to WordIterator::LIMIT_NEWS_COUNT
  *
  * @return array document summaries
  */
 function getSummariesByHash($word_structs, $limit, $num, &$filter, $use_cache_if_allowed = true, $raw = 0, $queue_servers = array(), $original_query = "", $save_timestamp_name = "", $limit_news = true)
 {
     global $CACHE;
     // Indent strings used only to format the QUERY_STATISTICS debug output
     $indent = "  ";
     $in2 = $indent . $indent;
     $in3 = $in2 . $indent;
     $in4 = $in2 . $in2;
     if (QUERY_STATISTICS) {
         $lookup_time = microtime();
     }
     /* Proximity scoring is enabled when the query effectively has several
        terms: more than one word struct, one struct with several KEYS, or
        no structs but a multi-word original query */
     $use_proximity = false;
     $time = time();
     if (count($word_structs) > 1 || isset($word_structs[0]["KEYS"]) && count($word_structs[0]["KEYS"]) > 1 || $word_structs == array() && substr_count($original_query, " ") > 1) {
         $use_proximity = true;
     }
     if (!isset($filter['time'])) {
         $filter['time'] = 0;
     }
     $filter_time = $filter['time'];
     unset($filter['time']);
     //iterators don't expect time field
     $pages = array();
     $generation = 0;
     // Round the retrieval window to whole NUM_CACHE_PAGES-sized slices
     $to_retrieve = ceil(($limit + $num) / self::NUM_CACHE_PAGES) * self::NUM_CACHE_PAGES;
     $start_slice = floor($limit / self::NUM_CACHE_PAGES) * self::NUM_CACHE_PAGES;
     if ($save_timestamp_name != "") {
         // Resumable (saved-iterate-position) queries ignore $limit and just
         // fetch the next $num docs from the saved position
         $to_retrieve = $num;
         $limit = 0;
         $start_slice = 0;
     }
     if (USE_CACHE && $save_timestamp_name == "") {
         // Cache key covers raw mode, query structure, query text, index,
         // and the requested result window
         $mem_tmp = serialize($raw) . serialize($word_structs) . $original_query . $this->index_name;
         $summary_hash = crawlHash($mem_tmp . ":" . $limit . ":" . $num);
         if ($use_cache_if_allowed) {
             $cache_success = true;
             $results = $CACHE->get($summary_hash);
             if (!isset($results['TIME']) || $filter_time > $results['TIME']) {
                 //if filter has changed since cached, then invalidate cache
                 $results = false;
             }
             // Age of the cached entry; treated as maximal when no TIME field
             if (isset($results['TIME'])) {
                 $cached_time = $time - $results['TIME'];
             } else {
                 $cached_time = $time;
             }
             if ($cached_time > MAX_QUERY_CACHE_TIME) {
                 $results = false;
             }
             if (isset($results['PAGES'])) {
                 /* Pages whose crawl has no index_closed marker file may
                    still change (active crawl or feed), so expire those
                    cached results on the shorter MIN_QUERY_CACHE_TIME */
                 $close_prefix = WORK_DIRECTORY . "/schedules/" . self::index_closed_name;
                 $has_changeable_results = false;
                 $seen_times = array();
                 foreach ($results['PAGES'] as $page) {
                     if (!isset($page[self::CRAWL_TIME]) || in_array($page[self::CRAWL_TIME], $seen_times)) {
                         continue;
                     }
                     $seen_times[] = $page[self::CRAWL_TIME];
                     $current_closed = $close_prefix . $page[self::CRAWL_TIME] . ".txt";
                     if (!file_exists($current_closed)) {
                         //either feed result or from active crawl
                         $has_changeable_results = true;
                         break;
                     }
                 }
                 if ($has_changeable_results) {
                     if ($cached_time > MIN_QUERY_CACHE_TIME) {
                         $results = false;
                     }
                 }
             }
             if (QUERY_STATISTICS) {
                 $this->query_info['QUERY'] .= "{$in2}<b>Cache Lookup Time</b>: " . changeInMicrotime($lookup_time) . "<br />";
             }
             if ($results !== false) {
                 return $results;
             }
         }
     }
     // Cache miss (or caching disabled): pump the query iterator until
     // enough docs are retrieved or it runs out
     $old_to_retrieve = $to_retrieve;
     $query_iterator = $this->getQueryIterator($word_structs, $filter, $raw, $to_retrieve, $queue_servers, $original_query, $save_timestamp_name, $limit_news);
     $num_retrieved = 0;
     $pages = array();
     if (is_object($query_iterator)) {
         while ($num_retrieved < $to_retrieve && is_array($next_docs = $query_iterator->nextDocsWithWord())) {
             $pages += $next_docs;
             $num_retrieved = count($pages);
         }
     }
     if ($save_timestamp_name != "" && ($queue_servers == array() || $this->isSingleLocalhost($queue_servers))) {
         // used for archive crawls of crawl mixes
         $save_file = CRAWL_DIR . '/schedules/' . self::save_point . $save_timestamp_name . ".txt";
         $iterators = $query_iterator->save_iterators;
         $cnt_iterators = count($iterators);
         $save_point = array();
         for ($i = 0; $i < $cnt_iterators; $i++) {
             $save_point[$i] = $iterators[$i]->currentGenDocOffsetWithWord();
         }
         $results["SAVE_POINT"] = $save_point;
         file_put_contents($save_file, serialize($save_point));
         $this->db->setWorldPermissionsRecursive($save_file);
     }
     $pages = array_values($pages);
     $result_count = count($pages);
     $sort_time = 0;
     if ($raw == 0) {
         // initialize scores
         $sort_start = microtime();
         $max_user_ranks = 0;
         for ($i = 0; $i < $result_count; $i++) {
             $pages[$i]["OUT_SCORE"] = 0;
             if (isset($pages[$i][self::USER_RANKS])) {
                 $j = count($pages[$i][self::USER_RANKS]);
                 if ($max_user_ranks < $j) {
                     $max_user_ranks = $j;
                 }
             }
         }
         if ($max_user_ranks > 0) {
             // Pad missing user-rank columns with 0 so every page has
             // USCORE0..USCORE{n-1} fields for the fusion step below
             for ($i = 0; $i < $result_count; $i++) {
                 for ($j = 0; $j < $max_user_ranks; $j++) {
                     if (isset($pages[$i][self::USER_RANKS][$j])) {
                         $pages[$i]["USCORE{$j}"] = $pages[$i][self::USER_RANKS][$j];
                     } else {
                         $pages[$i]["USCORE{$j}"] = 0;
                     }
                 }
             }
         }
         $subscore_fields = array(self::DOC_RANK, self::RELEVANCE);
         if ($use_proximity) {
             $subscore_fields[] = self::PROXIMITY;
         }
         if ($max_user_ranks > 0) {
             for ($j = 0; $j < $max_user_ranks; $j++) {
                 $subscore_fields[] = "USCORE{$j}";
             }
         }
         $num_fields = count($subscore_fields);
         // Compute Reciprocal Rank Fusion Score
         $alpha = 600 / $num_fields;
         if (isset($pages[0])) {
             foreach ($subscore_fields as $field) {
                 // NOTE(review): calling orderCallback with a field name
                 // presumably selects the comparison field the following
                 // usort ranks on -- confirm against orderCallback's source
                 orderCallback($pages[0], $pages[0], $field);
                 usort($pages, "orderCallback");
                 $score = 0;
                 for ($i = 0; $i < $result_count; $i++) {
                     // $score is a dense rank: tied field values share a rank
                     if ($i > 0) {
                         if ($pages[$i - 1][$field] != $pages[$i][$field]) {
                             $score++;
                         }
                     }
                     // 59 + rank: presumably the usual RRF constant k ~= 60
                     $pages[$i]["OUT_SCORE"] += $alpha / (59 + $score);
                 }
             }
             orderCallback($pages[0], $pages[0], "OUT_SCORE");
         }
         usort($pages, "orderCallback");
         if ($use_proximity) {
             for ($i = 0; $i < $result_count; $i++) {
                 $pages[$i][self::SCORE] = $pages[$i]["OUT_SCORE"];
             }
         } else {
             // Proximity wasn't a fusion field; report a neutral value of 1
             for ($i = 0; $i < $result_count; $i++) {
                 $pages[$i][self::PROXIMITY] = 1;
                 $pages[$i][self::SCORE] = $pages[$i]["OUT_SCORE"];
             }
         }
         $sort_time = changeInMicrotime($sort_start);
     }
     if ($num_retrieved < $to_retrieve) {
         $results['TOTAL_ROWS'] = $num_retrieved;
     } else {
         $results['TOTAL_ROWS'] = $query_iterator->num_docs;
         //this is only an approximation
     }
     if ($raw == 1 && $save_timestamp_name == "") {
         // Raw mode: skip summary lookups; just slice out the asked-for window
         $pages = array_slice($pages, $start_slice);
         $pages = array_slice($pages, $limit - $start_slice, $num);
         $results['PAGES'] =& $pages;
         if ($old_to_retrieve != $to_retrieve) {
             $results['HARD_QUERY'] = $old_to_retrieve;
         }
         return $results;
     }
     if (QUERY_STATISTICS) {
         $this->query_info['QUERY'] .= "{$in2}<b>Lookup Offsets Time</b>: " . changeInMicrotime($lookup_time) . "<br />";
         $machine_times = AnalyticsManager::get("MACHINE_TIMES");
         if ($machine_times) {
             $this->query_info['QUERY'] .= "{$in3}<i>Machine Sub-Times</i>:<br />" . $machine_times . "<br />";
         }
         $net_times = AnalyticsManager::get("NET_TIMES");
         $max_machine_times = AnalyticsManager::get("MAX_MACHINE_TIMES");
         if ($net_times && $max_machine_times) {
             $this->query_info['QUERY'] .= "{$in3}<i>Network Overhead Sub-Time</i>: " . ($net_times - $max_machine_times) . "<br />";
         }
         if ($sort_time) {
             $this->query_info['QUERY'] .= "{$in3}<i>Merge-Rank Sub-Time</i>: " . $sort_time . "<br />";
         }
         $summaries_time = microtime();
     }
     // Fetch document summaries, advancing the window through $pages until
     // enough summaries were produced or pages run out
     $get_pages = array_slice($pages, $limit, $num);
     $to_get_count = count($get_pages);
     $groups_with_docs = false;
     if (preg_match("/\\bsite:doc\\b/", $original_query)) {
         $groups_with_docs = true;
     }
     $out_pages = array();
     $cur_limit = $limit;
     while (count($out_pages) < $to_get_count && $get_pages) {
         $out_pages = array_merge($out_pages, $this->getSummariesFromOffsets($get_pages, $queue_servers, $raw, $groups_with_docs));
         if ($save_timestamp_name != "") {
             break;
         }
         $cur_limit += $num;
         $get_pages = array_slice($pages, $cur_limit, $num);
     }
     $out_pages = array_slice($out_pages, 0, $num);
     if (QUERY_STATISTICS) {
         $summary_times_string = AnalyticsManager::get("SUMMARY_TIMES");
         if ($summary_times_string) {
             // Per-machine summary times: report each, plus total minus the
             // sum of per-round maxima as estimated network overhead
             $round_summary_times = unserialize($summary_times_string);
             $summary_delta_time = changeInMicrotime($summaries_time);
             $summary_time_info = "{$summary_delta_time}<br /> {$in4}";
             $sum_max_time = 0;
             foreach ($round_summary_times as $summary_times) {
                 $i = 0;
                 $max_time = 0;
                 foreach ($summary_times as $summary_time) {
                     $summary_time_info .= "ID_{$i}: " . $summary_time . "{$indent}";
                     $max_time = $summary_time > $max_time ? $summary_time : $max_time;
                     $i++;
                 }
                 $sum_max_time += $max_time;
             }
             $net_overhead = $summary_delta_time - $sum_max_time;
             $summary_time_info .= "<br />{$in3}<i>Network Overhead Sub-Time</i>: " . $net_overhead;
         } else {
             $summary_time_info = changeInMicrotime($summaries_time);
         }
         $this->query_info['QUERY'] .= "{$in2}<b>Get Summaries Time</b>: " . $summary_time_info . "<br />";
     }
     $results['PAGES'] =& $out_pages;
     $results['TIME'] = time();
     $lang = guessLocaleFromString($original_query);
     $tokenizer = PhraseParser::getTokenizer($lang);
     //only use tokenizer if no meta word or disjuncts in query
     if (!preg_match('/(\\||\\:)/u', $original_query) && $tokenizer && method_exists($tokenizer, "scoredThesaurusMatches") && method_exists($tokenizer, "tagPartsOfSpeechPhrase") && isset($tokenizer->use_thesaurus)) {
         $results = $this->sortByThesaurusScore($results, $original_query, $lang);
     }
     if (USE_CACHE && $save_timestamp_name == "") {
         $CACHE->set($summary_hash, $results);
     }
     return $results;
 }
Example 4
 /**
  * Computes suggested related phrases from thesaurus based on part of
  * speech  done on each query term.
  *
  * @param string $query query entered by user
  * @param string $lang locale tag for the query
  * @return string array $suggestion consisting of phrases suggested to
  *     be similar in meaning to some sense of the query
  */
 static function getInitialSuggestions($query, $lang)
 {
     $tokenizer = PhraseParser::getTokenizer($lang);
     $pos_query = $tokenizer->tagPartsOfSpeechPhrase($query);
     // Suggestions at least this many characters long are discarded
     $max_len = 25;
     $replacement_phrases = array();
     $suggestions = array();
     $pos_terms = preg_split("/\\s+/", trim($pos_query), -1, PREG_SPLIT_NO_EMPTY);
     $num_pos_terms = count($pos_terms);
     // Parts of speech for which a thesaurus lookup is attempted:
     // noun, verb, adjective, adverb
     $known_word_types = array("NN", "VB", "AJ", "AV");
     for ($i = 0; $i < $num_pos_terms; $i++) {
         // Each tagged term has the form word~TYPE
         $pos = strpos($pos_terms[$i], '~');
         $word_type = trim(substr($pos_terms[$i], $pos + 1));
         if (!in_array($word_type, $known_word_types)) {
             $word_type = "NA";
         }
         $current_word = substr($pos_terms[$i], 0, $pos);
         if ($word_type != "NA") {
             // Keep only the highest scoring sense's phrases for this word
             $similar_phrases = $tokenizer->scoredThesaurusMatches($current_word, $word_type, $query);
             $highest_scoring_sense_phrases = $similar_phrases ? array_shift($similar_phrases) : false;
             if ($highest_scoring_sense_phrases) {
                 $replacement_phrases[$current_word] = $highest_scoring_sense_phrases;
             }
         }
     }
     $i = 0;
     foreach ($replacement_phrases as $words => $similar_phrases) {
         foreach ($similar_phrases as $phrase) {
             if (mb_strpos(trim($phrase), ' ') !== false) {
                 // Strip part-of-speech tags from multi-word phrases
                 $phrase = preg_replace('/~[\\w]+/', '', $phrase);
             }
             /* preg_quote the query word so regex metacharacters in user
                input cannot break, or inject syntax into, the pattern */
             $modified_query = preg_replace('/' . preg_quote($words, '/') . '/', trim($phrase), $query);
             // Keep only short suggestions that actually changed the query
             if (mb_strlen($modified_query) < $max_len && mb_strpos($modified_query, $query) === false) {
                 $suggestions[$i] = $modified_query;
                 $i++;
             }
         }
     }
     return $suggestions;
 }
Example 5
 /**
  * Generates a centroid with which every sentence is ranked with cosine
  * ranking method and also generates a word cloud.
  * @param string $doc complete raw page to generate the summary from.
  * @param string $lang language of the page, used to decide which stop words
  *     and which language-specific tokenizer.php to use.
  *
  * @return array array of summary and word cloud
  */
 static function getCentroidSummary($doc, $lang)
 {
     $doc = self::pageProcessing($doc);
     /* Format the document to remove characters other than periods and
           alphanumerics.
        */
     $formatted_doc = self::formatDoc($doc);
     $stop_obj = PhraseParser::getTokenizer($lang);
     /* Splitting into sentences */
     $out_sentences = self::getSentences($doc);
     $n = count($out_sentences);
     $sentences = array();
     if ($stop_obj && method_exists($stop_obj, "stopwordsRemover")) {
         for ($i = 0; $i < $n; $i++) {
             $sentences[$i] = $stop_obj->stopwordsRemover(self::formatDoc($out_sentences[$i]));
         }
     } else {
         $sentences = $out_sentences;
     }
     /* Splitting into terms; keep only the MAX_DISTINCT_TERMS most
        frequent distinct terms */
     $terms = array();
     foreach ($sentences as $sentence) {
         $terms = array_merge($terms, PhraseParser::segmentSegment($sentence, $lang));
     }
     $terms = array_filter($terms);
     $terms_counts = array_count_values($terms);
     arsort($terms_counts);
     $terms_counts = array_slice($terms_counts, 0, self::MAX_DISTINCT_TERMS);
     $terms = array_unique(array_keys($terms_counts));
     $t = count($terms);
     if ($t == 0) {
         return array("", "");
     }
     /* Initialize Nk array(Number of sentences the term occurs) */
     $nk = array();
     $nk = array_fill(0, $t, 0);
     $nt = array();
     /* Count, for each term, the number of sentences containing it */
     for ($i = 0; $i < $n; $i++) {
         for ($j = 0; $j < $t; $j++) {
             if (strpos($sentences[$i], $terms[$j]) !== false) {
                 $nk[$j]++;
             }
         }
     }
     /* Calculate idf weight of each term: log(n / nk), 0 if term unseen */
     $w = array();
     $idf = array();
     $idf_temp = 0;
     for ($k = 0; $k < $t; $k++) {
         if ($nk[$k] == 0) {
             $idf_temp = 0;
             $tmp = 0;
         } else {
             $idf_temp = $n / $nk[$k];
             $tmp = log($idf_temp);
         }
         $idf[$k] = $tmp;
     }
     /* Count TF for finding centroid */
     $wc = array();
     $max_nt = -1;
     // \b word boundaries don't delimit words in CJK scripts, so drop them
     $b = "\\b";
     if (in_array($lang, array("zh-CN", "ja", "ko"))) {
         $b = "";
     }
     for ($j = 0; $j < $t; $j++) {
         /* preg_quote the term so regex metacharacters in extracted text
            cannot produce an invalid pattern. (The old @-suppressed call
            silently returned false for such terms, zeroing their weight.) */
         $nt = preg_match_all("/{$b}" . preg_quote($terms[$j], "/") . "{$b}/", $formatted_doc, $matches);
         //$matches included for backwards compatibility
         $wc[$j] = $nt * $idf[$j];
         if (is_nan($wc[$j]) || is_infinite($wc[$j])) {
             $wc[$j] = 0;
         }
     }
     /* Calculate centroid: the CENTROID_COMPONENTS highest-weighted terms */
     arsort($wc);
     $centroid = array_slice($wc, 0, self::CENTROID_COMPONENTS, true);
     /* Initializing centroid weight array by 0 */
     $wc = array_fill(0, $t, 0);
     /* Word cloud: top WORD_CLOUD_LEN centroid terms */
     $i = 0;
     $word_cloud = array();
     foreach ($centroid as $key => $value) {
         $wc[$key] = $value;
         if ($i < self::WORD_CLOUD_LEN) {
             $word_cloud[$i] = $terms[$key];
         }
         $i++;
     }
     if (strlen($formatted_doc) < PageProcessor::$max_description_len || $n == 1) {
         //if input short only use above to get a word cloud
         $formatted_doc = substr($formatted_doc, 0, PageProcessor::$max_description_len);
         return array($formatted_doc, $word_cloud);
     }
     ksort($wc);
     /* Calculate cosine similarity between the centroid and each sentence */
     $sim = array();
     for ($i = 0; $i < $n; $i++) {
         $a = $b1 = $b2 = $c1 = $c2 = $d = 0;
         for ($k = 0; $k < $t; $k++) {
             $wck = $wc[$k];
             $idfk = $idf[$k];
             $tmp = substr_count($sentences[$i], $terms[$k]);
             // tf-idf weight of term k in sentence i (log-scaled tf)
             $wik = $tmp > 0 ? $idfk * (1 + log($tmp)) : 0;
             $a += $wik * $wck * $idfk;
             $b1 += $wik * $wik;
             $c1 += $wck * $wck;
         }
         $b2 = sqrt($b1);
         $c2 = sqrt($c1);
         $d = $b2 * $c2;
         if ($d == 0) {
             $sim[$i] = 0;
         } else {
             $sim[$i] = $a / $d;
         }
     }
     arsort($sim);
     /* Getting how many sentences should be there in summary */
     $top = self::summarySentenceCount($out_sentences, $sim);
     $sum_array = array();
     // NOTE(review): slices $top - 1 sentences; looks like a deliberate
     // off-by-one against summarySentenceCount's return -- confirm
     $sum_array = array_slice($sim, 0, $top - 1, true);
     ksort($sum_array);
     /* Emit the chosen sentences in their original document order */
     $summary = '';
     foreach ($sum_array as $key => $value) {
         $summary .= "{$out_sentences[$key]}" . ". ";
     }
     /* Summary of text summarization */
     return array($summary, $word_cloud);
 }