/**
 * Runs before every test case: stores a freshly constructed Russian
 * ("ru") tokenizer under the 'FILE1' key of the fixture's test objects.
 */
function setUp()
{
    $this->test_objects['FILE1'] = PhraseParser::getTokenizer("ru");
}
/**
 * Given an array of pre_terms returns the characters n-grams for the
 * given terms where n is the length Yioop uses for the language in
 * question. If a stemmer is used for language then n-gramming is not
 * done and this just returns the terms unchanged. This method differs
 * from getCharGramsTerm in that it may do checking of certain words and
 * not char gram them. For example, it won't char gram urls.
 *
 * @param array $pre_terms the terms to make n-grams for
 * @param string $lang locale tag to determine n to be used for n-gramming
 *
 * @return array the n-grams for the terms in question
 */
static function charGramTerms($pre_terms, $lang)
{
    global $CHARGRAMS;
    mb_internal_encoding("UTF-8");
    if ($pre_terms == array()) {
        return array();
    }
    if (!isset($CHARGRAMS[$lang])) {
        /* no char-gram table for this language (a stemmer is used
           instead) -- return the terms untouched. The original code
           did `$terms =& $pre_terms` here, a reference to a local
           parameter copy, which had no effect observable by callers. */
        return $pre_terms;
    }
    $terms = array();
    foreach ($pre_terms as $pre_term) {
        if ($pre_term == "") {
            continue;
        }
        if (substr($pre_term, 0, 4) == 'http') {
            $terms[] = $pre_term; // don't chargram urls
            continue;
        }
        // n-gram one term at a time; merging an empty result is a no-op,
        // so no count() guard is needed
        $ngrams = self::getCharGramsTerm(array($pre_term), $lang);
        $terms = array_merge($terms, $ngrams);
    }
    return $terms;
}
/** * Gets doc summaries of documents containing given words and meeting the * additional provided criteria * @param array $word_structs an array of word_structs. Here a word_struct * is an associative array with at least the following fields * KEYS -- an array of word keys * QUOTE_POSITIONS -- an array of positions of words that appeared in * quotes (so need to be matched exactly) * DISALLOW_PHRASES -- an array of words the document must not contain * WEIGHT -- a weight to multiple scores returned from this iterator by * INDEX_NAME -- an index timestamp to get results from * @param int $limit number of first document in order to return * @param int $num number of documents to return summaries of * @param array& $filter an array of hashes of domains to filter from * results * @param bool $use_cache_if_allowed if true and USE_CACHE is true then * an attempt will be made to look up the results in either * the file cache or memcache. Otherwise, items will be recomputed * and then potentially restored in cache * @param int $raw ($raw == 0) normal grouping, ($raw > 0) * no grouping done on data. if ($raw == 1) no lookups of summaries * done * @param array $queue_servers a list of urls of yioop machines which might * be used during lookup * @param string $original_query if set, the original query that corresponds * to $word_structs * @param string $save_timestamp_name if this timestamp is not empty, then * save iterate position, so can resume on future queries that make * use of the timestamp. If used then $limit ignored and get next $num * docs after $save_timestamp 's previous iterate position. 
// Phase overview (NOTE(review)): (1) decide if proximity scoring is needed;
// (2) try the query cache; (3) iterate posting lists to gather candidate pages;
// (4) optionally persist the iterator save point; (5) fuse sub-scores via
// Reciprocal Rank Fusion; (6) fetch summaries and record query statistics.
 * @param bool $limit_news if true the number of media:news items to * allow in search results is limited to WordIterator::LIMIT_NEWS_COUNT * * @return array document summaries */ function getSummariesByHash($word_structs, $limit, $num, &$filter, $use_cache_if_allowed = true, $raw = 0, $queue_servers = array(), $original_query = "", $save_timestamp_name = "", $limit_news = true) { global $CACHE; $indent = " "; $in2 = $indent . $indent; $in3 = $in2 . $indent; $in4 = $in2 . $in2; if (QUERY_STATISTICS) { $lookup_time = microtime(); } $use_proximity = false; $time = time(); if (count($word_structs) > 1 || isset($word_structs[0]["KEYS"]) && count($word_structs[0]["KEYS"]) > 1 || $word_structs == array() && substr_count($original_query, " ") > 1) { $use_proximity = true; } if (!isset($filter['time'])) { $filter['time'] = 0; } $filter_time = $filter['time']; unset($filter['time']); //iterators don't expect time field $pages = array(); $generation = 0; $to_retrieve = ceil(($limit + $num) / self::NUM_CACHE_PAGES) * self::NUM_CACHE_PAGES; $start_slice = floor($limit / self::NUM_CACHE_PAGES) * self::NUM_CACHE_PAGES; if ($save_timestamp_name != "") { $to_retrieve = $num; $limit = 0; $start_slice = 0; } if (USE_CACHE && $save_timestamp_name == "") { $mem_tmp = serialize($raw) . serialize($word_structs) . $original_query . $this->index_name; $summary_hash = crawlHash($mem_tmp . ":" . $limit . ":" . $num); if ($use_cache_if_allowed) { $cache_success = true; $results = $CACHE->get($summary_hash); if (!isset($results['TIME']) || $filter_time > $results['TIME']) { //if filter has changed since cached, then invalidate cache $results = false; } if (isset($results['TIME'])) { $cached_time = $time - $results['TIME']; } else { $cached_time = $time; } if ($cached_time > MAX_QUERY_CACHE_TIME) { $results = false; } if (isset($results['PAGES'])) { $close_prefix = WORK_DIRECTORY . "/schedules/" . 
// A "closed" marker file exists once a crawl is finished. If any cached page
// came from a crawl with no such marker (still-active crawl or a feed item),
// its results can still change, so a stricter cache lifetime is applied below.
self::index_closed_name; $has_changeable_results = false; $seen_times = array(); foreach ($results['PAGES'] as $page) { if (!isset($page[self::CRAWL_TIME]) || in_array($page[self::CRAWL_TIME], $seen_times)) { continue; } $seen_times[] = $page[self::CRAWL_TIME]; $current_closed = $close_prefix . $page[self::CRAWL_TIME] . ".txt"; if (!file_exists($current_closed)) { //either feed result or from active crawl $has_changeable_results = true; break; } } if ($has_changeable_results) { if ($cached_time > MIN_QUERY_CACHE_TIME) { $results = false; } } } if (QUERY_STATISTICS) { $this->query_info['QUERY'] .= "{$in2}<b>Cache Lookup Time</b>: " . changeInMicrotime($lookup_time) . "<br />"; } if ($results !== false) { return $results; } } } $old_to_retrieve = $to_retrieve; $query_iterator = $this->getQueryIterator($word_structs, $filter, $raw, $to_retrieve, $queue_servers, $original_query, $save_timestamp_name, $limit_news); $num_retrieved = 0; $pages = array(); if (is_object($query_iterator)) { while ($num_retrieved < $to_retrieve && is_array($next_docs = $query_iterator->nextDocsWithWord())) { $pages += $next_docs; $num_retrieved = count($pages); } } if ($save_timestamp_name != "" && ($queue_servers == array() || $this->isSingleLocalhost($queue_servers))) { // used for archive crawls of crawl mixes $save_file = CRAWL_DIR . '/schedules/' . self::save_point . $save_timestamp_name . 
// Persist each sub-iterator's (generation, doc offset) position so a later
// query with the same timestamp can resume iteration where this one stopped.
".txt"; $iterators = $query_iterator->save_iterators; $cnt_iterators = count($iterators); $save_point = array(); for ($i = 0; $i < $cnt_iterators; $i++) { $save_point[$i] = $iterators[$i]->currentGenDocOffsetWithWord(); } $results["SAVE_POINT"] = $save_point; file_put_contents($save_file, serialize($save_point)); $this->db->setWorldPermissionsRecursive($save_file); } $pages = array_values($pages); $result_count = count($pages); $sort_time = 0; if ($raw == 0) { // initialize scores $sort_start = microtime(); $max_user_ranks = 0; for ($i = 0; $i < $result_count; $i++) { $pages[$i]["OUT_SCORE"] = 0; if (isset($pages[$i][self::USER_RANKS])) { $j = count($pages[$i][self::USER_RANKS]); if ($max_user_ranks < $j) { $max_user_ranks = $j; } } } if ($max_user_ranks > 0) { for ($i = 0; $i < $result_count; $i++) { for ($j = 0; $j < $max_user_ranks; $j++) { if (isset($pages[$i][self::USER_RANKS][$j])) { $pages[$i]["USCORE{$j}"] = $pages[$i][self::USER_RANKS][$j]; } else { $pages[$i]["USCORE{$j}"] = 0; } } } } $subscore_fields = array(self::DOC_RANK, self::RELEVANCE); if ($use_proximity) { $subscore_fields[] = self::PROXIMITY; } if ($max_user_ranks > 0) { for ($j = 0; $j < $max_user_ranks; $j++) { $subscore_fields[] = "USCORE{$j}"; } } $num_fields = count($subscore_fields); // Compute Reciprocal Rank Fusion Score $alpha = 600 / $num_fields; if (isset($pages[0])) { foreach ($subscore_fields as $field) { orderCallback($pages[0], $pages[0], $field); usort($pages, "orderCallback"); $score = 0; for ($i = 0; $i < $result_count; $i++) { if ($i > 0) { if ($pages[$i - 1][$field] != $pages[$i][$field]) { $score++; } } $pages[$i]["OUT_SCORE"] += $alpha / (59 + $score); } } orderCallback($pages[0], $pages[0], "OUT_SCORE"); } usort($pages, "orderCallback"); if ($use_proximity) { for ($i = 0; $i < $result_count; $i++) { $pages[$i][self::SCORE] = $pages[$i]["OUT_SCORE"]; } } else { for ($i = 0; $i < $result_count; $i++) { $pages[$i][self::PROXIMITY] = 1; $pages[$i][self::SCORE] = 
// When proximity was not a fusion component, PROXIMITY is pinned to 1 and the
// fused RRF value becomes the final SCORE for every page.
$pages[$i]["OUT_SCORE"]; } } $sort_time = changeInMicrotime($sort_start); } if ($num_retrieved < $to_retrieve) { $results['TOTAL_ROWS'] = $num_retrieved; } else { $results['TOTAL_ROWS'] = $query_iterator->num_docs; //this is only an approximation } if ($raw == 1 && $save_timestamp_name == "") { $pages = array_slice($pages, $start_slice); $pages = array_slice($pages, $limit - $start_slice, $num); $results['PAGES'] =& $pages; if ($old_to_retrieve != $to_retrieve) { $results['HARD_QUERY'] = $old_to_retrieve; } return $results; } if (QUERY_STATISTICS) { $this->query_info['QUERY'] .= "{$in2}<b>Lookup Offsets Time</b>: " . changeInMicrotime($lookup_time) . "<br />"; $machine_times = AnalyticsManager::get("MACHINE_TIMES"); if ($machine_times) { $this->query_info['QUERY'] .= "{$in3}<i>Machine Sub-Times</i>:<br />" . $machine_times . "<br />"; } $net_times = AnalyticsManager::get("NET_TIMES"); $max_machine_times = AnalyticsManager::get("MAX_MACHINE_TIMES"); if ($net_times && $max_machine_times) { $this->query_info['QUERY'] .= "{$in3}<i>Network Overhead Sub-Time</i>: " . ($net_times - $max_machine_times) . "<br />"; } if ($sort_time) { $this->query_info['QUERY'] .= "{$in3}<i>Merge-Rank Sub-Time</i>: " . $sort_time . 
// Summary fetch loop: keep pulling further slices of $pages until $num
// summaries have been successfully looked up (some offsets may fail).
"<br />"; } $summaries_time = microtime(); } $get_pages = array_slice($pages, $limit, $num); $to_get_count = count($get_pages); $groups_with_docs = false; if (preg_match("/\\bsite:doc\\b/", $original_query)) { $groups_with_docs = true; } $out_pages = array(); $cur_limit = $limit; while (count($out_pages) < $to_get_count && $get_pages) { $out_pages = array_merge($out_pages, $this->getSummariesFromOffsets($get_pages, $queue_servers, $raw, $groups_with_docs)); if ($save_timestamp_name != "") { break; } $cur_limit += $num; $get_pages = array_slice($pages, $cur_limit, $num); } $out_pages = array_slice($out_pages, 0, $num); if (QUERY_STATISTICS) { $summary_times_string = AnalyticsManager::get("SUMMARY_TIMES"); if ($summary_times_string) { $round_summary_times = unserialize($summary_times_string); $summary_delta_time = changeInMicrotime($summaries_time); $summary_time_info = "{$summary_delta_time}<br /> {$in4}"; $sum_max_time = 0; foreach ($round_summary_times as $summary_times) { $i = 0; $max_time = 0; foreach ($summary_times as $summary_time) { $summary_time_info .= "ID_{$i}: " . $summary_time . "{$indent}"; $max_time = $summary_time > $max_time ? $summary_time : $max_time; $i++; } $sum_max_time += $max_time; } $net_overhead = $summary_delta_time - $sum_max_time; $summary_time_info .= "<br />{$in3}<i>Network Overhead Sub-Time</i>: " . $net_overhead; } else { $summary_time_info = changeInMicrotime($summaries_time); } $this->query_info['QUERY'] .= "{$in2}<b>Get Summaries Time</b>: " . $summary_time_info . 
// Final assembly: optionally re-rank via thesaurus (only for plain queries
// with no meta words '|' or ':'), then store the result set in the cache.
"<br />"; } $results['PAGES'] =& $out_pages; $results['TIME'] = time(); $lang = guessLocaleFromString($original_query); $tokenizer = PhraseParser::getTokenizer($lang); //only use tokenizer if no meta word or disjuncts in query if (!preg_match('/(\\||\\:)/u', $original_query) && $tokenizer && method_exists($tokenizer, "scoredThesaurusMatches") && method_exists($tokenizer, "tagPartsOfSpeechPhrase") && isset($tokenizer->use_thesaurus)) { $results = $this->sortByThesaurusScore($results, $original_query, $lang); } if (USE_CACHE && $save_timestamp_name == "") { $CACHE->set($summary_hash, $results); } return $results; }
/**
 * Computes suggested related phrases from thesaurus based on part of
 * speech tagging done on each query term.
 *
 * @param string $query query entered by user
 * @param string $lang locale tag for the query
 * @return array $suggestions consisting of phrases suggested to
 *     be similar in meaning to some sense of the query
 */
static function getInitialSuggestions($query, $lang)
{
    $tokenizer = PhraseParser::getTokenizer($lang);
    // tag each query term with its part of speech (terms come back as
    // word~TAG pairs separated by whitespace)
    $pos_query = $tokenizer->tagPartsOfSpeechPhrase($query);
    $max_len = 25; // suggestions at least this long are discarded
    $replacement_phrases = array();
    $suggestions = array();
    $pos_terms = preg_split("/\s+/", trim($pos_query), -1,
        PREG_SPLIT_NO_EMPTY);
    $num_pos_terms = count($pos_terms);
    // only nouns, verbs, adjectives, adverbs get thesaurus lookups
    $known_word_types = array("NN", "VB", "AJ", "AV");
    for ($i = 0; $i < $num_pos_terms; $i++) {
        $pos = strpos($pos_terms[$i], '~');
        $word_type = trim(substr($pos_terms[$i], $pos + 1));
        if (!in_array($word_type, $known_word_types)) {
            $word_type = "NA";
        }
        $current_word = substr($pos_terms[$i], 0, $pos);
        if ($word_type != "NA") {
            $similar_phrases = $tokenizer->scoredThesaurusMatches(
                $current_word, $word_type, $query);
            // scoredThesaurusMatches groups phrases by word sense;
            // keep only the highest scoring sense's phrases
            $highest_scoring_sense_phrases = $similar_phrases ?
                array_shift($similar_phrases) : false;
            if ($highest_scoring_sense_phrases) {
                $replacement_phrases[$current_word] =
                    $highest_scoring_sense_phrases;
            }
        }
    }
    $i = 0;
    foreach ($replacement_phrases as $words => $similar_phrases) {
        foreach ($similar_phrases as $phrase) {
            if (mb_strpos(trim($phrase), ' ') !== false) {
                // strip part-of-speech tags from multi-word phrases
                $phrase = preg_replace('/~[\w]+/', '', $phrase);
            }
            /* Bug fix: the query term used to be interpolated into the
               pattern unescaped, so terms containing regex
               metacharacters (e.g. "c++", "1.5") corrupted the pattern.
               preg_quote() makes the term match literally. */
            $modified_query = preg_replace('/' .
                preg_quote($words, '/') . '/', trim($phrase), $query);
            if (mb_strlen($modified_query) < $max_len &&
                mb_strpos($modified_query, $query) === false) {
                $suggestions[$i] = $modified_query;
                $i++;
            }
        }
    }
    return $suggestions;
}
/** * Generates a centroid with which every sentence is ranked with cosine * ranking method and also generates a word cloud. * @param string $doc complete raw page to generate the summary from. * @param string $lang language of the page to decide which stop words to * call proper tokenizer.php of the specified language. * * @return array array of summary and word cloud */ static function getCentroidSummary($doc, $lang) { $doc = self::pageProcessing($doc); /* Format the document to remove characters other than periods and alphanumerics. */ $formatted_doc = self::formatDoc($doc); $stop_obj = PhraseParser::getTokenizer($lang); /* Splitting into sentences */ $out_sentences = self::getSentences($doc); $n = count($out_sentences); $sentences = array(); if ($stop_obj && method_exists($stop_obj, "stopwordsRemover")) { for ($i = 0; $i < $n; $i++) { $sentences[$i] = $stop_obj->stopwordsRemover(self::formatDoc($out_sentences[$i])); } } else { $sentences = $out_sentences; } /* Splitting into terms */ $terms = array(); foreach ($sentences as $sentence) { $terms = array_merge($terms, PhraseParser::segmentSegment($sentence, $lang)); } $terms = array_filter($terms); $terms_counts = array_count_values($terms); arsort($terms_counts); $terms_counts = array_slice($terms_counts, 0, self::MAX_DISTINCT_TERMS); $terms = array_unique(array_keys($terms_counts)); $t = count($terms); if ($t == 0) { return array("", ""); } /* Initialize Nk array(Number of sentences the term occurs) */ $nk = array(); $nk = array_fill(0, $t, 0); $nt = array(); /* Count TF for each word */ for ($i = 0; $i < $n; $i++) { for ($j = 0; $j < $t; $j++) { if (strpos($sentences[$i], $terms[$j]) !== false) { $nk[$j]++; } } } /* Calculate weights of each term for every sentence */ $w = array(); $idf = array(); $idf_temp = 0; for ($k = 0; $k < $t; $k++) { if ($nk[$k] == 0) { $idf_temp = 0; $tmp = 0; } else { $idf_temp = $n / $nk[$k]; $tmp = log($idf_temp); } $idf[$k] = $tmp; } /* Count TF for finding centroid */ $wc = 
// NOTE(review): $wc[$j] below is (whole-document term frequency) * idf; word
// boundaries \b are dropped for zh-CN/ja/ko since \b is unreliable for CJK
// text. nan/infinite products are zeroed out. The top CENTROID_COMPONENTS
// weights form the centroid; each sentence is then scored by cosine
// similarity against it, and the best sentences make up the summary.
array(); $max_nt = -1; $b = "\\b"; if (in_array($lang, array("zh-CN", "ja", "ko"))) { $b = ""; } for ($j = 0; $j < $t; $j++) { $nt = @preg_match_all("/{$b}{$terms[$j]}{$b}/", $formatted_doc, $matches); //$matches included for backwards compatibility $wc[$j] = $nt * $idf[$j]; if (is_nan($wc[$j]) || is_infinite($wc[$j])) { $wc[$j] = 0; } } /* Calculate centroid */ arsort($wc); $centroid = array_slice($wc, 0, self::CENTROID_COMPONENTS, true); /* Initializing centroid weight array by 0 */ $wc = array_fill(0, $t, 0); /* Word cloud */ $i = 0; $word_cloud = array(); foreach ($centroid as $key => $value) { $wc[$key] = $value; if ($i < self::WORD_CLOUD_LEN) { $word_cloud[$i] = $terms[$key]; } $i++; } if (strlen($formatted_doc) < PageProcessor::$max_description_len || $n == 1) { //if input short only use above to get a word cloud $formatted_doc = substr($formatted_doc, 0, PageProcessor::$max_description_len); return array($formatted_doc, $word_cloud); } ksort($wc); /* Calculate similarity measure between centroid and each sentence */ $sim = array(); for ($i = 0; $i < $n; $i++) { $a = $b1 = $b2 = $c1 = $c2 = $d = 0; for ($k = 0; $k < $t; $k++) { $wck = $wc[$k]; $idfk = $idf[$k]; $tmp = substr_count($sentences[$i], $terms[$k]); $wik = $tmp > 0 ? $idfk * (1 + log($tmp)) : 0; $a += $wik * $wck * $idfk; $b1 += $wik * $wik; $c1 += $wck * $wck; } $b2 = sqrt($b1); $c2 = sqrt($c1); $d = $b2 * $c2; if ($d == 0) { $sim[$i] = 0; } else { $sim[$i] = $a / $d; } } arsort($sim); /* Getting how many sentences should be there in summary */ $top = self::summarySentenceCount($out_sentences, $sim); $sum_array = array(); $sum_array = array_slice($sim, 0, $top - 1, true); ksort($sum_array); /* Printing Summary */ $summary = ''; foreach ($sum_array as $key => $value) { $summary .= "{$out_sentences[$key]}" . ". "; } /* Summary of text summarization */ return array($summary, $word_cloud); }