Example #1
0
 /**
  * Each test we set up a new Russian Tokenizer object
  */
 function setUp()
 {
     // Build a fresh Russian tokenizer before each test and stash it
     // under the 'FILE1' key of the shared test-object map.
     $russian_tokenizer = PhraseParser::getTokenizer("ru");
     $this->test_objects['FILE1'] = $russian_tokenizer;
 }
Example #2
0
 /**
  * Computes similar words and scores from WordNet output based on word type.
  *
  * Runs the WordNet command-line tool ("-over" overview) on $term, splits
  * the output into the senses for the requested part of speech, and scores
  * each sense by the similarity of its example sentences to $whole_query.
  *
  * @param string $term term to find related thesaurus terms
  * @param string $word_type is the type of word such as "NN" (noun),
  *     "VB" (verb), "AJ" (adjective), or "AV" (adverb)
  *     (all other types will be ignored)
  * @param string $whole_query the original query $term came from
  * @return array a sequence of
  *     (score => array of thesaurus terms) associations. The score
  *     representing one word sense of term
  */
 static function scoredThesaurusMatches($term, $word_type, $whole_query)
 {
     $word_map = array("VB" => "verb", "NN" => "noun", "AJ" => "adj", "AV" => "adv");
     /* Gets overview of senses of $term into $data. $term derives from a
        user query, so it must be shell-escaped to prevent command
        injection (previously interpolated raw into the command line). */
     exec(WORDNET_EXEC . " " . escapeshellarg($term) . " -over", $data);
     if (!$data || !isset($word_map[$word_type])) {
         return NULL;
     }
     $full_name = $word_map[$word_type];
     $lexicon_output = implode("\n", $data);
     // Cut the overview at the section header for the requested word type
     $sense_parts = preg_split("/\\bThe\\s{$full_name}" . '[^\\n]*\\n\\n/', $lexicon_output);
     if (!isset($sense_parts[1])) {
         return NULL;
     }
     // Keep only this word type's senses (drop any following overview block)
     list($sense, ) = preg_split("/\\bOverview\\sof\\s/", $sense_parts[1]);
     // Each numbered item ("1. ", "2. ", ...) is one definition/sense
     $definitions_for_sense = preg_split("/\\d+\\.\\s/", $sense, -1, PREG_SPLIT_NO_EMPTY);
     $num_definitions = count($definitions_for_sense);
     $similar_phrases = array();
     for ($i = 0; $i < $num_definitions; $i++) {
         //get sentence fragments examples of using that definition
         preg_match_all('/\\"(.*?)\\"/', $definitions_for_sense[$i], $matches);
         // to separate out the words
         preg_match('/[\\w+\\s\\,\\.\']+\\s\\-+/', $definitions_for_sense[$i], $match_word);
         $thesaurus_phrases = preg_split("/\\s*\\,\\s*/", strtolower(rtrim(trim($match_word[0]), "-")));
         //remove original term from thesaurus phrases if present
         $m = 0;
         foreach ($thesaurus_phrases as $thesaurus_phrase) {
             $tphrase = trim($thesaurus_phrase);
             if ($tphrase == trim($term)) {
                 unset($thesaurus_phrases[$m]);
             }
             $m++;
         }
         $thesaurus_phrases = array_filter($thesaurus_phrases);
         if ($thesaurus_phrases == array()) {
             continue;
         }
         $num_example_sentences = count($matches[1]);
         $score = array();
         for ($j = 0; $j < $num_example_sentences; $j++) {
             $query_parts = explode(' ', strtolower($whole_query));
             $example_sentence_parts = explode(' ', strtolower($matches[1][$j]));
             $score[$j] = PhraseParser::getCosineRank($query_parts, $example_sentence_parts);
             /*  If Cosine similarity is zero then go for
              * intersection similarity ranking
              */
             if ($score[$j] == 0) {
                 $score[$j] = PhraseParser::getIntersection($query_parts, $example_sentence_parts);
             }
         }
         /*  We use the rounded average of the above times 100 as a score
                score for a definition. To avoid ties we store in the low
                order digits 99 - the definition it was
             */
         if ($num_example_sentences > 0) {
             $definition_score = 100 * round(100 * (array_sum($score) / $num_example_sentences)) + (99 - $i);
         } else {
             $definition_score = 99 - $i;
         }
         $similar_phrases[$definition_score] = $thesaurus_phrases;
     }
     // Highest-scoring senses first
     krsort($similar_phrases);
     return $similar_phrases;
 }
Example #3
0
 /**
  * A word segmenter.
  * Such a segmenter on input thisisabunchofwords would output
  * this is a bunch of words
  *
  * @param string $pre_segment  before segmentation
  * @return string with words separated by space
  */
 static function segment($pre_segment)
 {
     // Delegate to the shared reverse-maximal-match segmenter using the
     // Simplified Chinese locale dictionary.
     $segmented = PhraseParser::reverseMaximalMatch($pre_segment, "zh-CN");
     return $segmented;
 }
Example #4
0
 /**
  * Handles admin request related to controlling file options to be used
  * in a crawl
  *
  * This activity allows a user to specify the page range size to be
  * be used during a crawl as well as which file types can be downloaded
  */
 function pageOptions()
 {
     global $INDEXED_FILE_TYPES;
     /* get processors for different file types (populating
        $INDEXED_FILE_TYPES) */
     foreach (glob(BASE_DIR . "/lib/processors/*_processor.php") as $filename) {
         require_once $filename;
     }
     // $data accumulates everything the pageoptions view needs and is
     // returned at the end of this method
     $parent = $this->parent;
     $crawl_model = $parent->model("crawl");
     $profile_model = $parent->model("profile");
     $data["ELEMENT"] = "pageoptions";
     $data['SCRIPT'] = "";
     /* Determine queue servers; a single localhost-only queue server is
        treated the same as no remote machines (NULL => local operation) */
     $machine_urls = $parent->model("machine")->getQueueServerUrls();
     $num_machines = count($machine_urls);
     if ($num_machines < 1 || $num_machines == 1 && UrlParser::isLocalhostUrl($machine_urls[0])) {
         $machine_urls = NULL;
     }
     // Options drop-down: the two fixed choices plus one entry per
     // previous crawl (keyed by crawl timestamp) whose settings can be loaded
     $data['available_options'] = array(tl('crawl_component_use_below'), tl('crawl_component_use_defaults'));
     $crawls = $crawl_model->getCrawlList(false, true, $machine_urls);
     $data['options_default'] = tl('crawl_component_use_below');
     foreach ($crawls as $crawl) {
         if (strlen($crawl['DESCRIPTION']) > 0) {
             $data['available_options'][$crawl['CRAWL_TIME']] = $crawl['DESCRIPTION'];
         }
     }
     $seed_info = $crawl_model->getSeedInfo();
     // Fixed choice lists presented by the form
     $data['RECRAWL_FREQS'] = array(-1 => tl('crawl_component_recrawl_never'), 1 => tl('crawl_component_recrawl_1day'), 2 => tl('crawl_component_recrawl_2day'), 3 => tl('crawl_component_recrawl_3day'), 7 => tl('crawl_component_recrawl_7day'), 14 => tl('crawl_component_recrawl_14day'));
     $data['SIZE_VALUES'] = array(10000 => 10000, 50000 => 50000, 100000 => 100000, 500000 => 500000, 1000000 => 1000000, 5000000 => 5000000, 10000000 => 10000000);
     $data['LEN_VALUES'] = array(2000 => 2000, 10000 => 10000, 50000 => 50000, 100000 => 100000, 500000 => 500000, 1000000 => 1000000, 5000000 => 5000000, 10000000 => 10000000);
     $data['available_summarizers'] = array(self::BASIC_SUMMARIZER => tl('crawl_component_basic'), self::CENTROID_SUMMARIZER => tl('crawl_component_centroid'));
     if (!isset($seed_info["indexed_file_types"]["extensions"])) {
         $seed_info["indexed_file_types"]["extensions"] = $INDEXED_FILE_TYPES;
     }
     $loaded = false;
     /* load_option == 1 means "load the default seed"; any larger value is
        interpreted as the timestamp of a previous crawl whose seed/config
        should be copied into the current settings */
     if (isset($_REQUEST['load_option']) && $_REQUEST['load_option'] > 0) {
         if ($_REQUEST['load_option'] == 1) {
             $seed_loaded = $crawl_model->getSeedInfo(true);
         } else {
             $timestamp = substr($parent->clean($_REQUEST['load_option'], "int"), 0, TIMESTAMP_LEN);
             $seed_loaded = $crawl_model->getCrawlSeedInfo($timestamp, $machine_urls);
         }
         // Only these fields are copied over from the loaded seed
         $copy_options = array("general" => array("page_recrawl_frequency", "page_range_request", "max_description_len", "cache_pages", 'summarizer_option'), "indexed_file_types" => array("extensions"), "indexing_plugins" => array("plugins", "plugins_data"));
         foreach ($copy_options as $main_option => $sub_options) {
             foreach ($sub_options as $sub_option) {
                 if (isset($seed_loaded[$main_option][$sub_option])) {
                     $seed_info[$main_option][$sub_option] = $seed_loaded[$main_option][$sub_option];
                 }
             }
         }
         if (isset($seed_loaded['page_rules'])) {
             $seed_info['page_rules'] = $seed_loaded['page_rules'];
         }
         if (isset($seed_loaded['active_classifiers'])) {
             $seed_info['active_classifiers'] = $seed_loaded['active_classifiers'];
         } else {
             $seed_info['active_classifiers'] = array();
             $seed_info['active_classifiers']['label'] = array();
         }
         $loaded = true;
     } else {
         /* No load request: start from the stored seed and overlay any
            validated values posted from the form */
         $seed_info = $crawl_model->getSeedInfo();
         if (isset($_REQUEST["page_recrawl_frequency"]) && in_array($_REQUEST["page_recrawl_frequency"], array_keys($data['RECRAWL_FREQS']))) {
             $seed_info["general"]["page_recrawl_frequency"] = $_REQUEST["page_recrawl_frequency"];
         }
         if (isset($_REQUEST["page_range_request"]) && in_array($_REQUEST["page_range_request"], $data['SIZE_VALUES'])) {
             $seed_info["general"]["page_range_request"] = $_REQUEST["page_range_request"];
         }
         if (isset($_REQUEST['summarizer_option']) && in_array($_REQUEST['summarizer_option'], array_keys($data['available_summarizers']))) {
             $seed_info['general']['summarizer_option'] = $_REQUEST['summarizer_option'];
         }
         if (isset($_REQUEST["max_description_len"]) && in_array($_REQUEST["max_description_len"], $data['LEN_VALUES'])) {
             $seed_info["general"]["max_description_len"] = $_REQUEST["max_description_len"];
         }
         if (isset($_REQUEST["cache_pages"])) {
             $seed_info["general"]["cache_pages"] = true;
         } else {
             if (isset($_REQUEST['posted'])) {
                 //form sent but check box unchecked
                 $seed_info["general"]["cache_pages"] = false;
             }
         }
         if (isset($_REQUEST['page_rules'])) {
             $seed_info['page_rules']['rule'] = $parent->convertStringCleanArray($_REQUEST['page_rules'], 'rule');
         }
     }
     // Fill in defaults for any settings still unset, then mirror them
     // into $data for the view
     if (!isset($seed_info["general"]["page_recrawl_frequency"])) {
         $seed_info["general"]["page_recrawl_frequency"] = PAGE_RECRAWL_FREQUENCY;
     }
     // NOTE(review): unlike the other settings, summarizer_option is read
     // without an isset() default — may raise a notice if absent; confirm
     $data['summarizer_option'] = $seed_info['general']['summarizer_option'];
     $data['PAGE_RECRAWL_FREQUENCY'] = $seed_info["general"]["page_recrawl_frequency"];
     if (!isset($seed_info["general"]["cache_pages"])) {
         $seed_info["general"]["cache_pages"] = false;
     }
     $data["CACHE_PAGES"] = $seed_info["general"]["cache_pages"];
     if (!isset($seed_info["general"]["page_range_request"])) {
         $seed_info["general"]["page_range_request"] = PAGE_RANGE_REQUEST;
     }
     $data['PAGE_SIZE'] = $seed_info["general"]["page_range_request"];
     if (!isset($seed_info["general"]["max_description_len"])) {
         $seed_info["general"]["max_description_len"] = MAX_DESCRIPTION_LEN;
     }
     $data['MAX_LEN'] = $seed_info["general"]["max_description_len"];
     // Indexing plugins: checkbox state from the form (unless a seed was
     // just loaded), then per-plugin configuration handling
     $data['INDEXING_PLUGINS'] = array();
     $included_plugins = array();
     if (isset($_REQUEST["posted"]) && !$loaded) {
         $seed_info['indexing_plugins']['plugins'] = isset($_REQUEST["INDEXING_PLUGINS"]) ? $_REQUEST["INDEXING_PLUGINS"] : array();
     }
     $included_plugins = isset($seed_info['indexing_plugins']['plugins']) ? $seed_info['indexing_plugins']['plugins'] : array();
     foreach ($parent->indexing_plugins as $plugin) {
         $plugin_name = ucfirst($plugin);
         $data['INDEXING_PLUGINS'][$plugin_name]['checked'] = in_array($plugin_name, $included_plugins) ? "checked='checked'" : "";
         /* to use method_exists we want that the require_once for the plugin
              class has occurred so we instantiate the object via the plugin
              method call which will also do the require if needed.
            */
         $plugin_object = $parent->plugin(lcfirst($plugin_name));
         $class_name = $plugin_name . "Plugin";
         if ($loaded && method_exists($class_name, 'setConfiguration') && method_exists($class_name, 'loadDefaultConfiguration')) {
             // Push loaded (or default) configuration into the plugin and persist it
             if (isset($seed_info['indexing_plugins']['plugins_data'][$plugin_name])) {
                 $plugin_object->setConfiguration($seed_info['indexing_plugins']['plugins_data'][$plugin_name]);
             } else {
                 $plugin_object->loadDefaultConfiguration();
             }
             $plugin_object->saveConfiguration();
         }
         if (method_exists($class_name, 'configureHandler') && method_exists($class_name, 'configureView')) {
             $data['INDEXING_PLUGINS'][$plugin_name]['configure'] = true;
             $plugin_object->configureHandler($data);
         } else {
             $data['INDEXING_PLUGINS'][$plugin_name]['configure'] = false;
         }
     }
     // Profile-backed search-time weight settings; $change tracks whether
     // the profile needs to be written back
     $profile = $profile_model->getProfile(WORK_DIRECTORY);
     if (!isset($_REQUEST['load_option'])) {
         $data = array_merge($data, $profile);
     } else {
         $parent->updateProfileFields($data, $profile, array('IP_LINK', 'CACHE_LINK', 'SIMILAR_LINK', 'IN_LINK', 'SIGNIN_LINK', 'SUBSEARCH_LINK', 'WORD_SUGGEST'));
     }
     $weights = array('TITLE_WEIGHT' => 4, 'DESCRIPTION_WEIGHT' => 1, 'LINK_WEIGHT' => 2, 'MIN_RESULTS_TO_GROUP' => 200, 'SERVER_ALPHA' => 1.6);
     $change = false;
     foreach ($weights as $weight => $value) {
         if (isset($_REQUEST[$weight])) {
             // Posted value wins (cleaned as float, minimum 1)
             $data[$weight] = $parent->clean($_REQUEST[$weight], 'float', 1);
             $profile[$weight] = $data[$weight];
             $change = true;
         } else {
             if (isset($profile[$weight]) && $profile[$weight] != "") {
                 $data[$weight] = $profile[$weight];
             } else {
                 // Fall back to the hard-coded default and persist it
                 $data[$weight] = $value;
                 $profile[$weight] = $data[$weight];
                 $change = true;
             }
         }
     }
     if ($change == true) {
         $profile_model->updateProfile(WORK_DIRECTORY, array(), $profile);
     }
     // File-type checkboxes: posted state (unless a seed was loaded)
     // otherwise the state stored in the seed info
     $data['INDEXED_FILE_TYPES'] = array();
     $filetypes = array();
     foreach ($INDEXED_FILE_TYPES as $filetype) {
         $ison = false;
         if (isset($_REQUEST["filetype"]) && !$loaded) {
             if (isset($_REQUEST["filetype"][$filetype])) {
                 $filetypes[] = $filetype;
                 $ison = true;
                 $change = true;
             }
         } else {
             if (in_array($filetype, $seed_info["indexed_file_types"]["extensions"])) {
                 $filetypes[] = $filetype;
                 $ison = true;
             }
         }
         $data['INDEXED_FILE_TYPES'][$filetype] = $ison ? "checked='checked'" : '';
     }
     $seed_info["indexed_file_types"]["extensions"] = $filetypes;
     // Classifier and ranker checkboxes follow the same posted-vs-stored
     // pattern as the file types above
     $data['CLASSIFIERS'] = array();
     $data['RANKERS'] = array();
     $active_classifiers = array();
     $active_rankers = array();
     foreach (Classifier::getClassifierList() as $classifier) {
         $label = $classifier->class_label;
         $ison = false;
         if (isset($_REQUEST['classifier']) && !$loaded) {
             if (isset($_REQUEST['classifier'][$label])) {
                 $ison = true;
             }
         } else {
             if ($loaded || !isset($_REQUEST['posted']) && isset($seed_info['active_classifiers']['label'])) {
                 if (in_array($label, $seed_info['active_classifiers']['label'])) {
                     $ison = true;
                 }
             }
         }
         if ($ison) {
             $data['CLASSIFIERS'][$label] = 'checked="checked"';
             $active_classifiers[] = $label;
         } else {
             $data['CLASSIFIERS'][$label] = '';
         }
         $ison = false;
         if (isset($_REQUEST['ranker']) && !$loaded) {
             if (isset($_REQUEST['ranker'][$label])) {
                 $ison = true;
             }
         } else {
             if ($loaded || !isset($_REQUEST['posted']) && isset($seed_info['active_rankers']['label'])) {
                 if (isset($seed_info['active_rankers']['label']) && in_array($label, $seed_info['active_rankers']['label'])) {
                     $ison = true;
                 }
             }
         }
         if ($ison) {
             $data['RANKERS'][$label] = 'checked="checked"';
             $active_rankers[] = $label;
         } else {
             $data['RANKERS'][$label] = '';
         }
     }
     $parent->pagingLogic($data, 'CLASSIFIERS', 'CLASSIFIERS', DEFAULT_ADMIN_PAGING_NUM / 5, array(), "", array('name' => 'class_label'));
     $seed_info['active_classifiers']['label'] = $active_classifiers;
     $seed_info['active_rankers']['label'] = $active_rankers;
     // Page rules may be stored either nested (['rule']['rule']) or flat
     // (['rule']); convert whichever form exists into display lines
     if (isset($seed_info['page_rules']['rule'])) {
         if (isset($seed_info['page_rules']['rule']['rule'])) {
             $data['page_rules'] = $parent->convertArrayLines($seed_info['page_rules']['rule']['rule']);
         } else {
             $data['page_rules'] = $parent->convertArrayLines($seed_info['page_rules']['rule']);
         }
     } else {
         $data['page_rules'] = "";
     }
     // Which tab (crawl time / search time / test options) is active
     $allowed_options = array('crawl_time', 'search_time', 'test_options');
     if (isset($_REQUEST['option_type']) && in_array($_REQUEST['option_type'], $allowed_options)) {
         $data['option_type'] = $_REQUEST['option_type'];
     } else {
         $data['option_type'] = 'crawl_time';
     }
     if ($data['option_type'] == 'crawl_time') {
         $data['crawl_time_active'] = "active";
         $data['search_time_active'] = "";
         $data['test_options_active'] = "";
         $data['SCRIPT'] .= "\nswitchTab('crawltimetab'," . "'searchtimetab', 'testoptionstab')\n";
     } else {
         if ($data['option_type'] == 'search_time') {
             $data['search_time_active'] = "active";
             $data['crawl_time_active'] = "";
             $data['test_options_active'] = "";
             $data['SCRIPT'] .= "\nswitchTab('searchtimetab'," . "'crawltimetab', 'testoptionstab')\n";
         } else {
             $data['search_time_active'] = "";
             $data['crawl_time_active'] = "";
             $data['test_options_active'] = "active";
             $data['SCRIPT'] .= "\nswitchTab('testoptionstab'," . "'crawltimetab', 'searchtimetab');\n";
         }
     }
     // Persist the (possibly updated) seed info
     $crawl_model->setSeedInfo($seed_info);
     if ($change == true && $data['option_type'] != 'test_options') {
         $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('crawl_component_page_options_updated') . "</h1>')";
     }
     /* Test-options tab: run the submitted TESTPAGE text through the full
        page-processing pipeline (processor + plugins + page rules + phrase
        extraction) against a fake site record, and report each stage */
     $test_processors = array("text/html" => "html", "text/asp" => "html", "text/xml" => "xml", "text/robot" => "robot", "application/xml" => "xml", "application/xhtml+xml" => "html", "application/rss+xml" => "rss", "application/atom+xml" => "rss", "text/csv" => "text", "text/gopher" => "gopher", "text/plain" => "text", "text/rtf" => "rtf", "text/tab-separated-values" => "text");
     $data['MIME_TYPES'] = array_keys($test_processors);
     $data['page_type'] = "text/html";
     if (isset($_REQUEST['page_type']) && in_array($_REQUEST['page_type'], $data['MIME_TYPES'])) {
         $data['page_type'] = $_REQUEST['page_type'];
     }
     $data['TESTPAGE'] = isset($_REQUEST['TESTPAGE']) ? $parent->clean($_REQUEST['TESTPAGE'], 'string') : "";
     if ($data['option_type'] == 'test_options' && $data['TESTPAGE'] != "") {
         $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('crawl_component_page_options_running_tests') . "</h1>')";
         // Synthetic site record standing in for a fetched page
         $site = array();
         $site[self::ENCODING] = "UTF-8";
         $site[self::URL] = "http://test-site.yioop.com/";
         $site[self::IP_ADDRESSES] = array("1.1.1.1");
         $site[self::HTTP_CODE] = 200;
         $site[self::MODIFIED] = date("U", time());
         $site[self::TIMESTAMP] = time();
         $site[self::TYPE] = "text/html";
         $site[self::HEADER] = "page options test extractor";
         $site[self::SERVER] = "unknown";
         $site[self::SERVER_VERSION] = "unknown";
         $site[self::OPERATING_SYSTEM] = "unknown";
         $site[self::LANG] = 'en';
         $site[self::JUST_METAS] = false;
         if (isset($_REQUEST['page_type']) && in_array($_REQUEST['page_type'], $data['MIME_TYPES'])) {
             $site[self::TYPE] = $_REQUEST['page_type'];
         }
         if ($site[self::TYPE] == 'text/html') {
             $site[self::ENCODING] = guessEncodingHtml($_REQUEST['TESTPAGE']);
         }
         $prefix_name = $test_processors[$site[self::TYPE]];
         $processor_name = ucfirst($prefix_name) . "Processor";
         // Collect plugin objects whose declared processors match the
         // chosen processor or one of its ancestors (up to PageProcessor)
         $plugin_processors = array();
         if (isset($seed_info['indexing_plugins']['plugins'])) {
             foreach ($seed_info['indexing_plugins']['plugins'] as $plugin) {
                 $plugin_name = $plugin . "Plugin";
                 $supported_processors = $plugin_name::getProcessors();
                 foreach ($supported_processors as $supported_processor) {
                     $parent_processor = $processor_name;
                     do {
                         if ($supported_processor == $parent_processor) {
                             $plugin_object = $parent->plugin(lcfirst($plugin));
                             if (method_exists($plugin_name, "loadConfiguration")) {
                                 $plugin_object->loadConfiguration();
                             }
                             $plugin_processors[] = $plugin_object;
                             break;
                         }
                     } while (($parent_processor = get_parent_class($parent_processor)) && $parent_processor != "PageProcessor");
                 }
             }
         }
         $page_processor = new $processor_name($plugin_processors, $seed_info["general"]["max_description_len"], $seed_info["general"]["summarizer_option"]);
         // Temporarily drop the custom error handler so processor
         // warnings surface normally during the test run
         restore_error_handler();
         $data["PAGE_RANGE_REQUEST"] = $seed_info["general"]["page_range_request"];
         $doc_info = $page_processor->handle(substr($_REQUEST['TESTPAGE'], 0, $data["PAGE_RANGE_REQUEST"]), $site[self::URL]);
         set_error_handler("yioop_error_handler");
         if (!$doc_info) {
             // Processing failed: show empty results rather than erroring
             $data["AFTER_PAGE_PROCESS"] = "";
             $data["AFTER_RULE_PROCESS"] = "";
             $data["EXTRACTED_WORDS"] = "";
             $data["EXTRACTED_META_WORDS"] = "";
             return $data;
         }
         if ($processor_name != "RobotProcessor" && !isset($doc_info[self::JUST_METAS])) {
             $doc_info[self::LINKS] = UrlParser::pruneLinks($doc_info[self::LINKS]);
         }
         foreach ($doc_info as $key => $value) {
             $site[$key] = $value;
         }
         if (isset($site[self::PAGE])) {
             unset($site[self::PAGE]);
         }
         if (isset($site[self::ROBOT_PATHS])) {
             $site[self::JUST_METAS] = true;
         }
         // Map CrawlConstants values back to their constant names so the
         // dumped arrays are human readable
         $reflect = new ReflectionClass("CrawlConstants");
         $crawl_constants = $reflect->getConstants();
         $crawl_keys = array_keys($crawl_constants);
         $crawl_values = array_values($crawl_constants);
         $inverse_constants = array_combine($crawl_values, $crawl_keys);
         $after_process = array();
         foreach ($site as $key => $value) {
             $out_key = isset($inverse_constants[$key]) ? $inverse_constants[$key] : $key;
             $after_process[$out_key] = $value;
         }
         $data["AFTER_PAGE_PROCESS"] = wordwrap($parent->clean(print_r($after_process, true), "string"), 75, "\n", true);
         // NOTE(review): assumes page_rules['rule'] is a flat array here;
         // the code near the top of this method also handles a nested
         // ['rule']['rule'] form — confirm this can't be nested at this point
         $rule_string = implode("\n", $seed_info['page_rules']['rule']);
         $rule_string = html_entity_decode($rule_string, ENT_QUOTES);
         $page_rule_parser = new PageRuleParser($rule_string);
         $page_rule_parser->executeRuleTrees($site);
         $after_process = array();
         foreach ($site as $key => $value) {
             $out_key = isset($inverse_constants[$key]) ? $inverse_constants[$key] : $key;
             $after_process[$out_key] = $value;
         }
         $data["AFTER_RULE_PROCESS"] = wordwrap($parent->clean(print_r($after_process, true), "string"), 75, "\n", true);
         $lang = NULL;
         if (isset($site[self::LANG])) {
             $lang = $site[self::LANG];
         }
         $meta_ids = PhraseParser::calculateMetas($site);
         if (!$site[self::JUST_METAS]) {
             // Build the phrase string the indexer would see and score it
             // for the safe-search meta word
             $host_words = UrlParser::getWordsIfHostUrl($site[self::URL]);
             $path_words = UrlParser::getWordsLastPathPartUrl($site[self::URL]);
             $phrase_string = $host_words . " " . $site[self::TITLE] . " " . $path_words . " " . $site[self::DESCRIPTION];
             if ($site[self::TITLE] != "") {
                 $lang = guessLocaleFromString($site[self::TITLE], $lang);
             } else {
                 $lang = guessLocaleFromString(substr($site[self::DESCRIPTION], 0, AD_HOC_TITLE_LENGTH), $lang);
             }
             $word_lists = PhraseParser::extractPhrasesInLists($phrase_string, $lang);
             $len = strlen($phrase_string);
             if (PhraseParser::computeSafeSearchScore($word_lists, $len) < 0.012) {
                 $meta_ids[] = "safe:true";
                 $safe = true;
             } else {
                 $meta_ids[] = "safe:false";
                 $safe = false;
             }
         }
         if (!isset($word_lists)) {
             $word_lists = array();
         }
         $data["EXTRACTED_WORDS"] = wordwrap($parent->clean(print_r($word_lists, true), "string"), 75, "\n", true);
         $data["EXTRACTED_META_WORDS"] = wordwrap($parent->clean(print_r($meta_ids, true), "string"), 75, "\n", true);
     }
     return $data;
 }
Example #5
0
    /**
     * Checks whether the same search threshold can classify p**n from
     * non-p**n sites. Sample were taken from a couple p**n sites,
     * sorted alphabetically by word and then some of the non sensitive words
     * were substituted so as to avoid copyright issues. For the safe tests
     * a similar process was done with the Wizard of Oz (now public domain)
     * and with some sexually related Wikipedia articles (Creative Commons SA).
     */
    function computeSafeSearchScoreTestCase()
    {
        // Helper: run a text block through phrase extraction and return
        // its safe-search score for the en-US locale.
        $safe_score = function ($text) {
            $word_lists = PhraseParser::extractPhrasesInLists($text, "en-US");
            $len = strlen($text);
            return PhraseParser::computeSafeSearchScore($word_lists, $len);
        };
        // Clearly safe text (Wizard of Oz vocabulary)
        $easy_safe = <<<EOD
a a a a a a a a a a a a all and and
and and and and and another any arose at aunt aunt be bed bed beds big
build building by by called carried case cellar cellar chairs contained
cookstove corner corner could crush cupboard cyclone dark dishes door
dorothy dorothy down dug em em enough except family farmer farmer's floor
floor for for four four from garret go great great ground had had henry
henry hole hole house in in in in in in in into it it its kansas ladder led
little lived looking lumber made many middle midst mighty miles no no of of
of one one one or path prairies reached roof room room rusty small small
small table the the the the the the the the the the the their there there
this those three to to to trap uncle uncle wagon walls was was was was was
were where which which whirlwinds who who wife with
EOD;
        $this->assertTrue($safe_score($easy_safe) < 0.012, "Easy Safe Test 1");
        // Clearly unsafe text
        $easy_unsafe = <<<EOD
a afraid all and anon baby big boobs but c**k crave dicking does
for from grown has how in is isnt knot lolita matts monster pussies ready
she she shew s**t teens their thom them thought they're tight to to to total
up use whether
EOD;
        $this->assertTrue($safe_score($easy_unsafe) > 0.012, "Easy Unsafe Test 1");
        // Unsafe text with a lot of innocuous filler words
        $harder_unsafe = <<<EOD
a a a a a adventure after all alotta amazing and and and and and
and and and and and around as ball ball big body boobies bounce boy
brunhilda came check check chilled cirque do enjoy ensued exercises
flap friends f*****g f*****g give going gorge got got grabbing had
had had has he hell her her horny i if in it it it it it jog junk
just know little little loved me mean melons melons my my of on out out
ploy precious kitties see she she she sought sizzle so so spent spicy
started stretch sucking swinging that that that the the the then things
those those those tit titties titty to to to togo today tramp truly
us was we we we what what when what wild with with with workout wrap yes
you
EOD;
        $this->assertTrue($safe_score($harder_unsafe) > 0.012, "Harder Unsafe Test 1");
        // Safe but sexually-themed encyclopedic text (biology of reproduction)
        $harder_safe_biology = <<<EOD
amino hog known a a a a an and and
and and are are as as asymmetry be biology both but can cases cells
combining combining contain deem distance each early evolved exist
female female for firm firm from function gametes gametes gametes gametes
genetic genentech has ideal in in in in information disinherit into intone
is isopod known large mole mole many mixing motile motile necessary
non nutrients of of of of offspring often optimized or organism organisms
over parents process reproduce reproduce result sex sex sexual
sexual small specialist specialized specific such that that the the the
the their to to traits traits transport two types variety while young
EOD;
        $this->assertTrue($safe_score($harder_safe_biology) < 0.012, "Harder Safe Test 1");
        // Safe but sexually-themed encyclopedic text (sexuality article)
        $harder_safe_article = <<<EOD
a a active adverb an an and are as as as attribute be
between by caught characterized daft describe describe desire desire deft
french female female females having homosexuality identify in is language
lesbian may moist verb object of of or or or others secondary refer relay
romantic same sex sexual trim the the the them to to to to to used
used who who wide women ward
EOD;
        $this->assertTrue($safe_score($harder_safe_article) < 0.012, "Harder Safe Test 2");
    }
Example #6
0
 /**
  * Builds an inverted index shard (word --> {docs it appears in})
  * for the current batch of SEEN_URLS_BEFORE_UPDATE_SCHEDULER many pages.
  * This inverted index shard is then merged by a queue_server
  * into the inverted index of the current generation of the crawl.
  * The complete inverted index for the whole crawl is built out of these
  * inverted indexes for generations. The point of computing a partial
  * inverted index on the fetcher is to reduce some of the computational
  * burden on the queue server. The resulting mini index computed by
  * buildMiniInvertedIndex() is stored in
  * $this->found_sites[self::INVERTED_INDEX]
  *
  */
 function buildMiniInvertedIndex()
 {
     $start_time = microtime();
     // pad string appended so packed user-rank scores align to 8-byte chunks
     $keypad = "";
     crawlLog("  Start building mini inverted index ...  Current Memory:" . memory_get_usage());
     $num_seen = count($this->found_sites[self::SEEN_URLS]);
     $this->num_seen_sites += $num_seen;
     /*
         for the fetcher we are not saving the index shards so
         name doesn't matter.
     */
     if (!isset($this->found_sites[self::INVERTED_INDEX][$this->current_server])) {
         $this->found_sites[self::INVERTED_INDEX][$this->current_server] = new IndexShard("fetcher_shard_{$this->current_server}");
     }
     for ($i = 0; $i < $num_seen; $i++) {
         $interim_time = microtime();
         $site = $this->found_sites[self::SEEN_URLS][$i];
         // skip pages without a hash or whose robot metas forbid indexing
         if (!isset($site[self::HASH]) || isset($site[self::ROBOT_METAS]) && in_array("JUSTFOLLOW", $site[self::ROBOT_METAS])) {
             continue;
         }
         $doc_rank = false;
         if ($this->crawl_type == self::ARCHIVE_CRAWL && isset($this->archive_iterator)) {
             // archive crawls supply a precomputed weight for each document
             $doc_rank = $this->archive_iterator->weight($site);
         }
         if (isset($site[self::TYPE]) && $site[self::TYPE] == "link") {
             /* link pseudo-documents carry their doc keys in HTTP_CODE and
                the destination url in TITLE (mirrors the link summaries
                built in the loop further below) */
             $is_link = true;
             $doc_keys = $site[self::HTTP_CODE];
             $site_url = $site[self::TITLE];
             $host = UrlParser::getHost($site_url);
             $link_parts = explode('|', $site[self::HASH]);
             if (isset($link_parts[5])) {
                 $link_origin = $link_parts[5];
             } else {
                 $link_origin = $site_url;
             }
             $meta_ids = PhraseParser::calculateLinkMetas($site_url, $host, $site[self::DESCRIPTION], $link_origin);
         } else {
             $is_link = false;
             // '|' is a field separator in link ids, so escape it in urls
             $site_url = str_replace('|', "%7C", $site[self::URL]);
             $host = UrlParser::getHost($site_url);
             $doc_keys = crawlHash($site_url, true) . $site[self::HASH] . "d" . substr(crawlHash($host . "/", true), 1);
             $meta_ids = PhraseParser::calculateMetas($site, $this->video_sources);
         }
         $word_lists = array();
         /*
            self::JUST_METAS check to avoid getting sitemaps in results for
            popular words
         */
         $lang = NULL;
         if (!isset($site[self::JUST_METAS])) {
             $host_words = UrlParser::getWordsIfHostUrl($site_url);
             $path_words = UrlParser::getWordsLastPathPartUrl($site_url);
             if ($is_link) {
                 $phrase_string = $site[self::DESCRIPTION];
             } else {
                 if (isset($site[self::LANG])) {
                     if (isset($this->programming_language_extension[$site[self::LANG]])) {
                         // source-code pages: index only the description text
                         $phrase_string = $site[self::DESCRIPTION];
                     } else {
                         $phrase_string = $host_words . " " . $site[self::TITLE] . " " . $path_words . " " . $site[self::DESCRIPTION];
                     }
                 } else {
                     $phrase_string = $host_words . " " . $site[self::TITLE] . " " . $path_words . " " . $site[self::DESCRIPTION];
                 }
             }
             if (isset($site[self::LANG])) {
                 $lang = guessLocaleFromString(mb_substr($site[self::DESCRIPTION], 0, AD_HOC_TITLE_LENGTH), $site[self::LANG]);
             }
             $word_lists = PhraseParser::extractPhrasesInLists($phrase_string, $lang);
             $len = strlen($phrase_string);
             // source code counts as safe; otherwise apply the crude classifier
             if (isset($this->programming_language_extension[$lang]) || PhraseParser::computeSafeSearchScore($word_lists, $len) < 0.012) {
                 $meta_ids[] = "safe:true";
                 $safe = true;
             } else {
                 $meta_ids[] = "safe:false";
                 $safe = false;
             }
         }
         if (!$is_link) {
             //store inlinks so they can be searched by
             $num_links = count($site[self::LINKS]);
             if ($num_links > 0) {
                 $link_rank = false;
                 if ($doc_rank !== false) {
                     // links inherit a slightly lower rank than their document
                     $link_rank = max($doc_rank - 1, 1);
                 }
             } else {
                 $link_rank = false;
             }
         }
         // NOTE(review): when $is_link is true $link_rank is not set in this
         // iteration, so the link-indexing loop below could see a stale value
         // from a previous iteration (or an unset variable) -- confirm that
         // link-type sites never reach that loop with self::LINKS set
         $num_queue_servers = count($this->queue_servers);
         if (isset($site[self::USER_RANKS]) && count($site[self::USER_RANKS]) > 0) {
             // append packed user-classifier scores onto the document key
             $score_keys = "";
             foreach ($site[self::USER_RANKS] as $label => $score) {
                 $score_keys .= packInt($score);
             }
             if (strlen($score_keys) % 8 != 0) {
                 $score_keys .= $keypad;
             }
             $doc_keys .= $score_keys;
         }
         $this->found_sites[self::INVERTED_INDEX][$this->current_server]->addDocumentWords($doc_keys, self::NEEDS_OFFSET_FLAG, $word_lists, $meta_ids, PhraseParser::$materialized_metas, true, $doc_rank);
         /*
            $this->no_process_links is set when doing things like
            mix recrawls. In this case links likely already will appear
            in what indexing, so don't index again. $site[self::JUST_META]
            is set when have a sitemap or robots.txt (this case set later).
            In this case link  info is not particularly useful for indexing
            and can greatly slow building inverted index.
         */
         if (!$this->no_process_links && !isset($site[self::JUST_METAS]) && !isset($this->programming_language_extension[$lang])) {
             foreach ($site[self::LINKS] as $url => $link_text) {
                 /* this mysterious check means won't index links from
                     robots.txt. Sitemap will still be in TO_CRAWL, but that's
                     done elsewhere
                    */
                 if (strlen($url) == 0 || is_numeric($url)) {
                     continue;
                 }
                 $link_host = UrlParser::getHost($url);
                 if (strlen($link_host) == 0) {
                     continue;
                 }
                 // links are partitioned among queue servers by link host
                 $part_num = calculatePartition($link_host, $num_queue_servers);
                 $summary = array();
                 if (!isset($this->found_sites[self::LINK_SEEN_URLS][$part_num])) {
                     $this->found_sites[self::LINK_SEEN_URLS][$part_num] = array();
                 }
                 // external link (host differs from source page) vs internal
                 $elink_flag = $link_host != $host ? true : false;
                 $link_text = strip_tags($link_text);
                 $ref = $elink_flag ? "eref" : "iref";
                 $url = str_replace('|', "%7C", $url);
                 $link_id = "url|" . $url . "|text|" . urlencode($link_text) . "|{$ref}|" . $site_url;
                 $elink_flag_string = $elink_flag ? "e" : "i";
                 $link_keys = crawlHash($url, true) . crawlHash($link_id, true) . $elink_flag_string . substr(crawlHash($host . "/", true), 1);
                 // build a pseudo-document summary representing the link itself
                 $summary[self::URL] = $link_id;
                 $summary[self::TITLE] = $url;
                 // stripping html to be on the safe side
                 $summary[self::DESCRIPTION] = $link_text;
                 $summary[self::TIMESTAMP] = $site[self::TIMESTAMP];
                 $summary[self::ENCODING] = $site[self::ENCODING];
                 $summary[self::HASH] = $link_id;
                 $summary[self::TYPE] = "link";
                 $summary[self::HTTP_CODE] = $link_keys;
                 $summary[self::LANG] = $lang;
                 $this->found_sites[self::LINK_SEEN_URLS][$part_num][] = $summary;
                 $link_word_lists = PhraseParser::extractPhrasesInLists($link_text, $lang);
                 $link_meta_ids = PhraseParser::calculateLinkMetas($url, $link_host, $link_text, $site_url);
                 if (!isset($this->found_sites[self::INVERTED_INDEX][$part_num])) {
                     $this->found_sites[self::INVERTED_INDEX][$part_num] = new IndexShard("fetcher_shard_{$part_num}");
                 }
                 $this->found_sites[self::INVERTED_INDEX][$part_num]->addDocumentWords($link_keys, self::NEEDS_OFFSET_FLAG, $link_word_lists, $link_meta_ids, PhraseParser::$materialized_metas, false, $link_rank);
             }
         }
         $iterim_elapse = changeInMicrotime($interim_time);
         if ($iterim_elapse > 5) {
             crawlLog("..Inverting " . $site[self::URL] . "...took > 5s.");
         }
         crawlTimeoutLog("..Still building inverted index. Have processed " . "%s of %s documents.\nLast url processed was %s.", $i, $num_seen, $site[self::URL]);
     }
     if ($this->crawl_type == self::ARCHIVE_CRAWL) {
         // signal that the scheduler should be re-checked after this batch
         $this->recrawl_check_scheduler = true;
     }
     crawlLog("  Build mini inverted index time " . changeInMicrotime($start_time));
 }
Example #7
0
 /**
  * Scores documents according to the lack or nonlack of sexually explicit
  * terms. Tries to work for several languages. Very crude classifier.
  *
  * @param array $word_lists word => pos_list tuples
  * @param int $len length of text being examined in characters
  * @return float score of how explicit document is; the product of the
  *     number of distinct unsafe terms and their total occurrence count,
  *     normalized by document length (so typically a small fraction,
  *     not an integer)
  */
 static function computeSafeSearchScore(&$word_lists, $len)
 {
     /* multi-language list of explicit terms kept as one newline-separated
        string so it can be run through the same phrase extractor that
        produced $word_lists */
     static $unsafe_phrase = "\nXXX sex s**t nymphomaniac MILF lolita lesbian sadomasochism\nbondage fisting erotic v****a Tribadism penis facial hermaphrodite\ntranssexual tranny bestiality snuff boob fondle tit\nblowjob lap c**k dick hardcore pr0n f**k pussy penetration ass\ncunt bisexual prostitution screw ass masturbation clitoris c**t suck w***e bitch\nbellaco cachar chingar shimar chinquechar chichar clavar coger culear hundir\njoder mámalo singar cojon carajo caray bicho concha chucha chocha\nchuchamadre coño panocha almeja culo fundillo fundío puta puto teta\nconnorito cul pute putain sexe pénis vulve foutre baiser sein nicher nichons\nputa sapatão foder ferro punheta vadia buceta bucetinha bunda caralho\nmentula cunnus verpa sōpiō pipinna\ncōleī cunnilingus futuō copulate cēveō crīsō\nscortor meretrīx futatrix minchia coglione cornuto culo inocchio frocio puttana\nvaffanculo fok hoer kut lul やりまん 打っ掛け\n 二形 ふたなりゴックン ゴックン\nショタコン 全裸 受け 裏本 пизда́ хуй еба́ть\nблядь елда́ гондо́н хер манда́ му́ди мудя\nпидора́с залу́па жо́па за́дница буфер\n雞巴 鷄巴 雞雞 鷄鷄 阴茎 陰莖 胯下物\n屌 吊 小鳥 龟头 龜頭 屄 鸡白 雞白 傻屄 老二\n那话儿 那話兒 屄 鸡白 雞白 阴道 陰道\n阴户 陰戶 大姨妈 淫蟲 老嫖 妓女 臭婊子 卖豆腐\n賣豆腐 咪咪 大豆腐 爆乳 肏操\n炒饭 炒飯 cặc lồn kaltak orospu siktir sıçmak amcık";
     // memoized term list extracted from $unsafe_phrase on first use
     static $unsafe_terms = array();
     if (count($word_lists) == 0) {
         return 0;
     }
     if ($unsafe_terms === array()) {
         $unsafe_lists = PhraseParser::extractPhrasesInLists($unsafe_phrase, "en-US");
         $unsafe_terms = array_keys($unsafe_lists);
     }
     $num_unsafe_terms = 0;
     $unsafe_count = 0;
     // intersect the document's terms with the unsafe term list, then
     // tally both distinct unsafe terms and their total occurrences
     $words = array_keys($word_lists);
     $unsafe_found = array_intersect($words, $unsafe_terms);
     foreach ($unsafe_found as $term) {
         $count = count($word_lists[$term]);
         if ($count > 0) {
             $unsafe_count += $count;
             $num_unsafe_terms++;
         }
     }
     // normalize by document length; +1 guards against division by zero
     $score = $num_unsafe_terms * $unsafe_count / ($len + 1);
     return $score;
 }
Example #8
0
 /**
  * Gets doc summaries of documents containing given words and meeting the
  * additional provided criteria
  * @param array $word_structs an array of word_structs. Here a word_struct
  *     is an associative array with at least the following fields
  *     KEYS -- an array of word keys
  *     QUOTE_POSITIONS -- an array of positions of words that appeared in
  *         quotes (so need to be matched exactly)
  *     DISALLOW_PHRASES -- an array of words the document must not contain
  *     WEIGHT -- a weight to multiple scores returned from this iterator by
  *     INDEX_NAME -- an index timestamp to get results from
  * @param int $limit number of first document in order to return
  * @param int $num number of documents to return summaries of
  * @param array& $filter an array of hashes of domains to filter from
  *     results
  * @param bool $use_cache_if_allowed if true and USE_CACHE is true then
  *     an attempt will be made to look up the results in either
  *     the file cache or memcache. Otherwise, items will be recomputed
  *     and then potentially restored in cache
  * @param int $raw ($raw == 0) normal grouping, ($raw > 0)
  *     no grouping done on data. if ($raw == 1) no lookups of summaries
  *     done
  * @param array $queue_servers a list of urls of yioop machines which might
  *     be used during lookup
  * @param string $original_query if set, the original query that corresponds
  *     to $word_structs
  * @param string $save_timestamp_name if this timestamp is not empty, then
  *     save iterate position, so can resume on future queries that make
  *     use of the timestamp. If used then $limit ignored and get next $num
  *     docs after $save_timestamp 's previous iterate position.
  * @param bool $limit_news if true the number of media:news items to
  *     allow in search results is limited to WordIterator::LIMIT_NEWS_COUNT
  *
  * @return array document summaries
  */
 function getSummariesByHash($word_structs, $limit, $num, &$filter, $use_cache_if_allowed = true, $raw = 0, $queue_servers = array(), $original_query = "", $save_timestamp_name = "", $limit_news = true)
 {
     global $CACHE;
     // indentation strings used when building QUERY_STATISTICS html output
     $indent = "&nbsp;&nbsp;";
     $in2 = $indent . $indent;
     $in3 = $in2 . $indent;
     $in4 = $in2 . $in2;
     if (QUERY_STATISTICS) {
         $lookup_time = microtime();
     }
     $use_proximity = false;
     $time = time();
     // proximity scoring is enabled for multi-word / multi-struct queries
     if (count($word_structs) > 1 || isset($word_structs[0]["KEYS"]) && count($word_structs[0]["KEYS"]) > 1 || $word_structs == array() && substr_count($original_query, " ") > 1) {
         $use_proximity = true;
     }
     if (!isset($filter['time'])) {
         $filter['time'] = 0;
     }
     $filter_time = $filter['time'];
     unset($filter['time']);
     //iterators don't expect time field
     $pages = array();
     $generation = 0;
     // round the retrieval window to NUM_CACHE_PAGES-sized boundaries so
     // cached result slices line up across queries
     $to_retrieve = ceil(($limit + $num) / self::NUM_CACHE_PAGES) * self::NUM_CACHE_PAGES;
     $start_slice = floor($limit / self::NUM_CACHE_PAGES) * self::NUM_CACHE_PAGES;
     if ($save_timestamp_name != "") {
         // resumable queries (crawl mixes) ignore $limit and page via the
         // iterate position saved under the timestamp instead
         $to_retrieve = $num;
         $limit = 0;
         $start_slice = 0;
     }
     if (USE_CACHE && $save_timestamp_name == "") {
         // cache key covers raw mode, word structs, query text, index and slice
         $mem_tmp = serialize($raw) . serialize($word_structs) . $original_query . $this->index_name;
         $summary_hash = crawlHash($mem_tmp . ":" . $limit . ":" . $num);
         if ($use_cache_if_allowed) {
             $cache_success = true;
             $results = $CACHE->get($summary_hash);
             if (!isset($results['TIME']) || $filter_time > $results['TIME']) {
                 //if filter has changed since cached, then invalidate cache
                 $results = false;
             }
             if (isset($results['TIME'])) {
                 $cached_time = $time - $results['TIME'];
             } else {
                 $cached_time = $time;
             }
             // hard expiry regardless of index state
             if ($cached_time > MAX_QUERY_CACHE_TIME) {
                 $results = false;
             }
             if (isset($results['PAGES'])) {
                 $close_prefix = WORK_DIRECTORY . "/schedules/" . self::index_closed_name;
                 $has_changeable_results = false;
                 $seen_times = array();
                 // check each distinct crawl a cached page came from; if its
                 // index is not marked closed the results may still change
                 foreach ($results['PAGES'] as $page) {
                     if (!isset($page[self::CRAWL_TIME]) || in_array($page[self::CRAWL_TIME], $seen_times)) {
                         continue;
                     }
                     $seen_times[] = $page[self::CRAWL_TIME];
                     $current_closed = $close_prefix . $page[self::CRAWL_TIME] . ".txt";
                     if (!file_exists($current_closed)) {
                         //either feed result or from active crawl
                         $has_changeable_results = true;
                         break;
                     }
                 }
                 if ($has_changeable_results) {
                     // changeable results get a shorter cache lifetime
                     if ($cached_time > MIN_QUERY_CACHE_TIME) {
                         $results = false;
                     }
                 }
             }
             if (QUERY_STATISTICS) {
                 $this->query_info['QUERY'] .= "{$in2}<b>Cache Lookup Time</b>: " . changeInMicrotime($lookup_time) . "<br />";
             }
             if ($results !== false) {
                 return $results;
             }
         }
     }
     $old_to_retrieve = $to_retrieve;
     // $to_retrieve may be adjusted by the iterator (passed by value here,
     // but compared against $old_to_retrieve after retrieval below)
     $query_iterator = $this->getQueryIterator($word_structs, $filter, $raw, $to_retrieve, $queue_servers, $original_query, $save_timestamp_name, $limit_news);
     $num_retrieved = 0;
     $pages = array();
     if (is_object($query_iterator)) {
         // pull batches of matching docs until enough are accumulated
         while ($num_retrieved < $to_retrieve && is_array($next_docs = $query_iterator->nextDocsWithWord())) {
             $pages += $next_docs;
             $num_retrieved = count($pages);
         }
     }
     if ($save_timestamp_name != "" && ($queue_servers == array() || $this->isSingleLocalhost($queue_servers))) {
         // used for archive crawls of crawl mixes
         $save_file = CRAWL_DIR . '/schedules/' . self::save_point . $save_timestamp_name . ".txt";
         $iterators = $query_iterator->save_iterators;
         $cnt_iterators = count($iterators);
         $save_point = array();
         // persist each sub-iterator's position so the next query resumes here
         for ($i = 0; $i < $cnt_iterators; $i++) {
             $save_point[$i] = $iterators[$i]->currentGenDocOffsetWithWord();
         }
         $results["SAVE_POINT"] = $save_point;
         file_put_contents($save_file, serialize($save_point));
         $this->db->setWorldPermissionsRecursive($save_file);
     }
     $pages = array_values($pages);
     $result_count = count($pages);
     $sort_time = 0;
     if ($raw == 0) {
         // initialize scores
         $sort_start = microtime();
         $max_user_ranks = 0;
         for ($i = 0; $i < $result_count; $i++) {
             $pages[$i]["OUT_SCORE"] = 0;
             if (isset($pages[$i][self::USER_RANKS])) {
                 $j = count($pages[$i][self::USER_RANKS]);
                 if ($max_user_ranks < $j) {
                     $max_user_ranks = $j;
                 }
             }
         }
         if ($max_user_ranks > 0) {
             // give every page a USCOREj field (0 when absent) so user-rank
             // columns can be sorted uniformly below
             for ($i = 0; $i < $result_count; $i++) {
                 for ($j = 0; $j < $max_user_ranks; $j++) {
                     if (isset($pages[$i][self::USER_RANKS][$j])) {
                         $pages[$i]["USCORE{$j}"] = $pages[$i][self::USER_RANKS][$j];
                     } else {
                         $pages[$i]["USCORE{$j}"] = 0;
                     }
                 }
             }
         }
         $subscore_fields = array(self::DOC_RANK, self::RELEVANCE);
         if ($use_proximity) {
             $subscore_fields[] = self::PROXIMITY;
         }
         if ($max_user_ranks > 0) {
             for ($j = 0; $j < $max_user_ranks; $j++) {
                 $subscore_fields[] = "USCORE{$j}";
             }
         }
         $num_fields = count($subscore_fields);
         // Compute Reciprocal Rank Fusion Score
         $alpha = 600 / $num_fields;
         if (isset($pages[0])) {
             foreach ($subscore_fields as $field) {
                 // NOTE(review): this call appears to select the sort field
                 // used by the usort comparator on the next line -- confirm
                 // orderCallback's side-effect contract
                 orderCallback($pages[0], $pages[0], $field);
                 usort($pages, "orderCallback");
                 $score = 0;
                 for ($i = 0; $i < $result_count; $i++) {
                     // ties share a rank; rank only advances on value change
                     if ($i > 0) {
                         if ($pages[$i - 1][$field] != $pages[$i][$field]) {
                             $score++;
                         }
                     }
                     $pages[$i]["OUT_SCORE"] += $alpha / (59 + $score);
                 }
             }
             orderCallback($pages[0], $pages[0], "OUT_SCORE");
         }
         usort($pages, "orderCallback");
         if ($use_proximity) {
             for ($i = 0; $i < $result_count; $i++) {
                 $pages[$i][self::SCORE] = $pages[$i]["OUT_SCORE"];
             }
         } else {
             // without proximity scoring every page gets a neutral proximity
             for ($i = 0; $i < $result_count; $i++) {
                 $pages[$i][self::PROXIMITY] = 1;
                 $pages[$i][self::SCORE] = $pages[$i]["OUT_SCORE"];
             }
         }
         $sort_time = changeInMicrotime($sort_start);
     }
     if ($num_retrieved < $to_retrieve) {
         $results['TOTAL_ROWS'] = $num_retrieved;
     } else {
         $results['TOTAL_ROWS'] = $query_iterator->num_docs;
         //this is only an approximation
     }
     if ($raw == 1 && $save_timestamp_name == "") {
         // raw mode: return the requested slice without summary lookups
         $pages = array_slice($pages, $start_slice);
         $pages = array_slice($pages, $limit - $start_slice, $num);
         $results['PAGES'] =& $pages;
         if ($old_to_retrieve != $to_retrieve) {
             $results['HARD_QUERY'] = $old_to_retrieve;
         }
         return $results;
     }
     if (QUERY_STATISTICS) {
         $this->query_info['QUERY'] .= "{$in2}<b>Lookup Offsets Time</b>: " . changeInMicrotime($lookup_time) . "<br />";
         $machine_times = AnalyticsManager::get("MACHINE_TIMES");
         if ($machine_times) {
             $this->query_info['QUERY'] .= "{$in3}<i>Machine Sub-Times</i>:<br />" . $machine_times . "<br />";
         }
         $net_times = AnalyticsManager::get("NET_TIMES");
         $max_machine_times = AnalyticsManager::get("MAX_MACHINE_TIMES");
         if ($net_times && $max_machine_times) {
             $this->query_info['QUERY'] .= "{$in3}<i>Network Overhead Sub-Time</i>: " . ($net_times - $max_machine_times) . "<br />";
         }
         if ($sort_time) {
             $this->query_info['QUERY'] .= "{$in3}<i>Merge-Rank Sub-Time</i>: " . $sort_time . "<br />";
         }
         $summaries_time = microtime();
     }
     $get_pages = array_slice($pages, $limit, $num);
     $to_get_count = count($get_pages);
     $groups_with_docs = false;
     if (preg_match("/\\bsite:doc\\b/", $original_query)) {
         $groups_with_docs = true;
     }
     $out_pages = array();
     $cur_limit = $limit;
     // keep fetching further slices until enough summaries resolve (some
     // offsets may fail to produce a summary)
     while (count($out_pages) < $to_get_count && $get_pages) {
         $out_pages = array_merge($out_pages, $this->getSummariesFromOffsets($get_pages, $queue_servers, $raw, $groups_with_docs));
         if ($save_timestamp_name != "") {
             break;
         }
         $cur_limit += $num;
         $get_pages = array_slice($pages, $cur_limit, $num);
     }
     $out_pages = array_slice($out_pages, 0, $num);
     if (QUERY_STATISTICS) {
         $summary_times_string = AnalyticsManager::get("SUMMARY_TIMES");
         if ($summary_times_string) {
             $round_summary_times = unserialize($summary_times_string);
             $summary_delta_time = changeInMicrotime($summaries_time);
             $summary_time_info = "{$summary_delta_time}<br /> {$in4}";
             $sum_max_time = 0;
             // per round, the slowest machine bounds the round's wall time
             foreach ($round_summary_times as $summary_times) {
                 $i = 0;
                 $max_time = 0;
                 foreach ($summary_times as $summary_time) {
                     $summary_time_info .= "ID_{$i}: " . $summary_time . "{$indent}";
                     $max_time = $summary_time > $max_time ? $summary_time : $max_time;
                     $i++;
                 }
                 $sum_max_time += $max_time;
             }
             $net_overhead = $summary_delta_time - $sum_max_time;
             $summary_time_info .= "<br />{$in3}<i>Network Overhead Sub-Time</i>: " . $net_overhead;
         } else {
             $summary_time_info = changeInMicrotime($summaries_time);
         }
         $this->query_info['QUERY'] .= "{$in2}<b>Get Summaries Time</b>: " . $summary_time_info . "<br />";
     }
     $results['PAGES'] =& $out_pages;
     $results['TIME'] = time();
     $lang = guessLocaleFromString($original_query);
     $tokenizer = PhraseParser::getTokenizer($lang);
     //only use tokenizer if no meta word or disjuncts in query
     if (!preg_match('/(\\||\\:)/u', $original_query) && $tokenizer && method_exists($tokenizer, "scoredThesaurusMatches") && method_exists($tokenizer, "tagPartsOfSpeechPhrase") && isset($tokenizer->use_thesaurus)) {
         $results = $this->sortByThesaurusScore($results, $original_query, $lang);
     }
     if (USE_CACHE && $save_timestamp_name == "") {
         $CACHE->set($summary_hash, $results);
     }
     return $results;
 }
Example #9
0
 /**
  * Returns the number of documents in an index that a phrase occurs in.
  * If it occurs in more than threshold documents then cut off search.
  *
  * @param string $phrase to look up in index
  * @param int $threshold once count in posting list for any word
  *     reaches to threshold then return the number
  * @param string $index_name selected index for search engine
  * @param string $lang locale tag for the query
  * @return int number of documents phrase occurs in
  */
 static function numDocsIndex($phrase, $threshold, $index_name, $lang)
 {
     // bail out before doing any canonicalization/stemming work when no
     // index is selected (the original did this work and then discarded it)
     if ($index_name == NULL) {
         return 0;
     }
     PhraseParser::canonicalizePunctuatedTerms($phrase, $lang);
     $terms = PhraseParser::stemCharGramSegment($phrase, $lang);
     // cap the number of terms considered, mirroring normal query handling
     if (count($terms) > MAX_QUERY_TERMS) {
         $terms = array_slice($terms, 0, MAX_QUERY_TERMS);
     }
     $whole_phrase = implode(" ", $terms);
     return IndexManager::numDocsTerm($whole_phrase, $index_name, $threshold);
 }
Example #10
0
 /**
  * Used to recompute both the index shards and the dictionary
  * of an index archive. The first step involves re-extracting the
  * word into an inverted index from the summaries' web_archives.
  * Then a reindex is done.
  *
  * @param string $archive_path file path to a IndexArchiveBundle
  */
 function rebuildIndexArchive($archive_path)
 {
     $archive_type = $this->getArchiveKind($archive_path);
     if ($archive_type != "IndexArchiveBundle") {
         $this->badFormatMessageAndExit($archive_path);
     }
     $info = $archive_type::getArchiveInfo($archive_path);
     $tmp = unserialize($info["DESCRIPTION"]);
     $video_sources = $tmp[self::VIDEO_SOURCES];
     // number of shard generations = active generation index + 1
     $generation_info = unserialize(file_get_contents("{$archive_path}/generation.txt"));
     $num_generations = $generation_info['ACTIVE'] + 1;
     $archive = new WebArchiveBundle($archive_path . "/summaries");
     $seen = 0;
     $generation = 0;
     // pad string appended so packed user-rank scores align to 8-byte chunks
     $keypad = "";
     while ($generation < $num_generations) {
         $partition = $archive->getPartition($generation, false);
         $shard_name = $archive_path . "/posting_doc_shards/index{$generation}";
         crawlLog("Processing partition {$generation}");
         if (file_exists($shard_name)) {
             crawlLog("..Unlinking old shard {$generation}");
             @unlink($shard_name);
         }
         // rebuild this generation's shard from scratch
         $shard = new IndexShard($shard_name, $generation, NUM_DOCS_PER_GENERATION, true);
         $seen_partition = 0;
         while ($seen_partition < $partition->count) {
             // process summaries in batches of at most 8000
             $num_to_get = min($partition->count - $seen_partition, 8000);
             $offset = $partition->iterator_pos;
             $objects = $partition->nextObjects($num_to_get);
             $cnt = 0;
             foreach ($objects as $object) {
                 $cnt++;
                 $site = $object[1];
                 if (isset($site[self::TYPE]) && $site[self::TYPE] == "link") {
                     /* link pseudo-documents carry their doc keys in
                        HTTP_CODE and the destination url in TITLE */
                     $is_link = true;
                     $doc_keys = $site[self::HTTP_CODE];
                     $site_url = $site[self::TITLE];
                     $host = UrlParser::getHost($site_url);
                     $link_parts = explode('|', $site[self::HASH]);
                     if (isset($link_parts[5])) {
                         $link_origin = $link_parts[5];
                     } else {
                         $link_origin = $site_url;
                     }
                     $meta_ids = PhraseParser::calculateLinkMetas($site_url, $host, $site[self::DESCRIPTION], $link_origin);
                     $link_to = "LINK TO:";
                 } else {
                     $is_link = false;
                     // '|' is a field separator in link ids, so escape it
                     $site_url = str_replace('|', "%7C", $site[self::URL]);
                     $host = UrlParser::getHost($site_url);
                     $doc_keys = crawlHash($site_url, true) . $site[self::HASH] . "d" . substr(crawlHash($host . "/", true), 1);
                     $meta_ids = PhraseParser::calculateMetas($site, $video_sources);
                     $link_to = "";
                 }
                 $so_far_cnt = $seen_partition + $cnt;
                 $time_out_message = "..still processing {$so_far_cnt} " . "of {$partition->count} in partition {$generation}." . "\n..Last processed was: " . ($seen + 1) . ". {$link_to}{$site_url}. ";
                 crawlTimeoutLog($time_out_message);
                 $seen++;
                 $word_lists = array();
                 /*
                    self::JUST_METAS check to avoid getting sitemaps in
                    results for popular words
                 */
                 $lang = NULL;
                 if (!isset($site[self::JUST_METAS])) {
                     $host_words = UrlParser::getWordsIfHostUrl($site_url);
                     $path_words = UrlParser::getWordsLastPathPartUrl($site_url);
                     if ($is_link) {
                         $phrase_string = $site[self::DESCRIPTION];
                     } else {
                         $phrase_string = $host_words . " " . $site[self::TITLE] . " " . $path_words . " " . $site[self::DESCRIPTION];
                     }
                     if (isset($site[self::LANG])) {
                         $lang = guessLocaleFromString(mb_substr($site[self::DESCRIPTION], 0, AD_HOC_TITLE_LENGTH), $site[self::LANG]);
                     }
                     $word_lists = PhraseParser::extractPhrasesInLists($phrase_string, $lang);
                     $len = strlen($phrase_string);
                     // same 0.012 safe-search threshold used when fetching
                     if (PhraseParser::computeSafeSearchScore($word_lists, $len) < 0.012) {
                         $meta_ids[] = "safe:true";
                         $safe = true;
                     } else {
                         $meta_ids[] = "safe:false";
                         $safe = false;
                     }
                 }
                 if (isset($site[self::USER_RANKS]) && count($site[self::USER_RANKS]) > 0) {
                     // append packed user-classifier scores onto the doc key
                     $score_keys = "";
                     foreach ($site[self::USER_RANKS] as $label => $score) {
                         $score_keys .= packInt($score);
                     }
                     if (strlen($score_keys) % 8 != 0) {
                         $score_keys .= $keypad;
                     }
                     $doc_keys .= $score_keys;
                 }
                 $shard->addDocumentWords($doc_keys, $offset, $word_lists, $meta_ids, PhraseParser::$materialized_metas, true, false);
                 // NOTE(review): $offset for the *next* document is taken from
                 // the current object's first element -- confirm nextObjects
                 // returns (next_offset, summary) pairs
                 $offset = $object[0];
             }
             $seen_partition += $num_to_get;
         }
         $shard->save(false, true);
         $generation++;
     }
     $this->reindexIndexArchive($archive_path);
 }
Example #11
0
 /**
  * Generates a centroid with which every sentence is ranked with cosine
  * ranking method and also generates a word cloud.
  * @param string $doc complete raw page to generate the summary from.
  * @param string $lang language of the page to decide which stop words to
  *     call proper tokenizer.php of the specified language.
  *
  * @return array array of summary and word cloud
  */
 /**
  * Computes an extractive summary and a word cloud for a document using a
  * centroid-based scoring scheme: the highest-weighted terms (document
  * count times log inverse sentence-frequency) form a "centroid" vector,
  * and sentences are ranked by a cosine-style similarity to that vector.
  *
  * @param string $doc document to summarize
  * @param string $lang locale tag used to pick the tokenizer and to
  *     segment sentences into terms
  * @return array pair (summary string, word cloud array of terms);
  *     array("", "") when no usable terms are found
  */
 static function getCentroidSummary($doc, $lang)
 {
     $doc = self::pageProcessing($doc);
     /* Format the document to remove characters other than periods and
           alphanumerics.
        */
     $formatted_doc = self::formatDoc($doc);
     $stop_obj = PhraseParser::getTokenizer($lang);
     /* Splitting into sentences */
     $out_sentences = self::getSentences($doc);
     $n = count($out_sentences);
     $sentences = array();
     // Strip stopwords per sentence when this locale's tokenizer supports
     // it; otherwise score the sentences as-is
     if ($stop_obj && method_exists($stop_obj, "stopwordsRemover")) {
         for ($i = 0; $i < $n; $i++) {
             $sentences[$i] = $stop_obj->stopwordsRemover(self::formatDoc($out_sentences[$i]));
         }
     } else {
         $sentences = $out_sentences;
     }
     /*  Splitting into terms */
     $terms = array();
     foreach ($sentences as $sentence) {
         $terms = array_merge($terms, PhraseParser::segmentSegment($sentence, $lang));
     }
     $terms = array_filter($terms);
     // Keep only the MAX_DISTINCT_TERMS most frequent distinct terms
     $terms_counts = array_count_values($terms);
     arsort($terms_counts);
     $terms_counts = array_slice($terms_counts, 0, self::MAX_DISTINCT_TERMS);
     $terms = array_unique(array_keys($terms_counts));
     $t = count($terms);
     if ($t == 0) {
         return array("", "");
     }
     /* Initialize Nk array(Number of sentences the term occurs) */
     $nk = array();
     $nk = array_fill(0, $t, 0);
     $nt = array();
     /* Count TF for each word */
     for ($i = 0; $i < $n; $i++) {
         for ($j = 0; $j < $t; $j++) {
             if (strpos($sentences[$i], $terms[$j]) !== false) {
                 $nk[$j]++;
             }
         }
     }
     /* Calculate weights of each term for every sentence */
     // idf[k] = log(n / nk[k]); 0 for a term found in no sentence
     $w = array();
     $idf = array();
     $idf_temp = 0;
     for ($k = 0; $k < $t; $k++) {
         if ($nk[$k] == 0) {
             $idf_temp = 0;
             $tmp = 0;
         } else {
             $idf_temp = $n / $nk[$k];
             $tmp = log($idf_temp);
         }
         $idf[$k] = $tmp;
     }
     /* Count TF for finding centroid */
     $wc = array();
     $max_nt = -1;
     // \b word boundaries are dropped for CJK locales, where terms are
     // not whitespace-delimited
     $b = "\\b";
     if (in_array($lang, array("zh-CN", "ja", "ko"))) {
         $b = "";
     }
     for ($j = 0; $j < $t; $j++) {
         // NOTE(review): terms are not preg_quote()'d, so a term with
         // regex metacharacters makes preg_match_all fail (error
         // suppressed by @) and the term scores 0 -- confirm intended
         $nt = @preg_match_all("/{$b}{$terms[$j]}{$b}/", $formatted_doc, $matches);
         //$matches included for backwards compatibility
         $wc[$j] = $nt * $idf[$j];
         if (is_nan($wc[$j]) || is_infinite($wc[$j])) {
             $wc[$j] = 0;
         }
     }
     /* Calculate centroid */
     // Centroid = the CENTROID_COMPONENTS highest-weighted terms
     // (keys preserved so they still index into $terms)
     arsort($wc);
     $centroid = array_slice($wc, 0, self::CENTROID_COMPONENTS, true);
     /* Initializing centroid weight array by 0 */
     $wc = array_fill(0, $t, 0);
     /* Word cloud */
     $i = 0;
     $word_cloud = array();
     // Restore centroid weights into $wc (zero elsewhere) and collect up
     // to WORD_CLOUD_LEN centroid terms for the word cloud
     foreach ($centroid as $key => $value) {
         $wc[$key] = $value;
         if ($i < self::WORD_CLOUD_LEN) {
             $word_cloud[$i] = $terms[$key];
         }
         $i++;
     }
     if (strlen($formatted_doc) < PageProcessor::$max_description_len || $n == 1) {
         //if input short only use above to get a word cloud
         $formatted_doc = substr($formatted_doc, 0, PageProcessor::$max_description_len);
         return array($formatted_doc, $word_cloud);
     }
     ksort($wc);
     /* Calculate similarity measure between centroid and each sentence */
     // sim[i] = a / (|sentence_i| * |centroid|), a cosine-like score;
     // sentence term weights are idf * (1 + log tf)
     $sim = array();
     for ($i = 0; $i < $n; $i++) {
         $a = $b1 = $b2 = $c1 = $c2 = $d = 0;
         for ($k = 0; $k < $t; $k++) {
             $wck = $wc[$k];
             $idfk = $idf[$k];
             $tmp = substr_count($sentences[$i], $terms[$k]);
             $wik = $tmp > 0 ? $idfk * (1 + log($tmp)) : 0;
             $a += $wik * $wck * $idfk;
             $b1 += $wik * $wik;
             $c1 += $wck * $wck;
         }
         $b2 = sqrt($b1);
         $c2 = sqrt($c1);
         $d = $b2 * $c2;
         if ($d == 0) {
             $sim[$i] = 0;
         } else {
             $sim[$i] = $a / $d;
         }
     }
     arsort($sim);
     /* Getting how many sentences should be there in summary */
     $top = self::summarySentenceCount($out_sentences, $sim);
     $sum_array = array();
     // NOTE(review): slices $top - 1 sentences, not $top -- confirm the
     // off-by-one is intended
     $sum_array = array_slice($sim, 0, $top - 1, true);
     ksort($sum_array);
     /* Printing Summary */
     $summary = '';
     foreach ($sum_array as $key => $value) {
         $summary .= "{$out_sentences[$key]}" . ". ";
     }
     /* Summary of text summarization */
     return array($summary, $word_cloud);
 }
Example #12
0
 /**
  * Given a page summary, extracts its words and ranks them by how
  * selective they are for that page. A word's score is the ratio of its
  * count in the page to its count across the whole index; a ratio of
  * exactly 1 (word occurs in no other document) is zeroed out, since such
  * words cannot lead to related documents.
  *
  * @param string $crawl_item a page summary
  * @param int $num number of key phrases to return
  * @param int $crawl_time the timestamp of an index to use; if 0 the
  *     current default index is used
  * @return array the $num most selective key phrases
  */
 function getTopPhrases($crawl_item, $num, $crawl_time = 0)
 {
     $crawl = $this->model("crawl");
     $servers = $this->model("machine")->getQueueServerUrls();
     if ($crawl_time == 0) {
         $crawl_time = $crawl->getCurrentIndexDatabaseName();
     }
     // Point both models at the chosen index
     $this->model("phrase")->index_name = $crawl_time;
     $crawl->index_name = $crawl_time;
     $word_string = PhraseParser::extractWordStringPageSummary($crawl_item);
     if (!isset($crawl_item[self::LANG])) {
         $crawl_item[self::LANG] = DEFAULT_LOCALE;
     }
     $page_counts = PhraseParser::extractPhrasesAndCount($word_string, $crawl_item[self::LANG]);
     $global_counts = $crawl->countWords(array_keys($page_counts), $servers);
     $ratios = array();
     foreach ($page_counts as $term => $page_count) {
         if (isset($global_counts[$term]) && $global_counts[$term] > 0) {
             $ratios[$term] = $page_count / $global_counts[$term];
         } else {
             $ratios[$term] = 0;
         }
         /* discard cases where word only occurs in one doc as want
            to find related relevant documents */
         if ($ratios[$term] == 1) {
             $ratios[$term] = 0;
         }
     }
     uasort($ratios, "greaterThan");
     return array_slice(array_keys($ratios), 0, $num);
 }
Example #13
0
 /**
  * Copies all feed items newer than $age seconds into a fresh index
  * shard, deletes FEED_ITEM rows older than $age, then atomically swaps
  * the new shard in as the active feed index.
  *
  * NOTE(review): the original doc promised returning false when the job
  * would exceed max_execution_time/2 so it could be rescheduled, but no
  * elapsed-time check exists in this body -- confirm whether that bail-out
  * still needs implementing.
  *
  * @param int $age how many seconds old records should be deleted
  * @return bool true when the rebuild ran to completion; false if the
  *     prune shard could not be created or no news sources were found
  */
 function rebuildFeedShard($age)
 {
     $time = time();
     $feed_shard_name = WORK_DIRECTORY . "/feeds/index";
     $prune_shard_name = WORK_DIRECTORY . "/feeds/prune_index";
     $prune_shard = new IndexShard($prune_shard_name);
     $too_old = $time - $age;
     if (!$prune_shard) {
         return false;
     }
     $pre_feeds = $this->getNewsSources();
     if (!$pre_feeds) {
         return false;
     }
     // Index sources by NAME so each item row can look up its language
     $feeds = array();
     foreach ($pre_feeds as $pre_feed) {
         if (!isset($pre_feed['NAME'])) {
             continue;
         }
         $feeds[$pre_feed['NAME']] = $pre_feed;
     }
     $db = $this->db;
     // we now rebuild the inverted index with the remaining items
     $sql = "SELECT * FROM FEED_ITEM " . "WHERE PUBDATE >= ? " . "ORDER BY PUBDATE DESC";
     $result = $db->execute($sql, array($too_old));
     if ($result) {
         crawlLog("..still deleting. Making new index of non-pruned items.");
         $i = 0;
         while ($item = $db->fetchArray($result)) {
             crawlTimeoutLog("..have added %s non-pruned items to index.", $i);
             $i++;
             if (!isset($item['SOURCE_NAME'])) {
                 continue;
             }
             $source_name = $item['SOURCE_NAME'];
             if (isset($feeds[$source_name])) {
                 $lang = $feeds[$source_name]['LANGUAGE'];
             } else {
                 $lang = "";
             }
             $phrase_string = $item["TITLE"] . " " . $item["DESCRIPTION"];
             $word_lists = PhraseParser::extractPhrasesInLists($phrase_string, $lang);
             $raw_guid = unbase64Hash($item["GUID"]);
             // doc key = hash(link) . guid . "d" . hash(host)-derived suffix
             $doc_keys = crawlHash($item["LINK"], true) . $raw_guid . "d" . substr(crawlHash(UrlParser::getHost($item["LINK"]) . "/", true), 1);
             $meta_ids = $this->calculateMetas($lang, $item['PUBDATE'], $source_name, $item["GUID"]);
             $prune_shard->addDocumentWords($doc_keys, $item['PUBDATE'], $word_lists, $meta_ids, PhraseParser::$materialized_metas, true, false);
         }
     }
     // Swap the freshly built shard in place of the old feed index;
     // errors are suppressed as the rename is best-effort
     $prune_shard->save();
     @chmod($prune_shard_name, 0777);
     @chmod($feed_shard_name, 0777);
     @rename($prune_shard_name, $feed_shard_name);
     @chmod($feed_shard_name, 0777);
     // Finally prune the database rows older than the cutoff
     $sql = "DELETE FROM FEED_ITEM WHERE PUBDATE < ?";
     $db->execute($sql, array($too_old));
     /* Bug fix: the contract is to return a bool, but the original fell
        off the end (returning null) and only ever set an unused
        $completed flag; report successful completion explicitly */
     return true;
 }
Example #14
0
 /**
  * Generates an n word grams text file from an input wikipedia XML file.
  * The input can be bz2/gz compressed or uncompressed, or a folder of
  * such files. Each file is scanned line by line for the n word gram
  * pattern appropriate to $ngram_type; matches are collected,
  * de-duplicated, sorted, and written to the locale's resources folder.
  *
  * @param string $wiki_file compressed or uncompressed wikipedia
  *     XML file path (under PREP_DIR) used to extract n-grams; may also
  *     be a folder containing such files
  * @param string $lang language used to build the match patterns
  * @param string $locale locale whose resources folder receives results
  * @param mixed $num_gram number of words per gram, or the string "all"
  * @param int $ngram_type where in the wiki dump to extract grams from
  * @param int $max_terms maximum number of n-grams to keep (-1 for all)
  * @return array pair (count of n-grams written, max gram length seen)
  */
 static function makeNWordGramsTextFile($wiki_file, $lang, $locale, $num_gram = 2, $ngram_type = self::PAGE_COUNT_WIKIPEDIA, $max_terms = -1)
 {
     $output_message_threshold = self::BLOCK_SIZE * self::BLOCK_SIZE;
     $is_count_type = false;
     // Pick the regex that recognizes a gram line for this dump type.
     // NOTE(review): an unrecognized $ngram_type leaves $pattern unset --
     // confirm callers only ever pass the four known types
     switch ($ngram_type) {
         case self::WIKI_DUMP_TITLE:
             $pattern = '/<title>[^\\p{P}]+';
             $pattern_end = '<\\/title>/u';
             $replace_array = array('<title>', '</title>');
             break;
         case self::WIKI_DUMP_REDIRECT:
             $pattern = '/#redirect\\s\\[\\[[^\\p{P}]+';
             $pattern_end = '\\]\\]/u';
             $replace_array = array('#redirect [[', ']]');
             break;
         case self::PAGE_COUNT_WIKIPEDIA:
             $pattern = '/^' . $lang . '\\s[^\\p{P}]+';
             $pattern_end = '/u';
             $is_count_type = true;
             break;
         case self::PAGE_COUNT_WIKTIONARY:
             $pattern = '/^' . $lang . '.d\\s[^\\p{P}]+';
             $pattern_end = '/u';
             $is_count_type = true;
             break;
     }
     $is_all = false;
     $repeat_pattern = "[\\s|_][^\\p{P}]+";
     // "all" and page-count types match any number of following words;
     // otherwise repeat the word pattern exactly $num_gram - 1 more times
     if ($num_gram == "all" || $is_count_type) {
         $pattern .= "({$repeat_pattern})+";
         if ($num_gram == "all") {
             $is_all = true;
         }
         $max_gram_len = -1;
     } else {
         for ($i = 1; $i < $num_gram; $i++) {
             $pattern .= $repeat_pattern;
         }
         $max_gram_len = $num_gram;
     }
     $pattern .= $pattern_end;
     if (is_dir(PREP_DIR . "/{$wiki_file}")) {
         $folder_files = glob(PREP_DIR . "/{$wiki_file}/*.{gz,bz}", GLOB_BRACE);
     } else {
         $folder_files = array(PREP_DIR . "/{$wiki_file}");
     }
     // Output path does not depend on the input file, so compute it once
     // (was recomputed on every loop iteration)
     $ngrams_file_path = LOCALE_DIR . "/{$locale}/resources/" . "{$num_gram}" . self::TEXT_SUFFIX;
     $ngrams = array();
     foreach ($folder_files as $wiki_file_path) {
         // Choose reader/closer functions based on the compression type
         if (strpos($wiki_file_path, "bz2") !== false) {
             $fr = bzopen($wiki_file_path, 'r') or die("Can't open compressed file");
             $read = "bzread";
             $close = "bzclose";
         } else {
             if (strpos($wiki_file_path, "gz") !== false) {
                 $fr = gzopen($wiki_file_path, 'r') or die("Can't open compressed file");
                 $read = "gzread";
                 $close = "gzclose";
             } else {
                 $fr = fopen($wiki_file_path, 'r') or die("Can't open file");
                 $read = "fread";
                 $close = "fclose";
             }
         }
         $input_buffer = "";
         $time = time();
         echo "Reading wiki file ...{$wiki_file_path}...\n";
         $bytes = 0;
         $bytes_since_last_output = 0;
         while (!feof($fr)) {
             $input_text = $read($fr, self::BLOCK_SIZE);
             $len = strlen($input_text);
             if ($len == 0) {
                 break;
             }
             $bytes += $len;
             $bytes_since_last_output += $len;
             if ($bytes_since_last_output > $output_message_threshold) {
                 echo "Have now read " . $bytes . " many bytes." . " Peak memory so far: " . memory_get_peak_usage() . ".\n     Number of word grams so far: " . count($ngrams) . ". Elapsed time so far: " . (time() - $time) . "s\n";
                 $bytes_since_last_output = 0;
             }
             // Process whole lines only; keep the trailing partial line
             // in the buffer for the next read
             $input_buffer .= mb_strtolower($input_text);
             $lines = explode("\n", $input_buffer);
             $input_buffer = array_pop($lines);
             foreach ($lines as $line) {
                 preg_match($pattern, $line, $matches);
                 if (count($matches) > 0) {
                     if ($is_count_type) {
                         // page-count lines look like: lang title count
                         $line_parts = explode(" ", $matches[0]);
                         if (isset($line_parts[1]) && isset($line_parts[2])) {
                             $ngram = mb_ereg_replace("_", " ", $line_parts[1]);
                             $char_grams = PhraseParser::getCharGramsTerm(array($ngram), $locale);
                             $ngram = implode(" ", $char_grams);
                             $ngram_num_words = mb_substr_count($ngram, " ") + 1;
                             if ($is_all && $ngram_num_words > 1 || !$is_all && $ngram_num_words == $num_gram) {
                                 $ngrams[$ngram] = $line_parts[2];
                             }
                         }
                     } else {
                         /* Bug fix: mb_ereg_replace() takes a string
                            pattern, not an array; str_replace() strips
                            the literal delimiters ('<title>' etc.) */
                         $ngram = str_replace($replace_array, "", $matches[0]);
                         $ngram = mb_ereg_replace("_", " ", $ngram);
                         $ngrams[] = $ngram;
                     }
                     if ($is_all && isset($ngram)) {
                         $ngram_num_words = mb_substr_count($ngram, " ") + 1;
                         $max_gram_len = max($max_gram_len, $ngram_num_words);
                     }
                 }
             }
         }
         /* Bug fix: close each handle as we finish with it; previously a
            single $close($fr) after this loop closed only the last file,
            leaking the others (and erroring when no files matched) */
         $close($fr);
     }
     // For count types, order by descending count before dropping counts
     if ($is_count_type) {
         arsort($ngrams);
         $ngrams = array_keys($ngrams);
     }
     $ngrams = array_unique($ngrams);
     $num_ngrams_found = count($ngrams);
     if ($max_terms > 0 && $num_ngrams_found > $max_terms) {
         $ngrams = array_slice($ngrams, 0, $max_terms);
     }
     $num_ngrams_found = count($ngrams);
     // in is_all case add prefix*'s for (n >= 3)-grams
     if ($is_all) {
         for ($i = 0; $i < $num_ngrams_found; $i++) {
             $ngram_in_word = mb_substr_count($ngrams[$i], " ") + 1;
             if ($ngram_in_word >= 3) {
                 $ngram_parts = explode(" ", $ngrams[$i]);
                 $ngram = $ngram_parts[0];
                 for ($j = 1; $j < $ngram_in_word - 1; $j++) {
                     $ngram .= " " . $ngram_parts[$j];
                     $ngrams[] = $ngram . "*";
                 }
             }
         }
         $ngrams = array_unique($ngrams);
         $num_ngrams_found = count($ngrams);
     }
     sort($ngrams);
     $ngrams_string = implode("\n", $ngrams);
     file_put_contents($ngrams_file_path, $ngrams_string);
     return array($num_ngrams_found, $max_gram_len);
 }