Beispiel #1
0
    /**
     * Checks whether the same search threshold can classify p**n from
     * non-p**n sites. Samples were taken from a couple of p**n sites,
     * sorted alphabetically by word and then some of the non-sensitive words
     * were substituted so as to avoid copyright issues. For the safe tests
     * a similar process was done with the Wizard of Oz (now public domain)
     * and with some sexually related Wikipedia articles (Creative Commons SA).
     */
    function computeSafeSearchScoreTestCase()
    {
        /*
           The sample texts are kept as heredocs at column zero so that their
           byte length (which is passed to the scorer) is exactly what is
           written here.
         */
        $easy_safe_text = <<<EOD
a a a a a a a a a a a a all and and
and and and and and another any arose at aunt aunt be bed bed beds big
build building by by called carried case cellar cellar chairs contained
cookstove corner corner could crush cupboard cyclone dark dishes door
dorothy dorothy down dug em em enough except family farmer farmer's floor
floor for for four four from garret go great great ground had had henry
henry hole hole house in in in in in in in into it it its kansas ladder led
little lived looking lumber made many middle midst mighty miles no no of of
of one one one or path prairies reached roof room room rusty small small
small table the the the the the the the the the the the their there there
this those three to to to trap uncle uncle wagon walls was was was was was
were where which which whirlwinds who who wife with
EOD;
        $easy_unsafe_text = <<<EOD
a afraid all and anon baby big boobs but c**k crave dicking does
for from grown has how in is isnt knot lolita matts monster pussies ready
she she shew s**t teens their thom them thought they're tight to to to total
up use whether
EOD;
        $harder_unsafe_text = <<<EOD
a a a a a adventure after all alotta amazing and and and and and
and and and and and around as ball ball big body boobies bounce boy
brunhilda came check check chilled cirque do enjoy ensued exercises
flap friends f*****g f*****g give going gorge got got grabbing had
had had has he hell her her horny i if in it it it it it jog junk
just know little little loved me mean melons melons my my of on out out
ploy precious kitties see she she she sought sizzle so so spent spicy
started stretch sucking swinging that that that the the the then things
those those those tit titties titty to to to togo today tramp truly
us was we we we what what when what wild with with with workout wrap yes
you
EOD;
        $harder_safe_text_one = <<<EOD
amino hog known a a a a an and and
and and are are as as asymmetry be biology both but can cases cells
combining combining contain deem distance each early evolved exist
female female for firm firm from function gametes gametes gametes gametes
genetic genentech has ideal in in in in information disinherit into intone
is isopod known large mole mole many mixing motile motile necessary
non nutrients of of of of offspring often optimized or organism organisms
over parents process reproduce reproduce result sex sex sexual
sexual small specialist specialized specific such that that the the the
the their to to traits traits transport two types variety while young
EOD;
        $harder_safe_text_two = <<<EOD
a a active adverb an an and are as as as attribute be
between by caught characterized daft describe describe desire desire deft
french female female females having homosexuality identify in is language
lesbian may moist verb object of of or or or others secondary refer relay
romantic same sex sexual trim the the the them to to to to to used
used who who wide women ward
EOD;
        /*
           Each case is: text to score, whether the score is required to be
           below (true) or above (false) the 0.012 threshold, and the message
           reported with the assertion. Cases run in the order listed.
         */
        $cases = array(
            array($easy_safe_text, true, "Easy Safe Test 1"),
            array($easy_unsafe_text, false, "Easy Unsafe Test 1"),
            array($harder_unsafe_text, false, "Harder Unsafe Test 1"),
            array($harder_safe_text_one, true, "Harder Safe Test 1"),
            array($harder_safe_text_two, true, "Harder Safe Test 2"),
        );
        foreach ($cases as $case) {
            list($text, $expect_safe, $message) = $case;
            $phrases = PhraseParser::extractPhrasesInLists($text, "en-US");
            $score = PhraseParser::computeSafeSearchScore($phrases,
                strlen($text));
            if ($expect_safe) {
                $this->assertTrue($score < 0.012, $message);
            } else {
                $this->assertTrue($score > 0.012, $message);
            }
        }
    }
Beispiel #2
0
 /**
  * Handles admin request related to controlling file options to be used
  * in a crawl
  *
  * This activity allows a user to specify the page range size to be
  * used during a crawl as well as which file types can be downloaded
  */
 function pageOptions()
 {
     /*
        Overview: reads the current crawl seed info, overlays either the
        options of a previously saved crawl (load_option) or the values
        posted in $_REQUEST, fills in defaults for anything still missing,
        writes the seed info back through the crawl model and returns the
        $data array of fields computed along the way. When the test options
        tab is selected and a test page was posted, that page is also run
        through a page processor, the page rules and the phrase parser, and
        the intermediate results are added to $data.
      */
     global $INDEXED_FILE_TYPES;
     /* get processors for different file types (populating
        $INDEXED_FILE_TYPES) */
     foreach (glob(BASE_DIR . "/lib/processors/*_processor.php") as $filename) {
         require_once $filename;
     }
     $parent = $this->parent;
     $crawl_model = $parent->model("crawl");
     $profile_model = $parent->model("profile");
     $data["ELEMENT"] = "pageoptions";
     $data['SCRIPT'] = "";
     $machine_urls = $parent->model("machine")->getQueueServerUrls();
     $num_machines = count($machine_urls);
     /* && binds tighter than ||: NULL is used when there are no queue
        servers, or exactly one and it is a localhost url */
     if ($num_machines < 1 || $num_machines == 1 && UrlParser::isLocalhostUrl($machine_urls[0])) {
         $machine_urls = NULL;
     }
     /* entries 0 and 1 are the fixed choices; saved crawls with a
        description are added keyed by their crawl time */
     $data['available_options'] = array(tl('crawl_component_use_below'), tl('crawl_component_use_defaults'));
     $crawls = $crawl_model->getCrawlList(false, true, $machine_urls);
     $data['options_default'] = tl('crawl_component_use_below');
     foreach ($crawls as $crawl) {
         if (strlen($crawl['DESCRIPTION']) > 0) {
             $data['available_options'][$crawl['CRAWL_TIME']] = $crawl['DESCRIPTION'];
         }
     }
     $seed_info = $crawl_model->getSeedInfo();
     // allowed values for the drop downs; also used to validate requests
     $data['RECRAWL_FREQS'] = array(-1 => tl('crawl_component_recrawl_never'), 1 => tl('crawl_component_recrawl_1day'), 2 => tl('crawl_component_recrawl_2day'), 3 => tl('crawl_component_recrawl_3day'), 7 => tl('crawl_component_recrawl_7day'), 14 => tl('crawl_component_recrawl_14day'));
     $data['SIZE_VALUES'] = array(10000 => 10000, 50000 => 50000, 100000 => 100000, 500000 => 500000, 1000000 => 1000000, 5000000 => 5000000, 10000000 => 10000000);
     $data['LEN_VALUES'] = array(2000 => 2000, 10000 => 10000, 50000 => 50000, 100000 => 100000, 500000 => 500000, 1000000 => 1000000, 5000000 => 5000000, 10000000 => 10000000);
     $data['available_summarizers'] = array(self::BASIC_SUMMARIZER => tl('crawl_component_basic'), self::CENTROID_SUMMARIZER => tl('crawl_component_centroid'));
     $loaded = false;
     if (isset($_REQUEST['load_option']) && $_REQUEST['load_option'] > 0) {
         /* option 1 is the "use defaults" entry of available_options;
            any other positive value is treated as a crawl timestamp */
         if ($_REQUEST['load_option'] == 1) {
             $seed_loaded = $crawl_model->getSeedInfo(true);
         } else {
             $timestamp = substr($parent->clean($_REQUEST['load_option'], "int"), 0, TIMESTAMP_LEN);
             $seed_loaded = $crawl_model->getCrawlSeedInfo($timestamp, $machine_urls);
         }
         // only these page-option related fields are copied from the load
         $copy_options = array("general" => array("page_recrawl_frequency", "page_range_request", "max_description_len", "cache_pages", 'summarizer_option'), "indexed_file_types" => array("extensions"), "indexing_plugins" => array("plugins", "plugins_data"));
         foreach ($copy_options as $main_option => $sub_options) {
             foreach ($sub_options as $sub_option) {
                 if (isset($seed_loaded[$main_option][$sub_option])) {
                     $seed_info[$main_option][$sub_option] = $seed_loaded[$main_option][$sub_option];
                 }
             }
         }
         if (isset($seed_loaded['page_rules'])) {
             $seed_info['page_rules'] = $seed_loaded['page_rules'];
         }
         if (isset($seed_loaded['active_classifiers'])) {
             $seed_info['active_classifiers'] = $seed_loaded['active_classifiers'];
         } else {
             $seed_info['active_classifiers'] = array();
             $seed_info['active_classifiers']['label'] = array();
         }
         $loaded = true;
     } else {
         $seed_info = $crawl_model->getSeedInfo();
         // request values are accepted only if they are an offered choice
         if (isset($_REQUEST["page_recrawl_frequency"]) && in_array($_REQUEST["page_recrawl_frequency"], array_keys($data['RECRAWL_FREQS']))) {
             $seed_info["general"]["page_recrawl_frequency"] = $_REQUEST["page_recrawl_frequency"];
         }
         if (isset($_REQUEST["page_range_request"]) && in_array($_REQUEST["page_range_request"], $data['SIZE_VALUES'])) {
             $seed_info["general"]["page_range_request"] = $_REQUEST["page_range_request"];
         }
         if (isset($_REQUEST['summarizer_option']) && in_array($_REQUEST['summarizer_option'], array_keys($data['available_summarizers']))) {
             $seed_info['general']['summarizer_option'] = $_REQUEST['summarizer_option'];
         }
         if (isset($_REQUEST["max_description_len"]) && in_array($_REQUEST["max_description_len"], $data['LEN_VALUES'])) {
             $seed_info["general"]["max_description_len"] = $_REQUEST["max_description_len"];
         }
         if (isset($_REQUEST["cache_pages"])) {
             $seed_info["general"]["cache_pages"] = true;
         } else {
             if (isset($_REQUEST['posted'])) {
                 //form sent but check box unchecked
                 $seed_info["general"]["cache_pages"] = false;
             }
         }
         if (isset($_REQUEST['page_rules'])) {
             $seed_info['page_rules']['rule'] = $parent->convertStringCleanArray($_REQUEST['page_rules'], 'rule');
         }
     }
     /*
        Defaults are applied only after both branches above: the else
        branch re-reads the seed info, so a default set earlier would be
        lost there and the in_array() on the extensions below would be
        handed an undefined value.
      */
     if (!isset($seed_info["indexed_file_types"]["extensions"])) {
         $seed_info["indexed_file_types"]["extensions"] = $INDEXED_FILE_TYPES;
     }
     if (!isset($seed_info["general"]["page_recrawl_frequency"])) {
         $seed_info["general"]["page_recrawl_frequency"] = PAGE_RECRAWL_FREQUENCY;
     }
     /* summarizer_option is read here and again when the test page
        processor is constructed, so it needs a default like the other
        general options */
     if (!isset($seed_info['general']['summarizer_option'])) {
         $seed_info['general']['summarizer_option'] = self::BASIC_SUMMARIZER;
     }
     $data['summarizer_option'] = $seed_info['general']['summarizer_option'];
     $data['PAGE_RECRAWL_FREQUENCY'] = $seed_info["general"]["page_recrawl_frequency"];
     if (!isset($seed_info["general"]["cache_pages"])) {
         $seed_info["general"]["cache_pages"] = false;
     }
     $data["CACHE_PAGES"] = $seed_info["general"]["cache_pages"];
     if (!isset($seed_info["general"]["page_range_request"])) {
         $seed_info["general"]["page_range_request"] = PAGE_RANGE_REQUEST;
     }
     $data['PAGE_SIZE'] = $seed_info["general"]["page_range_request"];
     if (!isset($seed_info["general"]["max_description_len"])) {
         $seed_info["general"]["max_description_len"] = MAX_DESCRIPTION_LEN;
     }
     $data['MAX_LEN'] = $seed_info["general"]["max_description_len"];
     $data['INDEXING_PLUGINS'] = array();
     /* on a form post (that is not a load) the posted plugin list replaces
        the stored one; no INDEXING_PLUGINS field means none are selected */
     if (isset($_REQUEST["posted"]) && !$loaded) {
         $seed_info['indexing_plugins']['plugins'] = isset($_REQUEST["INDEXING_PLUGINS"]) ? $_REQUEST["INDEXING_PLUGINS"] : array();
     }
     $included_plugins = isset($seed_info['indexing_plugins']['plugins']) ? $seed_info['indexing_plugins']['plugins'] : array();
     foreach ($parent->indexing_plugins as $plugin) {
         $plugin_name = ucfirst($plugin);
         $data['INDEXING_PLUGINS'][$plugin_name]['checked'] = in_array($plugin_name, $included_plugins) ? "checked='checked'" : "";
         /* to use method_exists we want that the require_once for the plugin
              class has occurred so we instantiate the object via the plugin
              method call which will also do the require if needed.
            */
         $plugin_object = $parent->plugin(lcfirst($plugin_name));
         $class_name = $plugin_name . "Plugin";
         /* after a load, plugin configuration is taken from the loaded
            plugins_data if present, otherwise reset to the plugin defaults,
            and then saved */
         if ($loaded && method_exists($class_name, 'setConfiguration') && method_exists($class_name, 'loadDefaultConfiguration')) {
             if (isset($seed_info['indexing_plugins']['plugins_data'][$plugin_name])) {
                 $plugin_object->setConfiguration($seed_info['indexing_plugins']['plugins_data'][$plugin_name]);
             } else {
                 $plugin_object->loadDefaultConfiguration();
             }
             $plugin_object->saveConfiguration();
         }
         if (method_exists($class_name, 'configureHandler') && method_exists($class_name, 'configureView')) {
             $data['INDEXING_PLUGINS'][$plugin_name]['configure'] = true;
             $plugin_object->configureHandler($data);
         } else {
             $data['INDEXING_PLUGINS'][$plugin_name]['configure'] = false;
         }
     }
     $profile = $profile_model->getProfile(WORK_DIRECTORY);
     if (!isset($_REQUEST['load_option'])) {
         $data = array_merge($data, $profile);
     } else {
         $parent->updateProfileFields($data, $profile, array('IP_LINK', 'CACHE_LINK', 'SIMILAR_LINK', 'IN_LINK', 'SIGNIN_LINK', 'SUBSEARCH_LINK', 'WORD_SUGGEST'));
     }
     /* search time weights: a request value wins, then a non-empty profile
        value, then the default listed here. The profile is rewritten only
        if a value came from the request or from a default. */
     $weights = array('TITLE_WEIGHT' => 4, 'DESCRIPTION_WEIGHT' => 1, 'LINK_WEIGHT' => 2, 'MIN_RESULTS_TO_GROUP' => 200, 'SERVER_ALPHA' => 1.6);
     $change = false;
     foreach ($weights as $weight => $value) {
         if (isset($_REQUEST[$weight])) {
             $data[$weight] = $parent->clean($_REQUEST[$weight], 'float', 1);
             $profile[$weight] = $data[$weight];
             $change = true;
         } else {
             if (isset($profile[$weight]) && $profile[$weight] != "") {
                 $data[$weight] = $profile[$weight];
             } else {
                 $data[$weight] = $value;
                 $profile[$weight] = $data[$weight];
                 $change = true;
             }
         }
     }
     if ($change == true) {
         $profile_model->updateProfile(WORK_DIRECTORY, array(), $profile);
     }
     /* file types: posted check boxes win unless options were just loaded;
        otherwise the extensions already in the seed info are used */
     $data['INDEXED_FILE_TYPES'] = array();
     $filetypes = array();
     foreach ($INDEXED_FILE_TYPES as $filetype) {
         $ison = false;
         if (isset($_REQUEST["filetype"]) && !$loaded) {
             if (isset($_REQUEST["filetype"][$filetype])) {
                 $filetypes[] = $filetype;
                 $ison = true;
                 $change = true;
             }
         } else {
             if (in_array($filetype, $seed_info["indexed_file_types"]["extensions"])) {
                 $filetypes[] = $filetype;
                 $ison = true;
             }
         }
         $data['INDEXED_FILE_TYPES'][$filetype] = $ison ? "checked='checked'" : '';
     }
     $seed_info["indexed_file_types"]["extensions"] = $filetypes;
     /* each classifier label can be active as a classifier and, separately,
        as a ranker; both are decided with the same precedence as above */
     $data['CLASSIFIERS'] = array();
     $data['RANKERS'] = array();
     $active_classifiers = array();
     $active_rankers = array();
     foreach (Classifier::getClassifierList() as $classifier) {
         $label = $classifier->class_label;
         $ison = false;
         if (isset($_REQUEST['classifier']) && !$loaded) {
             if (isset($_REQUEST['classifier'][$label])) {
                 $ison = true;
             }
         } else {
             if ($loaded || !isset($_REQUEST['posted']) && isset($seed_info['active_classifiers']['label'])) {
                 if (in_array($label, $seed_info['active_classifiers']['label'])) {
                     $ison = true;
                 }
             }
         }
         if ($ison) {
             $data['CLASSIFIERS'][$label] = 'checked="checked"';
             $active_classifiers[] = $label;
         } else {
             $data['CLASSIFIERS'][$label] = '';
         }
         $ison = false;
         if (isset($_REQUEST['ranker']) && !$loaded) {
             if (isset($_REQUEST['ranker'][$label])) {
                 $ison = true;
             }
         } else {
             if ($loaded || !isset($_REQUEST['posted']) && isset($seed_info['active_rankers']['label'])) {
                 if (isset($seed_info['active_rankers']['label']) && in_array($label, $seed_info['active_rankers']['label'])) {
                     $ison = true;
                 }
             }
         }
         if ($ison) {
             $data['RANKERS'][$label] = 'checked="checked"';
             $active_rankers[] = $label;
         } else {
             $data['RANKERS'][$label] = '';
         }
     }
     $parent->pagingLogic($data, 'CLASSIFIERS', 'CLASSIFIERS', DEFAULT_ADMIN_PAGING_NUM / 5, array(), "", array('name' => 'class_label'));
     $seed_info['active_classifiers']['label'] = $active_classifiers;
     $seed_info['active_rankers']['label'] = $active_rankers;
     /*
        The rules may be stored either directly under ['rule'] or one level
        deeper under ['rule']['rule']. The flat list is worked out once
        here and reused for the rule test further down, so that both places
        see the same rules and neither touches an unset index.
      */
     $page_rules = array();
     if (isset($seed_info['page_rules']['rule'])) {
         if (isset($seed_info['page_rules']['rule']['rule'])) {
             $page_rules = $seed_info['page_rules']['rule']['rule'];
         } else {
             $page_rules = $seed_info['page_rules']['rule'];
         }
         $data['page_rules'] = $parent->convertArrayLines($page_rules);
     } else {
         $data['page_rules'] = "";
     }
     // which of the three tabs is shown; anything unknown means crawl_time
     $allowed_options = array('crawl_time', 'search_time', 'test_options');
     if (isset($_REQUEST['option_type']) && in_array($_REQUEST['option_type'], $allowed_options)) {
         $data['option_type'] = $_REQUEST['option_type'];
     } else {
         $data['option_type'] = 'crawl_time';
     }
     if ($data['option_type'] == 'crawl_time') {
         $data['crawl_time_active'] = "active";
         $data['search_time_active'] = "";
         $data['test_options_active'] = "";
         $data['SCRIPT'] .= "\nswitchTab('crawltimetab'," . "'searchtimetab', 'testoptionstab')\n";
     } else {
         if ($data['option_type'] == 'search_time') {
             $data['search_time_active'] = "active";
             $data['crawl_time_active'] = "";
             $data['test_options_active'] = "";
             $data['SCRIPT'] .= "\nswitchTab('searchtimetab'," . "'crawltimetab', 'testoptionstab')\n";
         } else {
             $data['search_time_active'] = "";
             $data['crawl_time_active'] = "";
             $data['test_options_active'] = "active";
             $data['SCRIPT'] .= "\nswitchTab('testoptionstab'," . "'crawltimetab', 'searchtimetab');\n";
         }
     }
     $crawl_model->setSeedInfo($seed_info);
     if ($change == true && $data['option_type'] != 'test_options') {
         $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('crawl_component_page_options_updated') . "</h1>')";
     }
     // mime type => prefix of the processor class used to test a page
     $test_processors = array("text/html" => "html", "text/asp" => "html", "text/xml" => "xml", "text/robot" => "robot", "application/xml" => "xml", "application/xhtml+xml" => "html", "application/rss+xml" => "rss", "application/atom+xml" => "rss", "text/csv" => "text", "text/gopher" => "gopher", "text/plain" => "text", "text/rtf" => "rtf", "text/tab-separated-values" => "text");
     $data['MIME_TYPES'] = array_keys($test_processors);
     $data['page_type'] = "text/html";
     if (isset($_REQUEST['page_type']) && in_array($_REQUEST['page_type'], $data['MIME_TYPES'])) {
         $data['page_type'] = $_REQUEST['page_type'];
     }
     /* $data['TESTPAGE'] is the cleaned copy; the processor below is given
        the raw request value */
     $data['TESTPAGE'] = isset($_REQUEST['TESTPAGE']) ? $parent->clean($_REQUEST['TESTPAGE'], 'string') : "";
     if ($data['option_type'] == 'test_options' && $data['TESTPAGE'] != "") {
         $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('crawl_component_page_options_running_tests') . "</h1>')";
         // placeholder download info for the posted test page
         $site = array();
         $site[self::ENCODING] = "UTF-8";
         $site[self::URL] = "http://test-site.yioop.com/";
         $site[self::IP_ADDRESSES] = array("1.1.1.1");
         $site[self::HTTP_CODE] = 200;
         $site[self::MODIFIED] = date("U", time());
         $site[self::TIMESTAMP] = time();
         $site[self::TYPE] = "text/html";
         $site[self::HEADER] = "page options test extractor";
         $site[self::SERVER] = "unknown";
         $site[self::SERVER_VERSION] = "unknown";
         $site[self::OPERATING_SYSTEM] = "unknown";
         $site[self::LANG] = 'en';
         $site[self::JUST_METAS] = false;
         if (isset($_REQUEST['page_type']) && in_array($_REQUEST['page_type'], $data['MIME_TYPES'])) {
             $site[self::TYPE] = $_REQUEST['page_type'];
         }
         if ($site[self::TYPE] == 'text/html') {
             $site[self::ENCODING] = guessEncodingHtml($_REQUEST['TESTPAGE']);
         }
         $prefix_name = $test_processors[$site[self::TYPE]];
         $processor_name = ucfirst($prefix_name) . "Processor";
         /* collect the selected plugins that support this processor or one
            of its ancestor classes (the walk stops at PageProcessor) */
         $plugin_processors = array();
         if (isset($seed_info['indexing_plugins']['plugins'])) {
             foreach ($seed_info['indexing_plugins']['plugins'] as $plugin) {
                 $plugin_name = $plugin . "Plugin";
                 $supported_processors = $plugin_name::getProcessors();
                 foreach ($supported_processors as $supported_processor) {
                     $parent_processor = $processor_name;
                     do {
                         if ($supported_processor == $parent_processor) {
                             $plugin_object = $parent->plugin(lcfirst($plugin));
                             if (method_exists($plugin_name, "loadConfiguration")) {
                                 $plugin_object->loadConfiguration();
                             }
                             $plugin_processors[] = $plugin_object;
                             break;
                         }
                     } while (($parent_processor = get_parent_class($parent_processor)) && $parent_processor != "PageProcessor");
                 }
             }
         }
         $page_processor = new $processor_name($plugin_processors, $seed_info["general"]["max_description_len"], $seed_info["general"]["summarizer_option"]);
         /* the previous error handler is active while the processor runs;
            yioop_error_handler is reinstalled right after */
         restore_error_handler();
         $data["PAGE_RANGE_REQUEST"] = $seed_info["general"]["page_range_request"];
         $doc_info = $page_processor->handle(substr($_REQUEST['TESTPAGE'], 0, $data["PAGE_RANGE_REQUEST"]), $site[self::URL]);
         set_error_handler("yioop_error_handler");
         if (!$doc_info) {
             // nothing extracted: report empty results for every stage
             $data["AFTER_PAGE_PROCESS"] = "";
             $data["AFTER_RULE_PROCESS"] = "";
             $data["EXTRACTED_WORDS"] = "";
             $data["EXTRACTED_META_WORDS"] = "";
             return $data;
         }
         if ($processor_name != "RobotProcessor" && !isset($doc_info[self::JUST_METAS])) {
             $doc_info[self::LINKS] = UrlParser::pruneLinks($doc_info[self::LINKS]);
         }
         foreach ($doc_info as $key => $value) {
             $site[$key] = $value;
         }
         if (isset($site[self::PAGE])) {
             unset($site[self::PAGE]);
         }
         if (isset($site[self::ROBOT_PATHS])) {
             $site[self::JUST_METAS] = true;
         }
         /* map CrawlConstants values back to their constant names so the
            dumps below show readable keys */
         $reflect = new ReflectionClass("CrawlConstants");
         $crawl_constants = $reflect->getConstants();
         $crawl_keys = array_keys($crawl_constants);
         $crawl_values = array_values($crawl_constants);
         $inverse_constants = array_combine($crawl_values, $crawl_keys);
         $after_process = array();
         foreach ($site as $key => $value) {
             $out_key = isset($inverse_constants[$key]) ? $inverse_constants[$key] : $key;
             $after_process[$out_key] = $value;
         }
         $data["AFTER_PAGE_PROCESS"] = wordwrap($parent->clean(print_r($after_process, true), "string"), 75, "\n", true);
         // $page_rules is the flat rule list computed earlier (may be empty)
         $rule_string = implode("\n", $page_rules);
         $rule_string = html_entity_decode($rule_string, ENT_QUOTES);
         $page_rule_parser = new PageRuleParser($rule_string);
         $page_rule_parser->executeRuleTrees($site);
         $after_process = array();
         foreach ($site as $key => $value) {
             $out_key = isset($inverse_constants[$key]) ? $inverse_constants[$key] : $key;
             $after_process[$out_key] = $value;
         }
         $data["AFTER_RULE_PROCESS"] = wordwrap($parent->clean(print_r($after_process, true), "string"), 75, "\n", true);
         $lang = NULL;
         if (isset($site[self::LANG])) {
             $lang = $site[self::LANG];
         }
         $meta_ids = PhraseParser::calculateMetas($site);
         if (!$site[self::JUST_METAS]) {
             $host_words = UrlParser::getWordsIfHostUrl($site[self::URL]);
             $path_words = UrlParser::getWordsLastPathPartUrl($site[self::URL]);
             $phrase_string = $host_words . " " . $site[self::TITLE] . " " . $path_words . " " . $site[self::DESCRIPTION];
             // guess the locale from the title, else the start of the description
             if ($site[self::TITLE] != "") {
                 $lang = guessLocaleFromString($site[self::TITLE], $lang);
             } else {
                 $lang = guessLocaleFromString(substr($site[self::DESCRIPTION], 0, AD_HOC_TITLE_LENGTH), $lang);
             }
             $word_lists = PhraseParser::extractPhrasesInLists($phrase_string, $lang);
             $len = strlen($phrase_string);
             // a score under 0.012 is tagged as safe
             if (PhraseParser::computeSafeSearchScore($word_lists, $len) < 0.012) {
                 $meta_ids[] = "safe:true";
                 $safe = true;
             } else {
                 $meta_ids[] = "safe:false";
                 $safe = false;
             }
         }
         if (!isset($word_lists)) {
             $word_lists = array();
         }
         $data["EXTRACTED_WORDS"] = wordwrap($parent->clean(print_r($word_lists, true), "string"), 75, "\n", true);
         $data["EXTRACTED_META_WORDS"] = wordwrap($parent->clean(print_r($meta_ids, true), "string"), 75, "\n", true);
     }
     return $data;
 }
Beispiel #3
0
 /**
  * Scores documents according to the absence or presence of sexually explicit
  * terms. Tries to work for several languages. Very crude classifier.
  *
  * @param array $word_lists word => pos_list tuples
  * @param int $len length of text being examined in characters
  * @return float|int $score of how explicit the document is (0 if $word_lists is empty)
  */
 static function computeSafeSearchScore(&$word_lists, $len)
 {
     /* text listing the explicit terms in several languages; it is run
        through the phrase extractor once and the resulting keys are cached
        in $unsafe_terms for later calls */
     static $unsafe_phrase = "\nXXX sex s**t nymphomaniac MILF lolita lesbian sadomasochism\nbondage fisting erotic v****a Tribadism penis facial hermaphrodite\ntranssexual tranny bestiality snuff boob fondle tit\nblowjob lap c**k dick hardcore pr0n f**k pussy penetration ass\ncunt bisexual prostitution screw ass masturbation clitoris c**t suck w***e bitch\nbellaco cachar chingar shimar chinquechar chichar clavar coger culear hundir\njoder mámalo singar cojon carajo caray bicho concha chucha chocha\nchuchamadre coño panocha almeja culo fundillo fundío puta puto teta\nconnorito cul pute putain sexe pénis vulve foutre baiser sein nicher nichons\nputa sapatão foder ferro punheta vadia buceta bucetinha bunda caralho\nmentula cunnus verpa sōpiō pipinna\ncōleī cunnilingus futuō copulate cēveō crīsō\nscortor meretrīx futatrix minchia coglione cornuto culo inocchio frocio puttana\nvaffanculo fok hoer kut lul やりまん 打っ掛け\n 二形 ふたなりゴックン ゴックン\nショタコン 全裸 受け 裏本 пизда́ хуй еба́ть\nблядь елда́ гондо́н хер манда́ му́ди мудя\nпидора́с залу́па жо́па за́дница буфер\n雞巴 鷄巴 雞雞 鷄鷄 阴茎 陰莖 胯下物\n屌 吊 小鳥 龟头 龜頭 屄 鸡白 雞白 傻屄 老二\n那话儿 那話兒 屄 鸡白 雞白 阴道 陰道\n阴户 陰戶 大姨妈 淫蟲 老嫖 妓女 臭婊子 卖豆腐\n賣豆腐 咪咪 大豆腐 爆乳 肏操\n炒饭 炒飯 cặc lồn kaltak orospu siktir sıçmak amcık";
     static $unsafe_terms = array();
     // nothing to score
     if (count($word_lists) == 0) {
         return 0;
     }
     if ($unsafe_terms == array()) {
         $unsafe_terms = array_keys(
             PhraseParser::extractPhrasesInLists($unsafe_phrase, "en-US"));
     }
     // terms of the document that also occur in the unsafe term list
     $flagged_terms = array_intersect(array_keys($word_lists),
         $unsafe_terms);
     $distinct_hits = 0;
     $total_hits = 0;
     foreach ($flagged_terms as $flagged) {
         $occurrences = count($word_lists[$flagged]);
         if ($occurrences <= 0) {
             continue;
         }
         $total_hits += $occurrences;
         $distinct_hits++;
     }
     /* distinct unsafe terms times their total occurrences, relative to
        the text length; the + 1 keeps the divisor non-zero for $len = 0 */
     return $distinct_hits * $total_hits / ($len + 1);
 }
Beispiel #4
0
 /**
  * Builds an inverted index shard (word --> {docs it appears in})
  * for the current batch of SEEN_URLS_BEFORE_UPDATE_SCHEDULER many pages.
  * This inverted index shard is then merged by a queue_server
  * into the inverted index of the current generation of the crawl.
  * The complete inverted index for the whole crawl is built out of these
  * inverted indexes for generations. The point of computing a partial
  * inverted index on the fetcher is to reduce some of the computational
  * burden on the queue server. The resulting mini index computed by
  * buildMiniInvertedIndex() is stored in
  * $this->found_sites[self::INVERTED_INDEX]
  *
  */
 function buildMiniInvertedIndex()
 {
     $start_time = microtime();
     $keypad = "";
     crawlLog("  Start building mini inverted index ...  Current Memory:" . memory_get_usage());
     $num_seen = count($this->found_sites[self::SEEN_URLS]);
     $this->num_seen_sites += $num_seen;
     /*
         for the fetcher we are not saving the index shards so
         name doesn't matter.
     */
     if (!isset($this->found_sites[self::INVERTED_INDEX][$this->current_server])) {
         $this->found_sites[self::INVERTED_INDEX][$this->current_server] = new IndexShard("fetcher_shard_{$this->current_server}");
     }
     for ($i = 0; $i < $num_seen; $i++) {
         $interim_time = microtime();
         $site = $this->found_sites[self::SEEN_URLS][$i];
         if (!isset($site[self::HASH]) || isset($site[self::ROBOT_METAS]) && in_array("JUSTFOLLOW", $site[self::ROBOT_METAS])) {
             continue;
         }
         $doc_rank = false;
         if ($this->crawl_type == self::ARCHIVE_CRAWL && isset($this->archive_iterator)) {
             $doc_rank = $this->archive_iterator->weight($site);
         }
         if (isset($site[self::TYPE]) && $site[self::TYPE] == "link") {
             $is_link = true;
             $doc_keys = $site[self::HTTP_CODE];
             $site_url = $site[self::TITLE];
             $host = UrlParser::getHost($site_url);
             $link_parts = explode('|', $site[self::HASH]);
             if (isset($link_parts[5])) {
                 $link_origin = $link_parts[5];
             } else {
                 $link_origin = $site_url;
             }
             $meta_ids = PhraseParser::calculateLinkMetas($site_url, $host, $site[self::DESCRIPTION], $link_origin);
         } else {
             $is_link = false;
             $site_url = str_replace('|', "%7C", $site[self::URL]);
             $host = UrlParser::getHost($site_url);
             $doc_keys = crawlHash($site_url, true) . $site[self::HASH] . "d" . substr(crawlHash($host . "/", true), 1);
             $meta_ids = PhraseParser::calculateMetas($site, $this->video_sources);
         }
         $word_lists = array();
         /*
            self::JUST_METAS check to avoid getting sitemaps in results for
            popular words
         */
         $lang = NULL;
         if (!isset($site[self::JUST_METAS])) {
             $host_words = UrlParser::getWordsIfHostUrl($site_url);
             $path_words = UrlParser::getWordsLastPathPartUrl($site_url);
             if ($is_link) {
                 $phrase_string = $site[self::DESCRIPTION];
             } else {
                 if (isset($site[self::LANG])) {
                     if (isset($this->programming_language_extension[$site[self::LANG]])) {
                         $phrase_string = $site[self::DESCRIPTION];
                     } else {
                         $phrase_string = $host_words . " " . $site[self::TITLE] . " " . $path_words . " " . $site[self::DESCRIPTION];
                     }
                 } else {
                     $phrase_string = $host_words . " " . $site[self::TITLE] . " " . $path_words . " " . $site[self::DESCRIPTION];
                 }
             }
             if (isset($site[self::LANG])) {
                 $lang = guessLocaleFromString(mb_substr($site[self::DESCRIPTION], 0, AD_HOC_TITLE_LENGTH), $site[self::LANG]);
             }
             $word_lists = PhraseParser::extractPhrasesInLists($phrase_string, $lang);
             $len = strlen($phrase_string);
             if (isset($this->programming_language_extension[$lang]) || PhraseParser::computeSafeSearchScore($word_lists, $len) < 0.012) {
                 $meta_ids[] = "safe:true";
                 $safe = true;
             } else {
                 $meta_ids[] = "safe:false";
                 $safe = false;
             }
         }
         if (!$is_link) {
             //store inlinks so they can be searched by
             $num_links = count($site[self::LINKS]);
             if ($num_links > 0) {
                 $link_rank = false;
                 if ($doc_rank !== false) {
                     $link_rank = max($doc_rank - 1, 1);
                 }
             } else {
                 $link_rank = false;
             }
         }
         $num_queue_servers = count($this->queue_servers);
         if (isset($site[self::USER_RANKS]) && count($site[self::USER_RANKS]) > 0) {
             $score_keys = "";
             foreach ($site[self::USER_RANKS] as $label => $score) {
                 $score_keys .= packInt($score);
             }
             if (strlen($score_keys) % 8 != 0) {
                 $score_keys .= $keypad;
             }
             $doc_keys .= $score_keys;
         }
         $this->found_sites[self::INVERTED_INDEX][$this->current_server]->addDocumentWords($doc_keys, self::NEEDS_OFFSET_FLAG, $word_lists, $meta_ids, PhraseParser::$materialized_metas, true, $doc_rank);
         /*
            $this->no_process_links is set when doing things like
            mix recrawls. In this case links likely already will appear
            in what indexing, so don't index again. $site[self::JUST_META]
            is set when have a sitemap or robots.txt (this case set later).
            In this case link  info is not particularly useful for indexing
            and can greatly slow building inverted index.
         */
         if (!$this->no_process_links && !isset($site[self::JUST_METAS]) && !isset($this->programming_language_extension[$lang])) {
             foreach ($site[self::LINKS] as $url => $link_text) {
                 /* this mysterious check means won't index links from
                     robots.txt. Sitemap will still be in TO_CRAWL, but that's
                     done elsewhere
                    */
                 if (strlen($url) == 0 || is_numeric($url)) {
                     continue;
                 }
                 $link_host = UrlParser::getHost($url);
                 if (strlen($link_host) == 0) {
                     continue;
                 }
                 $part_num = calculatePartition($link_host, $num_queue_servers);
                 $summary = array();
                 if (!isset($this->found_sites[self::LINK_SEEN_URLS][$part_num])) {
                     $this->found_sites[self::LINK_SEEN_URLS][$part_num] = array();
                 }
                 $elink_flag = $link_host != $host ? true : false;
                 $link_text = strip_tags($link_text);
                 $ref = $elink_flag ? "eref" : "iref";
                 $url = str_replace('|', "%7C", $url);
                 $link_id = "url|" . $url . "|text|" . urlencode($link_text) . "|{$ref}|" . $site_url;
                 $elink_flag_string = $elink_flag ? "e" : "i";
                 $link_keys = crawlHash($url, true) . crawlHash($link_id, true) . $elink_flag_string . substr(crawlHash($host . "/", true), 1);
                 $summary[self::URL] = $link_id;
                 $summary[self::TITLE] = $url;
                 // stripping html to be on the safe side
                 $summary[self::DESCRIPTION] = $link_text;
                 $summary[self::TIMESTAMP] = $site[self::TIMESTAMP];
                 $summary[self::ENCODING] = $site[self::ENCODING];
                 $summary[self::HASH] = $link_id;
                 $summary[self::TYPE] = "link";
                 $summary[self::HTTP_CODE] = $link_keys;
                 $summary[self::LANG] = $lang;
                 $this->found_sites[self::LINK_SEEN_URLS][$part_num][] = $summary;
                 $link_word_lists = PhraseParser::extractPhrasesInLists($link_text, $lang);
                 $link_meta_ids = PhraseParser::calculateLinkMetas($url, $link_host, $link_text, $site_url);
                 if (!isset($this->found_sites[self::INVERTED_INDEX][$part_num])) {
                     $this->found_sites[self::INVERTED_INDEX][$part_num] = new IndexShard("fetcher_shard_{$part_num}");
                 }
                 $this->found_sites[self::INVERTED_INDEX][$part_num]->addDocumentWords($link_keys, self::NEEDS_OFFSET_FLAG, $link_word_lists, $link_meta_ids, PhraseParser::$materialized_metas, false, $link_rank);
             }
         }
         $iterim_elapse = changeInMicrotime($interim_time);
         if ($iterim_elapse > 5) {
             crawlLog("..Inverting " . $site[self::URL] . "...took > 5s.");
         }
         crawlTimeoutLog("..Still building inverted index. Have processed " . "%s of %s documents.\nLast url processed was %s.", $i, $num_seen, $site[self::URL]);
     }
     if ($this->crawl_type == self::ARCHIVE_CRAWL) {
         $this->recrawl_check_scheduler = true;
     }
     crawlLog("  Build mini inverted index time " . changeInMicrotime($start_time));
 }
Beispiel #5
0
 /**
  * Used to recompute both the index shards and the dictionary
  * of an index archive. For every generation of the archive's summaries
  * bundle, any existing posting_doc_shards/index{generation} file is
  * removed and a fresh IndexShard is built by re-extracting words and meta
  * ids from each stored summary (both documents and link summaries).
  * After all generations have been rewritten, reindexIndexArchive() is
  * called on the archive.
  *
  * @param string $archive_path file path to a IndexArchiveBundle
  */
 function rebuildIndexArchive($archive_path)
 {
     $archive_type = $this->getArchiveKind($archive_path);
     if ($archive_type != "IndexArchiveBundle") {
         $this->badFormatMessageAndExit($archive_path);
     }
     /*
        NOTE(review): unserialize() is applied to data read from the archive
        folder. Only run this on archives from a trusted source.
      */
     $info = $archive_type::getArchiveInfo($archive_path);
     $tmp = unserialize($info["DESCRIPTION"]);
     $video_sources = $tmp[self::VIDEO_SOURCES];
     $generation_info = unserialize(file_get_contents("{$archive_path}/generation.txt"));
     $num_generations = $generation_info['ACTIVE'] + 1;
     $archive = new WebArchiveBundle($archive_path . "/summaries");
     $seen = 0;
     $generation = 0;
     /*
        Padding appended when the packed user rank scores do not end on an
        8 byte boundary. It must be non-empty for the padding step below to
        have any effect. Presumably packInt() emits 4 bytes per score, so
        4 NUL bytes restore the alignment -- TODO confirm against packInt.
      */
     $keypad = "\x00\x00\x00\x00";
     while ($generation < $num_generations) {
         $partition = $archive->getPartition($generation, false);
         $shard_name = $archive_path . "/posting_doc_shards/index{$generation}";
         crawlLog("Processing partition {$generation}");
         if (file_exists($shard_name)) {
             crawlLog("..Unlinking old shard {$generation}");
             @unlink($shard_name);
         }
         $shard = new IndexShard($shard_name, $generation, NUM_DOCS_PER_GENERATION, true);
         $seen_partition = 0;
         while ($seen_partition < $partition->count) {
             // summaries are read back in batches of at most 8000
             $num_to_get = min($partition->count - $seen_partition, 8000);
             /*
                $offset is what gets stored with the next document added:
                it starts at the iterator position before the batch is read
                and is advanced to $object[0] after each document.
              */
             $offset = $partition->iterator_pos;
             $objects = $partition->nextObjects($num_to_get);
             $cnt = 0;
             foreach ($objects as $object) {
                 $cnt++;
                 $site = $object[1];
                 $is_link = isset($site[self::TYPE]) && $site[self::TYPE] == "link";
                 if ($is_link) {
                     /*
                        link summaries carry their precomputed keys in the
                        HTTP_CODE field and the target url in TITLE
                      */
                     $doc_keys = $site[self::HTTP_CODE];
                     $site_url = $site[self::TITLE];
                     $host = UrlParser::getHost($site_url);
                     $link_parts = explode('|', $site[self::HASH]);
                     // sixth '|' separated field, when present, is the origin
                     $link_origin = isset($link_parts[5]) ? $link_parts[5] : $site_url;
                     $meta_ids = PhraseParser::calculateLinkMetas($site_url, $host, $site[self::DESCRIPTION], $link_origin);
                     $link_to = "LINK TO:";
                 } else {
                     $site_url = str_replace('|', "%7C", $site[self::URL]);
                     $host = UrlParser::getHost($site_url);
                     $doc_keys = crawlHash($site_url, true) . $site[self::HASH] . "d" . substr(crawlHash($host . "/", true), 1);
                     $meta_ids = PhraseParser::calculateMetas($site, $video_sources);
                     $link_to = "";
                 }
                 $so_far_cnt = $seen_partition + $cnt;
                 $time_out_message = "..still processing {$so_far_cnt} " . "of {$partition->count} in partition {$generation}." . "\n..Last processed was: " . ($seen + 1) . ". {$link_to}{$site_url}. ";
                 crawlTimeoutLog($time_out_message);
                 $seen++;
                 $word_lists = array();
                 /*
                    self::JUST_METAS check to avoid getting sitemaps in
                    results for popular words
                 */
                 $lang = NULL;
                 if (!isset($site[self::JUST_METAS])) {
                     $host_words = UrlParser::getWordsIfHostUrl($site_url);
                     $path_words = UrlParser::getWordsLastPathPartUrl($site_url);
                     if ($is_link) {
                         $phrase_string = $site[self::DESCRIPTION];
                     } else {
                         $phrase_string = $host_words . " " . $site[self::TITLE] . " " . $path_words . " " . $site[self::DESCRIPTION];
                     }
                     if (isset($site[self::LANG])) {
                         $lang = guessLocaleFromString(mb_substr($site[self::DESCRIPTION], 0, AD_HOC_TITLE_LENGTH), $site[self::LANG]);
                     }
                     $word_lists = PhraseParser::extractPhrasesInLists($phrase_string, $lang);
                     $len = strlen($phrase_string);
                     // scores below the 0.012 threshold are tagged as safe
                     if (PhraseParser::computeSafeSearchScore($word_lists, $len) < 0.012) {
                         $meta_ids[] = "safe:true";
                     } else {
                         $meta_ids[] = "safe:false";
                     }
                 }
                 if (isset($site[self::USER_RANKS]) && count($site[self::USER_RANKS]) > 0) {
                     $score_keys = "";
                     foreach ($site[self::USER_RANKS] as $label => $score) {
                         $score_keys .= packInt($score);
                     }
                     // keep the appended score block a multiple of 8 bytes
                     if (strlen($score_keys) % 8 != 0) {
                         $score_keys .= $keypad;
                     }
                     $doc_keys .= $score_keys;
                 }
                 $shard->addDocumentWords($doc_keys, $offset, $word_lists, $meta_ids, PhraseParser::$materialized_metas, true, false);
                 $offset = $object[0];
             }
             $seen_partition += $num_to_get;
         }
         $shard->save(false, true);
         $generation++;
     }
     $this->reindexIndexArchive($archive_path);
 }
Beispiel #6
0
 /**
  * Copies all feed items newer than $age to a new shard, makes that shard
  * the active feed index, and then deletes the database entries older
  * than $age.
  *
  * If the news sources cannot be loaded or the query for the surviving
  * items fails, nothing is replaced or deleted and false is returned so
  * the caller can schedule another attempt.
  *
  * @param int $age how many seconds old records should be deleted
  * @return bool whether job executed to complete
  */
 function rebuildFeedShard($age)
 {
     $time = time();
     $feed_shard_name = WORK_DIRECTORY . "/feeds/index";
     $prune_shard_name = WORK_DIRECTORY . "/feeds/prune_index";
     // `new` always yields an object, so no falsy check is needed here
     $prune_shard = new IndexShard($prune_shard_name);
     // items published before this timestamp are pruned
     $too_old = $time - $age;
     $pre_feeds = $this->getNewsSources();
     if (!$pre_feeds) {
         return false;
     }
     // index the sources by name so each item's language can be looked up
     $feeds = array();
     foreach ($pre_feeds as $pre_feed) {
         if (!isset($pre_feed['NAME'])) {
             continue;
         }
         $feeds[$pre_feed['NAME']] = $pre_feed;
     }
     $db = $this->db;
     // we now rebuild the inverted index with the remaining items
     $sql = "SELECT * FROM FEED_ITEM " . "WHERE PUBDATE >= ? " . "ORDER BY PUBDATE DESC";
     $result = $db->execute($sql, array($too_old));
     if (!$result) {
         /*
            Without the surviving items the new shard would be empty;
            bail out rather than overwrite the live feed index with it.
          */
         return false;
     }
     crawlLog("..still deleting. Making new index of non-pruned items.");
     $i = 0;
     while ($item = $db->fetchArray($result)) {
         crawlTimeoutLog("..have added %s non-pruned items to index.", $i);
         $i++;
         if (!isset($item['SOURCE_NAME'])) {
             continue;
         }
         $source_name = $item['SOURCE_NAME'];
         // items whose source is no longer known get an empty language
         if (isset($feeds[$source_name])) {
             $lang = $feeds[$source_name]['LANGUAGE'];
         } else {
             $lang = "";
         }
         $phrase_string = $item["TITLE"] . " " . $item["DESCRIPTION"];
         $word_lists = PhraseParser::extractPhrasesInLists($phrase_string, $lang);
         $raw_guid = unbase64Hash($item["GUID"]);
         $doc_keys = crawlHash($item["LINK"], true) . $raw_guid . "d" . substr(crawlHash(UrlParser::getHost($item["LINK"]) . "/", true), 1);
         $meta_ids = $this->calculateMetas($lang, $item['PUBDATE'], $source_name, $item["GUID"]);
         // the publication date is passed in the second (offset) argument
         $prune_shard->addDocumentWords($doc_keys, $item['PUBDATE'], $word_lists, $meta_ids, PhraseParser::$materialized_metas, true, false);
     }
     $prune_shard->save();
     // best-effort permission fixes and swap of the new shard into place
     @chmod($prune_shard_name, 0777);
     @chmod($feed_shard_name, 0777);
     @rename($prune_shard_name, $feed_shard_name);
     @chmod($feed_shard_name, 0777);
     $sql = "DELETE FROM FEED_ITEM WHERE PUBDATE < ?";
     $db->execute($sql, array($too_old));
     return true;
 }