public function testWeightedProbability()
 {
     $this->assertLessThan(0.7, $this->classifier->weightedProbability('out', 'IPC', 'featureProbability'));
     $this->assertGreaterThan(0.6, $this->classifier->weightedProbability('out', 'IPC', 'featureProbability'));
     // check the 'component' probability falls within the expected margin
     $this->assertLessThan(0.5, $this->classifier->weightedProbability('component', 'IPC', 'featureProbability'));
     $this->assertGreaterThan(0.4, $this->classifier->weightedProbability('component', 'IPC', 'featureProbability'));
 }
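The weighted probability exercised by this test is commonly computed as a count-weighted blend of an observed feature probability and an assumed prior. Below is a minimal sketch of that common formulation; the helper function and its default constants are assumptions for illustration, not this project's actual implementation:

<?php
// Hypothetical sketch of a Bayesian weighted feature probability:
// blend the observed probability with an assumed prior, weighted by
// how often the feature has actually been seen.
function weightedFeatureProbability(float $observedProb, int $count,
    float $weight = 1.0, float $assumedProb = 0.5): float
{
    // With few observations the result stays near the assumed prior;
    // as $count grows it converges to the observed probability.
    return ($weight * $assumedProb + $count * $observedProb)
        / ($weight + $count);
}

// Example: a feature seen 3 times with observed probability 0.8
// yields (1 * 0.5 + 3 * 0.8) / (1 + 3) = 0.725.
echo weightedFeatureProbability(0.8, 3), "\n";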
Example #2
 private function RunTestSet($input)
 {
     $classifier = new Classifier();
     $report = new Report($this->reportTemplate);
     $testSet = Spyc::YAMLLoad($input);
     $testBaseDir = dirname(realpath($input)) . '/';
     foreach ($testSet as $case) {
         $fileName = $testBaseDir . $case['fileName'];
         $classifier->Classify($this->GetFileNames($fileName));
         $report->OpenTestCase($case);
         $classifier->RenderReport($report);
         $report->CloseTestCase();
     }
     $report->Save($this->testReport);
 }
 public function load()
 {
     parent::load();
     //Unpack
     foreach ($this->db as $class => $model) {
         $this->db[$class] = array_flip($model);
     }
 }
 /**
  * @return HashMap
  */
 public function getCotacoes()
 {
     $values = new HashMap();
     $values->put("BMF", new ArrayIterator());
     $values->put("London", new ArrayIterator());
     $values->put("NY", new ArrayIterator());
     $values->put("Dolar", new ArrayIterator());
     $values->put("Euro", new ArrayIterator());
     $values->put("Arabica", new ArrayIterator());
     $values->put("IBovespa", new ArrayIterator());
     $dateMap = new DateMap();
     $classifier = new Classifier($dateMap);
     foreach ($this->json as $cotacao) {
         $cotation = $classifier->classify($cotacao);
         if ($cotation != null) {
             $cotation->setJsonInfo($cotacao);
             $values->get($cotation->type())->append($cotation);
         }
     }
     return $values;
 }
Example #5
 /**
  * Handles admin requests related to controlling the file options to be
  * used in a crawl
  *
  * This activity allows a user to specify the page range size to be used
  * during a crawl as well as which file types can be downloaded
  */
 function pageOptions()
 {
     global $INDEXED_FILE_TYPES;
     /* get processors for different file types (populating
        $INDEXED_FILE_TYPES) */
     foreach (glob(BASE_DIR . "/lib/processors/*_processor.php") as $filename) {
         require_once $filename;
     }
     $parent = $this->parent;
     $crawl_model = $parent->model("crawl");
     $profile_model = $parent->model("profile");
     $data["ELEMENT"] = "pageoptions";
     $data['SCRIPT'] = "";
     $machine_urls = $parent->model("machine")->getQueueServerUrls();
     $num_machines = count($machine_urls);
      if ($num_machines < 1 || ($num_machines == 1 && UrlParser::isLocalhostUrl($machine_urls[0]))) {
         $machine_urls = NULL;
     }
     $data['available_options'] = array(tl('crawl_component_use_below'), tl('crawl_component_use_defaults'));
     $crawls = $crawl_model->getCrawlList(false, true, $machine_urls);
     $data['options_default'] = tl('crawl_component_use_below');
     foreach ($crawls as $crawl) {
         if (strlen($crawl['DESCRIPTION']) > 0) {
             $data['available_options'][$crawl['CRAWL_TIME']] = $crawl['DESCRIPTION'];
         }
     }
     $seed_info = $crawl_model->getSeedInfo();
      $data['RECRAWL_FREQS'] = array(-1 => tl('crawl_component_recrawl_never'),
          1 => tl('crawl_component_recrawl_1day'),
          2 => tl('crawl_component_recrawl_2day'),
          3 => tl('crawl_component_recrawl_3day'),
          7 => tl('crawl_component_recrawl_7day'),
          14 => tl('crawl_component_recrawl_14day'));
      $data['SIZE_VALUES'] = array(10000 => 10000, 50000 => 50000,
          100000 => 100000, 500000 => 500000, 1000000 => 1000000,
          5000000 => 5000000, 10000000 => 10000000);
      $data['LEN_VALUES'] = array(2000 => 2000, 10000 => 10000,
          50000 => 50000, 100000 => 100000, 500000 => 500000,
          1000000 => 1000000, 5000000 => 5000000, 10000000 => 10000000);
      $data['available_summarizers'] = array(
          self::BASIC_SUMMARIZER => tl('crawl_component_basic'),
          self::CENTROID_SUMMARIZER => tl('crawl_component_centroid'));
     if (!isset($seed_info["indexed_file_types"]["extensions"])) {
         $seed_info["indexed_file_types"]["extensions"] = $INDEXED_FILE_TYPES;
     }
     $loaded = false;
     if (isset($_REQUEST['load_option']) && $_REQUEST['load_option'] > 0) {
         if ($_REQUEST['load_option'] == 1) {
             $seed_loaded = $crawl_model->getSeedInfo(true);
         } else {
             $timestamp = substr($parent->clean($_REQUEST['load_option'], "int"), 0, TIMESTAMP_LEN);
             $seed_loaded = $crawl_model->getCrawlSeedInfo($timestamp, $machine_urls);
         }
          $copy_options = array(
              "general" => array("page_recrawl_frequency",
                  "page_range_request", "max_description_len",
                  "cache_pages", 'summarizer_option'),
              "indexed_file_types" => array("extensions"),
              "indexing_plugins" => array("plugins", "plugins_data"));
         foreach ($copy_options as $main_option => $sub_options) {
             foreach ($sub_options as $sub_option) {
                 if (isset($seed_loaded[$main_option][$sub_option])) {
                     $seed_info[$main_option][$sub_option] = $seed_loaded[$main_option][$sub_option];
                 }
             }
         }
         if (isset($seed_loaded['page_rules'])) {
             $seed_info['page_rules'] = $seed_loaded['page_rules'];
         }
         if (isset($seed_loaded['active_classifiers'])) {
             $seed_info['active_classifiers'] = $seed_loaded['active_classifiers'];
         } else {
             $seed_info['active_classifiers'] = array();
             $seed_info['active_classifiers']['label'] = array();
         }
         $loaded = true;
     } else {
         $seed_info = $crawl_model->getSeedInfo();
         if (isset($_REQUEST["page_recrawl_frequency"]) && in_array($_REQUEST["page_recrawl_frequency"], array_keys($data['RECRAWL_FREQS']))) {
             $seed_info["general"]["page_recrawl_frequency"] = $_REQUEST["page_recrawl_frequency"];
         }
         if (isset($_REQUEST["page_range_request"]) && in_array($_REQUEST["page_range_request"], $data['SIZE_VALUES'])) {
             $seed_info["general"]["page_range_request"] = $_REQUEST["page_range_request"];
         }
         if (isset($_REQUEST['summarizer_option']) && in_array($_REQUEST['summarizer_option'], array_keys($data['available_summarizers']))) {
             $seed_info['general']['summarizer_option'] = $_REQUEST['summarizer_option'];
         }
         if (isset($_REQUEST["max_description_len"]) && in_array($_REQUEST["max_description_len"], $data['LEN_VALUES'])) {
             $seed_info["general"]["max_description_len"] = $_REQUEST["max_description_len"];
         }
         if (isset($_REQUEST["cache_pages"])) {
             $seed_info["general"]["cache_pages"] = true;
         } else {
             if (isset($_REQUEST['posted'])) {
                 //form sent but check box unchecked
                 $seed_info["general"]["cache_pages"] = false;
             }
         }
         if (isset($_REQUEST['page_rules'])) {
             $seed_info['page_rules']['rule'] = $parent->convertStringCleanArray($_REQUEST['page_rules'], 'rule');
         }
     }
     if (!isset($seed_info["general"]["page_recrawl_frequency"])) {
         $seed_info["general"]["page_recrawl_frequency"] = PAGE_RECRAWL_FREQUENCY;
     }
     $data['summarizer_option'] = $seed_info['general']['summarizer_option'];
     $data['PAGE_RECRAWL_FREQUENCY'] = $seed_info["general"]["page_recrawl_frequency"];
     if (!isset($seed_info["general"]["cache_pages"])) {
         $seed_info["general"]["cache_pages"] = false;
     }
     $data["CACHE_PAGES"] = $seed_info["general"]["cache_pages"];
     if (!isset($seed_info["general"]["page_range_request"])) {
         $seed_info["general"]["page_range_request"] = PAGE_RANGE_REQUEST;
     }
     $data['PAGE_SIZE'] = $seed_info["general"]["page_range_request"];
     if (!isset($seed_info["general"]["max_description_len"])) {
         $seed_info["general"]["max_description_len"] = MAX_DESCRIPTION_LEN;
     }
     $data['MAX_LEN'] = $seed_info["general"]["max_description_len"];
     $data['INDEXING_PLUGINS'] = array();
     $included_plugins = array();
     if (isset($_REQUEST["posted"]) && !$loaded) {
         $seed_info['indexing_plugins']['plugins'] = isset($_REQUEST["INDEXING_PLUGINS"]) ? $_REQUEST["INDEXING_PLUGINS"] : array();
     }
     $included_plugins = isset($seed_info['indexing_plugins']['plugins']) ? $seed_info['indexing_plugins']['plugins'] : array();
     foreach ($parent->indexing_plugins as $plugin) {
         $plugin_name = ucfirst($plugin);
         $data['INDEXING_PLUGINS'][$plugin_name]['checked'] = in_array($plugin_name, $included_plugins) ? "checked='checked'" : "";
          /* To use method_exists we need the require_once for the plugin
             class to have occurred, so we instantiate the object via the
             plugin method call, which will also do the require if needed.
           */
         $plugin_object = $parent->plugin(lcfirst($plugin_name));
         $class_name = $plugin_name . "Plugin";
         if ($loaded && method_exists($class_name, 'setConfiguration') && method_exists($class_name, 'loadDefaultConfiguration')) {
             if (isset($seed_info['indexing_plugins']['plugins_data'][$plugin_name])) {
                 $plugin_object->setConfiguration($seed_info['indexing_plugins']['plugins_data'][$plugin_name]);
             } else {
                 $plugin_object->loadDefaultConfiguration();
             }
             $plugin_object->saveConfiguration();
         }
         if (method_exists($class_name, 'configureHandler') && method_exists($class_name, 'configureView')) {
             $data['INDEXING_PLUGINS'][$plugin_name]['configure'] = true;
             $plugin_object->configureHandler($data);
         } else {
             $data['INDEXING_PLUGINS'][$plugin_name]['configure'] = false;
         }
     }
     $profile = $profile_model->getProfile(WORK_DIRECTORY);
     if (!isset($_REQUEST['load_option'])) {
         $data = array_merge($data, $profile);
     } else {
         $parent->updateProfileFields($data, $profile, array('IP_LINK', 'CACHE_LINK', 'SIMILAR_LINK', 'IN_LINK', 'SIGNIN_LINK', 'SUBSEARCH_LINK', 'WORD_SUGGEST'));
     }
     $weights = array('TITLE_WEIGHT' => 4, 'DESCRIPTION_WEIGHT' => 1, 'LINK_WEIGHT' => 2, 'MIN_RESULTS_TO_GROUP' => 200, 'SERVER_ALPHA' => 1.6);
     $change = false;
     foreach ($weights as $weight => $value) {
         if (isset($_REQUEST[$weight])) {
             $data[$weight] = $parent->clean($_REQUEST[$weight], 'float', 1);
             $profile[$weight] = $data[$weight];
             $change = true;
         } else {
             if (isset($profile[$weight]) && $profile[$weight] != "") {
                 $data[$weight] = $profile[$weight];
             } else {
                 $data[$weight] = $value;
                 $profile[$weight] = $data[$weight];
                 $change = true;
             }
         }
     }
     if ($change == true) {
         $profile_model->updateProfile(WORK_DIRECTORY, array(), $profile);
     }
     $data['INDEXED_FILE_TYPES'] = array();
     $filetypes = array();
     foreach ($INDEXED_FILE_TYPES as $filetype) {
         $ison = false;
         if (isset($_REQUEST["filetype"]) && !$loaded) {
             if (isset($_REQUEST["filetype"][$filetype])) {
                 $filetypes[] = $filetype;
                 $ison = true;
                 $change = true;
             }
         } else {
             if (in_array($filetype, $seed_info["indexed_file_types"]["extensions"])) {
                 $filetypes[] = $filetype;
                 $ison = true;
             }
         }
         $data['INDEXED_FILE_TYPES'][$filetype] = $ison ? "checked='checked'" : '';
     }
     $seed_info["indexed_file_types"]["extensions"] = $filetypes;
     $data['CLASSIFIERS'] = array();
     $data['RANKERS'] = array();
     $active_classifiers = array();
     $active_rankers = array();
     foreach (Classifier::getClassifierList() as $classifier) {
         $label = $classifier->class_label;
         $ison = false;
         if (isset($_REQUEST['classifier']) && !$loaded) {
             if (isset($_REQUEST['classifier'][$label])) {
                 $ison = true;
             }
         } else {
              if ($loaded || (!isset($_REQUEST['posted']) && isset($seed_info['active_classifiers']['label']))) {
                 if (in_array($label, $seed_info['active_classifiers']['label'])) {
                     $ison = true;
                 }
             }
         }
         if ($ison) {
             $data['CLASSIFIERS'][$label] = 'checked="checked"';
             $active_classifiers[] = $label;
         } else {
             $data['CLASSIFIERS'][$label] = '';
         }
         $ison = false;
         if (isset($_REQUEST['ranker']) && !$loaded) {
             if (isset($_REQUEST['ranker'][$label])) {
                 $ison = true;
             }
         } else {
              if ($loaded || (!isset($_REQUEST['posted']) && isset($seed_info['active_rankers']['label']))) {
                 if (isset($seed_info['active_rankers']['label']) && in_array($label, $seed_info['active_rankers']['label'])) {
                     $ison = true;
                 }
             }
         }
         if ($ison) {
             $data['RANKERS'][$label] = 'checked="checked"';
             $active_rankers[] = $label;
         } else {
             $data['RANKERS'][$label] = '';
         }
     }
     $parent->pagingLogic($data, 'CLASSIFIERS', 'CLASSIFIERS', DEFAULT_ADMIN_PAGING_NUM / 5, array(), "", array('name' => 'class_label'));
     $seed_info['active_classifiers']['label'] = $active_classifiers;
     $seed_info['active_rankers']['label'] = $active_rankers;
     if (isset($seed_info['page_rules']['rule'])) {
         if (isset($seed_info['page_rules']['rule']['rule'])) {
             $data['page_rules'] = $parent->convertArrayLines($seed_info['page_rules']['rule']['rule']);
         } else {
             $data['page_rules'] = $parent->convertArrayLines($seed_info['page_rules']['rule']);
         }
     } else {
         $data['page_rules'] = "";
     }
     $allowed_options = array('crawl_time', 'search_time', 'test_options');
     if (isset($_REQUEST['option_type']) && in_array($_REQUEST['option_type'], $allowed_options)) {
         $data['option_type'] = $_REQUEST['option_type'];
     } else {
         $data['option_type'] = 'crawl_time';
     }
     if ($data['option_type'] == 'crawl_time') {
         $data['crawl_time_active'] = "active";
         $data['search_time_active'] = "";
         $data['test_options_active'] = "";
         $data['SCRIPT'] .= "\nswitchTab('crawltimetab'," . "'searchtimetab', 'testoptionstab')\n";
     } else {
         if ($data['option_type'] == 'search_time') {
             $data['search_time_active'] = "active";
             $data['crawl_time_active'] = "";
             $data['test_options_active'] = "";
             $data['SCRIPT'] .= "\nswitchTab('searchtimetab'," . "'crawltimetab', 'testoptionstab')\n";
         } else {
             $data['search_time_active'] = "";
             $data['crawl_time_active'] = "";
             $data['test_options_active'] = "active";
             $data['SCRIPT'] .= "\nswitchTab('testoptionstab'," . "'crawltimetab', 'searchtimetab');\n";
         }
     }
     $crawl_model->setSeedInfo($seed_info);
     if ($change == true && $data['option_type'] != 'test_options') {
         $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('crawl_component_page_options_updated') . "</h1>')";
     }
      $test_processors = array(
          "text/html" => "html", "text/asp" => "html", "text/xml" => "xml",
          "text/robot" => "robot", "application/xml" => "xml",
          "application/xhtml+xml" => "html", "application/rss+xml" => "rss",
          "application/atom+xml" => "rss", "text/csv" => "text",
          "text/gopher" => "gopher", "text/plain" => "text",
          "text/rtf" => "rtf", "text/tab-separated-values" => "text");
     $data['MIME_TYPES'] = array_keys($test_processors);
     $data['page_type'] = "text/html";
     if (isset($_REQUEST['page_type']) && in_array($_REQUEST['page_type'], $data['MIME_TYPES'])) {
         $data['page_type'] = $_REQUEST['page_type'];
     }
     $data['TESTPAGE'] = isset($_REQUEST['TESTPAGE']) ? $parent->clean($_REQUEST['TESTPAGE'], 'string') : "";
     if ($data['option_type'] == 'test_options' && $data['TESTPAGE'] != "") {
         $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('crawl_component_page_options_running_tests') . "</h1>')";
         $site = array();
         $site[self::ENCODING] = "UTF-8";
         $site[self::URL] = "http://test-site.yioop.com/";
         $site[self::IP_ADDRESSES] = array("1.1.1.1");
         $site[self::HTTP_CODE] = 200;
         $site[self::MODIFIED] = date("U", time());
         $site[self::TIMESTAMP] = time();
         $site[self::TYPE] = "text/html";
         $site[self::HEADER] = "page options test extractor";
         $site[self::SERVER] = "unknown";
         $site[self::SERVER_VERSION] = "unknown";
         $site[self::OPERATING_SYSTEM] = "unknown";
         $site[self::LANG] = 'en';
         $site[self::JUST_METAS] = false;
         if (isset($_REQUEST['page_type']) && in_array($_REQUEST['page_type'], $data['MIME_TYPES'])) {
             $site[self::TYPE] = $_REQUEST['page_type'];
         }
         if ($site[self::TYPE] == 'text/html') {
             $site[self::ENCODING] = guessEncodingHtml($_REQUEST['TESTPAGE']);
         }
         $prefix_name = $test_processors[$site[self::TYPE]];
         $processor_name = ucfirst($prefix_name) . "Processor";
         $plugin_processors = array();
         if (isset($seed_info['indexing_plugins']['plugins'])) {
             foreach ($seed_info['indexing_plugins']['plugins'] as $plugin) {
                 $plugin_name = $plugin . "Plugin";
                 $supported_processors = $plugin_name::getProcessors();
                 foreach ($supported_processors as $supported_processor) {
                     $parent_processor = $processor_name;
                     do {
                         if ($supported_processor == $parent_processor) {
                             $plugin_object = $parent->plugin(lcfirst($plugin));
                             if (method_exists($plugin_name, "loadConfiguration")) {
                                 $plugin_object->loadConfiguration();
                             }
                             $plugin_processors[] = $plugin_object;
                             break;
                         }
                     } while (($parent_processor = get_parent_class($parent_processor)) && $parent_processor != "PageProcessor");
                 }
             }
         }
         $page_processor = new $processor_name($plugin_processors, $seed_info["general"]["max_description_len"], $seed_info["general"]["summarizer_option"]);
         restore_error_handler();
         $data["PAGE_RANGE_REQUEST"] = $seed_info["general"]["page_range_request"];
         $doc_info = $page_processor->handle(substr($_REQUEST['TESTPAGE'], 0, $data["PAGE_RANGE_REQUEST"]), $site[self::URL]);
         set_error_handler("yioop_error_handler");
         if (!$doc_info) {
             $data["AFTER_PAGE_PROCESS"] = "";
             $data["AFTER_RULE_PROCESS"] = "";
             $data["EXTRACTED_WORDS"] = "";
             $data["EXTRACTED_META_WORDS"] = "";
             return $data;
         }
         if ($processor_name != "RobotProcessor" && !isset($doc_info[self::JUST_METAS])) {
             $doc_info[self::LINKS] = UrlParser::pruneLinks($doc_info[self::LINKS]);
         }
         foreach ($doc_info as $key => $value) {
             $site[$key] = $value;
         }
         if (isset($site[self::PAGE])) {
             unset($site[self::PAGE]);
         }
         if (isset($site[self::ROBOT_PATHS])) {
             $site[self::JUST_METAS] = true;
         }
         $reflect = new ReflectionClass("CrawlConstants");
         $crawl_constants = $reflect->getConstants();
         $crawl_keys = array_keys($crawl_constants);
         $crawl_values = array_values($crawl_constants);
         $inverse_constants = array_combine($crawl_values, $crawl_keys);
         $after_process = array();
         foreach ($site as $key => $value) {
             $out_key = isset($inverse_constants[$key]) ? $inverse_constants[$key] : $key;
             $after_process[$out_key] = $value;
         }
         $data["AFTER_PAGE_PROCESS"] = wordwrap($parent->clean(print_r($after_process, true), "string"), 75, "\n", true);
         $rule_string = implode("\n", $seed_info['page_rules']['rule']);
         $rule_string = html_entity_decode($rule_string, ENT_QUOTES);
         $page_rule_parser = new PageRuleParser($rule_string);
         $page_rule_parser->executeRuleTrees($site);
         $after_process = array();
         foreach ($site as $key => $value) {
             $out_key = isset($inverse_constants[$key]) ? $inverse_constants[$key] : $key;
             $after_process[$out_key] = $value;
         }
         $data["AFTER_RULE_PROCESS"] = wordwrap($parent->clean(print_r($after_process, true), "string"), 75, "\n", true);
         $lang = NULL;
         if (isset($site[self::LANG])) {
             $lang = $site[self::LANG];
         }
         $meta_ids = PhraseParser::calculateMetas($site);
         if (!$site[self::JUST_METAS]) {
             $host_words = UrlParser::getWordsIfHostUrl($site[self::URL]);
             $path_words = UrlParser::getWordsLastPathPartUrl($site[self::URL]);
             $phrase_string = $host_words . " " . $site[self::TITLE] . " " . $path_words . " " . $site[self::DESCRIPTION];
             if ($site[self::TITLE] != "") {
                 $lang = guessLocaleFromString($site[self::TITLE], $lang);
             } else {
                 $lang = guessLocaleFromString(substr($site[self::DESCRIPTION], 0, AD_HOC_TITLE_LENGTH), $lang);
             }
             $word_lists = PhraseParser::extractPhrasesInLists($phrase_string, $lang);
             $len = strlen($phrase_string);
             if (PhraseParser::computeSafeSearchScore($word_lists, $len) < 0.012) {
                 $meta_ids[] = "safe:true";
                 $safe = true;
             } else {
                 $meta_ids[] = "safe:false";
                 $safe = false;
             }
         }
         if (!isset($word_lists)) {
             $word_lists = array();
         }
         $data["EXTRACTED_WORDS"] = wordwrap($parent->clean(print_r($word_lists, true), "string"), 75, "\n", true);
         $data["EXTRACTED_META_WORDS"] = wordwrap($parent->clean(print_r($meta_ids, true), "string"), 75, "\n", true);
     }
     return $data;
 }
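The ReflectionClass trick used in the test-options branch above (mapping raw CrawlConstants values back to readable constant names before display) works on its own; here is a minimal sketch with a hypothetical constants class standing in for CrawlConstants:

<?php
// Hypothetical stand-in for a constants holder such as CrawlConstants.
class DemoConstants
{
    const URL = 'u';
    const TITLE = 't';
}

// Build a value => constant-name map via reflection, then relabel an
// associative array keyed by the raw constant values.
$reflect = new ReflectionClass('DemoConstants');
$inverse_constants = array_flip($reflect->getConstants());
$site = array('u' => 'http://test-site.yioop.com/', 't' => 'Test Page');
$after_process = array();
foreach ($site as $key => $value) {
    $out_key = isset($inverse_constants[$key]) ? $inverse_constants[$key] : $key;
    $after_process[$out_key] = $value;
}
print_r($after_process); // Array ( [URL] => ..., [TITLE] => Test Page )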
Example #6
 /**
  * Checks for the crawl time according either to crawl_status.txt or to
  * network_status.txt, and presents it to the requesting fetcher, along
  * with a list of available queue servers.
  */
 function crawlTime()
 {
     $info = array();
     $info[self::STATUS] = self::CONTINUE_STATE;
     $view = "fetch";
     $cron_model = $this->model("cron");
     if (isset($_REQUEST['crawl_time'])) {
         $prev_crawl_time = substr($this->clean($_REQUEST['crawl_time'], 'int'), 0, TIMESTAMP_LEN);
     } else {
         $prev_crawl_time = 0;
     }
     $cron_time = $cron_model->getCronTime("fetcher_restart");
     $delta = time() - $cron_time;
     if ($delta > self::CRON_INTERVAL) {
         $cron_model->updateCronTime("fetcher_restart");
         $this->doCronTasks();
     } else {
         if ($delta == 0) {
             $cron_model->updateCronTime("fetcher_restart");
         }
     }
     $local_filename = CRAWL_DIR . "/schedules/crawl_status.txt";
     $network_filename = CRAWL_DIR . "/schedules/network_status.txt";
     if (file_exists($local_filename)) {
         $crawl_status = unserialize(file_get_contents($local_filename));
         $crawl_time = isset($crawl_status["CRAWL_TIME"]) ? $crawl_status["CRAWL_TIME"] : 0;
     } else {
         if (file_exists($network_filename)) {
             $crawl_time = unserialize(file_get_contents($network_filename));
         } else {
             $crawl_time = 0;
         }
     }
     $info[self::CRAWL_TIME] = $crawl_time;
     $status_filename = CRAWL_DIR . "/schedules/name_server_messages.txt";
     if ($crawl_time != 0 && file_exists($status_filename)) {
         $status = unserialize(file_get_contents($status_filename));
         if ($status[self::STATUS] == 'STOP_CRAWL') {
              $info[self::STATUS] = 'STOP_CRAWL';
             $info[self::CRAWL_TIME] = 0;
         }
         if ($status[self::STATUS] != 'STOP_CRAWL' && $crawl_time != $prev_crawl_time) {
              $to_copy_fields = array(self::ALLOWED_SITES, self::ARC_DIR,
                  self::ARC_TYPE, self::CRAWL_INDEX, self::CRAWL_TYPE,
                  self::DISALLOWED_SITES, self::INDEXED_FILE_TYPES,
                  self::PROXY_SERVERS, self::RESTRICT_SITES_BY_URL,
                  self::SUMMARIZER_OPTION, self::TOR_PROXY);
             foreach ($to_copy_fields as $field) {
                 if (isset($status[$field])) {
                     $info[$field] = $status[$field];
                 }
             }
             /*
               When initiating a new crawl AND there are active
               classifiers (an array of class labels), then augment the
               info with compressed, serialized versions of each active
               classifier so that each fetcher can reconstruct the same
               classifiers.
             */
             $classifier_array = array();
             if (isset($status[self::ACTIVE_CLASSIFIERS])) {
                 $classifier_array = array_merge($status[self::ACTIVE_CLASSIFIERS]);
                 $info[self::ACTIVE_CLASSIFIERS] = $status[self::ACTIVE_CLASSIFIERS];
             }
             if (isset($status[self::ACTIVE_RANKERS])) {
                 $classifier_array = array_merge($classifier_array, $status[self::ACTIVE_RANKERS]);
                 $info[self::ACTIVE_RANKERS] = $status[self::ACTIVE_RANKERS];
             }
             if ($classifier_array != array()) {
                 $classifiers_data = Classifier::loadClassifiersData($classifier_array);
                 $info[self::ACTIVE_CLASSIFIERS_DATA] = $classifiers_data;
             }
         }
     }
     $info[self::QUEUE_SERVERS] = $this->model("machine")->getQueueServerUrls();
     $info[self::SAVED_CRAWL_TIMES] = $this->getCrawlTimes();
     $info[self::POST_MAX_SIZE] = metricToInt(ini_get("post_max_size"));
     if (count($info[self::QUEUE_SERVERS]) == 0) {
         $info[self::QUEUE_SERVERS] = array(NAME_SERVER);
     }
     $data = array();
     $data['MESSAGE'] = serialize($info);
     $this->displayView($view, $data);
 }
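The fetcher_restart check at the top of crawlTime() is a persisted-timestamp throttle: cron tasks run only when CRON_INTERVAL has elapsed since the last recorded run. A minimal file-based sketch of the same pattern, using hypothetical names rather than Yioop's cron model:

<?php
// Run a task at most once every $interval seconds, persisting the last
// run time in a small state file (hypothetical helper for illustration).
function runThrottled(string $state_file, int $interval, callable $task): bool
{
    $last = file_exists($state_file) ? (int) file_get_contents($state_file) : 0;
    if (time() - $last <= $interval) {
        return false; // ran recently, skip this time
    }
    file_put_contents($state_file, (string) time());
    $task();
    return true;
}

// Usage: attempt housekeeping, but no more than once per hour.
runThrottled(sys_get_temp_dir() . "/demo_restart.txt", 3600, function () {
    // periodic housekeeping would go here
});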
Example #7
 /**
  * Finalizes the current classifier, uses it to classify all test
  * documents, and logs the classification error.  The current classifier is
  * saved to disk after finalizing (though not before), and left in
  * `classify' mode. The iterator over the test dataset is reset for the
  * next round of testing (if any).
  *
  * @param object $classifier classifier instance to test
  * @param array $data the array of training and test datasets, constructed
   * by loadDataset, of which only the `test' dataset is used.
  */
 function testClassifier($classifier, $data)
 {
     $classifier->prepareToFinalize();
     $classifier->finalize();
     Classifier::setClassifier($classifier);
     $classifier->prepareToClassify();
     $wrong = 0;
     $total = 0;
     $pages = $data['test'];
     while (!$pages->end_of_iterator) {
         $page = $pages->nextPage();
         $score = $classifier->classify($page);
         $page_label = $score >= 0.5 ? 1 : -1;
         if ($page_label != $page['TRUE_LABEL']) {
             $wrong++;
         }
         $total++;
     }
     $error = (double) $wrong / $total;
     $this->log(0, 'error = %.4f', $error);
     $pages->reset();
 }
Example #8
 /**
  * Processes an array of downloaded web pages with the appropriate page
  * processor.
  *
   * Summary data is extracted from each non-robots.txt file in the array.
  * Disallowed paths and crawl-delays are extracted from robots.txt files.
  *
  * @param array $site_pages a collection of web pages to process
  * @return array summary data extracted from these pages
  */
 function processFetchPages($site_pages)
 {
     $PAGE_PROCESSORS = $this->page_processors;
     crawlLog("Start process pages... Current Memory:" . memory_get_usage());
     $start_time = microtime();
     $prefix = $this->fetcher_num . "-";
     $stored_site_pages = array();
     $summarized_site_pages = array();
     $num_items = $this->web_archive->count;
     $i = 0;
     foreach ($site_pages as $site) {
         $response_code = $site[self::HTTP_CODE];
         $was_error = false;
         if ($response_code < 200 || $response_code >= 300) {
             crawlLog($site[self::URL] . " response code {$response_code}");
             $host = UrlParser::getHost($site[self::URL]);
             if (!isset($this->hosts_with_errors[$host])) {
                 $this->hosts_with_errors[$host] = 0;
             }
             if ($response_code >= 400 || $response_code < 100) {
                 // < 100 will capture failures to connect which are returned
                 // as strings
                 $was_error = true;
                 $this->hosts_with_errors[$host]++;
             }
              /* We print out errors to standard output. We still go ahead
                 and process the page; maybe it is a cool error page, and
                 this also makes sure we don't crawl it again.
               */
         }
         // text/robot is my made up mimetype for robots.txt files
         $was_robot_error = false;
         if (isset($site[self::ROBOT_PATHS])) {
             if (!$was_error) {
                 $type = "text/robot";
             } else {
                 $type = $site[self::TYPE];
                 if ($response_code != 404) {
                     /*
                         disallow crawling if the robots.txt fetch gave any
                         error other than not found
                     */
                     $was_robot_error = true;
                     $site[self::ROBOT_PATHS][] = "/";
                 }
             }
         } else {
             if (isset($site[self::FILE_NAME])) {
                 $extension = UrlParser::getDocumentType($site[self::FILE_NAME]);
                 if ($extension == $this->programming_language_extension['java']) {
                     $type = "text/java";
                 } else {
                     if ($extension == $this->programming_language_extension['py']) {
                         $type = "text/py";
                     } else {
                         $type = $site[self::TYPE];
                     }
                 }
             } else {
                 $type = $site[self::TYPE];
             }
         }
         $handled = false;
          /* Deals with short URLs and directs them to the original link.
             For robots.txt we don't want to introduce stuff that can be
             mis-parsed (we follow redirects in this case anyway). */
         if (isset($site[self::LOCATION]) && count($site[self::LOCATION]) > 0 && strcmp($type, "text/robot") != 0) {
             array_unshift($site[self::LOCATION], $site[self::URL]);
             $tmp_loc = array_pop($site[self::LOCATION]);
             $tmp_loc = UrlParser::canonicalLink($tmp_loc, $site[self::URL]);
              array_push($site[self::LOCATION], $tmp_loc);
             $doc_info = array();
             $doc_info[self::LINKS][$tmp_loc] = "location:" . $site[self::URL];
             $doc_info[self::LOCATION] = true;
             $doc_info[self::DESCRIPTION] = $site[self::URL] . " => " . $tmp_loc;
             $doc_info[self::PAGE] = $doc_info[self::DESCRIPTION];
             $doc_info[self::TITLE] = $site[self::URL];
             $text_data = true;
             if (!isset($site[self::ENCODING])) {
                 $site[self::ENCODING] = "UTF-8";
             }
             $handled = true;
         } else {
             if (isset($PAGE_PROCESSORS[$type])) {
                 $page_processor = $PAGE_PROCESSORS[$type];
                 if (generalIsA($page_processor, "TextProcessor")) {
                     $text_data = true;
                 } else {
                     $text_data = false;
                 }
             } else {
                 crawlLog("No page processor for mime type: " . $type);
                 crawlLog("Not processing: " . $site[self::URL]);
                 continue;
             }
         }
         if (!$handled) {
             if (isset($this->plugin_processors[$page_processor])) {
                 $processor = new $page_processor($this->plugin_processors[$page_processor], $this->max_description_len, $this->summarizer_option);
             } else {
                 $processor = new $page_processor(array(), $this->max_description_len, $this->summarizer_option);
             }
         }
         if (isset($site[self::PAGE]) && !$handled) {
             if (!isset($site[self::ENCODING])) {
                 $site[self::ENCODING] = "UTF-8";
             }
             //if not UTF-8 convert before doing anything else
             if (isset($site[self::ENCODING]) && $site[self::ENCODING] != "UTF-8" && $site[self::ENCODING] != "" && generalIsA($page_processor, "TextProcessor")) {
                 if (!@mb_check_encoding($site[self::PAGE], $site[self::ENCODING])) {
                     crawlLog("  MB_CHECK_ENCODING FAILED!!");
                 }
                 crawlLog("  Converting from encoding " . $site[self::ENCODING] . "...");
                  //if HEBREW WINDOWS-1255 use ISO-8859-8 instead
                 if (stristr($site[self::ENCODING], "1255")) {
                     $site[self::ENCODING] = "ISO-8859-8";
                     crawlLog("  using encoding " . $site[self::ENCODING] . "...");
                 }
                 if (stristr($site[self::ENCODING], "1256")) {
                     $site[self::PAGE] = w1256ToUTF8($site[self::PAGE]);
                     crawlLog("  using Yioop hack encoding ...");
                 } else {
                     $site[self::PAGE] = @mb_convert_encoding($site[self::PAGE], "UTF-8", $site[self::ENCODING]);
                 }
             }
             crawlLog("  Using Processor..." . $page_processor);
             if (isset($site[self::REPOSITORY_TYPE]) && $site[self::REPOSITORY_TYPE] == self::REPOSITORY_GIT) {
                 $tmp_url_store = $site[self::URL];
                 $site[self::URL] = $site[self::FILE_NAME];
             }
             $doc_info = $processor->handle($site[self::PAGE], $site[self::URL]);
             if (isset($site[self::REPOSITORY_TYPE]) && $site[self::REPOSITORY_TYPE] == self::REPOSITORY_GIT) {
                 $site[self::URL] = $tmp_url_store;
             }
             if (!$doc_info) {
                 crawlLog("  Processing Yielded No Data For: " . $site[self::URL]);
             }
             if ($page_processor != "RobotProcessor" && !isset($doc_info[self::JUST_METAS])) {
                 $this->pruneLinks($doc_info, CrawlConstants::LINKS, $start_time);
             }
         } else {
             if (!$handled) {
                 $doc_info = false;
             }
         }
         $not_loc = true;
         if ($doc_info) {
             $site[self::DOC_INFO] = $doc_info;
             if (isset($doc_info[self::LOCATION])) {
                 $site[self::HASH] = crawlHash(crawlHash($site[self::URL], true) . "LOCATION", true);
                 $not_loc = false;
             }
             $site[self::ROBOT_INSTANCE] = $prefix . ROBOT_INSTANCE;
             if (!is_dir(CRAWL_DIR . "/cache")) {
                 mkdir(CRAWL_DIR . "/cache");
                 $htaccess = "Options None\nphp_flag engine off\n";
                 file_put_contents(CRAWL_DIR . "/cache/.htaccess", $htaccess);
             }
             if ($type == "text/robot" && isset($doc_info[self::PAGE])) {
                 $site[self::PAGE] = $doc_info[self::PAGE];
             }
             if ($text_data) {
                 if (isset($doc_info[self::PAGE])) {
                     $site[self::PAGE] = $doc_info[self::PAGE];
                 } else {
                     $site[self::PAGE] = NULL;
                 }
                 if ($not_loc) {
                     $content = $doc_info[self::DESCRIPTION];
                     $site[self::HASH] = FetchUrl::computePageHash($content);
                 }
             } else {
                 $site[self::HASH] = FetchUrl::computePageHash($site[self::PAGE]);
             }
             if (isset($doc_info[self::WORD_CLOUD])) {
                 $site[self::WORD_CLOUD] = $doc_info[self::WORD_CLOUD];
             } else {
                 $site[self::WORD_CLOUD] = NULL;
             }
             if (isset($doc_info[self::CRAWL_DELAY])) {
                 $site[self::CRAWL_DELAY] = $doc_info[self::CRAWL_DELAY];
             }
             if (isset($doc_info[self::ROBOT_PATHS]) && !$was_error) {
                 $site[self::ROBOT_PATHS] = $doc_info[self::ROBOT_PATHS];
             }
             if (!isset($site[self::ROBOT_METAS])) {
                 $site[self::ROBOT_METAS] = array();
             }
             if (isset($doc_info[self::ROBOT_METAS])) {
                 $site[self::ROBOT_METAS] = array_merge($site[self::ROBOT_METAS], $doc_info[self::ROBOT_METAS]);
             }
             //here's where we enforce NOFOLLOW
             if (in_array("NOFOLLOW", $site[self::ROBOT_METAS]) || in_array("NONE", $site[self::ROBOT_METAS])) {
                 $site[self::DOC_INFO][self::LINKS] = array();
             }
             if (isset($doc_info[self::AGENT_LIST])) {
                 $site[self::AGENT_LIST] = $doc_info[self::AGENT_LIST];
             }
             $this->copySiteFields($i, $site, $summarized_site_pages, $stored_site_pages);
             $summarized_site_pages[$i][self::URL] = strip_tags($site[self::URL]);
             if (isset($site[self::REPOSITORY_TYPE]) && $site[self::REPOSITORY_TYPE] == self::REPOSITORY_GIT) {
                 $summarized_site_pages[$i][self::TITLE] = $site[self::FILE_NAME];
             } else {
                 $summarized_site_pages[$i][self::TITLE] = strip_tags($site[self::DOC_INFO][self::TITLE]);
                 // stripping html to be on the safe side
             }
             if (!isset($site[self::REPOSITORY_TYPE])) {
                 if ($was_robot_error) {
                      $site[self::DOC_INFO][self::DESCRIPTION] = "There was an HTTP error in trying to download " . "this robots.txt file, so all paths to this site " . "were disallowed by Yioop.\n" . $site[self::DOC_INFO][self::DESCRIPTION];
                 }
                 $summarized_site_pages[$i][self::DESCRIPTION] = strip_tags($site[self::DOC_INFO][self::DESCRIPTION]);
             } else {
                 $summarized_site_pages[$i][self::DESCRIPTION] = $site[self::DOC_INFO][self::DESCRIPTION];
             }
             if (isset($site[self::DOC_INFO][self::JUST_METAS]) || isset($site[self::ROBOT_PATHS])) {
                 $summarized_site_pages[$i][self::JUST_METAS] = true;
             }
             if (isset($site[self::DOC_INFO][self::META_WORDS])) {
                 if (!isset($summarized_site_pages[$i][self::META_WORDS])) {
                     $summarized_site_pages[$i][self::META_WORDS] = $site[self::DOC_INFO][self::META_WORDS];
                 } else {
                     $summarized_site_pages[$i][self::META_WORDS] = array_merge($summarized_site_pages[$i][self::META_WORDS], $site[self::DOC_INFO][self::META_WORDS]);
                 }
             }
             if (isset($site[self::DOC_INFO][self::LANG])) {
                 if ($site[self::DOC_INFO][self::LANG] == 'en' && $site[self::ENCODING] != "UTF-8") {
                     $site[self::DOC_INFO][self::LANG] = guessLangEncoding($site[self::ENCODING]);
                 }
                 $summarized_site_pages[$i][self::LANG] = $site[self::DOC_INFO][self::LANG];
             }
             if (isset($site[self::DOC_INFO][self::LINKS])) {
                 $summarized_site_pages[$i][self::LINKS] = $site[self::DOC_INFO][self::LINKS];
             }
             if (isset($site[self::DOC_INFO][self::WORD_CLOUD])) {
                 $summarized_site_pages[$i][self::WORD_CLOUD] = $site[self::DOC_INFO][self::WORD_CLOUD];
             }
             if (isset($site[self::DOC_INFO][self::THUMB])) {
                 $summarized_site_pages[$i][self::THUMB] = $site[self::DOC_INFO][self::THUMB];
             }
             if (isset($site[self::DOC_INFO][self::SUBDOCS])) {
                 $this->processSubdocs($i, $site, $summarized_site_pages, $stored_site_pages);
             }
             if (isset($summarized_site_pages[$i][self::LINKS])) {
                 $summarized_site_pages[$i][self::LINKS] = UrlParser::cleanRedundantLinks($summarized_site_pages[$i][self::LINKS], $summarized_site_pages[$i][self::URL]);
             }
             if (!empty($this->classifiers)) {
                 Classifier::labelPage($summarized_site_pages[$i], $this->classifiers, $this->active_classifiers, $this->active_rankers);
             }
             if ($this->page_rule_parser != NULL) {
                 $this->page_rule_parser->executeRuleTrees($summarized_site_pages[$i]);
             }
             $metas = isset($summarized_site_pages[$i][self::ROBOT_METAS]) ? $summarized_site_pages[$i][self::ROBOT_METAS] : array();
             if (array_intersect($metas, array("NOARCHIVE", "NOINDEX", "JUSTFOLLOW", "NONE")) != array()) {
                 $stored_site_pages[$i] = false;
             }
             $stored_site_pages[$i][self::INDEX] = $i;
             $i++;
         }
     }
      // end foreach over $site_pages
     $num_pages = count($stored_site_pages);
     $filter_stored = array_filter($stored_site_pages);
     if ($num_pages > 0 && $this->cache_pages) {
         $cache_page_partition = $this->web_archive->addPages(self::OFFSET, $filter_stored);
     } else {
         if ($num_pages > 0) {
             $this->web_archive->addCount(count($filter_stored));
         }
     }
     for ($i = 0; $i < $num_pages; $i++) {
         $summarized_site_pages[$i][self::INDEX] = $num_items + $i;
     }
     foreach ($filter_stored as $stored) {
         $i = $stored[self::INDEX];
         if (isset($stored[self::OFFSET])) {
             $summarized_site_pages[$i][self::OFFSET] = $stored[self::OFFSET];
             $summarized_site_pages[$i][self::CACHE_PAGE_PARTITION] = $cache_page_partition;
         }
     }
     crawlLog("  Process pages time: " . changeInMicrotime($start_time) . " Current Memory: " . memory_get_usage());
     return $summarized_site_pages;
 }
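The encoding handling in processFetchPages() (verify the claimed encoding, then convert the body to UTF-8 before any text processing) is a useful pattern on its own. A simplified sketch follows, using a hypothetical helper rather than Yioop's exact logic and special cases:

<?php
// Normalize a fetched page body to UTF-8, verifying the claimed encoding
// first (hypothetical helper; Yioop additionally special-cases
// Windows-1255 and Windows-1256 as seen above).
function normalizeToUtf8(string $page, string $claimed_encoding): string
{
    if ($claimed_encoding == "" || strcasecmp($claimed_encoding, "UTF-8") == 0) {
        return $page;
    }
    if (!@mb_check_encoding($page, $claimed_encoding)) {
        // Claimed encoding does not match the bytes; convert anyway rather
        // than fail, mirroring the logged warning in the code above.
    }
    $converted = @mb_convert_encoding($page, "UTF-8", $claimed_encoding);
    return is_string($converted) ? $converted : $page;
}

echo normalizeToUtf8("caf\xE9", "ISO-8859-1"), "\n"; // café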
 /**
  * @param array $arrayObjects
  * @return ArrayIterator
  */
 private function cotationFactory($arrayObjects)
 {
     $dateMap = new DateMap();
     /*if($type == "BMF"){
           return new BMF($dateMap);
       }elseif($type == "NY"){
           return new NewYork($dateMap);
       }elseif($type == "London"){
           return new London($dateMap);
       }*/
     $arrayCotations = new ArrayIterator();
     foreach ($arrayObjects as $arrayInfo) {
         $classifier = new Classifier($dateMap);
         $cotation = $classifier->classify($arrayInfo);
         $cotation->setJsonInfo($arrayInfo);
         $arrayCotations->append($cotation);
     }
     return $arrayCotations;
 }
Example #10
 /**
  * This is the function that should be called to get the
  * classifier_trainer to start training a logistic regression instance for
  * a particular classifier. The class label corresponding to the
  * classifier to be finalized should be passed as the second command-line
  * argument.
  */
 function start()
 {
     global $argv;
     CrawlDaemon::init($argv, "classifier_trainer");
     $label = $argv[2];
     crawlLog("Initializing classifier trainer log..", $label . '-classifier_trainer', true);
     $classifier = Classifier::getClassifier($label);
     $classifier->prepareToFinalize();
     $classifier->finalize();
     Classifier::setClassifier($classifier);
     crawlLog("Training complete.\n");
     CrawlDaemon::stop('classifier_trainer', $label);
 }
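For reference, only the use of $argv[2] as the class label is taken from the docblock above; the following standalone sketch of that argument convention uses an assumed script name and leaves the meaning of the first argument open:

<?php
// Hypothetical illustration of reading the class label from the second
// command-line argument, as described in the docblock above.
// Invoked e.g. as: php trainer_demo.php <mode> <class_label>
if ($argc < 3) {
    fwrite(STDERR, "Usage: php trainer_demo.php <mode> <class_label>\n");
    exit(1);
}
$label = $argv[2];
echo "Would finalize the classifier labeled: $label\n";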
 public function __construct(Tokenizer $tokenizer)
 {
     parent::__construct($tokenizer);
 }
Example #12
 /**
  * Creates a fresh array from an existing page summary array, and augments
  * it with extra data relevant to the labeling interface on the client.
  *
  * @param array $page original page summary array
  * @param float $score classification score (estimated by the Naive Bayes
  * text classification algorithm) for $page
  * @param float $disagreement disagreement score computed for $page
  * @param int $crawl_time index the page came from
  * @param string $keywords query supplied to the crawl mix used to find
  * $page
  * @return array reduced page summary structure containing only the
  * information that the client needs to display a summary of the page
  */
 function prepareUnlabelledDocument($page, $score, $disagreement, $crawl_time, $keywords)
 {
     $phrase_model = $this->model("phrase");
     // Highlight the query keywords, if any.
     $disjunct_phrases = explode("|", $keywords);
     $words = array();
     foreach ($disjunct_phrases as $disjunct_phrase) {
         list($word_struct, $format_words) = $phrase_model->parseWordStructConjunctiveQuery($disjunct_phrase);
         $words = array_merge($words, $format_words);
     }
     $title = $phrase_model->boldKeywords($page[self::TITLE], $words);
     $description = $phrase_model->getSnippets(strip_tags($page[self::DESCRIPTION]), $words, 400);
     $description = $phrase_model->boldKeywords($description, $words);
     $cache_link = "?c=search&amp;a=cache" . "&amp;q=" . urlencode($keywords) . "&amp;arg=" . urlencode($page[self::URL]) . "&amp;its=" . $crawl_time;
     /*
       Note that the confidence is a transformation of the score that
       converts it into a value between 0 and 1, where it's 0 if the score
       was exactly 0.5, and increases toward 1 as the score either
       increases toward 1 or decreases toward 0.
     */
      return array(
          'title' => $title,
          'url' => $page[self::URL],
          'key' => webencode(Classifier::makeKey($page)),
          'cache_link' => $cache_link,
          'description' => $description,
          'score' => $score,
          'positive' => $score >= 0.5 ? 1 : 0,
          'confidence' => abs($score - 0.5) / 0.5,
          'disagreement' => $disagreement);
 }
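As a quick check of the confidence mapping described in the comment above, here is a standalone sketch independent of the surrounding controller code:

<?php
// confidence = |score - 0.5| / 0.5 maps a [0, 1] score onto [0, 1]:
// 0 when the classifier is maximally unsure, approaching 1 as the score
// nears either extreme.
$confidence = function (float $score): float {
    return abs($score - 0.5) / 0.5;
};
echo $confidence(0.5), "\n";  // 0
echo $confidence(0.8), "\n";  // 0.6
echo $confidence(0.05), "\n"; // 0.9 (confident, but in the negative class)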