Esempio n. 1
0
 /**
  * Handles admin requests related to controlling the page options to be
  * used in a crawl
  *
  * This activity allows a user to specify the page range size to be
  * used during a crawl, which file types can be downloaded, which
  * indexing plugins, classifiers and rankers are active, the page rules,
  * and the search-time weights kept in the profile. On the test options
  * tab it also runs the current options against a pasted test page and
  * reports what gets extracted.
  *
  * Side effects visible in this method: the seed info is written back
  * with setSeedInfo() on every call, and the profile is updated whenever
  * a weight field was posted or had to be given its default.
  *
  * @return array $data fields for drawing the page options element: form
  *     values, tab state, SCRIPT to run and, for a test run, the
  *     AFTER_PAGE_PROCESS, AFTER_RULE_PROCESS, EXTRACTED_WORDS and
  *     EXTRACTED_META_WORDS dumps
  */
 function pageOptions()
 {
     global $INDEXED_FILE_TYPES;
     /* get processors for different file types (populating
        $INDEXED_FILE_TYPES); glob() returns false on error, so fall back
        to an empty list rather than iterating over false */
     $processor_files = glob(BASE_DIR . "/lib/processors/*_processor.php");
     foreach ($processor_files ?: array() as $filename) {
         require_once $filename;
     }
     $parent = $this->parent;
     $crawl_model = $parent->model("crawl");
     $profile_model = $parent->model("profile");
     $data = array();
     $data["ELEMENT"] = "pageoptions";
     $data['SCRIPT'] = "";
     $machine_urls = $parent->model("machine")->getQueueServerUrls();
     $num_machines = count($machine_urls);
     // no queue servers, or only a single localhost one => pass NULL
     if ($num_machines < 1 || ($num_machines == 1 && UrlParser::isLocalhostUrl($machine_urls[0]))) {
         $machine_urls = NULL;
     }
     /* entries 0 and 1 of the load drop-down are fixed; previous crawls are
        added keyed by their crawl timestamp */
     $data['available_options'] = array(tl('crawl_component_use_below'), tl('crawl_component_use_defaults'));
     $crawls = $crawl_model->getCrawlList(false, true, $machine_urls);
     $data['options_default'] = tl('crawl_component_use_below');
     foreach ($crawls as $crawl) {
         if (strlen($crawl['DESCRIPTION']) > 0) {
             $data['available_options'][$crawl['CRAWL_TIME']] = $crawl['DESCRIPTION'];
         }
     }
     $seed_info = $crawl_model->getSeedInfo();
     $data['RECRAWL_FREQS'] = array(-1 => tl('crawl_component_recrawl_never'), 1 => tl('crawl_component_recrawl_1day'), 2 => tl('crawl_component_recrawl_2day'), 3 => tl('crawl_component_recrawl_3day'), 7 => tl('crawl_component_recrawl_7day'), 14 => tl('crawl_component_recrawl_14day'));
     $data['SIZE_VALUES'] = array(10000 => 10000, 50000 => 50000, 100000 => 100000, 500000 => 500000, 1000000 => 1000000, 5000000 => 5000000, 10000000 => 10000000);
     $data['LEN_VALUES'] = array(2000 => 2000, 10000 => 10000, 50000 => 50000, 100000 => 100000, 500000 => 500000, 1000000 => 1000000, 5000000 => 5000000, 10000000 => 10000000);
     $data['available_summarizers'] = array(self::BASIC_SUMMARIZER => tl('crawl_component_basic'), self::CENTROID_SUMMARIZER => tl('crawl_component_centroid'));
     /* A posted select value is accepted only if it is a string naming one
        of the keys of the matching choice list. A loose in_array() would
        also accept strings such as "1e4" or " 10000" that merely compare
        equal to a listed number, and those would then be stored in the
        seed info. */
     $is_choice = function ($value, $choices) {
         return is_string($value) && array_key_exists($value, $choices);
     };
     $loaded = false;
     if (isset($_REQUEST['load_option']) && $_REQUEST['load_option'] > 0) {
         // 1 => load the defaults, anything larger => timestamp of a crawl
         if ($_REQUEST['load_option'] == 1) {
             $seed_loaded = $crawl_model->getSeedInfo(true);
         } else {
             $timestamp = substr($parent->clean($_REQUEST['load_option'], "int"), 0, TIMESTAMP_LEN);
             $seed_loaded = $crawl_model->getCrawlSeedInfo($timestamp, $machine_urls);
         }
         $copy_options = array("general" => array("page_recrawl_frequency", "page_range_request", "max_description_len", "cache_pages", 'summarizer_option'), "indexed_file_types" => array("extensions"), "indexing_plugins" => array("plugins", "plugins_data"));
         foreach ($copy_options as $main_option => $sub_options) {
             foreach ($sub_options as $sub_option) {
                 if (isset($seed_loaded[$main_option][$sub_option])) {
                     $seed_info[$main_option][$sub_option] = $seed_loaded[$main_option][$sub_option];
                 }
             }
         }
         if (isset($seed_loaded['page_rules'])) {
             $seed_info['page_rules'] = $seed_loaded['page_rules'];
         }
         /* NOTE(review): active_rankers is not copied from the loaded seed
            here, only active_classifiers is -- confirm this is intended */
         if (isset($seed_loaded['active_classifiers'])) {
             $seed_info['active_classifiers'] = $seed_loaded['active_classifiers'];
         } else {
             $seed_info['active_classifiers'] = array();
             $seed_info['active_classifiers']['label'] = array();
         }
         $loaded = true;
     } else {
         /* $seed_info still holds the current seed fetched above; it is not
            fetched a second time here */
         if (isset($_REQUEST["page_recrawl_frequency"]) && $is_choice($_REQUEST["page_recrawl_frequency"], $data['RECRAWL_FREQS'])) {
             $seed_info["general"]["page_recrawl_frequency"] = $_REQUEST["page_recrawl_frequency"];
         }
         if (isset($_REQUEST["page_range_request"]) && $is_choice($_REQUEST["page_range_request"], $data['SIZE_VALUES'])) {
             $seed_info["general"]["page_range_request"] = $_REQUEST["page_range_request"];
         }
         if (isset($_REQUEST['summarizer_option']) && $is_choice($_REQUEST['summarizer_option'], $data['available_summarizers'])) {
             $seed_info['general']['summarizer_option'] = $_REQUEST['summarizer_option'];
         }
         if (isset($_REQUEST["max_description_len"]) && $is_choice($_REQUEST["max_description_len"], $data['LEN_VALUES'])) {
             $seed_info["general"]["max_description_len"] = $_REQUEST["max_description_len"];
         }
         if (isset($_REQUEST["cache_pages"])) {
             $seed_info["general"]["cache_pages"] = true;
         } else {
             if (isset($_REQUEST['posted'])) {
                 //form sent but check box unchecked
                 $seed_info["general"]["cache_pages"] = false;
             }
         }
         if (isset($_REQUEST['page_rules'])) {
             $seed_info['page_rules']['rule'] = $parent->convertStringCleanArray($_REQUEST['page_rules'], 'rule');
         }
     }
     /* Default to all known file types. This is applied after both branches
        above so that the default survives on the non-load path as well;
        the file type loop below reads this entry unconditionally. */
     if (!isset($seed_info["indexed_file_types"]["extensions"])) {
         $seed_info["indexed_file_types"]["extensions"] = $INDEXED_FILE_TYPES;
     }
     // fill in defaults for any general option the seed info lacks
     if (!isset($seed_info["general"]["page_recrawl_frequency"])) {
         $seed_info["general"]["page_recrawl_frequency"] = PAGE_RECRAWL_FREQUENCY;
     }
     if (!isset($seed_info["general"]["summarizer_option"])) {
         $seed_info["general"]["summarizer_option"] = self::BASIC_SUMMARIZER;
     }
     $data['summarizer_option'] = $seed_info['general']['summarizer_option'];
     $data['PAGE_RECRAWL_FREQUENCY'] = $seed_info["general"]["page_recrawl_frequency"];
     if (!isset($seed_info["general"]["cache_pages"])) {
         $seed_info["general"]["cache_pages"] = false;
     }
     $data["CACHE_PAGES"] = $seed_info["general"]["cache_pages"];
     if (!isset($seed_info["general"]["page_range_request"])) {
         $seed_info["general"]["page_range_request"] = PAGE_RANGE_REQUEST;
     }
     $data['PAGE_SIZE'] = $seed_info["general"]["page_range_request"];
     if (!isset($seed_info["general"]["max_description_len"])) {
         $seed_info["general"]["max_description_len"] = MAX_DESCRIPTION_LEN;
     }
     $data['MAX_LEN'] = $seed_info["general"]["max_description_len"];
     $data['INDEXING_PLUGINS'] = array();
     $known_plugins = array();
     foreach ($parent->indexing_plugins as $plugin) {
         $known_plugins[] = ucfirst($plugin);
     }
     if (isset($_REQUEST["posted"]) && !$loaded) {
         /* Posted plugin names are later turned into class names (see the
            test options code below), so only names of known plugins are
            kept. */
         $posted_plugins = array();
         if (isset($_REQUEST["INDEXING_PLUGINS"]) && is_array($_REQUEST["INDEXING_PLUGINS"])) {
             $posted_plugins = array_filter($_REQUEST["INDEXING_PLUGINS"], 'is_string');
         }
         $seed_info['indexing_plugins']['plugins'] = array_values(array_intersect($posted_plugins, $known_plugins));
     }
     $included_plugins = isset($seed_info['indexing_plugins']['plugins']) ? $seed_info['indexing_plugins']['plugins'] : array();
     foreach ($known_plugins as $plugin_name) {
         $data['INDEXING_PLUGINS'][$plugin_name]['checked'] = in_array($plugin_name, $included_plugins) ? "checked='checked'" : "";
         /* to use method_exists we want that the require_once for the plugin
              class has occurred so we instantiate the object via the plugin
              method call which will also do the require if needed.
            */
         $plugin_object = $parent->plugin(lcfirst($plugin_name));
         $class_name = $plugin_name . "Plugin";
         // options were just loaded: push the loaded (or default) config
         if ($loaded && method_exists($class_name, 'setConfiguration') && method_exists($class_name, 'loadDefaultConfiguration')) {
             if (isset($seed_info['indexing_plugins']['plugins_data'][$plugin_name])) {
                 $plugin_object->setConfiguration($seed_info['indexing_plugins']['plugins_data'][$plugin_name]);
             } else {
                 $plugin_object->loadDefaultConfiguration();
             }
             $plugin_object->saveConfiguration();
         }
         if (method_exists($class_name, 'configureHandler') && method_exists($class_name, 'configureView')) {
             $data['INDEXING_PLUGINS'][$plugin_name]['configure'] = true;
             $plugin_object->configureHandler($data);
         } else {
             $data['INDEXING_PLUGINS'][$plugin_name]['configure'] = false;
         }
     }
     $profile = $profile_model->getProfile(WORK_DIRECTORY);
     if (!isset($_REQUEST['load_option'])) {
         $data = array_merge($data, $profile);
     } else {
         $parent->updateProfileFields($data, $profile, array('IP_LINK', 'CACHE_LINK', 'SIMILAR_LINK', 'IN_LINK', 'SIGNIN_LINK', 'SUBSEARCH_LINK', 'WORD_SUGGEST'));
     }
     /* search-time weights: a posted value wins, then the profile's value,
        then the default listed here; $change records that the profile
        needs to be saved */
     $weights = array('TITLE_WEIGHT' => 4, 'DESCRIPTION_WEIGHT' => 1, 'LINK_WEIGHT' => 2, 'MIN_RESULTS_TO_GROUP' => 200, 'SERVER_ALPHA' => 1.6);
     $change = false;
     foreach ($weights as $weight => $value) {
         if (isset($_REQUEST[$weight])) {
             $data[$weight] = $parent->clean($_REQUEST[$weight], 'float', 1);
             $profile[$weight] = $data[$weight];
             $change = true;
         } else {
             if (isset($profile[$weight]) && $profile[$weight] != "") {
                 $data[$weight] = $profile[$weight];
             } else {
                 $data[$weight] = $value;
                 $profile[$weight] = $data[$weight];
                 $change = true;
             }
         }
     }
     if ($change) {
         $profile_model->updateProfile(WORK_DIRECTORY, array(), $profile);
     }
     /* file types: taken from the posted check boxes unless options were
        just loaded, otherwise from the seed info */
     $data['INDEXED_FILE_TYPES'] = array();
     $filetypes = array();
     foreach ($INDEXED_FILE_TYPES as $filetype) {
         $ison = false;
         if (isset($_REQUEST["filetype"]) && !$loaded) {
             if (isset($_REQUEST["filetype"][$filetype])) {
                 $filetypes[] = $filetype;
                 $ison = true;
                 $change = true;
             }
         } else {
             if (in_array($filetype, $seed_info["indexed_file_types"]["extensions"])) {
                 $filetypes[] = $filetype;
                 $ison = true;
             }
         }
         $data['INDEXED_FILE_TYPES'][$filetype] = $ison ? "checked='checked'" : '';
     }
     $seed_info["indexed_file_types"]["extensions"] = $filetypes;
     $data['CLASSIFIERS'] = array();
     $data['RANKERS'] = array();
     $active_classifiers = array();
     $active_rankers = array();
     /* the seed info is consulted when options were just loaded or when
        the form was not posted at all */
     $use_seed_labels = $loaded || !isset($_REQUEST['posted']);
     foreach (Classifier::getClassifierList() as $classifier) {
         $label = $classifier->class_label;
         $ison = false;
         if (isset($_REQUEST['classifier']) && !$loaded) {
             if (isset($_REQUEST['classifier'][$label])) {
                 $ison = true;
             }
         } else {
             // guarded with isset like the ranker branch below
             if ($use_seed_labels && isset($seed_info['active_classifiers']['label']) && in_array($label, $seed_info['active_classifiers']['label'])) {
                 $ison = true;
             }
         }
         if ($ison) {
             $data['CLASSIFIERS'][$label] = 'checked="checked"';
             $active_classifiers[] = $label;
         } else {
             $data['CLASSIFIERS'][$label] = '';
         }
         $ison = false;
         if (isset($_REQUEST['ranker']) && !$loaded) {
             if (isset($_REQUEST['ranker'][$label])) {
                 $ison = true;
             }
         } else {
             if ($use_seed_labels && isset($seed_info['active_rankers']['label']) && in_array($label, $seed_info['active_rankers']['label'])) {
                 $ison = true;
             }
         }
         if ($ison) {
             $data['RANKERS'][$label] = 'checked="checked"';
             $active_rankers[] = $label;
         } else {
             $data['RANKERS'][$label] = '';
         }
     }
     $parent->pagingLogic($data, 'CLASSIFIERS', 'CLASSIFIERS', DEFAULT_ADMIN_PAGING_NUM / 5, array(), "", array('name' => 'class_label'));
     $seed_info['active_classifiers']['label'] = $active_classifiers;
     $seed_info['active_rankers']['label'] = $active_rankers;
     /* The rule list may sit directly under ['rule'] or one level deeper
        under ['rule']['rule']. Resolve that once here so the test options
        code below uses the same list as the form. */
     $rule_lines = array();
     if (isset($seed_info['page_rules']['rule'])) {
         if (isset($seed_info['page_rules']['rule']['rule'])) {
             $rule_lines = $seed_info['page_rules']['rule']['rule'];
         } else {
             $rule_lines = $seed_info['page_rules']['rule'];
         }
         $data['page_rules'] = $parent->convertArrayLines($rule_lines);
     } else {
         $data['page_rules'] = "";
     }
     // which of the three tabs is showing; defaults to crawl_time
     $allowed_options = array('crawl_time', 'search_time', 'test_options');
     if (isset($_REQUEST['option_type']) && in_array($_REQUEST['option_type'], $allowed_options, true)) {
         $data['option_type'] = $_REQUEST['option_type'];
     } else {
         $data['option_type'] = 'crawl_time';
     }
     $data['crawl_time_active'] = "";
     $data['search_time_active'] = "";
     $data['test_options_active'] = "";
     if ($data['option_type'] == 'crawl_time') {
         $data['crawl_time_active'] = "active";
         $data['SCRIPT'] .= "\nswitchTab('crawltimetab'," . "'searchtimetab', 'testoptionstab')\n";
     } elseif ($data['option_type'] == 'search_time') {
         $data['search_time_active'] = "active";
         $data['SCRIPT'] .= "\nswitchTab('searchtimetab'," . "'crawltimetab', 'testoptionstab')\n";
     } else {
         $data['test_options_active'] = "active";
         $data['SCRIPT'] .= "\nswitchTab('testoptionstab'," . "'crawltimetab', 'searchtimetab');\n";
     }
     $crawl_model->setSeedInfo($seed_info);
     if ($change && $data['option_type'] != 'test_options') {
         $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('crawl_component_page_options_updated') . "</h1>')";
     }
     // mime type => prefix of the processor class used for a test page
     $test_processors = array("text/html" => "html", "text/asp" => "html", "text/xml" => "xml", "text/robot" => "robot", "application/xml" => "xml", "application/xhtml+xml" => "html", "application/rss+xml" => "rss", "application/atom+xml" => "rss", "text/csv" => "text", "text/gopher" => "gopher", "text/plain" => "text", "text/rtf" => "rtf", "text/tab-separated-values" => "text");
     $data['MIME_TYPES'] = array_keys($test_processors);
     $data['page_type'] = "text/html";
     if (isset($_REQUEST['page_type']) && in_array($_REQUEST['page_type'], $data['MIME_TYPES'], true)) {
         $data['page_type'] = $_REQUEST['page_type'];
     }
     $data['TESTPAGE'] = isset($_REQUEST['TESTPAGE']) ? $parent->clean($_REQUEST['TESTPAGE'], 'string') : "";
     if ($data['option_type'] == 'test_options' && $data['TESTPAGE'] != "") {
         $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('crawl_component_page_options_running_tests') . "</h1>')";
         // a stand-in site record for the pasted test page
         $site = array();
         $site[self::ENCODING] = "UTF-8";
         $site[self::URL] = "http://test-site.yioop.com/";
         $site[self::IP_ADDRESSES] = array("1.1.1.1");
         $site[self::HTTP_CODE] = 200;
         $site[self::MODIFIED] = date("U", time());
         $site[self::TIMESTAMP] = time();
         // already validated against MIME_TYPES above, text/html otherwise
         $site[self::TYPE] = $data['page_type'];
         $site[self::HEADER] = "page options test extractor";
         $site[self::SERVER] = "unknown";
         $site[self::SERVER_VERSION] = "unknown";
         $site[self::OPERATING_SYSTEM] = "unknown";
         $site[self::LANG] = 'en';
         $site[self::JUST_METAS] = false;
         /* the processors are handed the page as posted, not the cleaned
            copy kept in $data['TESTPAGE'] */
         if ($site[self::TYPE] == 'text/html') {
             $site[self::ENCODING] = guessEncodingHtml($_REQUEST['TESTPAGE']);
         }
         $prefix_name = $test_processors[$site[self::TYPE]];
         $processor_name = ucfirst($prefix_name) . "Processor";
         /* collect the active plugins that support this processor or one of
            its ancestor classes below PageProcessor */
         $plugin_processors = array();
         if (isset($seed_info['indexing_plugins']['plugins'])) {
             foreach ($seed_info['indexing_plugins']['plugins'] as $plugin) {
                 $plugin_name = $plugin . "Plugin";
                 // a seed may name a plugin whose class is not available
                 if (!class_exists($plugin_name)) {
                     continue;
                 }
                 $supported_processors = $plugin_name::getProcessors();
                 foreach ($supported_processors as $supported_processor) {
                     $parent_processor = $processor_name;
                     do {
                         if ($supported_processor == $parent_processor) {
                             $plugin_object = $parent->plugin(lcfirst($plugin));
                             if (method_exists($plugin_name, "loadConfiguration")) {
                                 $plugin_object->loadConfiguration();
                             }
                             $plugin_processors[] = $plugin_object;
                             break;
                         }
                     } while (($parent_processor = get_parent_class($parent_processor)) && $parent_processor != "PageProcessor");
                 }
             }
         }
         $page_processor = new $processor_name($plugin_processors, $seed_info["general"]["max_description_len"], $seed_info["general"]["summarizer_option"]);
         /* run the processor under the previously installed error handler,
            then put yioop_error_handler back */
         restore_error_handler();
         $data["PAGE_RANGE_REQUEST"] = $seed_info["general"]["page_range_request"];
         $doc_info = $page_processor->handle(substr($_REQUEST['TESTPAGE'], 0, $data["PAGE_RANGE_REQUEST"]), $site[self::URL]);
         set_error_handler("yioop_error_handler");
         if (!$doc_info) {
             $data["AFTER_PAGE_PROCESS"] = "";
             $data["AFTER_RULE_PROCESS"] = "";
             $data["EXTRACTED_WORDS"] = "";
             $data["EXTRACTED_META_WORDS"] = "";
             return $data;
         }
         if ($processor_name != "RobotProcessor" && !isset($doc_info[self::JUST_METAS])) {
             $doc_info[self::LINKS] = UrlParser::pruneLinks($doc_info[self::LINKS]);
         }
         foreach ($doc_info as $key => $value) {
             $site[$key] = $value;
         }
         unset($site[self::PAGE]);
         if (isset($site[self::ROBOT_PATHS])) {
             $site[self::JUST_METAS] = true;
         }
         /* map constant values back to their CrawlConstants names so the
            dumps show names instead of raw keys */
         $reflect = new ReflectionClass("CrawlConstants");
         $crawl_constants = $reflect->getConstants();
         $inverse_constants = array_combine(array_values($crawl_constants), array_keys($crawl_constants));
         $name_keys = function ($record) use ($inverse_constants) {
             $named = array();
             foreach ($record as $key => $value) {
                 $out_key = isset($inverse_constants[$key]) ? $inverse_constants[$key] : $key;
                 $named[$out_key] = $value;
             }
             return $named;
         };
         $data["AFTER_PAGE_PROCESS"] = wordwrap($parent->clean(print_r($name_keys($site), true), "string"), 75, "\n", true);
         /* (array) turns a missing rule list into an empty one, so the
            parser then receives an empty rule string */
         $rule_string = implode("\n", (array) $rule_lines);
         $rule_string = html_entity_decode($rule_string, ENT_QUOTES);
         $page_rule_parser = new PageRuleParser($rule_string);
         $page_rule_parser->executeRuleTrees($site);
         $data["AFTER_RULE_PROCESS"] = wordwrap($parent->clean(print_r($name_keys($site), true), "string"), 75, "\n", true);
         $lang = isset($site[self::LANG]) ? $site[self::LANG] : NULL;
         $meta_ids = PhraseParser::calculateMetas($site);
         $word_lists = array();
         if (empty($site[self::JUST_METAS])) {
             /* a processor or page rule may leave these unset; treat them
                as empty strings without modifying $site itself */
             $title = isset($site[self::TITLE]) ? $site[self::TITLE] : "";
             $description = isset($site[self::DESCRIPTION]) ? $site[self::DESCRIPTION] : "";
             $host_words = UrlParser::getWordsIfHostUrl($site[self::URL]);
             $path_words = UrlParser::getWordsLastPathPartUrl($site[self::URL]);
             $phrase_string = $host_words . " " . $title . " " . $path_words . " " . $description;
             if ($title != "") {
                 $lang = guessLocaleFromString($title, $lang);
             } else {
                 $lang = guessLocaleFromString(substr($description, 0, AD_HOC_TITLE_LENGTH), $lang);
             }
             $word_lists = PhraseParser::extractPhrasesInLists($phrase_string, $lang);
             $len = strlen($phrase_string);
             if (PhraseParser::computeSafeSearchScore($word_lists, $len) < 0.012) {
                 $meta_ids[] = "safe:true";
             } else {
                 $meta_ids[] = "safe:false";
             }
         }
         $data["EXTRACTED_WORDS"] = wordwrap($parent->clean(print_r($word_lists, true), "string"), 75, "\n", true);
         $data["EXTRACTED_META_WORDS"] = wordwrap($parent->clean(print_r($meta_ids, true), "string"), 75, "\n", true);
     }
     return $data;
 }