/**
 * urlMemberSiteArray is a function called by both allowedToCrawlSite and
 * disallowedToCrawlSite to test if a url belongs to a list of
 * regexes of urls or domains. This test function tests this functionality
 */
function urlMemberSiteArrayTestCase() {
    $sites = array("http://www.example.com/",
        "http://www.cs.sjsu.edu/faculty/pollett/*/*/",
        "http://www.bing.com/video/search?*&*&",
        "http://*.cool.*/a/*/",
        "domain:ucla.edu",
        "domain:foodnetwork.com");
    $test_urls = array(
        array("http://www.cs.sjsu.edu/faculty/pollett/", false,
            "regex url negative 1"),
        array("http://www.bing.com/video/search?", false,
            "regex url negative 2"),
        array("http://www.cool.edu/a", false, "regex url negative 3"),
        array("http://ucla.edu.com", false, "domain test negative"),
        array("http://www.cs.sjsu.edu/faculty/pollett/a/b/c", true,
            "regex url positive 1"),
        array("http://www.bing.com/video/search?a&b&c", true,
            "regex url positive 2"),
        array("http://www.cool.bob.edu/a/b/c", true, "regex url positive 3"),
        array("http://test.ucla.edu", true, "domain test positive"),
        array("https://test.ucla.edu", true, "domain https test positive"),
        array("gopher://test.ucla.edu", true, "domain gopher test positive"),
        array("http://www.foodnetworkstore.com/small-appliances/", false,
            "domain test negative"));
    foreach ($test_urls as $test_url) {
        $result = UrlParser::urlMemberSiteArray($test_url[0], $sites, "s");
        $this->assertEqual($result, $test_url[1], $test_url[2]);
    }
}
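/*
 * The test above exercises two kinds of site specifications: wildcard url
 * patterns such as "http://*.cool.*/a/*/" and "domain:" entries such as
 * "domain:ucla.edu". Below is a minimal sketch of the suffix check implied
 * by the positive/negative domain cases (test.ucla.edu matches, ucla.edu.com
 * does not). The helper isHostInDomain() is hypothetical, for illustration
 * only; it is not UrlParser's actual implementation.
 */
function isHostInDomain($url, $domain_entry) {
    // strip the "domain:" prefix from the site specification
    $domain = substr($domain_entry, strlen("domain:"));
    $host = parse_url($url, PHP_URL_HOST);
    if ($host === false || $host === null) {
        return false;
    }
    // a host matches if it equals the domain or ends in ".domain"
    return $host == $domain ||
        substr($host, -strlen("." . $domain)) == "." . $domain;
}
// isHostInDomain("http://test.ucla.edu", "domain:ucla.edu") == true
// isHostInDomain("http://ucla.edu.com", "domain:ucla.edu") == false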
/**
 * Page processors are allowed to extract up to MAX_LINKS_TO_EXTRACT links.
 * This method attempts to cull from the doc_info struct the best
 * MAX_LINKS_PER_PAGE of them. Currently, this is done by first removing
 * links whose filetype or site the crawler is forbidden from crawling.
 * Then a crude estimate of the information contained in each link's text,
 * strlen(gzip(text)), is used to pick the best remaining links.
 *
 * @param array& $doc_info an array with a CrawlConstants::LINKS subarray.
 *     This subarray in turn contains url => text pairs.
 * @param string $field field for links, default is CrawlConstants::LINKS
 * @param int $member_cache_time says how long allowed and disallowed url
 *     info should be cached by urlMemberSiteArray
 */
function pruneLinks(&$doc_info, $field = CrawlConstants::LINKS,
    $member_cache_time = 0) {
    if (!isset($doc_info[self::LINKS])) {
        return;
    }
    $links = array();
    $allowed_name = "a" . $member_cache_time;
    $disallowed_name = "d" . $member_cache_time;
    foreach ($doc_info[$field] as $url => $text) {
        $doc_type = UrlParser::getDocumentType($url);
        if (!in_array($doc_type, $this->all_file_types)) {
            $doc_type = "unknown";
        }
        if (!in_array($doc_type, $this->indexed_file_types)) {
            continue;
        }
        if ($this->restrict_sites_by_url) {
            if (!UrlParser::urlMemberSiteArray($url, $this->allowed_sites,
                $allowed_name)) {
                continue;
            }
        }
        if (UrlParser::urlMemberSiteArray($url, $this->disallowed_sites,
            $disallowed_name)) {
            continue;
        }
        $links[$url] = $text;
    }
    $doc_info[$field] = UrlParser::pruneLinks($links);
}
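/*
 * The docblock above says UrlParser::pruneLinks() keeps the links whose
 * anchor text looks most informative, using strlen(gzip(text)) as a crude
 * information estimate (repetitive text compresses well and so scores low).
 * A minimal standalone sketch of that scoring idea is given below; the
 * function name and the $max_links parameter are hypothetical, and this is
 * not UrlParser::pruneLinks() itself.
 */
function rankLinksByCompressedLength($links, $max_links) {
    $scores = array();
    foreach ($links as $url => $text) {
        // longer compressed length ~ more information in the anchor text
        $scores[$url] = strlen(gzcompress($text));
    }
    arsort($scores); // highest scores first, keys preserved
    $best = array_slice($scores, 0, $max_links, true);
    // return the surviving url => text pairs in score order
    $out = array();
    foreach ($best as $url => $score) {
        $out[$url] = $links[$url];
    }
    return $out;
}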
/**
 * Checks if the $url is from a site which has an hourly quota of pages to
 * download. If not, returns true. Otherwise, if the quota has not yet been
 * used up, it bumps the quota count and returns true; if the quota is
 * exhausted, it returns false. This method also resets the quota counts
 * once an hour has passed.
 *
 * @param string $url to check if within quota
 * @return bool whether $url is within the hourly quota of the site it is
 *     from
 */
function withinQuota($url) {
    if (!($site = UrlParser::urlMemberSiteArray($url,
        $this->quota_sites_keys,
        "q" . $this->allow_disallow_cache_time, true))) {
        return true;
    }
    list($quota, $current_count) = $this->quota_sites[$site];
    if ($current_count < $quota) {
        $this->quota_sites[$site] = array($quota, $current_count + 1);
        $flag = true;
    } else {
        $flag = false;
    }
    if ($this->quota_clear_time + ONE_HOUR < time()) {
        $this->quota_clear_time = time();
        foreach ($this->quota_sites as $site => $info) {
            list($quota, ) = $info;
            $this->quota_sites[$site] = array($quota, 0);
        }
    }
    return $flag;
}
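/*
 * From the code above, $this->quota_sites maps a site specification to an
 * array(hourly_quota, current_count) pair, and the counts drop back to zero
 * once an hour has elapsed. A minimal standalone sketch of that bookkeeping
 * follows; bumpQuota(), ONE_HOUR_SKETCH, and the example site entry are
 * assumptions for illustration, not part of the fetcher's API.
 */
define("ONE_HOUR_SKETCH", 3600);
$quota_sites = array(
    "domain:example.com" => array(2, 0), // at most 2 downloads per hour
);
$quota_clear_time = time();
function bumpQuota(&$quota_sites, &$quota_clear_time, $site) {
    list($quota, $count) = $quota_sites[$site];
    $allowed = $count < $quota;
    if ($allowed) {
        $quota_sites[$site] = array($quota, $count + 1);
    }
    if ($quota_clear_time + ONE_HOUR_SKETCH < time()) {
        // an hour has elapsed: zero every site's count
        foreach ($quota_sites as $s => $info) {
            $quota_sites[$s] = array($info[0], 0);
        }
        $quota_clear_time = time();
    }
    return $allowed;
}
// bumpQuota() returns true twice for "domain:example.com", then false
// until an hour has gone by and the counts are reset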
/**
 * This method adds robots metas to, or removes entirely, a summary
 * produced by a text page processor or one of its subclasses, depending on
 * whether the summary title and description satisfy various rules
 * in $this->filter_rules
 *
 * @param array& $summary the summary data produced by the relevant page
 *     processor's handle method; modified in-place
 * @param string $url the url where the summary contents came from
 */
function pageSummaryProcessing(&$summary, $url) {
    $sites = array_keys($this->filter_rules);
    $filter_rules = $this->filter_rules;
    $rules = isset($filter_rules['default']) ? $filter_rules['default'] :
        array();
    foreach ($sites as $site) {
        if ($site == "default") {
            continue;
        }
        $sign = $site[0] == '-' ? false : true;
        if (!$sign || $site[0] == '+') {
            $check_url = substr($site, 1);
        } else {
            $check_url = $site;
        }
        if (($sign && UrlParser::urlMemberSiteArray($url,
            array($check_url), $url . $check_url)) ||
            (!$sign && !UrlParser::urlMemberSiteArray($url,
            array($check_url), $url . $check_url))) {
            $rules = array_merge($rules, $filter_rules[$site]);
        }
    }
    foreach ($rules as $rule) {
        $preconditions = $rule["PRECONDITIONS"];
        $actions = $rule["ACTIONS"];
        $filter_flag = $this->checkFilter($preconditions,
            $summary[self::TITLE], $summary[self::DESCRIPTION]);
        if ($filter_flag) {
            if (in_array("NOPROCESS", $actions)) {
                crawlLog(" Word filter plugin removed page.");
                $summary = false;
                break;
            } else {
                if (!isset($summary[self::ROBOT_METAS])) {
                    $summary[self::ROBOT_METAS] = array();
                }
                $summary[self::ROBOT_METAS] += $actions;
            }
        }
    }
}
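/*
 * pageSummaryProcessing() expects $this->filter_rules to map a site key
 * ("default", or a site specification optionally prefixed with '+' or '-')
 * to a list of rules, each with a "PRECONDITIONS" entry (handed to
 * checkFilter()) and an "ACTIONS" list ("NOPROCESS" drops the page; other
 * entries are added to ROBOT_METAS). The shape below is inferred from the
 * code above; the particular precondition strings and actions are
 * illustrative assumptions, not the plugin's documented rule format.
 */
$example_filter_rules = array(
    "default" => array(
        array(
            "PRECONDITIONS" => "casino", // passed to checkFilter()
            "ACTIONS" => array("NOINDEX", "NOFOLLOW"),
        ),
    ),
    // a '-' prefix means: apply these rules only when $url is NOT a member
    // of this site specification
    "-domain:example.org" => array(
        array(
            "PRECONDITIONS" => "free-download",
            "ACTIONS" => array("NOPROCESS"), // remove the page entirely
        ),
    ),
);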