Exemple #1
0
 /**
  * Exercises UrlParser::urlMemberSiteArray, the helper that
  * allowedToCrawlSite and disallowedToCrawlSite rely on to decide whether
  * a url belongs to a list of url regexes or domains.
  *
  * Each test case is a triple: the url to check, the expected membership
  * result, and the message reported with the assertion.
  */
 function urlMemberSiteArrayTestCase()
 {
     $site_patterns = array(
         "http://www.example.com/",
         "http://www.cs.sjsu.edu/faculty/pollett/*/*/",
         "http://www.bing.com/video/search?*&*&",
         "http://*.cool.*/a/*/",
         "domain:ucla.edu",
         "domain:foodnetwork.com",
     );
     $cases = array(
         array("http://www.cs.sjsu.edu/faculty/pollett/", false, "regex url negative 1"),
         array("http://www.bing.com/video/search?", false, "regex url negative 2"),
         array("http://www.cool.edu/a", false, "regex url negative 3"),
         array("http://ucla.edu.com", false, "domain test negative"),
         array("http://www.cs.sjsu.edu/faculty/pollett/a/b/c", true, "regex url positive 1"),
         array("http://www.bing.com/video/search?a&b&c", true, "regex url positive 2"),
         array("http://www.cool.bob.edu/a/b/c", true, "regex url positive 3"),
         array("http://test.ucla.edu", true, "domain test positive"),
         array("https://test.ucla.edu", true, "domain https test positive"),
         array("gopher://test.ucla.edu", true, "domain gopher stest positive"),
         array("http://www.foodnetworkstore.com/small-appliances/", false, "domain test negative"),
     );
     foreach ($cases as $case) {
         list($url, $expected, $message) = $case;
         // "s" is the cache name passed through to urlMemberSiteArray
         $is_member = UrlParser::urlMemberSiteArray($url, $site_patterns, "s");
         $this->assertEqual($is_member, $expected, $message);
     }
 }
Exemple #2
0
 /**
  * Filters the links stored in $doc_info[$field] down to those the crawler
  * may follow, then hands the survivors to UrlParser::pruneLinks to select
  * the ones to keep.
  *
  * A link is dropped when:
  *  - its document type is not in $this->indexed_file_types (types not
  *    listed in $this->all_file_types are treated as "unknown"),
  *  - $this->restrict_sites_by_url is set and the url is not a member of
  *    $this->allowed_sites, or
  *  - the url is a member of $this->disallowed_sites.
  *
  * If $doc_info has no entry for $field, nothing is modified.
  *
  * @param array& $doc_info an array with a $field subarray of
  *      url => text pairs; the subarray is replaced in place
  * @param string $field field for links default is CrawlConstants::LINKS
  * @param int $member_cache_time says how long allowed and disallowed url
  *      info should be cached by urlMemberSiteArray; it is appended to the
  *      cache names passed to that method
  */
 function pruneLinks(&$doc_info, $field = CrawlConstants::LINKS, $member_cache_time = 0)
 {
     // Guard on the field actually being processed; checking the fixed
     // LINKS key would skip (or mis-handle) calls with a custom $field.
     if (!isset($doc_info[$field])) {
         return;
     }
     $links = array();
     $allowed_name = "a" . $member_cache_time;
     $disallowed_name = "d" . $member_cache_time;
     foreach ($doc_info[$field] as $url => $text) {
         $doc_type = UrlParser::getDocumentType($url);
         if (!in_array($doc_type, $this->all_file_types)) {
             $doc_type = "unknown";
         }
         if (!in_array($doc_type, $this->indexed_file_types)) {
             continue;
         }
         // The allow-list only applies when site restriction is enabled
         if ($this->restrict_sites_by_url &&
             !UrlParser::urlMemberSiteArray($url, $this->allowed_sites, $allowed_name)) {
             continue;
         }
         if (UrlParser::urlMemberSiteArray($url, $this->disallowed_sites, $disallowed_name)) {
             continue;
         }
         $links[$url] = $text;
     }
     $doc_info[$field] = UrlParser::pruneLinks($links);
 }
Exemple #3
0
 /**
  * Checks if the $url is from a site which has an hourly quota to download.
  * If it is and the site's quota has not been used up, the site's count is
  * bumped. Urls from sites without a quota are always accepted.
  *
  * Once more than ONE_HOUR has passed since $this->quota_clear_time, all
  * quota counts are reset to 0 before the check is made, so that a url
  * arriving after the hour has elapsed is judged against a fresh count.
  *
  * @param string $url to check if within quota
  * @return bool true if $url is not from a quota site or its site is still
  *      under quota; false if the site's quota has been reached
  */
 function withinQuota($url)
 {
     // NOTE(review): the trailing `true` presumably makes
     // urlMemberSiteArray return the matching site rather than a bool,
     // since the result is used as a key into $this->quota_sites -- confirm.
     $site = UrlParser::urlMemberSiteArray($url, $this->quota_sites_keys,
         "q" . $this->allow_disallow_cache_time, true);
     if (!$site) {
         return true;
     }
     $now = time();
     if ($this->quota_clear_time + ONE_HOUR < $now) {
         // Reset first: otherwise this request would be compared against
         // last hour's counts and its own bump would be wiped out.
         $this->quota_clear_time = $now;
         foreach ($this->quota_sites as $quota_site => $info) {
             $this->quota_sites[$quota_site] = array($info[0], 0);
         }
     }
     list($quota, $current_count) = $this->quota_sites[$site];
     if ($current_count >= $quota) {
         return false;
     }
     $this->quota_sites[$site] = array($quota, $current_count + 1);
     return true;
 }
Exemple #4
0
 /**
  * This method adds robots metas to or removes entirely a summary
  * produced by a text page processor or its subclasses depending on
  * whether the summary title and description satisfy various rules
  * in $this->filter_rules
  *
  * $this->filter_rules is keyed by site pattern. The rules under the
  * 'default' key (if any) always apply. For every other key, a leading
  * '-' means the rules apply when $url does NOT match the rest of the key;
  * a leading '+' (or no prefix) means they apply when $url does match.
  *
  * For each applicable rule whose preconditions pass checkFilter: if its
  * actions contain "NOPROCESS" the summary is set to false and processing
  * stops; otherwise its actions are added to $summary[self::ROBOT_METAS].
  *
  * @param array& $summary the summary data produced by the relevant page
  *     processor's handle method; modified in-place.
  * @param string $url the url where the summary contents came from
  */
 function pageSummaryProcessing(&$summary, $url)
 {
     $filter_rules = $this->filter_rules;
     // !empty avoids an undefined index notice when no defaults are set
     $rules = !empty($filter_rules['default']) ?
         $filter_rules['default'] : array();
     foreach ($filter_rules as $site => $site_rules) {
         if ($site == "default") {
             continue;
         }
         // substr rather than $site[0] so an empty key cannot raise an
         // uninitialized string offset notice
         $prefix = substr((string) $site, 0, 1);
         $negated = ($prefix === '-');
         $check_url = ($negated || $prefix === '+') ?
             substr($site, 1) : $site;
         $is_member = UrlParser::urlMemberSiteArray($url, array($check_url),
             $url . $check_url);
         // '-' sites apply on a non-match, all others on a match
         if ($negated ? !$is_member : $is_member) {
             $rules = array_merge($rules, $site_rules);
         }
     }
     foreach ($rules as $rule) {
         $preconditions = $rule["PRECONDITIONS"];
         $actions = $rule["ACTIONS"];
         $filter_flag = $this->checkFilter($preconditions,
             $summary[self::TITLE], $summary[self::DESCRIPTION]);
         if (!$filter_flag) {
             continue;
         }
         if (in_array("NOPROCESS", $actions)) {
             crawlLog("  Word filter plugin removed page.");
             $summary = false;
             break;
         }
         if (!isset($summary[self::ROBOT_METAS])) {
             $summary[self::ROBOT_METAS] = array();
         }
         // Merge rather than array union (+): union keeps only the first
         // value per integer key, so actions from a second matching rule
         // would be silently dropped. Assumes both arrays are plain lists
         // of action names (as the in_array check above suggests).
         $summary[self::ROBOT_METAS] = array_values(array_unique(
             array_merge($summary[self::ROBOT_METAS], $actions)));
     }
 }