Beispiel #1
  * urlMemberSiteArray is a function called by both allowedToCrawlSite
  * and disallowedToCrawlSite to test if a url belongs to a list of
  * regexes of urls or domains. This test function tests this functionality
 function urlMemberSiteArrayTestCase()
 {
     // Site patterns handed to UrlParser::urlMemberSiteArray for every check.
     $sites = array("", "*/*/", "*&*&", "http://*.cool.*/a/*/", "", "");
     // Each fixture is a triple: url to test, expected result, assertion
     // message reported by assertEqual.
     $test_urls = array(
         array("", false, "regex url negative 1"),
         array("", false, "regex url negative 2"),
         array("", false, "regex url negative 3"),
         array("", false, "domain test negative"),
         array("", true, "regex url positive 1"),
         array("", true, "regex url positive 2"),
         array("", true, "regex url positive 3"),
         array("", true, "domain test positive"),
         array("", true, "domain https test positive"),
         array("gopher://", true, "domain gopher stest positive"),
         array("", false, "domain test negative"),
     );
     foreach ($test_urls as $test_url) {
         list($url, $expected, $message) = $test_url;
         // "s" is the cache name passed as the third argument on every call.
         $result = UrlParser::urlMemberSiteArray($url, $sites, "s");
         $this->assertEqual($result, $expected, $message);
     }
 }
Beispiel #2
  * Page processors are allowed to extract up to MAX_LINKS_TO_EXTRACT
  * This method attempts to cull from the doc_info struct the
  * best MAX_LINKS_PER_PAGE. Currently, this is done by first removing
  * links whose filetype or site the crawler is forbidden from crawling.
  * Then a crude estimate of the information contained in the link's text:
  * strlen(gzip(text)) is used to extract the best remaining links.
  * @param array& $doc_info an array with a CrawlConstants::LINKS subarray
  * This subarray in turn contains url => text pairs.
  * @param string $field field for links default is CrawlConstants::LINKS
  * @param int $member_cache_time says how long allowed and disallowed url
  *      info should be cached by urlMemberSiteArray
 function pruneLinks(&$doc_info, $field = CrawlConstants::LINKS, $member_cache_time = 0)
     // NOTE(review): this excerpt has lost its brace-only lines and the
     // bodies of several guards. The notes below mark where statements
     // appear to be missing -- confirm against the upstream source.

     // Guard on the LINKS subarray being present.
     // NOTE(review): the guard body is missing here (presumably an early
     // return). Also note it tests self::LINKS while the loop below reads
     // $doc_info[$field]; the two differ when a caller passes another $field.
     if (!isset($doc_info[self::LINKS])) {
     // Links that survive the filters below, keyed by url.
     $links = array();
     // Names passed as the third argument of urlMemberSiteArray; they embed
     // $member_cache_time so different cache times use different names.
     $allowed_name = "a" . $member_cache_time;
     $disallowed_name = "d" . $member_cache_time;
     foreach ($doc_info[$field] as $url => $text) {
         $doc_type = UrlParser::getDocumentType($url);
         // Types not listed in all_file_types are treated as "unknown".
         if (!in_array($doc_type, $this->all_file_types)) {
             $doc_type = "unknown";
         // NOTE(review): guard body missing -- presumably skips links whose
         // type is not in indexed_file_types; confirm.
         if (!in_array($doc_type, $this->indexed_file_types)) {
         // The allowed-sites check only applies when restrict_sites_by_url
         // is set.
         if ($this->restrict_sites_by_url) {
             // NOTE(review): guard body missing -- presumably skips urls that
             // are not in allowed_sites; confirm.
             if (!UrlParser::urlMemberSiteArray($url, $this->allowed_sites, $allowed_name)) {
         // NOTE(review): guard body missing -- presumably skips urls that
         // match disallowed_sites; confirm.
         if (UrlParser::urlMemberSiteArray($url, $this->disallowed_sites, $disallowed_name)) {
         $links[$url] = $text;
     // The collected links are handed to UrlParser::pruneLinks and its
     // result is written back into $doc_info in place.
     $doc_info[$field] = UrlParser::pruneLinks($links);
Beispiel #3
  * Checks if the $url is from a site which has an hourly quota to download.
  * If so, it bumps the quota count and returns true while the quota has not
  * been used up, and false once it has; urls from sites without a quota
  * always return true. This method also resets the quota counts once more
  * than an hour has passed since they were last cleared.
  * @param string $url to check if within quota
  * @return bool whether $url is within the hourly quota of the site it is from
 function withinQuota($url)
 {
     // The lookup result is used below as the key into $this->quota_sites;
     // a falsy result means no quota entry matched, so the url is allowed.
     $cache_name = "q" . $this->allow_disallow_cache_time;
     $site = UrlParser::urlMemberSiteArray($url, $this->quota_sites_keys, $cache_name, true);
     if (!$site) {
         return true;
     }
     // Each quota_sites entry is a pair: (hourly quota, count used so far).
     list($quota, $current_count) = $this->quota_sites[$site];
     $flag = $current_count < $quota;
     if ($flag) {
         // Still under quota: record this download against the site.
         $this->quota_sites[$site] = array($quota, $current_count + 1);
     }
     // NOTE(review): the reset runs after the check above, so the first url
     // seen after the hour rolls over is still judged against the old counts
     // and its own increment is then zeroed. Confirm whether this ordering
     // is intended.
     if ($this->quota_clear_time + ONE_HOUR < time()) {
         $this->quota_clear_time = time();
         // A separate loop variable keeps the matched $site from being
         // overwritten while every counter is zeroed.
         foreach ($this->quota_sites as $quota_site => $info) {
             $this->quota_sites[$quota_site] = array($info[0], 0);
         }
     }
     return $flag;
 }
Beispiel #4
  * This method adds robots metas to or removes entirely a summary
  * produced by a text page processor or its subclasses depending on
  * whether the summary title and description satisfy various rules
  * in $this->filter_rules
  * @param array& $summary the summary data produced by the relevant page
  *     processor's handle method; modified in-place.
  * @param string $url the url where the summary contents came from
 function pageSummaryProcessing(&$summary, $url)
     $sites = array_keys($this->filter_rules);
     $filter_rules = $this->filter_rules;
     $rules = $filter_rules['default'] ? $filter_rules['default'] : array();
     foreach ($sites as $site) {
         if ($site == "default") {
         $sign = $site[0] == '-' ? false : true;
         if (!$sign || $site[0] == '+') {
             $check_url = substr($site, 1);
         } else {
             $check_url = $site;
         if ($sign && UrlParser::urlMemberSiteArray($url, array($check_url), $url . $check_url) || !$sign && !UrlParser::urlMemberSiteArray($url, array($check_url), $url . $check_url)) {
             $rules = array_merge($rules, $filter_rules[$site]);
     foreach ($rules as $rule) {
         $preconditions = $rule["PRECONDITIONS"];
         $actions = $rule["ACTIONS"];
         $filter_flag = $this->checkFilter($preconditions, $summary[self::TITLE], $summary[self::DESCRIPTION]);
         if ($filter_flag) {
             if (in_array("NOPROCESS", $actions)) {
                 crawlLog("  Word filter plugin removed page.");
                 $summary = false;
             } else {
                 if (!isset($summary[self::ROBOT_METAS])) {
                     $summary[self::ROBOT_METAS] = array();
                 $summary[self::ROBOT_METAS] += $actions;