Example #1
 /**
  * Checks if the url belongs to one of the sites listed in site_array.
  * Sites can be given either in the form domain:host or in the form of
  * a url, in which case it is checked that the site url is a substring
  * of the passed url.
  *
  * @param string $url url to check
  * @param array $site_array sites to check against
  * @param string $name identifier to store $site_array with in this
  *     function's cache
  * @param bool $return_rule if true, return the matching site rule on a
  *     match rather than just true
  * @return mixed true or false indicating whether the url belongs to one
  *     of the sites, or the matching site rule if $return_rule is set
  */
 static function urlMemberSiteArray($url, $site_array, $name, $return_rule = false)
 {
     static $cache = array();
     if (!is_array($site_array)) {
         return false;
     }
     if (!isset($cache[$name])) {
         // crude eviction: once too many site lists are cached, start over
         if (count($cache) > 100) {
             $cache = array();
         }
         $cache[$name]["domains"] = array();
         $cache[$name]["hosts"] = array();
         $cache[$name]["paths"] = array();
         $cache[$name]["sites"] = array();
         foreach ($site_array as $site) {
             // a domain rule matches any host that ends with the given suffix
             if (strncmp($site, "domain:", 7) == 0) {
                 $cache[$name]["domains"][] = substr($site, 7);
                 continue;
             }
             list($site_host, $site_path) = UrlParser::getHostAndPath($site, true, true);
             $cache[$name]["hosts"][] = $site_host;
             $cache[$name]["paths"][] = $site_path;
             $cache[$name]["sites"][] = $site;
         }
         $cache[$name]["domains"] = array_values(array_unique($cache[$name]["domains"]));
     }
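     // at this point $cache[$name] holds the parsed rules, e.g. (made-up):
     // array("domains" => array("example.com"),
     //     "hosts" => array("https://example.org"),
     //     "paths" => array("/blog/"),
     //     "sites" => array("https://example.org/blog/"))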
     $flag = false;
     $domains =& $cache[$name]["domains"];
     $hosts =& $cache[$name]["hosts"];
     $paths =& $cache[$name]["paths"];
     $sites =& $cache[$name]["sites"];
     list($host, $path) = UrlParser::getHostAndPath($url, true, true);
     // check the domain rules first: a match means $host ends with $domain
     foreach ($domains as $domain) {
         $pos = strrpos($host, $domain);
         if ($pos !== false && $pos + strlen($domain) == strlen($host)) {
             if ($return_rule) {
                 return "domain:{$domain}";
             }
             return true;
         }
     }
     // then check the full site rules: a rule matches only if both its
     // host pattern and its path pattern match
     $count = count($sites);
     for ($i = 0; $i < $count; $i++) {
         $flag = UrlParser::isPathMemberRegexPaths($host, array($hosts[$i]));
         if (!$flag) {
             continue;
         }
         $flag = UrlParser::isPathMemberRegexPaths($path, array($paths[$i]));
         if ($flag) {
             break;
         }
     }
     if ($return_rule && $flag) {
         $flag = $sites[$i];
     }
     return $flag;
 }
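
A minimal usage sketch (hedged: it assumes this static method lives on the UrlParser class, as its internal getHostAndPath calls suggest; the site list, urls, and cache name below are made-up values):

 $site_array = array(
     "domain:example.com",        // suffix rule: any host ending in example.com
     "https://example.org/blog/"  // url rule: host and path must both match
 );
 // the third argument just names this list in the function's static cache
 $is_member = UrlParser::urlMemberSiteArray(
     "https://news.example.com/story.html", $site_array, "test_list");
 // with $return_rule = true, the matching rule itself is returned;
 // here $rule would be "domain:example.com"
 $rule = UrlParser::urlMemberSiteArray(
     "https://news.example.com/story.html", $site_array, "test_list", true);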
Example #2
 /**
  * Checks if the given $url is allowed to be crawled based on stored
  * robots.txt info.
  * @param string $url to check
  * @return bool whether it was allowed or not
  */
 function checkRobotOkay($url)
 {
     // local cache of recently seen robots.txt info
     static $robot_cache = array();
     $cache_size = 2000;
     list($host, $path) = UrlParser::getHostAndPath($url, true, true);
     $path = urldecode($path);
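     // robots.txt info is stored per host, so the cache is keyed by a hash
     // of the host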
     $key = crawlHash($host, true);
     if (isset($robot_cache[$key])) {
         $robot_object = $robot_cache[$key];
     } else {
         $data = $this->robot_table->lookup($key);
         $offset = unpackInt($data);
         $robot_object = $this->robot_archive->getObjects($offset, 1);
         $robot_cache[$key] = $robot_object;
         // evict the oldest cache entry once the cache exceeds $cache_size
         if (count($robot_cache) > $cache_size) {
             array_shift($robot_cache);
         }
     }
     $robot_paths = isset($robot_object[0][1]) ? $robot_object[0][1] : array();
     // these paths should have been urldecoded in RobotProcessor
     $robots_okay = true;
     $robots_not_okay = false;
     if (isset($robot_paths[CrawlConstants::DISALLOWED_SITES])) {
         $robots_not_okay = UrlParser::isPathMemberRegexPaths($path, $robot_paths[CrawlConstants::DISALLOWED_SITES]);
         $robots_okay = !$robots_not_okay;
     }
     if (isset($robot_paths[CrawlConstants::ALLOWED_SITES])) {
         // an explicit Allow match overrides any Disallow match
         $robots_okay = UrlParser::isPathMemberRegexPaths($path, $robot_paths[CrawlConstants::ALLOWED_SITES]);
     }
     // allowed unless a Disallow rule matched and no Allow rule matched
     return $robots_okay || !$robots_not_okay;
 }
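
A hedged sketch of a calling loop: checkRobotOkay is an instance method, so this assumes some fetcher-like object $fetcher that exposes it, and downloadUrl is a hypothetical helper, not part of the code above:

 $urls = array(
     "https://example.com/public/page.html",
     "https://example.com/cgi-bin/secret.cgi"
 );
 foreach ($urls as $url) {
     // only fetch pages the host's robots.txt rules permit
     if ($fetcher->checkRobotOkay($url)) {
         downloadUrl($url); // hypothetical download helper
     }
 }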