Example #1
0
 /**
  * Test case for UrlParser::isPathMemberRegexPaths. Checks whether a path
  * is reported as matching a list of path patterns of the kind that
  * presumably come from a robots.txt file.
  *
  * Each row of the table below holds, in order: the path to test, the
  * list of patterns to test it against, the expected boolean result, and
  * a description that is passed to the assertion. The rows exercise plain
  * prefix patterns, patterns containing *, patterns ending in $, and a
  * pattern ending in ?.
  */
 function isPathMemberRegexPathsTestCase()
 {
     $tests = array(
         array("/bobby", array("/bob"), true, "Substring Positive"),
         array("/bobby", array("/alice", "/f/g/h/d"), false, "Substring Negative 1"),
         array("/bobby/", array("/bobby/bay", "/f/g/h/d", "/yo"), false, "Substring Negative 2"),
         array("/bay/bobby/", array("/bobby/", "/f/g/h/d", "/yo"), false, "Substring Negative 3 (should match start)"),
         array("http://test.com/bay/bobby/", array("/bobby/", "/f/g/h/d", "/yo"), false, "Substring Negative 4 (should match start)"),
         array("/a/bbbb/c/", array("/bobby/bay", "/a/*/c/", "/yo"), true, "Star Positive 1"),
         array("/a/bbbb/d/", array("/bobby/bay", "/a/*/c/", "/yo"), false, "Star Negative 1"),
         array("/test.html?a=b", array("/bobby/bay", "/*?", "/yo"), true, "Star Positive 2"),
         array("/test.html", array("/bobby/bay", "/*.html\$", "/yo"), true, "Dollar Positive 1"),
         array("/test.htmlish", array("/bobby/bay", "/*.html\$", "/yo"), false, "Dollar Negative 1"),
         array("/test.htmlish", array("/bobby/bay", "*", "/yo"), true, "Degenerate 1"),
         array("/test.html", array("/bobby/bay", "/**.html\$", "/yo"), true, "Degenerate 2"),
         array("/videos/search?q=Angelina+Jolie", array("/videos/search?"), true, "End With Question Regex Case 1"),
     );
     foreach ($tests as $test) {
         list($path, $robot_paths, $expected, $description) = $test;
         $this->assertEqual(UrlParser::isPathMemberRegexPaths($path, $robot_paths), $expected, $description);
     }
 }
Example #2
0
 /**
  * Checks if the url belongs to one of the sites listed in site_array.
  * Sites can be given either in the form domain:host or in the form of a
  * url. A domain: rule matches when the text after "domain:" is a suffix
  * of the url's host. A url rule matches when both its host and its path
  * (as split by UrlParser::getHostAndPath) match the host and path of the
  * passed url according to UrlParser::isPathMemberRegexPaths.
  *
  * The parsed form of $site_array is kept in a function-level static cache
  * keyed only by $name: a later call using the same $name reuses the
  * rules parsed on the first call, even if a different $site_array is
  * passed. The whole cache is emptied once it holds more than 100 names.
  *
  * @param string $url url to check
  * @param array $site_array sites to check against
  * @param string $name identifier to store $site_array with in this
  *     function's cache
  * @param bool $return_rule whether when a match is found to return true or
  *     to return the matching site rule
  * @return mixed false if $site_array is not an array or if no rule
  *     matches; otherwise true, or the matching rule string when
  *     $return_rule is set
  */
 static function urlMemberSiteArray($url, $site_array, $name, $return_rule = false)
 {
     static $cache = array();
     if (!is_array($site_array)) {
         return false;
     }
     if (!isset($cache[$name])) {
         if (count($cache) > 100) {
             $cache = array();
         }
         $entry = array(
             "domains" => array(),
             "hosts" => array(),
             "paths" => array(),
             "sites" => array(),
         );
         foreach ($site_array as $site) {
             if (strncmp($site, "domain:", 7) === 0) {
                 $domain = substr($site, 7);
                 // A bare "domain:" rule has nothing to match on. Keeping an
                 // empty needle would make the suffix test below succeed for
                 // every host on PHP 8, where strrpos accepts "".
                 if ($domain !== "" && $domain !== false) {
                     $entry["domains"][] = $domain;
                 }
                 continue;
             }
             list($site_host, $site_path) = UrlParser::getHostAndPath($site, true, true);
             // hosts, paths and sites are parallel lists sharing one index
             $entry["hosts"][] = $site_host;
             $entry["paths"][] = $site_path;
             $entry["sites"][] = $site;
         }
         $entry["domains"] = array_values(array_unique($entry["domains"]));
         $cache[$name] = $entry;
     }
     $rules = $cache[$name];
     list($host, $path) = UrlParser::getHostAndPath($url, true, true);
     $host_length = strlen($host);
     foreach ($rules["domains"] as $domain) {
         // suffix test: the last occurrence of $domain must end the host
         $pos = strrpos($host, $domain);
         if ($pos !== false && $pos + strlen($domain) === $host_length) {
             return $return_rule ? "domain:{$domain}" : true;
         }
     }
     foreach ($rules["sites"] as $i => $site) {
         // the path is only examined once the host has matched
         if (UrlParser::isPathMemberRegexPaths($host, array($rules["hosts"][$i]))
             && UrlParser::isPathMemberRegexPaths($path, array($rules["paths"][$i]))) {
             return $return_rule ? $site : true;
         }
     }
     return false;
 }
Example #3
0
 /**
  * Decides, from stored robots.txt info, whether the given $url may be
  * crawled.
  *
  * The robots.txt record for the url's host is looked up through
  * robot_table and robot_archive and remembered in a function-level static
  * cache; once that cache grows past 2000 entries its first entry is
  * dropped. A url is refused only when its path matches a disallowed rule
  * and, if allowed rules exist, matches none of them.
  *
  * @param string $url to check
  * @return bool whether it was allowed or not
  */
 function checkRobotOkay($url)
 {
     // per-process memory of recently used robots.txt records
     static $robot_cache = array();
     $cache_size = 2000;
     list($host, $path) = UrlParser::getHostAndPath($url, true, true);
     $path = urldecode($path);
     $key = crawlHash($host, true);
     if (!isset($robot_cache[$key])) {
         // cache miss: resolve the record's offset, then fetch the record
         $offset = unpackInt($this->robot_table->lookup($key));
         $robot_object = $this->robot_archive->getObjects($offset, 1);
         $robot_cache[$key] = $robot_object;
         if (count($robot_cache) > $cache_size) {
             // NOTE(review): array_shift renumbers integer keys; this
             // assumes crawlHash keys are never integer-like -- confirm
             array_shift($robot_cache);
         }
     } else {
         $robot_object = $robot_cache[$key];
     }
     if (isset($robot_object[0][1])) {
         $rules = $robot_object[0][1];
     } else {
         $rules = array();
     }
     // the stored rules should have been urldecoded in RobotProcessor
     $is_disallowed = false;
     if (isset($rules[CrawlConstants::DISALLOWED_SITES])) {
         $is_disallowed = UrlParser::isPathMemberRegexPaths($path, $rules[CrawlConstants::DISALLOWED_SITES]);
     }
     // with no allowed rules, the verdict is just "not disallowed"
     $is_allowed = !$is_disallowed;
     if (isset($rules[CrawlConstants::ALLOWED_SITES])) {
         $is_allowed = UrlParser::isPathMemberRegexPaths($path, $rules[CrawlConstants::ALLOWED_SITES]);
     }
     // refuse only when disallowed and not rescued by an allowed rule
     return !($is_disallowed && !$is_allowed);
 }