/**
 * Checks whether a path matches against a list of paths presumably coming
 * from a robots.txt file. Each case supplies the path to test, the pattern
 * list, the expected boolean outcome, and a label reported on failure.
 */
function isPathMemberRegexPathsTestCase()
{
    $test_cases = array(
        array("/bobby", array("/bob"), true, "Substring Positive"),
        array("/bobby", array("/alice", "/f/g/h/d"), false,
            "Substring Negative 1"),
        array("/bobby/", array("/bobby/bay", "/f/g/h/d", "/yo"), false,
            "Substring Negative 2"),
        array("/bay/bobby/", array("/bobby/", "/f/g/h/d", "/yo"), false,
            "Substring Negative 3 (should match start)"),
        array("http://test.com/bay/bobby/",
            array("/bobby/", "/f/g/h/d", "/yo"), false,
            "Substring Negative 4 (should match start)"),
        array("/a/bbbb/c/", array("/bobby/bay", "/a/*/c/", "/yo"), true,
            "Star Positive 1"),
        array("/a/bbbb/d/", array("/bobby/bay", "/a/*/c/", "/yo"), false,
            "Star Negative 1"),
        array("/test.html?a=b", array("/bobby/bay", "/*?", "/yo"), true,
            "Star Positive 2"),
        array("/test.html", array("/bobby/bay", '/*.html$', "/yo"), true,
            "Dollar Positive 1"),
        array("/test.htmlish", array("/bobby/bay", '/*.html$', "/yo"),
            false, "Dollar Negative 1"),
        array("/test.htmlish", array("/bobby/bay", "*", "/yo"), true,
            "Degenerate 1"),
        array("/test.html", array("/bobby/bay", '/**.html$', "/yo"), true,
            "Degenerate 2"),
        array("/videos/search?q=Angelina+Jolie",
            array("/videos/search?"), true,
            "End With Question Regex Case 1"),
    );
    foreach ($test_cases as $case) {
        list($test_path, $patterns, $expected, $label) = $case;
        $this->assertEqual(
            UrlParser::isPathMemberRegexPaths($test_path, $patterns),
            $expected, $label);
    }
}
/**
 * Checks if the url belongs to one of the sites listed in site_array.
 * Sites can be given either in the form domain:host, in which case the
 * domain must be a suffix of the url's host, or as a site url, in which
 * case the site's host and path patterns must both match those of the
 * passed url.
 *
 * @param string $url url to check
 * @param array $site_array sites to check against
 * @param string $name identifier under which the parsed $site_array is
 *     memoized in this function's static cache
 * @param bool $return_rule whether, when a match is found, to return true
 *     or to return the matching site rule
 * @return mixed whether the url belongs to one of the sites (or the
 *     matching rule when $return_rule is set); false when no match
 */
static function urlMemberSiteArray($url, $site_array, $name,
    $return_rule = false)
{
    static $cache = array();
    if (!is_array($site_array)) {
        return false;
    }
    if (!isset($cache[$name])) {
        // bound the number of memoized site lists kept per process
        if (count($cache) > 100) {
            $cache = array();
        }
        $parsed = array("domains" => array(), "hosts" => array(),
            "paths" => array(), "sites" => array());
        foreach ($site_array as $site) {
            if (strncmp($site, "domain:", 7) == 0) {
                // strlen("domain:") == 7; keep just the domain suffix
                $parsed["domains"][] = substr($site, 7);
            } else {
                list($site_host, $site_path) =
                    UrlParser::getHostAndPath($site, true, true);
                $parsed["hosts"][] = $site_host;
                $parsed["paths"][] = $site_path;
                $parsed["sites"][] = $site;
            }
        }
        $parsed["domains"] =
            array_values(array_unique($parsed["domains"]));
        $cache[$name] = $parsed;
    }
    list($host, $path) = UrlParser::getHostAndPath($url, true, true);
    foreach ($cache[$name]["domains"] as $domain) {
        $pos = strrpos($host, $domain);
        // a domain rule matches when $domain is a suffix of $host
        if ($pos !== false && $pos + strlen($domain) == strlen($host)) {
            return $return_rule ? "domain:{$domain}" : true;
        }
    }
    $hosts = $cache[$name]["hosts"];
    $paths = $cache[$name]["paths"];
    $sites = $cache[$name]["sites"];
    $num_sites = count($sites);
    for ($i = 0; $i < $num_sites; $i++) {
        // a site rule matches only if both its host and path patterns do
        if (UrlParser::isPathMemberRegexPaths($host, array($hosts[$i])) &&
            UrlParser::isPathMemberRegexPaths($path, array($paths[$i]))) {
            return $return_rule ? $sites[$i] : true;
        }
    }
    return false;
}
/**
 * Checks if the given $url is allowed to be crawled based on stored
 * robots.txt info. Disallow rules for the url's host are consulted first;
 * an Allow rule, if present, can then override the result for the path.
 *
 * @param string $url to check
 * @return bool whether it was allowed or not
 */
function checkRobotOkay($url)
{
    // per-process memo of recently fetched robots.txt records
    static $recent_robots = array();
    $max_entries = 2000;
    list($host, $path) = UrlParser::getHostAndPath($url, true, true);
    $path = urldecode($path);
    $key = crawlHash($host, true);
    if (!isset($recent_robots[$key])) {
        $data = $this->robot_table->lookup($key);
        $offset = unpackInt($data);
        $recent_robots[$key] =
            $this->robot_archive->getObjects($offset, 1);
        if (count($recent_robots) > $max_entries) {
            // drop the entry inserted longest ago (FIFO eviction)
            array_shift($recent_robots);
        }
    }
    $robot_record = $recent_robots[$key];
    $rules = isset($robot_record[0][1]) ? $robot_record[0][1] : array();
    // these should have been urldecoded in RobotProcessor
    $is_disallowed = false;
    $is_allowed = true;
    if (isset($rules[CrawlConstants::DISALLOWED_SITES])) {
        $is_disallowed = UrlParser::isPathMemberRegexPaths($path,
            $rules[CrawlConstants::DISALLOWED_SITES]);
        $is_allowed = !$is_disallowed;
    }
    if (isset($rules[CrawlConstants::ALLOWED_SITES])) {
        $is_allowed = UrlParser::isPathMemberRegexPaths($path,
            $rules[CrawlConstants::ALLOWED_SITES]);
    }
    // permitted when an Allow rule matched or no Disallow rule did
    return $is_allowed || !$is_disallowed;
}