/**
 * Checks if the url belongs to one of the sites listed in site_array.
 * Sites can be either given in the form domain:host or in the form of
 * a url, in which case it is checked that the site url is a substring
 * of the passed url.
 *
 * @param string $url url to check
 * @param array $site_array sites to check against
 * @param string $name identifier to store $site_array with in this
 *     function's cache
 * @param bool $return_rule whether when a match is found to return true
 *     or to return the matching site rule
 * @return mixed false when $site_array is not an array or no rule
 *     matches; otherwise true, or the matching rule string when
 *     $return_rule is set
 */
static function urlMemberSiteArray($url, $site_array, $name,
    $return_rule = false)
{
    static $cache = array();
    if (!is_array($site_array)) {
        return false;
    }
    if (!isset($cache[$name])) {
        /* crude bound on the memo cache so a long-running process
           does not accumulate parsed site lists without limit */
        if (count($cache) > 100) {
            $cache = array();
        }
        $cache[$name]["domains"] = array();
        $cache[$name]["hosts"] = array();
        $cache[$name]["paths"] = array();
        $cache[$name]["sites"] = array();
        foreach ($site_array as $site) {
            // "domain:" rules are kept separately from url rules
            if (strncmp($site, "domain:", 7) == 0) {
                $cache[$name]["domains"][] = substr($site, 7);
                continue;
            }
            // url rules are pre-split into host and path patterns so
            // the matching loop below need not re-parse them
            list($site_host, $site_path) =
                UrlParser::getHostAndPath($site, true, true);
            $cache[$name]["hosts"][] = $site_host;
            $cache[$name]["paths"][] = $site_path;
            $cache[$name]["sites"][] = $site;
        }
        $cache[$name]["domains"] =
            array_values(array_unique($cache[$name]["domains"]));
    }
    $flag = false;
    $domains =& $cache[$name]["domains"];
    $hosts =& $cache[$name]["hosts"];
    $paths =& $cache[$name]["paths"];
    $sites =& $cache[$name]["sites"];
    list($host, $path) = UrlParser::getHostAndPath($url, true, true);
    /* a domain: rule matches when the rule text is a suffix of the
       url's host (checked via the position of its last occurrence) */
    foreach ($domains as $domain) {
        $pos = strrpos($host, $domain);
        if ($pos !== false && $pos + strlen($domain) == strlen($host)) {
            if ($return_rule) {
                return "domain:{$domain}";
            }
            return true;
        }
    }
    $count = count($sites);
    for ($i = 0; $i < $count; $i++) {
        // a url rule matches only if both its host pattern and its
        // path pattern match the corresponding parts of $url
        $flag = UrlParser::isPathMemberRegexPaths($host,
            array($hosts[$i]));
        if (!$flag) {
            continue;
        }
        $flag = UrlParser::isPathMemberRegexPaths($path,
            array($paths[$i]));
        if ($flag) {
            break;
        }
    }
    /* after the break above, $i still indexes the rule that matched,
       so it can be used to return the rule itself */
    if ($return_rule && $flag) {
        $flag = $sites[$i];
    }
    return $flag;
}
/**
 * Checks if the given $url is allowed to be crawled based on stored
 * robots.txt info.
 *
 * @param string $url to check
 * @return bool whether it was allowed or not
 */
function checkRobotOkay($url)
{
    // local cache of recently looked-up robots.txt records
    static $robot_cache = array();
    $cache_size = 2000;
    list($host, $path) = UrlParser::getHostAndPath($url, true, true);
    $path = urldecode($path);
    $key = crawlHash($host, true);
    if (!isset($robot_cache[$key])) {
        /* cache miss: get the packed archive offset for this host from
           the lookup table, then read the robots record at that offset */
        $lookup_data = $this->robot_table->lookup($key);
        $archive_offset = unpackInt($lookup_data);
        $robot_cache[$key] =
            $this->robot_archive->getObjects($archive_offset, 1);
        /* FIFO eviction: drop the oldest entry once the cache passes
           $cache_size; the entry just added is last, so it survives */
        if (count($robot_cache) > $cache_size) {
            array_shift($robot_cache);
        }
    }
    $robot_object = $robot_cache[$key];
    $robot_paths = isset($robot_object[0][1]) ? $robot_object[0][1]
        : array();
    // these should have been urldecoded in RobotProcessor
    $is_allowed = true;
    $is_disallowed = false;
    if (isset($robot_paths[CrawlConstants::DISALLOWED_SITES])) {
        $is_disallowed = UrlParser::isPathMemberRegexPaths($path,
            $robot_paths[CrawlConstants::DISALLOWED_SITES]);
        $is_allowed = !$is_disallowed;
    }
    if (isset($robot_paths[CrawlConstants::ALLOWED_SITES])) {
        // an Allow list, when present, recomputes the allowed verdict
        $is_allowed = UrlParser::isPathMemberRegexPaths($path,
            $robot_paths[CrawlConstants::ALLOWED_SITES]);
    }
    /* default-allow: okay unless explicitly disallowed and not
       re-allowed by an Allow rule */
    return $is_allowed || !$is_disallowed;
}