/**
    @brief Normalises the collected links and removes any that may not be crawled
    @param $domain the base domain used to resolve relative links
    @param $use_robots whether to honour the site's robots.txt
    @returns array the filtered, re-indexed list of links
*/
public function processLinks($domain, $use_robots = true) {
    // Remove duplicates and re-index
    $this->links = array_unique($this->links);
    $this->links = array_values($this->links);

    foreach ($this->links as $ldx => $link) {
        // Drop links containing parent-directory traversal
        if (stristr($link, '..')) {
            unset($this->links[$ldx]);
            continue;
        }

        // Make relative links absolute
        if (!stristr($link, $domain)) {
            $link = $domain . $link;
        }

        // Drop links disallowed by the site's robots.txt
        if ($use_robots == true && $this->url_exists($domain . '/robots.txt')) {
            if (!Robots_txt::urlAllowed($link, $this->PageRequest->userAgentString)) {
                unset($this->links[$ldx]);
                continue;
            }
        }

        // Drop links that do not resolve
        if (!$this->url_exists($link)) {
            unset($this->links[$ldx]);
            continue;
        }

        $this->links[$ldx] = $link;
    }

    // Final re-order
    $this->links = array_values($this->links);

    return $this->links;
}
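For orientation, a minimal usage sketch follows. It is not from the original listing: it assumes $crawler is an instance of the crawling class that defines processLinks(), and that its internal $links array has already been filled by an earlier parsing step.

<?php
// Hypothetical example: $crawler is an instance of the crawler class shown
// above, with $crawler->links already populated from a parsed page.
$domain = 'http://www.example.com';

// Filter the harvested links: duplicates, '..' traversals, robots.txt
// disallows and dead URLs are removed; relative links become absolute.
$cleanLinks = $crawler->processLinks($domain);

// Same call, but skipping the robots.txt check entirely.
$allLinks = $crawler->processLinks($domain, false);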
/**
    @brief Checks if the url may be crawled
    @param $strUrl the url to check
    @param $strUserAgent the useragent name
    @returns boolean
    @throws Exception if $strUserAgent is missing on first call, if an
            instance cannot be created, or if an invalid url is passed
*/
public static final function urlAllowed($strUrl, $strUserAgent = null) {
    $blOut = null;

    // check userAgent
    $strUserAgent = trim($strUserAgent);
    if (is_null(self::$strReportedUserAgent) && strlen($strUserAgent)) {
        self::$strReportedUserAgent = $strUserAgent;
    }
    if (is_null(self::$strReportedUserAgent)) {
        throw new Exception('strUserAgent is required on first call to Robots_txt::urlAllowed()');
    }

    if (self::isSupportedScheme($strUrl, $arrResult)) {
        if ($objEngine = self::getInstance($arrResult['scheme'], $arrResult['host'], self::$strReportedUserAgent)) {
            $blOut = $objEngine->__urlAllowed($arrResult['url']);
        } else {
            throw new Exception('Cannot get Robots_txt instance.');
        }
    } else {
        throw new Exception('Invalid URL: ' . $strUrl);
    }

    return $blOut;
}
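Based only on the signature and exceptions shown above, calling Robots_txt::urlAllowed() directly might look like the sketch below: the user agent is mandatory on the first call, is remembered, and can then be omitted; any failure surfaces as an Exception.

<?php
// Minimal sketch of a direct call to Robots_txt::urlAllowed().
try {
    // First call must supply a user agent string.
    $allowed = Robots_txt::urlAllowed(
        'http://www.example.com/some/page.html',
        'MyCrawler/1.0'
    );

    // Later calls may omit it - the first reported user agent is reused.
    $alsoAllowed = Robots_txt::urlAllowed('http://www.example.com/other.html');

    if ($allowed) {
        // safe to fetch the page
    }
} catch (Exception $e) {
    // thrown for an invalid URL, a missing user agent on the first call,
    // or when no Robots_txt instance could be created
    echo 'robots.txt check failed: ' . $e->getMessage();
}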