Exemplo n.º 1
0
 /**
  * De-duplicates, resolves and validates the links collected in $this->links.
  *
  * Each link is processed in order:
  *  - it is dropped when it contains '..';
  *  - $domain is prepended when $domain does not occur in it (case-insensitive);
  *  - when robots checking is enabled and url_exists() reports that
  *    $domain . '/robots.txt' exists, it is dropped if
  *    Robots_txt::urlAllowed() rejects it for the page request's user agent;
  *  - it is dropped when url_exists() reports false for the resolved link.
  * Surviving links are stored back in their resolved form and the list is
  * re-indexed from zero.
  *
  * The robots.txt probe is made lazily and at most once per call, since its
  * URL does not depend on the link being examined.
  *
  * Anything thrown by Robots_txt::urlAllowed() is not caught here.
  *
  * @param string $domain     Prefix prepended to links that do not contain it;
  *                           presumably scheme + host without a trailing slash,
  *                           since '/robots.txt' is appended to it directly.
  * @param bool   $use_robots Whether robots.txt rules are consulted (any truthy
  *                           value enables the check).
  * @return array The filtered, re-indexed list of links (also kept in $this->links).
  */
 public function processLinks($domain, $use_robots = true)
 {
     $this->links = array_values(array_unique($this->links));

     // null = not probed yet; filled in the first time a link reaches the robots check.
     $robotsAvailable = null;

     foreach ($this->links as $ldx => $link) {
         // Links containing a parent-directory segment are discarded outright.
         if (stripos($link, '..') !== false) {
             unset($this->links[$ldx]);
             continue;
         }

         // Links that do not mention the domain are treated as relative to it.
         if (stripos($link, $domain) === false) {
             $link = $domain . $link;
         }

         if ($use_robots) {
             if ($robotsAvailable === null) {
                 $robotsAvailable = (bool) $this->url_exists($domain . '/robots.txt');
             }
             if ($robotsAvailable
                 && !Robots_txt::urlAllowed($link, $this->PageRequest->userAgentString)
             ) {
                 unset($this->links[$ldx]);
                 continue;
             }
         }

         if (!$this->url_exists($link)) {
             unset($this->links[$ldx]);
             continue;
         }

         $this->links[$ldx] = $link;
     }

     // Close the gaps left by the removed entries.
     $this->links = array_values($this->links);

     return $this->links;
 }
Exemplo n.º 2
0
 /**
  * Checks whether a URL may be crawled.
  *
  * The user agent is remembered in self::$strReportedUserAgent the first time
  * a non-empty value is supplied; once it is set, the $strUserAgent argument
  * of later calls is ignored and the stored value is used.
  *
  * @param string      $strUrl       The URL to check.
  * @param string|null $strUserAgent The user agent name; required (non-empty)
  *                                  until a user agent has been stored.
  * @return bool The result of the engine's __urlAllowed() check.
  * @throws Exception If no user agent has been stored and none is supplied,
  *                   if the URL is rejected by isSupportedScheme(), or if no
  *                   Robots_txt instance can be obtained.
  */
 public static final function urlAllowed($strUrl, $strUserAgent = null)
 {
     // Map the null default to '' first: passing null to trim() is deprecated as of PHP 8.1.
     $strUserAgent = $strUserAgent === null ? '' : trim($strUserAgent);

     // Only the first non-empty user agent is recorded.
     if (is_null(self::$strReportedUserAgent) && $strUserAgent !== '') {
         self::$strReportedUserAgent = $strUserAgent;
     }
     if (is_null(self::$strReportedUserAgent)) {
         throw new Exception('strUserAgent is required on first call to Robots_txt::urlAllowed()');
     }

     // NOTE(review): $arrResult is presumably filled in by reference with the
     // 'scheme', 'host' and 'url' keys read below — confirm against isSupportedScheme().
     if (!self::isSupportedScheme($strUrl, $arrResult)) {
         throw new Exception('Invalid URL: ' . $strUrl);
     }

     $objEngine = self::getInstance($arrResult['scheme'], $arrResult['host'], self::$strReportedUserAgent);
     if (!$objEngine) {
         throw new Exception('Cannot get Robots_txt instance.');
     }

     return $objEngine->__urlAllowed($arrResult['url']);
 }