/**
  * Retrieves the content of a robots.txt-file
  *
  * @param PHPCrawlerURLDescriptor $Url The URL of the robots.txt-file
  * @return string The content of the robots.txt or NULL if no robots.txt was found.
  */
 protected function getRobotsTxtContent(PHPCrawlerURLDescriptor $Url)
 {
     // Point the shared request-object at the robots.txt-URL and fetch it
     $this->PageRequest->setUrl($Url);
     $Response = $this->PageRequest->sendRequest();

     // Any status other than 200 is treated as "no robots.txt available"
     if ($Response->http_status_code != 200) {
         return null;
     }

     // Status 200: hand back the received content
     return $Response->content;
 }
 /**
  * Gets the content from the given file or URL
  *
  * @param string  $uri                        The URI (like "file://../myfile.txt" or "http://foo.com")
  * @param string  $request_user_agent_string  The UserAgent-string to use for URL-requests
  * @param bool    $throw_exception            If set to true, an exception will get thrown in case of an IO-error
  * @return string The content of the URI or NULL if the content couldn't be read
  */
 public static function getURIContent($uri, $request_user_agent_string = null, $throw_exception = false)
 {
     // Reads a local file ("file://...") or requests a HTTP(S)-URL and returns the raw content.
     // Every failure path stores a message in $error_str and falls through to the shared
     // "throw exception or return NULL" handling at the bottom of the method.
     $UriParts = PHPCrawlerUrlPartsDescriptor::fromURL($uri);
     $error_str = "";

     if ($UriParts->protocol == "file://") {
         // Strip the scheme so the remainder can be used as a plain filesystem-path
         $file = preg_replace("#^file://#", "", $uri);

         $content = false;
         if (file_exists($file) && is_readable($file)) {
             $content = file_get_contents($file);
         }

         // file_get_contents() returns FALSE on failure, which can still happen after the
         // checks above succeeded (e.g. the file vanished in between). Treat that as a
         // read-error instead of leaking FALSE to the caller.
         if ($content !== false) {
             return $content;
         }

         $error_str = "Error reading from file '" . $file . "'";
     } elseif ($UriParts->protocol == "http://" || $UriParts->protocol == "https://") {
         // Normalize the URL before handing it to the request-object
         $uri = self::normalizeURL($uri);

         $Request = new PHPCrawlerHTTPRequest();
         $Request->setUrl(new PHPCrawlerURLDescriptor($uri));

         // Only override the user-agent of the request if the caller passed one
         if ($request_user_agent_string !== null) {
             $Request->userAgentString = $request_user_agent_string;
         }

         $DocInfo = $Request->sendRequest();
         if ($DocInfo->received == true) {
             return $DocInfo->source;
         }

         $error_str = "Error reading from URL '" . $uri . "'";
     } else {
         $error_str = "Unsupported protocol-type '" . $UriParts->protocol . "'";
     }

     // Reaching this point means no content could be read: report it the way the caller asked for
     if ($throw_exception == true) {
         throw new Exception($error_str);
     }

     return null;
 }
示例#3
0
 /**
  * Receives and processes the given URL
  *
  * @param PHPCrawlerURLDescriptor $UrlDescriptor The URL as PHPCrawlerURLDescriptor-object
  * @return bool TRUE if the crawling-process should be aborted after processing the URL, otherwise FALSE.
  */
 protected function processUrl(PHPCrawlerURLDescriptor $UrlDescriptor)
 {
     // Flow: build the request -> send it -> (optionally) run the user-callbacks -> filter and
     // cache the found links -> mark the URL as followed.
     // Returns TRUE as soon as an abort is detected, otherwise FALSE.
     PHPCrawlerBenchmark::start("processing_url");

     // Setup HTTP-request
     $this->PageRequest->setUrl($UrlDescriptor);

     // Add cookies to request
     if ($this->cookie_handling_enabled == true) {
         $this->PageRequest->addCookieDescriptors($this->CookieCache->getCookiesForUrl($UrlDescriptor->url_rebuild));
     }

     // Add basic-authentications to request
     $authentication = $this->UserSendDataCache->getBasicAuthenticationForUrl($UrlDescriptor->url_rebuild);
     if ($authentication != null) {
         $this->PageRequest->setBasicAuthentication($authentication["username"], $authentication["password"]);
     }

     // Add post-data to request.
     // A guarded foreach replaces the former "@each()"-loop: each() was removed in PHP 8.0,
     // and the error-suppression only served to skip non-array values, which is now an
     // explicit check.
     $post_data = $this->UserSendDataCache->getPostDataForUrl($UrlDescriptor->url_rebuild);
     if (is_array($post_data)) {
         foreach ($post_data as $post_key => $post_value) {
             $this->PageRequest->addPostData($post_key, $post_value);
         }
     }

     // Do request
     $PageInfo = $this->PageRequest->sendRequest();

     if ($this->multiprocess_mode != PHPCrawlerMultiProcessModes::MPMODE_PARENT_EXECUTES_USERCODE) {
         // Check for abort.
         // Note: this early return happens before the cookie-/post-data of the request-object
         // gets cleared and before the "processing_url"-benchmark gets stopped.
         if ($this->checkForAbort() !== null) {
             return true;
         }

         $this->ProcessCommunication->updateCrawlerStatus($PageInfo);
     }

     // Remove post and cookie-data from request-object
     $this->PageRequest->clearCookies();
     $this->PageRequest->clearPostData();

     // Call user-methods if crawler doesn't run in MPMODE_PARENT_EXECUTES_USERCODE
     if ($this->multiprocess_mode != PHPCrawlerMultiProcessModes::MPMODE_PARENT_EXECUTES_USERCODE) {
         $user_abort = false;

         // Call the "abstract" method handlePageData, a negative return-value requests an abort
         $page_info = $PageInfo->toArray();
         $user_return_value = $this->handlePageData($page_info);
         if ($user_return_value < 0) {
             $user_abort = true;
         }

         // Call the "abstract" method handleDocumentInfo, a negative return-value requests an abort
         $user_return_value = $this->handleDocumentInfo($PageInfo);
         if ($user_return_value < 0) {
             $user_abort = true;
         }

         // Update status if user aborted process
         if ($user_abort == true) {
             $this->ProcessCommunication->updateCrawlerStatus(null, PHPCrawlerAbortReasons::ABORTREASON_USERABORT);
         }

         // Check for abort (this also covers aborts coming from other processes)
         if ($this->checkForAbort() !== null) {
             return true;
         }
     }

     // Filter found URLs by defined rules
     if ($this->follow_redirects_till_content == true) {
         $crawler_status = $this->ProcessCommunication->getCrawlerStatus();

         if ($crawler_status->first_content_url == null && $PageInfo->http_status_code == 200) {
             // Content wasn't found so far and content was found NOW:
             // set current page as base-URL and filter its links
             $this->ProcessCommunication->updateCrawlerStatus(null, null, $PageInfo->url);
             $this->UrlFilter->setBaseURL($PageInfo->url);
             $this->UrlFilter->filterUrls($PageInfo);

             // Content was found, so this can be set to FALSE
             $this->follow_redirects_till_content = false;
         } elseif ($crawler_status->first_content_url == null) {
             // Content wasn't found so far, so just keep redirect-urls
             $this->UrlFilter->keepRedirectUrls($PageInfo);
         } else {
             // The crawler-status already holds a first content-URL, so switch to regular filtering
             $this->follow_redirects_till_content = false;
             $this->UrlFilter->filterUrls($PageInfo);
         }
     } else {
         $this->UrlFilter->filterUrls($PageInfo);
     }

     // Add Cookies to Cookie-cache
     if ($this->cookie_handling_enabled == true) {
         $this->CookieCache->addCookies($PageInfo->cookies);
     }

     // Add filtered links to URL-cache
     $this->LinkCache->addURLs($PageInfo->links_found_url_descriptors);

     PHPCrawlerBenchmark::stop("processing_url");

     // Complete PageInfo-Object with benchmarks
     $PageInfo->benchmarks = PHPCrawlerBenchmark::getAllBenchmarks();

     // In MPMODE_PARENT_EXECUTES_USERCODE the document-info gets queued instead of being
     // passed to the user-methods above
     if ($this->multiprocess_mode == PHPCrawlerMultiProcessModes::MPMODE_PARENT_EXECUTES_USERCODE) {
         $this->DocumentInfoQueue->addDocumentInfo($PageInfo);
     }

     // Mark URL as "followed"
     $this->LinkCache->markUrlAsFollowed($UrlDescriptor);

     // Reset all benchmarks, passing "crawling_process" along to resetAll()
     PHPCrawlerBenchmark::resetAll(array("crawling_process"));

     return false;
 }