/**
 * Retrieves the content of a robots.txt file.
 *
 * @param PHPCrawlerURLDescriptor $Url The URL of the robots.txt file
 * @return string The content of the robots.txt, or NULL if no robots.txt was found.
 */
protected function getRobotsTxtContent(PHPCrawlerURLDescriptor $Url)
{
    // Request the robots.txt document
    $this->PageRequest->setUrl($Url);
    $PageInfo = $this->PageRequest->sendRequest();

    // Only a 200-response counts as "found"; anything else yields NULL
    if ($PageInfo->http_status_code != 200) {
        return null;
    }

    return $PageInfo->content;
}
/**
 * Gets the content from the given file or URL.
 *
 * @param string $uri                       The URI (like "file://../myfile.txt" or "http://foo.com")
 * @param string $request_user_agent_string The UserAgent-string to use for URL-requests
 * @param bool   $throw_exception           If set to true, an exception will get thrown in case of an IO-error
 * @return string The content of the URI, or NULL if the content couldn't be read.
 */
public static function getURIContent($uri, $request_user_agent_string = null, $throw_exception = false)
{
    $parts = PHPCrawlerUrlPartsDescriptor::fromURL($uri);
    $error_str = "";

    if ($parts->protocol == "file://") {
        // Local file: strip the scheme and read the file directly
        $path = preg_replace("#^file://#", "", $uri);

        if (file_exists($path) && is_readable($path)) {
            return file_get_contents($path);
        }

        $error_str = "Error reading from file '" . $path . "'";
    } elseif ($parts->protocol == "http://" || $parts->protocol == "https://") {
        // Remote document: fetch it with an internal HTTP-request
        $normalized_uri = self::normalizeURL($uri);

        $Request = new PHPCrawlerHTTPRequest();
        $Request->setUrl(new PHPCrawlerURLDescriptor($normalized_uri));

        // Apply a custom UserAgent-string if one was given
        if ($request_user_agent_string !== null) {
            $Request->userAgentString = $request_user_agent_string;
        }

        $DocInfo = $Request->sendRequest();

        if ($DocInfo->received == true) {
            return $DocInfo->source;
        }

        $error_str = "Error reading from URL '" . $normalized_uri . "'";
    } else {
        // Neither file:// nor http(s):// -> unsupported
        $error_str = "Unsupported protocol-type '" . $parts->protocol . "'";
    }

    // Report the failure, either as exception or as NULL-return
    if ($throw_exception == true) {
        throw new Exception($error_str);
    }

    return null;
}
/**
 * Receives and processes the given URL.
 *
 * Sets up the HTTP-request (cookies, basic-authentication, post-data),
 * sends it, runs the user-handler-methods, filters the found links and
 * feeds them into the URL-cache.
 *
 * @param PHPCrawlerURLDescriptor $UrlDescriptor The URL as PHPCrawlerURLDescriptor-object
 * @return bool TRUE if the crawling-process should be aborted after processing the URL, otherwise FALSE.
 */
protected function processUrl(PHPCrawlerURLDescriptor $UrlDescriptor)
{
    PHPCrawlerBenchmark::start("processing_url");

    // Setup HTTP-request
    $this->PageRequest->setUrl($UrlDescriptor);

    // Add cookies to request
    if ($this->cookie_handling_enabled == true) {
        $this->PageRequest->addCookieDescriptors($this->CookieCache->getCookiesForUrl($UrlDescriptor->url_rebuild));
    }

    // Add basic-authentications to request
    $authentication = $this->UserSendDataCache->getBasicAuthenticationForUrl($UrlDescriptor->url_rebuild);
    if ($authentication != null) {
        $this->PageRequest->setBasicAuthentication($authentication["username"], $authentication["password"]);
    }

    // Add post-data to request.
    // NOTE: each() was deprecated in PHP 7.2 and removed in PHP 8.0, so a
    // foreach is used instead. The is_array()-guard preserves the old
    // behavior of silently skipping a NULL/non-array return-value
    // (previously masked by the @-suppression on each()).
    $post_data = $this->UserSendDataCache->getPostDataForUrl($UrlDescriptor->url_rebuild);
    if (is_array($post_data)) {
        foreach ($post_data as $post_key => $post_value) {
            $this->PageRequest->addPostData($post_key, $post_value);
        }
    }

    // Do request
    $PageInfo = $this->PageRequest->sendRequest();

    if ($this->multiprocess_mode != PHPCrawlerMultiProcessModes::MPMODE_PARENT_EXECUTES_USERCODE) {
        // Check for abort
        $abort_reason = $this->checkForAbort();
        if ($abort_reason !== null) {
            return true;
        }

        $this->ProcessCommunication->updateCrawlerStatus($PageInfo);
    }

    // Remove post and cookie-data from request-object
    $this->PageRequest->clearCookies();
    $this->PageRequest->clearPostData();

    // Call user-methods if crawler doesn't run in MPMODE_PARENT_EXECUTES_USERCODE
    if ($this->multiprocess_mode != PHPCrawlerMultiProcessModes::MPMODE_PARENT_EXECUTES_USERCODE) {
        // Call the "abstract" method handlePageData
        $user_abort = false;
        $page_info = $PageInfo->toArray();
        $user_return_value = $this->handlePageData($page_info);
        if ($user_return_value < 0) {
            $user_abort = true;
        }

        // Call the "abstract" method handleDocumentInfo
        $user_return_value = $this->handleDocumentInfo($PageInfo);
        if ($user_return_value < 0) {
            $user_abort = true;
        }

        // Update status if user aborted process
        if ($user_abort == true) {
            $this->ProcessCommunication->updateCrawlerStatus(null, PHPCrawlerAbortReasons::ABORTREASON_USERABORT);
        }

        // Check for abort from other processes
        if ($this->checkForAbort() !== null) {
            return true;
        }
    }

    // Filter found URLs by defined rules
    if ($this->follow_redirects_till_content == true) {
        $crawler_status = $this->ProcessCommunication->getCrawlerStatus();

        // If content wasn't found so far and content was found NOW
        if ($crawler_status->first_content_url == null && $PageInfo->http_status_code == 200) {
            $this->ProcessCommunication->updateCrawlerStatus(null, null, $PageInfo->url);
            $this->UrlFilter->setBaseURL($PageInfo->url); // Set current page as base-URL
            $this->UrlFilter->filterUrls($PageInfo);
            $this->follow_redirects_till_content = false; // Content was found, so this can be set to FALSE
        } else {
            if ($crawler_status->first_content_url == null) {
                $this->UrlFilter->keepRedirectUrls($PageInfo); // Content wasn't found so far, so just keep redirect-urls
            } else {
                if ($crawler_status->first_content_url != null) {
                    $this->follow_redirects_till_content = false;
                    $this->UrlFilter->filterUrls($PageInfo);
                }
            }
        }
    } else {
        $this->UrlFilter->filterUrls($PageInfo);
    }

    // Add Cookies to Cookie-cache
    if ($this->cookie_handling_enabled == true) {
        $this->CookieCache->addCookies($PageInfo->cookies);
    }

    // Add filtered links to URL-cache
    $this->LinkCache->addURLs($PageInfo->links_found_url_descriptors);

    PHPCrawlerBenchmark::stop("processing_url");

    // Complete PageInfo-Object with benchmarks
    $PageInfo->benchmarks = PHPCrawlerBenchmark::getAllBenchmarks();

    if ($this->multiprocess_mode == PHPCrawlerMultiProcessModes::MPMODE_PARENT_EXECUTES_USERCODE) {
        $this->DocumentInfoQueue->addDocumentInfo($PageInfo);
    }

    // Mark URL as "followed"
    $this->LinkCache->markUrlAsFollowed($UrlDescriptor);

    PHPCrawlerBenchmark::resetAll(array("crawling_process"));

    return false;
}