/**
 * Retrieves the content of a robots.txt-file
 *
 * @param PHPCrawlerURLDescriptor $Url The URL of the robots.txt-file
 * @return string The content of the robots.txt or NULL if no robots.txt was found.
 */
protected function getRobotsTxtContent(PHPCrawlerURLDescriptor $Url)
{
  // Request the robots.txt
  $this->PageRequest->setUrl($Url);
  $PageInfo = $this->PageRequest->sendRequest();

  // Return the content of the robots.txt-file if it was found, otherwise
  // return NULL
  if ($PageInfo->http_status_code == 200)
  {
    return $PageInfo->content;
  }
  else
  {
    return null;
  }
}
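// Usage sketch (assumptions: this snippet runs inside the same class or a subclass,
// and the PHPCrawlerURLDescriptor-constructor accepts the URL-string as its first
// argument; the "Disallow"-extraction below is illustrative only and NOT part of
// phpcrawl's own robots.txt-parsing):
$RobotsTxtUrl = new PHPCrawlerURLDescriptor("http://www.example.com/robots.txt");
$robots_txt_content = $this->getRobotsTxtContent($RobotsTxtUrl);

if ($robots_txt_content !== null)
{
  // Pull all "Disallow"-paths out of the file for further processing
  preg_match_all("#^Disallow:\s*(.*)$#mi", $robots_txt_content, $matches);
  $disallowed_paths = $matches[1];
}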
/**
 * Gets the content from the given file or URL
 *
 * @param string $uri                       The URI (like "file://../myfile.txt" or "http://foo.com")
 * @param string $request_user_agent_string The UserAgent-string to use for URL-requests
 * @param bool   $throw_exception           If set to true, an exception will get thrown in case of an IO-error
 * @return string The content of the URI or NULL if the content couldn't be read
 */
public static function getURIContent($uri, $request_user_agent_string = null, $throw_exception = false)
{
  $UriParts = PHPCrawlerUrlPartsDescriptor::fromURL($uri);

  $error_str = "";

  // If protocol is "file"
  if ($UriParts->protocol == "file://")
  {
    $file = preg_replace("#^file://#", "", $uri);

    if (file_exists($file) && is_readable($file))
    {
      return file_get_contents($file);
    }
    else
    {
      $error_str = "Error reading from file '" . $file . "'";
    }
  }
  // If protocol is "http" or "https"
  elseif ($UriParts->protocol == "http://" || $UriParts->protocol == "https://")
  {
    $uri = self::normalizeURL($uri);
    $Request = new PHPCrawlerHTTPRequest();
    $Request->setUrl(new PHPCrawlerURLDescriptor($uri));

    if ($request_user_agent_string !== null)
    {
      $Request->userAgentString = $request_user_agent_string;
    }

    $DocInfo = $Request->sendRequest();

    if ($DocInfo->received == true)
    {
      return $DocInfo->source;
    }
    else
    {
      $error_str = "Error reading from URL '" . $uri . "'";
    }
  }
  // Unsupported protocol
  else
  {
    $error_str = "Unsupported protocol-type '" . $UriParts->protocol . "'";
  }

  // Throw an exception in case of an error?
  if ($throw_exception == true)
  {
    throw new Exception($error_str);
  }

  return null;
}
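// Usage sketch (assumption: this static method lives on a utility-class such as
// PHPCrawlerUtils; adjust the class-name to wherever the method is actually defined):
$content = PHPCrawlerUtils::getURIContent("http://www.example.com/foo.txt", "MyCrawler/1.0");

if ($content === null)
{
  echo "Couldn't read the URI.";
}

// Or let IO-errors throw an exception instead of returning NULL:
try
{
  $content = PHPCrawlerUtils::getURIContent("file://../myfile.txt", null, true);
}
catch (Exception $e)
{
  echo "IO-error: " . $e->getMessage();
}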
/**
 * Enables support/requests for gzip-encoded content.
 *
 * If set to TRUE, the crawler will request gzip-encoded content from webservers.
 * This will result in reduced data traffic while crawling websites, but the CPU load
 * will rise because the encoded content has to be decoded locally.
 *
 * By default, gzip-requests are disabled for compatibility reasons with earlier versions of phpcrawl.
 *
 * Please note: If gzip-requests are disabled, but a webserver returns gzip-encoded content nevertheless,
 * the crawler will handle the encoded data correctly regardless of this setting.
 *
 * @param bool $mode Set to TRUE to enable support/requests for gzip-encoded content, defaults to FALSE
 * @section 10 Other settings
 */
public function requestGzipContent($mode)
{
  return $this->PageRequest->requestGzipContent($mode);
}
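// Usage sketch (assumes the usual phpcrawl setup where MyCrawler extends the
// crawler-class and the crawler exposes setURL() and go()):
$crawler = new MyCrawler();
$crawler->setURL("www.example.com");

// Request gzip-encoded content to reduce traffic (content gets decoded locally)
$crawler->requestGzipContent(true);

$crawler->go();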
/**
 * Adds a rule to the list of rules that decide in which kinds of documents the crawler
 * should search for links (regarding their content-type).
 *
 * By default, the crawler ONLY searches for links in documents of type "text/html".
 * Use this method to add one or more other content-types the crawler should check for links.
 *
 * Example:
 * <code>
 * $crawler->addLinkSearchContentType("#text/css# i");
 * $crawler->addLinkSearchContentType("#text/xml# i");
 * </code>
 * These rules let the crawler search for links in HTML-, CSS- and XML-documents.
 *
 * <b>Please note:</b> It is NOT recommended to let the crawler check for links in EVERY document-
 * type! This could slow down the crawling-process dramatically (e.g. if the crawler receives large
 * binary-files like images and tries to find links in them).
 *
 * @param string $regex Regular-expression defining the rule
 * @return bool TRUE if the rule was successfully added
 *
 * @section 6 Linkfinding settings
 */
public function addLinkSearchContentType($regex)
{
  return $this->PageRequest->addLinkSearchContentType($regex);
}
/**
 * Defines the sections of HTML-documents that will get ignored by the link-finding algorithm.
 *
 * By default, phpcrawl searches for links in the entire documents it receives during the crawling-process.
 * This sometimes brings up some non-existing "phantom-URLs" because the crawler recognized e.g. some javascript-code
 * as a link that was not meant to be one, or the crawler found a link inside an html-comment that doesn't exist anymore.
 *
 * By using this method, users can define which predefined sections of HTML-documents should get ignored when it comes
 * to finding links.
 *
 * See the {@link PHPCrawlerLinkSearchDocumentSections}-constants for all predefined sections.
 *
 * Example 1:
 * <code>
 * // Let the crawler ignore script-sections and html-comment-sections when finding links
 * $crawler->excludeLinkSearchDocumentSections(PHPCrawlerLinkSearchDocumentSections::SCRIPT_SECTIONS |
 *                                             PHPCrawlerLinkSearchDocumentSections::HTML_COMMENT_SECTIONS);
 * </code>
 * Example 2:
 * <code>
 * // Let the crawler ignore all special sections except HTML-comments
 * $crawler->excludeLinkSearchDocumentSections(PHPCrawlerLinkSearchDocumentSections::ALL_SPECIAL_SECTIONS ^
 *                                             PHPCrawlerLinkSearchDocumentSections::HTML_COMMENT_SECTIONS);
 * </code>
 *
 * @param int $document_sections Bitwise combination of the {@link PHPCrawlerLinkSearchDocumentSections}-constants.
 * @section 6 Linkfinding settings
 */
public function excludeLinkSearchDocumentSections($document_sections)
{
  return $this->PageRequest->excludeLinkSearchDocumentSections($document_sections);
}