/**
  * Retrieves the content of a robots.txt-file
  *
  * @param PHPCrawlerURLDescriptor $Url The URL of the robots.txt-file
  * @return string The content of the robots.txt or NULL if no robots.txt was found.
  */
 protected function getRobotsTxtContent(PHPCrawlerURLDescriptor $Url)
 {
     // Request robots-txt
     $this->PageRequest->setUrl($Url);
     $PageInfo = $this->PageRequest->sendRequest();
      // Return the content of the robots.txt-file if it was found, otherwise
      // return NULL
     if ($PageInfo->http_status_code == 200) {
         return $PageInfo->content;
     } else {
         return null;
     }
 }
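 /*
  * Usage sketch (not part of phpcrawl itself): assuming this protected method belongs
  * to phpcrawl's PHPCrawlerRobotsTxtParser class, a subclass could fetch a robots.txt
  * and list its "Disallow" rules as shown below. The subclass name, the example URL
  * and the regular expression are illustrative assumptions.
  */
 class MyRobotsTxtInspector extends PHPCrawlerRobotsTxtParser
 {
     public function printDisallowRules($robots_txt_url)
     {
         // Fetch the robots.txt content (NULL if no robots.txt was found)
         $content = $this->getRobotsTxtContent(new PHPCrawlerURLDescriptor($robots_txt_url));
         if ($content === null) {
             echo "No robots.txt found at " . $robots_txt_url . "\n";
             return;
         }
         // List every "Disallow" rule contained in the file
         preg_match_all("#^\s*Disallow:\s*(\S+)#mi", $content, $matches);
         foreach ($matches[1] as $path) {
             echo "Disallowed path: " . $path . "\n";
         }
     }
 }
 $Inspector = new MyRobotsTxtInspector();
 $Inspector->printDisallowRules("http://www.example.com/robots.txt");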
 /**
  * Gets the content from the given file or URL
  *
  * @param string  $uri                        The URI (like "file://../myfile.txt" or "http://foo.com")
  * @param string  $request_user_agent_string  The UserAgent-string to use for URL-requests
  * @param bool    $throw_exception            If set to true, an exception will get thrown in case of an IO-error
  * @return string The content of the URI or NULL if the content couldn't be read
  */
 public static function getURIContent($uri, $request_user_agent_string = null, $throw_exception = false)
 {
     $UriParts = PHPCrawlerUrlPartsDescriptor::fromURL($uri);
     $error_str = "";
     // If protocol is "file"
     if ($UriParts->protocol == "file://") {
         $file = preg_replace("#^file://#", "", $uri);
         if (file_exists($file) && is_readable($file)) {
             return file_get_contents($file);
         } else {
             $error_str = "Error reading from file '" . $file . "'";
         }
     } elseif ($UriParts->protocol == "http://" || $UriParts->protocol == "https://") {
         $uri = self::normalizeURL($uri);
         $Request = new PHPCrawlerHTTPRequest();
         $Request->setUrl(new PHPCrawlerURLDescriptor($uri));
         if ($request_user_agent_string !== null) {
             $Request->userAgentString = $request_user_agent_string;
         }
         $DocInfo = $Request->sendRequest();
         if ($DocInfo->received == true) {
             return $DocInfo->source;
         } else {
             $error_str = "Error reading from URL '" . $uri . "'";
         }
     } else {
         $error_str = "Unsupported protocol-type '" . $UriParts->protocol . "'";
     }
     // Throw exception?
     if ($throw_exception == true) {
         throw new Exception($error_str);
     }
     return null;
 }
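 /*
  * Brief usage sketch for this static helper. The class name PHPCrawlerUtils is an
  * assumption about where the method is defined (the snippet above doesn't name the
  * class), and the URIs are placeholders modeled after the docblock.
  */
 // Read a local file via the "file://" protocol; returns NULL on failure
 // because $throw_exception defaults to false.
 $local_content = PHPCrawlerUtils::getURIContent("file://../myfile.txt");

 // Read an HTTP-resource with a custom UserAgent-string and let IO-errors
 // surface as exceptions instead of a NULL return-value.
 try {
     $content = PHPCrawlerUtils::getURIContent("http://www.example.com/", "MyCrawler/1.0", true);
     echo strlen($content) . " bytes received\n";
 } catch (Exception $e) {
     echo "Could not read URI: " . $e->getMessage() . "\n";
 }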
 /**
  * Enables support/requests for gzip-encoded content.
  *
  * If set to TRUE, the crawler will request gzip-encoded content from webservers.
  * This will result in reduced data traffic while crawling websites, but the CPU load
  * will rise because the encoded content has to be decoded locally.
  *
  * By default, gzip-requests are disabled for compatibility reasons to earlier versions of phpcrawl.
  *
  * Please note: If gzip-requests are disabled, but a webserver returns gzip-encoded content nevertheless,
  * the crawler will handle the encoded data correctly regardless of this setting.
  *
  * @param bool $mode Set to TRUE for enabling support/requests for gzip-encoded content, defaults to FALSE
  * @section 10 Other settings
  */
 public function requestGzipContent($mode)
 {
     return $this->PageRequest->requestGzipContent($mode);
 }
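 /*
  * Minimal end-to-end sketch showing the setting in context. The subclass, the
  * handleDocumentInfo()-override and the example URL follow phpcrawl's usual
  * usage pattern and are assumptions, not part of the docblock above.
  */
 class MyCrawler extends PHPCrawler
 {
     public function handleDocumentInfo(PHPCrawlerDocumentInfo $DocInfo)
     {
         // $DocInfo->content arrives decoded here, whether or not the
         // server delivered it gzip-encoded.
         echo $DocInfo->url . " (" . $DocInfo->http_status_code . ")\n";
     }
 }

 $crawler = new MyCrawler();
 $crawler->setURL("http://www.example.com/");

 // Ask webservers for gzip-encoded content to reduce traffic
 // (disabled by default, see docblock above).
 $crawler->requestGzipContent(true);

 $crawler->go();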
Example #4
 /**
  * Adds a rule to the list of rules that decide in what kinds of documents the crawler
  * should search for links (regarding their content-type).
  *
  * By default the crawler ONLY searches for links in documents of type "text/html".
  * Use this method to add one or more other content-types the crawler should check for links.
  *
  * Example:
  * <code>
  * $crawler->addLinkSearchContentType("#text/css# i");
  * $crawler->addLinkSearchContentType("#text/xml# i");
  * </code>
  * These rules let the crawler search for links in HTML-, CSS- and XML-documents.
  *
  * <b>Please note:</b> It is NOT recommended to let the crawler check for links in EVERY document-
  * type! This could slow down the crawling-process dramatically (e.g. if the crawler receives large
  * binary-files like images and tries to find links in them).
  *
  * @param string $regex Regular-expression defining the rule
  * @return bool         TRUE if the rule was successfully added
  *
  * @section 6 Linkfinding settings
  */
 public function addLinkSearchContentType($regex)
 {
     return $this->PageRequest->addLinkSearchContentType($regex);
 }
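 /*
  * Sketch putting the docblock example above into context; $crawler is assumed to be
  * an instance of a PHPCrawler subclass, set up as usual. Since the link-finder can
  * only inspect content the crawler actually receives, the receive-rules (see
  * addContentTypeReceiveRule()) may need to cover CSS and XML as well.
  */
 // Make sure CSS- and XML-documents get received (in addition to HTML) ...
 $crawler->addContentTypeReceiveRule("#text/html#");
 $crawler->addContentTypeReceiveRule("#text/css#");
 $crawler->addContentTypeReceiveRule("#text/xml#");

 // ... and let the link-finder search CSS- and XML-documents for links as well
 $crawler->addLinkSearchContentType("#text/css# i");
 $crawler->addLinkSearchContentType("#text/xml# i");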
Example #5
 /**
  * Defines the sections of HTML-documents that will get ignored by the link-finding algorithm.
  *
  * By default, phpcrawl searches for links in the entire documents it receives during the crawling-process.
  * This sometimes brings up non-existing "phantom-URLs" because the crawler e.g. recognized some javascript-code
  * as a link that was not meant to be one, or found a link inside an html-comment that doesn't exist anymore.
  *
  * By using this method, users can define which predefined sections of HTML-documents should get ignored when it comes
  * to finding links.
  *
  * See {@link PHPCrawlerLinkSearchDocumentSections}-constants for all predefined sections.
  *
  * Example 1:
  * <code>
  * // Let the crawler ignore script-sections and html-comment-sections when finding links
  * $crawler->excludeLinkSearchDocumentSections(PHPCrawlerLinkSearchDocumentSections::SCRIPT_SECTIONS |
  *                                             PHPCrawlerLinkSearchDocumentSections::HTML_COMMENT_SECTIONS);
  * </code>
  * Example 2:
  * <code>
  * // Let the crawler ignore all special sections except HTML-comments 
  * $crawler->excludeLinkSearchDocumentSections(PHPCrawlerLinkSearchDocumentSections::ALL_SPECIAL_SECTIONS ^
  *                                             PHPCrawlerLinkSearchDocumentSections::HTML_COMMENT_SECTIONS);
  * </code>
  *
  * @param int $document_sections Bitwise combination of the {@link PHPCrawlerLinkSearchDocumentSections}-constants.
  * @section 6 Linkfinding settings 
  */
 public function excludeLinkSearchDocumentSections($document_sections)
 {
     return $this->PageRequest->excludeLinkSearchDocumentSections($document_sections);
 }
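 /*
  * Small sketch of how the bitmask from example 2 above resolves, assuming (as the
  * constant names suggest) that ALL_SPECIAL_SECTIONS contains the SCRIPT_SECTIONS and
  * HTML_COMMENT_SECTIONS bits; $crawler again stands for a configured PHPCrawler instance.
  */
 // XOR removes the HTML-comment bit from the "all special sections" mask again
 $sections = PHPCrawlerLinkSearchDocumentSections::ALL_SPECIAL_SECTIONS
           ^ PHPCrawlerLinkSearchDocumentSections::HTML_COMMENT_SECTIONS;

 // Script-sections stay in the mask and will be ignored by the link-finder ...
 if ($sections & PHPCrawlerLinkSearchDocumentSections::SCRIPT_SECTIONS) {
     echo "Script-sections will be ignored when searching for links\n";
 }

 // ... while HTML-comment-sections were toggled off again and will still be searched
 if (!($sections & PHPCrawlerLinkSearchDocumentSections::HTML_COMMENT_SECTIONS)) {
     echo "HTML-comment-sections will still be searched for links\n";
 }

 $crawler->excludeLinkSearchDocumentSections($sections);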