示例#1
0
 /**
  * Defines the sections of HTML-documents that will get ignored by the link-finding algorithm.
  *
  * By default, phpcrawl searches for links in the entire documents it receives during the crawling-process.
  * This sometimes brings up non-existent "phantom-URLs" because the crawler recognized e.g. some javascript-code
  * as a link that was not meant to be one, or found a link inside an html-comment that doesn't exist anymore.
  *
  * By using this method, users can define what predefined sections of HTML-documents should get ignored when it comes
  * to finding links.
  *
  * See {@link PHPCrawlerLinkSearchDocumentSections}-constants for all predefined sections.
  *
  * Example 1:
  * <code>
  * // Let the crawler ignore script-sections and html-comment-sections when finding links
  * $crawler->excludeLinkSearchDocumentSections(PHPCrawlerLinkSearchDocumentSections::SCRIPT_SECTIONS |
  *                                             PHPCrawlerLinkSearchDocumentSections::HTML_COMMENT_SECTIONS);
  * </code>
  * Example 2:
  * <code>
  * // Let the crawler ignore all special sections except HTML-comments
  * $crawler->excludeLinkSearchDocumentSections(PHPCrawlerLinkSearchDocumentSections::ALL_SPECIAL_SECTIONS ^
  *                                             PHPCrawlerLinkSearchDocumentSections::HTML_COMMENT_SECTIONS);
  * </code>
  *
  * @param int $document_sections Bitwise combination of the {@link PHPCrawlerLinkSearchDocumentSections}-constants.
  * @section 6 Linkfinding settings
  */
 public function excludeLinkSearchDocumentSections($document_sections)
 {
     // The crawler itself keeps no state for this setting; the call is handed
     // over unchanged to the page-request object and its result is passed back.
     $result = $this->PageRequest->excludeLinkSearchDocumentSections($document_sections);

     return $result;
 }
示例#2
0
 /**
  * Gets the content from the given file or URL
  *
  * @param string $uri The URI (like "file://../myfile.txt" or "http://foo.com")
  * @param string $request_user_agent_string The user-agent string to use for URL-requests
  * @param bool $throw_exception If set to true, an exception will get thrown in case of an IO-error
  * @return string|null The content of the URI or NULL if the content couldn't be read
  * @throws Exception If $throw_exception is true and the content couldn't be read
  */
 public static function getURIContent($uri, $request_user_agent_string = null, $throw_exception = false)
 {
     $UriParts = PHPCrawlerUrlPartsDescriptor::fromURL($uri);
     $error_str = "";

     if ($UriParts->protocol == "file://") {
         // Local file: strip the protocol-prefix to get the plain filesystem path
         $file = preg_replace("#^file://#", "", $uri);

         $content = false;
         if (file_exists($file) && is_readable($file)) {
             $content = file_get_contents($file);
         }

         // file_get_contents() returns FALSE on failure, which can still happen after the
         // checks above passed (e.g. the file got removed in between). Treat that as a
         // read-error instead of handing FALSE back to the caller.
         if ($content !== false) {
             return $content;
         }

         $error_str = "Error reading from file '" . $file . "'";
     } elseif ($UriParts->protocol == "http://" || $UriParts->protocol == "https://") {
         // Remote document: request it through PHPCrawlerHTTPRequest
         $uri = self::normalizeURL($uri);
         $Request = new PHPCrawlerHTTPRequest();
         $Request->setUrl(new PHPCrawlerURLDescriptor($uri));

         // Only override the request's user-agent if the caller passed one
         if ($request_user_agent_string !== null) {
             $Request->userAgentString = $request_user_agent_string;
         }

         $DocInfo = $Request->sendRequest();
         if ($DocInfo->received == true) {
             return $DocInfo->source;
         }

         $error_str = "Error reading from URL '" . $uri . "'";
     } else {
         $error_str = "Unsupported protocol-type '" . $UriParts->protocol . "'";
     }

     // Every failure-path ends up here: either raise the error or signal it with NULL
     if ($throw_exception == true) {
         throw new Exception($error_str);
     }

     return null;
 }