/**
 * Defines the sections of HTML-documents that will get ignored by the link-finding algorithm.
 *
 * By default, phpcrawl is searching for links in the entire documents it receives during the crawling-process.
 * This sometimes brings up some non existing "phantom-URLs" because the crawler recognized i.e. some javascript-code
 * as a link that was not meant to be, or the crawler found a link inside an html-comment that doesn't exist anymore.
 *
 * By using this method, users can define what predefined sections of HTML-documents should get ignored when it comes
 * to finding links.
 *
 * See {@link PHPCrawlerLinkSearchDocumentSections}-constants for all predefined sections.
 *
 * Example 1:
 * <code>
 * // Let the crawler ignore script-sections and html-comment-sections when finding links
 * $crawler->excludeLinkSearchDocumentSections(PHPCrawlerLinkSearchDocumentSections::SCRIPT_SECTIONS |
 *                                             PHPCrawlerLinkSearchDocumentSections::HTML_COMMENT_SECTIONS);
 * </code>
 * Example 2:
 * <code>
 * // Let the crawler ignore all special sections except HTML-comments
 * $crawler->excludeLinkSearchDocumentSections(PHPCrawlerLinkSearchDocumentSections::ALL_SPECIAL_SECTIONS ^
 *                                             PHPCrawlerLinkSearchDocumentSections::HTML_COMMENT_SECTIONS);
 * </code>
 *
 * @param int $document_sections Bitwise combination of the {@link PHPCrawlerLinkSearchDocumentSections}-constants.
 * @section 6 Linkfinding settings
 */
public function excludeLinkSearchDocumentSections($document_sections)
{
    // Pure delegation: the PageRequest object holds the link-finding configuration.
    return $this->PageRequest->excludeLinkSearchDocumentSections($document_sections);
}
/**
 * Gets the content from the given file or URL.
 *
 * Supports "file://", "http://" and "https://" URIs. HTTP(S)-requests are sent
 * through a {@link PHPCrawlerHTTPRequest} instance.
 *
 * @param string $uri                       The URI (like "file://../myfile.txt" or "http://foo.com")
 * @param string $request_user_agent_string The UserAgent-string to use for URL-requests
 * @param bool   $throw_exception           If set to true, an exception will get thrown in case of an IO-error
 * @return string The content of the URI or NULL if the content couldn't be read
 * @throws Exception If $throw_exception is true and the content couldn't be read
 *                   (or the protocol is unsupported).
 */
public static function getURIContent($uri, $request_user_agent_string = null, $throw_exception = false)
{
    $UriParts = PHPCrawlerUrlPartsDescriptor::fromURL($uri);

    $error_str = "";

    // If protocol is "file"
    if ($UriParts->protocol === "file://") {
        $file = preg_replace("#^file://#", "", $uri);

        if (file_exists($file) && is_readable($file)) {
            // file_get_contents() returns FALSE on failure (e.g. a race after the
            // is_readable()-check); treat that as a read-error instead of returning
            // FALSE to the caller, which would violate the documented return contract.
            $content = file_get_contents($file);
            if ($content !== false) {
                return $content;
            }
        }
        $error_str = "Error reading from file '" . $file . "'";
    } // If protocol is "http" or "https"
    elseif ($UriParts->protocol === "http://" || $UriParts->protocol === "https://") {
        $uri = self::normalizeURL($uri);
        $Request = new PHPCrawlerHTTPRequest();
        $Request->setUrl(new PHPCrawlerURLDescriptor($uri));

        if ($request_user_agent_string !== null) {
            $Request->userAgentString = $request_user_agent_string;
        }

        $DocInfo = $Request->sendRequest();

        if ($DocInfo->received === true) {
            return $DocInfo->source;
        }
        $error_str = "Error reading from URL '" . $uri . "'";
    } // Unknown/unsupported protocol
    else {
        $error_str = "Unsupported protocol-type '" . $UriParts->protocol . "'";
    }

    // Throw exception?
    if ($throw_exception === true) {
        throw new Exception($error_str);
    }

    return null;
}