/**
 * Searches for links in the given HTML-chunk and adds found links to the internal link-cache.
 *
 * @param string &$html_source The HTML-source-chunk to search in (passed by reference)
 */
public function findLinksInHTMLChunk(&$html_source)
{
  PHPCrawlerBenchmark::start("searching_for_links_in_page");

  // Check for meta-base-URL and meta-tags in top of HTML-source
  if ($this->top_lines_processed == false)
  {
    $meta_base_url = PHPCrawlerUtils::getBaseUrlFromMetaTag($html_source);
    if ($meta_base_url != null)
    {
      $base_url = PHPCrawlerUtils::buildURLFromLink($meta_base_url, $this->baseUrlParts);
      $this->baseUrlParts = PHPCrawlerUrlPartsDescriptor::fromURL($base_url);
    }

    // Get all meta-tags
    $this->meta_attributes = PHPCrawlerUtils::getMetaTagAttributes($html_source);

    // Set flag that top-lines of source were processed
    $this->top_lines_processed = true;
  }

  // Prepare HTML-chunk
  $this->prepareHTMLChunk($html_source);

  // Build the RegEx-part for html-tags to search links in
  $tag_regex_part = "";
  $cnt = count($this->extract_tags);
  for ($x = 0; $x < $cnt; $x++)
  {
    $tag_regex_part .= "|" . $this->extract_tags[$x];
  }
  $tag_regex_part = substr($tag_regex_part, 1);

  // 1. <a href="...">LINKTEXT</a> (well-formed link with </a> at the end and quotes around the URL)
  // Get the link AND the linktext from these tags.
  // This has to be done FIRST!
  preg_match_all("#<\\s*a\\s[^<>]*(?<=\\s)(?:" . $tag_regex_part . ")\\s*=\\s*" .
                 "(?|\"([^\"]+)\"|'([^']+)'|([^\\s><'\"]+))[^<>]*>" .
                 "((?:(?!<\\s*\\/a\\s*>).){0,500})" .
                 "<\\s*\\/a\\s*># is", $html_source, $matches);

  $cnt = count($matches[0]);
  for ($x = 0; $x < $cnt; $x++)
  {
    $link_raw = trim($matches[1][$x]);
    $linktext = $matches[2][$x];
    $linkcode = trim($matches[0][$x]);

    if (!empty($link_raw)) {
      $this->addLinkToCache($link_raw, $linkcode, $linktext);
    }
  }

  // 2. Everything else that could be a link inside of <>-tags
  preg_match_all("#<[^<>]*\\s(?:" . $tag_regex_part . ")\\s*=\\s*" .
                 "(?|\"([^\"]+)\"|'([^']+)'|([^\\s><'\"]+))[^<>]*># is", $html_source, $matches);

  $cnt = count($matches[0]);
  for ($x = 0; $x < $cnt; $x++)
  {
    $link_raw = trim($matches[1][$x]);
    $linktext = "";
    $linkcode = trim($matches[0][$x]);

    if (!empty($link_raw)) {
      $this->addLinkToCache($link_raw, $linkcode, $linktext);
    }
  }

  // 3. If aggressive_search is set to true, look for some other things
  $pregs = array();
  if ($this->aggressive_search == true)
  {
    // Links like "...:url("animage.gif")..."
    $pregs[] = "/[\\s\\.:;](?:" . $tag_regex_part . ")\\s*\\(\\s*([\"|']{0,1})([^\"'\\) ]{1,500})['\"\\)]/ is";

    // Everything like "...href="bla.html"..." with quotes
    $pregs[] = "/[\\s\\.:;\"'](?:" . $tag_regex_part . ")\\s*=\\s*([\"|'])(.{0,500}?)\\1/ is";

    // Everything like "...href=bla.html..." without quotes
    $pregs[] = "/[\\s\\.:;](?:" . $tag_regex_part . ")\\s*(=)\\s*([^\\s\">']{1,500})/ is";

    for ($x = 0; $x < count($pregs); $x++)
    {
      unset($matches);
      preg_match_all($pregs[$x], $html_source, $matches);

      $cnt = count($matches[0]);
      for ($y = 0; $y < $cnt; $y++)
      {
        $link_raw = trim($matches[2][$y]);
        $linkcode = trim($matches[0][$y]);
        $linktext = "";

        $this->addLinkToCache($link_raw, $linkcode, $linktext);
      }
    }
  }

  $this->found_links_map = array();

  PHPCrawlerBenchmark::stop("searching_for_links_in_page");
}
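// Standalone sketch (not part of the class): illustrates what the first,
// well-formed-anchor regex above extracts. The tag-list "href|src" is an
// assumption standing in for $tag_regex_part, and the sample HTML is made up.
$tag_regex_part = "href|src";
$html = '<p>See <a href="/about.html" class="nav">About us</a> for details.</p>';

preg_match_all("#<\\s*a\\s[^<>]*(?<=\\s)(?:" . $tag_regex_part . ")\\s*=\\s*" .
               "(?|\"([^\"]+)\"|'([^']+)'|([^\\s><'\"]+))[^<>]*>" .
               "((?:(?!<\\s*\\/a\\s*>).){0,500})" .
               "<\\s*\\/a\\s*>#is", $html, $matches);

echo $matches[1][0]; // "/about.html"  -> passed as $link_raw
echo $matches[2][0]; // "About us"     -> passed as $linktext
echo $matches[0][0]; // the complete <a ...>...</a> tag -> passed as $linkcode

// The second, tag-only regex then covers attribute-style links without
// link-text (e.g. <img src="logo.png">), and the aggressive patterns catch
// constructs such as url("animage.gif") inside inline CSS.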
/**
 * Gets the content from the given file or URL
 *
 * @param string $uri                       The URI (like "file://../myfile.txt" or "http://foo.com")
 * @param string $request_user_agent_string The UserAgent-string to use for URL-requests
 * @param bool   $throw_exception           If set to true, an exception will get thrown in case of an IO-error
 * @return string The content of the URI or NULL if the content couldn't be read
 */
public static function getURIContent($uri, $request_user_agent_string = null, $throw_exception = false)
{
  $UriParts = PHPCrawlerUrlPartsDescriptor::fromURL($uri);
  $error_str = "";

  // If protocol is "file"
  if ($UriParts->protocol == "file://")
  {
    $file = preg_replace("#^file://#", "", $uri);

    if (file_exists($file) && is_readable($file)) {
      return file_get_contents($file);
    } else {
      $error_str = "Error reading from file '" . $file . "'";
    }
  }
  // If protocol is "http" or "https"
  elseif ($UriParts->protocol == "http://" || $UriParts->protocol == "https://")
  {
    $uri = self::normalizeURL($uri);
    $Request = new PHPCrawlerHTTPRequest();
    $Request->setUrl(new PHPCrawlerURLDescriptor($uri));

    if ($request_user_agent_string !== null) {
      $Request->userAgentString = $request_user_agent_string;
    }

    $DocInfo = $Request->sendRequest();

    if ($DocInfo->received == true) {
      return $DocInfo->source;
    } else {
      $error_str = "Error reading from URL '" . $uri . "'";
    }
  }
  // Unsupported protocol
  else
  {
    $error_str = "Unsupported protocol-type '" . $UriParts->protocol . "'";
  }

  // Throw exception?
  if ($throw_exception == true) {
    throw new Exception($error_str);
  }

  return null;
}
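// Usage sketch (a minimal example, assuming this static method lives in
// PHPCrawlerUtils as the self::normalizeURL() call suggests; the URL,
// user-agent string and local file-path are placeholder values):

// Fetch a document over HTTP with a custom user-agent; NULL signals an IO-error
$content = PHPCrawlerUtils::getURIContent("http://www.example.com/robots.txt", "PHPCrawl example-agent");
if ($content !== null) {
  echo $content;
}

// Read a local file through the same method; with $throw_exception set to true
// an unreadable file raises an Exception instead of returning NULL.
try {
  $file_content = PHPCrawlerUtils::getURIContent("file://./testfile.txt", null, true);
} catch (Exception $e) {
  echo $e->getMessage();
}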