Ejemplo n.º 1
  * Gets the content from the given file or URL
  * @param string  $uri                        The URI (like "file://../myfile.txt" or "http://foo.com")
  * @param string  $request_user_agent_string  The user-agent string to use for URL requests
  * @param bool    $throw_exception            If set to true, an exception will get thrown in case of an IO-error
  * @return string The content of the URI or NULL if the content couldn't be read
 public static function getURIContent($uri, $request_user_agent_string = null, $throw_exception = false)
 {
     $UriParts = PHPCrawlerUrlPartsDescriptor::fromURL($uri);
     $error_str = "";

     if ($UriParts->protocol == "file://") {
         // Local file: drop the "file://" prefix to obtain the filesystem path.
         $file = preg_replace("#^file://#", "", $uri);

         if (file_exists($file) && is_readable($file)) {
             $content = file_get_contents($file);

             // file_get_contents() returns false when the read fails even though the
             // checks above passed (e.g. the file disappeared in between). Route that
             // through the error path below instead of handing false to the caller,
             // so the result is always either a string or null.
             if ($content !== false) {
                 return $content;
             }
         }

         $error_str = "Error reading from file '" . $file . "'";
     } elseif ($UriParts->protocol == "http://" || $UriParts->protocol == "https://") {
         // Remote document: fetch it with the crawler's own HTTP request class.
         $uri = self::normalizeURL($uri);

         $Request = new PHPCrawlerHTTPRequest();
         $Request->setUrl(new PHPCrawlerURLDescriptor($uri));

         // Only override the request's user-agent when the caller supplied one.
         if ($request_user_agent_string !== null) {
             $Request->userAgentString = $request_user_agent_string;
         }

         $DocInfo = $Request->sendRequest();
         if ($DocInfo->received == true) {
             return $DocInfo->source;
         }

         $error_str = "Error reading from URL '" . $uri . "'";
     } else {
         $error_str = "Unsupported protocol-type '" . $UriParts->protocol . "'";
     }

     // Every failure ends up here: either raise it or signal it with null,
     // depending on what the caller asked for.
     if ($throw_exception == true) {
         throw new Exception($error_str);
     }

     return null;
 }
Ejemplo n.º 2
  * Reconstructs a fully qualified and normalized URL from a given link, relative to the URL the link was found in.
  * @param string $link          The link (i.e. "../page.htm")
  * @param PHPCrawlerUrlPartsDescriptor $BaseUrlParts  The parts of the URL the link was found in (i.e. "http://www.foo.com/folder/index.html")
  * @return string The rebuilt, fully qualified and normalized URL the link is leading to (i.e. "http://www.foo.com/page.htm")
  *                Or NULL if the link couldn't be rebuilt correctly.
 public static function buildURLFromLink($link, PHPCrawlerUrlPartsDescriptor $BaseUrlParts)
 {
     $url_parts = $BaseUrlParts->toArray();

     // "protocol://host:port" of the page the link was found on; the prefix
     // shared by every kind of relative link handled below.
     $base_root = $url_parts["protocol"] . $url_parts["host"] . ":" . $url_parts["port"];

     // Remove "#..." at the end, but ONLY at the end,
     // not if # is at the beginning!
     $link = preg_replace("/^(.{1,})#.{0,}\$/", "\\1", $link);

     if (substr($link, 0, 2) == "//") {
         // Protocol-relative link ("//host/path"): it inherits the scheme of the
         // base URL. The protocol part is stored as "scheme://", so stripping the
         // trailing slashes leaves "scheme:". Fall back to "http:" if the base URL
         // carries no protocol.
         $scheme = rtrim((string) $url_parts["protocol"], "/");
         if ($scheme === "") {
             $scheme = "http:";
         }
         $link = $scheme . $link;
     } elseif (substr($link, 0, 1) == "/") {
         // Absolute path on the same host
         $link = $base_root . $link;
     } elseif (substr($link, 0, 2) == "./") {
         // Explicitly relative to the current folder
         $link = $base_root . $url_parts["path"] . substr($link, 2);
     } elseif (preg_match("#^[a-z0-9]{1,}(:\\/\\/)# i", $link)) {
         // Already a full URL ("scheme://..."): nothing to rebuild. The branch
         // stays so that none of the later cases can claim such a link.
     } elseif (preg_match("/^[a-zA-Z]{0,}:[^\\/]{0,1}/", $link)) {
         // "something:" that is not followed by "//" (like "mailto:", "javascript:"):
         // not a followable URL
         $link = "";
     } elseif (substr($link, 0, 3) == "../") {
         // Walk up one folder of the base path for every leading "../"
         $new_path = $url_parts["path"];
         while (substr($link, 0, 3) == "../") {
             $new_path = preg_replace('/\\/[^\\/]{0,}\\/$/', "/", $new_path);
             $link = substr($link, 3);
         }
         $link = $base_root . $new_path . $link;
     } elseif (substr($link, 0, 1) == "#") {
         // Pure fragment: points to the page itself
         $link = "";
     } elseif (substr($link, 0, 1) == "?") {
         // Query-only link: applies to the current file
         $link = $base_root . $url_parts["path"] . $url_parts["file"] . $link;
     } else {
         // Plain relative link: relative to the current folder
         $link = $base_root . $url_parts["path"] . $link;
     }

     if ($link == "") {
         return null;
     }

     // Now replace HTML entities with plain text.
     // E.g. the HTML code of the link is: <a href="index.php?x=1&amp;y=2">
     // -> the link has to be "index.php?x=1&y=2"
     $entities = array("'&(quot|#34);'i", "'&(amp|#38);'i", "'&(lt|#60);'i", "'&(gt|#62);'i", "'&(nbsp|#160);'i", "'&(iexcl|#161);'i", "'&(cent|#162);'i", "'&(pound|#163);'i", "'&(copy|#169);'i");
     $replace = array("\"", "&", "<", ">", " ", chr(161), chr(162), chr(163), chr(169));
     $link = preg_replace($entities, $replace, $link);

     // Remove linebreaks from the link (happens if a link is wrapped
     // across lines in the source code)
     $link = str_replace(array("\n", "\r"), "", $link);

     // "Normalize" the URL
     return self::normalizeUrl($link);
 }
  * Searches for links in the given HTML-chunk and adds found links to the internal link-cache.
 public function findLinksInHTMLChunk(&$html_source)
 {
     // The head of the document is inspected only once: it may carry a
     // meta-base-URL (which replaces the base URL used for this document)
     // and the meta-tags.
     if (!$this->top_lines_processed) {
         $meta_base_url = PHPCrawlerUtils::getBaseUrlFromMetaTag($html_source);
         if ($meta_base_url != null) {
             $base_url = PHPCrawlerUtils::buildURLFromLink($meta_base_url, $this->baseUrlParts);
             $this->baseUrlParts = PHPCrawlerUrlPartsDescriptor::fromURL($base_url);
         }

         $this->meta_attributes = PHPCrawlerUtils::getMetaTagAttributes($html_source);
         $this->top_lines_processed = true;
     }

     // Alternation of all attribute names to look for links in, for use inside
     // the regular expressions below
     $tag_names = array();
     $tag_count = count($this->extract_tags);
     for ($i = 0; $i < $tag_count; $i++) {
         $tag_names[] = $this->extract_tags[$i];
     }
     $tag_regex_part = implode("|", $tag_names);

     // Pass 1: well-formed anchors (<a ...>LINKTEXT</a>). This pass has to run
     // FIRST because it is the only one that also captures the link text.
     preg_match_all("#<\\s*a\\s[^<>]*(?<=\\s)(?:" . $tag_regex_part . ")\\s*=\\s*" . "(?|\"([^\"]+)\"|'([^']+)'|([^\\s><'\"]+))[^<>]*>" . "((?:(?!<\\s*\\/a\\s*>).){0,500})" . "<\\s*\\/a\\s*># is", $html_source, $matches);
     $found = count($matches[0]);
     for ($i = 0; $i < $found; $i++) {
         $link_raw = trim($matches[1][$i]);
         if (empty($link_raw)) {
             continue;
         }
         $this->addLinkToCache($link_raw, trim($matches[0][$i]), $matches[2][$i]);
     }

     // Pass 2: anything that could be a link inside of any <...> tag (no link text)
     preg_match_all("#<[^<>]*\\s(?:" . $tag_regex_part . ")\\s*=\\s*" . "(?|\"([^\"]+)\"|'([^']+)'|([^\\s><'\"]+))[^<>]*># is", $html_source, $matches);
     $found = count($matches[0]);
     for ($i = 0; $i < $found; $i++) {
         $link_raw = trim($matches[1][$i]);
         if (empty($link_raw)) {
             continue;
         }
         $this->addLinkToCache($link_raw, trim($matches[0][$i]), "");
     }

     // Pass 3 (aggressive mode only): link-like constructs outside of regular tags
     if ($this->aggressive_search) {
         $patterns = array(
             // Links like "...:url("animage.gif")..."
             "/[\\s\\.:;](?:" . $tag_regex_part . ")\\s*\\(\\s*([\"|']{0,1})([^\"'\\) ]{1,500})['\"\\)]/ is",
             // Everything like "...href="bla.html"..." with quotes
             "/[\\s\\.:;](?:" . $tag_regex_part . ")\\s*=\\s*([\"|'])(.{0,500}?)\\1/ is",
             // Everything like "...href=bla.html..." without quotes
             "/[\\s\\.:;](?:" . $tag_regex_part . ")\\s*(=)\\s*([^\\s\">']{1,500})/ is",
         );

         foreach ($patterns as $pattern) {
             preg_match_all($pattern, $html_source, $matches);
             $found = count($matches[0]);
             // In all three patterns the link itself is capture group 2
             for ($i = 0; $i < $found; $i++) {
                 $this->addLinkToCache(trim($matches[2][$i]), trim($matches[0][$i]), "");
             }
         }
     }

     // Reset the map of links found in this chunk
     $this->found_links_map = array();
 }