/**
 * Reconstructs a fully qualified and normalized URL from a given link, relative to the URL the link was found in.
 *
 * Processing order:
 *  1. A fixed set of HTML entities (quot, amp, lt, gt, nbsp, iexcl, cent, pound, copy and their
 *     numeric forms) is decoded.
 *  2. Line breaks ("\n", "\r") inside the link are removed.
 *  3. A trailing anchor ("#...") is cut off, unless the link starts with "#".
 *  4. The link is resolved against the base URL depending on its form ("//host", "/path", "./file",
 *     "scheme://...", "../file", "?query" or a plain relative path).
 *  5. The result is passed through self::normalizeUrl().
 *
 * @param string $link The link (i.e. "../page.htm")
 * @param PHPCrawlerUrlPartsDescriptor $BaseUrlParts The parts of the URL the link was found in (i.e. "http://www.foo.com/folder/index.html")
 *
 * @return string|null The rebuilt, fully qualified and normalized URL the link is leading to (i.e. "http://www.foo.com/page.htm"),
 *                     or NULL if the link couldn't be rebuilt (pure anchors like "#top" and links with a
 *                     scheme that is not followed by "//", like "mailto:..." or "javascript:...").
 */
public static function buildURLFromLink($link, PHPCrawlerUrlPartsDescriptor $BaseUrlParts)
{
    $url_parts = $BaseUrlParts->toArray();

    // Entity replacements (pattern at index n is replaced by the string at index n).
    $entities = array(
        "'&(quot|#34);'i",
        "'&(amp|#38);'i",
        "'&(lt|#60);'i",
        "'&(gt|#62);'i",
        "'&(nbsp|#160);'i",
        "'&(iexcl|#161);'i",
        "'&(cent|#162);'i",
        "'&(pound|#163);'i",
        "'&(copy|#169);'i"
    );
    $replace = array("\"", "&", "<", ">", " ", chr(161), chr(162), chr(163), chr(169));

    // Decode the entities BEFORE stripping the anchor: numeric entities like "&#38;" contain a "#",
    // so stripping first would truncate a link like "index.php?x=1&#38;y=2" to "index.php?x=1&".
    // I.e.: HTML code of the link is <a href="index.php?x=1&amp;y=2"> -> link has to be "index.php?x=1&y=2".
    $link = preg_replace($entities, $replace, $link);

    // Remove line breaks (happens if a link wraps in the source code). This has to be done before
    // the anchor is stripped because "." in the pattern below does not match a newline.
    $link = str_replace(array("\n", "\r"), "", $link);

    // Remove "#..." at the end, but ONLY at the end, not if "#" is at the beginning
    // (the pattern requires at least one character in front of the "#").
    $link = preg_replace("/^(.{1,})#.{0,}\$/", "\\1", $link);

    // Prefix shared by all links that are resolved against the base URL, i.e. "http://www.foo.com:80".
    $base = $url_parts["protocol"] . $url_parts["host"] . ":" . $url_parts["port"];

    if (substr($link, 0, 2) == "//") {
        // Protocol-relative link like "//foo.com/page.htm": inherit the scheme of the base URL
        // instead of always assuming "http:", so links found on https pages stay https.
        $scheme = rtrim((string) $url_parts["protocol"], "/");
        if ($scheme == "") {
            $scheme = "http:";
        } elseif (substr($scheme, -1) != ":") {
            $scheme .= ":";
        }
        $link = $scheme . $link;
    } elseif (substr($link, 0, 1) == "/") {
        // Absolute path on the same host.
        $link = $base . $link;
    } elseif (substr($link, 0, 2) == "./") {
        // Explicitly relative to the current directory.
        $link = $base . $url_parts["path"] . substr($link, 2);
    } elseif (preg_match("#^[a-z0-9]{1,}(:\\/\\/)# i", $link)) {
        // Already a full URL ("scheme://..."): nothing to rebuild.
    } elseif (preg_match("/^[a-zA-Z]{0,}:[^\\/]{0,1}/", $link)) {
        // Scheme without "//" (i.e. "mailto:", "javascript:"): not a link that can be followed.
        $link = "";
    } elseif (substr($link, 0, 3) == "../") {
        // Walk up one directory of the base path for every leading "../".
        $new_path = $url_parts["path"];
        while (substr($link, 0, 3) == "../") {
            $new_path = preg_replace('/\\/[^\\/]{0,}\\/$/', "/", $new_path);
            $link = substr($link, 3);
        }
        $link = $base . $new_path . $link;
    } elseif (substr($link, 0, 1) == "#") {
        // Pure anchor: points to the document itself.
        $link = "";
    } elseif (substr($link, 0, 1) == "?") {
        // Query only: keep path and file of the base URL.
        $link = $base . $url_parts["path"] . $url_parts["file"] . $link;
    } else {
        // Plain relative link: relative to the directory of the base URL.
        $link = $base . $url_parts["path"] . $link;
    }

    // Loose comparison on purpose: also catches NULL returned by preg_replace() on a PCRE error.
    if ($link == "") {
        return null;
    }

    // "Normalize" URL
    return self::normalizeUrl($link);
}
/**
 * Reconstructs a fully qualified and normalized URL from a given link, relative to the URL the link was found in.
 *
 * Processing order:
 *  1. HTML entities are decoded through PHPCrawlerEncodingUtils::decodeHtmlEntities().
 *  2. Line breaks ("\n", "\r") inside the link are removed.
 *  3. A trailing anchor ("#...") is cut off, unless the link starts with "#".
 *  4. The link is resolved against the base URL depending on its form ("//host", "/path", "./file",
 *     "scheme://...", "../file", "?query" or a plain relative path).
 *  5. The result is passed through self::normalizeUrl().
 *
 * @param string $link The link (i.e. "../page.htm")
 * @param PHPCrawlerUrlPartsDescriptor $BaseUrl The base-URL the link was found in as PHPCrawlerUrlPartsDescriptor-object
 *
 * @return string|null The rebuilt, fully qualified and normalized URL the link is leading to (i.e. "http://www.foo.com/page.htm"),
 *                     or NULL if the link couldn't be rebuilt (pure anchors like "#top" and links with a
 *                     scheme that is not followed by "//", like "mailto:..." or "javascript:...").
 */
public static function buildURLFromLink($link, PHPCrawlerUrlPartsDescriptor $BaseUrl)
{
    $url_parts = $BaseUrl->toArray();

    // Decode HTML entities first.
    // I.e.: HTML code of the link is <a href="index.php?x=1&amp;y=2"> -> link has to be "index.php?x=1&y=2".
    $link = PHPCrawlerEncodingUtils::decodeHtmlEntities($link);

    // Remove line breaks (happens if a link wraps in the source code). This has to be done before
    // the anchor is stripped because "." in the pattern below does not match a newline.
    $link = str_replace(array("\n", "\r"), "", $link);

    // Remove anchor ("#..."), but ONLY at the end, not if "#" is at the beginning
    // (the pattern requires at least one character in front of the "#").
    $link = preg_replace("/^(.{1,})#.{0,}\$/", "\\1", $link);

    // Prefix shared by all links that are resolved against the base URL, i.e. "http://www.foo.com:80".
    $base = $url_parts["protocol"] . $url_parts["host"] . ":" . $url_parts["port"];

    if (substr($link, 0, 2) == "//") {
        // Protocol-relative link like "//foo.com/page.htm": inherit the scheme of the base URL
        // instead of always assuming "http:", so links found on https pages stay https.
        $scheme = rtrim((string) $url_parts["protocol"], "/");
        if ($scheme == "") {
            $scheme = "http:";
        } elseif (substr($scheme, -1) != ":") {
            $scheme .= ":";
        }
        $link = $scheme . $link;
    } elseif (substr($link, 0, 1) == "/") {
        // Absolute path on the same host.
        $link = $base . $link;
    } elseif (substr($link, 0, 2) == "./") {
        // Explicitly relative to the current directory.
        $link = $base . $url_parts["path"] . substr($link, 2);
    } elseif (preg_match("#^[a-z0-9-]{1,}(:\\/\\/)# i", $link)) {
        // Already a full URL ("scheme://..."): nothing to rebuild.
    } elseif (preg_match("/^[a-zA-Z]{0,}:[^\\/]{0,1}/", $link)) {
        // Scheme without "//" (i.e. "mailto:", "javascript:"): not a link that can be followed.
        $link = "";
    } elseif (substr($link, 0, 3) == "../") {
        // Walk up one directory of the base path for every leading "../".
        $new_path = $url_parts["path"];
        while (substr($link, 0, 3) == "../") {
            $new_path = preg_replace('/\\/[^\\/]{0,}\\/$/', "/", $new_path);
            $link = substr($link, 3);
        }
        $link = $base . $new_path . $link;
    } elseif (substr($link, 0, 1) == "#") {
        // Pure anchor: points to the document itself.
        $link = "";
    } elseif (substr($link, 0, 1) == "?") {
        // Query only: keep path and file of the base URL.
        $link = $base . $url_parts["path"] . $url_parts["file"] . $link;
    } else {
        // Plain relative link: relative to the directory of the base URL.
        $link = $base . $url_parts["path"] . $link;
    }

    // Loose comparison on purpose: also catches NULL returned by preg_replace() on a PCRE error.
    if ($link == "") {
        return null;
    }

    // "Normalize" URL
    return self::normalizeUrl($link);
}