Пример #1
0
 /**
  * Builds a full, absolute URL (including the port) from a link found in a document.
  *
  * The link is cleaned (line breaks and fragment removed), resolved against the
  * URL of the document it was found in, has a fixed set of HTML entities and all
  * percent-escapes decoded, and is finally passed through
  * PHPCrawlerUtils::normalizeUrl().
  *
  * Only "http://" and "https://" links are followed; links using any other
  * protocol (e.g. "mailto:", "javascript:", "ftp://") and pure fragment links
  * ("#top") are reduced to an empty string before normalization.
  *
  * @param string       $link             The raw link as found in the document.
  * @param string       $actual_url       URL of the document containing the link; used as
  *                                       the base for relative links and used as-is for an
  *                                       empty link.
  * @param array|string $url_parts_actual Pre-split parts of $actual_url. The keys read here
  *                                       are "protocol", "host", "port" and "path". When no
  *                                       array is given, the parts are obtained through
  *                                       phpcrawlerutils::splitURL($actual_url).
  *
  * @return string The value returned by PHPCrawlerUtils::normalizeUrl() for the resolved link.
  */
 function buildURL($link, $actual_url, $url_parts_actual = "")
 {
     // Important: this function has to return a FULL URL, including the port.
     if (!is_array($url_parts_actual)) {
         $url_parts_actual = phpcrawlerutils::splitURL($actual_url);
     }

     // Entity replacements. "&amp;" is deliberately the LAST entry: preg_replace()
     // applies the patterns one after another, so decoding it first would turn
     // "&amp;lt;" into "&lt;" and then, wrongly, into "<".
     $entities = array("'&(quot|#34);'i", "'&(lt|#60);'i", "'&(gt|#62);'i", "'&(nbsp|#160);'i", "'&(iexcl|#161);'i", "'&(cent|#162);'i", "'&(pound|#163);'i", "'&(copy|#169);'i", "'&(amp|#38);'i");
     $replace = array("\"", "<", ">", " ", chr(161), chr(162), chr(163), chr(169), "&");

     // Links wrapped over several lines in the markup are joined again.
     $link = str_replace(array("\n", "\r"), "", $link);

     // Remove the fragment ("#..."), but only if something precedes it; a link that
     // STARTS with "#" is handled further down. The search begins at offset 1 and
     // stops at the first "#" that is a real delimiter: the "#" of a numeric entity
     // such as "&#38;" (preceded by "&" and followed by digits and ";") is skipped,
     // because entities are only decoded at the very end.
     if (strlen($link) > 1 && preg_match('/(?<!&)#|#(?![0-9]+;)/', $link, $fragment, PREG_OFFSET_CAPTURE, 1)) {
         $link = substr($link, 0, $fragment[0][1]);
     }

     // Cases
     if (substr($link, 0, 2) === "//") {
         // Protocol-relative link like "//foo.com/bar.htm": it inherits the protocol
         // of the document it was found in (RFC 3986, section 5.2). Anything that
         // is not https falls back to http.
         $scheme = "http";
         if (isset($url_parts_actual["protocol"]) && stripos($url_parts_actual["protocol"], "https") === 0) {
             $scheme = "https";
         }
         $link = $scheme . ":" . $link;
         $link = phpcrawlerutils::rebuildURL(phpcrawlerutils::splitURL($link));
     } elseif (substr($link, 0, 1) === "/") {
         // Host-relative link: "/foo/bar.htm"
         $link = $url_parts_actual["protocol"] . $url_parts_actual["host"] . ":" . $url_parts_actual["port"] . $link;
     } elseif (substr($link, 0, 2) === "./") {
         // Explicitly relative to the current path: "./bar.htm"
         $link = $url_parts_actual["protocol"] . $url_parts_actual["host"] . ":" . $url_parts_actual["port"] . $url_parts_actual["path"] . substr($link, 2);
     } elseif (preg_match("/^[^\\/]{1,}(:\\/\\/)/", $link)) {
         // Link carrying its own protocol ("xyz://..."): only http and https are kept.
         if (preg_match('/^(https?):\/\//i', $link, $scheme_match)) {
             // Scheme names are case-insensitive, so "HTTP://..." is accepted too;
             // the scheme is lower-cased before the link is handed on.
             $link = strtolower($scheme_match[1]) . substr($link, strlen($scheme_match[1]));
             $link = phpcrawlerutils::rebuildURL(phpcrawlerutils::splitURL($link));
         } else {
             $link = "";
         }
     } elseif (preg_match("/^[a-zA-Z]{0,}:[^\\/]{0,1}/", $link)) {
         // Kick out unsupported protocols without "//" ("mailto:", "javascript:", ...)
         $link = "";
     } elseif (substr($link, 0, 3) === "../") {
         // Parent-relative link: every leading "../" removes the last directory
         // from the current path.
         $new_path = $url_parts_actual["path"];
         while (substr($link, 0, 3) === "../") {
             $new_path = preg_replace('/\\/[^\\/]{0,}\\/$/', "/", $new_path);
             $link = substr($link, 3);
         }
         $link = $url_parts_actual["protocol"] . $url_parts_actual["host"] . ":" . $url_parts_actual["port"] . $new_path . $link;
     } elseif (substr($link, 0, 1) === "#") {
         // Pure fragment link: points into the same document, nothing to build.
         $link = "";
     } elseif ($link === "") {
         // Empty link refers to the document itself.
         $link = $actual_url;
     } else {
         // Plain relative link: "bar.htm", "foo/bar.htm"
         // NOTE(review): if "path" holds only the directory part, a query-only link
         // such as "?page=2" loses the document name here -- confirm against splitURL().
         $link = $url_parts_actual["protocol"] . $url_parts_actual["host"] . ":" . $url_parts_actual["port"] . $url_parts_actual["path"] . $link;
     }

     // Now, at last, replace the HTML entities with normal text.
     // I.e. the HTML code of the link is: <a href="index.php?x=1&amp;y=2">
     // -> the link has to be "index.php?x=1&y=2"
     $link = preg_replace($entities, $replace, $link);

     // NOTE(review): this decodes ALL percent-escapes (including reserved characters
     // such as %26 or %2F) and only spaces are re-encoded afterwards; left unchanged
     // because callers may rely on the decoded form.
     $link = rawurldecode($link);
     $link = str_replace(" ", "%20", $link);

     // "Normalize" URL
     $link = PHPCrawlerUtils::normalizeUrl($link);

     return $link;
 }