示例#1
0
/**
 * Recursively crawls a page and records the same-site links it finds in the sitemap.
 *
 * The page at $url is fetched with GetUrl(), every "<a ... href=...>" tag is
 * picked out with plain string splitting (no HTML parser), and each link is
 * resolved to an absolute URL. A link is written to the sitemap file, echoed,
 * and then crawled in turn when it
 *   - starts with the root URL $scanned[0],
 *   - does not start with any prefix listed in $skip,
 *   - has not been scanned before, and
 *   - contains the configured $extension.
 *
 * Globals:
 *   $scanned   list of URLs already visited; element 0 is used as the site root
 *              (presumably the first URL the crawl was started with)
 *   $pf        open file handle the <url> entries are written to
 *   $extension substring a URL must contain to be listed
 *   $skip      optional list of URL prefixes to leave out
 *   $freq      text placed in <changefreq>
 *   $priority  text placed in <priority>
 *
 * GetUrl() and Path() are defined elsewhere; Path() is assumed to return the
 * directory part of a URL including its trailing slash.
 *
 * @param string $url Absolute URL of the page to scan.
 * @return void
 */
function Scan($url)
{
    global $scanned, $pf, $extension, $skip, $freq, $priority;
    echo "scan url {$url}\n";
    array_push($scanned, $url);
    $root = $scanned[0];
    $wanted = (string) $extension;
    $html = (string) GetUrl($url);
    $tags = explode("<a", $html);
    // The first piece is the text in front of the first "<a", not a tag.
    array_shift($tags);
    foreach ($tags as $tag) {
        $parts = explode(">", $tag);
        $aparts = explode("href=", $parts[0]);
        if (!isset($aparts[1])) {
            // Anchor without an href attribute (e.g. <a name="top">).
            continue;
        }
        // The attribute value ends at the first whitespace character.
        $hrefparts = preg_split('/\s/', $aparts[1]);
        // Drop the fragment, then the surrounding quotes of either kind.
        $hrefparts2 = explode("#", $hrefparts[0]);
        $href = str_replace(array("\"", "'"), "", $hrefparts2[0]);
        if ($href === '') {
            // Empty or fragment-only link: it points at the current page.
            continue;
        }
        // Anything that starts with "scheme:" (http:, mailto:, javascript:, ...)
        // is already absolute and must not be glued onto a base path.
        if (!preg_match('/^[a-z][a-z0-9+.\-]*:/i', $href)) {
            if ($href[0] == '/') {
                // rtrim avoids "//" when the root URL ends with a slash.
                $href = rtrim($root, '/') . $href;
            } else {
                $href = Path($url) . $href;
            }
        }
        if (strncmp($href, $root, strlen($root)) !== 0) {
            // Link leaves the site being mapped.
            continue;
        }
        $ignore = false;
        if (is_array($skip)) {
            foreach ($skip as $prefix) {
                if (strncmp($href, $prefix, strlen($prefix)) === 0) {
                    $ignore = true;
                    break;
                }
            }
        }
        if ($ignore || in_array($href, $scanned, true)) {
            continue;
        }
        if ($wanted === '' || strpos($href, $wanted) === false) {
            continue;
        }
        // URLs inside <loc> have to be entity-escaped to keep the XML valid.
        $loc = htmlspecialchars($href, ENT_QUOTES | ENT_XML1 | ENT_SUBSTITUTE, 'UTF-8');
        fwrite($pf, "<url>\n  <loc>{$loc}</loc>\n" . "  <changefreq>{$freq}</changefreq>\n" . "  <priority>{$priority}</priority>\n</url>\n");
        echo $href . "\n";
        Scan($href);
    }
}
示例#2
0
/**
 * Recursively crawls a page and records the same-site links it finds in the sitemap.
 *
 * The page at $url is fetched with GetUrl(), every "<a ... href=...>" tag is
 * picked out with plain string splitting (no HTML parser), and each link is
 * resolved to an absolute URL. A link is written to the sitemap file exactly
 * once, and then crawled in turn, when it
 *   - starts with the root URL $scanned[0],
 *   - does not start with any prefix listed in $skip,
 *   - has not been scanned before, and
 *   - contains at least one of the strings in $extension.
 *
 * Globals:
 *   $scanned   list of URLs already visited; element 0 is used as the site root
 *              (presumably the first URL the crawl was started with)
 *   $pf        open file handle the <url> entries are written to
 *   $extension list of substrings (file extensions); one must occur in a URL
 *   $skip      optional list of URL prefixes to leave out
 *   $freq      text placed in <changefreq>
 *   $priority  text placed in <priority>
 *
 * GetUrl(), GetQuotedUrl(), Path() and the NL constant are defined elsewhere.
 * GetQuotedUrl() is assumed to return the text between the leading double
 * quote and its closing quote; Path() is assumed to return the directory part
 * of a URL including its trailing slash.
 *
 * @param string $url Absolute URL of the page to scan.
 * @return void
 */
function Scan($url)
{
    global $scanned, $pf, $extension, $skip, $freq, $priority;
    echo $url . NL;
    array_push($scanned, $url);
    $root = $scanned[0];
    $html = (string) GetUrl($url);
    $tags = explode("<a", $html);
    // The first piece is the text in front of the first "<a", not a tag.
    array_shift($tags);
    foreach ($tags as $tag) {
        $anker_parts = explode(">", $tag);
        $href_split = explode("href=", $anker_parts[0]);
        if (!isset($href_split[1]) || $href_split[1] === '') {
            // Anchor without an href attribute (e.g. <a name="top">).
            continue;
        }
        $href_string = $href_split[1];
        if ($href_string[0] == '"') {
            $next_url = (string) GetQuotedUrl($href_string);
        } elseif ($href_string[0] == "'") {
            // Single-quoted value: take everything up to the closing quote.
            $end = strpos($href_string, "'", 1);
            $next_url = ($end === false) ? substr($href_string, 1) : substr($href_string, 1, $end - 1);
        } else {
            // Unquoted value ends at the first whitespace character.
            $spaces_split = preg_split('/\s/', $href_string);
            $next_url = str_replace("\"", "", $spaces_split[0]);
        }
        $fragment_split = explode("#", $next_url);
        $next_url = $fragment_split[0];
        if ($next_url === '') {
            // Empty or fragment-only link: it points at the current page.
            continue;
        }
        // Anything that starts with "scheme:" (http:, mailto:, javascript:, ...)
        // is already absolute and must not be glued onto a base path.
        if (!preg_match('/^[a-z][a-z0-9+.\-]*:/i', $next_url)) {
            if ($next_url[0] == '/') {
                // rtrim avoids "//" when the root URL ends with a slash.
                $next_url = rtrim($root, '/') . $next_url;
            } else {
                $next_url = Path($url) . $next_url;
            }
        }
        if (strncmp($next_url, $root, strlen($root)) !== 0) {
            // Link leaves the site being mapped.
            continue;
        }
        $ignore = false;
        if (is_array($skip)) {
            foreach ($skip as $prefix) {
                if (strncmp($next_url, $prefix, strlen($prefix)) === 0) {
                    $ignore = true;
                    break;
                }
            }
        }
        if ($ignore || in_array($next_url, $scanned, true)) {
            continue;
        }
        // One matching extension is enough; the entry must be emitted only
        // once no matter how many extensions are configured.
        $wanted = false;
        foreach ((array) $extension as $ext) {
            $ext = (string) $ext;
            if ($ext !== '' && strpos($next_url, $ext) !== false) {
                $wanted = true;
                break;
            }
        }
        if (!$wanted) {
            continue;
        }
        // XML only knows five named entities, so escape with htmlspecialchars
        // in XML mode rather than htmlentities (which emits HTML-only names).
        $loc = htmlspecialchars($next_url, ENT_QUOTES | ENT_XML1 | ENT_SUBSTITUTE, 'UTF-8');
        fwrite($pf, "  <url>\n" . "    <loc>" . $loc . "</loc>\n" . "    <changefreq>{$freq}</changefreq>\n" . "    <priority>{$priority}</priority>\n" . "  </url>\n");
        Scan($next_url);
    }
}