Beispiel #1
0
            $scheme = $bp['scheme'];
        }
        $b = $scheme . "://" . $bp['host'] . "/";
    }
    if (substr($u, 0, 2) == "//") {
        $u = "http:" . $u;
    }
    if (substr($u, 0, 4) != "http") {
        $u = rel2abs($u, $b);
    }
    return $u;
}
/**
 * Crawl one page and print every link on it that has not been printed before.
 *
 * The time of the last fetch is stored in the global $crawled_urls (keyed by
 * the url-encoded page URL); a page fetched less than 25 seconds ago is not
 * fetched again. Links that were already printed are remembered in the global
 * $found_urls (keyed by the url-encoded link).
 *
 * @param string $u URL of the page to crawl.
 * @return void Each new link is echoed as an <li><a> element.
 */
function crawl_site($u)
{
    // $found_urls must be imported as well: without it the duplicate check
    // below ran against an undefined local variable on every call.
    global $crawled_urls, $found_urls;

    // The registries may not have been initialised by the including script;
    // array_key_exists() rejects non-array arguments.
    if (!is_array($crawled_urls)) {
        $crawled_urls = array();
    }
    if (!is_array($found_urls)) {
        $found_urls = array();
    }

    $uen = urlencode($u);
    // Timestamps are "YmdHis" strings, so they order chronologically.
    $cutoff = date("YmdHis", strtotime('-25 seconds', time()));
    if (array_key_exists($uen, $crawled_urls) && !($crawled_urls[$uen] < $cutoff)) {
        // Fetched recently enough; nothing to do.
        return;
    }

    $html = file_get_html($u);
    $crawled_urls[$uen] = date("YmdHis");
    if (!$html) {
        // NOTE(review): file_get_html() is defined outside this snippet; it is
        // presumably Simple HTML DOM's helper, which returns false when the
        // page cannot be loaded. Calling ->find() on that would be fatal.
        return;
    }

    foreach ($html->find("a") as $li) {
        $url = perfect_url($li->href, $u);
        $enurl = urlencode($url);
        if ($url != '' && substr($url, 0, 4) != "mail" && substr($url, 0, 4) != "java" && !array_key_exists($enurl, $found_urls)) {
            $found_urls[$enurl] = 1;
            // The href comes from a remote page: escape it for the HTML
            // context, but leave entities that are already encoded alone.
            $safe = htmlspecialchars($url, ENT_QUOTES, 'UTF-8', false);
            echo "<li><a target='_blank' href='" . $safe . "'>" . $safe . "</a></li>";
        }
    }
}
// Script entry point: crawl the hard-coded start page.
crawl_site("https://crawlbot.isc");
Beispiel #2
0
        }
        $b = $scheme . "://" . $bp['host'] . "/";
    }
    if (substr($u, 0, 2) == "//") {
        $u = "http:" . $u;
    }
    if (substr($u, 0, 4) != "http") {
        $u = rel2abs($u, $b);
    }
    return $u;
}
/**
 * Crawl one page and print every link on it that has not been printed before.
 *
 * The time of the last fetch is stored in the global $crawled_urls (keyed by
 * the url-encoded page URL); a page fetched less than 25 seconds ago is not
 * fetched again. Links that were already printed are remembered in the global
 * $found_urls (keyed by the url-encoded link).
 *
 * @param string $u URL of the page to crawl.
 * @return void Each new link is echoed followed by a <br/>.
 */
function crawl_site($u)
{
    global $crawled_urls, $found_urls;

    // The registries may not have been initialised by the including script;
    // array_key_exists() rejects non-array arguments.
    if (!is_array($crawled_urls)) {
        $crawled_urls = array();
    }
    if (!is_array($found_urls)) {
        $found_urls = array();
    }

    $uen = urlencode($u);
    // Timestamps are "YmdHis" strings, so they order chronologically.
    $cutoff = date("YmdHis", strtotime('-25 seconds', time()));
    $fresh = array_key_exists($uen, $crawled_urls) && !($crawled_urls[$uen] < $cutoff);
    if ($fresh) {
        return;
    }

    $html = file_get_html($u);
    $crawled_urls[$uen] = date("YmdHis");
    if (!$html) {
        // NOTE(review): file_get_html() is defined outside this snippet; it is
        // presumably Simple HTML DOM's helper, which returns false when the
        // page cannot be loaded. Calling ->find() on that would be fatal.
        return;
    }

    foreach ($html->find("a") as $li) {
        $url = perfect_url($li->href, $u);
        $enurl = urlencode($url);
        if ($url != '' && substr($url, 0, 4) != "mail" && substr($url, 0, 4) != "java" && !array_key_exists($enurl, $found_urls)) {
            $found_urls[$enurl] = 1;
            // The link text comes from a remote page: escape it for HTML
            // output, but leave entities that are already encoded alone.
            echo htmlspecialchars($url, ENT_QUOTES, 'UTF-8', false) . "<br/>";
        }
    }
}
// Script entry point: crawl the hard-coded start page.
crawl_site("http://facebook.com");
// NOTE(review): $url is not assigned anywhere in the visible top-level code;
// confirm it is defined before this call or remove the call.
crawl_site($url);
Beispiel #3
0
    $content = $html->find('div[id=content]');
    $rss_head = "<rss version=\"2.0\"><channel>";
    $rss_title = "<title>HD-Area Suche</title>";
    $rss_desc = "<description>HD-Area-RSS-Feed-Generator</description>";
    $rss_link = "<link>http://www.hd-area.org/?s=search&q=" . $u . "/</link>";
    $rss_item = "";
    $rss_tail = "</channel></rss>";
    foreach ($html->find('div[id=content]') as $element) {
        $urls = $element->find('a');
        foreach ($urls as $url) {
            $title = $url->title;
            $link = $url->href;
            if (!empty($title)) {
                $rss_item = $rss_item . "<item>";
                $rss_item = $rss_item . "<title>" . $title . "</title>";
                $rss_item = $rss_item . "<link>" . $link . "</link>";
                $rss_item = $rss_item . "</item>";
            }
        }
    }
    echo $rss_head;
    echo $rss_title;
    echo $rss_desc;
    echo $rss_link;
    echo $rss_item;
    echo $rss_tail;
    echo "\n";
}
// Disabled alternative invocation: pass the first command-line argument.
//crawl_site($argv[1]);
// Active invocation: pass the 'var' query-string parameter.
// NOTE(review): the parameter is used without checking that it is set.
crawl_site($_GET['var']);
 if(($bp['path']!="/" && $bp['path']!="") || $bp['path']==''){
  if($bp['scheme']==""){$scheme="http";}else{$scheme=$bp['scheme'];}
  $b=$scheme."://".$bp['host']."/";
 }
 if(substr($u,0,2)=="//"){
  $u="http:".$u;
 }
 if(substr($u,0,4)!="http"){
  $u=rel2abs($u,$b);
 }
 return $u;
}
/**
 * Crawl one page and print every link on it that has not been printed before.
 *
 * The time of the last fetch is stored in the global $crawled_urls (keyed by
 * the url-encoded page URL); a page fetched less than 25 seconds ago is not
 * fetched again. Links that were already printed are remembered in the global
 * $found_urls (keyed by the url-encoded link).
 *
 * @param string $u URL of the page to crawl.
 * @return void Each new link is echoed as an <li><a> element.
 */
function crawl_site($u){
 // $found_urls must be imported too: without it the duplicate check below
 // ran against an undefined local variable on every call.
 global $crawled_urls, $found_urls;
 // The registries may not have been initialised by the including script;
 // array_key_exists() rejects non-array arguments.
 if(!is_array($crawled_urls)){$crawled_urls=array();}
 if(!is_array($found_urls)){$found_urls=array();}
 $uen=urlencode($u);
 // Timestamps are "YmdHis" strings, so they order chronologically.
 $cutoff=date("YmdHis",strtotime('-25 seconds', time()));
 if(array_key_exists($uen,$crawled_urls) && !($crawled_urls[$uen] < $cutoff)){
  // Fetched recently enough; nothing to do.
  return;
 }
 $html = file_get_html($u);
 $crawled_urls[$uen]=date("YmdHis");
 if(!$html){
  // NOTE(review): file_get_html() is defined outside this snippet; it is
  // presumably Simple HTML DOM's helper, which returns false when the page
  // cannot be loaded. Calling ->find() on that would be fatal.
  return;
 }
 foreach($html->find("a") as $li){
  $url=perfect_url($li->href,$u);
  $enurl=urlencode($url);
  if($url!='' && substr($url,0,4)!="mail" && substr($url,0,4)!="java" && !array_key_exists($enurl,$found_urls)){
   $found_urls[$enurl]=1;
   // The href comes from a remote page: escape it for the HTML context,
   // but leave entities that are already encoded alone.
   $safe=htmlspecialchars($url,ENT_QUOTES,'UTF-8',false);
   echo "<li><a target='_blank' href='".$safe."'>".$safe."</a></li>";
  }
 }
}
// Script entry point: crawl the hard-coded start page.
crawl_site("http://www.subinsb.com");
?>