$scheme = $bp['scheme']; } $b = $scheme . "://" . $bp['host'] . "/"; } if (substr($u, 0, 2) == "//") { $u = "http:" . $u; } if (substr($u, 0, 4) != "http") { $u = rel2abs($u, $b); } return $u; }

/**
 * Fetch one page and print every link on it that has not been printed before.
 *
 * The page is skipped when the global $crawled_urls table shows it was
 * fetched within the last 25 seconds. Each anchor's href is passed through
 * perfect_url() and, unless it is empty, starts with "mail"/"java", or is
 * already recorded in the global $found_urls table, it is echoed as an
 * <li><a> element.
 *
 * @param string $u URL of the page to fetch.
 * @return void Output is written with echo; nothing is returned.
 */
function crawl_site($u)
{
    // Both tables must be imported: without `global $found_urls` the
    // de-duplication table would be a fresh undefined local on every call.
    global $crawled_urls, $found_urls;

    $uen = urlencode($u);
    $cutoff = date("YmdHis", strtotime('-25 seconds', time()));

    // isset() instead of array_key_exists() so an uninitialised table is
    // treated as "nothing recorded yet" rather than raising an error.
    if (isset($crawled_urls[$uen]) && $crawled_urls[$uen] >= $cutoff) {
        return;
    }

    $html = file_get_html($u);
    // Record the attempt even when the fetch fails, so a broken URL is not
    // retried inside the 25 second window.
    $crawled_urls[$uen] = date("YmdHis");

    // NOTE(review): file_get_html() is defined outside this chunk; presumably
    // the Simple HTML DOM helper, which yields a non-object when the page
    // cannot be loaded — confirm.
    if (!is_object($html)) {
        return;
    }

    foreach ($html->find("a") as $li) {
        $url = perfect_url($li->href, $u);
        if ($url == '') {
            continue;
        }

        // Skips targets beginning with "mail" / "java" (e.g. mailto:, javascript:).
        $prefix = substr($url, 0, 4);
        if ($prefix === "mail" || $prefix === "java") {
            continue;
        }

        $enurl = urlencode($url);
        if (isset($found_urls[$enurl])) {
            continue;
        }
        $found_urls[$enurl] = 1;

        // The href comes from a remote page: escape it before placing it in
        // a single-quoted attribute and in element text.
        $safe = htmlspecialchars($url, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
        echo "<li><a target='_blank' href='" . $safe . "'>" . $safe . "</a></li>";
    }
}

crawl_site("https://crawlbot.isc");
} $b = $scheme . "://" . $bp['host'] . "/"; } if (substr($u, 0, 2) == "//") { $u = "http:" . $u; } if (substr($u, 0, 4) != "http") { $u = rel2abs($u, $b); } return $u; }

/**
 * Fetch one page and print every link on it that has not been printed before.
 *
 * A page already present in the global $crawled_urls table with a timestamp
 * from the last 25 seconds is skipped. Each anchor's href is passed through
 * perfect_url(); non-empty results that do not start with "mail"/"java" and
 * are not yet in the global $found_urls table are recorded there and echoed,
 * followed by "<br/>".
 *
 * @param string $u URL of the page to fetch.
 * @return void Output is written with echo; nothing is returned.
 */
function crawl_site($u)
{
    global $crawled_urls, $found_urls;

    $uen = urlencode($u);
    $cutoff = date("YmdHis", strtotime('-25 seconds', time()));

    // Guard clause: fetched recently enough, nothing to do.
    if (isset($crawled_urls[$uen]) && $crawled_urls[$uen] >= $cutoff) {
        return;
    }

    $html = file_get_html($u);
    // Record the attempt even when the fetch fails, so a broken URL is not
    // retried inside the 25 second window.
    $crawled_urls[$uen] = date("YmdHis");

    // NOTE(review): file_get_html() is defined outside this chunk; presumably
    // the Simple HTML DOM helper, which yields a non-object when the page
    // cannot be loaded — confirm. Calling ->find() on that would be fatal.
    if (!is_object($html)) {
        return;
    }

    foreach ($html->find("a") as $li) {
        $url = perfect_url($li->href, $u);
        if ($url == '') {
            continue;
        }

        // Skips targets beginning with "mail" / "java" (e.g. mailto:, javascript:).
        $prefix = substr($url, 0, 4);
        if ($prefix === "mail" || $prefix === "java") {
            continue;
        }

        $enurl = urlencode($url);
        if (isset($found_urls[$enurl])) {
            continue;
        }
        $found_urls[$enurl] = 1;

        // The URL originates from a remote page and is emitted into HTML
        // output, so it is escaped first.
        echo htmlspecialchars($url, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') . "<br/>";
    }
}

crawl_site("http://facebook.com");
crawl_site($url);
$content = $html->find('div[id=content]'); $rss_head = "<rss version=\"2.0\"><channel>"; $rss_title = "<title>HD-Area Suche</title>"; $rss_desc = "<description>HD-Area-RSS-Feed-Generator</description>"; $rss_link = "<link>http://www.hd-area.org/?s=search&q=" . $u . "/</link>"; $rss_item = ""; $rss_tail = "</channel></rss>"; foreach ($html->find('div[id=content]') as $element) { $urls = $element->find('a'); foreach ($urls as $url) { $title = $url->title; $link = $url->href; if (!empty($title)) { $rss_item = $rss_item . "<item>"; $rss_item = $rss_item . "<title>" . $title . "</title>"; $rss_item = $rss_item . "<link>" . $link . "</link>"; $rss_item = $rss_item . "</item>"; } } } echo $rss_head; echo $rss_title; echo $rss_desc; echo $rss_link; echo $rss_item; echo $rss_tail; echo "\n"; } //crawl_site($argv[1]); crawl_site($_GET['var']);
if(($bp['path']!="/" && $bp['path']!="") || $bp['path']==''){ if($bp['scheme']==""){$scheme="http";}else{$scheme=$bp['scheme'];} $b=$scheme."://".$bp['host']."/"; } if(substr($u,0,2)=="//"){ $u="http:".$u; } if(substr($u,0,4)!="http"){ $u=rel2abs($u,$b); } return $u; }

/**
 * Fetch one page and print every link on it that has not been printed before.
 *
 * The page is skipped when the global $crawled_urls table shows it was
 * fetched within the last 25 seconds. Each anchor's href is passed through
 * perfect_url() and, unless it is empty, starts with "mail"/"java", or is
 * already recorded in the global $found_urls table, it is echoed as an
 * <li><a> element.
 *
 * @param string $u URL of the page to fetch.
 * @return void Output is written with echo; nothing is returned.
 */
function crawl_site($u)
{
    // $found_urls has to be imported alongside $crawled_urls; otherwise the
    // "already listed" check runs against an undefined local variable and
    // the same link can be printed on every call.
    global $crawled_urls, $found_urls;

    $uen = urlencode($u);
    $cutoff = date("YmdHis", strtotime('-25 seconds', time()));

    // isset() instead of array_key_exists() so an uninitialised table is
    // treated as "nothing recorded yet" rather than raising an error.
    if (isset($crawled_urls[$uen]) && $crawled_urls[$uen] >= $cutoff) {
        return;
    }

    $html = file_get_html($u);
    // Record the attempt even when the fetch fails, so a broken URL is not
    // retried inside the 25 second window.
    $crawled_urls[$uen] = date("YmdHis");

    // NOTE(review): file_get_html() is defined outside this chunk; presumably
    // the Simple HTML DOM helper, which yields a non-object when the page
    // cannot be loaded — confirm.
    if (!is_object($html)) {
        return;
    }

    foreach ($html->find("a") as $li) {
        $url = perfect_url($li->href, $u);
        if ($url == '') {
            continue;
        }

        // Skips targets beginning with "mail" / "java" (e.g. mailto:, javascript:).
        $prefix = substr($url, 0, 4);
        if ($prefix === "mail" || $prefix === "java") {
            continue;
        }

        $enurl = urlencode($url);
        if (isset($found_urls[$enurl])) {
            continue;
        }
        $found_urls[$enurl] = 1;

        // The href comes from a remote page: escape it before placing it in
        // a single-quoted attribute and in element text.
        $safe = htmlspecialchars($url, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
        echo "<li><a target='_blank' href='" . $safe . "'>" . $safe . "</a></li>";
    }
}

crawl_site("http://www.subinsb.com");
?>