function crawl_url_once($url) { //Main crawling function
    if ($GLOBALS['debug']) {
        echo "crawl_url_once\n";
    }
    save_crawled_url($url);
    $in = @file($url);
    if (!$in || !is_array($in)) {
        return 1;
    }
    foreach ($in as $line) {
        //Split the line on every href="http:// occurrence (case-insensitive)
        $line = preg_split('~href="http://~i', $line);
        if (count($line) > 1) {
            array_shift($line); //Drop the text before the first link
            //print_r($line); //Debug
            foreach ($line as $nurl) {
                //Cut the candidate URL at the first ?, #, * or closing quote
                $nurl = preg_split('~[?#*"]~', $nurl);
                $nurl = 'http://' . trim(htmlspecialchars_decode($nurl[0]));
                //echo($nurl."\n"); //Debug
                if (check_crawl_url($nurl)) {
                    save_crawled_url($nurl);
                }
            }
        }
    }
}
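
//save_crawled_url() and check_crawl_url() are not defined in this listing.
//The guarded stubs below are only a hedged sketch of what they are assumed
//to do (remember visited URLs and filter new candidates) so the function
//above can be tried on its own; the project's real implementations may differ.
if (!function_exists('save_crawled_url')) {
    function save_crawled_url($url) {
        //Assumed behaviour: remember the URL so it is not processed twice
        $GLOBALS['crawled_urls'][] = $url;
    }
}

if (!function_exists('check_crawl_url')) {
    function check_crawl_url($url) {
        //Assumed behaviour: accept only well-formed URLs not saved yet
        if (!filter_var($url, FILTER_VALIDATE_URL)) {
            return false;
        }
        $seen = isset($GLOBALS['crawled_urls']) ? $GLOBALS['crawled_urls'] : array();
        return !in_array($url, $seen, true);
    }
}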
echo "-Parsing: {$url}\n"; } if (!$use_curlbin) { $page = @file_get_contents($url, false, null, 0, $max_size); } else { $page = shell_curl_get($url, $socket_timeout, $max_size); } preg_match_all('(http:\\/\\/[_a-zA-Z0-9\\.\\-]+\\.[a-zA-Z]{2,4}\\/{1}[-_~&=\\ ?\\.a-z0-9\\/]*)', htmlspecialchars_decode($page), $new_urls); $new_urls = $new_urls[0]; foreach ($new_urls as $new_url) { //Process grabed URLs if ($debug) { $c++; } ///Debug if (check_crawl_url($new_url)) { echo $new_url . "\n"; fwrite($fp, $new_url . "\n"); if ($use_sortbin) { //Handle sorting (unique URLs) $sort_next--; if ($sort_next <= 0) { fclose($fp); shell_sort_file($file); if ($debug) { $sorted_db = 0; $fp = fopen($file, 'r'); while (!feof($fp)) { fgets($fp); $sorted_db++; }