Example #1
function crawl_url_once($url)
{
    // Main crawling function: fetch one page and queue every new link found on it
    if ($GLOBALS['debug']) {
        echo "crawl_url_once\n";
    }
    save_crawled_url($url);
    $in = @file($url);
    if (!$in || !is_array($in)) {
        return 1;
    }
    foreach ($in as $line) {
        // spliti() was removed in PHP 7; preg_split() with the /i flag is the equivalent
        $line = preg_split('~href="http://~i', $line);
        if (count($line) > 1) {
            array_shift($line);
            //print_r($line); //Debug
            foreach ($line as $nurl) {
                // Cut each candidate off at the first query string, anchor, wildcard, or closing quote
                $nurl = preg_split('/[?#*"]/', $nurl);
                $nurl = 'http://' . trim(htmlspecialchars_decode($nurl[0]));
                //echo($nurl."\n"); //Debug
                if (check_crawl_url($nurl)) {
                    save_crawled_url($nurl);
                }
            }
        }
    }
}
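
Both examples call check_crawl_url() and save_crawled_url(), which are not part of this listing. A minimal sketch of what they might look like, assuming the visited set is kept in a global array (the array name and the validation rules are illustrative assumptions, not the original code):

$GLOBALS['crawled_urls'] = array();

function check_crawl_url($url)
{
    // Assumed behavior: accept only well-formed URLs that have not been stored yet
    if (!filter_var($url, FILTER_VALIDATE_URL)) {
        return false;
    }
    return !in_array($url, $GLOBALS['crawled_urls'], true);
}

function save_crawled_url($url)
{
    // Assumed behavior: remember the URL so it is not queued twice
    $GLOBALS['crawled_urls'][] = $url;
}

With helpers like these in place, crawl_url_once('http://example.com/') fetches the page once and records each absolute http:// link it finds exactly one time.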
Example #2
     echo "-Parsing: {$url}\n";
 }
 if (!$use_curlbin) {
     $page = @file_get_contents($url, false, null, 0, $max_size);
 } else {
     $page = shell_curl_get($url, $socket_timeout, $max_size);
 }
 preg_match_all('(http:\\/\\/[_a-zA-Z0-9\\.\\-]+\\.[a-zA-Z]{2,4}\\/{1}[-_~&=\\ ?\\.a-z0-9\\/]*)', htmlspecialchars_decode($page), $new_urls);
 $new_urls = $new_urls[0];
 foreach ($new_urls as $new_url) {
     //Process grabed URLs
     if ($debug) {
         $c++;
     }
     ///Debug
     if (check_crawl_url($new_url)) {
         echo $new_url . "\n";
         fwrite($fp, $new_url . "\n");
         if ($use_sortbin) {
             //Handle sorting (unique URLs)
             $sort_next--;
             if ($sort_next <= 0) {
                 fclose($fp);
                 shell_sort_file($file);
                 if ($debug) {
                     $sorted_db = 0;
                     $fp = fopen($file, 'r');
                     while (!feof($fp)) {
                         fgets($fp);
                         $sorted_db++;
                     }