Example #1
/**
 * crawl method
 * Creates the crawler object and sets the options for crawling
 *
 * @param string $u URL to start crawling from
 */
function crawl($u)
{
    $C = new MyCrawler();
    $C->setURL($u);
    $C->addContentTypeReceiveRule("#text/html#");
    /* Only receive HTML pages */
    $C->addURLFilterRule("#\\.(jpg|gif|png|pdf|jpeg|svg|css|js)\$# i");
    /* We don't want to crawl non-HTML resources */
    $C->setTrafficLimit(2000 * 1024);
    $C->obeyRobotsTxt(true);
    /* Obey the site's robots.txt */
    $C->go();
}
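This helper assumes that a MyCrawler subclass of the library's PHPCrawler class is already defined, as in the examples below. A minimal sketch of such a subclass plus a call to the helper; the include path is an assumption, adjust it to your installation:

// Sketch only: the path to the PHPCrawl main class is an assumption
include("libs/PHPCrawler.class.php");

class MyCrawler extends PHPCrawler
{
    // Called by PHPCrawl once for every received document
    function handleDocumentInfo($DocInfo)
    {
        echo $DocInfo->url . "\n";
    }
}

crawl("http://www.example.com/");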
Example #2
class MyCrawler extends PHPCrawler
{
    function handleDocumentInfo($DocInfo)
    {
        // Line break for output: "\n" in CLI mode, "<br />" otherwise
        $lb = (PHP_SAPI == "cli") ? "\n" : "<br />";

        // Now you should do something with the content of the actual
        // received page or file ($DocInfo->source); we skip it in this example
        echo $lb;
        flush();
    }
}
// Now, create an instance of your class, define the behaviour
// of the crawler (see the class reference for more options and details)
// and start the crawling process.
$crawler = new MyCrawler();
// URL to crawl
$crawler->setURL("www.php.net");
// Only receive content of files with content-type "text/html"
$crawler->addContentTypeReceiveRule("#text/html#");
// Ignore links to pictures; don't even request pictures
$crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png)\$# i");
// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);
// Set the traffic limit to 1 MB (in bytes;
// for testing we don't want to "suck" the whole site)
$crawler->setTrafficLimit(1000 * 1024);
// That's enough, now here we go
$crawler->go();
// At the end, after the process is finished, we print a short
// report (see method getProcessReport() for more information)
$report = $crawler->getProcessReport();
if (PHP_SAPI == "cli") {
    $lb = "\n";
} else {
    $lb = "<br />";
}
echo "Summary:" . $lb;
echo "Links followed: " . $report->links_followed . $lb;
echo "Documents received: " . $report->files_received . $lb;
echo "Bytes received: " . $report->bytes_received . " bytes" . $lb;
echo "Process runtime: " . $report->process_runtime . " sec" . $lb;
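A note on scope: by default PHPCrawl stays on the host of the entry URL. The 0.8.x series provides setFollowMode() to widen or narrow this explicitly; the mode numbers below are the documented ones, but verify them against your version's class reference:

// 0 = follow every link, 1 = stay in domain, 2 = stay on host (default), 3 = stay in path
$crawler->setFollowMode(1); // e.g. stay within the entry URL's domain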
Example #3

class MyCrawler extends PHPCrawler
{
    function handleDocumentInfo($DocInfo)
    {
        // Line break for output: "\n" in CLI mode, "<br />" otherwise
        $lb = (PHP_SAPI == "cli") ? "\n" : "<br />";

        // Now you should do something with the content of the actual
        // received page or file ($DocInfo->source); we skip it in this example
        echo $lb;
        flush();
    }
}
// Now, create an instance of your class, define the behaviour
// of the crawler (see the class reference for more options and details)
// and start the crawling process.
$crawler = new MyCrawler();
// URL to crawl (the entry-page of the mysql-documentation on php.net)
$crawler->setURL("http://www.php.net/manual/en/book.mysql.php");
// Only receive content of documents with content-type "text/html"
$crawler->addContentTypeReceiveRule("#text/html#");
// Ignore links to pictures, css-documents etc (prefilter)
$crawler->addURLFilterRule("#\\.(jpg|gif|png|pdf|jpeg|css|js)\$# i");
// Every URL within the mysql-documentation looks like
// "http://www.php.net/manual/en/function.mysql-affected-rows.php"
// or "http://www.php.net/manual/en/mysql.setup.php", they all contain
// "http://www.php.net/manual/en/" followed by  "mysql" somewhere.
// So we add a corresponding follow-rule to the crawler.
$crawler->addURLFollowRule("#^http://www.php.net/manual/en/.*mysql[^a-z]# i");
// That's it, start crawling using 5 processes
$crawler->goMultiProcessed(5);
// At the end, after the process is finished, we print a short
// report (see method getProcessReport() for more information)
$report = $crawler->getProcessReport();
if (PHP_SAPI == "cli") {
    $lb = "\n";
} else {
    $lb = "<br />";
}
echo "Summary:" . $lb;
echo "Links followed: " . $report->links_followed . $lb;
echo "Documents received: " . $report->files_received . $lb;
echo "Bytes received: " . $report->bytes_received . " bytes" . $lb;
echo "Process runtime: " . $report->process_runtime . " sec" . $lb;
Example #4
         $directoryName = 'crawled/' . $year . '-' . $semester . '/' . $code;
         if (!file_exists($directoryName)) {
             mkdir($directoryName, 0755, true);
         }
         $crawler = new MyCrawler();
         $crawler->code = $code;
         $crawler->file_count = 0;
         $crawler->directoryName = $directoryName;
         // URL to crawl
         $curriculum_year = $year < 2013 ? 2008 : 2013;
         $url = "https://six.akademik.itb.ac.id/publik/daftarkelas.php?ps=" . $code . "&semester=" . $semester . "&tahun=" . $year . "&th_kur=" . $curriculum_year;
         $crawler->setURL($url);
         // Only receive content of files with content-type "text/html"
         $crawler->addContentTypeReceiveRule("#text/html#");
         // Ignore links to pictures; don't even request pictures
         $crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png)\$# i");
         // Don't let the crawler go back to the main page
         $crawler->addURLFilterRule("#displayprodikelas.php# i");
         // That's enough, now here we go
         echo "Start crawling for year " . $year . " semester " . $semester;
         $crawler->go();
         // At the end, after the process is finished, we print a short
         // report (see method getProcessReport() for more information)
         $report = $crawler->getProcessReport();
         echo "Summary for " . $code . ":" . $lb;
         echo "Links followed: " . $report->links_followed . $lb;
         echo "Documents received: " . $report->files_received . $lb;
         echo "Bytes received: " . $report->bytes_received . " bytes" . $lb;
         echo "Process runtime: " . $report->process_runtime . " sec" . $lb;
     }
 }
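The class definition for this example was cut off above. Given the code, file_count and directoryName properties assigned to the crawler, its handleDocumentInfo() presumably saves each received page to disk. A hypothetical sketch; the file-naming scheme is an assumption, not the original code:

class MyCrawler extends PHPCrawler
{
    public $code;
    public $file_count;
    public $directoryName;

    function handleDocumentInfo($DocInfo)
    {
        // Write the received HTML to a numbered file in the target
        // directory (naming scheme is a guess)
        $path = $this->directoryName . "/" . $this->code . "-" . $this->file_count . ".html";
        file_put_contents($path, $DocInfo->source);
        $this->file_count++;
    }
}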
Example #5

class MyCrawler extends PHPCrawler
{
    function handleDocumentInfo($DocInfo)
    {
        // Process the received document ($DocInfo->source) here
        flush();
    }
}
// Now, create an instance of your class, define the behaviour
// of the crawler (see the class reference for more options and details)
// and start the crawling process.
$crawler = new MyCrawler();
// URL to crawl
$crawler->setURL("localhost.p2.gta.charlie");
$crawler->obeyNoFollowTags(TRUE);
$crawler->obeyRobotsTxt(TRUE);
$crawler->enableAggressiveLinkSearch(FALSE);
// Only receive content of files with content-type "text/html"
$crawler->addContentTypeReceiveRule("#text/html#");
// Ignore links to pictures, stylesheets and scripts; don't even request them
$crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png|css|js)([?].*)?\$# i");
// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);
// Set the traffic limit to 1 MB (in bytes;
// for testing we don't want to "suck" the whole site)
$crawler->setTrafficLimit(1000 * 1024);
// That's enough, now here we go
$crawler->go();
// At the end, after the process is finished, we print a short
// report (see method getProcessReport() for more information)
$report = $crawler->getProcessReport();
if (PHP_SAPI == "cli") {
    $lb = "\n";
} else {
    $lb = "<br />";
}
echo "Summary:" . $lb;
echo "Links followed: " . $report->links_followed . $lb;
echo "Documents received: " . $report->files_received . $lb;
echo "Bytes received: " . $report->bytes_received . " bytes" . $lb;
echo "Process runtime: " . $report->process_runtime . " sec" . $lb;
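A closing note on the politeness options used in this example: obeyRobotsTxt(TRUE) makes the crawler fetch and honour the site's robots.txt, and obeyNoFollowTags(TRUE) makes it skip links flagged as nofollow. When obeying robots.txt it also helps to identify your bot; PHPCrawl's setUserAgentString() can do that (the string below is an assumption, use your own):

$crawler->setUserAgentString("MyTestBot/1.0 (+http://www.example.com/bot.html)");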