// Create an instance of your crawler class, define the behaviour of the
// crawler (see the class-reference for more options and details) and start
// the crawling-process.
$crawler = new MyCrawler();

// URL to crawl (the entry-page of the mysql-documentation on php.net)
$crawler->setURL("http://www.php.net/manual/en/book.mysql.php");

// Only receive content of documents with content-type "text/html".
// NOTE(review): switched from the deprecated alias addReceiveContentType()
// to addContentTypeReceiveRule() for consistency with the other example
// in this file — both methods behave identically in PHPCrawl.
$crawler->addContentTypeReceiveRule("#text/html#");

// Ignore links to pictures, css-documents etc. (prefilter, so these URLs
// are never even requested).
$crawler->addURLFilterRule("#\\.(jpg|gif|png|pdf|jpeg|css|js)\$# i");

// Every URL within the mysql-documentation looks like
// "http://www.php.net/manual/en/function.mysql-affected-rows.php"
// or "http://www.php.net/manual/en/mysql.setup.php"; they all contain
// "http://www.php.net/manual/en/" followed by "mysql" somewhere.
// So we add a corresponding follow-rule to the crawler.
// (The dots in the hostname are intentionally left unescaped — they match
// any character in PCRE, which is harmless for this example.)
$crawler->addURLFollowRule("#^http://www.php.net/manual/en/.*mysql[^a-z]# i");

// That's it — start crawling using 5 parallel processes.
$crawler->goMultiProcessed(5);

// After the process has finished, print a short report
// (see method getProcessReport() for more information).
$report = $crawler->getProcessReport();

// Choose the line-break suited to the environment (CLI vs. web).
if (PHP_SAPI === "cli") {
    $lb = "\n";
} else {
    $lb = "<br />";
}

echo "Summary:" . $lb;
echo "Links followed: " . $report->links_followed . $lb;
echo "Documents received: " . $report->files_received . $lb;
echo "Bytes received: " . $report->bytes_received . " bytes" . $lb;
echo "Process runtime: " . $report->process_runtime . " sec" . $lb;
} else {
    echo 'skip these pages <br>';
}
}
$html->clear();
unset($html);
}
flush();
}
}

// Create the crawler instance for the second example.
$crawler = new MyCrawler();

// URL to crawl
$crawler->setURL("http://harcourts.co.nz/");

// Follow only URLs containing the word "property" (case-insensitive).
$crawler->addURLFollowRule("#property# i");

// Only receive content of files with content-type "text/html".
$crawler->addContentTypeReceiveRule("#text/html#");

// Ignore links to pictures — don't even request them.
$crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png)\$# i");

// Store and send cookie-data like a browser does.
$crawler->enableCookieHandling(true);

// Traffic limit in bytes: 1000 * 104857600 ≈ 97.7 GB (effectively
// unlimited for most sites).
// NOTE(review): the previous comment claimed "10mb" — if a 10 MB cap was
// actually intended here, this should read 10 * 1024 * 1024 instead.
$crawler->setTrafficLimit(1000 * 104857600);

// Start the crawler.
$crawler->go();

// After the process has finished, print the report.
$report = $crawler->getProcessReport();

// Choose the line-break suited to the environment (CLI vs. web).
if (PHP_SAPI === "cli") {
    $lb = "\n";
} else {