/**
 * Builds the converter around the caller-owned setup and output arrays
 * and immediately performs the conversion.
 *
 * Both arrays are bound by reference, so any writes made during (or
 * after) conversion remain visible to the caller.
 *
 * @param array &$setup_array  Raw setup data to convert (by reference).
 * @param array &$output_array Destination for the converted data (by reference).
 */
function __construct(&$setup_array, &$output_array)
{
    parent::__construct();

    // Bind references first, then convert: convertSetupArray() reads
    // setup_array_raw and writes output_array.
    $this->output_array    =& $output_array;
    $this->setup_array_raw =& $setup_array;

    $this->convertSetupArray();
}
/**
 * Builds the converter around the caller-owned setup and output arrays,
 * performs the conversion, and pins the crawler to HTTP/1.0.
 *
 * Both arrays are bound by reference, so any writes made during (or
 * after) conversion remain visible to the caller.
 *
 * @param array &$setup_array  Raw setup data to convert (by reference).
 * @param array &$output_array Destination for the converted data (by reference).
 */
function __construct(&$setup_array, &$output_array)
{
    parent::__construct();

    // Bind references first, then convert: convertSetupArray() reads
    // setup_array_raw and writes output_array.
    $this->output_array    =& $output_array;
    $this->setup_array_raw =& $setup_array;

    $this->convertSetupArray();

    // Force HTTP/1.0 for all requests made by this instance.
    $this->setHTTPProtocolVersion(PHPCrawlerHTTPProtocols::HTTP_1_0);
}
/**
 * Initialises the crawler for the given primary domain.
 *
 * Accepts either a full URL ("http://example.com/foo") or a bare
 * host/path string; the host part (or, failing that, the path part)
 * is used as the primary domain and passed to setURL().
 *
 * @param string $primary URL or host name identifying the primary domain.
 */
public function __construct($primary)
{
    parent::__construct();

    // parse_url() returns false for seriously malformed URLs, and the
    // 'path' key is not guaranteed to exist — the original code indexed
    // into both unconditionally, which triggers an undefined-index
    // notice (or a fatal error on a false result under PHP 8). Fall
    // back to the string as given when neither part is available.
    $parsed = parse_url($primary);
    if (is_array($parsed)) {
        if (isset($parsed['host'])) {
            $primary = $parsed['host'];
        } elseif (isset($parsed['path'])) {
            $primary = $parsed['path'];
        }
    }

    $this->primary_domain = $primary;
    $this->setURL($this->primary_domain);
}
/**
 * Initiates a new crawler.
 *
 * @param mixed $crawler_id  Identifier assigned to this crawler instance.
 * @param array $body_xpaths XPath expressions locating the document body;
 *                           when empty, defaults to array('/html/body').
 */
public function __construct($crawler_id, $body_xpaths)
{
    $this->crawler_id = $crawler_id;

    // Fall back to the whole <body> element when no XPaths were supplied.
    $this->body_xpaths = empty($body_xpaths) ? array('/html/body') : $body_xpaths;

    parent::__construct();
}
<?php
// It may take a while to crawl a site ...
set_time_limit(10000);

// NOTE(review): $depth appears unused — the depth limit below is
// hard-coded to 2; verify before relying on this variable.
$depth = 2;

// Include the phpcrawl main class
include_once "PHPCrawl_083/libs/PHPCrawler.class.php";

// Extend the class and override the handleDocumentInfo()-method
// Now, create a instance of your class, define the behaviour
// of the crawler (see class-reference for more options and details)
// and start the crawling-process.
$crawler = new PHPCrawler();

// URL to crawl
$crawler->setURL("www.vnexpress.net/");
$crawler->setCrawlingDepthLimit(2);

// Only receive content of files with content-type "text/html"
$crawler->addContentTypeReceiveRule("#text/html#");

// Ignore links to pictures, don't even request pictures
$crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png)\$# i");

// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);

// Set the traffic-limit to 1 MB (in bytes,
// for testing we don't want to "suck" the whole site)
// NOTE(review): 10000 * 1024 is ~10 MB, not 1 MB as the comment claims —
// confirm which limit is intended.
$crawler->setTrafficLimit(10000 * 1024);

// That's enough, now here we go
$crawler->go();

// At the end, after the process is finished, we print a short
// report (see method getProcessReport() for more information)
$report = $crawler->getProcessReport();

// Choose the line break appropriate for the environment (CLI vs. web).
if (PHP_SAPI == "cli") { $lb = "\n";
/**
 * Initialises the crawler process, seeding additional start URLs and
 * installing exclude-pattern filters on top of the parent's setup.
 *
 * @throws InvalidArgumentException When an exclude pattern is rejected
 *                                  as an invalid regular expression.
 */
protected function initCrawlerProcess()
{
    parent::initCrawlerProcess();

    // Seed the crawler's LinkCache with any additional URLs to crawl.
    // NOTE: This is using an undocumented API.
    $additionalUrls = $this->urlList->getExtraCrawlURLs();
    if ($additionalUrls) {
        foreach ($additionalUrls as $additionalUrl) {
            $this->LinkCache->addUrl(new PHPCrawlerURLDescriptor($additionalUrl));
        }
    }

    // Register each exclude pattern as a URL filter so matching URLs are
    // never fetched. '|' is used as the regex delimiter, so literal pipes
    // inside the pattern must be escaped first.
    $excludePatterns = $this->urlList->getExcludePatterns();
    if ($excludePatterns) {
        foreach ($excludePatterns as $excludePattern) {
            $delimited = '|' . str_replace('|', '\\|', $excludePattern) . '|';
            if (!$this->addURLFilterRule($delimited)) {
                throw new InvalidArgumentException(
                    'Exclude url pattern "' . $excludePattern . '" is not a valid regular expression.'
                );
            }
        }
    }
}