Example #1

function __construct(&$setup_array, &$output_array)
 {
     parent::__construct();
     $this->setup_array_raw =& $setup_array;
     $this->output_array =& $output_array;
     $this->convertSetupArray();
 }
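Both constructor arguments are taken by reference, so anything the crawler writes into $this->output_array remains visible to the caller after construction. A minimal usage sketch, assuming this constructor belongs to a subclass named SetupArrayCrawler (an illustrative name, not part of any distribution):

<?php
$setup  = array('url' => 'http://www.example.com/');
$output = array();

// Reference parameters bind at the call site automatically;
// no '&' is needed (or allowed) when calling the constructor.
$crawler = new SetupArrayCrawler($setup, $output);

// Whatever convertSetupArray() and later crawling steps store in
// $this->output_array is reflected in $output as well.
var_dump($output);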
Example #2

 function __construct(&$setup_array, &$output_array)
 {
     parent::__construct();
     $this->setup_array_raw =& $setup_array;
     $this->output_array =& $output_array;
     $this->convertSetupArray();
     $this->setHTTPProtocolVersion(PHPCrawlerHTTPProtocols::HTTP_1_0);
 }
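This variant additionally pins the crawler to HTTP/1.0 via setHTTPProtocolVersion(); the PHPCrawlerHTTPProtocols class also defines HTTP_1_1. As a quick sketch, the same setting can be applied from outside a subclass:

<?php
// Force HTTP/1.0 requests on a plain PHPCrawler instance;
// passing PHPCrawlerHTTPProtocols::HTTP_1_1 switches it back.
$crawler = new PHPCrawler();
$crawler->setHTTPProtocolVersion(PHPCrawlerHTTPProtocols::HTTP_1_0);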
Example #3
 public function __construct($primary)
 {
     parent::__construct();
     $parsed = parse_url($primary);
     $primary = isset($parsed['host']) ? $parsed['host'] : $parsed['path'];
     $this->primary_domain = $primary;
     $this->setURL($this->primary_domain);
 }
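The host/path fallback covers URLs passed without a scheme: parse_url() only produces a 'host' key when the string carries a scheme, while a bare hostname ends up entirely in 'path'. A quick standalone check of that behaviour:

<?php
// With a scheme, the hostname is isolated under 'host':
var_dump(parse_url('http://www.example.com/news'));
// array('scheme' => 'http', 'host' => 'www.example.com', 'path' => '/news')

// Without a scheme, everything lands in 'path', hence the fallback:
var_dump(parse_url('www.example.com'));
// array('path' => 'www.example.com')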
Example #4

 /**
  * Initiates a new crawler.
  */
 public function __construct($crawler_id, $body_xpaths)
 {
     $this->crawler_id = $crawler_id;
     $this->body_xpaths = $body_xpaths;
     if (empty($this->body_xpaths)) {
         $this->body_xpaths = array('/html/body');
     }
     parent::__construct();
 }
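If the caller passes no XPath expressions, the constructor falls back to extracting the whole document body. A short sketch, assuming the class is named BodyExtractingCrawler (an illustrative name):

<?php
// Both instances end up with identical $body_xpaths,
// because the empty array triggers the '/html/body' fallback.
$a = new BodyExtractingCrawler('news-crawler', array());
$b = new BodyExtractingCrawler('news-crawler', array('/html/body'));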
Example #5
<?php

// It may take a while to crawl a site ...
set_time_limit(10000);
$depth = 2;
// Include the phpcrawl main class
include_once "PHPCrawl_083/libs/PHPCrawler.class.php";
// Normally you would extend the class and override its
// handleDocumentInfo() method (see the sketch after this example).
// Now create an instance, define the behaviour of the crawler
// (see the class reference for more options and details)
// and start the crawling process.
$crawler = new PHPCrawler();
// URL to crawl
$crawler->setURL("www.vnexpress.net/");
$crawler->setCrawlingDepthLimit($depth);
// Only receive content of files with content-type "text/html"
$crawler->addContentTypeReceiveRule("#text/html#");
// Ignore links to pictures, don't even request pictures
$crawler->addURLFilterRule("#\\.(jpg|jpeg|gif|png)\$# i");
// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);
// Set the traffic-limit to roughly 10 MB (in bytes;
// for testing we don't want to "suck" the whole site)
$crawler->setTrafficLimit(10000 * 1024);
// That's enough, now here we go
$crawler->go();
// At the end, after the process is finished, we print a short
// report (see method getProcessReport() for more information)
$report = $crawler->getProcessReport();
if (PHP_SAPI == "cli") {
    $lb = "\n";
Example #6

 protected function initCrawlerProcess()
 {
     parent::initCrawlerProcess();
     // Add additional URLs to crawl to the crawler's LinkCache
     // NOTE: This is using an undocumented API
     if ($extraURLs = $this->urlList->getExtraCrawlURLs()) {
         foreach ($extraURLs as $extraURL) {
             $this->LinkCache->addUrl(new PHPCrawlerURLDescriptor($extraURL));
         }
     }
     // Prevent URLs that matches the exclude patterns to be fetched
     if ($excludePatterns = $this->urlList->getExcludePatterns()) {
         foreach ($excludePatterns as $pattern) {
             $validRegExp = $this->addURLFilterRule('|' . str_replace('|', '\\|', $pattern) . '|');
             if (!$validRegExp) {
                 throw new InvalidArgumentException('Exclude url pattern "' . $pattern . '" is not a valid regular expression.');
             }
         }
     }
 }
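Note that each exclude pattern is wrapped in pipe delimiters with literal pipes escaped, and addURLFilterRule() reports an invalid expression by returning false, which the InvalidArgumentException above turns into a hard error. The underlying PCRE check can be reproduced without PHPCrawl (this mirrors, rather than calls, the library's validation):

<?php
// preg_match() returns false for a malformed pattern, which is in
// essence what addURLFilterRule()'s validation reports back.
var_dump(@preg_match('|foo(|', 'anything'));  // bool(false): unbalanced '('
var_dump(@preg_match('|/private/|', 'http://example.com/private/x')); // int(1)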