Ejemplo n.º 1
0
 /**
  * Create one PageBuilder per configured page and return them as a collection.
  *
  * Every entry of $config that has a 'url' set yields a PageBuilder wrapping
  * a new Page for that URL. Entries without a 'url' are skipped silently.
  * When an entry also has 'data_config' set, it is handed to the builder
  * through setDataConfig() before the builder is added to the collection.
  *
  * Expected shape of $config:
  *
  *     array(
  *         array( // one page
  *             'url'         => 'http://www.example.com',
  *             'data_config' => array(            // optional
  *                 'key'   => function ($pageObj) { return value },
  *                 'other' => 'xpath',
  *             ),
  *         ),
  *     )
  *
  * @param  array $config list of page configurations as described above
  * @return PageBuilderCollection collection holding one builder per entry
  *                               that had a 'url'; empty if there were none
  */
 public static function get(array $config)
 {
     $collection = new PageBuilderCollection();

     foreach ($config as $entry) {
         // An entry without a URL cannot be turned into a Page; skip it.
         if (!isset($entry['url'])) {
             continue;
         }

         $pageBuilder = new PageBuilder(new Page($entry['url']));

         // The data configuration is optional for each page.
         if (isset($entry['data_config'])) {
             $pageBuilder->setDataConfig($entry['data_config']);
         }

         $collection->add($pageBuilder);
     }

     return $collection;
 }
Ejemplo n.º 2
0
<?php

// Demo script: scrape the Hacker News front page and print a
// title => link listing inside a <pre> element.

// Collect libxml parse errors internally instead of emitting them as PHP warnings.
libxml_use_internal_errors(true);
include __DIR__ . '/../vendor/autoload.php';
use PageScraper\Page\Page;
use PageScraper\Builder\PageBuilder;
use PageScraper\Director\PageBuilderDirector;
// create a page object
$page = new Page();
// set the url that needs to be fetched
$page->setUrl('https://news.ycombinator.com');
// builder contains the logic to fetch the remote page
// by default there is one builder right now
// which uses file_get_contents to fetch the remote page
// we can add more builders which can use CURL or other
// technique to fetch the remote page
$builder = new PageBuilder($page);
// set the data that needs to be retrieved from the remote page
$builder->setDataConfig(array('titles' => '//td[@class="title"]//a/text()', 'links' => '//td[@class="title"]//a/@href'));
// use the director to instruct the builder to configure the page object
$director = new PageBuilderDirector($builder);
// finally fetch the remote page and configure the Page object
$director->buildPage();
// get the queried data
$data = $page->getData();

// Fall back to empty lists when a query produced nothing usable, so the
// pairing step below never receives a non-array argument.
$titles = (isset($data['titles']) && is_array($data['titles'])) ? $data['titles'] : array();
$links = (isset($data['links']) && is_array($data['links'])) ? $data['links'] : array();

// The two XPath queries run independently, so nothing guarantees they match
// the same number of nodes. array_combine() rejects lists of different
// lengths (warning + false before PHP 8, ValueError from PHP 8 on), so only
// pair them when the counts agree and otherwise show both raw lists.
if (count($titles) === count($links)) {
    $result = array_combine($titles, $links);
} else {
    $result = array('titles' => $titles, 'links' => $links);
}

// display it
// The text comes from a remote page and is untrusted: escape it before
// writing it into the HTML output so it cannot inject markup or scripts.
echo '<pre>';
echo htmlspecialchars(print_r($result, true), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
echo '</pre>';