Exemple #1
0
    if ($opts->x || $opts->y || $opts->i) {
        die_single_configfile();
    }
    $config = Spizer_Config::load($opts->p, Spizer_Config::PHP, $section);
} else {
    die_single_configfile();
}
// Resolve the start URL: the first remaining command-line argument wins,
// otherwise the url entry of the loaded configuration is used.
$args = $opts->getRemainingArgs();
if (isset($args[0])) {
    $url = $args[0];
} else {
    $url = $config->url;
}
// Neither source produced a usable URL: call spizer_usage() and exit with
// status 1.
if (!$url) {
    spizer_usage();
    exit(1);
}
// Set up engine from the "engine" entry of the configuration
$engine = new Spizer_Engine((array) $config->engine);

/**
 * Set up next-link (pagerize) support
 */
// Build the cache from the pagerize->cache configuration entries, keep it in
// $cache, and hand it to the pagerize helper.
$cache = Zend_Cache::factory(
    $config->pagerize->cache->frontend,
    $config->pagerize->cache->backend,
    $config->pagerize->cache->frontendOptions->toArray(),
    $config->pagerize->cache->backendOptions->toArray()
);
Diggin_Scraper_Helper_Simplexml_Pagerize::setCache($cache);

// Set the siteinfo array (registered under the key 'mysiteinfo')
Diggin_Scraper_Helper_Simplexml_Pagerize::appendSiteInfo('mysiteinfo', $config->siteinfo->toArray());

// Request wedata: only query the service when loadSiteinfo('wedata') returns
// a falsy value.
$siteinfo = Diggin_Scraper_Helper_Simplexml_Pagerize::loadSiteinfo('wedata');
if (!$siteinfo) {
    require_once 'Diggin/Service/Wedata.php';
    //if (Diggin_Version::compareVersion('0.7'))
    //$pagerize = Diggin_Service_Wedata::getItems('AutoPagerize');
    $wedata = new Diggin_Service_Wedata();
    $wedata->setDatabaseName('AutoPagerize');
    // NOTE(review): $pagerize is not consumed anywhere in the visible code -
    // confirm it is used (e.g. appended as siteinfo) further on.
    $pagerize = $wedata->getItems();
}
// Keep the URL that was resolved earlier in this script (first command-line
// argument, falling back to $config->url). Unconditionally re-reading
// $args[0] here would discard the configuration fallback and would read an
// unset index when no positional argument was given. Only look at $args
// when nothing has been resolved yet, and guard the index with isset().
if (empty($url)) {
    $url = isset($args[0]) ? $args[0] : null;
}
// Still no URL: call spizer_usage() and exit with status 1.
if (!$url) {
    spizer_usage();
    exit(1);
}
// If the pcntl extension is available, register do_exit() as the handler for
// SIGABRT, SIGHUP, SIGQUIT, SIGINT and SIGTERM.
if (function_exists('pcntl_signal')) {
    declare (ticks=1);
    foreach (array(SIGABRT, SIGHUP, SIGQUIT, SIGINT, SIGTERM) as $exitSignal) {
        pcntl_signal($exitSignal, 'do_exit');
    }
    // Do not leave the loop variable behind in the script's global scope
    unset($exitSignal);
}
// Instantiate Spizer engine
$spizer = new Spizer_Engine(array(
    'delay'       => $delay,
    'savecookies' => $opts->savecookies,
    'lifo'        => true,
));

// Set logger: an SQLite logger writing to the file named by $log
$logger = new Spizer_Logger_Sqlite(array('dbfile' => $log));
$spizer->setLogger($logger);

// Set the spider to follow links, hrefs, images and script references;
// the handler's 'domain' option is the host part of the start URL.
$spizer->addHandler(new Spizer_Handler_LinkAppender(array(
    'domain' => parse_url($url, PHP_URL_HOST),
)));

// Add one string-match handler per keyword, each configured with
// matchcase = false, status 200 and content-type text/html.
foreach (array('error', 'warning') as $matchWord) {
    $spizer->addHandler(new Spizer_Handler_StringMatch(array(
        'match'        => $matchWord,
        'matchcase'    => false,
        'status'       => 200,
        'content-type' => 'text/html',
    )));
}
unset($matchWord);

// Go!
$spizer->run($url);
do_exit();
// -- end here --
// Some functions
function spizer_usage()
{
 /**
  * Forward a data record to the engine's log, tagged with this object's
  * name ($this->_name).
  *
  * @param array $data Data passed through unchanged to the engine's log()
  */
 protected function _log($data)
 {
     $engine = $this->_engine;
     $engine->log($this->_name, $data);
 }
Exemple #4
0
// Accept the first argument as the start URL only when it is present and
// parse_url() extracts a host from it; otherwise $url stays null.
$url = null;
if (isset($args[0]) && parse_url($args[0], PHP_URL_HOST)) {
    $url = $args[0];
}
// No valid URL: call spizer_usage() and exit with status 1.
if (!$url) {
    spizer_usage();
    exit(1);
}
// If the pcntl extension is available, register do_exit() as the handler for
// SIGABRT, SIGHUP, SIGQUIT, SIGINT and SIGTERM.
if (function_exists('pcntl_signal')) {
    declare (ticks=1);
    foreach (array(SIGABRT, SIGHUP, SIGQUIT, SIGINT, SIGTERM) as $exitSignal) {
        pcntl_signal($exitSignal, 'do_exit');
    }
    // Do not leave the loop variable behind in the script's global scope
    unset($exitSignal);
}
// Instantiate Spizer engine
$spizer = new Spizer_Engine(array(
    'delay'       => $delay,
    'savecookies' => $opts->savecookies,
    'lifo'        => true,
));

// Set logger: an SQLite logger writing to the file named by $log
$logger = new Spizer_Logger_Sqlite(array('dbfile' => $log));
$spizer->setLogger($logger);

// Set the spider to follow links, hrefs, images and script references;
// the handler's 'domain' option is the host part of the start URL.
$spizer->addHandler(new Spizer_Handler_LinkAppender(array(
    'domain' => parse_url($url, PHP_URL_HOST),
)));

// Register the Kumo debug handler with its 'do' option switched on
$spizer->addHandler(new Kumo_Handler_Debug(array(
    'do' => true,
)));
/**
$spizer->addHandler(new Kumo_Handler_ScrapeAndRequestSender(array(
    'queueAdapter' => 'Array',
    'queueOptions' => array(
        'name' => 'test'),
    'expression' => '//img',
    'type' => '@src',
)));
*/