// NOTE(review): the "} else {" below closes a branch opened before this excerpt.
    if ($opts->x || $opts->y || $opts->i) {
        // Any of the x / y / i options alongside p is rejected.
        die_single_configfile();
    }

    $config = Spizer_Config::load($opts->p, Spizer_Config::PHP, $section);
} else {
    die_single_configfile();
}

// Resolve the start URL: the first remaining CLI argument wins, otherwise
// fall back to the "url" entry of the loaded configuration.
$args = $opts->getRemainingArgs();
if (isset($args[0])) {
    $url = $args[0];
} else {
    $url = $config->url;
}

// Without a URL there is nothing to do: print usage and stop with status 1.
if (!$url) {
    spizer_usage();
    exit(1);
}

// Build the engine from the "engine" section of the configuration.
$engine = new Spizer_Engine((array) $config->engine);

/**
 * Next-link (Pagerize) setup
 */

// Create the cache from the pagerize.cache settings and hand it to the helper.
$cacheConfig = $config->pagerize->cache;
$cache = Zend_Cache::factory(
    $cacheConfig->frontend,
    $cacheConfig->backend,
    $cacheConfig->frontendOptions->toArray(),
    $cacheConfig->backendOptions->toArray()
);
Diggin_Scraper_Helper_Simplexml_Pagerize::setCache($cache);

// Set the siteinfo array taken from the configuration.
Diggin_Scraper_Helper_Simplexml_Pagerize::appendSiteInfo('mysiteinfo', $config->siteinfo->toArray());

// Request wedata only when loadSiteinfo('wedata') returns nothing usable.
$siteinfo = Diggin_Scraper_Helper_Simplexml_Pagerize::loadSiteinfo('wedata');
if (!$siteinfo) {
    require_once 'Diggin/Service/Wedata.php';

    // Disabled alternative kept from the original:
    //if (Diggin_Version::compareVersion('0.7'))
    //$pagerize = Diggin_Service_Wedata::getItems('AutoPagerize');
    $wedata = new Diggin_Service_Wedata();
    $wedata->setDatabaseName('AutoPagerize');
    $pagerize = $wedata->getItems();
}
// Make sure we have a URL. The offset is checked with isset() so that a
// missing argument goes straight to the usage text instead of first raising
// an "undefined offset" notice (a warning on PHP 8).
$url = isset($args[0]) ? $args[0] : null;
if (!$url) {
    spizer_usage();
    exit(1);
}

// If we have pcntl - route the common termination signals to do_exit()
if (function_exists('pcntl_signal')) {
    // Ticks let PHP dispatch the signal handlers registered below.
    declare (ticks=1);

    pcntl_signal(SIGABRT, 'do_exit');
    pcntl_signal(SIGHUP,  'do_exit');
    pcntl_signal(SIGQUIT, 'do_exit');
    pcntl_signal(SIGINT,  'do_exit');
    pcntl_signal(SIGTERM, 'do_exit');
}

// Instantiate Spizer engine
$spizer = new Spizer_Engine(array(
    'delay'       => $delay,
    'savecookies' => $opts->savecookies,
    'lifo'        => true
));

// Set logger (SQLite database file given by $log)
$logger = new Spizer_Logger_Sqlite(array('dbfile' => $log));
$spizer->setLogger($logger);

// Set the spider to follow links, hrefs, images and script references.
// The handler is given the host part of the start URL as its "domain".
$spizer->addHandler(new Spizer_Handler_LinkAppender(array(
    'domain' => parse_url($url, PHP_URL_HOST)
)));

// Add some handlers to be executed on 200 OK + text/html pages
$spizer->addHandler(new Spizer_Handler_StringMatch(array(
    'match'        => 'error',
    'matchcase'    => false,
    'status'       => 200,
    'content-type' => 'text/html'
)));

$spizer->addHandler(new Spizer_Handler_StringMatch(array(
    'match'        => 'warning',
    'matchcase'    => false,
    'status'       => 200,
    'content-type' => 'text/html'
)));

// Go!
$spizer->run($url);
do_exit();

// -- end here --

// Some functions
function spizer_usage() {
/**
 * Forward a data set to the engine's log() method, labelled with the
 * value of this object's $_name property.
 *
 * @param array $data
 */
protected function _log($data)
{
    $engine = $this->_engine;
    $engine->log($this->_name, $data);
}
// Accept the first argument as the start URL only when it is present and
// parse_url() can extract a host from it; otherwise print usage and stop.
$url = null;
if (isset($args[0]) && parse_url($args[0], PHP_URL_HOST)) {
    $url = $args[0];
}

if (!$url) {
    spizer_usage();
    exit(1);
}

// If we have pcntl - route the common termination signals to do_exit()
if (function_exists('pcntl_signal')) {
    declare (ticks=1);

    foreach (array(SIGABRT, SIGHUP, SIGQUIT, SIGINT, SIGTERM) as $exitSignal) {
        pcntl_signal($exitSignal, 'do_exit');
    }
}

// Instantiate Spizer engine
$engineOptions = array(
    'delay'       => $delay,
    'savecookies' => $opts->savecookies,
    'lifo'        => true
);
$spizer = new Spizer_Engine($engineOptions);

// Set logger (SQLite database file given by $log)
$logger = new Spizer_Logger_Sqlite(array('dbfile' => $log));
$spizer->setLogger($logger);

// Set the spider to follow links, hrefs, images and script references.
// The handler is given the host part of the start URL as its "domain".
$startHost = parse_url($url, PHP_URL_HOST);
$spizer->addHandler(new Spizer_Handler_LinkAppender(array('domain' => $startHost)));

// Debug handler
$spizer->addHandler(new Kumo_Handler_Debug(array('do' => true)));

/**
$spizer->addHandler(new Kumo_Handler_ScrapeAndRequestSender(array(
    'queueAdapter' => 'Array',
    'queueOptions' => array(
        'name' => 'test'),
    'expression' => '//img',
    'type' => '@src',
)));
*/