/**
 * Handle a fetched document: record its URL and queue the page's "next" link.
 *
 * Documents that are not Spizer_Document_Html instances are ignored. For HTML
 * documents the URL is appended to $this->targets (once), the Pagerize helper
 * is asked for a next link, and that link is queued unless the configured
 * 'max_follow' limit stops it. A falsy 'max_follow' disables the limit.
 *
 * @param Spizer_Document $doc Document to inspect
 * @return void
 */
public function handle(Spizer_Document $doc)
{
    // Silently skip all non-HTML documents
    if (!($doc instanceof Spizer_Document_Html)) {
        return;
    }

    // Remember this page's URL in the list of visited pages (no duplicates)
    $pageUrl = $doc->getUrl();
    if (!in_array($pageUrl, $this->targets)) {
        $this->targets[] = $pageUrl;
    }

    // Build the Pagerize helper from the document's DOM, resolving links
    // against the document URL
    $xml       = simplexml_import_dom($doc->getDomDocument());
    $options   = array('baseUrl' => $this->toUrl($doc->getUrl()));
    $pagerizer = new Diggin_Scraper_Helper_Simplexml_Pagerize($xml, $options);

    $next = $pagerizer->getNextLink();
    if (!$next) {
        // No next link found: nothing to queue
        return;
    }

    // NOTE(review): with "<=", whether exactly max_follow links or one more
    // get queued depends on the initial value of page_count, which is not
    // visible here - confirm.
    $followLimit = $this->_config['max_follow'];
    $withinLimit = !$followLimit || $this->page_count <= $followLimit;

    if ($withinLimit) {
        $this->addToQueue($next, $pageUrl);
        ++$this->page_count;
    }
}
* set up next-link
 */

// Build the cache described by the "pagerize.cache" configuration section and
// hand it to the Pagerize helper.
$pagerizeCacheConfig = $config->pagerize->cache;
$cache = Zend_Cache::factory(
    $pagerizeCacheConfig->frontend,
    $pagerizeCacheConfig->backend,
    $pagerizeCacheConfig->frontendOptions->toArray(),
    $pagerizeCacheConfig->backendOptions->toArray()
);
Diggin_Scraper_Helper_Simplexml_Pagerize::setCache($cache);

// Set the siteinfo array taken from the configuration
Diggin_Scraper_Helper_Simplexml_Pagerize::appendSiteInfo(
    'mysiteinfo',
    $config->siteinfo->toArray()
);

// Request wedata: the AutoPagerize items are fetched from the wedata service
// only when loadSiteinfo('wedata') returns a falsy value (presumably meaning
// nothing usable is stored yet - confirm).
$siteinfo = Diggin_Scraper_Helper_Simplexml_Pagerize::loadSiteinfo('wedata');
if (!$siteinfo) {
    require_once 'Diggin/Service/Wedata.php';

    // Alternative call form left disabled by the original author:
    //if (Diggin_Version::compareVersion('0.7'))
    //$pagerize = Diggin_Service_Wedata::getItems('AutoPagerize');
    $wedata = new Diggin_Service_Wedata();
    $wedata->setDatabaseName('AutoPagerize');
    $pagerize = $wedata->getItems();
}

// Register the wedata items only if $pagerize has been set
if (isset($pagerize)) {
    Diggin_Scraper_Helper_Simplexml_Pagerize::appendSiteInfo(
        'wedata',
        new Diggin_Siteinfo_Iterator($pagerize)
    );
}

/**
 * Set up the logger object
 *
 * The logger type is defined in the configuration file - if it contains an
 * underscore in its name, it is considered to be a user-defined logger - if
 * no underscore is found, the default 'Spizer_Logger_' prefix is added to
 * the class name.
 */
$type = $config->logger->type;
if (false === strpos($type, '_')) {
    $type = 'Spizer_Logger_' . $type;
}

Zend_Loader::loadClass($type);
$logger = new $type($config->logger->options->toArray());
} elseif ($opts->p) {
    // -p loads a PHP-format configuration file; combining it with -x, -y or
    // -i is rejected through die_single_configfile().
    if ($opts->x || $opts->y || $opts->i) {
        die_single_configfile();
    }

    $config = Spizer_Config::load($opts->p, Spizer_Config::PHP, $section);
} else {
    // None of the configuration-file branches matched
    die_single_configfile();
}

// Set up engine
// NOTE(review): other sections in this block are converted with ->toArray();
// if $config->engine is a Zend_Config object, an (array) cast will not
// produce the same plain settings array - confirm.
$engine = new Spizer_Engine((array) $config->engine);

/**
 * set up next-link
 */
$pagerizeCacheConfig = $config->pagerize->cache;
$pagerizeCache = Zend_Cache::factory(
    $pagerizeCacheConfig->frontend,
    $pagerizeCacheConfig->backend,
    $pagerizeCacheConfig->frontendOptions->toArray(),
    $pagerizeCacheConfig->backendOptions->toArray()
);
Diggin_Scraper_Helper_Simplexml_Pagerize::setCache($pagerizeCache);

// Set the siteinfo array: each entry pairs a URL pattern with the XPath
// expression used to locate the next link on matching pages.
// NOTE(review): in the first entry, "and last()" inside a single predicate is
// evaluated as a boolean, which differs from a positional "[last()]" filter -
// confirm the expression is intended.
$mySiteinfo = array(
    array(
        'url'      => '^http://d.hatena.ne.jp/.+',
        'nextLink' => '//a[@class="prev" and last()]',
    ),
    array(
        'url'      => '^http://framework.zend.com/code/changelog/Standard_Library/',
        'nextLink' => '//div[@class="changesetList"][last()]/a',
    ),
    array(
        'url'      => '^http://musicrider.com/.*',
        'nextLink' => '//a',
    ),
);
Diggin_Scraper_Helper_Simplexml_Pagerize::appendSiteInfo('mysiteinfo', $mySiteinfo);

/**
 * Set up the logger object
 *
 * The logger type is defined in the configuration file - if it contains an
 * underscore in its name, it is considered to be a user-defined logger - if
 * no underscore is found, the default 'Spizer_Logger_' prefix is added to
 * the class name.
 */
$type = $config->logger->type;
if (false === strpos($type, '_')) {
    $type = 'Spizer_Logger_' . $type;
}

Zend_Loader::loadClass($type);
$logger = new $type($config->logger->options->toArray());
$engine->setLogger($logger);