Example #1
0
 /**
  * Queue the "next page" link of an HTML document, if it has one.
  *
  * Non-HTML documents are ignored. The document URL is appended to
  * $this->targets unless it is already listed there. The pagerize helper
  * is then asked for a next link; when one is found it is queued as long
  * as $this->page_count is less than or equal to the 'max_follow' setting.
  * A falsy 'max_follow' means no limit is applied.
  *
  * @param Spizer_Document $doc Document to inspect
  * @return void
  */
 public function handle(Spizer_Document $doc)
 {
     // Only HTML documents are processed; anything else is skipped quietly
     if (!($doc instanceof Spizer_Document_Html)) {
         return;
     }

     // Record this page's URL once in the list of visited pages
     $baseUrl = $doc->getUrl();
     $alreadyListed = in_array($baseUrl, $this->targets);
     if (!$alreadyListed) {
         $this->targets[] = $baseUrl;
     }

     // Build the pagerize helper from the document's DOM and its URL
     $xml = simplexml_import_dom($doc->getDomDocument());
     $options = array('baseUrl' => $this->toUrl($doc->getUrl()));
     $pager = new Diggin_Scraper_Helper_Simplexml_Pagerize($xml, $options);

     // Nothing to do when the helper finds no next link
     $nextLink = $pager->getNextLink();
     if (!$nextLink) {
         return;
     }

     // A falsy limit disables the check entirely
     $followLimit = $this->_config['max_follow'];
     $mayFollow = !$followLimit || $this->page_count <= $followLimit;
     if ($mayFollow) {
         $this->addToQueue($nextLink, $baseUrl);
         $this->page_count++;
     }
 }
Example #2
0
 * set up next-link
 */
// Build a cache from the "pagerize.cache" configuration section and hand it
// to the pagerize helper; the cache object is also kept in $cache.
$pagerizeCacheConfig = $config->pagerize->cache;
$cache = Zend_Cache::factory(
    $pagerizeCacheConfig->frontend,
    $pagerizeCacheConfig->backend,
    $pagerizeCacheConfig->frontendOptions->toArray(),
    $pagerizeCacheConfig->backendOptions->toArray()
);
Diggin_Scraper_Helper_Simplexml_Pagerize::setCache($cache);

// Register the siteinfo array taken from the configuration
Diggin_Scraper_Helper_Simplexml_Pagerize::appendSiteInfo('mysiteinfo', $config->siteinfo->toArray());

// Request the AutoPagerize items from wedata only when
// loadSiteinfo('wedata') returns a falsy value
$siteinfo = Diggin_Scraper_Helper_Simplexml_Pagerize::loadSiteinfo('wedata');
if (!$siteinfo) {
    require_once 'Diggin/Service/Wedata.php';
    // NOTE(review): a static call form, Diggin_Service_Wedata::getItems('AutoPagerize'),
    // guarded by a Diggin_Version::compareVersion('0.7') check, was left commented
    // out here - presumably for a different library version; confirm before reviving.
    $wedata = new Diggin_Service_Wedata();
    $wedata->setDatabaseName('AutoPagerize');
    $pagerize = $wedata->getItems();
}

// $pagerize is assigned by the branch above (it may also have been set by
// code outside this section - verify); register its items under 'wedata'
if (isset($pagerize)) {
    $wedataSiteinfo = new Diggin_Siteinfo_Iterator($pagerize);
    Diggin_Scraper_Helper_Simplexml_Pagerize::appendSiteInfo('wedata', $wedataSiteinfo);
}
/**
 * Set up the logger object
 *
 * The logger type is read from the configuration file. If its name contains
 * an underscore it is used as-is (treated as a user-defined logger class);
 * otherwise the default 'Spizer_Logger_' prefix is prepended to form the
 * class name.
 */
$type = $config->logger->type;
$type = (strpos($type, '_') === false) ? 'Spizer_Logger_' . $type : $type;

// Load the class first, then construct it with the configured options
Zend_Loader::loadClass($type);
$loggerOptions = $config->logger->options->toArray();
$logger = new $type($loggerOptions);
Example #3
0
} elseif ($opts->p) {
    if ($opts->x || $opts->y || $opts->i) {
        die_single_configfile();
    }
    $config = Spizer_Config::load($opts->p, Spizer_Config::PHP, $section);
} else {
    die_single_configfile();
}
// Set up engine.
// An (array) cast on an object yields the object's internal properties, not
// its configuration entries. Other sections of this configuration are
// converted with toArray() (see the logger options in this script), so use
// toArray() when the engine section offers it and keep the plain cast as the
// fallback for arrays, scalars and a missing section.
// NOTE(review): assumes $config->engine is the same kind of config object as
// $config->logger->options - confirm against Spizer_Config::load().
$engineConfig = $config->engine;
if (is_object($engineConfig) && method_exists($engineConfig, 'toArray')) {
    $engineConfig = $engineConfig->toArray();
} else {
    $engineConfig = (array) $engineConfig;
}
$engine = new Spizer_Engine($engineConfig);
/**
 * Set up next-link handling for the pagerize helper
 */
// Build the helper's cache from the "pagerize.cache" configuration section
$pagerizeCacheConfig = $config->pagerize->cache;
Diggin_Scraper_Helper_Simplexml_Pagerize::setCache(
    Zend_Cache::factory(
        $pagerizeCacheConfig->frontend,
        $pagerizeCacheConfig->backend,
        $pagerizeCacheConfig->frontendOptions->toArray(),
        $pagerizeCacheConfig->backendOptions->toArray()
    )
);

// Register the siteinfo array: each entry pairs a URL pattern with the XPath
// expression used to find the next link on matching pages
$mySiteinfo = array(
    array(
        'url'      => '^http://d.hatena.ne.jp/.+',
        'nextLink' => '//a[@class="prev" and last()]',
    ),
    array(
        'url'      => '^http://framework.zend.com/code/changelog/Standard_Library/',
        'nextLink' => '//div[@class="changesetList"][last()]/a',
    ),
    array(
        'url'      => '^http://musicrider.com/.*',
        'nextLink' => '//a',
    ),
);
Diggin_Scraper_Helper_Simplexml_Pagerize::appendSiteInfo('mysiteinfo', $mySiteinfo);
/**
 * Set up the logger object
 *
 * The logger type is read from the configuration file. If its name contains
 * an underscore it is used as-is (treated as a user-defined logger class);
 * otherwise the default 'Spizer_Logger_' prefix is prepended to form the
 * class name.
 */
$type = $config->logger->type;
$type = (strpos($type, '_') === false) ? 'Spizer_Logger_' . $type : $type;

// Load the class first, then construct it with the configured options
Zend_Loader::loadClass($type);
$loggerOptions = $config->logger->options->toArray();
$logger = new $type($loggerOptions);

// Attach the logger to the engine
$engine->setLogger($logger);