/**
 * Write the body of a fetched document to a file in the configured
 * save directory.
 *
 * The file name is the raw-URL-encoded document URL. No content-type
 * check is made in this method: every document handed in is written.
 * After writing, the document URL is passed to addHaveFiles().
 *
 * @param Spizer_Document $document
 */
public function handle(Spizer_Document $document)
{
    $url = $document->getUrl();

    // One file per URL; encoding the URL keeps it usable as a single path segment
    $target = $this->_config['save_dir']
            . DIRECTORY_SEPARATOR
            . rawurlencode($url);

    $body = $document->getBody();
    file_put_contents($target, $body);

    $this->addHaveFiles($url);
}
/**
 * Save image documents to the configured save directory.
 *
 * Documents whose Content-Type header does not start with "image/" are
 * ignored. For images, the body is written to a file named after the
 * raw-URL-encoded document URL, and on a successful write the URL is
 * passed to addHaveFiles().
 *
 * @param Spizer_Document $document
 */
public function handle(Spizer_Document $document)
{
    // Only image responses are stored. The pattern is anchored so that a
    // header which merely contains "image/" somewhere (for example inside a
    // parameter of another media type) is not treated as an image. A missing
    // or non-string header is skipped instead of being fed to preg_match().
    $contentType = $document->getHeader('content-type');
    if (!is_string($contentType) || !preg_match('#^\s*image/#i', $contentType)) {
        return;
    }

    $url      = $document->getUrl();
    $filepath = $this->_config['save_dir'] . DIRECTORY_SEPARATOR . rawurlencode($url);

    // FILE_BINARY is intentionally not passed: it never had any effect and
    // is deprecated as of PHP 8.1.
    // file_put_contents() raises its own warning on failure; in that case
    // the URL must not be recorded as a file we have.
    if (file_put_contents($filepath, $document->getBody()) === false) {
        return;
    }

    $this->addHaveFiles($url);
}
/**
 * Scrape an HTML document and pass every scraped 'kumo' entry to send().
 *
 * Non-HTML documents are ignored. A scraper failure is swallowed unless
 * the 'throwIfNotfound' option is enabled, in which case it is rethrown.
 *
 * @param Spizer_Document $doc
 * @throws Diggin_Scraper_Exception if scraping fails and 'throwIfNotfound' is enabled
 */
public function handle(Spizer_Document $doc)
{
    if (!$doc instanceof Spizer_Document_Html) {
        return;
    }

    $headers = $doc->getAllHeaders();

    // The response body is already decoded, so these headers no longer apply
    unset($headers['transfer-encoding'], $headers['content-encoding']);

    try {
        $response = new Zend_Http_Response($doc->getStatus(), $headers, $doc->getBody());
        $results  = $this->scraper->scrape($response, $doc->getUrl());
    } catch (Diggin_Scraper_Exception $dse) {
        // Test the option's value, not just its presence, so that
        // 'throwIfNotfound' => false really disables rethrowing.
        if (!empty($this->_config['throwIfNotfound'])) {
            throw $dse;
        }

        return;
    }

    // Nothing usable was scraped
    if (!isset($results['kumo'])) {
        return;
    }

    $found = $results['kumo'];

    if (!empty($this->_config['debug'])) {
        echo 'count scrape results ' . count($found) . PHP_EOL;
    }

    foreach ($found as $src) {
        $this->send($src);
    }
}
/**
 * Record the URL of an HTML document and queue its "next page" link.
 *
 * The document URL is appended to $this->targets when not already present.
 * The next link is looked up through Diggin_Scraper_Helper_Simplexml_Pagerize
 * and queued unless the 'max_follow' option is set and the page counter
 * has already exceeded it.
 *
 * @param Spizer_Document $doc
 */
public function handle(Spizer_Document $doc)
{
    // Non-HTML documents are skipped without any notice
    if (!($doc instanceof Spizer_Document_Html)) {
        return;
    }

    // Remember this page's URL, once
    $pageUrl = $doc->getUrl();
    if (in_array($pageUrl, $this->targets) === false) {
        $this->targets[] = $pageUrl;
    }

    // Look up the link to the following page
    $xml      = simplexml_import_dom($doc->getDomDocument());
    $options  = array('baseUrl' => $this->toUrl($doc->getUrl()));
    $pagerize = new Diggin_Scraper_Helper_Simplexml_Pagerize($xml, $options);

    $nextLink = $pagerize->getNextLink();
    if (!$nextLink) {
        return;
    }

    // An empty 'max_follow' means there is no limit
    $limit = $this->_config['max_follow'];
    if ($limit && $this->page_count > $limit) {
        return;
    }

    $this->addToQueue($nextLink, $pageUrl);
    $this->page_count++;
}
/**
 * Scrape an HTML document and send one Kumo_Request per unique, filtered
 * 'kumo' result.
 *
 * Each request gets the document URL as its referrer and a 'Referer'
 * header built by toRefererUrl(). Non-HTML documents are ignored.
 *
 * @param Spizer_Document $doc
 */
public function handle(Spizer_Document $doc)
{
    if (!($doc instanceof Spizer_Document_Html)) {
        return;
    }

    // The body is already decoded, so drop the encoding-related headers
    $headers = $doc->getAllHeaders();
    unset($headers['transfer-encoding'], $headers['content-encoding']);

    $response = new Zend_Http_Response($doc->getStatus(), $headers, $doc->getBody());
    $results  = $this->scraper->scrape($response, $doc->getUrl());

    // De-duplicate first, then let filter() decide what is kept
    foreach ($this->filter(array_unique($results['kumo'])) as $src) {
        $request = new Kumo_Request($src);
        $request->setReferrer($doc->getUrl());
        $request->setHeader('Referer', $this->toRefererUrl($doc->getUrl()));

        $this->send($request);
    }
}
/**
 * Scrape an HTML document and hand its 'kumo' results to _addQueue().
 *
 * Call-once behaviour: when $this->_callonce is strictly true, this call
 * switches it to 1 and carries on; when it already compares equal to 1,
 * the call returns immediately. The switch happens before the HTML check,
 * so a non-HTML document also uses up the single call.
 *
 * @param Spizer_Document $doc
 */
public function handle(Spizer_Document $doc)
{
    if (true === $this->_callonce) {
        // First invocation in call-once mode: mark it and proceed
        $this->_callonce = 1;
    } elseif (1 == $this->_callonce) {
        // Already ran once
        return;
    }

    if (!($doc instanceof Spizer_Document_Html)) {
        return;
    }

    // The body is already decoded, so drop the encoding-related headers
    $headers = $doc->getAllHeaders();
    unset($headers['transfer-encoding'], $headers['content-encoding']);

    $response = new Zend_Http_Response($doc->getStatus(), $headers, $doc->getBody());
    $results  = $this->scraper->scrape($response, $doc->getUrl());

    $this->_addQueue($results['kumo']);
}
/**
 * Handle document - log its URL, status code and body length
 *
 * The log level is chosen from the status class: 1xx/2xx => INFO,
 * 3xx => NOTICE, 4xx => WARN, 5xx => ERR. Any other status is logged
 * as WARN.
 *
 * @param Spizer_Document $document
 */
public function handle(Spizer_Document $document)
{
    $status = $document->getStatus();
    $string = "{$document->getUrl()} {$status} " . strlen($document->getBody());

    // Decide log level according to status code.
    // floor() rather than round(): rounding would push e.g. 451 into the
    // 5xx class and log a client error at ERR level.
    switch ((int) floor($status / 100)) {
        case 1:
        case 2:
            $level = Zend_Log::INFO;
            break;

        case 3:
            $level = Zend_Log::NOTICE;
            break;

        case 4:
            $level = Zend_Log::WARN;
            break;

        case 5:
            $level = Zend_Log::ERR;
            break;

        default:
            // Status outside 1xx-5xx: still log it, with a defined level
            $level = Zend_Log::WARN;
            break;
    }

    $this->_logger->log($string, $level);
}
/**
 * Handle document - fetch links out of the document and add them to the
 * queue
 *
 * The document URL is always recorded in $this->_targets; link extraction
 * only happens for Spizer_Document_Html instances. Which link sources are
 * followed is controlled by the 'followhref', 'followimg', 'followlink',
 * 'followscript' and 'followframes' options.
 *
 * @param Spizer_Document $doc
 */
public function handle(Spizer_Document $doc)
{
    // If need, set the match domain according to the first URL
    if (!isset($this->_config['domain']) && $this->_config['samedomain']) {
        $this->_config['domain'] = $this->_engine->getBaseUri()->getHost();
    }

    // Add document URL to the list of visited pages
    $baseUrl = (string) $doc->getUrl();
    if (!in_array($baseUrl, $this->_targets)) {
        $this->_targets[] = $baseUrl;
    }

    // Silently skip all non-HTML documents
    if (!$doc instanceof Spizer_Document_Html) {
        return;
    }

    // Fetch links out of the document
    $links = array();
    if ($this->_config['followhref']) {
        $links = array_merge($links, $doc->getLinks());
    }
    if ($this->_config['followimg']) {
        $links = array_merge($links, $doc->getImages());
    }
    if ($this->_config['followlink']) {
        $links = array_merge($links, $doc->getHeaderLinks());
    }
    if ($this->_config['followscript']) {
        $links = array_merge($links, $doc->getScriptLinks());
    }
    if ($this->_config['followframes']) {
        $links = array_merge($links, $doc->getFrameLinks());
    }

    // Host suffix pattern, built once for all links. With no domain
    // configured the pattern is '/$/', which accepts every host. The
    // delimiter is passed to preg_quote() so a '/' in the configured value
    // cannot break the pattern.
    $domain      = isset($this->_config['domain']) ? $this->_config['domain'] : '';
    $hostPattern = '/' . preg_quote($domain, '/') . '$/';

    // Iterate over all document links
    foreach ($links as $link) {
        // Try to parse URL - if we fail, skip this link (should not happen normally)
        if (!($parts = @parse_url($link))) {
            continue;
        }

        // Skip non-http schemes
        if (isset($parts['scheme']) && $parts['scheme'] != 'http' && $parts['scheme'] != 'https') {
            continue;
        }

        if (isset($parts['host'])) {
            // Full URI - queue it as-is when the host matches the domain
            if (preg_match($hostPattern, $parts['host'])) {
                $this->_addToQueue($link, $baseUrl);
            }
        } elseif (isset($parts['path'])) {
            // Partial URI - resolve it against a copy of the document URL
            try {
                $linkUri = clone $doc->getUrl();
                $linkUri->setQuery(isset($parts['query']) ? $parts['query'] : null);

                // Apply the link's own fragment (or clear it); otherwise the
                // fragment of the parent document URL would be carried over.
                $linkUri->setFragment(isset($parts['fragment']) ? $parts['fragment'] : null);

                if (substr_compare($parts['path'], '/', 0, 1) == 0) {
                    // Full absolute path
                    $linkUri->setPath($parts['path']);
                } else {
                    // Relative path - append to the directory of the document path
                    $basePath = $doc->getUrl()->getPath();
                    $pos = strrpos($basePath, '/');
                    if ($pos === false) {
                        $linkUri->setPath('/' . $parts['path']);
                    } else {
                        $linkUri->setPath(substr($basePath, 0, $pos + 1) . $parts['path']);
                    }
                }

                $this->_addToQueue($linkUri, $baseUrl);

            // If any of the URL parts is invalid, an exception will be caught here
            } catch (Zend_Uri_Exception $e) {
                $this->_log(array(
                    'link'    => $link,
                    'message' => 'Unable to parse link URL: ' . $e->getMessage()
                ));
            }
        }
    }
}
/**
 * Print the URL of the handled document, followed by a line break.
 *
 * @param Spizer_Document $doc
 */
public function handle(Spizer_Document $doc)
{
    $url = (string) $doc->getUrl();

    echo $url . PHP_EOL;
}