/**
 * Scrape an HTML document and pass every scraped 'kumo' entry to send().
 *
 * Non-HTML documents are ignored. A Diggin_Scraper_Exception raised while
 * scraping is swallowed unless the 'throwIfNotfound' config key is present.
 *
 * @param  Spizer_Document $doc
 * @return void
 * @throws Diggin_Scraper_Exception when scraping fails and 'throwIfNotfound' is set
 */
public function handle(Spizer_Document $doc)
{
    if (!$doc instanceof Spizer_Document_Html) {
        return;
    }

    // The response body is already decoded, so the encoding headers are
    // removed before the response object is rebuilt.
    $headers = $doc->getAllHeaders();
    unset($headers['transfer-encoding'], $headers['content-encoding']);

    try {
        $response = new Zend_Http_Response($doc->getStatus(), $headers, $doc->getBody());
        $results  = $this->scraper->scrape($response, $doc->getUrl());
    } catch (Diggin_Scraper_Exception $dse) {
        // NOTE(review): the mere presence of the key enables re-throwing, even
        // when it is set to false - confirm that this is intended.
        if (isset($this->_config['throwIfNotfound'])) {
            throw $dse;
        }

        return;
    }

    if (!isset($results)) {
        return;
    }

    // Guard against a result set without a 'kumo' entry instead of raising an
    // undefined-index notice and iterating over null.
    $found = isset($results['kumo']) ? $results['kumo'] : array();

    // 'debug' is optional; a missing key simply means "no debug output".
    if (!empty($this->_config['debug'])) {
        echo 'count scrape results ' . count($found) . PHP_EOL;
    }

    foreach ($found as $src) {
        $this->send($src);
    }
}
/**
 * Save the document body to a file inside the configured 'save_dir'.
 *
 * The file name is the raw-URL-encoded document URL. No content-type check is
 * performed here: every document passed in is written. The URL is registered
 * through addHaveFiles() only when the file was actually written.
 *
 * @param  Spizer_Document $document
 * @return void
 */
public function handle(Spizer_Document $document)
{
    $url      = $document->getUrl();
    $filepath = $this->_config['save_dir'] . DIRECTORY_SEPARATOR . rawurlencode($url);

    // file_put_contents() returns false on failure (and emits its own
    // warning); do not record a file that does not exist on disk.
    if (file_put_contents($filepath, $document->getBody()) === false) {
        return;
    }

    $this->addHaveFiles($url);
}
/**
 * Re-encode the document body and log a 'RegexMatch' entry when the result
 * matches the configured 'match' expression.
 *
 * The body is passed through Diggin_Http_Response_Encoding::encode() together
 * with the content-type header (per the original author's note this converts
 * it to UTF-8) before the expression is applied.
 *
 * @param  Spizer_Document $document
 * @return void
 */
public function handle(Spizer_Document $document)
{
    $encodedBody = Diggin_Http_Response_Encoding::encode(
        $document->getBody(),
        $document->getHeader('content-type')
    );
    $pattern = $this->_config['match'];

    if (!preg_match($pattern, $encodedBody, $hit, PREG_OFFSET_CAPTURE)) {
        return;
    }

    // With PREG_OFFSET_CAPTURE each entry is a (text, byte offset) pair.
    list($matchedText, $matchedOffset) = $hit[0];

    $this->engine->log('RegexMatch', array(
        'message' => 'Document body matched lookup expression',
        'regex'   => $pattern,
        'match'   => $matchedText,
        'offset'  => $matchedOffset,
    ));
}
/**
 * Look for the configured 'match' string in the document body and log the
 * position of the first occurrence.
 *
 * The search is case-sensitive when the 'matchcase' option is truthy and
 * case-insensitive otherwise.
 *
 * @param Spizer_Document $document
 * @see   Spizer_Handler_Abstract::handle()
 */
public function handle(Spizer_Document $document)
{
    $needle   = $this->_config['match'];
    $haystack = $document->getBody();

    if ($this->_config['matchcase']) {
        $offset = strpos($haystack, $needle);
    } else {
        $offset = stripos($haystack, $needle);
    }

    // Offset 0 is a valid hit, so only a strict false means "not found".
    if ($offset === false) {
        return;
    }

    $this->_log(array(
        'message' => 'Document body matched lookup string',
        'needle'  => $needle,
        'offset'  => $offset,
    ));
}
/**
 * Save image documents to a file inside the configured 'save_dir'.
 *
 * Documents whose content-type header does not contain "image/" are skipped.
 * The file name is the raw-URL-encoded document URL, and the URL is
 * registered through addHaveFiles() only when the file was actually written.
 *
 * @param  Spizer_Document $document
 * @return void
 */
public function handle(Spizer_Document $document)
{
    // Only image documents are stored. A missing or non-string header is
    // treated as "not an image" instead of being handed to preg_match().
    $contentType = $document->getHeader('content-type');
    if (!is_string($contentType) || !preg_match('#image/.*#i', $contentType)) {
        return;
    }

    $url      = $document->getUrl();
    $filepath = $this->_config['save_dir'] . DIRECTORY_SEPARATOR . rawurlencode($url);

    // The FILE_BINARY flag was dropped: it has no effect on how the data is
    // written. A failed write must not be recorded as a stored file.
    if (file_put_contents($filepath, $document->getBody()) === false) {
        return;
    }

    $this->addHaveFiles($url);
}
/**
 * Log every image of an HTML document whose file name contains an underscore.
 *
 * Documents that are not Spizer_Document_Html instances are ignored.
 *
 * @param Spizer_Document $document
 * @see   Spizer_Handler_Abstract::handle()
 */
public function handle(Spizer_Document $document)
{
    if (!($document instanceof Spizer_Document_Html)) {
        return;
    }

    foreach ($document->getImages() as $source) {
        // Only the trailing name component of the image source is inspected.
        $fileName = basename($source);

        if (strpos($fileName, '_') === false) {
            continue;
        }

        $this->engine->log('ZendImages', array(
            'message' => 'Image contains underscore in it\'s file name',
            'src'     => $fileName,
        ));
    }
}
/**
 * Run ::handle() on the document when it passes the configured filters.
 *
 * Two optional filters are read from the configuration: 'status' and
 * 'content-type'. A falsy filter is disabled. An array filter passes when it
 * contains the document's value; any other filter passes when it loosely
 * equals the document's value. Every enabled filter must pass.
 *
 * @param Spizer_Document $document
 */
public function call(Spizer_Document $document)
{
    // Actual document values, keyed by the config option they are checked against.
    $actualValues = array(
        'status'       => $document->getStatus(),
        'content-type' => $document->getHeader('content-type'),
    );

    $shouldCall = true;

    foreach ($actualValues as $option => $actual) {
        $expected = $this->_config[$option];

        if (!$expected) {
            // Filter not configured
            continue;
        }

        // Comparisons are intentionally loose (no strict flag, ==).
        $passes = is_array($expected)
            ? in_array($actual, $expected)
            : ($expected == $actual);

        if (!$passes) {
            $shouldCall = false;
        }
    }

    if ($shouldCall) {
        $this->handle($document);
    }
}
/**
 * Log an entry when the configured XPath 'query' matches nothing in the
 * document.
 *
 * Documents that are not Spizer_Document_Xml instances are ignored. The log
 * entry carries the query and, when configured, the 'message' option. A query
 * that cannot be evaluated is logged with its own message.
 *
 * @param Spizer_Document $document
 * @see   Spizer_Handler_Abstract::handle()
 */
public function handle(Spizer_Document $document)
{
    // Silently ignore non-XML documents
    if (!$document instanceof Spizer_Document_Xml) {
        return;
    }

    $query = $this->_config['query'];
    $tags  = $document->getXpath()->query($query);

    // DOMXPath::query() returns false for a malformed expression; reading
    // ->length on it would be an error, so report the bad query explicitly.
    if (!$tags instanceof DOMNodeList) {
        $this->_log(array(
            'query'   => $query,
            'message' => 'XPath query could not be evaluated',
        ));

        return;
    }

    if ($tags->length == 0) {
        $data = array('query' => $query);

        if (isset($this->_config['message'])) {
            $data['message'] = $this->_config['message'];
        }

        $this->_log($data);
    }
}
/**
 * Build the document and parse its body into a DOMDocument.
 *
 * Parser diagnostics for invalid markup are collected internally and
 * discarded instead of being raised as PHP warnings. An empty body leaves the
 * DOM document empty.
 *
 * @param mixed  $url     passed through to the parent constructor
 * @param mixed  $status  passed through to the parent constructor
 * @param array  $headers passed through to the parent constructor
 * @param string $body    HTML source to parse
 */
protected function __construct($url, $status, array $headers, $body)
{
    parent::__construct($url, $status, $headers, $body);

    $this->_domDocument = new DOMDocument();
    $this->_domDocument->preserveWhiteSpace = true;

    // loadHTML() rejects an empty source (an exception on PHP 8 that the
    // silence operator cannot suppress), so there is nothing to parse here.
    if ($body === null || $body === '') {
        return;
    }

    // Invalid documents produce a lot of libxml warnings; collect them
    // internally instead of silencing the call with "@", then restore the
    // previous error-handling mode.
    $previousMode = libxml_use_internal_errors(true);
    $this->_domDocument->loadHTML($body);
    libxml_clear_errors();
    libxml_use_internal_errors($previousMode);
}
/**
 * Queue the "next page" link of an HTML document.
 *
 * The document URL is added to the list of visited targets, then the next
 * link is looked up with Diggin_Scraper_Helper_Simplexml_Pagerize. The link is
 * queued unless the 'max_follow' option is set and the page counter has
 * already exceeded it.
 *
 * @param  Spizer_Document $doc
 * @return void
 */
public function handle(Spizer_Document $doc)
{
    // Non-HTML documents are skipped without notice
    if (!($doc instanceof Spizer_Document_Html)) {
        return;
    }

    // Remember this URL as visited
    $baseUrl = $doc->getUrl();
    if (!in_array($baseUrl, $this->targets)) {
        $this->targets[] = $baseUrl;
    }

    $xml      = simplexml_import_dom($doc->getDomDocument());
    $options  = array('baseUrl' => $this->toUrl($doc->getUrl()));
    $pagerize = new Diggin_Scraper_Helper_Simplexml_Pagerize($xml, $options);

    $nextLink = $pagerize->getNextLink();
    if (!$nextLink) {
        return;
    }

    // A falsy 'max_follow' means "no limit"
    $limit = $this->_config['max_follow'];
    if ($limit && !($this->page_count <= $limit)) {
        return;
    }

    $this->addToQueue($nextLink, $baseUrl);
    ++$this->page_count;
}
/**
 * Scrape an HTML document and queue its 'kumo' results.
 *
 * When $this->_callonce is exactly true the handler runs and the flag is
 * switched to 1; once the flag loosely equals 1 every later call returns
 * immediately.
 * NOTE(review): the flag is consumed before the HTML check, so a non-HTML
 * first document also uses up the single run - confirm this is intended.
 *
 * @param  Spizer_Document $doc
 * @return void
 */
public function handle(Spizer_Document $doc)
{
    if (true === $this->_callonce) {
        // First call: mark the handler as used and carry on
        $this->_callonce = 1;
    } elseif ($this->_callonce && 1 == $this->_callonce) {
        // Already ran once
        return;
    }

    if (!($doc instanceof Spizer_Document_Html)) {
        return;
    }

    // The response is already decoded, so the encoding headers are dropped
    $headers = $doc->getAllHeaders();
    foreach (array('transfer-encoding', 'content-encoding') as $headerName) {
        unset($headers[$headerName]);
    }

    $response = new Zend_Http_Response($doc->getStatus(), $headers, $doc->getBody());
    $scraped  = $this->scraper->scrape($response, $doc->getUrl());

    $this->_addQueue($scraped['kumo']);
}
/**
 * Log one line per document: "<url> <status> <body length in bytes>".
 *
 * The log level follows the status class: 1xx/2xx INFO, 3xx NOTICE,
 * 4xx WARN, 5xx ERR. Any other status is logged as WARN.
 *
 * @param Spizer_Document $document
 */
public function handle(Spizer_Document $document)
{
    $status = $document->getStatus();
    $string = "{$document->getUrl()} {$status} " . strlen($document->getBody());

    // Use the hundreds digit, not a rounded quotient: round() would push
    // e.g. 451 into the 5xx class and 250 into the 3xx class.
    switch ((int) floor($status / 100)) {
        case 1:
        case 2:
            $level = Zend_Log::INFO;
            break;

        case 3:
            $level = Zend_Log::NOTICE;
            break;

        case 4:
            $level = Zend_Log::WARN;
            break;

        case 5:
            $level = Zend_Log::ERR;
            break;

        default:
            // Status outside 1xx-5xx: still log it rather than passing an
            // undefined level to the logger.
            $level = Zend_Log::WARN;
            break;
    }

    $this->_logger->log($string, $level);
}
/**
 * Log one entry for every node matched by the configured XPath 'query'.
 *
 * Documents that are not Spizer_Document_Xml instances are ignored. Each entry
 * carries the query, the optional 'message' option and, when the optional
 * 'captureValue' expression yields a truthy value for the matched node, that
 * value as a string.
 *
 * @param Spizer_Document $document
 * @see   Spizer_Handler_Abstract::handle()
 */
public function handle(Spizer_Document $document)
{
    // Silently ignore non-XML documents
    if (!$document instanceof Spizer_Document_Xml) {
        return;
    }

    $query = $this->_config['query'];
    $xpath = $document->getXpath();
    $tags  = $xpath->query($query);

    // query() returns false for an invalid expression
    if (!$tags instanceof DOMNodeList) {
        return;
    }

    foreach ($tags as $tag) {
        $data = array('query' => $query);

        if (isset($this->_config['message'])) {
            $data['message'] = $this->_config['message'];
        }

        if (isset($this->_config['captureValue'])) {
            $value = $xpath->evaluate($this->_config['captureValue'], $tag);

            // evaluate() returns a node list for location paths such as
            // "@href"; a node list cannot be cast to string, so take the
            // value of its first node (what XPath string() would yield).
            if ($value instanceof DOMNodeList) {
                $value = $value->length > 0 ? $value->item(0)->nodeValue : '';
            }

            if ($value) {
                $data['captureValue'] = (string) $value;
            }
        }

        $this->_log($data);
    }
}
/**
 * Scrape an HTML document and send a Kumo_Request for each filtered,
 * de-duplicated 'kumo' result.
 *
 * Every request gets the document URL as referrer and a 'Referer' header
 * built by toRefererUrl().
 *
 * @param  Spizer_Document $doc
 * @return void
 */
public function handle(Spizer_Document $doc)
{
    if (!($doc instanceof Spizer_Document_Html)) {
        return;
    }

    // The response is already decoded, so the encoding headers are dropped
    $headers = $doc->getAllHeaders();
    foreach (array('transfer-encoding', 'content-encoding') as $headerName) {
        unset($headers[$headerName]);
    }

    $response = new Zend_Http_Response($doc->getStatus(), $headers, $doc->getBody());
    $scraped  = $this->scraper->scrape($response, $doc->getUrl());

    $uniqueSources = array_unique($scraped['kumo']);

    foreach ($this->filter($uniqueSources) as $source) {
        $outgoing = new Kumo_Request($source);
        $outgoing->setReferrer($doc->getUrl());
        // The Referer header is always set, regardless of configuration
        $outgoing->setHeader('Referer', $this->toRefererUrl($doc->getUrl()));

        $this->send($outgoing);
    }
}
/**
 * Handle document - fetch links out of the document and add them to the
 * queue
 *
 * The document URL is always recorded as a visited target. Links are only
 * extracted from Spizer_Document_Html instances; which kinds of link are
 * followed is controlled by the 'followhref', 'followimg', 'followlink',
 * 'followscript' and 'followframes' options. Links with a host are queued
 * when the host ends with the configured 'domain'; host-less links are
 * resolved against the document URL.
 *
 * @param Spizer_Document $doc
 */
public function handle(Spizer_Document $doc)
{
    // If needed, set the match domain according to the first URL
    if (!isset($this->_config['domain']) && $this->_config['samedomain']) {
        $this->_config['domain'] = $this->_engine->getBaseUri()->getHost();
    }

    // Add document URL to the list of visited pages
    $baseUrl = (string) $doc->getUrl();
    if (!in_array($baseUrl, $this->_targets)) {
        $this->_targets[] = $baseUrl;
    }

    // Silently skip all non-HTML documents
    if (!$doc instanceof Spizer_Document_Html) {
        return;
    }

    // Fetch links out of the document
    $links = array();
    if ($this->_config['followhref']) {
        $links = array_merge($links, $doc->getLinks());
    }
    if ($this->_config['followimg']) {
        $links = array_merge($links, $doc->getImages());
    }
    if ($this->_config['followlink']) {
        $links = array_merge($links, $doc->getHeaderLinks());
    }
    if ($this->_config['followscript']) {
        $links = array_merge($links, $doc->getScriptLinks());
    }
    if ($this->_config['followframes']) {
        $links = array_merge($links, $doc->getFrameLinks());
    }

    // The host pattern does not change per link, so build it once. The
    // delimiter is passed to preg_quote() so that a "/" in the configured
    // value cannot break the expression. With no domain configured the
    // pattern matches every host, as before.
    $domain        = isset($this->_config['domain']) ? $this->_config['domain'] : '';
    $domainPattern = '/' . preg_quote($domain, '/') . '$/';

    // Iterate over all document links
    foreach ($links as $link) {
        // Try to parse URL - if we fail, skip this link (should not happen normally)
        if (!($parts = @parse_url($link))) {
            continue;
        }

        // Skip non-http schemes
        if (isset($parts['scheme'])
            && ($parts['scheme'] != 'http' && $parts['scheme'] != 'https')
        ) {
            continue;
        }

        // Full URI
        if (isset($parts['host'])) {
            if (preg_match($domainPattern, $parts['host'])) {
                $this->_addToQueue($link, $baseUrl);
            }

        // Partial URI
        } elseif (isset($parts['path'])) {
            try {
                $linkUri = clone $doc->getUrl();
                $linkUri->setQuery(isset($parts['query']) ? $parts['query'] : null);
                // Replace the base document's fragment with the link's own
                // one (previously a getter was called here, so the base
                // fragment leaked into every resolved link).
                $linkUri->setFragment(isset($parts['fragment']) ? $parts['fragment'] : '');

                // Full absolute path
                if (substr_compare($parts['path'], '/', 0, 1) == 0) {
                    $linkUri->setPath($parts['path']);

                // Relative path
                } else {
                    $basePath = $doc->getUrl()->getPath();
                    $pos      = strrpos($basePath, '/');
                    if ($pos === false) {
                        $linkUri->setPath('/' . $parts['path']);
                    } else {
                        // Keep the base directory, including its trailing slash
                        $linkUri->setPath(substr($basePath, 0, $pos + 1) . $parts['path']);
                    }
                }

                $this->_addToQueue($linkUri, $baseUrl);

            // If any of the URL parts is invalid, an exception will be caught here
            } catch (Zend_Uri_Exception $e) {
                $this->_log(array(
                    'link'    => $link,
                    'message' => 'Unable to parse link URL: ' . $e->getMessage(),
                ));
            }
        }
    }
}
/**
 * Apply the configured 'match' regular expression to the document body and
 * log the first match together with its offset.
 *
 * @param Spizer_Document $document
 * @see   Spizer_Handler_Abstract::handle()
 */
public function handle(Spizer_Document $document)
{
    $pattern = $this->_config['match'];

    if (!preg_match($pattern, $document->getBody(), $hit, PREG_OFFSET_CAPTURE)) {
        return;
    }

    // With PREG_OFFSET_CAPTURE each entry is a (text, byte offset) pair.
    list($matchedText, $matchedOffset) = $hit[0];

    $this->_log(array(
        'message' => 'Document body matched lookup expression',
        'regex'   => $pattern,
        'match'   => $matchedText,
        'offset'  => $matchedOffset,
    ));
}
/**
 * Print the document URL followed by a line break.
 *
 * @param  Spizer_Document $doc
 * @return void
 */
public function handle(Spizer_Document $doc)
{
    $url = (string) $doc->getUrl();

    echo $url . PHP_EOL;
}
/**
 * Build a document from a request/response pair and pass it to every
 * registered handler.
 *
 * Each handler's call() method is invoked with the same document instance, in
 * the order the handlers are stored in $this->_handlers.
 *
 * @param Spizer_Request  $request
 * @param Spizer_Response $response
 */
protected function callHandlers(Spizer_Request $request, Spizer_Response $response)
{
    $doc = Spizer_Document::factory($request, $response);

    // Run all common handlers
    foreach ($this->_handlers as $registered) {
        $registered->call($doc);
    }
}