/**
 * Builds a persistable document from a crawled resource.
 *
 * The content type of the response selects the extractor that produces the
 * document data. Content types that are not listed explicitly are handled
 * by the HTML extractor.
 *
 * @param \VDB\Spider\Resource $resource
 *
 * @return \Simgroep\ConcurrentSpiderBundle\PersistableDocument
 */
public function getDocumentByResource(Resource $resource)
{
    $contentType = $resource->getResponse()->getContentType();

    switch ($contentType) {
        case 'application/pdf':
        case 'application/octet-stream':
            $extractor = $this->pdf;
            break;

        case 'application/msword':
        case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
        case 'application/vnd.openxmlformats-officedocument.wordprocessingml.template':
            // A ".docx" anywhere in the URI selects the Word 2007 extractor,
            // everything else in this group goes to the legacy .doc extractor.
            $hasDocxInUri = stripos($resource->getUri()->toString(), '.docx') !== false;
            $extractor = $hasDocxInUri ? $this->word2007 : $this->msdoc;
            break;

        case 'application/rtf':
            $extractor = $this->rtf;
            break;

        case 'application/vnd.oasis.opendocument.text':
            $extractor = $this->odt;
            break;

        case 'text/html':
        default:
            $extractor = $this->html;
            break;
    }

    return new PersistableDocument($extractor->getData($resource));
}
/**
 * Writes the raw response of a resource to a file below the result path.
 *
 * The file name is the URL-encoded URI of the resource. The value returned
 * by the write is added to the running total of persisted bytes.
 *
 * @param Resource $resource
 */
public function persist(Resource $resource)
{
    $encodedUri = urlencode($resource->getUri()->toString());
    $target = new \SplFileObject($this->getResultPath() . $encodedUri, 'w');

    $payload = $resource->getResponse()->__toString();
    $written = $target->fwrite($payload);

    $this->totalSizePersisted += $written;
}
/**
 * Tells whether the resource must be filtered out because of its mime type.
 *
 * A resource whose response does not have the allowed content type is
 * flagged as filtered, with a reason naming the offending mime type.
 *
 * @param Resource $resource
 *
 * @return bool true when the resource was filtered, false otherwise
 */
public function match(Resource $resource)
{
    $response = $resource->getResponse();

    // Allowed content type: nothing to filter.
    if ($response->isContentType($this->allowedMimeType)) {
        return false;
    }

    $mime = $response->getContentType();
    $resource->setFiltered(true, "Mime type '{$mime}' not allowed");

    return true;
}
/**
 * A resource must survive a serialize/unserialize round trip with its
 * response, URI, body and crawler content intact.
 *
 * @covers VDB\Spider\Resource
 */
public function testSerialization()
{
    $original = $this->resource;
    $restored = unserialize(serialize($original));

    // The restored object graph has the expected types.
    $this->assertInstanceOf('VDB\\Spider\\Resource', $restored);
    $this->assertInstanceOf('Psr\\Http\\Message\\ResponseInterface', $restored->getResponse());
    $this->assertInstanceOf('VDB\\Spider\\Uri\\DiscoveredUri', $restored->getUri());

    // The restored object carries the same data as the original.
    $this->assertEquals(
        $original->getUri()->__toString(),
        $restored->getUri()->__toString()
    );
    $this->assertEquals(
        $this->html,
        $restored->getResponse()->getBody()->__toString()
    );
    $this->assertEquals(
        $original->getCrawler()->html(),
        $restored->getCrawler()->html()
    );
}
/**
 * Turns the crawled resource into a document and publishes it on the queue.
 *
 * Resources whose body length reaches the configured maximum are rejected.
 * A PRE_PERSIST event is dispatched before the message is built, so the
 * published payload is read from the document after the event has run.
 *
 * @param \VDB\Spider\Resource                      $resource
 * @param \Simgroep\ConcurrentSpiderBundle\CrawlJob $crawlJob
 *
 * @throws \Simgroep\ConcurrentSpiderBundle\InvalidContentException
 */
public function persist(Resource $resource, CrawlJob $crawlJob)
{
    $bodyLength = strlen($resource->getResponse()->getBody());

    if ($bodyLength >= $this->maximumResourceSize) {
        $reason = sprintf('Resource size exceeds limits (%s bytes)', $this->maximumResourceSize);

        throw new InvalidContentException($reason);
    }

    $document = $this->documentResolver->getDocumentByResource($resource);

    $event = new PersistenceEvent($document, $resource, $crawlJob->getMetadata());
    $this->eventDispatcher->dispatch(PersistenceEvents::PRE_PERSIST, $event);

    // Built only after the event was dispatched, exactly like the metadata lookup below.
    $payload = [
        'document' => $document->toArray(),
        'metadata' => $crawlJob->getMetadata(),
    ];

    $this->queue->publish(new AMQPMessage(json_encode($payload), ['delivery_mode' => 1]));
}
/**
 * Collects the URIs found in the "href" attribute of every node that
 * matches the configured CSS selector.
 *
 * An href that cannot be turned into a URI is reported to the stats handler
 * of the spider as failed and is left out of the result.
 *
 * @param Spider $spider
 * @param Resource $document
 * @return UriInterface[]
 */
public function discover(Spider $spider, Resource $document)
{
    $found = [];
    $matches = $document->getCrawler()->filter($this->cssSelector);

    foreach ($matches as $element) {
        try {
            $base = $document->getUri()->toString();
            $found[] = new Uri($element->getAttribute('href'), $base);
        } catch (UriSyntaxException $syntaxError) {
            $spider->getStatsHandler()->addToFailed(
                $element->getAttribute('href'),
                'Invalid URI: ' . $syntaxError->getMessage()
            );
        }
    }

    return $found;
}
/**
 * Extract content from resource
 *
 * The response body is written to a temporary file, loaded through the
 * reader and rendered by the writer; the rendered output is stripped of
 * binary content and tags.
 *
 * The temporary file is removed and the error reporting level restored even
 * when loading the document throws.
 *
 * @param \VDB\Spider\Resource $resource
 *
 * @return string
 */
public function extractContentFromResource(Resource $resource)
{
    $tempFile = tempnam(sys_get_temp_dir(), 'docx');
    file_put_contents($tempFile, $resource->getResponse()->getBody());

    $errorReportingLevel = error_reporting();

    try {
        $reader = $this->getReader();

        // Silence notices raised by the library. The bit has to be cleared
        // with "& ~": an XOR would switch notices ON when they were off.
        error_reporting($errorReportingLevel & ~E_NOTICE);

        $phpword = $reader->load($tempFile);
    } finally {
        // Back to the previous error reporting state, and never leave the temp file behind.
        error_reporting($errorReportingLevel);
        unlink($tempFile);
    }

    $writer = $this->getWriter($phpword);

    return strip_tags($this->stripBinaryContent($writer->getContent()));
}
/**
 * Extract content from resource
 *
 * The response body is written to a temporary file, loaded through the
 * reader and rendered by the writer; the rendered output is stripped of
 * binary content and tags.
 *
 * A document the reader cannot load yields an empty string. The temporary
 * file is always removed and the error reporting level always restored.
 *
 * @param \VDB\Spider\Resource $resource
 *
 * @return string
 */
public function extractContentFromResource(Resource $resource)
{
    $tempFile = $this->getTempFileName('rtf');
    file_put_contents($tempFile, $resource->getResponse()->getBody());

    $errorReportingLevel = error_reporting();

    try {
        $reader = $this->getReader();

        // Silence notices raised by the library. The bit has to be cleared
        // with "& ~": an XOR would switch notices ON when they were off.
        error_reporting($errorReportingLevel & ~E_NOTICE);

        try {
            $phpword = $reader->load($tempFile);
        } catch (\Exception $e) {
            // Load failures are deliberately not propagated. Without a loaded
            // document there is nothing to hand to the writer, so report
            // "no content" instead of continuing with an undefined variable.
            return '';
        }
    } finally {
        // Back to the previous error reporting state, and never leave the temp file behind.
        error_reporting($errorReportingLevel);
        unlink($tempFile);
    }

    $writer = $this->getWriter($phpword);

    return strip_tags($this->stripBinaryContent($writer->getContent()));
}
/**
 * Extracts all text content from the crawled resource, except javascript.
 *
 * Nodes matching the CSS blacklist (when one is configured) are removed from
 * the document first. The trimmed, non-empty text nodes found below the
 * body, outside of script elements, are then joined with single spaces.
 *
 * @param \VDB\Spider\Resource $resource
 *
 * @return string
 */
public function extractContentFromResource(Resource $resource)
{
    $crawler = $resource->getCrawler();

    if (null !== $this->cssBlacklist) {
        $crawler->filter($this->cssBlacklist)->each(function (Crawler $blacklisted) {
            foreach ($blacklisted as $element) {
                $element->parentNode->removeChild($element);
            }
        });
    }

    $fragments = [];

    $crawler->filterXpath('//body//*[not(self::script)]/text()')->each(
        function (Crawler $textNode) use (&$fragments) {
            $text = trim($textNode->text());

            if ('' !== $text) {
                $fragments[] = $text;
            }
        }
    );

    return implode(' ', $fragments);
}
/**
 * Runs every registered discoverer on the resource and returns the URIs
 * that are left after post-processing.
 *
 * The URI of the resource itself is marked as seen first; nothing is
 * discovered for a resource that sits at the maximum depth. The combined
 * result is normalized, de-duplicated, stripped of already seen URIs and
 * filtered. Each remaining URI gets a depth one level below the resource
 * and is marked as seen.
 *
 * @param Resource $resource
 * @return UriInterface[]
 */
public function discover(Resource $resource)
{
    $this->markSeen($resource->getUri());

    if ($this->isAtMaxDepth($resource->getUri())) {
        return [];
    }

    $candidates = [];
    foreach ($this->discoverers as $discoverer) {
        $candidates = array_merge($candidates, $discoverer->discover($resource));
    }

    // The order of these steps is significant; each one works on the list in place.
    $this->normalize($candidates);
    $this->removeDuplicates($candidates);
    $this->filterAlreadySeen($candidates);
    $this->filter($candidates);

    foreach ($candidates as $candidate) {
        $candidate->setDepthFound($resource->getUri()->getDepthFound() + 1);
        $this->markSeen($candidate);
    }

    return $candidates;
}
/**
 * Extract content from resource
 *
 * The response body is written to a temporary pdf file which is handed to
 * the configured pdf-to-text command. The text is read from the file named
 * after the temporary pdf file with ".txt" appended. Both temporary files
 * are removed again.
 *
 * @param \VDB\Spider\Resource $resource
 *
 * @return string
 * @throws InvalidContentException when the converted text is empty or cannot be read
 * @throws ProcessFailedException  when the conversion command is not successful
 */
public function extractContentFromResource(Resource $resource)
{
    $pdfPath = $this->getTempFileName('pdf');
    file_put_contents($pdfPath, $resource->getResponse()->getBody());

    // processCommand() returns once the command has finished
    $process = $this->processCommand(sprintf('%s %s', $this->pdfToTxtCommand, $pdfPath));
    $converted = $process->isSuccessful();

    // the temporary pdf is no longer needed, whatever the outcome
    unlink($pdfPath);

    if (!$converted) {
        throw new ProcessFailedException($process);
    }

    // the text is read from the pdf file name with ".txt" appended
    $txtPath = sprintf('%s.txt', $pdfPath);
    $content = file_get_contents($txtPath);

    // the temporary txt file is no longer needed either
    unlink($txtPath);

    // loose comparison on purpose: an empty result is rejected as well
    if (false == $content) {
        throw new InvalidContentException(sprintf("PDF: Cant read temporary txt file converted from pdf: %s", $txtPath));
    }

    return $content;
}
/**
 * Returns the crawler of the resource, narrowed down by the configured selector.
 *
 * @param Resource $resource
 *
 * @return mixed whatever the crawler's filter() call returns
 */
protected function getFilteredCrawler(Resource $resource)
{
    $crawler = $resource->getCrawler();

    return $crawler->filter($this->selector);
}
/**
 * Tells whether the resource must be filtered out because of its mime type.
 *
 * Only the media type part of the Content-Type header is compared, and the
 * comparison ignores case: "text/html; charset=UTF-8" and "Text/HTML" both
 * count as "text/html". A header that is identical to the configured value
 * is always accepted, so a configuration that spells out the complete
 * header line keeps working.
 *
 * @param Resource $resource
 *
 * @return bool true when the content type is NOT the allowed one
 */
public function match(Resource $resource)
{
    $contentType = $resource->getResponse()->getHeaderLine('Content-Type');

    // Exact match against the configured value: accepted as before.
    if ($contentType === $this->allowedMimeType) {
        return false;
    }

    // Drop parameters such as "; charset=UTF-8" before comparing.
    $headerParts = explode(';', $contentType, 2);
    $mediaType = strtolower(trim($headerParts[0]));
    $allowed = strtolower(trim((string) $this->allowedMimeType));

    return $mediaType !== $allowed;
}
/**
 * The identifier of the fixture resource is expected to be its URI string.
 *
 * @covers VDB\Spider\Resource::getIdentifier
 */
public function testGetIdentifier()
{
    $identifier = $this->resource->getIdentifier();

    $this->assertEquals('http://example.org/domains/special', $identifier);
}
/**
 * Add a Resource to the processing queue
 *
 * When a maximum queue size is configured (non-zero) and has been reached,
 * the resource is flagged as filtered, reported to the stats handler and a
 * QueueException is thrown. Otherwise the queue counter is incremented, the
 * resource is persisted and its URI is reported as queued.
 *
 * @param Resource $resource
 * @return void
 */
protected function addToProcessQueue(Resource $resource)
{
    $hasLimit = $this->maxQueueSize != 0;

    if ($hasLimit && $this->currentQueueSize >= $this->maxQueueSize) {
        $reason = 'Maximum Queue Size of ' . $this->maxQueueSize . ' reached';

        $resource->setFiltered(true, $reason);
        $this->getStatsHandler()->addToFiltered($resource);

        throw new QueueException($reason);
    }

    // The counter goes up before the resource is handed to the persistence handler.
    $this->currentQueueSize++;

    $this->getPersistenceHandler()->persist($resource);
    $this->getStatsHandler()->addToQueued($resource->getUri());
}
/**
 * The response held by the fixture resource must be a PSR-7 response.
 *
 * @covers VDB\Spider\Resource::getResponse
 */
public function testGetResponse()
{
    $response = $this->resource->getResponse();

    $this->assertInstanceOf('Psr\\Http\\Message\\ResponseInterface', $response);
}
/**
 * Extract content from resource
 *
 * The response body is handed to the pdf parser; the text of the parsed
 * document is returned with binary content stripped.
 *
 * @param \VDB\Spider\Resource $resource
 *
 * @return string
 */
public function extractContentFromResource(Resource $resource)
{
    $body = $resource->getResponse()->getBody(true);
    $parsed = $this->pdfParser->parseContent($body);

    return $this->stripBinaryContent($parsed->getText());
}