PHP VDB\Spider Resource примеры использования

Язык программирования: PHP

Пространство имен/Пакет: VDB\Spider

Класс/Тип: Resource

Примеров на hotexamples.com: 17

PHP VDB\Spider Resource - 17 примеров найдено. Это лучшие примеры PHP кода для VDB\Spider\Resource, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

getResponse(11)

getCrawler(5)

getUri(5)

setFiltered(3)

getFilterReason(1)

getIdentifier(1)

isFiltered(1)

Пример #1

Показать файл

Файл: DocumentResolver.php Проект: smolowik/concurrent-spider-bundle

 /**
  * Builds a persistable document from a crawled resource.
  *
  * The content type reported by the response selects which extractor
  * supplies the document data. Any content type that is not listed
  * explicitly is handled by the HTML extractor.
  *
  * @param \VDB\Spider\Resource $resource
  *
  * @return \Simgroep\ConcurrentSpiderBundle\PersistableDocument
  */
 public function getDocumentByResource(Resource $resource)
 {
     $contentType = $resource->getResponse()->getContentType();

     switch ($contentType) {
         case 'application/pdf':
         case 'application/octet-stream':
             $extractor = $this->pdf;
             break;

         case 'application/msword':
         case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
         case 'application/vnd.openxmlformats-officedocument.wordprocessingml.template':
             // A ".docx" anywhere in the URI selects the Word 2007 extractor,
             // everything else in this group goes to the legacy .doc extractor.
             $looksLikeDocx = false !== stripos($resource->getUri()->toString(), '.docx');
             $extractor = $looksLikeDocx ? $this->word2007 : $this->msdoc;
             break;

         case 'application/rtf':
             $extractor = $this->rtf;
             break;

         case 'application/vnd.oasis.opendocument.text':
             $extractor = $this->odt;
             break;

         case 'text/html':
         default:
             $extractor = $this->html;
             break;
     }

     return new PersistableDocument($extractor->getData($resource));
 }

Пример #2

Показать файл

Файл: FileRawResponsePersistenceHandler.php Проект: aigouzz/php-spider

 /**
  * Writes the raw response of the resource to a file in the result path.
  *
  * The file name is the URL-encoded URI of the resource. The value returned
  * by the write is added to the running total of persisted size.
  *
  * @param Resource $resource
  */
 public function persist(Resource $resource)
 {
     $encodedUri = urlencode($resource->getUri()->toString());
     $target = new \SplFileObject($this->getResultPath() . $encodedUri, 'w');

     $written = $target->fwrite($resource->getResponse()->__toString());
     $this->totalSizePersisted += $written;
 }

Пример #3

Показать файл

Файл: MimeTypeFilter.php Проект: aigouzz/php-spider

 /**
  * Tells whether the resource must be filtered because of its mime type.
  *
  * When the response is not of the allowed mime type the resource is marked
  * as filtered, with the offending content type in the reason.
  *
  * @param Resource $resource
  *
  * @return bool true when the resource was filtered, false otherwise
  */
 public function match(Resource $resource)
 {
     if ($resource->getResponse()->isContentType($this->allowedMimeType)) {
         return false;
     }

     $mime = $resource->getResponse()->getContentType();
     $reason = "Mime type '{$mime}' not allowed";
     $resource->setFiltered(true, $reason);

     return true;
 }

Пример #4

Показать файл

Файл: ResourceTest.php Проект: aktuba/php-spider

 /**
  * A resource must survive a serialize/unserialize round trip with its
  * response, URI and crawler content intact.
  *
  * @covers VDB\Spider\Resource
  */
 public function testSerialization()
 {
     $original = $this->resource;
     $restored = unserialize(serialize($original));

     // Types of the restored object graph
     $this->assertInstanceOf('VDB\\Spider\\Resource', $restored);
     $this->assertInstanceOf('Psr\\Http\\Message\\ResponseInterface', $restored->getResponse());
     $this->assertInstanceOf('VDB\\Spider\\Uri\\DiscoveredUri', $restored->getUri());

     // Content of the restored object graph
     $expectedUri = $original->getUri()->__toString();
     $this->assertEquals($expectedUri, $restored->getUri()->__toString());

     $restoredBody = $restored->getResponse()->getBody()->__toString();
     $this->assertEquals($this->html, $restoredBody);

     $expectedMarkup = $original->getCrawler()->html();
     $this->assertEquals($expectedMarkup, $restored->getCrawler()->html());
 }

Пример #5

Показать файл

Файл: RabbitMqPersistenceHandler.php Проект: smolowik/concurrent-spider-bundle

 /**
  * Turns the crawled resource into a document and publishes it on the queue.
  *
  * A PRE_PERSIST event is dispatched before the message is built, so
  * listeners get to see the document, the resource and the job metadata.
  *
  * @param \VDB\Spider\Resource                      $resource
  * @param \Simgroep\ConcurrentSpiderBundle\CrawlJob $crawlJob
  *
  * @throws \Simgroep\ConcurrentSpiderBundle\InvalidContentException when the
  *         response body is as large as or larger than the configured maximum
  */
 public function persist(Resource $resource, CrawlJob $crawlJob)
 {
     $bodyLength = strlen($resource->getResponse()->getBody());
     if ($bodyLength >= $this->maximumResourceSize) {
         $reason = sprintf('Resource size exceeds limits (%s bytes)', $this->maximumResourceSize);
         throw new InvalidContentException($reason);
     }

     $document = $this->documentResolver->getDocumentByResource($resource);

     $event = new PersistenceEvent($document, $resource, $crawlJob->getMetadata());
     $this->eventDispatcher->dispatch(PersistenceEvents::PRE_PERSIST, $event);

     // The payload is built after the event so listener changes are included.
     $payload = [
         'document' => $document->toArray(),
         'metadata' => $crawlJob->getMetadata(),
     ];
     $properties = ['delivery_mode' => 1];

     $this->queue->publish(new AMQPMessage(json_encode($payload), $properties));
 }

Пример #6

Показать файл

Файл: CssSelectorDiscoverer.php Проект: aigouzz/php-spider

 /**
  * Collects the URIs of all nodes in the document that match the CSS selector.
  *
  * Each href is resolved against the URI of the document. Hrefs that are not
  * valid URIs are reported to the stats handler of the spider as failed and
  * left out of the result.
  *
  * @param Spider $spider
  * @param Resource $document
  * @return UriInterface[]
  */
 public function discover(Spider $spider, Resource $document)
 {
     $found = [];
     $matches = $document->getCrawler()->filter($this->cssSelector);

     foreach ($matches as $element) {
         try {
             $base = $document->getUri()->toString();
             $found[] = new Uri($element->getAttribute('href'), $base);
         } catch (UriSyntaxException $syntaxError) {
             $reason = 'Invalid URI: ' . $syntaxError->getMessage();
             $spider->getStatsHandler()->addToFailed($element->getAttribute('href'), $reason);
         }
     }

     return $found;
 }

Пример #7

Показать файл

Файл: Word2007.php Проект: smolowik/concurrent-spider-bundle

 /**
  * Extract content from resource
  *
  * The response body is written to a temporary file, loaded with the reader
  * and rendered through the writer; the rendered output is stripped of
  * binary content and of tags.
  *
  * Notices are silenced while the reader loads the file. The previous
  * error reporting level is restored and the temporary file is removed even
  * when writing or loading fails.
  *
  * @param \VDB\Spider\Resource $resource
  *
  * @return string
  */
 public function extractContentFromResource(Resource $resource)
 {
     $tempFile = tempnam(sys_get_temp_dir(), 'docx');
     $errorReportingLevel = error_reporting();

     try {
         file_put_contents($tempFile, $resource->getResponse()->getBody());
         $reader = $this->getReader();

         // Clear the E_NOTICE bit. XOR would toggle it and therefore switch
         // notices ON whenever they were disabled to begin with.
         error_reporting($errorReportingLevel & ~E_NOTICE);
         $phpword = $reader->load($tempFile);
     } finally {
         // Always put the reporting level back and never leak the temp file.
         error_reporting($errorReportingLevel);
         unlink($tempFile);
     }

     $writer = $this->getWriter($phpword);
     return strip_tags($this->stripBinaryContent($writer->getContent()));
 }

Пример #8

Показать файл

Файл: Rtf.php Проект: simgroep/concurrent-spider-bundle

 /**
  * Extract content from resource
  *
  * The response body is written to a temporary file, loaded with the reader
  * and rendered through the writer; the rendered output is stripped of
  * binary content and of tags.
  *
  * Loading is best-effort: when the reader throws an exception (or yields
  * nothing) an empty string is returned instead of handing an undefined
  * document to the writer. Notices are silenced while loading; the previous
  * error reporting level is restored and the temporary file removed in
  * every case.
  *
  * @param \VDB\Spider\Resource $resource
  *
  * @return string extracted text, or an empty string when the file could
  *                not be loaded
  */
 public function extractContentFromResource(Resource $resource)
 {
     $tempFile = $this->getTempFileName('rtf');
     $errorReportingLevel = error_reporting();
     $phpword = null;

     try {
         file_put_contents($tempFile, $resource->getResponse()->getBody());
         $reader = $this->getReader();

         // Clear the E_NOTICE bit. XOR would toggle it and therefore switch
         // notices ON whenever they were disabled to begin with.
         error_reporting($errorReportingLevel & ~E_NOTICE);

         try {
             $phpword = $reader->load($tempFile);
         } catch (\Exception $e) {
             // Unreadable document: deliberately ignored, handled below.
         }
     } finally {
         // Always put the reporting level back and never leak the temp file.
         error_reporting($errorReportingLevel);
         if (is_file($tempFile)) {
             unlink($tempFile);
         }
     }

     if (null === $phpword) {
         return '';
     }

     $writer = $this->getWriter($phpword);
     return strip_tags($this->stripBinaryContent($writer->getContent()));
 }

Пример #9

Показать файл

Файл: Html.php Проект: smolowik/concurrent-spider-bundle

 /**
  * Extracts all text content from the crawled resource except javascript.
  *
  * Nodes matching the CSS blacklist (when one is configured) are removed
  * from the document first. The remaining non-empty text nodes inside the
  * body are trimmed and joined with a single space.
  *
  * @param \VDB\Spider\Resource $resource
  *
  * @return string
  */
 public function extractContentFromResource(Resource $resource)
 {
     $crawler = $resource->getCrawler();

     if (null !== $this->cssBlacklist) {
         $crawler->filter($this->cssBlacklist)->each(function (Crawler $blacklisted) {
             foreach ($blacklisted as $element) {
                 $element->parentNode->removeChild($element);
             }
         });
     }

     // each() hands back the value returned by the callback for every node.
     $fragments = $crawler
         ->filterXpath('//body//*[not(self::script)]/text()')
         ->each(function (Crawler $textNode) {
             return trim($textNode->text());
         });

     // 'strlen' as the filter keeps "0" and only drops empty fragments.
     $nonEmpty = array_filter($fragments, 'strlen');

     return implode(' ', $nonEmpty);
 }

Пример #10

Показать файл

Файл: DiscovererSet.php Проект: aktuba/php-spider

 /**
  * Runs every registered discoverer on the resource and returns the URIs
  * that are left after normalizing, de-duplicating and filtering.
  *
  * The URI of the resource itself is marked as seen. When it is already at
  * the maximum depth nothing is discovered. Every returned URI gets a depth
  * one deeper than the resource and is marked as seen as well.
  *
  * @param Resource $resource
  * @return UriInterface[]
  */
 public function discover(Resource $resource)
 {
     $origin = $resource->getUri();
     $this->markSeen($origin);

     if ($this->isAtMaxDepth($resource->getUri())) {
         return [];
     }

     $candidates = [];
     foreach ($this->discoverers as $strategy) {
         $candidates = array_merge($candidates, $strategy->discover($resource));
     }

     // The clean-up steps below work on $candidates in this exact order.
     $this->normalize($candidates);
     $this->removeDuplicates($candidates);
     $this->filterAlreadySeen($candidates);
     $this->filter($candidates);

     foreach ($candidates as $candidate) {
         $childDepth = $resource->getUri()->getDepthFound() + 1;
         $candidate->setDepthFound($childDepth);
         $this->markSeen($candidate);
     }

     return $candidates;
 }

Пример #11

Показать файл

Файл: Pdf.php Проект: simgroep/concurrent-spider-bundle

 /**
  * Extract content from resource
  *
  * The response body is written to a temporary file which is passed to the
  * configured pdf-to-text command. The text is then read from a second
  * temporary file named after the first one with ".txt" appended. Both
  * temporary files are removed before returning or throwing.
  *
  * @param \VDB\Spider\Resource $resource
  *
  * @return string
  * @throws InvalidContentException when the converted text file yields no content
  * @throws ProcessFailedException when the conversion command did not succeed
  */
 public function extractContentFromResource(Resource $resource)
 {
     $pdfPath = $this->getTempFileName('pdf');
     file_put_contents($pdfPath, $resource->getResponse()->getBody());

     $process = $this->processCommand(sprintf('%s %s', $this->pdfToTxtCommand, $pdfPath));

     // The pdf is no longer needed once the command has finished,
     // whatever its outcome.
     $converted = $process->isSuccessful();
     unlink($pdfPath);
     if (!$converted) {
         throw new ProcessFailedException($process);
     }

     $txtPath = sprintf('%s.txt', $pdfPath);
     $text = file_get_contents($txtPath);
     unlink($txtPath);

     // NOTE(review): the loose comparison also rejects an empty file and the
     // literal content "0", not only a failed read - confirm this is intended.
     if ($text == false) {
         throw new InvalidContentException(sprintf("PDF: Cant read temporary txt file converted from pdf: %s", $txtPath));
     }

     return $text;
 }

Пример #12

Показать файл

Файл: CssSelectorDiscoverer.php Проект: ggnet/php-spider

 /**
  * Narrows the crawler of the resource down to the configured selector.
  *
  * @param Resource $resource
  *
  * @return mixed whatever filter() on the crawler of the resource returns
  */
 protected function getFilteredCrawler(Resource $resource)
 {
     $crawler = $resource->getCrawler();

     return $crawler->filter($this->selector);
 }

Пример #13

Показать файл

Файл: MimeTypeFilter.php Проект: ggnet/php-spider

 /**
  * Tells whether the resource must be filtered because of its mime type.
  *
  * The Content-Type header may carry parameters ("text/html; charset=UTF-8")
  * and media types are case-insensitive, so only the media type part of the
  * header is compared, ignoring case and surrounding whitespace. A header
  * that equals the allowed mime type literally is always accepted, which
  * keeps configurations that relied on the exact comparison working.
  *
  * @param Resource $resource
  *
  * @return bool true when the content type is NOT the allowed mime type
  */
 public function match(Resource $resource)
 {
     $contentType = $resource->getResponse()->getHeaderLine('Content-Type');

     if ($contentType === $this->allowedMimeType) {
         return false;
     }

     // Drop any parameters (everything from the first ';' on).
     $parts = explode(';', $contentType, 2);
     $mediaType = strtolower(trim($parts[0]));
     $allowed = strtolower(trim((string) $this->allowedMimeType));

     return $mediaType !== $allowed;
 }

Пример #14

Показать файл

Файл: ResourceTest.php Проект: aigouzz/php-spider

 /**
  * The identifier of the fixture resource is expected to be its URI.
  *
  * @covers VDB\Spider\Resource::getIdentifier
  */
 public function testGetIdentifier()
 {
     $expected = 'http://example.org/domains/special';
     $actual = $this->resource->getIdentifier();

     $this->assertEquals($expected, $actual);
 }

Пример #15

Показать файл

Файл: Spider.php Проект: aigouzz/php-spider

 /**
  * Add a Resource to the processing queue
  *
  * A maximum queue size of 0 means no limit. When a limit is set and has
  * been reached, the resource is marked as filtered, reported to the stats
  * handler and a QueueException is thrown. Otherwise the resource is
  * persisted and its URI is reported as queued.
  *
  * @param Resource $resource
  * @return void
  * @throws QueueException when the maximum queue size has been reached
  */
 protected function addToProcessQueue(Resource $resource)
 {
     $limited = $this->maxQueueSize != 0;

     if ($limited && $this->currentQueueSize >= $this->maxQueueSize) {
         $reason = 'Maximum Queue Size of ' . $this->maxQueueSize . ' reached';

         $resource->setFiltered(true, $reason);
         $this->getStatsHandler()->addToFiltered($resource);

         throw new QueueException($reason);
     }

     $this->currentQueueSize++;
     $this->getPersistenceHandler()->persist($resource);
     $this->getStatsHandler()->addToQueued($resource->getUri());
 }

Пример #16

Показать файл

Файл: ResourceTest.php Проект: ggnet/php-spider

 /**
  * The response of the resource must be a PSR-7 response.
  *
  * @covers VDB\Spider\Resource::getResponse
  */
 public function testGetResponse()
 {
     $response = $this->resource->getResponse();

     $this->assertInstanceOf('Psr\\Http\\Message\\ResponseInterface', $response);
 }

Пример #17

Показать файл

Файл: Pdf.php Проект: smolowik/concurrent-spider-bundle

 /**
  * Extract content from resource
  *
  * The response body is handed to the pdf parser and the text of the parsed
  * document is returned after binary content has been stripped from it.
  *
  * @param \VDB\Spider\Resource $resource
  *
  * @return string
  */
 public function extractContentFromResource(Resource $resource)
 {
     $body = $resource->getResponse()->getBody(true);
     $parsed = $this->pdfParser->parseContent($body);
     $text = $parsed->getText();

     return $this->stripBinaryContent($text);
 }