public function testIsAllowedToCrawl()
 {
     $whitelist = ["^http://example.com", "^https://facebook.com/user1"];
     $blacklist = ['^(file|ftp|mailto):', '&type\\=rubriek', 'http://example.com/Some%20Thing/'];
     $this->assertTrue(UrlCheck::isAllowedToCrawl("http://example.com/Some%20Thing", "http://example.com/item/article1", $blacklist, $whitelist));
     $this->assertFalse(UrlCheck::isAllowedToCrawl("http://example.com/Some%20Thing/article1", "http://example.com/item/article1", $blacklist, $whitelist));
     $this->assertTrue(UrlCheck::isAllowedToCrawl("https://facebook.com/user1", "http://example.com/item/article1", $blacklist, $whitelist));
 }
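
 /**
  * Illustrative sketch only, not the bundle's actual implementation: one way
  * UrlCheck::isAllowedToCrawl() could satisfy the assertions above is to
  * reject any URL matching a blacklist pattern and otherwise require a
  * whitelist match, falling back to a same-host comparison with the base URL
  * when no whitelist is configured. The method name, the regex delimiter and
  * the fallback are assumptions made for illustration.
  */
 public static function isAllowedToCrawlSketch($url, $baseUrl, array $blacklist, array $whitelist)
 {
     foreach ($blacklist as $pattern) {
         // A blacklist hit always wins: the URL must not be crawled
         if (preg_match('#' . $pattern . '#', $url)) {
             return false;
         }
     }
     if (empty($whitelist)) {
         // No whitelist configured: stay on the host of the base URL
         return parse_url($url, PHP_URL_HOST) === parse_url($baseUrl, PHP_URL_HOST);
     }
     foreach ($whitelist as $pattern) {
         if (preg_match('#' . $pattern . '#', $url)) {
             return true;
         }
     }
     return false;
 }
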
 /**
  * Writes the found URL as a job on the queue.
  *
  * A URL is only persisted to the queue when it has not been indexed yet.
  *
  * @param \Symfony\Component\EventDispatcher\GenericEvent $event
  */
 public function onDiscoverUrl(Event $event)
 {
     $crawlJob = $event->getSubject()->getCurrentCrawlJob();
     foreach ($event['uris'] as $uri) {
         // Strip any fragment ("#...") from the discovered URI before it is normalised
         if ($position = strpos($uri, '#')) {
             $uri = new Uri(substr($uri, 0, $position));
         }
         $isBlacklisted = UrlCheck::isUrlBlacklisted($uri->normalize()->toString(), $crawlJob->getBlacklist());
         if ($isBlacklisted) {
             $this->eventDispatcher->dispatch("spider.crawl.blacklisted", new Event($this, ['uri' => $uri]));
             // URL is blacklisted, so skip it and move on to the next one
             continue;
         }
         if (!$this->indexer->isUrlIndexedandNotExpired(UrlCheck::fixUrl($uri->toString()), $crawlJob->getMetadata())) {
             // Not indexed yet (or expired): build a new crawl job for this URL
             $job = new CrawlJob(
                 UrlCheck::fixUrl($uri->normalize()->toString()),
                 (new Uri($crawlJob->getUrl()))->normalize()->toString(),
                 $crawlJob->getBlacklist(),
                 $crawlJob->getMetadata(),
                 $crawlJob->getWhitelist()
             );
             if ($job->isAllowedToCrawl()) {
                 $this->queue->publishJob($job);
             }
         }
     }
 }
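
 /**
  * Illustrative sketch only, not the bundle's actual implementation:
  * UrlCheck::isUrlBlacklisted(), as used above and in dropBlacklistedDocuments()
  * below, would simply report whether the URL matches any configured blacklist
  * pattern. The method name and regex delimiter are assumptions.
  */
 public static function isUrlBlacklistedSketch($url, array $blacklist)
 {
     foreach ($blacklist as $pattern) {
         if (preg_match('#' . $pattern . '#', $url)) {
             return true;
         }
     }
     return false;
 }
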
 /**
  * Removes documents from the index whose URL matches one of the blacklist patterns.
  *
  * @param array  $blacklist
  * @param object $metadata Metadata exposing the index core name as $metadata->core
  */
 public function dropBlacklistedDocuments(array $blacklist, $metadata)
 {
     $result = $this->indexer->getDocumentUrlsInCore(['core' => $metadata->core]);
     $toDelete = [];
     // First collect every blacklisted document, then delete them in a second pass
     foreach ($result as $document) {
         if (UrlCheck::isUrlBlacklisted($document->url, $blacklist)) {
             $toDelete[] = ['core' => $metadata->core, 'id' => $document->id, 'url' => $document->url];
         }
     }
     foreach ($toDelete as $document) {
         $this->indexer->deleteDocumentById(['core' => $document['core']], $document['id']);
         $this->logMessage("info", sprintf("Delete document %s. URL: %s", $document['id'], $document['url']), $document['url']);
     }
 }
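
 /**
  * Usage sketch (assumption, not taken from the bundle): after a job's blacklist
  * has been extended, documents that were indexed earlier but now match it can
  * be purged like this. The method name and the hard-coded core are placeholders;
  * $metadata only needs to expose the core name, mirroring the usage above.
  */
 public function purgeAfterBlacklistChangeSketch($crawlJob)
 {
     $metadata = (object) ['core' => 'example-core'];
     $this->dropBlacklistedDocuments($crawlJob->getBlacklist(), $metadata);
 }
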
 /**
  * Check if the URL from this job is allowed to be crawled.
  *
  * @return boolean
  */
 public function isAllowedToCrawl()
 {
     return UrlCheck::isAllowedToCrawl($this->url, $this->baseUrl, $this->blacklist, $this->whitelist);
 }