public function testIsAllowedToCrawl()
 {
     $whitelist = ["^http://example.com", "^https://facebook.com/user1"];
     $blacklist = ['^(file|ftp|mailto):', '&type\\=rubriek', 'http://example.com/Some%20Thing/'];
     $this->assertTrue(UrlCheck::isAllowedToCrawl("http://example.com/Some%20Thing", "http://example.com/item/article1", $blacklist, $whitelist));
     $this->assertFalse(UrlCheck::isAllowedToCrawl("http://example.com/Some%20Thing/article1", "http://example.com/item/article1", $blacklist, $whitelist));
     $this->assertTrue(UrlCheck::isAllowedToCrawl("https://facebook.com/user1", "http://example.com/item/article1", $blacklist, $whitelist));
 }
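
 /**
  * Illustrative sketch only, not the bundle's actual implementation: one way
  * UrlCheck::isAllowedToCrawl() could satisfy the assertions above is to
  * reject any URL matching a blacklist pattern and otherwise require a
  * whitelist match, falling back to a same-host comparison with the base URL
  * when no whitelist is configured. The method name, the regex delimiter and
  * the fallback are assumptions made for illustration.
  */
 public static function isAllowedToCrawlSketch($url, $baseUrl, array $blacklist, array $whitelist)
 {
     foreach ($blacklist as $pattern) {
         // A blacklist hit always wins: the URL must not be crawled
         if (preg_match('#' . $pattern . '#', $url)) {
             return false;
         }
     }
     if (empty($whitelist)) {
         // No whitelist configured: stay on the host of the base URL
         return parse_url($url, PHP_URL_HOST) === parse_url($baseUrl, PHP_URL_HOST);
     }
     foreach ($whitelist as $pattern) {
         if (preg_match('#' . $pattern . '#', $url)) {
             return true;
         }
     }
     return false;
 }
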
 /**
  * Writes the found URL as a job on the queue.
  *
  * A URL is only persisted to the queue when it has not been indexed yet.
  *
  * @param \Symfony\Component\EventDispatcher\GenericEvent $event
  */
 public function onDiscoverUrl(Event $event)
 {
     $crawlJob = $event->getSubject()->getCurrentCrawlJob();
     foreach ($event['uris'] as $uri) {
         // Strip any fragment ("#...") from the discovered URI before it is normalised
         if ($position = strpos($uri, '#')) {
             $uri = new Uri(substr($uri, 0, $position));
         }
         $isBlacklisted = UrlCheck::isUrlBlacklisted($uri->normalize()->toString(), $crawlJob->getBlacklist());
         if ($isBlacklisted) {
             $this->eventDispatcher->dispatch("spider.crawl.blacklisted", new Event($this, ['uri' => $uri]));
             // URL is blacklisted, so skip it and move on to the next one
             continue;
         }
         if (!$this->indexer->isUrlIndexedandNotExpired(UrlCheck::fixUrl($uri->toString()), $crawlJob->getMetadata())) {
             // Not indexed yet (or expired): build a new crawl job for this URL
             $job = new CrawlJob(
                 UrlCheck::fixUrl($uri->normalize()->toString()),
                 (new Uri($crawlJob->getUrl()))->normalize()->toString(),
                 $crawlJob->getBlacklist(),
                 $crawlJob->getMetadata(),
                 $crawlJob->getWhitelist()
             );
             if ($job->isAllowedToCrawl()) {
                 $this->queue->publishJob($job);
             }
         }
     }
 }
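
 /**
  * Illustrative sketch only, not the bundle's actual implementation:
  * UrlCheck::isUrlBlacklisted(), as used above and in dropBlacklistedDocuments()
  * below, would simply report whether the URL matches any configured blacklist
  * pattern. The method name and regex delimiter are assumptions.
  */
 public static function isUrlBlacklistedSketch($url, array $blacklist)
 {
     foreach ($blacklist as $pattern) {
         if (preg_match('#' . $pattern . '#', $url)) {
             return true;
         }
     }
     return false;
 }
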
 /**
  * Removes documents from the index whose URL matches one of the blacklist patterns.
  *
  * @param array  $blacklist
  * @param object $metadata Metadata exposing the index core name as $metadata->core
  */
 public function dropBlacklistedDocuments(array $blacklist, $metadata)
 {
     $result = $this->indexer->getDocumentUrlsInCore(['core' => $metadata->core]);
     $toDelete = [];
     // First collect every blacklisted document, then delete them in a second pass
     foreach ($result as $document) {
         if (UrlCheck::isUrlBlacklisted($document->url, $blacklist)) {
             $toDelete[] = ['core' => $metadata->core, 'id' => $document->id, 'url' => $document->url];
         }
     }
     foreach ($toDelete as $document) {
         $this->indexer->deleteDocumentById(['core' => $document['core']], $document['id']);
         $this->logMessage("info", sprintf("Delete document %s. URL: %s", $document['id'], $document['url']), $document['url']);
     }
 }
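
 /**
  * Usage sketch (assumption, not taken from the bundle): after a job's blacklist
  * has been extended, documents that were indexed earlier but now match it can
  * be purged like this. The method name and the hard-coded core are placeholders;
  * $metadata only needs to expose the core name, mirroring the usage above.
  */
 public function purgeAfterBlacklistChangeSketch($crawlJob)
 {
     $metadata = (object) ['core' => 'example-core'];
     $this->dropBlacklistedDocuments($crawlJob->getBlacklist(), $metadata);
 }
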
 /**
  * Check if the URL from this job is allowed to be crawled.
  *
  * @return boolean
  */
 public function isAllowedToCrawl()
 {
     return UrlCheck::isAllowedToCrawl($this->url, $this->baseUrl, $this->blacklist, $this->whitelist);
 }