/**
 * Verifies the verdict of UrlCheck::isUrlBlacklisted() for a handful of
 * URLs checked against one fixed list of blacklist patterns.
 */
public function testIsUrlBlacklisted()
{
    $patterns = [
        '^(file|ftp|mailto):',
        '&type\\=rubriek',
        'http://example.com/Some%20Thing/',
    ];

    // URL => whether the checker is expected to report it as blacklisted.
    $expectations = [
        "http://example.com/Some%20Thing" => false,
        "http://example.com/Some%20Thing/" => true,
        "mailto:admin@example.com" => true,
        "http://example.com/article?param1=6&type=rubriek" => true,
        "http://example.com" => false,
    ];

    foreach ($expectations as $url => $shouldBeBlacklisted) {
        $verdict = UrlCheck::isUrlBlacklisted($url, $patterns);

        if ($shouldBeBlacklisted) {
            $this->assertTrue($verdict);
        } else {
            $this->assertFalse($verdict);
        }
    }
}
/**
 * Publishes a crawl job on the queue for each discovered URI that still
 * needs to be crawled.
 *
 * For every URI in the event, a fragment ("#...") is cut off first.
 * Blacklisted URIs trigger a "spider.crawl.blacklisted" event and are
 * skipped. A job is published only when the indexer does not report the URL
 * as indexed and not expired, and the job itself is allowed to crawl.
 *
 * @param \Symfony\Component\EventDispatcher\GenericEvent $event
 */
public function onDiscoverUrl(Event $event)
{
    $crawlJob = $event->getSubject()->getCurrentCrawlJob();

    foreach ($event['uris'] as $uri) {
        $position = strpos($uri, '#');
        if ($position) {
            // A '#' was found past offset 0: keep only the part in front of it.
            $uri = new Uri(substr($uri, 0, $position));
        }

        $normalizedUrl = $uri->normalize()->toString();
        if (UrlCheck::isUrlBlacklisted($normalizedUrl, $crawlJob->getBlacklist())) {
            $this->eventDispatcher->dispatch("spider.crawl.blacklisted", new Event($this, ['uri' => $uri]));

            // Blacklisted, so move on to the next URI.
            continue;
        }

        $candidateUrl = UrlCheck::fixUrl($uri->toString());
        if ($this->indexer->isUrlIndexedandNotExpired($candidateUrl, $crawlJob->getMetadata())) {
            continue;
        }

        $jobUrl = UrlCheck::fixUrl($uri->normalize()->toString());
        $currentJobUrl = (new Uri($crawlJob->getUrl()))->normalize()->toString();
        $job = new CrawlJob(
            $jobUrl,
            $currentJobUrl,
            $crawlJob->getBlacklist(),
            $crawlJob->getMetadata(),
            $crawlJob->getWhitelist()
        );

        if (!$job->isAllowedToCrawl()) {
            continue;
        }

        $this->queue->publishJob($job);
    }
}
/**
 * Deletes every document in a core whose URL is blacklisted.
 *
 * The documents of the core named by $metadata->core are fetched from the
 * indexer. All matches are collected first; only afterwards is each one
 * deleted by id and the deletion logged.
 *
 * @param array $blacklist Patterns handed to UrlCheck::isUrlBlacklisted().
 * @param mixed $metadata  Object exposing the target core as ->core.
 */
public function dropBlacklistedDocuments(array $blacklist, $metadata)
{
    $documents = $this->indexer->getDocumentUrlsInCore(['core' => $metadata->core]);

    // Phase 1: gather the matches without touching the index.
    $pendingDeletes = [];
    foreach ($documents as $candidate) {
        if (!UrlCheck::isUrlBlacklisted($candidate->url, $blacklist)) {
            continue;
        }

        $pendingDeletes[] = [
            'core' => $metadata->core,
            'id' => $candidate->id,
            'url' => $candidate->url,
        ];
    }

    // Phase 2: remove each match and log what was removed.
    foreach ($pendingDeletes as $entry) {
        $this->indexer->deleteDocumentById(['core' => $entry['core']], $entry['id']);

        $message = sprintf("Delete document %s. URL: %s", $entry['id'], $entry['url']);
        $this->logMessage("info", $message, $entry['url']);
    }
}