public function match(FilterableUri $uri)
{
    if (null !== $uri->getQuery()) {
        $uri->setFiltered(true, 'URI with query string');

        return true;
    }

    return false;
}
public function match(FilterableUri $uri)
{
    if (null !== $uri->getFragment()) {
        $uri->setFiltered(true, 'URI with hash fragment');

        return true;
    }

    return false;
}
/**
 * @return bool
 */
public function match(FilterableUri $uri)
{
    $scheme = $uri->getScheme();
    if (!in_array($scheme, $this->allowedSchemes)) {
        $uri->setFiltered(true, 'Scheme not allowed');

        return true;
    }

    return false;
}
public function match(FilterableUri $uri)
{
    /*
     * If the URI does not contain the seed, it is not allowed
     */
    if (false === stripos($uri->toString(), $this->seed->toString())) {
        $uri->setFiltered(true, 'Doesn\'t match base URI');

        return true;
    }

    return false;
}
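The strict `false ===` comparison matters here: when the seed is a prefix of the URI, stripos() returns 0, which a loose `if (!stripos(...))` check would wrongly treat as "not found". A minimal standalone demonstration of that behaviour (the URIs are invented for illustration):

<?php

$seed = 'http://example.com';

// Seed is a prefix of the URI: stripos() returns 0, a valid match position.
var_dump(stripos('http://example.com/about', $seed));   // int(0)

// Seed does not occur in the URI at all: stripos() returns false.
var_dump(stripos('http://other-site.org/page', $seed)); // bool(false)

// A loose check such as !stripos($uri, $seed) would treat both cases
// identically, which is why the filter compares against false with ===.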
public function match(FilterableUri $uri)
{
    $currentHostname = $uri->getHost();
    if ($this->allowSubDomains) {
        // Only use hostname.tld for comparison
        $currentHostname = join('.', array_slice(explode('.', $currentHostname), -2));
    }

    if (!in_array($currentHostname, $this->allowedHosts)) {
        $uri->setFiltered(true, 'Hostname not allowed');

        return true;
    }

    return false;
}
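The allowSubDomains branch collapses any hostname down to its last two dot-separated labels before comparison, so subdomains of an allowed host pass the filter. A quick illustration of that expression on its own (hostnames invented for the example):

<?php

$hosts = array('www.example.com', 'blog.example.com', 'example.com');

foreach ($hosts as $host) {
    // Keep only the last two labels: "hostname.tld"
    echo join('.', array_slice(explode('.', $host), -2)), "\n";
}

// Prints "example.com" three times, so all three hostnames compare
// equal against an allowed host of "example.com".

Note that this two-label heuristic treats a host like "shop.example.co.uk" as "co.uk", so multi-part public suffixes are a known limitation of the approach.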
/**
 * Crawls each URI in the traversal queue.
 * It applies all processors and listeners set on the Spider.
 *
 * Depending on how the next URI is taken from the queue, this behaves as
 * either a depth-first algorithm, as explained here:
 * https://en.wikipedia.org/wiki/Depth-first_search#Example
 * (note that because it is iterative rather than recursive, results come
 * out in a different order than in that example: the right-most child is
 * always taken first, whereas a recursive variant would always take the
 * left-most child first), or as a breadth-first algorithm.
 *
 * @return void
 */
private function doCrawl()
{
    while (count($this->traversalQueue)) {
        /** @var $currentUri Uri */
        $currentUri = $this->getNextUriFromQueue();

        // Fetch the document
        if (!($resource = $this->fetchResource($currentUri))) {
            continue;
        }

        $this->dispatch(
            SpiderEvents::SPIDER_CRAWL_FILTER_POSTFETCH,
            new GenericEvent($this, array('document' => $resource))
        );

        if ($this->matchesPostfetchFilter($resource)) {
            $this->getStatsHandler()->addToFiltered($resource);
            continue;
        }

        // The document was not filtered, so we add it to the processing queue
        $this->dispatch(
            SpiderEvents::SPIDER_CRAWL_PRE_ENQUEUE,
            new GenericEvent($this, array('document' => $resource))
        );

        $this->addToProcessQueue($resource);

        $nextLevel = $this->alreadySeenUris[$currentUri->toString()] + 1;
        if ($nextLevel > $this->maxDepth) {
            continue;
        }

        // Once the document is enqueued, apply the discoverers to look for more links to follow
        $discoveredUris = $this->executeDiscoverers($resource);

        foreach ($discoveredUris as $uri) {
            // Normalize the URI
            $uri->normalize();

            // Decorate the link to make it filterable
            $uri = new FilterableUri($uri);

            // Always skip nodes we already visited
            if (array_key_exists($uri->toString(), $this->alreadySeenUris)) {
                continue;
            }

            $this->dispatch(
                SpiderEvents::SPIDER_CRAWL_FILTER_PREFETCH,
                new GenericEvent($this, array('uri' => $uri))
            );

            if ($this->matchesPrefetchFilter($uri)) {
                $this->getStatsHandler()->addToFiltered($uri);
            } else {
                // The URI was not matched by any filter, mark as visited and add to queue
                array_push($this->traversalQueue, $uri);
            }
            $this->alreadySeenUris[$uri->toString()] = $nextLevel;
        }
    }
}
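Whether the loop above traverses depth-first or breadth-first is decided entirely by getNextUriFromQueue(): newly discovered links are always pushed onto the end of $this->traversalQueue, so taking from the end gives depth-first order and taking from the front gives breadth-first order. A minimal sketch of how such a method might look, assuming a hypothetical $traversalAlgorithm property and constants (those names are illustrative, not taken from the code above):

/**
 * Sketch only: returns the next URI to crawl from the traversal queue.
 * $traversalAlgorithm, ALGORITHM_DEPTH_FIRST and ALGORITHM_BREADTH_FIRST
 * are assumed names used here for illustration.
 */
private function getNextUriFromQueue()
{
    if ($this->traversalAlgorithm === self::ALGORITHM_DEPTH_FIRST) {
        // Take the most recently discovered URI first (LIFO) -> depth-first
        return array_pop($this->traversalQueue);
    }

    // Take the oldest discovered URI first (FIFO) -> breadth-first
    return array_shift($this->traversalQueue);
}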