/**
 * @return bool
 */
public function match(FilterableUri $uri)
{
    // Skip any URI that carries a query string
    if (null !== $uri->getQuery()) {
        $uri->setFiltered(true, 'URI with query string');
        return true;
    }
    return false;
}
/**
 * @return bool
 */
public function match(FilterableUri $uri)
{
    // Skip any URI that contains a hash fragment (e.g. #section-1)
    if (null !== $uri->getFragment()) {
        $uri->setFiltered(true, 'URI with hash fragment');
        return true;
    }
    return false;
}
/**
 * @return bool
 */
public function match(FilterableUri $uri)
{
    // Skip any URI whose scheme is not in the list of allowed schemes
    $scheme = $uri->getScheme();
    if (!in_array($scheme, $this->allowedSchemes)) {
        $uri->setFiltered(true, 'Scheme not allowed');
        return true;
    }
    return false;
}
/**
 * @return bool
 */
public function match(FilterableUri $uri)
{
    // If the URI does not contain the seed URI, it lies outside the
    // site being crawled and must be skipped
    if (false === stripos($uri->toString(), $this->seed->toString())) {
        $uri->setFiltered(true, 'Doesn\'t match base URI');
        return true;
    }
    return false;
}
/**
 * @return bool
 */
public function match(FilterableUri $uri)
{
    $currentHostname = $uri->getHost();
    if ($this->allowSubDomains) {
        // Only compare hostname.tld, e.g. blog.example.com -> example.com
        $currentHostname = implode('.', array_slice(explode('.', $currentHostname), -2));
    }
    if (!in_array($currentHostname, $this->allowedHosts)) {
        $uri->setFiltered(true, 'Hostname not allowed');
        return true;
    }
    return false;
}
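
All five filters follow the same contract: match() returns true, and records a human-readable reason via setFiltered(), when a URI should be dropped before it is ever fetched. The sketch below shows how such a filter might be attached to a spider. It is a minimal sketch only: the Spider and AllowedSchemeFilter class names and the addPreFetchFilter() method are assumptions modelled on php-spider's API and may differ in the version you use.

<?php

use VDB\Spider\Spider;
use VDB\Spider\Filter\Prefetch\AllowedSchemeFilter;

// A minimal sketch, assuming php-spider-style class names and an
// addPreFetchFilter() method on the Spider; adjust to the actual API.
$spider = new Spider('https://example.com');

// Only follow http and https links; any URI with another scheme is
// marked as filtered by the match() method shown above and never fetched.
$spider->addPreFetchFilter(new AllowedSchemeFilter(array('http', 'https')));

$spider->crawl();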
Example #6
 /**
  * Crawls each URI in the traversal queue,
  * applying all processors and listeners set on the Spider.
  *
  * This is either a depth-first algorithm, as explained here:
  *  https://en.wikipedia.org/wiki/Depth-first_search#Example
  * (note that because we do it iteratively rather than recursively,
  * results will be in a different order from the example: we always
  * take the right-most child first, whereas a recursive variant
  * always takes the left-most child first),
  * or a breadth-first algorithm.
  *
  * @return void
  */
 private function doCrawl()
 {
     while (count($this->traversalQueue)) {
          /** @var Uri $currentUri */
         $currentUri = $this->getNextUriFromQueue();
         // Fetch the document
         if (!($resource = $this->fetchResource($currentUri))) {
             continue;
         }
         $this->dispatch(SpiderEvents::SPIDER_CRAWL_FILTER_POSTFETCH, new GenericEvent($this, array('document' => $resource)));
         if ($this->matchesPostfetchFilter($resource)) {
             $this->getStatsHandler()->addToFiltered($resource);
             continue;
         }
         // The document was not filtered, so we add it to the processing queue
         $this->dispatch(SpiderEvents::SPIDER_CRAWL_PRE_ENQUEUE, new GenericEvent($this, array('document' => $resource)));
         $this->addToProcessQueue($resource);
         $nextLevel = $this->alreadySeenUris[$currentUri->toString()] + 1;
         if ($nextLevel > $this->maxDepth) {
             continue;
         }
         // Once the document is enqueued, apply the discoverers to look for more links to follow
         $discoveredUris = $this->executeDiscoverers($resource);
         foreach ($discoveredUris as $uri) {
             // normalize the URI
             $uri->normalize();
             // Decorate the link to make it filterable
             $uri = new FilterableUri($uri);
             // Always skip nodes we already visited
             if (array_key_exists($uri->toString(), $this->alreadySeenUris)) {
                 continue;
             }
             $this->dispatch(SpiderEvents::SPIDER_CRAWL_FILTER_PREFETCH, new GenericEvent($this, array('uri' => $uri)));
             if ($this->matchesPrefetchFilter($uri)) {
                 $this->getStatsHandler()->addToFiltered($uri);
             } else {
                 // The URI was not matched by any filter, mark as visited and add to queue
                 array_push($this->traversalQueue, $uri);
             }
             $this->alreadySeenUris[$uri->toString()] = $nextLevel;
         }
     }
 }
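
Whether doCrawl() behaves depth-first or breadth-first is decided entirely by which end of $traversalQueue the getNextUriFromQueue() helper takes from. Below is a minimal sketch of that helper, assuming a $traversalAlgorithm property and ALGORITHM_DEPTH_FIRST / ALGORITHM_BREADTH_FIRST constants; these names are assumptions for illustration, not necessarily the library's own.

/**
 * A minimal sketch, assuming a $traversalAlgorithm property and the
 * ALGORITHM_DEPTH_FIRST / ALGORITHM_BREADTH_FIRST constants; the real
 * implementation may differ.
 *
 * @return Uri
 */
private function getNextUriFromQueue()
{
    if (static::ALGORITHM_DEPTH_FIRST === $this->traversalAlgorithm) {
        // Last in, first out: always descend into the most recently
        // discovered URI first (a stack), i.e. depth-first
        return array_pop($this->traversalQueue);
    }

    // First in, first out: finish the current depth level before
    // moving deeper (a queue), i.e. breadth-first
    return array_shift($this->traversalQueue);
}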