public function match(FilterableUri $uri)
{
    if (null !== $uri->getQuery()) {
        $uri->setFiltered(true, 'URI with query string');

        return true;
    }

    return false;
}
public function match(FilterableUri $uri)
{
    if (null !== $uri->getFragment()) {
        $uri->setFiltered(true, 'URI with hash fragment');

        return true;
    }

    return false;
}
/**
 * @return bool
 */
public function match(FilterableUri $uri)
{
    $scheme = $uri->getScheme();
    if (!in_array($scheme, $this->allowedSchemes)) {
        $uri->setFiltered(true, 'Scheme not allowed');

        return true;
    }

    return false;
}
public function match(FilterableUri $uri)
{
    /*
     * If the URI does not contain the seed, it is not allowed
     */
    if (false === stripos($uri->toString(), $this->seed->toString())) {
        $uri->setFiltered(true, 'Doesn\'t match base URI');

        return true;
    }

    return false;
}
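The strict `false ===` comparison matters here: when the seed is a prefix of the URI, stripos() returns 0, which a loose `if (!stripos(...))` check would wrongly treat as "not found". A minimal standalone demonstration of that behaviour (the URIs are invented for illustration):

<?php

$seed = 'http://example.com';

// Seed is a prefix of the URI: stripos() returns 0, a valid match position.
var_dump(stripos('http://example.com/about', $seed));   // int(0)

// Seed does not occur in the URI at all: stripos() returns false.
var_dump(stripos('http://other-site.org/page', $seed)); // bool(false)

// A loose check such as !stripos($uri, $seed) would treat both cases
// identically, which is why the filter compares against false with ===.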
public function match(FilterableUri $uri)
{
    $currentHostname = $uri->getHost();
    if ($this->allowSubDomains) {
        // Only use hostname.tld for comparison
        $currentHostname = join('.', array_slice(explode('.', $currentHostname), -2));
    }

    if (!in_array($currentHostname, $this->allowedHosts)) {
        $uri->setFiltered(true, 'Hostname not allowed');

        return true;
    }

    return false;
}
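The allowSubDomains branch collapses any hostname down to its last two dot-separated labels before comparison, so subdomains of an allowed host pass the filter. A quick illustration of that expression on its own (hostnames invented for the example):

<?php

$hosts = array('www.example.com', 'blog.example.com', 'example.com');

foreach ($hosts as $host) {
    // Keep only the last two labels: "hostname.tld"
    echo join('.', array_slice(explode('.', $host), -2)), "\n";
}

// Prints "example.com" three times, so all three hostnames compare
// equal against an allowed host of "example.com".

Note that this two-label heuristic treats a host like "shop.example.co.uk" as "co.uk", so multi-part public suffixes are a known limitation of the approach.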
/**
 * Crawls each URI in the traversal queue.
 * It applies all processors and listeners set on the Spider.
 *
 * Depending on how the next URI is taken from the queue, this behaves as
 * either a depth-first algorithm, as explained here:
 * https://en.wikipedia.org/wiki/Depth-first_search#Example
 * (note that because it is iterative rather than recursive, results come
 * out in a different order than in that example: the right-most child is
 * always taken first, whereas a recursive variant would always take the
 * left-most child first), or as a breadth-first algorithm.
 *
 * @return void
 */
private function doCrawl()
{
    while (count($this->traversalQueue)) {
        /** @var $currentUri Uri */
        $currentUri = $this->getNextUriFromQueue();

        // Fetch the document
        if (!($resource = $this->fetchResource($currentUri))) {
            continue;
        }

        $this->dispatch(
            SpiderEvents::SPIDER_CRAWL_FILTER_POSTFETCH,
            new GenericEvent($this, array('document' => $resource))
        );

        if ($this->matchesPostfetchFilter($resource)) {
            $this->getStatsHandler()->addToFiltered($resource);
            continue;
        }

        // The document was not filtered, so we add it to the processing queue
        $this->dispatch(
            SpiderEvents::SPIDER_CRAWL_PRE_ENQUEUE,
            new GenericEvent($this, array('document' => $resource))
        );

        $this->addToProcessQueue($resource);

        $nextLevel = $this->alreadySeenUris[$currentUri->toString()] + 1;
        if ($nextLevel > $this->maxDepth) {
            continue;
        }

        // Once the document is enqueued, apply the discoverers to look for more links to follow
        $discoveredUris = $this->executeDiscoverers($resource);

        foreach ($discoveredUris as $uri) {
            // Normalize the URI
            $uri->normalize();

            // Decorate the link to make it filterable
            $uri = new FilterableUri($uri);

            // Always skip nodes we already visited
            if (array_key_exists($uri->toString(), $this->alreadySeenUris)) {
                continue;
            }

            $this->dispatch(
                SpiderEvents::SPIDER_CRAWL_FILTER_PREFETCH,
                new GenericEvent($this, array('uri' => $uri))
            );

            if ($this->matchesPrefetchFilter($uri)) {
                $this->getStatsHandler()->addToFiltered($uri);
            } else {
                // The URI was not matched by any filter, mark as visited and add to queue
                array_push($this->traversalQueue, $uri);
            }
            $this->alreadySeenUris[$uri->toString()] = $nextLevel;
        }
    }
}
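Whether the loop above traverses depth-first or breadth-first is decided entirely by getNextUriFromQueue(): newly discovered links are always pushed onto the end of $this->traversalQueue, so taking from the end gives depth-first order and taking from the front gives breadth-first order. A minimal sketch of how such a method might look, assuming a hypothetical $traversalAlgorithm property and constants (those names are illustrative, not taken from the code above):

/**
 * Sketch only: returns the next URI to crawl from the traversal queue.
 * $traversalAlgorithm, ALGORITHM_DEPTH_FIRST and ALGORITHM_BREADTH_FIRST
 * are assumed names used here for illustration.
 */
private function getNextUriFromQueue()
{
    if ($this->traversalAlgorithm === self::ALGORITHM_DEPTH_FIRST) {
        // Take the most recently discovered URI first (LIFO) -> depth-first
        return array_pop($this->traversalQueue);
    }

    // Take the oldest discovered URI first (FIFO) -> breadth-first
    return array_shift($this->traversalQueue);
}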