Example #1
0
 /**
  * Processes the next URL waiting in the crawl queue.
  *
  * One URL is shifted off $this->urlsToCrawl and loaded through Request.
  * - When no body comes back (falsy result or exception), the URL is marked as done and
  *   either its redirect target is queued or the HTTP response code is logged.
  * - Otherwise title and meta description are logged and, when $this->deepRunning is set,
  *   the relative links found in the body are queued (prefixed with the page's <base href>).
  *
  * @return bool false once the queue is empty; true otherwise, including when the shifted
  *              URL was skipped because it had already been processed
  */
 private function next()
 {
     $message = "" . count($this->urlsDone) . " done" . PHP_EOL;
     $message .= "" . count($this->urlsToCrawl) . " left" . PHP_EOL;
     if (count($this->urlsToCrawl) === 0) {
         return false;
     }
     $url = array_shift($this->urlsToCrawl);
     // Strict comparison: loose in_array() treats numeric-looking strings ("10" / "1e1") as equal.
     if (in_array($url, $this->urlsDone, true)) {
         return true;
     }
     $message .= "Loading : " . $url . PHP_EOL;
     $this->dispatchEvent(new SimpleCrawlerEvent(SimpleCrawlerEvent::OUTPUT, $message));
     $r = new Request($url);
     try {
         $d = $r->execute();
     } catch (\Exception $e) {
         // A failed request is handled like an empty response below.
         $d = false;
     }
     if (!$d) {
         $this->urlsDone[] = $url;
         $redirect = $r->getRedirectURL();
         if (!$redirect) {
             $this->log($url, $r->getResponseHTTPCode(), $redirect);
             return true;
         }
         // Same de-duplication as for regular links, so a target is never queued twice.
         if (!in_array($redirect, $this->urlsToCrawl, true) && !in_array($redirect, $this->urlsDone, true)) {
             $this->urlsToCrawl[] = $redirect;
         }
         return true;
     }
     $baseHref = $this->extract('/\\<base href="([^"]+)"/', $d);
     $title = $this->extract('/\\<title\\>([^<]+)/', $d);
     $description = $this->extract('/\\<meta name="description" content="([^"]+)"/', $d);
     // Links are only needed when following them, so skip the scan entirely otherwise.
     if ($this->deepRunning) {
         // hrefs starting with one of these are never queued: absolute and protocol-relative
         // URLs, non-HTTP schemes and in-page anchors. Only relative links are followed.
         $skippedPrefixes = array('http://', 'https://', '//', 'javascript:', 'mailto:', 'tel:', '#');
         preg_match_all('/href\\="([^"]+)"/', $d, $matches);
         $hrefs = isset($matches[1]) ? $matches[1] : array();
         foreach ($hrefs as $u) {
             if ($u === "/") {
                 continue;
             }
             $skip = false;
             foreach ($skippedPrefixes as $prefix) {
                 if (strpos($u, $prefix) === 0) {
                     $skip = true;
                     break;
                 }
             }
             if ($skip) {
                 continue;
             }
             // Drop the leading slash before the link is appended to the base href.
             if (strpos($u, "/") === 0) {
                 $u = substr($u, 1);
             }
             $u = $baseHref . $u;
             if (!in_array($u, $this->urlsToCrawl, true) && !in_array($u, $this->urlsDone, true)) {
                 $this->urlsToCrawl[] = $u;
             }
         }
     }
     $this->urlsDone[] = $url;
     $this->log($url, $title, $description);
     return true;
 }