/**
 * Processes the next URL of the crawl queue.
 *
 * Takes the first entry of $this->urlsToCrawl, requests it and records it in
 * $this->urlsDone. On success the page title and meta description are logged
 * and, when $this->deepRunning is set, the relative links found in the page
 * are appended to the queue. On failure the redirect target (if any) is
 * queued instead; otherwise the HTTP response code is logged.
 *
 * @return bool false when the queue is empty, true otherwise (including when
 *              the URL was skipped because it had already been processed)
 */
private function next()
{
    // Both counters are read before the URL is shifted off the queue, so
    // "left" still includes the URL that is about to be loaded.
    $message = count($this->urlsDone) . " done" . PHP_EOL;
    $message .= count($this->urlsToCrawl) . " left" . PHP_EOL;

    if (count($this->urlsToCrawl) === 0) {
        return false;
    }

    $url = array_shift($this->urlsToCrawl);
    if (in_array($url, $this->urlsDone, true)) {
        // Already handled: consume the queue entry silently.
        return true;
    }

    $message .= "Loading : " . $url . PHP_EOL;
    $this->dispatchEvent(new SimpleCrawlerEvent(SimpleCrawlerEvent::OUTPUT, $message));

    $r = new Request($url);
    try {
        $d = $r->execute();
    } catch (\Exception $e) {
        // Best effort: a request that throws is handled like an empty response.
        $d = false;
    }

    if (!$d) {
        $this->urlsDone[] = $url;
        $redirect = $r->getRedirectURL();
        if ($redirect) {
            // Follow the redirect later, unless that target is already known.
            if (!in_array($redirect, $this->urlsToCrawl, true)
                && !in_array($redirect, $this->urlsDone, true)
            ) {
                $this->urlsToCrawl[] = $redirect;
            }
        } else {
            // No redirect: log the HTTP code; $redirect is the (falsy) redirect URL.
            $this->log($url, $r->getResponseHTTPCode(), $redirect);
        }

        return true;
    }

    $baseHref = $this->extract('/\\<base href="([^"]+)"/', $d);
    $title = $this->extract('/\\<title\\>([^<]+)/', $d);
    $description = $this->extract('/\\<meta name="description" content="([^"]+)"/', $d);

    // Links are only collected in deep mode, so skip the scan entirely otherwise.
    if ($this->deepRunning) {
        preg_match_all('/href\\="([^"]+)"/', $d, $matches);
        if (!empty($matches[1])) {
            foreach ($matches[1] as $u) {
                // Only site-relative links are followed. Skip the bare root,
                // in-page fragments, protocol-relative URLs ("//host/...") and
                // anything that starts with a URI scheme (http:, https:,
                // javascript:, mailto:, tel:, data:, ...), in any letter case.
                if ($u === "/" || preg_match('~^(?:[a-z][a-z0-9+.-]*:|//|#)~i', $u) === 1) {
                    continue;
                }

                // Drop a single leading slash before prefixing the base href.
                if (strpos($u, "/") === 0) {
                    $u = substr($u, 1);
                }

                // NOTE(review): when the page has no <base href>, whatever
                // extract() returns for a miss is prepended here - confirm.
                $u = $baseHref . $u;

                if (!in_array($u, $this->urlsToCrawl, true) && !in_array($u, $this->urlsDone, true)) {
                    $this->urlsToCrawl[] = $u;
                }
            }
        }
    }

    $this->urlsDone[] = $url;
    $this->log($url, $title, $description);

    return true;
}