コード例 #1
0
ファイル: Crawler.php プロジェクト: phmlabs/crawler
 /**
  * Returns the next fetched response, or false once the page container yields no more URLs.
  *
  * While the response cache is empty, a batch of up to $this->parallelReqeusts URLs is
  * popped from the page container, filtered URLs are dropped, and the remaining ones are
  * requested in one call to the HTTP client. One response is then popped from the cache.
  * If that response is "text/html", the dependencies extracted from its body are pushed
  * back onto the page container, and the first URI seen referring to each dependency is
  * recorded in $this->comingFrom.
  *
  * @return mixed the response popped from the cache, or false when no URLs are left
  *
  * @throws \RuntimeException when the HTTP client reports errors other than
  *                           "An error occurred when fetching the URI ..." failures;
  *                           the original multi-request exception is attached as previous
  */
 public function next()
 {
     // Refill in a loop instead of recursing: a long run of fully filtered or fully
     // failed batches would otherwise add one stack frame per batch.
     while (empty($this->responseCache)) {
         $urls = $this->pageContainer->pop($this->parallelReqeusts);
         if (empty($urls)) {
             return false;
         }

         $requests = array();
         foreach ($urls as $url) {
             if (!$this->isFiltered($url)) {
                 $requests[] = RequestFactory::getRequest($url, 'GET', 'php://memory', [], []);
             }
         }
         if (empty($requests)) {
             // Every URL of this batch was filtered; try the next batch.
             continue;
         }

         try {
             $this->responseCache = $this->httpClient->sendRequests($requests);
         } catch (MultiHttpAdapterException $e) {
             $errorMessages = '';
             foreach ($e->getExceptions() as $exception) {
                 // @fixme this must be part of the http client
                 $message = $exception->getMessage();
                 if (strpos($message, 'An error occurred when fetching the URI') !== 0) {
                     $errorMessages .= $message . "\n";
                     continue;
                 }

                 // The failed URI is expected in double quotes right after the fixed
                 // prefix, i.e. starting at byte 41 of the message.
                 $uriOffset = 41;
                 if (strlen($message) < $uriOffset) {
                     // Too short to contain a URI; an out-of-range strpos() offset
                     // would raise a ValueError on PHP 8.
                     continue;
                 }
                 $closingQuote = strpos($message, '"', $uriOffset);
                 if ($closingQuote === false) {
                     // No closing quote, so no URI can be extracted; skipped as before.
                     continue;
                 }

                 $url = substr($message, $uriOffset, $closingQuote - $uriOffset);
                 if (strpos($url, '/') === 0) {
                     // Host-relative URI: re-queue it as an absolute URI built from
                     // the scheme and host of the start URI.
                     $this->pageContainer->push(new Uri($this->startUri->getScheme() . '://' . $this->startUri->getHost() . $url));
                 }
             }
             if ($errorMessages !== '') {
                 // Keep the original exception reachable through getPrevious().
                 throw new \RuntimeException($errorMessages, 0, $e);
             }
         }
     }

     $response = array_pop($this->responseCache);
     if ($response->hasHeader('Content-Type')) {
         // Drop parameters such as "; charset=utf-8". Media types are case-insensitive
         // and may carry surrounding whitespace, so normalise before comparing.
         $contentTypeElements = explode(';', $response->getHeader('Content-Type')[0]);
         $contentType = strtolower(trim($contentTypeElements[0]));
         if ($contentType === 'text/html') {
             $document = new Document((string) $response->getBody(), true);
             $elements = $document->getUnorderedDependencies($response->getUri());
             foreach ($elements as $element) {
                 $urlString = $this->createCleanUriString($element);
                 // Only the first referrer seen for a dependency is remembered.
                 if (!array_key_exists($urlString, $this->comingFrom)) {
                     $this->comingFrom[$urlString] = $response->getUri();
                 }
                 $this->pageContainer->push($element);
             }
         }
     }

     return $response;
 }