Example #1
0
 /**
  * Fetch one URL, store it, queue the links it contains and crawl them in turn.
  *
  * A response whose contentType is "application/pdf" is cleaned through
  * PDFRobot, entity-encoded, recorded in crawledURLs and saved; it is not
  * scanned for links. Any other response is first checked with shouldCrawl(),
  * then scanned for anchor links. Each link accepted by shouldCrawl() is
  * appended to both foundURLs and processURLs. The document is then saved and
  * the shared processURLs queue is drained, recursing into every entry whose
  * URL is not yet in crawledURLs.
  *
  * @param mixed $url    Address to fetch; also the base used to expand links found in the page.
  * @param mixed $level  Depth stored on the fetched document; discovered links get $level + 1.
  * @param mixed $parent Referring URL. Not read by this method.
  *
  * @return mixed The result of Document::save() for PDFs, false when the URL
  *               is rejected by shouldCrawl(), null in every other case.
  */
 public function crawl($url, $level, $parent)
 {
     // Stop once the page budget is used up. The comparison is >= so that at
     // most pageLimit pages are fetched; with > one extra page slipped through.
     if (count($this->crawledURLs) >= $this->pageLimit) {
         return;
     }
     $this->collection->log("crawl - [level:{$level}] [page:" . count($this->crawledURLs) . "]  - {$url} ");
     $document = $this->httpClient->getDocument($url);
     $document->level = $level;

     // NOTE(review): the PDF test runs before shouldCrawl(), so a PDF is saved
     // even when shouldCrawl($url) would have rejected it - confirm intended.
     if ($document->contentType == "application/pdf") {
         $p = new PDFRobot($this->collection->accountId);
         $document->content = $p->clean($document);
         $document->content = htmlentities($document->content, ENT_QUOTES);
         array_push($this->crawledURLs, $url);
         return $document->save($this->collection->id);
     }

     if (!$this->shouldCrawl($url)) {
         $this->collection->log("SKIP {$url}");
         // NOTE(review): this stores the raw $url, whereas the loop below
         // stores Document objects in the same list - verify against consumers.
         array_push($this->foundURLs, $url);
         //skip document
         return false;
     }

     // Collect the double-quoted src/href value of every anchor-like tag.
     preg_match_all("/\\<a.*?(?:src|href)=\"([^\"]*?)\"/i", $document->content, $matches);
     foreach ($matches[1] as $item) {
         $fullUrl = URL::expandUrl($item, $url);
         if ($this->shouldCrawl($fullUrl)) {
             $link = new Document();
             $link->url = $fullUrl;
             $link->level = $level + 1;
             array_push($this->foundURLs, $link);
             array_push($this->processURLs, $link);
         }
     }

     // Links are extracted from the raw content above; only then is the
     // content entity-encoded for storage.
     $document->content = htmlentities($document->content, ENT_QUOTES);
     $document->save($this->collection->id);
     array_push($this->crawledURLs, $url);

     // Same budget rule as at the top of the method: once pageLimit pages
     // have been recorded, do not descend into the queued links.
     if (count($this->crawledURLs) >= $this->pageLimit) {
         $this->collection->log("hit page limit!");
         $this->collection->log("#crawledURLs:" . count($this->crawledURLs));
         return;
     }

     // processURLs is shared across recursive calls, so nested calls consume
     // entries from the same queue. Loose comparisons are kept on purpose: the
     // element types of crawledURLs are not visible from this method.
     while ($child = array_shift($this->processURLs)) {
         if ($child->url != "" && !in_array($child->url, $this->crawledURLs)) {
             $this->crawl($child->url, $child->level, $url);
         }
     }
 }
Example #2
0
 /**
  * Download a URL, persist it and follow the links discovered in it.
  *
  * Documents whose contentType is "application/pdf" are passed through
  * PDFRobot, entity-encoded, recorded in $this->crawled and saved straight
  * away. For anything else the document's own shouldCrawl() decides whether
  * processing continues. Anchor links accepted by $this->shouldCrawl() are
  * appended to $this->found and $this->process, and the shared process queue
  * is then consumed recursively.
  *
  * @param mixed $url    Address to fetch; also the base used to expand links found in the page.
  * @param mixed $level  Depth stored on the fetched document; discovered links get $level + 1.
  * @param mixed $parent Referring URL. Not read by this method.
  *
  * @return mixed The result of Document::save() for PDFs, false when the
  *               document declines to be crawled, null in every other case.
  */
 public function crawl($url, $level, $parent)
 {
     print "crawl [{$level}] - {$url} \r\n";

     $document = $this->httpClient->getDocument($url);
     $document->level = $level;

     // PDFs: clean, encode, record and save - no link extraction.
     if ($document->contentType == "application/pdf") {
         $pdfRobot = new PDFRobot($this->accountId);
         $document->content = $pdfRobot->clean($document);
         $document->content = htmlentities($document->content, ENT_QUOTES);
         array_push($this->crawled, $url);

         return $document->save($this->accountId);
     }

     // The document itself may veto crawling; the URL is still noted as found.
     if (!$document->shouldCrawl()) {
         array_push($this->found, $url);

         return false;
     }

     // Capture group 1 holds the double-quoted src/href value of each match.
     preg_match_all("/\\<a.*?(?:src|href)=\"([^\"]*?)\"/i", $document->content, $anchors);
     foreach ($anchors[1] as $href) {
         $absolute = URL::expandUrl($href, $url);
         if (!$this->shouldCrawl($absolute)) {
             continue;
         }

         $candidate = new Document();
         $candidate->url = $absolute;
         $candidate->level = $level + 1;
         array_push($this->found, $candidate);
         array_push($this->process, $candidate);
     }

     // Encode only after the links were pulled from the raw content.
     $document->content = htmlentities($document->content, ENT_QUOTES);
     $document->save($this->accountId);
     array_push($this->crawled, $url);

     // The queue is shared with the recursive calls made below.
     while ($next = array_shift($this->process)) {
         if ($next->url != "" && !in_array($next->url, $this->crawled)) {
             $this->crawl($next->url, $next->level, $url);
         }
     }
 }