/**
 * Fetches $url, stores the resulting document, and recursively crawls the
 * links discovered in it until the page limit is reached.
 *
 * PDF responses are cleaned through PDFRobot, entity-encoded and saved
 * without any link extraction. Every other response is scanned for
 * <a ... href|src="..."> targets; each target accepted by shouldCrawl() is
 * queued on $this->processURLs, and the queue is then drained through
 * recursive calls.
 *
 * At most $this->pageLimit pages are recorded in $this->crawledURLs: the
 * limit checks use ">=" so that a call made when the limit has already been
 * reached does not fetch one extra page.
 *
 * @param mixed $url    Address of the page to fetch.
 * @param mixed $level  Depth stored on the document; discovered links get $level + 1.
 * @param mixed $parent Address of the referring page (not read by this method).
 *
 * @return mixed Result of $document->save() for PDFs, false when the URL is
 *               skipped by shouldCrawl(), null in every other case.
 */
public function crawl($url, $level, $parent)
{
    // Stop before doing any network work once the limit has been reached.
    if (count($this->crawledURLs) >= $this->pageLimit) {
        return;
    }

    $this->collection->log("crawl - [level:{$level}] [page:" . count($this->crawledURLs) . "] - {$url} ");

    $document = $this->httpClient->getDocument($url);
    $document->level = $level;

    if ($document->contentType == "application/pdf") {
        $p = new PDFRobot($this->collection->accountId);
        $document->content = $p->clean($document);
        $document->content = htmlentities($document->content, ENT_QUOTES);
        array_push($this->crawledURLs, $url);

        return $document->save($this->collection->id);
    }

    if (!$this->shouldCrawl($url)) {
        $this->collection->log("SKIP {$url}");
        array_push($this->foundURLs, $url);

        // skip document
        return false;
    }

    // Collect link targets from the raw markup before it is entity-encoded.
    preg_match_all("/\\<a.*?(?:src|href)=\"([^\"]*?)\"/i", $document->content, $matches);
    foreach ($matches[1] as $item) {
        $fullUrl = URL::expandUrl($item, $url);
        if ($this->shouldCrawl($fullUrl)) {
            $link = new Document();
            $link->url = $fullUrl;
            $link->level = $level + 1;
            array_push($this->foundURLs, $link);
            array_push($this->processURLs, $link);
        }
    }

    $document->content = htmlentities($document->content, ENT_QUOTES);
    $document->save($this->collection->id);
    array_push($this->crawledURLs, $url);

    // The page just saved may have been the last one allowed.
    if (count($this->crawledURLs) >= $this->pageLimit) {
        $this->collection->log("hit page limit!");
        $this->collection->log("#crawledURLs:" . count($this->crawledURLs));

        return;
    }

    // Drain the shared queue; nested calls consume from the same queue.
    while ($child = array_shift($this->processURLs)) {
        if ($child->url != "") {
            if (!in_array($child->url, $this->crawledURLs)) {
                $this->crawl($child->url, $child->level, $url);
            }
        }
    }
}
/**
 * Downloads the page at $url, persists it, and follows its outgoing links.
 *
 * A PDF response is cleaned with PDFRobot, entity-encoded, recorded in
 * $this->crawled and saved; no links are extracted from it. For any other
 * response the document itself decides (via shouldCrawl()) whether it is
 * processed. Accepted documents have their <a ... href|src="..."> targets
 * expanded against $url, and targets approved by $this->shouldCrawl() are
 * appended to both $this->found and $this->process. The document is then
 * entity-encoded, saved and recorded, after which the pending queue is
 * consumed recursively.
 *
 * @param mixed $url    Address of the page to download.
 * @param mixed $level  Depth stored on the document; extracted links get $level + 1.
 * @param mixed $parent Address of the referring page (not read by this method).
 *
 * @return mixed Result of save() for PDFs, false when the document declines
 *               to be crawled, null in every other case.
 */
public function crawl($url, $level, $parent)
{
    print "crawl [{$level}] - {$url} \r\n";

    $page = $this->httpClient->getDocument($url);
    $page->level = $level;

    // PDFs are stored as-is (after cleaning) and never scanned for links.
    if ($page->contentType == "application/pdf") {
        $pdfRobot = new PDFRobot($this->accountId);
        $page->content = $pdfRobot->clean($page);
        $page->content = htmlentities($page->content, ENT_QUOTES);
        array_push($this->crawled, $url);

        return $page->save($this->accountId);
    }

    // skip document
    if (!$page->shouldCrawl()) {
        array_push($this->found, $url);

        return false;
    }

    // Link extraction runs on the raw markup, before entity encoding.
    preg_match_all("/\\<a.*?(?:src|href)=\"([^\"]*?)\"/i", $page->content, $hits);
    foreach ($hits[1] as $href) {
        $absolute = URL::expandUrl($href, $url);
        if (!$this->shouldCrawl($absolute)) {
            continue;
        }

        $pending = new Document();
        $pending->url = $absolute;
        $pending->level = $level + 1;
        array_push($this->found, $pending);
        array_push($this->process, $pending);
    }

    $page->content = htmlentities($page->content, ENT_QUOTES);
    $page->save($this->accountId);
    array_push($this->crawled, $url);

    // Work through the shared queue; nested calls pull from the same queue.
    while ($next = array_shift($this->process)) {
        if ($next->url == "" || in_array($next->url, $this->crawled)) {
            continue;
        }

        $this->crawl($next->url, $next->level, $url);
    }
}