Пример #1
0
 /**
  * Main crawl loop: repeatedly pulls the next unprocessed link from the
  * database, downloads and processes the page, persists the result, then
  * sleeps long enough to honour the configured politeness delay
  * (crawler.t_wait) relative to the oldest recorded access time.
  *
  * Runs forever under normal operation (infinite loop); it does not return.
  */
 function crawlLoop()
 {
     Applog::log("Crawling: " . $this->getDomain($this->currentUrl) . " started");
     while (true) {
         // Fetch the next unprocessed link from the database.
         $link = $this->getNextLink();
         if ($link) {
             Applog::log('current URL: ' . $link->canonicalUrl);
             // Download the page contents.
             $pageContent = $this->getPage($link->canonicalUrl);
             // Remember the current URL so it can be stored in the database.
             $this->currentUrl = $link->canonicalUrl;
             $this->urlResource = StringUtil::parseUtf8Url($link->canonicalUrl);
             // Extract outgoing links and prepare the page for storage.
             $links = $this->processPage($pageContent);
             $this->setStorePageParams();
             // Save a record about the current page in the database.
             $this->currentPageId = CrawledPage::savePage2DB($this->currentUrl, $this->httpResponse(), $this->pageContent, $this->plainText, $this->rawPagePath, $this->parsedTextPath, $this->currentTimestamp);
             // If the page is not HTML (an image or some other file type),
             // or we received an HTTP error code, skip it.
             if (!$this->pageOk()) {
                 continue;
             }
             // Queue every outgoing link for crawling. Use a distinct loop
             // variable so we do not clobber $link, the outer loop's current
             // record (the original code shadowed it).
             foreach ($links as $outgoingLink) {
                 $this->processLink($outgoingLink);
             }
         }
         // Sleep until we're guaranteed to have something to crawl, but no less than 1 second.
         // NOTE(review): min() on an empty $this->accessTimes array raises a
         // ValueError in PHP 8 — presumably accessTimes is seeded before this
         // loop starts; confirm against the constructor/initializer.
         $sleepTime = 1 + max(0, min($this->accessTimes) + Config::get('crawler.t_wait') - time());
         Applog::log("Sleeping for {$sleepTime} seconds");
         sleep($sleepTime);
     }
 }