/**
 * Main crawl loop: repeatedly pulls the next unprocessed link from the
 * database, downloads it, stores the result, queues the outgoing links,
 * then rate-limits before the next fetch. Never returns.
 */
function crawlLoop() {
    Applog::log("Crawling: " . $this->getDomain($this->currentUrl) . " started");

    while (true) {
        // Fetch the next unprocessed link from the database.
        $link = $this->getNextLink();

        if ($link) {
            Applog::log('current URL: ' . $link->canonicalUrl);

            // Download the page body.
            $pageContent = $this->getPage($link->canonicalUrl);

            // Remember the current URL for the database store below.
            $this->currentUrl = $link->canonicalUrl;
            $this->urlResource = StringUtil::parseUtf8Url($link->canonicalUrl);

            $extractedLinks = $this->processPage($pageContent);
            $this->setStorePageParams();

            // Persist a record of the page we just crawled.
            $this->currentPageId = CrawledPage::savePage2DB(
                $this->currentUrl,
                $this->httpResponse(),
                $this->pageContent,
                $this->plainText,
                $this->rawPagePath,
                $this->parsedTextPath,
                $this->currentTimestamp
            );

            // Skip pages that are not HTML (images or other files) or that
            // came back with an HTTP error code.
            // NOTE(review): this `continue` also bypasses the sleep below,
            // so a failed page is followed immediately by the next fetch —
            // confirm the missing rate limit here is intentional.
            if (!$this->pageOk()) {
                continue;
            }

            foreach ($extractedLinks as $extractedLink) {
                $this->processLink($extractedLink);
            }
        }

        // Sleep until we're guaranteed to have something to crawl, but no less than 1 second.
        // NOTE(review): min() assumes $this->accessTimes is non-empty —
        // verify it is populated before the first iteration reaches here.
        $sleepTime = 1 + max(0, min($this->accessTimes) + Config::get('crawler.t_wait') - time());
        Applog::log("Sleeping for {$sleepTime} seconds");
        sleep($sleepTime);
    }
}