/**
 * Endless crawl loop.
 *
 * On every pass it asks getNextLink() for a link; when one is returned the
 * page is downloaded, processed and recorded through CrawledPage::savePage2DB(),
 * and, if pageOk() accepts it, every link returned by processPage() is handed
 * to processLink(). The pass then ends with a sleep of at least one second.
 *
 * Note that a page rejected by pageOk() jumps straight to the next pass and
 * therefore does not go through the sleep at the bottom of the loop.
 *
 * This method never returns.
 */
function crawlLoop() {
    Applog::log("Crawling: " . $this->getDomain($this->currentUrl) . " started");

    for (;;) {
        // Fetch the next unprocessed link from the database.
        $next = $this->getNextLink();

        if ($next) {
            Applog::log('current URL: ' . $next->canonicalUrl);

            // Download the page.
            $downloaded = $this->getPage($next->canonicalUrl);

            // Remember the current URL; it is what gets stored in the database.
            $this->currentUrl = $next->canonicalUrl;
            $this->urlResource = StringUtil::parseUtf8Url($next->canonicalUrl);

            $foundLinks = $this->processPage($downloaded);
            $this->setStorePageParams();

            // Save an entry describing the current page in the database.
            $this->currentPageId = CrawledPage::savePage2DB(
                $this->currentUrl,
                $this->httpResponse(),
                $this->pageContent,
                $this->plainText,
                $this->rawPagePath,
                $this->parsedTextPath,
                $this->currentTimestamp
            );

            // If the page is not HTML (an image or some other file), or an
            // HTTP error code was received, skip this page.
            if (!$this->pageOk()) {
                continue;
            }

            foreach ($foundLinks as $found) {
                $this->processLink($found);
            }
        }

        // Sleep until we're guaranteed to have something to crawl,
        // but no less than 1 second.
        $earliestAccess = min($this->accessTimes);
        $waitInterval = Config::get('crawler.t_wait');
        $remaining = $earliestAccess + $waitInterval - time();
        $sleepTime = 1 + max(0, $remaining);

        Applog::log("Sleeping for {$sleepTime} seconds");
        sleep($sleepTime);
    }
}
/**
 * Returns the lower-cased contents of the next usable parsed-text file.
 *
 * Pages are taken one at a time from CrawledPage::getNextDiacriticsFile().
 * Every page taken is reported through showProcessingFileStatus() and
 * registered with FilesUsedInDiacritics::save2Db(), whether or not it ends up
 * being used. A page is used only when its HTTP status is below 400 AND its
 * parsed text file exists and can be read; otherwise the next page is tried.
 *
 * @return mixed The result of toLower() applied to the file contents, or null
 *               when getNextDiacriticsFile() has no more pages to offer.
 */
function getNextFile() {
    Applog::log("INSIDE " . __FILE__ . ' - ' . __CLASS__ . '::' . __FUNCTION__ . '() - ' . 'line ' . __LINE__, 4);

    while (true) {
        $crawledPage = CrawledPage::getNextDiacriticsFile();

        // Loose comparison kept on purpose: it also treats a `false`
        // "no row" result as the end of the list.
        if ($crawledPage == null) {
            return null;
        }

        $this->showProcessingFileStatus($crawledPage);
        FilesUsedInDiacritics::save2Db($crawledPage->id);

        // Both conditions must hold. The previous `||` let a page with a good
        // status but no file on disk reach file_get_contents(), and let error
        // pages through whenever their file happened to exist.
        if ($crawledPage->httpStatus >= 400 || !is_file($crawledPage->parsedTextPath)) {
            continue;
        }

        $text = file_get_contents($crawledPage->parsedTextPath);
        if ($text === false) {
            // The file exists but could not be read; move on to the next page
            // instead of passing `false` to toLower().
            continue;
        }

        return $this->toLower($text);
    }
}
<?php
/**
 * Crawler log page: builds the domain drop-down (parallel $options / $values
 * lists) from CrawledPage::getListOfDomains() and renders
 * crawler/crawler.ihtml without the site skin.
 */
require_once '../../phplib/util.php';

// NOTE(review): the privilege check below is commented out, so this page is
// rendered without it. Confirm whether that is intentional before deploying.
//util_assertModerator(PRIV_ADMIN);

$rows = CrawledPage::getListOfDomains();

$options = array('all');
$values = array('all');

// The "most recent domain" shortcut points at the last row of the list.
// end() returns false for an empty array, so the entry is only offered when
// at least one domain exists; otherwise ->domain would be read on a non-object.
if (!empty($rows)) {
    $last = end($rows);
    $options[] = 'most recent domain';
    $values[] = $last->domain;

    foreach ($rows as $obj) {
        $options[] = $obj->domain;
        $values[] = $obj->domain;
    }
}

SmartyWrap::assign('page_title', 'Romanian Crawler Log');
SmartyWrap::assign('values', $values);
SmartyWrap::assign('options', $options);
SmartyWrap::assign('jqueryLibPath', '../js/jquery-1.8.3.min.js');
SmartyWrap::displayWithoutSkin('crawler/crawler.ihtml');