Beispiel #1
0
 /**
  * Main crawl loop.
  *
  * Repeatedly asks getNextLink() for a link, downloads and processes the
  * page behind it, stores a record of the page through
  * CrawledPage::savePage2DB() and passes every link returned by
  * processPage() to processLink(). The loop has no exit condition of its own.
  */
 function crawlLoop()
 {
     $startDomain = $this->getDomain($this->currentUrl);
     Applog::log("Crawling: " . $startDomain . " started");
     for (;;) {
         // Fetch the next unprocessed link from the database.
         $nextLink = $this->getNextLink();
         if ($nextLink) {
             $url = $nextLink->canonicalUrl;
             Applog::log('current URL: ' . $url);
             // Download the page.
             $pageContent = $this->getPage($url);
             // Remember the current URL; it is used when storing the page in the database.
             $this->currentUrl = $url;
             $this->urlResource = StringUtil::parseUtf8Url($url);
             $foundLinks = $this->processPage($pageContent);
             $this->setStorePageParams();
             // Save an entry about the current page in the database.
             $this->currentPageId = CrawledPage::savePage2DB(
                 $this->currentUrl,
                 $this->httpResponse(),
                 $this->pageContent,
                 $this->plainText,
                 $this->rawPagePath,
                 $this->parsedTextPath,
                 $this->currentTimestamp
             );
             // Skip the page if it is not HTML (an image or some other file)
             // or if an HTTP error code was received. Note that this jumps
             // straight to the next iteration, bypassing the sleep below.
             if (!$this->pageOk()) {
                 continue;
             }
             foreach ($foundLinks as $foundLink) {
                 $this->processLink($foundLink);
             }
         }
         // Sleep until we're guaranteed to have something to crawl, but no less than 1 second.
         $earliestAccess = min($this->accessTimes) + Config::get('crawler.t_wait');
         $sleepTime = 1 + max(0, $earliestAccess - time());
         Applog::log("Sleeping for {$sleepTime} seconds");
         sleep($sleepTime);
     }
 }
 /**
  * Returns the lower-cased parsed text of the next usable crawled page.
  *
  * Pages are pulled one at a time from CrawledPage::getNextDiacriticsFile().
  * Every page pulled is reported through showProcessingFileStatus() and
  * recorded with FilesUsedInDiacritics::save2Db(), including pages that are
  * then skipped. A page is usable only when its parsed-text file exists,
  * its HTTP status is below 400 and the file can actually be read.
  *
  * @return mixed the result of toLower() on the file contents, or null when
  *               no further page is available
  */
 function getNextFile()
 {
     Applog::log("INSIDE " . __FILE__ . ' - ' . __CLASS__ . '::' . __FUNCTION__ . '() - ' . 'line ' . __LINE__, 4);
     while (1) {
         $crawledPage = CrawledPage::getNextDiacriticsFile();
         // Loose comparison kept on purpose: it also stops on other empty
         // "no row" results (e.g. false), exactly as the original did.
         if ($crawledPage == null) {
             return null;
         }
         $this->showProcessingFileStatus($crawledPage);
         FilesUsedInDiacritics::save2Db($crawledPage->id);
         // Both conditions must hold. With "||" a page whose file is missing
         // but whose status is < 400 reached file_get_contents() and failed.
         if (is_file($crawledPage->parsedTextPath) && $crawledPage->httpStatus < 400) {
             $contents = file_get_contents($crawledPage->parsedTextPath);
             // An unreadable file is skipped instead of passing false to toLower().
             if ($contents !== false) {
                 return $this->toLower($contents);
             }
         }
     }
 }
Beispiel #3
0
<?php

require_once '../../phplib/util.php';
//util_assertModerator(PRIV_ADMIN);
// Build two parallel lists, the labels ($options) and the matching values
// ($values), that are handed to the template below.
$rows = CrawledPage::getListOfDomains();
$options = array('all');
$values = array('all');

// end() returns false for an empty list; reading ->domain on that result is
// an error, so the "most recent domain" entry is only added when a domain exists.
$last = end($rows);
if ($last !== false) {
    $options[] = 'most recent domain';
    $values[] = $last->domain;
}

foreach ($rows as $obj) {
    $options[] = $obj->domain;
    $values[] = $obj->domain;
}

SmartyWrap::assign('page_title', 'Romanian Crawler Log');
SmartyWrap::assign('values', $values);
SmartyWrap::assign('options', $options);
SmartyWrap::assign('jqueryLibPath', '../js/jquery-1.8.3.min.js');
SmartyWrap::displayWithoutSkin('crawler/crawler.ihtml');