/**
 * Crawler entry point: seeds the Link table with every whitelist URL,
 * then hands control to the main crawl loop.
 *
 * Seed URLs are not discovered from any crawled page, so they are stored
 * with crawledPageId = 0.
 */
function start() {
    Applog::log("Crawler started");

    // Store the whole whitelist in the Link table so extraction can begin.
    foreach (Config::get('crawler.whiteList') as $seedUrl) {
        // Normalize: drop redundant slashes and the directory index file.
        $seedUrl = StringUtil::urlCleanup($seedUrl, $this->directoryIndexFile, $this->indexFileExt);
        $parts = StringUtil::parseUtf8Url($seedUrl);
        // crawledPageId = 0 marks a seed URL with no discovering page.
        Link::saveLink2DB($seedUrl, $parts['host'], 0);
    }

    $this->crawlLoop();
}
/**
 * Processing entry point: pulls files one at a time from getNextFile()
 * until the queue is exhausted, processing each and releasing memory
 * between iterations.
 */
function start() {
    Applog::log("INSIDE " . __FILE__ . ' - ' . __CLASS__ . '::' . __FUNCTION__ . '() - ' . 'line ' . __LINE__, 4);

    for (;;) {
        $file = $this->getNextFile();
        // Loose comparison kept on purpose: matches the original
        // `!= null` loop condition (also stops on false/''/0).
        if ($file == null) {
            break;
        }
        $this->processFile($file);
        MemoryManagement::clean();
    }

    Applog::log("Finished");
}
/**
 * Processes a single hyperlink extracted from the current page: resolves
 * relative links, canonicalizes the URL, and — if the link stays on the
 * current domain — records it in the Link table attached to the page it
 * was discovered on ($this->currentPageId).
 *
 * @param string $url raw link as found in the page (may be relative)
 */
function processLink($url) {
    Applog::log('Processing link: ' . $url);

    if ($this->isRelativeLink($url)) {
        $url = $this->makeAbsoluteLink($url);
    }

    // Strip extra slashes and the directory index file.
    // (Removed a dead `$canonicalUrl = null;` that was immediately overwritten.)
    $canonicalUrl = StringUtil::urlCleanup($url, $this->directoryIndexFile, $this->indexFileExt);

    // NOTE(review): eligibility is tested on the pre-canonicalized URL while
    // the canonical form is what gets saved — looks intentional, but confirm.
    if (!$this->eligibleUrl($url)) {
        return;
    }

    $rec = StringUtil::parseUtf8Url($canonicalUrl);
    // Hoisted: getDomain() was previously evaluated twice for the same URL.
    $domain = $this->getDomain($url);

    // Only record links that stay within the current domain.
    if ($rec['host'] == $domain) {
        Link::saveLink2DB($canonicalUrl, $domain, $this->currentPageId);
    }
}