Esempio n. 1
0
 /**
  * Seeds the Link table with every URL of the configured white list and
  * then hands control to the crawl loop.
  *
  * White-list URLs are not discovered on any crawled page, so each of them
  * is stored with a crawledPageId of 0.
  */
 function start()
 {
     Applog::log("Crawler started");

     $seedUrls = Config::get('crawler.whiteList');
     foreach ($seedUrls as $seedUrl) {
         // Store the cleaned-up form of the seed together with its host.
         $cleanUrl = StringUtil::urlCleanup($seedUrl, $this->directoryIndexFile, $this->indexFileExt);
         $urlParts = StringUtil::parseUtf8Url($cleanUrl);
         Link::saveLink2DB($cleanUrl, $urlParts['host'], 0);
     }

     $this->crawlLoop();
 }
Esempio n. 2
0
 /**
  * Processes the files handed out by getNextFile() one after another.
  *
  * After each processed file MemoryManagement::clean() is invoked. The loop
  * ends as soon as getNextFile() yields a value that is loosely equal to
  * null; a final "Finished" entry is then logged.
  */
 function start()
 {
     Applog::log("INSIDE " . __FILE__ . ' - ' . __CLASS__ . '::' . __FUNCTION__ . '() - ' . 'line ' . __LINE__, 4);

     for (;;) {
         $nextFile = $this->getNextFile();
         // Loose comparison: null, false, '', 0 and [] all end the loop.
         if ($nextFile == null) {
             break;
         }

         $this->processFile($nextFile);
         MemoryManagement::clean();
     }

     Applog::log("Finished");
 }
Esempio n. 3
0
 /**
  * Resolves a link and stores it when it is eligible and its host matches
  * the domain reported by getDomain().
  *
  * A relative link is first turned into an absolute one. URLs rejected by
  * eligibleUrl() are dropped. For the remaining ones the cleaned-up
  * (canonical) form is saved, linked to the current page id.
  *
  * @param string $url URL to process; may be relative.
  */
 function processLink($url)
 {
     Applog::log('Processing link: ' . $url);

     if ($this->isRelativeLink($url)) {
         $url = $this->makeAbsoluteLink($url);
     }

     // Reject early so that ineligible URLs are not cleaned up for nothing.
     if (!$this->eligibleUrl($url)) {
         return;
     }

     // Strip redundant slashes and the directory index file.
     $canonicalUrl = StringUtil::urlCleanup($url, $this->directoryIndexFile, $this->indexFileExt);
     $rec = StringUtil::parseUtf8Url($canonicalUrl);

     // Look the domain up once; it serves both the comparison and the save.
     $domain = $this->getDomain($url);

     // NOTE(review): loose comparison kept because the return types of
     // parseUtf8Url() and getDomain() are not visible here - confirm before
     // tightening to ===.
     if ($rec['host'] == $domain) {
         Link::saveLink2DB($canonicalUrl, $domain, $this->currentPageId);
     }
 }