/**
 * Run a complete extraction: collect URLs from the crawler, de-duplicate
 * them, crawl each one, add the results to the catalog and save it.
 *
 * The context is saved every 10 collected URLs and once more when the
 * collection is over. A failure to save the catalog is logged but does
 * not interrupt the method; the elapsed time is logged in every case
 * except the early exit below.
 *
 * @return false|null False when the crawler's beforeStart() fails;
 *                    nothing is returned otherwise.
 */
public function extract()
{
    $timeBefore = Date::Now();

    if (false === $this->crawler->beforeStart()) {
        $this->logError("Error in crawler's initialization for profile " . $this->profileName);
        return false;
    }

    // Make sure the URL list exists before it is counted and de-duplicated:
    // when the crawler yields no URL at all, the key would otherwise be
    // undefined and count()/array_unique() would receive null.
    if (!isset($this->context['urls']) || !is_array($this->context['urls'])) {
        $this->context['urls'] = array();
    }

    $this->logInfo('Getting URLs from the crawler...');
    while (false !== ($url = $this->crawler->getNextUrlToCrawl())) {
        $this->context['urls'][] = $url;
        $this->logInfo(" {$url}");

        // Periodic checkpoint of the context, every 10 collected URLs.
        if (count($this->context['urls']) % 10 === 0) {
            $this->_saveContext();
        }
    }
    $this->_saveContext();

    $allUrlsCounter = count($this->context['urls']);
    // array_unique() keeps the original keys; the list is only iterated
    // with foreach below, so the gaps in the keys do not matter here.
    $this->context['urls'] = array_unique($this->context['urls']);
    $urlsCounter = count($this->context['urls']);
    $this->logInfo("{$allUrlsCounter} URLs found and {$urlsCounter} after clearing doublons");

    $cmpt = 0;
    foreach ($this->context['urls'] as $url) {
        $cmpt++;
        $this->logInfo("Crawling informations from {$url} ({$cmpt}/{$urlsCounter})");

        // A false result means nothing could be extracted for this URL: skip it.
        if (false === ($informations = $this->crawler->getInformationsFromUrl($url))) {
            continue;
        }

        // The catalog returns the entry it stored; that is what gets logged.
        $informations = $this->catalog->add($informations);
        $this->logFormattedArrInfo($informations, " ", 100);
    }

    $this->logInfo('Saving catalog...');
    if (false === $this->catalog->save()) {
        $this->logError('Cannot save catalog to ' . $this->conf['catalog']['save-to']);
    }

    $timeAfter = Date::Now();
    $elapsedTime = $timeAfter - $timeBefore;
    $this->logInfo('Operation took ' . Date::TimestampToString($elapsedTime));
}
/**
 * Write the content to each file descriptor of a given channel.
 *
 * Arrays, and objects that cannot be cast to a string, are rendered with
 * print_r(); any other value is cast to a string. A trailing newline is
 * appended when missing and the line is prefixed with the current short
 * date/time between square brackets.
 *
 * If the channel has no registered descriptors, nothing is written.
 *
 * @param string $channel Key of the target entry in $this->channels
 * @param mixed  $content Message to write
 *
 * @return bool Always true
 */
protected function _log($channel, $content)
{
    $needsDump = is_array($content)
        || (is_object($content) && !method_exists($content, '__toString'));

    // Non-stringable values would make substr() below fail, so dump them.
    $content = $needsDump ? print_r($content, true) : (string) $content;

    if (substr($content, -1) !== "\n") {
        $content .= "\n";
    }

    $content = '[' . Date::TimestampToShortDateTime(Date::Now()) . '] ' . $content;

    // An unknown channel is silently ignored instead of iterating over an
    // undefined index.
    if (!isset($this->channels[$channel]) || !is_array($this->channels[$channel])) {
        return true;
    }

    foreach ($this->channels[$channel] as $fd) {
        fwrite($fd, $content);
    }

    return true;
}