/**
 * Crawl the rewrites of one store page by page, requesting them in parallel groups.
 *
 * Pages of rewrite rows are pulled from the resource model until it returns an
 * empty result. After every page the running offset grows by self::BATCH_SIZE
 * and is written to the core session through setCrawlerOffset().
 * NOTE(review): presumably the resource reads that session value to pick the
 * next page - this is not visible from here, confirm before reordering anything.
 *
 * @param array $info store data: 'store_id' and 'base_url' are read, 'cookie' is optional
 * @param Varien_Http_Adapter_Curl $adapter
 */
protected function _executeRequests(array $info, Varien_Http_Adapter_Curl $adapter)
{
    $storeId = $info['store_id'];

    // Peer certificate verification is switched off for these requests.
    $curlOptions = array(
        CURLOPT_USERAGENT      => self::USER_AGENT,
        CURLOPT_SSL_VERIFYPEER => 0,
    );

    // Size of one parallel group; an empty setting means one request at a time.
    $configuredThreads = $this->_getCrawlerThreads($storeId);
    $groupSize = $configuredThreads ? $configuredThreads : 1;

    if (!empty($info['cookie'])) {
        $curlOptions[CURLOPT_COOKIE] = $info['cookie'];
    }

    $pending = array();
    $offset = 0;
    Mage::getSingleton('core/session')->setCrawlerOffset($offset);

    while (true) {
        $rewrites = $this->_getResource()->getRequestPaths($storeId);
        if (!$rewrites) {
            break;
        }

        foreach ($rewrites as $rewriteRow) {
            $pending[] = $this->_getUrlByRewriteRow($rewriteRow, $info['base_url'], $storeId);

            // Fire a group as soon as it is full; groups may span page boundaries.
            if (count($pending) == $groupSize) {
                $adapter->multiRequest($pending, $curlOptions);
                $pending = array();
            }
        }

        $offset += self::BATCH_SIZE;
        Mage::getSingleton('core/session')->setCrawlerOffset($offset);
    }

    // Send whatever did not fill a complete group.
    if (count($pending) > 0) {
        $adapter->multiRequest($pending, $curlOptions);
    }
}
/**
 * Request the given URLs in parallel and sync each crawler URL record with its response.
 *
 * Response headers are included in the returned content (CURLOPT_HEADER). For
 * every response the 'fpc/crawler_url' model is loaded by the response key and
 * passed to _removeDublicates(). When the response carries an "Fpc-Cache-Id"
 * header the id is stored on the model (saved only if it changed); otherwise
 * the model is deleted.
 *
 * @param array $urls    URLs to request; keys are used as crawler URL ids
 * @param bool  $verbose when true, prints a "CACHED <url>" / "REMOVED <url>" line per URL
 * @return $this
 */
public function requestUrls($urls, $verbose = true)
{
    $adapter = new Varien_Http_Adapter_Curl();
    $options = array(
        CURLOPT_USERAGENT => self::USER_AGENT,
        CURLOPT_HEADER    => true,
    );

    // HTTP header names are case-insensitive (and arrive lowercased over HTTP/2),
    // so only the header name is matched case-insensitively; the id itself keeps
    // its strict lowercase format. The prefix is quoted so it is always taken
    // literally, whatever characters the constant holds.
    $cacheIdPattern = '/(?i:Fpc-Cache-Id): ('
        . preg_quote(Mirasvit_Fpc_Model_Processor::REQUEST_ID_PREFIX, '/')
        . '[a-z0-9]{32})/';

    $multiResult = $adapter->multiRequest($urls, $options);

    foreach ($multiResult as $urlId => $content) {
        $urlModel = Mage::getModel('fpc/crawler_url')->load($urlId);
        $this->_removeDublicates($urlModel);

        $matches = array();
        preg_match($cacheIdPattern, $content, $matches);

        // Full match plus the single capture group => exactly two entries.
        if (count($matches) == 2) {
            $cacheId = $matches[1];
            if ($urlModel->getCacheId() != $cacheId) {
                $urlModel->setCacheId($cacheId)->save();
            }
            if ($verbose) {
                echo 'CACHED ' . $urls[$urlId] . PHP_EOL;
            }
        } else {
            if ($verbose) {
                echo 'REMOVED ' . $urls[$urlId] . PHP_EOL;
            }
            // No cache id in the response: drop the record.
            $urlModel->delete();
        }
    }

    return $this;
}
/**
 * Crawl search-autocomplete URLs for the stored search queries of every store.
 *
 * NOTE: this crawler is currently switched off - the method returns right away
 * and nothing below the first return statement runs. The early return is kept
 * on purpose: removing it would start issuing requests for every caller.
 *
 * The retained body, should it be re-enabled, walks the 'catalogsearch/query'
 * collection of each store ordered by popularity (descending) and requests
 * 'searchautocomplete/ajax/get/' once for every prefix of each query text.
 * It stops after one hour of wall-clock time.
 *
 * @return $this
 */
public function crawl()
{
    // Kill switch - see the note in the docblock.
    return $this;

    Mage::register('custom_entry_point', true, true);

    $timeStart = time();
    $timeLimit = 60 * 60; // seconds
    $threads = 1;         // requests sent per multiRequest() call

    $storesInfo = $this->getStoresInfo();
    $adapter = new Varien_Http_Adapter_Curl();

    foreach ($storesInfo as $info) {
        $options = array(CURLOPT_USERAGENT => self::USER_AGENT);
        $storeId = $info['store_id'];

        if (!empty($info['cookie'])) {
            $options[CURLOPT_COOKIE] = $info['cookie'];
        }

        $urls = array();
        $queries = Mage::getModel('catalogsearch/query')
            ->getCollection()
            ->addFieldToFilter('store_id', $storeId)
            ->setOrder('popularity', 'desc');

        foreach ($queries as $query) {
            $queryText = (string) $query->getQueryText();

            // Split into whole UTF-8 characters so a prefix never ends in the
            // middle of a multibyte sequence.
            $chars = preg_split('//u', $queryText, -1, PREG_SPLIT_NO_EMPTY);
            if ($chars === false) {
                // Not valid UTF-8: fall back to byte-wise prefixes.
                $chars = str_split($queryText);
            }

            $part = '';
            foreach ($chars as $char) {
                $part .= $char;

                // The search text is arbitrary user input: it must be encoded,
                // otherwise '&', '#', '+' or spaces corrupt the query string.
                $urls[] = $info['base_url']
                    . 'searchautocomplete/ajax/get/?q=' . urlencode($part) . '&cat=0';

                if (count($urls) == $threads) {
                    $adapter->multiRequest($urls, $options);
                    $urls = array();
                }
            }

            if (time() - $timeStart > $timeLimit) {
                return $this;
            }
        }

        if (!empty($urls)) {
            $adapter->multiRequest($urls, $options);
        }
    }

    return $this;
}
/**
 * Request every crawlable URL of every store that has the crawler enabled.
 *
 * Request paths are read row by row from the statement returned by the
 * resource model, prefixed with the store base URL and sent in parallel
 * groups whose size is the store's configured thread count (1 when unset).
 *
 * @return Enterprise_PageCache_Model_Crawler
 */
public function crawl()
{
    $storesInfo = $this->getStoresInfo();
    $adapter = new Varien_Http_Adapter_Curl();

    foreach ($storesInfo as $storeInfo) {
        $storeId = $storeInfo['store_id'];

        // Stores with the crawler switched off are skipped entirely.
        if (!Mage::app()->getStore($storeId)->getConfig(self::XML_PATH_CRAWLER_ENABLED)) {
            continue;
        }

        $configuredThreads = (int) Mage::app()->getStore($storeId)->getConfig(self::XML_PATH_CRAWLER_THREADS);
        $groupSize = $configuredThreads ? $configuredThreads : 1;

        $stmt = $this->_getResource()->getUrlStmt($storeId);
        $baseUrl = $storeInfo['base_url'];

        $curlOptions = array(CURLOPT_USERAGENT => self::USER_AGENT);
        if (!empty($storeInfo['cookie'])) {
            $curlOptions[CURLOPT_COOKIE] = $storeInfo['cookie'];
        }

        $pending = array();
        while ($row = $stmt->fetch()) {
            $pending[] = $baseUrl . $row['request_path'];

            // A full group is sent immediately and a new one is started.
            if (count($pending) == $groupSize) {
                $adapter->multiRequest($pending, $curlOptions);
                $pending = array();
            }
        }

        // Remainder that did not fill a complete group.
        if (count($pending) > 0) {
            $adapter->multiRequest($pending, $curlOptions);
        }
    }

    return $this;
}
/**
 * Request the not-yet-visited request paths of one store in parallel groups.
 *
 * Each URL is the store base URL followed by a request path supplied by the
 * resource model. URLs whose md5 hash is already recorded in
 * $this->_visitedUrls are skipped; every URL that gets queued is recorded there.
 *
 * @param array $info store data: 'store_id' and 'base_url' are read, 'cookie' is optional
 * @param Varien_Http_Adapter_Curl $adapter
 */
protected function _executeRequests(array $info, Varien_Http_Adapter_Curl $adapter)
{
    $storeId = $info['store_id'];

    $curlOptions = array(CURLOPT_USERAGENT => self::USER_AGENT);

    // Size of one parallel group; an empty setting means one request at a time.
    $configuredThreads = $this->_getCrawlerThreads($storeId);
    $groupSize = $configuredThreads ? $configuredThreads : 1;

    if (!empty($info['cookie'])) {
        $curlOptions[CURLOPT_COOKIE] = $info['cookie'];
    }

    $pending = array();
    foreach ($this->_getResource()->getRequestPaths($storeId) as $requestPath) {
        $candidate = $info['base_url'] . $requestPath;
        $hash = md5($candidate);

        if (!isset($this->_visitedUrls[$hash])) {
            $pending[] = $candidate;
            $this->_visitedUrls[$hash] = true;

            // A full group is sent immediately and a new one is started.
            if (count($pending) == $groupSize) {
                $adapter->multiRequest($pending, $curlOptions);
                $pending = array();
            }
        }
    }

    // Remainder that did not fill a complete group.
    if (count($pending) > 0) {
        $adapter->multiRequest($pending, $curlOptions);
    }
}
/**
 * Request every URL path of every crawler-enabled store, once per store.
 *
 * Does nothing unless the 'full_page' cache type is in use. For each store the
 * visited-URL registry ($this->_visitedUrls) is reset, then the URL paths from
 * the resource model are prefixed with the store base URL, de-duplicated by
 * md5 hash and sent in parallel groups sized by the store's configured thread
 * count (1 when unset).
 *
 * @return Enterprise_PageCache_Model_Crawler
 */
public function crawl()
{
    if (!Mage::app()->useCache('full_page')) {
        return $this;
    }

    $storesInfo = $this->getStoresInfo();
    $adapter = new Varien_Http_Adapter_Curl();

    foreach ($storesInfo as $storeInfo) {
        $storeId = $storeInfo['store_id'];

        // The registry is cleared for every store, including skipped ones.
        $this->_visitedUrls = array();

        if (!Mage::app()->getStore($storeId)->getConfig(self::XML_PATH_CRAWLER_ENABLED)) {
            continue;
        }

        $configuredThreads = (int) Mage::app()->getStore($storeId)->getConfig(self::XML_PATH_CRAWLER_THREADS);
        $groupSize = $configuredThreads ? $configuredThreads : 1;

        $curlOptions = array(CURLOPT_USERAGENT => self::USER_AGENT);
        if (!empty($storeInfo['cookie'])) {
            $curlOptions[CURLOPT_COOKIE] = $storeInfo['cookie'];
        }

        $baseUrl = $storeInfo['base_url'];
        $pending = array();

        foreach ($this->_getResource()->getUrlsPaths($storeId) as $urlPath) {
            $candidate = $baseUrl . $urlPath;
            $hash = md5($candidate);

            if (!isset($this->_visitedUrls[$hash])) {
                $pending[] = $candidate;
                $this->_visitedUrls[$hash] = true;

                // A full group is sent immediately and a new one is started.
                if (count($pending) == $groupSize) {
                    $adapter->multiRequest($pending, $curlOptions);
                    $pending = array();
                }
            }
        }

        // Remainder that did not fill a complete group.
        if (count($pending) > 0) {
            $adapter->multiRequest($pending, $curlOptions);
        }
    }

    return $this;
}