function getContent($url, $enableProxy = true, $logCrawl = true) { curl_setopt($this->_CURL_RESOURCE, CURLOPT_URL, $url); curl_setopt($this->_CURL_RESOURCE, CURLOPT_FAILONERROR, $this->_CURLOPT_FAILONERROR); @curl_setopt($this->_CURL_RESOURCE, CURLOPT_FOLLOWLOCATION, $this->_CURLOPT_FOLLOWLOCATION); curl_setopt($this->_CURL_RESOURCE, CURLOPT_RETURNTRANSFER, $this->_CURLOPT_RETURNTRANSFER); curl_setopt($this->_CURL_RESOURCE, CURLOPT_TIMEOUT, $this->_CURLOPT_TIMEOUT); curl_setopt($this->_CURL_RESOURCE, CURLOPT_COOKIEJAR, $this->_CURLOPT_COOKIEJAR); curl_setopt($this->_CURL_RESOURCE, CURLOPT_COOKIEFILE, $this->_CURLOPT_COOKIEFILE); curl_setopt($this->_CURL_RESOURCE, CURLOPT_HEADER, $this->_CURLOPT_HEADER); if (!empty($this->_CURLOPT_COOKIE)) { curl_setopt($this->_CURL_RESOURCE, CURLOPT_COOKIE, $this->_CURLOPT_COOKIE); } if (!empty($this->_CURLOPT_REFERER)) { curl_setopt($this->_CURL_RESOURCE, CURLOPT_REFERER, $this->_CURLOPT_REFERER); } if (strlen($this->_CURLOPT_POSTFIELDS) > 1) { curl_setopt($this->_CURL_RESOURCE, CURLOPT_POST, $this->_CURLOPT_POST); curl_setopt($this->_CURL_RESOURCE, CURLOPT_POSTFIELDS, $this->_CURLOPT_POSTFIELDS); } // user agent assignment $this->_CURLOPT_USERAGENT = defined('SP_USER_AGENT') ? SP_USER_AGENT : $this->_CURLOPT_USERAGENT; if (strlen($this->_CURLOPT_USERAGENT) > 0) { curl_setopt($this->_CURL_RESOURCE, CURLOPT_USERAGENT, $this->_CURLOPT_USERAGENT); } if (strlen($this->_CURLOPT_USERPWD) > 2) { curl_setopt($this->_CURL_RESOURCE, CURLOPT_USERPWD, $this->_CURLOPT_USERPWD); } // to use proxy if proxy enabled if (SP_ENABLE_PROXY && $enableProxy) { $proxyCtrler = new ProxyController(); if ($proxyInfo = $proxyCtrler->getRandomProxy()) { curl_setopt($this->_CURL_RESOURCE, CURLOPT_PROXY, $proxyInfo['proxy'] . ":" . $proxyInfo['port']); curl_setopt($this->_CURL_RESOURCE, CURLOPT_HTTPPROXYTUNNEL, CURLOPT_HTTPPROXYTUNNEL_VAL); if (!empty($proxyInfo['proxy_auth'])) { curl_setopt($this->_CURL_RESOURCE, CURLOPT_PROXYUSERPWD, $proxyInfo['proxy_username'] . ":" . $proxyInfo['proxy_password']); } } else { showErrorMsg("No active proxies found!! Please check your proxy settings from Admin Panel."); } } $ret['page'] = curl_exec($this->_CURL_RESOURCE); $ret['error'] = curl_errno($this->_CURL_RESOURCE); $ret['errmsg'] = curl_error($this->_CURL_RESOURCE); // update crawl log in database for future reference if ($logCrawl) { $crawlLogCtrl = new CrawlLogController(); $crawlInfo['crawl_status'] = $ret['error'] ? 0 : 1; $crawlInfo['ref_id'] = $crawlInfo['crawl_link'] = addslashes($url); $crawlInfo['crawl_referer'] = addslashes($this->_CURLOPT_REFERER); $crawlInfo['crawl_cookie'] = addslashes($this->_CURLOPT_COOKIE); $crawlInfo['crawl_post_fields'] = addslashes($this->_CURLOPT_POSTFIELDS); $crawlInfo['crawl_useragent'] = addslashes($this->_CURLOPT_USERAGENT); $crawlInfo['proxy_id'] = $proxyInfo['id']; $crawlInfo['log_message'] = addslashes($ret['errmsg']); $ret['log_id'] = $crawlLogCtrl->createCrawlLog($crawlInfo); } // disable proxy if not working if (SP_ENABLE_PROXY && $enableProxy && !empty($ret['error']) && !empty($proxyInfo['id'])) { // deactivate proxy if (PROXY_DEACTIVATE_CRAWL) { $proxyCtrler->__changeStatus($proxyInfo['id'], 0); } // chekc with another proxy if (CHECK_WITH_ANOTHER_PROXY_IF_FAILED) { $ret = $this->getContent($url, $enableProxy); } } return $ret; }
function crawlKeyword($keywordInfo, $seId = '', $cron = false, $removeDuplicate = true) { $crawlResult = array(); $websiteUrl = $keywordInfo['url']; if (empty($websiteUrl)) { return $crawlResult; } if (empty($keywordInfo['name'])) { return $crawlResult; } $time = mktime(0, 0, 0, date('m'), date('d'), date('Y')); $seList = explode(':', $keywordInfo['searchengines']); foreach ($seList as $seInfoId) { // function to execute only passed search engine if (!empty($seId) && $seInfoId != $seId) { continue; } // if search engine not found continue if (empty($this->seList[$seInfoId])) { continue; } $this->seFound = 1; // if execution from cron check whether cron already executed /*if ($cron) { if (SP_MULTIPLE_CRON_EXEC && $this->isCronExecuted($keywordInfo['id'], $seInfoId, $time)) continue; }*/ $searchUrl = str_replace('[--keyword--]', urlencode(stripslashes($keywordInfo['name'])), $this->seList[$seInfoId]['url']); $searchUrl = str_replace('[--lang--]', $keywordInfo['lang_code'], $searchUrl); $searchUrl = str_replace('[--country--]', $keywordInfo['country_code'], $searchUrl); if (empty($keywordInfo['country_code']) && stristr($searchUrl, '&cr=country&')) { $searchUrl = str_replace('&cr=country&', '&cr=&', $searchUrl); } $seUrl = str_replace('[--start--]', $this->seList[$seInfoId]['start'], $searchUrl); // if google add special parameters $isGoogle = false; if (stristr($this->seList[$seInfoId]['url'], 'google')) { $isGoogle = true; $seUrl .= "&ie=utf-8&pws=0&gl=" . $keywordInfo['country_code']; } if (!empty($this->seList[$seInfoId]['cookie_send'])) { $this->seList[$seInfoId]['cookie_send'] = str_replace('[--lang--]', $keywordInfo['lang_code'], $this->seList[$seInfoId]['cookie_send']); $this->spider->_CURLOPT_COOKIE = $this->seList[$seInfoId]['cookie_send']; } $result = $this->spider->getContent($seUrl); $pageContent = $this->formatPageContent($seInfoId, $result['page']); $crawlLogCtrl = new CrawlLogController(); $crawlInfo['crawl_type'] = 'keyword'; $crawlInfo['ref_id'] = empty($keywordInfo['id']) ? $keywordInfo['name'] : $keywordInfo['id']; $crawlInfo['subject'] = $seInfoId; $seStart = $this->seList[$seInfoId]['start'] + $this->seList[$seInfoId]['start_offset']; while (empty($result['error']) && $seStart < $this->seList[$seInfoId]['max_results']) { $logId = $result['log_id']; $crawlLogCtrl->updateCrawlLog($logId, $crawlInfo); sleep(SP_CRAWL_DELAY); $seUrl = str_replace('[--start--]', $seStart, $searchUrl); $result = $this->spider->getContent($seUrl); $pageContent .= $this->formatPageContent($seInfoId, $result['page']); $seStart += $this->seList[$seInfoId]['start_offset']; } # to check whether utf8 conversion needed if (!empty($this->seList[$seInfoId]['encoding'])) { $pageContent = mb_convert_encoding($pageContent, "UTF-8", $this->seList[$seInfoId]['encoding']); } $crawlStatus = 0; if (empty($result['error'])) { // to update cron that report executed for akeyword on a search engine if (SP_MULTIPLE_CRON_EXEC && $cron) { $this->saveCronTrackInfo($keywordInfo['id'], $seInfoId, $time); } if (preg_match_all($this->seList[$seInfoId]['regex'], $pageContent, $matches)) { $urlList = $matches[$this->seList[$seInfoId]['url_index']]; $crawlResult[$seInfoId]['matched'] = array(); $rank = 1; $previousDomain = ""; foreach ($urlList as $i => $url) { $url = urldecode(strip_tags($url)); // add special condition for baidu if (stristr($this->seList[$seInfoId]['domain'], "baidu")) { $url = addHttpToUrl($url); $url = str_replace("...", "", $url); } if (!preg_match('/^http:\\/\\/|^https:\\/\\//i', $url)) { continue; } // check for to remove msn ad links in page if (stristr($url, 'r.msn.com')) { continue; } // check to remove duplicates from same domain if google is the search engine if ($removeDuplicate && $isGoogle) { $currentDomain = parse_url($url, PHP_URL_HOST); if ($previousDomain == $currentDomain) { continue; } $previousDomain = $currentDomain; } if ($this->showAll || stristr($url, $websiteUrl)) { if ($this->showAll && stristr($url, $websiteUrl)) { $matchInfo['found'] = 1; } else { $matchInfo['found'] = 0; } $matchInfo['url'] = $url; $matchInfo['title'] = strip_tags($matches[$this->seList[$seInfoId]['title_index']][$i]); $matchInfo['description'] = strip_tags($matches[$this->seList[$seInfoId]['description_index']][$i]); $matchInfo['rank'] = $rank; $crawlResult[$seInfoId]['matched'][] = $matchInfo; } $rank++; } $crawlStatus = 1; } else { // set crawl log info $crawlInfo['crawl_status'] = 0; $crawlInfo['log_message'] = SearchEngineController::isCaptchInSearchResults($pageContent) ? "<font class=error>Captcha found</font> in search result page" : "Regex not matched error occured while parsing search results!"; if (SP_DEBUG) { echo "<p class='note' style='text-align:left;'>Error occured while parsing {$seUrl} " . formatErrorMsg("Regex not matched <br>\n") . "</p>"; } } } else { if (SP_DEBUG) { echo "<p class='note' style='text-align:left;'>Error occured while crawling {$seUrl} " . formatErrorMsg($result['errmsg'] . "<br>\n") . "</p>"; } } $crawlResult[$seInfoId]['status'] = $crawlStatus; sleep(SP_CRAWL_DELAY); // update crawl log $logId = $result['log_id']; $crawlLogCtrl->updateCrawlLog($logId, $crawlInfo); } // if proxy enabled if crawl failed try to check next item if (SP_ENABLE_PROXY && CHECK_WITH_ANOTHER_PROXY_IF_FAILED) { // max proxy checked in one execution is exeeded if ($this->proxyCheckCount < CHECK_MAX_PROXY_COUNT_IF_FAILED) { // if proxy is available for execution $proxyCtrler = new ProxyController(); if ($proxyInfo = $proxyCtrler->getRandomProxy()) { $this->proxyCheckCount++; sleep(SP_CRAWL_DELAY); $crawlResult = $this->crawlKeyword($keywordInfo, $seId, $cron, $removeDuplicate); } } else { $this->proxyCheckCount = 1; } } return $crawlResult; }
function getContent($url, $enableProxy = true) { curl_setopt($this->_CURL_RESOURCE, CURLOPT_URL, $url); curl_setopt($this->_CURL_RESOURCE, CURLOPT_FAILONERROR, $this->_CURLOPT_FAILONERROR); @curl_setopt($this->_CURL_RESOURCE, CURLOPT_FOLLOWLOCATION, $this->_CURLOPT_FOLLOWLOCATION); curl_setopt($this->_CURL_RESOURCE, CURLOPT_RETURNTRANSFER, $this->_CURLOPT_RETURNTRANSFER); curl_setopt($this->_CURL_RESOURCE, CURLOPT_TIMEOUT, $this->_CURLOPT_TIMEOUT); curl_setopt($this->_CURL_RESOURCE, CURLOPT_COOKIEJAR, $this->_CURLOPT_COOKIEJAR); curl_setopt($this->_CURL_RESOURCE, CURLOPT_COOKIEFILE, $this->_CURLOPT_COOKIEFILE); curl_setopt($this->_CURL_RESOURCE, CURLOPT_HEADER, $this->_CURLOPT_HEADER); if (!empty($this->_CURLOPT_COOKIE)) { curl_setopt($this->_CURL_RESOURCE, CURLOPT_COOKIE, $this->_CURLOPT_COOKIE); } if (!empty($this->_CURLOPT_REFERER)) { curl_setopt($this->_CURL_RESOURCE, CURLOPT_REFERER, $this->_CURLOPT_REFERER); } if (strlen($this->_CURLOPT_POSTFIELDS) > 1) { curl_setopt($this->_CURL_RESOURCE, CURLOPT_POST, $this->_CURLOPT_POST); curl_setopt($this->_CURL_RESOURCE, CURLOPT_POSTFIELDS, $this->_CURLOPT_POSTFIELDS); } $this->_CURLOPT_USERAGENT = defined('SP_USER_AGENT') ? SP_USER_AGENT : $this->_CURLOPT_USERAGENT; if (strlen($this->_CURLOPT_USERAGENT) > 0) { curl_setopt($this->_CURL_RESOURCE, CURLOPT_USERAGENT, $this->_CURLOPT_USERAGENT); } if (strlen($this->_CURLOPT_USERPWD) > 2) { curl_setopt($this->_CURL_RESOURCE, CURLOPT_USERPWD, $this->_CURLOPT_USERPWD); } // to use proxy if proxy enabled if (SP_ENABLE_PROXY && $enableProxy) { $proxyCtrler = new ProxyController(); if ($proxyInfo = $proxyCtrler->getRandomProxy()) { curl_setopt($this->_CURL_RESOURCE, CURLOPT_PROXY, $proxyInfo['proxy'] . ":" . $proxyInfo['port']); curl_setopt($this->_CURL_RESOURCE, CURLOPT_HTTPPROXYTUNNEL, 1); if (!empty($proxyInfo['proxy_auth'])) { curl_setopt($this->_CURL_RESOURCE, CURLOPT_PROXYUSERPWD, $proxyInfo['proxy_username'] . ":" . $proxyInfo['proxy_password']); } } else { showErrorMsg("No active proxies found!! Please check your proxy settings from Admin Panel."); } } $ret['page'] = curl_exec($this->_CURL_RESOURCE); $ret['error'] = curl_errno($this->_CURL_RESOURCE); $ret['errmsg'] = curl_error($this->_CURL_RESOURCE); return $ret; }
function getContent($url, $enableProxy = true) { curl_setopt($this->_CURL_RESOURCE, CURLOPT_URL, $url); curl_setopt($this->_CURL_RESOURCE, CURLOPT_FAILONERROR, $this->_CURLOPT_FAILONERROR); @curl_setopt($this->_CURL_RESOURCE, CURLOPT_FOLLOWLOCATION, $this->_CURLOPT_FOLLOWLOCATION); curl_setopt($this->_CURL_RESOURCE, CURLOPT_RETURNTRANSFER, $this->_CURLOPT_RETURNTRANSFER); curl_setopt($this->_CURL_RESOURCE, CURLOPT_TIMEOUT, $this->_CURLOPT_TIMEOUT); curl_setopt($this->_CURL_RESOURCE, CURLOPT_COOKIEJAR, $this->_CURLOPT_COOKIEJAR); curl_setopt($this->_CURL_RESOURCE, CURLOPT_COOKIEFILE, $this->_CURLOPT_COOKIEFILE); curl_setopt($this->_CURL_RESOURCE, CURLOPT_HEADER, $this->_CURLOPT_HEADER); if (!empty($this->_CURLOPT_COOKIE)) { curl_setopt($this->_CURL_RESOURCE, CURLOPT_COOKIE, $this->_CURLOPT_COOKIE); } if (!empty($this->_CURLOPT_REFERER)) { curl_setopt($this->_CURL_RESOURCE, CURLOPT_REFERER, $this->_CURLOPT_REFERER); } if (strlen($this->_CURLOPT_POSTFIELDS) > 1) { curl_setopt($this->_CURL_RESOURCE, CURLOPT_POST, $this->_CURLOPT_POST); curl_setopt($this->_CURL_RESOURCE, CURLOPT_POSTFIELDS, $this->_CURLOPT_POSTFIELDS); } // user agent assignment $this->_CURLOPT_USERAGENT = defined('SP_USER_AGENT') ? SP_USER_AGENT : $this->_CURLOPT_USERAGENT; if (strlen($this->_CURLOPT_USERAGENT) > 0) { curl_setopt($this->_CURL_RESOURCE, CURLOPT_USERAGENT, $this->_CURLOPT_USERAGENT); } if (strlen($this->_CURLOPT_USERPWD) > 2) { curl_setopt($this->_CURL_RESOURCE, CURLOPT_USERPWD, $this->_CURLOPT_USERPWD); } // to use proxy if proxy enabled if (SP_ENABLE_PROXY && $enableProxy) { $proxyCtrler = new ProxyController(); if ($proxyInfo = $proxyCtrler->getRandomProxy()) { curl_setopt($this->_CURL_RESOURCE, CURLOPT_PROXY, $proxyInfo['proxy'] . ":" . $proxyInfo['port']); curl_setopt($this->_CURL_RESOURCE, CURLOPT_HTTPPROXYTUNNEL, CURLOPT_HTTPPROXYTUNNEL_VAL); if (!empty($proxyInfo['proxy_auth'])) { curl_setopt($this->_CURL_RESOURCE, CURLOPT_PROXYUSERPWD, $proxyInfo['proxy_username'] . ":" . $proxyInfo['proxy_password']); } } else { showErrorMsg("No active proxies found!! Please check your proxy settings from Admin Panel."); } } $ret['page'] = curl_exec($this->_CURL_RESOURCE); $ret['error'] = curl_errno($this->_CURL_RESOURCE); $ret['errmsg'] = curl_error($this->_CURL_RESOURCE); // disable proxy if not working if (SP_ENABLE_PROXY && $enableProxy && !empty($ret['error']) && !empty($proxyInfo['id'])) { // deactivate proxy if (PROXY_DEACTIVATE_CRAWL) { echo "Deactivating proxy - " . $proxyInfo['proxy'] . "....<br>\n"; $proxyCtrler->__changeStatus($proxyInfo['id'], 0); } // chekc with another proxy if (CHECK_WITH_ANOTHER_PROXY_IF_FAILED) { echo "Checking with another proxy....<br>\n"; $ret = $this->getContent($url, $enableProxy); } } return $ret; }