/**
 * Fetch a URL through the shared cURL handle and optionally record the attempt in the crawl log.
 *
 * The handle is configured from the $this->_CURLOPT_* properties on every call. When
 * SP_ENABLE_PROXY and $enableProxy are both truthy, a random proxy from ProxyController
 * is used; if the request then fails, the proxy may be deactivated and the request
 * retried, depending on PROXY_DEACTIVATE_CRAWL and CHECK_WITH_ANOTHER_PROXY_IF_FAILED.
 *
 * @param string $url         The URL to request.
 * @param bool   $enableProxy Whether a proxy may be used for this request.
 * @param bool   $logCrawl    Whether to create a crawl log entry for this request.
 *
 * @return array Keys: 'page' (curl_exec() result), 'error' (curl_errno()),
 *               'errmsg' (curl_error()) and, only when $logCrawl is truthy,
 *               'log_id' (return value of CrawlLogController::createCrawlLog()).
 */
function getContent($url, $enableProxy = true, $logCrawl = true) {
    $ch = $this->_CURL_RESOURCE;

    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_FAILONERROR, $this->_CURLOPT_FAILONERROR);
    // Error suppression kept from the original; presumably guards against the warning
    // raised when FOLLOWLOCATION is not permitted by the PHP configuration - TODO confirm.
    @curl_setopt($ch, CURLOPT_FOLLOWLOCATION, $this->_CURLOPT_FOLLOWLOCATION);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, $this->_CURLOPT_RETURNTRANSFER);
    curl_setopt($ch, CURLOPT_TIMEOUT, $this->_CURLOPT_TIMEOUT);
    curl_setopt($ch, CURLOPT_COOKIEJAR, $this->_CURLOPT_COOKIEJAR);
    curl_setopt($ch, CURLOPT_COOKIEFILE, $this->_CURLOPT_COOKIEFILE);
    curl_setopt($ch, CURLOPT_HEADER, $this->_CURLOPT_HEADER);

    if (!empty($this->_CURLOPT_COOKIE)) {
        curl_setopt($ch, CURLOPT_COOKIE, $this->_CURLOPT_COOKIE);
    }

    if (!empty($this->_CURLOPT_REFERER)) {
        curl_setopt($ch, CURLOPT_REFERER, $this->_CURLOPT_REFERER);
    }

    if (strlen($this->_CURLOPT_POSTFIELDS) > 1) {
        curl_setopt($ch, CURLOPT_POST, $this->_CURLOPT_POST);
        curl_setopt($ch, CURLOPT_POSTFIELDS, $this->_CURLOPT_POSTFIELDS);
    }

    // user agent assignment: a globally defined SP_USER_AGENT overrides the instance value
    $this->_CURLOPT_USERAGENT = defined('SP_USER_AGENT') ? SP_USER_AGENT : $this->_CURLOPT_USERAGENT;
    if (strlen($this->_CURLOPT_USERAGENT) > 0) {
        curl_setopt($ch, CURLOPT_USERAGENT, $this->_CURLOPT_USERAGENT);
    }

    if (strlen($this->_CURLOPT_USERPWD) > 2) {
        curl_setopt($ch, CURLOPT_USERPWD, $this->_CURLOPT_USERPWD);
    }

    // Initialised up front so the logging and retry code below never reads
    // undefined variables when the proxy is disabled.
    $proxyInfo = array();
    $proxyCtrler = null;

    // to use proxy if proxy enabled
    if (SP_ENABLE_PROXY && $enableProxy) {
        $proxyCtrler = new ProxyController();
        if ($proxyInfo = $proxyCtrler->getRandomProxy()) {
            curl_setopt($ch, CURLOPT_PROXY, $proxyInfo['proxy'] . ":" . $proxyInfo['port']);
            curl_setopt($ch, CURLOPT_HTTPPROXYTUNNEL, CURLOPT_HTTPPROXYTUNNEL_VAL);
            if (!empty($proxyInfo['proxy_auth'])) {
                curl_setopt($ch, CURLOPT_PROXYUSERPWD, $proxyInfo['proxy_username'] . ":" . $proxyInfo['proxy_password']);
            }
        } else {
            showErrorMsg("No active proxies found!! Please check your proxy settings from Admin Panel.");
        }
    }

    $ret = array();
    $ret['page'] = curl_exec($ch);
    $ret['error'] = curl_errno($ch);
    $ret['errmsg'] = curl_error($ch);

    // update crawl log in database for future reference
    if ($logCrawl) {
        $crawlLogCtrl = new CrawlLogController();
        $crawlInfo = array();
        $crawlInfo['crawl_status'] = $ret['error'] ? 0 : 1;
        $crawlInfo['ref_id'] = $crawlInfo['crawl_link'] = addslashes($url);
        $crawlInfo['crawl_referer'] = addslashes($this->_CURLOPT_REFERER);
        $crawlInfo['crawl_cookie'] = addslashes($this->_CURLOPT_COOKIE);
        $crawlInfo['crawl_post_fields'] = addslashes($this->_CURLOPT_POSTFIELDS);
        $crawlInfo['crawl_useragent'] = addslashes($this->_CURLOPT_USERAGENT);
        // null when no proxy was used (same value the original produced, minus the warning)
        $crawlInfo['proxy_id'] = isset($proxyInfo['id']) ? $proxyInfo['id'] : null;
        $crawlInfo['log_message'] = addslashes($ret['errmsg']);
        $ret['log_id'] = $crawlLogCtrl->createCrawlLog($crawlInfo);
    }

    // disable proxy if not working
    if (SP_ENABLE_PROXY && $enableProxy && !empty($ret['error']) && !empty($proxyInfo['id'])) {

        // deactivate proxy
        if (PROXY_DEACTIVATE_CRAWL) {
            $proxyCtrler->__changeStatus($proxyInfo['id'], 0);
        }

        // check with another proxy
        // NOTE(review): the retry depth is unbounded; if failing proxies are not
        // deactivated this may recurse for a long time - confirm intended.
        if (CHECK_WITH_ANOTHER_PROXY_IF_FAILED) {
            // Forward $logCrawl so a caller that disabled logging does not get logged retries.
            $ret = $this->getContent($url, $enableProxy, $logCrawl);
        }
    }

    return $ret;
}
/**
 * Look up the backlink count reported by a search engine for $this->url.
 *
 * Fetches the engine's result page through $this->spider, extracts the count with
 * the first matching regular expression, and updates the crawl log entry created
 * by the spider with the crawl type, reference and engine name.
 *
 * @param string $engine One of 'google', 'msn' or 'alexa'; any other value performs
 *                       no request and yields 0.
 *
 * @return int|string 0 when nothing was found (or when SP_DEMO is truthy and
 *                    REQUEST_METHOD is set); for google/msn the matched digits with
 *                    commas stripped (a string); for alexa an integer.
 */
function __getBacklinks($engine) {
    if (SP_DEMO && !empty($_SERVER['REQUEST_METHOD'])) {
        return 0;
    }

    $backlinkCount = 0;
    $crawlInfo = array();
    // Stays empty for an unknown engine so the log update below can be skipped safely.
    $v = array();

    switch ($engine) {

        // google
        case 'google':
            $url = $this->backUrlList[$engine] . urlencode($this->url);
            $v = $this->spider->getContent($url);
            $pageContent = empty($v['page']) ? '' : $v['page'];

            // Patterns are tried in order; the first one that matches wins.
            $patternList = array(
                '/about ([0-9\\,]+) result/si',
                '/<div id=resultStats>([0-9\\,]+) result/si',
                '/([0-9\\,]+) result/si',
                '/about <b>([0-9\\,]+)<\\/b> linking/si',
            );
            $r = array();
            $matched = false;
            foreach ($patternList as $pattern) {
                if (preg_match($pattern, $pageContent, $r)) {
                    $matched = true;
                    break;
                }
            }

            if (!$matched) {
                $crawlInfo['crawl_status'] = 0;
                $crawlInfo['log_message'] = "Regex not matched error occured while parsing search results!";
            }

            $backlinkCount = !empty($r[1]) ? str_replace(',', '', $r[1]) : 0;
            break;

        // msn
        case 'msn':
            $url = formatUrl($this->url, false);
            $url = $this->backUrlList[$engine] . urlencode(addHttpToUrl($url));
            $v = $this->spider->getContent($url);
            $pageContent = empty($v['page']) ? '' : $v['page'];

            $patternList = array(
                '/([0-9\\,]+) results/si',
                '/id="count".*?>.*?\\(([0-9\\,]+).*?\\)/si',
                '/id="count".*?>.*?([0-9\\,]+).*?/si',
            );
            $r = array();
            $matched = false;
            foreach ($patternList as $pattern) {
                if (preg_match($pattern, $pageContent, $r)) {
                    $matched = true;
                    break;
                }
            }

            if (!$matched) {
                $crawlInfo['crawl_status'] = 0;
                $crawlInfo['log_message'] = "Regex not matched error occured while parsing search results!";
            }

            $backlinkCount = !empty($r[1]) ? str_replace(',', '', $r[1]) : 0;
            break;

        // alexa
        case 'alexa':
            $url = 'http://data.alexa.com/data?cli=10&dat=snbamz&url=' . urlencode($this->url);
            $v = $this->spider->getContent($url);
            $pageContent = empty($v['page']) ? '' : $v['page'];
            if (preg_match('/<LINKSIN NUM="(.*?)"/si', $pageContent, $r)) {
                $backlinkCount = !empty($r[1]) ? intval($r[1]) : 0;
            } else {
                $crawlInfo['crawl_status'] = 0;
                $crawlInfo['log_message'] = "Regex not matched error occured while parsing search results!";
            }
            break;
    }

    // update crawl log - only when a crawl actually produced a log entry; for an
    // unrecognised engine no request is made and there is nothing to update.
    if (!empty($v['log_id'])) {
        $crawlLogCtrl = new CrawlLogController();
        $crawlInfo['crawl_type'] = 'backlink';
        $crawlInfo['ref_id'] = $this->url;
        $crawlInfo['subject'] = $engine;
        $crawlLogCtrl->updateCrawlLog($v['log_id'], $crawlInfo);
    }

    return $backlinkCount;
}
* This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program; if not, write to the * * Free Software Foundation, Inc., * * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * ***************************************************************************/ include_once "includes/sp-load.php"; checkAdminLoggedIn(); include_once SP_CTRLPATH . "/crawllog.ctrl.php"; include_once SP_CTRLPATH . "/keyword.ctrl.php"; include_once SP_CTRLPATH . "/searchengine.ctrl.php"; $controller = new CrawlLogController(); $controller->view->menu = 'adminpanel'; $controller->layout = 'ajax'; $controller->set('spTextPanel', $controller->getLanguageTexts('panel', $_SESSION['lang_code'])); $controller->spTextLog = $controller->getLanguageTexts('log', $_SESSION['lang_code']); $controller->set('spTextLog', $controller->spTextLog); if ($_SERVER['REQUEST_METHOD'] == 'POST') { switch ($_POST['sec']) { case "delete_all_crawl_log": if (!empty($_POST['ids'])) { foreach ($_POST['ids'] as $id) { $controller->deleteCrawlLog($id); } } $controller->listCrawlLog($_POST); break;
/**
 * Crawl the search engines assigned to a keyword and collect the ranking matches.
 *
 * For every search engine id in $keywordInfo['searchengines'] (colon separated) the
 * result pages are fetched through $this->spider, concatenated, optionally converted
 * to UTF-8, and parsed with the engine's configured regex. Each fetch's crawl log
 * entry is updated with the crawl type, reference and engine id.
 *
 * @param array      $keywordInfo     Reads 'url', 'name', 'searchengines', 'lang_code',
 *                                    'country_code' and 'id'.
 * @param int|string $seId            When non-empty, only this search engine id is crawled.
 * @param bool       $cron            When truthy (and SP_MULTIPLE_CRON_EXEC is truthy) the
 *                                    execution is recorded via saveCronTrackInfo().
 * @param bool       $removeDuplicate For google engines, skip consecutive results from
 *                                    the same host.
 *
 * @return array Keyed by search engine id; each entry has 'status' (1 when the regex
 *               matched, else 0) and, when the regex matched, 'matched' (list of arrays
 *               with 'found', 'url', 'title', 'description', 'rank').
 */
function crawlKeyword($keywordInfo, $seId = '', $cron = false, $removeDuplicate = true) {
    $crawlResult = array();
    $websiteUrl = formatUrl($keywordInfo['url'], false);
    if (empty($websiteUrl)) {
        return $crawlResult;
    }
    if (empty($keywordInfo['name'])) {
        return $crawlResult;
    }

    $time = mktime(0, 0, 0, date('m'), date('d'), date('Y'));
    $seList = explode(':', $keywordInfo['searchengines']);
    foreach ($seList as $seInfoId) {

        if (!empty($seId) && $seInfoId != $seId) {
            continue;
        }
        $this->seFound = 1;

        // if execution from cron check whether cron already executed
        /*if ($cron) {
            if (SP_MULTIPLE_CRON_EXEC && $this->isCronExecuted($keywordInfo['id'], $seInfoId, $time)) continue;
        }*/

        $seInfo = $this->seList[$seInfoId];

        $searchUrl = str_replace('[--keyword--]', urlencode(stripslashes($keywordInfo['name'])), $seInfo['url']);
        $searchUrl = str_replace('[--lang--]', $keywordInfo['lang_code'], $searchUrl);
        $searchUrl = str_replace('[--country--]', $keywordInfo['country_code'], $searchUrl);
        if (empty($keywordInfo['country_code']) && stristr($searchUrl, '&cr=country&')) {
            $searchUrl = str_replace('&cr=country&', '&cr=&', $searchUrl);
        }
        $seUrl = str_replace('[--start--]', $seInfo['start'], $searchUrl);

        // if google add special parameters
        $isGoogle = false;
        if (stristr($seInfo['url'], 'google')) {
            $isGoogle = true;
            $seUrl .= "&ie=utf-8&pws=0&gl=" . $keywordInfo['country_code'];
        }

        if (!empty($seInfo['cookie_send'])) {
            // Substitute into a local copy: writing the result back into $this->seList
            // would destroy the [--lang--] placeholder, so later keywords with another
            // language would send the first keyword's cookie.
            $cookieSend = str_replace('[--lang--]', $keywordInfo['lang_code'], $seInfo['cookie_send']);
            $this->spider->_CURLOPT_COOKIE = $cookieSend;
        }

        $result = $this->spider->getContent($seUrl);
        $pageContent = $this->formatPageContent($seInfoId, $result['page']);

        $crawlLogCtrl = new CrawlLogController();
        // Start from a clean array for every search engine so a failure status or
        // message recorded for a previous engine is not written to this engine's log.
        $crawlInfo = array();
        $crawlInfo['crawl_type'] = 'keyword';
        $crawlInfo['ref_id'] = empty($keywordInfo['id']) ? $keywordInfo['name'] : $keywordInfo['id'];
        $crawlInfo['subject'] = $seInfoId;

        // fetch the remaining result pages until an error occurs or max_results is reached
        $seStart = $seInfo['start'] + $seInfo['start_offset'];
        while (empty($result['error']) && $seStart < $seInfo['max_results']) {
            $logId = $result['log_id'];
            $crawlLogCtrl->updateCrawlLog($logId, $crawlInfo);
            sleep(SP_CRAWL_DELAY);
            $seUrl = str_replace('[--start--]', $seStart, $searchUrl);
            $result = $this->spider->getContent($seUrl);
            $pageContent .= $this->formatPageContent($seInfoId, $result['page']);
            $seStart += $seInfo['start_offset'];
        }

        // to check whether utf8 conversion needed
        if (!empty($seInfo['encoding'])) {
            $pageContent = mb_convert_encoding($pageContent, "UTF-8", $seInfo['encoding']);
        }

        $crawlStatus = 0;
        if (empty($result['error'])) {

            // to update cron that report executed for a keyword on a search engine
            if (SP_MULTIPLE_CRON_EXEC && $cron) {
                $this->saveCronTrackInfo($keywordInfo['id'], $seInfoId, $time);
            }

            if (preg_match_all($seInfo['regex'], $pageContent, $matches)) {
                $urlList = $matches[$seInfo['url_index']];
                $crawlResult[$seInfoId]['matched'] = array();
                $rank = 1;
                $previousDomain = "";
                foreach ($urlList as $i => $url) {
                    $url = urldecode(strip_tags($url));

                    // add special condition for baidu
                    if (stristr($seInfo['domain'], "baidu")) {
                        $url = addHttpToUrl($url);
                        $url = str_replace("...", "", $url);
                    }

                    if (!preg_match('/^http:\\/\\/|^https:\\/\\//i', $url)) {
                        continue;
                    }

                    // check for to remove msn ad links in page
                    if (stristr($url, 'r.msn.com')) {
                        continue;
                    }

                    // check to remove duplicates from same domain if google is the search engine
                    if ($removeDuplicate && $isGoogle) {
                        $currentDomain = parse_url($url, PHP_URL_HOST);
                        if ($previousDomain == $currentDomain) {
                            continue;
                        }
                        $previousDomain = $currentDomain;
                    }

                    if ($this->showAll || stristr($url, $websiteUrl)) {

                        // 'found' is only set to 1 in show-all mode, as in the original
                        if ($this->showAll && stristr($url, $websiteUrl)) {
                            $matchInfo['found'] = 1;
                        } else {
                            $matchInfo['found'] = 0;
                        }
                        $matchInfo['url'] = $url;
                        $matchInfo['title'] = strip_tags($matches[$seInfo['title_index']][$i]);
                        $matchInfo['description'] = strip_tags($matches[$seInfo['description_index']][$i]);
                        $matchInfo['rank'] = $rank;
                        $crawlResult[$seInfoId]['matched'][] = $matchInfo;
                    }

                    // rank counts every accepted result, not only those of the website
                    $rank++;
                }
                $crawlStatus = 1;
            } else {

                // set crawl log info
                $crawlInfo['crawl_status'] = 0;
                $crawlInfo['log_message'] = SearchEngineController::isCaptchInSearchResults($pageContent) ? "<font class=error>Captcha found</font> in search result page" : "Regex not matched error occured while parsing search results!";

                if (SP_DEBUG) {
                    echo "<p class='note' style='text-align:left;'>Error occured while parsing {$seUrl} " . formatErrorMsg("Regex not matched <br>\n") . "</p>";
                }
            }
        } else {
            if (SP_DEBUG) {
                echo "<p class='note' style='text-align:left;'>Error occured while crawling {$seUrl} " . formatErrorMsg($result['errmsg'] . "<br>\n") . "</p>";
            }
        }

        $crawlResult[$seInfoId]['status'] = $crawlStatus;
        sleep(SP_CRAWL_DELAY);

        // update crawl log of the last fetched page
        $logId = $result['log_id'];
        $crawlLogCtrl->updateCrawlLog($logId, $crawlInfo);
    }

    return $crawlResult;
}
/**
 * Look up the search engine saturation count (number of results reported) for $this->url.
 *
 * Fetches the engine's result page through $this->spider, extracts the count with
 * the first matching regular expression, and updates the crawl log entry created
 * by the spider with the crawl type, reference and engine name.
 *
 * @param string $engine Either 'google' or 'msn'; any other value performs no request
 *                       and yields 0.
 *
 * @return int|string 0 when nothing was found (or when SP_DEMO is truthy and
 *                    REQUEST_METHOD is set); otherwise the matched digits with
 *                    commas stripped (a string).
 */
function __getSaturationRank($engine) {
    if (SP_DEMO && !empty($_SERVER['REQUEST_METHOD'])) {
        return 0;
    }

    $saturationCount = 0;
    $crawlInfo = array();
    // Stays empty for an unknown engine so the log update below can be skipped safely.
    $v = array();

    switch ($engine) {

        // google
        case 'google':
            $url = $this->saturationUrlList[$engine] . urlencode($this->url);
            $v = $this->spider->getContent($url);
            $pageContent = empty($v['page']) ? '' : $v['page'];

            // Patterns are tried in order; the first one that matches wins.
            $patternList = array(
                '/about ([0-9\\,]+) result/si',
                '/<div id=resultStats>([0-9\\,]+) result/si',
                '/([0-9\\,]+) result/si',
                '/about <b>([0-9\\,]+)<\\/b> from/si',
                '/of <b>([0-9\\,]+)<\\/b>/si',
            );
            $r = array();
            $matched = false;
            foreach ($patternList as $pattern) {
                if (preg_match($pattern, $pageContent, $r)) {
                    $matched = true;
                    break;
                }
            }

            if (!$matched) {
                $crawlInfo['crawl_status'] = 0;
                $crawlInfo['log_message'] = "Regex not matched error occured while parsing search results!";
            }

            $saturationCount = !empty($r[1]) ? str_replace(',', '', $r[1]) : 0;
            break;

        // msn
        case 'msn':
            $url = $this->saturationUrlList[$engine] . urlencode(addHttpToUrl($this->url));
            $v = $this->spider->getContent($url);
            $pageContent = empty($v['page']) ? '' : $v['page'];

            $patternList = array(
                '/([0-9\\,]+) results/si',
                '/id="count".*?>.*?\\(([0-9\\,]+).*?\\)/si',
                '/id="count".*?>.*?([0-9\\,]+).*?/si',
            );
            $r = array();
            $matched = false;
            foreach ($patternList as $pattern) {
                if (preg_match($pattern, $pageContent, $r)) {
                    $matched = true;
                    break;
                }
            }

            if (!$matched) {
                $crawlInfo['crawl_status'] = 0;
                $crawlInfo['log_message'] = "Regex not matched error occured while parsing search results!";
            }

            $saturationCount = !empty($r[1]) ? str_replace(',', '', $r[1]) : 0;
            break;
    }

    // update crawl log - only when a crawl actually produced a log entry; for an
    // unrecognised engine no request is made and there is nothing to update.
    if (!empty($v['log_id'])) {
        $crawlLogCtrl = new CrawlLogController();
        $crawlInfo['crawl_type'] = 'saturation';
        $crawlInfo['ref_id'] = $this->url;
        $crawlInfo['subject'] = $engine;
        $crawlLogCtrl->updateCrawlLog($v['log_id'], $crawlInfo);
    }

    return $saturationCount;
}
} else { switch ($_GET['sec']) { case "generate": $controller->routeCronJob($_GET['website_id'], $_GET['repTools']); break; case "croncommand": $controller->showCronCommand(); break; default: $controller->showReportGenerationManager(); break; } } } else { # the section for generate reports using system cron job include_once "includes/sp-load.php"; include_once SP_CTRLPATH . "/cron.ctrl.php"; include_once SP_CTRLPATH . "/report.ctrl.php"; include_once SP_CTRLPATH . "/searchengine.ctrl.php"; include_once SP_CTRLPATH . "/keyword.ctrl.php"; $controller = new CronController(); $controller->timeStamp = mktime(0, 0, 0, date('m'), date('d'), date('Y')); $includeList = array(); // the only included seo tools id $controller->executeCron($includeList); // delete crawl logs before 2 months include_once SP_CTRLPATH . "/crawllog.ctrl.php"; $crawlLog = new CrawlLogController(); $crawlLog->clearCrawlLog(SP_CRAWL_LOG_CLEAR_TIME); echo "Clearing crawl logs before " . SP_CRAWL_LOG_CLEAR_TIME . " days"; }
/**
 * Retrieve the Alexa popularity rank for a website.
 *
 * Requests the Alexa data endpoint through $this->spider, reads the TEXT attribute
 * of the popularity element from the response, and updates the crawl log entry
 * created by the spider.
 *
 * @param string $url Website URL to look up.
 *
 * @return int|string 0 when no rank could be parsed (or when SP_DEMO is truthy and
 *                    REQUEST_METHOD is set); otherwise the matched digit string.
 */
function __getAlexaRank($url) {
    $isWebRequest = !empty($_SERVER['REQUEST_METHOD']);
    if (SP_DEMO && $isWebRequest) {
        return 0;
    }

    $siteUrl = $url;
    $response = $this->spider->getContent('http://data.alexa.com/data?cli=10&dat=snbamz&url=' . urlencode($siteUrl));

    $alexaRank = 0;
    $logInfo = array();

    // parse rank from the page
    if (!empty($response['page'])) {
        $body = $response['page'];
        $found = preg_match('/\\<popularity url\\="(.*?)" TEXT\\="([0-9]+)"/si', $body, $parts);

        if ($found) {
            if (!empty($parts[2])) {
                $alexaRank = $parts[2];
            } else {
                $alexaRank = 0;
            }
        } else {
            // record why parsing failed: captcha page or unexpected markup
            $logInfo['crawl_status'] = 0;
            if (SearchEngineController::isCaptchInSearchResults($body)) {
                $logInfo['log_message'] = "<font class=error>Captcha found</font> in search result page";
            } else {
                $logInfo['log_message'] = "Regex not matched error occured while parsing search results!";
            }
        }
    }

    // update crawl log
    $logInfo['crawl_type'] = 'rank';
    $logInfo['ref_id'] = $siteUrl;
    $logInfo['subject'] = "alexa";

    $logController = new CrawlLogController();
    $logController->updateCrawlLog($response['log_id'], $logInfo);

    return $alexaRank;
}