/**
 * Registers a controller in this collection.
 *
 * The controller is wrapped in a ProxyController and stored under its own
 * class name, so adding a second controller of the same class replaces the
 * first. When no request type is supplied, the collection's default request
 * type is used.
 *
 * @return ControllersCollection
**/
public function add(Controller $controller, RequestType $requestType = null)
{
    // Fall back to the collection-wide default when the caller gave none.
    $effectiveType = $requestType ? $requestType : $this->defaultRequestType;

    $key = get_class($controller);
    $wrapped = ProxyController::create()->setInner($controller);

    // Store whatever the fluent chain finally returns, as the one-liner did.
    $this->innerControllers[$key] = $wrapped->setRequestType($effectiveType);

    return $this;
}
/**
 * Fetch a URL through the shared cURL handle, optionally via a random proxy,
 * and optionally record the attempt in the crawl log.
 *
 * The handle is configured from the _CURLOPT_* properties of this object on
 * every call. When the request fails while a proxy was in use, the proxy can
 * be deactivated (PROXY_DEACTIVATE_CRAWL) and the request retried with
 * another proxy (CHECK_WITH_ANOTHER_PROXY_IF_FAILED).
 *
 * NOTE(review): the retry has no depth limit of its own; it only stops when
 * a request succeeds or getRandomProxy() stops returning a proxy with an id.
 *
 * @param string $url         URL to request
 * @param bool   $enableProxy use a proxy when SP_ENABLE_PROXY is also on
 * @param bool   $logCrawl    write a crawl log entry for this request
 *
 * @return array 'page' (value returned by curl_exec), 'error' (curl_errno),
 *               'errmsg' (curl_error) and, when $logCrawl is true, 'log_id'
 *               (value returned by createCrawlLog)
 */
function getContent($url, $enableProxy = true, $logCrawl = true) {
    curl_setopt($this->_CURL_RESOURCE, CURLOPT_URL, $url);
    curl_setopt($this->_CURL_RESOURCE, CURLOPT_FAILONERROR, $this->_CURLOPT_FAILONERROR);
    // Warning suppression kept from the original code for this option only.
    @curl_setopt($this->_CURL_RESOURCE, CURLOPT_FOLLOWLOCATION, $this->_CURLOPT_FOLLOWLOCATION);
    curl_setopt($this->_CURL_RESOURCE, CURLOPT_RETURNTRANSFER, $this->_CURLOPT_RETURNTRANSFER);
    curl_setopt($this->_CURL_RESOURCE, CURLOPT_TIMEOUT, $this->_CURLOPT_TIMEOUT);
    curl_setopt($this->_CURL_RESOURCE, CURLOPT_COOKIEJAR, $this->_CURLOPT_COOKIEJAR);
    curl_setopt($this->_CURL_RESOURCE, CURLOPT_COOKIEFILE, $this->_CURLOPT_COOKIEFILE);
    curl_setopt($this->_CURL_RESOURCE, CURLOPT_HEADER, $this->_CURLOPT_HEADER);

    if (!empty($this->_CURLOPT_COOKIE)) {
        curl_setopt($this->_CURL_RESOURCE, CURLOPT_COOKIE, $this->_CURLOPT_COOKIE);
    }

    if (!empty($this->_CURLOPT_REFERER)) {
        curl_setopt($this->_CURL_RESOURCE, CURLOPT_REFERER, $this->_CURLOPT_REFERER);
    }

    if (strlen($this->_CURLOPT_POSTFIELDS) > 1) {
        curl_setopt($this->_CURL_RESOURCE, CURLOPT_POST, $this->_CURLOPT_POST);
        curl_setopt($this->_CURL_RESOURCE, CURLOPT_POSTFIELDS, $this->_CURLOPT_POSTFIELDS);
    }

    // user agent assignment: a globally configured agent overrides the property
    $this->_CURLOPT_USERAGENT = defined('SP_USER_AGENT') ? SP_USER_AGENT : $this->_CURLOPT_USERAGENT;
    if (strlen($this->_CURLOPT_USERAGENT) > 0) {
        curl_setopt($this->_CURL_RESOURCE, CURLOPT_USERAGENT, $this->_CURLOPT_USERAGENT);
    }

    if (strlen($this->_CURLOPT_USERPWD) > 2) {
        curl_setopt($this->_CURL_RESOURCE, CURLOPT_USERPWD, $this->_CURLOPT_USERPWD);
    }

    // Always defined, so the logging and retry code below never reads an
    // unassigned variable when proxies are disabled.
    $proxyInfo = array();
    $proxyCtrler = null;

    // to use proxy if proxy enabled
    if (SP_ENABLE_PROXY && $enableProxy) {
        $proxyCtrler = new ProxyController();
        $proxyInfo = $proxyCtrler->getRandomProxy();
        if ($proxyInfo) {
            curl_setopt($this->_CURL_RESOURCE, CURLOPT_PROXY, $proxyInfo['proxy'] . ":" . $proxyInfo['port']);
            curl_setopt($this->_CURL_RESOURCE, CURLOPT_HTTPPROXYTUNNEL, CURLOPT_HTTPPROXYTUNNEL_VAL);
            if (!empty($proxyInfo['proxy_auth'])) {
                curl_setopt($this->_CURL_RESOURCE, CURLOPT_PROXYUSERPWD, $proxyInfo['proxy_username'] . ":" . $proxyInfo['proxy_password']);
            }
        } else {
            showErrorMsg("No active proxies found!! Please check your proxy settings from Admin Panel.");
        }
    }

    $ret = array();
    $ret['page'] = curl_exec($this->_CURL_RESOURCE);
    $ret['error'] = curl_errno($this->_CURL_RESOURCE);
    $ret['errmsg'] = curl_error($this->_CURL_RESOURCE);

    // update crawl log in database for future reference
    if ($logCrawl) {
        $crawlLogCtrl = new CrawlLogController();
        $crawlInfo = array();
        $crawlInfo['crawl_status'] = $ret['error'] ? 0 : 1;
        $crawlInfo['ref_id'] = $crawlInfo['crawl_link'] = addslashes($url);
        $crawlInfo['crawl_referer'] = addslashes($this->_CURLOPT_REFERER);
        $crawlInfo['crawl_cookie'] = addslashes($this->_CURLOPT_COOKIE);
        $crawlInfo['crawl_post_fields'] = addslashes($this->_CURLOPT_POSTFIELDS);
        $crawlInfo['crawl_useragent'] = addslashes($this->_CURLOPT_USERAGENT);
        // No proxy in use: keep passing null (as before) but without the
        // undefined variable / undefined index notice.
        $crawlInfo['proxy_id'] = isset($proxyInfo['id']) ? $proxyInfo['id'] : null;
        $crawlInfo['log_message'] = addslashes($ret['errmsg']);
        $ret['log_id'] = $crawlLogCtrl->createCrawlLog($crawlInfo);
    }

    // disable proxy if not working
    if (SP_ENABLE_PROXY && $enableProxy && !empty($ret['error']) && !empty($proxyInfo['id'])) {

        // deactivate proxy
        if (PROXY_DEACTIVATE_CRAWL) {
            $proxyCtrler->__changeStatus($proxyInfo['id'], 0);
        }

        // check with another proxy; forward $logCrawl so a caller that
        // disabled logging does not get log rows written by the retry
        if (CHECK_WITH_ANOTHER_PROXY_IF_FAILED) {
            $ret = $this->getContent($url, $enableProxy, $logCrawl);
        }
    }

    return $ret;
}
/**
 * Request a URL with the shared cURL handle and return the raw outcome.
 *
 * The handle is (re)configured from this object's _CURLOPT_* properties on
 * each call. When SP_ENABLE_PROXY is on and $enableProxy is true, a random
 * proxy from ProxyController is applied to the handle first.
 *
 * @param string $url         URL to request
 * @param bool   $enableProxy allow routing through a proxy
 *
 * @return array 'page' (value returned by curl_exec), 'error' (curl_errno)
 *               and 'errmsg' (curl_error)
 */
function getContent($url, $enableProxy = true) {
    $handle = $this->_CURL_RESOURCE;

    // Options applied on every request, in the order they reach cURL.
    $alwaysSet = array(
        CURLOPT_URL => $url,
        CURLOPT_FAILONERROR => $this->_CURLOPT_FAILONERROR,
        CURLOPT_FOLLOWLOCATION => $this->_CURLOPT_FOLLOWLOCATION,
        CURLOPT_RETURNTRANSFER => $this->_CURLOPT_RETURNTRANSFER,
        CURLOPT_TIMEOUT => $this->_CURLOPT_TIMEOUT,
        CURLOPT_COOKIEJAR => $this->_CURLOPT_COOKIEJAR,
        CURLOPT_COOKIEFILE => $this->_CURLOPT_COOKIEFILE,
        CURLOPT_HEADER => $this->_CURLOPT_HEADER,
    );
    foreach ($alwaysSet as $option => $value) {
        if ($option === CURLOPT_FOLLOWLOCATION) {
            // Only this option is applied with warnings suppressed.
            @curl_setopt($handle, $option, $value);
        } else {
            curl_setopt($handle, $option, $value);
        }
    }

    // Options applied only when the matching property carries a value.
    if (!empty($this->_CURLOPT_COOKIE)) {
        curl_setopt($handle, CURLOPT_COOKIE, $this->_CURLOPT_COOKIE);
    }
    if (!empty($this->_CURLOPT_REFERER)) {
        curl_setopt($handle, CURLOPT_REFERER, $this->_CURLOPT_REFERER);
    }
    if (strlen($this->_CURLOPT_POSTFIELDS) > 1) {
        curl_setopt($handle, CURLOPT_POST, $this->_CURLOPT_POST);
        curl_setopt($handle, CURLOPT_POSTFIELDS, $this->_CURLOPT_POSTFIELDS);
    }

    // A globally defined user agent replaces the per-object one.
    if (defined('SP_USER_AGENT')) {
        $this->_CURLOPT_USERAGENT = SP_USER_AGENT;
    }
    if (strlen($this->_CURLOPT_USERAGENT) > 0) {
        curl_setopt($handle, CURLOPT_USERAGENT, $this->_CURLOPT_USERAGENT);
    }

    if (strlen($this->_CURLOPT_USERPWD) > 2) {
        curl_setopt($handle, CURLOPT_USERPWD, $this->_CURLOPT_USERPWD);
    }

    // to use proxy if proxy enabled
    if (SP_ENABLE_PROXY && $enableProxy) {
        $proxyCtrler = new ProxyController();
        $proxyInfo = $proxyCtrler->getRandomProxy();
        if (!$proxyInfo) {
            showErrorMsg("No active proxies found!! Please check your proxy settings from Admin Panel.");
        } else {
            $proxyAddress = $proxyInfo['proxy'] . ":" . $proxyInfo['port'];
            curl_setopt($handle, CURLOPT_PROXY, $proxyAddress);
            curl_setopt($handle, CURLOPT_HTTPPROXYTUNNEL, 1);
            if (!empty($proxyInfo['proxy_auth'])) {
                $proxyLogin = $proxyInfo['proxy_username'] . ":" . $proxyInfo['proxy_password'];
                curl_setopt($handle, CURLOPT_PROXYUSERPWD, $proxyLogin);
            }
        }
    }

    // Array elements are evaluated left to right: execute, then read errors.
    return array(
        'page' => curl_exec($handle),
        'error' => curl_errno($handle),
        'errmsg' => curl_error($handle),
    );
}
* (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program; if not, write to the * * Free Software Foundation, Inc., * * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * ***************************************************************************/ include_once "includes/sp-load.php"; checkAdminLoggedIn(); include_once SP_CTRLPATH . "/proxy.ctrl.php"; $controller = new ProxyController(); $controller->view->menu = 'adminpanel'; $controller->layout = 'ajax'; $controller->set('spTextPanel', $controller->getLanguageTexts('panel', $_SESSION['lang_code'])); $controller->spTextProxy = $controller->getLanguageTexts('proxy', $_SESSION['lang_code']); $controller->set('spTextProxy', $controller->spTextProxy); if ($_SERVER['REQUEST_METHOD'] == 'POST') { switch ($_POST['sec']) { case "create": $controller->createProxy($_POST); break; case "update": $controller->updateProxy($_POST); break; } } else {
* (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program; if not, write to the * * Free Software Foundation, Inc., * * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * ***************************************************************************/ include_once "includes/sp-load.php"; checkAdminLoggedIn(); include_once SP_CTRLPATH . "/proxy.ctrl.php"; $controller = new ProxyController(); $controller->view->menu = 'adminpanel'; $controller->layout = 'ajax'; $controller->set('spTextPanel', $controller->getLanguageTexts('panel', $_SESSION['lang_code'])); $controller->spTextProxy = $controller->getLanguageTexts('proxy', $_SESSION['lang_code']); $controller->set('spTextProxy', $controller->spTextProxy); $controller->set('spTextSA', $controller->getLanguageTexts('siteauditor', $_SESSION['lang_code'])); if ($_SERVER['REQUEST_METHOD'] == 'POST') { switch ($_POST['sec']) { case "create": $_POST = sanitizeData($_POST, true, true); $controller->createProxy($_POST); break; case "update": $_POST = sanitizeData($_POST, true, true); $controller->updateProxy($_POST);
* Copyright (C) 2009-2011 by Geo Varghese(www.seofreetools.net) * * sendtogeo@gmail.com * * * * This program is free software; you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation; either version 2 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program; if not, write to the * * Free Software Foundation, Inc., * * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * ***************************************************************************/

include_once "includes/sp-load.php";

// The proxy check only runs when no HTTP request method is present;
// any request that carries one is refused with an error message.
if (!empty($_SERVER['REQUEST_METHOD'])) {
    showErrorMsg("<p style='color:red'>You don't have permission to access this page!</p>");
} else {

    include_once SP_CTRLPATH . "/proxy.ctrl.php";
    $proxyCtrler = new ProxyController();

    // Fetch every proxy returned by __getAllProxys(false) and check each one.
    $proxyList = $proxyCtrler->__getAllProxys(false);
    foreach ($proxyList as $proxyInfo) {
        $proxyCtrler->checkStatus($proxyInfo['id']);
        echo "checking proxy: {$proxyInfo['proxy']}:{$proxyInfo['port']}...\n";
    }
}
/**
 * Crawl the search engines configured for a keyword and collect the result
 * URLs that belong to the keyword's website (or all URLs when $this->showAll
 * is set).
 *
 * For every search engine the result pages are fetched through
 * $this->spider, concatenated, matched against the engine's regex, and the
 * crawl log entry created by the spider is updated with the outcome.
 *
 * When proxies are enabled and at least one engine failed, the whole crawl is
 * retried (bounded by CHECK_MAX_PROXY_COUNT_IF_FAILED via
 * $this->proxyCheckCount) and the retry's result replaces this one.
 *
 * @param array      $keywordInfo     keyword row; reads 'url', 'name',
 *                                    'searchengines' (colon separated ids),
 *                                    'lang_code', 'country_code' and 'id'
 * @param int|string $seId            restrict the crawl to this engine id
 * @param bool       $cron            true when called from the cron job
 * @param bool       $removeDuplicate skip consecutive results of the same
 *                                    host (applied to Google only)
 *
 * @return array keyed by search engine id; each entry has 'status' (1/0) and,
 *               when the regex matched, 'matched' (list of arrays with
 *               'found', 'url', 'title', 'description', 'rank')
 */
function crawlKeyword($keywordInfo, $seId = '', $cron = false, $removeDuplicate = true) {
    $crawlResult = array();
    $websiteUrl = $keywordInfo['url'];
    if (empty($websiteUrl) || empty($keywordInfo['name'])) {
        return $crawlResult;
    }

    $time = mktime(0, 0, 0, date('m'), date('d'), date('Y'));
    $seList = explode(':', $keywordInfo['searchengines']);

    // Set as soon as one engine ends with status 0; drives the proxy retry.
    $crawlFailed = false;

    foreach ($seList as $seInfoId) {

        // function to execute only passed search engine
        if (!empty($seId) && $seInfoId != $seId) {
            continue;
        }

        // if search engine not found continue
        if (empty($this->seList[$seInfoId])) {
            continue;
        }

        $seInfo = $this->seList[$seInfoId];
        $this->seFound = 1;

        // if execution from cron check whether cron already executed
        /*if ($cron) {
            if (SP_MULTIPLE_CRON_EXEC && $this->isCronExecuted($keywordInfo['id'], $seInfoId, $time)) continue;
        }*/

        $searchUrl = str_replace('[--keyword--]', urlencode(stripslashes($keywordInfo['name'])), $seInfo['url']);
        $searchUrl = str_replace('[--lang--]', $keywordInfo['lang_code'], $searchUrl);
        $searchUrl = str_replace('[--country--]', $keywordInfo['country_code'], $searchUrl);
        if (empty($keywordInfo['country_code']) && stristr($searchUrl, '&cr=country&')) {
            $searchUrl = str_replace('&cr=country&', '&cr=&', $searchUrl);
        }
        $seUrl = str_replace('[--start--]', $seInfo['start'], $searchUrl);

        // if google add special parameters
        // NOTE(review): these parameters are only appended to the first page
        // URL; the follow-up page URLs built in the loop below omit them.
        $isGoogle = false;
        if (stristr($seInfo['url'], 'google')) {
            $isGoogle = true;
            $seUrl .= "&ie=utf-8&pws=0&gl=" . $keywordInfo['country_code'];
        }

        // Build the cookie from the template without writing it back to
        // $this->seList, so the [--lang--] placeholder is still there for
        // the next keyword (which may use a different language).
        if (!empty($seInfo['cookie_send'])) {
            $this->spider->_CURLOPT_COOKIE = str_replace('[--lang--]', $keywordInfo['lang_code'], $seInfo['cookie_send']);
        }

        $result = $this->spider->getContent($seUrl);
        $pageContent = $this->formatPageContent($seInfoId, $result['page']);

        $crawlLogCtrl = new CrawlLogController();
        // Start from a clean log record for each engine so a failure status
        // or message from a previous engine cannot leak into this one.
        $crawlInfo = array();
        $crawlInfo['crawl_type'] = 'keyword';
        $crawlInfo['ref_id'] = empty($keywordInfo['id']) ? $keywordInfo['name'] : $keywordInfo['id'];
        $crawlInfo['subject'] = $seInfoId;

        // Fetch the remaining result pages until an error or max_results.
        $seStart = $seInfo['start'] + $seInfo['start_offset'];
        while (empty($result['error']) && $seStart < $seInfo['max_results']) {
            $logId = $result['log_id'];
            $crawlLogCtrl->updateCrawlLog($logId, $crawlInfo);
            sleep(SP_CRAWL_DELAY);
            $seUrl = str_replace('[--start--]', $seStart, $searchUrl);
            $result = $this->spider->getContent($seUrl);
            $pageContent .= $this->formatPageContent($seInfoId, $result['page']);
            $seStart += $seInfo['start_offset'];
        }

        # to check whether utf8 conversion needed
        if (!empty($seInfo['encoding'])) {
            $pageContent = mb_convert_encoding($pageContent, "UTF-8", $seInfo['encoding']);
        }

        $crawlStatus = 0;
        if (empty($result['error'])) {

            // to update cron that report executed for a keyword on a search engine
            if (SP_MULTIPLE_CRON_EXEC && $cron) {
                $this->saveCronTrackInfo($keywordInfo['id'], $seInfoId, $time);
            }

            if (preg_match_all($seInfo['regex'], $pageContent, $matches)) {

                $urlList = $matches[$seInfo['url_index']];
                $crawlResult[$seInfoId]['matched'] = array();
                $rank = 1;
                $previousDomain = "";
                foreach ($urlList as $i => $url) {
                    $url = urldecode(strip_tags($url));

                    // add special condition for baidu
                    if (stristr($seInfo['domain'], "baidu")) {
                        $url = addHttpToUrl($url);
                        $url = str_replace("...", "", $url);
                    }

                    if (!preg_match('/^http:\\/\\/|^https:\\/\\//i', $url)) {
                        continue;
                    }

                    // check for to remove msn ad links in page
                    if (stristr($url, 'r.msn.com')) {
                        continue;
                    }

                    // check to remove duplicates from same domain if google is the search engine
                    if ($removeDuplicate && $isGoogle) {
                        $currentDomain = parse_url($url, PHP_URL_HOST);
                        if ($previousDomain == $currentDomain) {
                            continue;
                        }
                        $previousDomain = $currentDomain;
                    }

                    if ($this->showAll || stristr($url, $websiteUrl)) {

                        // 'found' is only raised in show-all mode, where it
                        // marks the entries that belong to the website.
                        if ($this->showAll && stristr($url, $websiteUrl)) {
                            $matchInfo['found'] = 1;
                        } else {
                            $matchInfo['found'] = 0;
                        }
                        $matchInfo['url'] = $url;
                        $matchInfo['title'] = strip_tags($matches[$seInfo['title_index']][$i]);
                        $matchInfo['description'] = strip_tags($matches[$seInfo['description_index']][$i]);
                        $matchInfo['rank'] = $rank;
                        $crawlResult[$seInfoId]['matched'][] = $matchInfo;
                    }
                    $rank++;
                }
                $crawlStatus = 1;

            } else {

                // set crawl log info
                $crawlInfo['crawl_status'] = 0;
                $crawlInfo['log_message'] = SearchEngineController::isCaptchInSearchResults($pageContent) ? "<font class=error>Captcha found</font> in search result page" : "Regex not matched error occured while parsing search results!";

                if (SP_DEBUG) {
                    echo "<p class='note' style='text-align:left;'>Error occured while parsing {$seUrl} " . formatErrorMsg("Regex not matched <br>\n") . "</p>";
                }
            }

        } else {
            if (SP_DEBUG) {
                echo "<p class='note' style='text-align:left;'>Error occured while crawling {$seUrl} " . formatErrorMsg($result['errmsg'] . "<br>\n") . "</p>";
            }
        }

        $crawlResult[$seInfoId]['status'] = $crawlStatus;
        if (!$crawlStatus) {
            $crawlFailed = true;
        }
        sleep(SP_CRAWL_DELAY);

        // update crawl log
        $logId = $result['log_id'];
        $crawlLogCtrl->updateCrawlLog($logId, $crawlInfo);
    }

    // If proxies are enabled and a crawl failed, try again with another
    // proxy. The failure check matters: without it every successful crawl
    // was repeated until the proxy check limit was reached.
    if ($crawlFailed && SP_ENABLE_PROXY && CHECK_WITH_ANOTHER_PROXY_IF_FAILED) {

        // max proxy checked in one execution is not exceeded yet
        if ($this->proxyCheckCount < CHECK_MAX_PROXY_COUNT_IF_FAILED) {

            // if proxy is available for execution
            $proxyCtrler = new ProxyController();
            if ($proxyInfo = $proxyCtrler->getRandomProxy()) {
                $this->proxyCheckCount++;
                sleep(SP_CRAWL_DELAY);
                $crawlResult = $this->crawlKeyword($keywordInfo, $seId, $cron, $removeDuplicate);
            }

        } else {
            $this->proxyCheckCount = 1;
        }
    }

    return $crawlResult;
}
/**
 * Request a URL with the shared cURL handle, optionally through a random
 * proxy, retrying with another proxy when the request fails.
 *
 * On a failed proxied request the proxy is deactivated when
 * PROXY_DEACTIVATE_CRAWL is set, and the request is repeated when
 * CHECK_WITH_ANOTHER_PROXY_IF_FAILED is set. Both steps print a progress
 * line.
 *
 * @param string $url         URL to request
 * @param bool   $enableProxy allow routing through a proxy
 *
 * @return array 'page' (value returned by curl_exec), 'error' (curl_errno)
 *               and 'errmsg' (curl_error)
 */
function getContent($url, $enableProxy = true) {
    $ch = $this->_CURL_RESOURCE;

    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_FAILONERROR, $this->_CURLOPT_FAILONERROR);
    // Warnings are suppressed for this option only.
    @curl_setopt($ch, CURLOPT_FOLLOWLOCATION, $this->_CURLOPT_FOLLOWLOCATION);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, $this->_CURLOPT_RETURNTRANSFER);
    curl_setopt($ch, CURLOPT_TIMEOUT, $this->_CURLOPT_TIMEOUT);
    curl_setopt($ch, CURLOPT_COOKIEJAR, $this->_CURLOPT_COOKIEJAR);
    curl_setopt($ch, CURLOPT_COOKIEFILE, $this->_CURLOPT_COOKIEFILE);
    curl_setopt($ch, CURLOPT_HEADER, $this->_CURLOPT_HEADER);

    if (!empty($this->_CURLOPT_COOKIE)) {
        curl_setopt($ch, CURLOPT_COOKIE, $this->_CURLOPT_COOKIE);
    }

    if (!empty($this->_CURLOPT_REFERER)) {
        curl_setopt($ch, CURLOPT_REFERER, $this->_CURLOPT_REFERER);
    }

    if (strlen($this->_CURLOPT_POSTFIELDS) > 1) {
        curl_setopt($ch, CURLOPT_POST, $this->_CURLOPT_POST);
        curl_setopt($ch, CURLOPT_POSTFIELDS, $this->_CURLOPT_POSTFIELDS);
    }

    // user agent assignment: the global constant wins when it is defined
    if (defined('SP_USER_AGENT')) {
        $this->_CURLOPT_USERAGENT = SP_USER_AGENT;
    }
    if (strlen($this->_CURLOPT_USERAGENT) > 0) {
        curl_setopt($ch, CURLOPT_USERAGENT, $this->_CURLOPT_USERAGENT);
    }

    if (strlen($this->_CURLOPT_USERPWD) > 2) {
        curl_setopt($ch, CURLOPT_USERPWD, $this->_CURLOPT_USERPWD);
    }

    // to use proxy if proxy enabled
    $useProxy = SP_ENABLE_PROXY && $enableProxy;
    if ($useProxy) {
        $proxyCtrler = new ProxyController();
        $proxyInfo = $proxyCtrler->getRandomProxy();
        if ($proxyInfo) {
            curl_setopt($ch, CURLOPT_PROXY, $proxyInfo['proxy'] . ":" . $proxyInfo['port']);
            curl_setopt($ch, CURLOPT_HTTPPROXYTUNNEL, CURLOPT_HTTPPROXYTUNNEL_VAL);
            if (!empty($proxyInfo['proxy_auth'])) {
                curl_setopt($ch, CURLOPT_PROXYUSERPWD, $proxyInfo['proxy_username'] . ":" . $proxyInfo['proxy_password']);
            }
        } else {
            showErrorMsg("No active proxies found!! Please check your proxy settings from Admin Panel.");
        }
    }

    $ret = array();
    $ret['page'] = curl_exec($ch);
    $ret['error'] = curl_errno($ch);
    $ret['errmsg'] = curl_error($ch);

    // Nothing more to do unless a proxied request actually failed.
    $proxyFailed = $useProxy && !empty($ret['error']) && !empty($proxyInfo['id']);
    if (!$proxyFailed) {
        return $ret;
    }

    // deactivate proxy
    if (PROXY_DEACTIVATE_CRAWL) {
        echo "Deactivating proxy - " . $proxyInfo['proxy'] . "....<br>\n";
        $proxyCtrler->__changeStatus($proxyInfo['id'], 0);
    }

    // check with another proxy
    if (CHECK_WITH_ANOTHER_PROXY_IF_FAILED) {
        echo "Checking with another proxy....<br>\n";
        return $this->getContent($url, $enableProxy);
    }

    return $ret;
}