Esempio n. 1
0
 /**
  * Download a URL with the shared cURL handle and return the raw outcome.
  *
  * Configures the handle from this object's _CURLOPT_* properties, optionally
  * routes the request through a proxy picked by ProxyController, records the
  * attempt through CrawlLogController and, when a proxied request fails, can
  * deactivate that proxy and retry the request.
  *
  * @param string $url         Address to request.
  * @param bool   $enableProxy Use a proxy when SP_ENABLE_PROXY is also enabled.
  * @param bool   $logCrawl    Create a crawl log entry for the request.
  *
  * @return array 'page' (curl_exec() result), 'error' (curl_errno()),
  *               'errmsg' (curl_error()) and, only when $logCrawl is true,
  *               'log_id' (value returned by createCrawlLog()).
  */
 function getContent($url, $enableProxy = true, $logCrawl = true)
 {
     $ret = array();
     // Defined up front so the logging and retry code below can safely read
     // them even when no proxy was requested or none was available.
     $proxyInfo = array();
     $proxyCtrler = null;

     curl_setopt($this->_CURL_RESOURCE, CURLOPT_URL, $url);
     curl_setopt($this->_CURL_RESOURCE, CURLOPT_FAILONERROR, $this->_CURLOPT_FAILONERROR);
     // Warning suppressed on purpose by the original code for this option.
     @curl_setopt($this->_CURL_RESOURCE, CURLOPT_FOLLOWLOCATION, $this->_CURLOPT_FOLLOWLOCATION);
     curl_setopt($this->_CURL_RESOURCE, CURLOPT_RETURNTRANSFER, $this->_CURLOPT_RETURNTRANSFER);
     curl_setopt($this->_CURL_RESOURCE, CURLOPT_TIMEOUT, $this->_CURLOPT_TIMEOUT);
     curl_setopt($this->_CURL_RESOURCE, CURLOPT_COOKIEJAR, $this->_CURLOPT_COOKIEJAR);
     curl_setopt($this->_CURL_RESOURCE, CURLOPT_COOKIEFILE, $this->_CURLOPT_COOKIEFILE);
     curl_setopt($this->_CURL_RESOURCE, CURLOPT_HEADER, $this->_CURLOPT_HEADER);
     if (!empty($this->_CURLOPT_COOKIE)) {
         curl_setopt($this->_CURL_RESOURCE, CURLOPT_COOKIE, $this->_CURLOPT_COOKIE);
     }
     if (!empty($this->_CURLOPT_REFERER)) {
         curl_setopt($this->_CURL_RESOURCE, CURLOPT_REFERER, $this->_CURLOPT_REFERER);
     }
     // Only switch to POST when there is a non-trivial body to send.
     if (strlen($this->_CURLOPT_POSTFIELDS) > 1) {
         curl_setopt($this->_CURL_RESOURCE, CURLOPT_POST, $this->_CURLOPT_POST);
         curl_setopt($this->_CURL_RESOURCE, CURLOPT_POSTFIELDS, $this->_CURLOPT_POSTFIELDS);
     }
     // A globally defined SP_USER_AGENT overrides the per-object user agent.
     $this->_CURLOPT_USERAGENT = defined('SP_USER_AGENT') ? SP_USER_AGENT : $this->_CURLOPT_USERAGENT;
     if (strlen($this->_CURLOPT_USERAGENT) > 0) {
         curl_setopt($this->_CURL_RESOURCE, CURLOPT_USERAGENT, $this->_CURLOPT_USERAGENT);
     }
     if (strlen($this->_CURLOPT_USERPWD) > 2) {
         curl_setopt($this->_CURL_RESOURCE, CURLOPT_USERPWD, $this->_CURLOPT_USERPWD);
     }

     // Route through a random proxy when proxies are enabled globally and for this call.
     if (SP_ENABLE_PROXY && $enableProxy) {
         $proxyCtrler = new ProxyController();
         if ($proxyInfo = $proxyCtrler->getRandomProxy()) {
             curl_setopt($this->_CURL_RESOURCE, CURLOPT_PROXY, $proxyInfo['proxy'] . ":" . $proxyInfo['port']);
             curl_setopt($this->_CURL_RESOURCE, CURLOPT_HTTPPROXYTUNNEL, CURLOPT_HTTPPROXYTUNNEL_VAL);
             if (!empty($proxyInfo['proxy_auth'])) {
                 curl_setopt($this->_CURL_RESOURCE, CURLOPT_PROXYUSERPWD, $proxyInfo['proxy_username'] . ":" . $proxyInfo['proxy_password']);
             }
         } else {
             showErrorMsg("No active proxies found!! Please check your proxy settings from Admin Panel.");
         }
     }

     $ret['page'] = curl_exec($this->_CURL_RESOURCE);
     $ret['error'] = curl_errno($this->_CURL_RESOURCE);
     $ret['errmsg'] = curl_error($this->_CURL_RESOURCE);

     // Record the attempt in the crawl log for future reference.
     if ($logCrawl) {
         $crawlLogCtrl = new CrawlLogController();
         $crawlInfo = array();
         $crawlInfo['crawl_status'] = $ret['error'] ? 0 : 1;
         $crawlInfo['ref_id'] = $crawlInfo['crawl_link'] = addslashes($url);
         $crawlInfo['crawl_referer'] = addslashes($this->_CURLOPT_REFERER);
         $crawlInfo['crawl_cookie'] = addslashes($this->_CURLOPT_COOKIE);
         $crawlInfo['crawl_post_fields'] = addslashes($this->_CURLOPT_POSTFIELDS);
         $crawlInfo['crawl_useragent'] = addslashes($this->_CURLOPT_USERAGENT);
         // No proxy used (or none found) keeps the previous value of null.
         $crawlInfo['proxy_id'] = isset($proxyInfo['id']) ? $proxyInfo['id'] : null;
         $crawlInfo['log_message'] = addslashes($ret['errmsg']);
         $ret['log_id'] = $crawlLogCtrl->createCrawlLog($crawlInfo);
     }

     // React to a failed request that went through a known proxy.
     if (SP_ENABLE_PROXY && $enableProxy && !empty($ret['error']) && !empty($proxyInfo['id'])) {
         // Deactivate the proxy that just failed.
         if (PROXY_DEACTIVATE_CRAWL) {
             $proxyCtrler->__changeStatus($proxyInfo['id'], 0);
         }
         // Retry with another proxy, keeping the caller's logging preference.
         // NOTE(review): the retry depth is unbounded; if failing proxies are
         // not deactivated this can recurse for as long as requests keep failing.
         if (CHECK_WITH_ANOTHER_PROXY_IF_FAILED) {
             $ret = $this->getContent($url, $enableProxy, $logCrawl);
         }
     }
     return $ret;
 }
Esempio n. 2
0
 /**
  * Fetch the backlink count reported by one search engine for $this->url.
  *
  * Supported engines are 'google', 'msn' and 'alexa'. The result page is
  * downloaded through $this->spider and the count is extracted with a list
  * of regular expressions tried in order. The crawl log entry created by the
  * spider is then tagged as a 'backlink' crawl.
  *
  * @param string $engine Engine key; also used as index into $this->backUrlList.
  *
  * @return int|string Count with thousands separators removed, or 0 when in
  *                    demo mode, the engine is unsupported or nothing matched.
  */
 function __getBacklinks($engine)
 {
     if (SP_DEMO && !empty($_SERVER['REQUEST_METHOD'])) {
         return 0;
     }
     $backlinkCount = 0;
     // Initialised so an unsupported engine does not leave these undefined.
     $v = array();
     $crawlInfo = array();
     $r = array();
     switch ($engine) {
         # google
         case 'google':
             $url = $this->backUrlList[$engine] . urlencode($this->url);
             $v = $this->spider->getContent($url);
             $pageContent = empty($v['page']) ? '' : $v['page'];
             // Tried in order; the first pattern that matches wins.
             $patterns = array(
                 '/about ([0-9\\,]+) result/si',
                 '/<div id=resultStats>([0-9\\,]+) result/si',
                 '/([0-9\\,]+) result/si',
                 '/about <b>([0-9\\,]+)<\\/b> linking/si',
             );
             $matched = false;
             foreach ($patterns as $pattern) {
                 if (preg_match($pattern, $pageContent, $r)) {
                     $matched = true;
                     break;
                 }
             }
             if (!$matched) {
                 $crawlInfo['crawl_status'] = 0;
                 $crawlInfo['log_message'] = "Regex not matched error occured while parsing search results!";
             }
             $backlinkCount = !empty($r[1]) ? str_replace(',', '', $r[1]) : 0;
             break;
         # msn
         case 'msn':
             $url = formatUrl($this->url, false);
             $url = $this->backUrlList[$engine] . urlencode(addHttpToUrl($url));
             $v = $this->spider->getContent($url);
             $pageContent = empty($v['page']) ? '' : $v['page'];
             // Tried in order; the first pattern that matches wins.
             $patterns = array(
                 '/([0-9\\,]+) results/si',
                 '/id="count".*?>.*?\\(([0-9\\,]+).*?\\)/si',
                 '/id="count".*?>.*?([0-9\\,]+).*?/si',
             );
             $matched = false;
             foreach ($patterns as $pattern) {
                 if (preg_match($pattern, $pageContent, $r)) {
                     $matched = true;
                     break;
                 }
             }
             if (!$matched) {
                 $crawlInfo['crawl_status'] = 0;
                 $crawlInfo['log_message'] = "Regex not matched error occured while parsing search results!";
             }
             $backlinkCount = !empty($r[1]) ? str_replace(',', '', $r[1]) : 0;
             break;
         # alexa
         case 'alexa':
             $url = 'http://data.alexa.com/data?cli=10&dat=snbamz&url=' . urlencode($this->url);
             $v = $this->spider->getContent($url);
             $pageContent = empty($v['page']) ? '' : $v['page'];
             if (preg_match('/<LINKSIN NUM="(.*?)"/si', $pageContent, $r)) {
                 $backlinkCount = !empty($r[1]) ? intval($r[1]) : 0;
             } else {
                 $crawlInfo['crawl_status'] = 0;
                 $crawlInfo['log_message'] = "Regex not matched error occured while parsing search results!";
             }
             break;
     }
     // Tag the crawl log entry created by the spider; skipped when nothing
     // was crawled (unsupported engine) or no log entry exists.
     if (!empty($v['log_id'])) {
         $crawlLogCtrl = new CrawlLogController();
         $crawlInfo['crawl_type'] = 'backlink';
         $crawlInfo['ref_id'] = $this->url;
         $crawlInfo['subject'] = $engine;
         $crawlLogCtrl->updateCrawlLog($v['log_id'], $crawlInfo);
     }
     return $backlinkCount;
 }
Esempio n. 3
0
 *   This program is distributed in the hope that it will be useful,       *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
 *   GNU General Public License for more details.                          *
 *                                                                         *
 *   You should have received a copy of the GNU General Public License     *
 *   along with this program; if not, write to the                         *
 *   Free Software Foundation, Inc.,                                       *
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
 ***************************************************************************/
include_once "includes/sp-load.php";
checkAdminLoggedIn();
include_once SP_CTRLPATH . "/crawllog.ctrl.php";
include_once SP_CTRLPATH . "/keyword.ctrl.php";
include_once SP_CTRLPATH . "/searchengine.ctrl.php";
// Set up the crawl log controller that serves the requests handled below.
$controller = new CrawlLogController();
$controller->view->menu = 'adminpanel';
$controller->layout = 'ajax';
// Pass the 'panel' and 'log' language texts for the session's language code to the view.
$controller->set('spTextPanel', $controller->getLanguageTexts('panel', $_SESSION['lang_code']));
// The 'log' texts are also stored on the controller itself.
$controller->spTextLog = $controller->getLanguageTexts('log', $_SESSION['lang_code']);
$controller->set('spTextLog', $controller->spTextLog);
if ($_SERVER['REQUEST_METHOD'] == 'POST') {
    switch ($_POST['sec']) {
        case "delete_all_crawl_log":
            if (!empty($_POST['ids'])) {
                foreach ($_POST['ids'] as $id) {
                    $controller->deleteCrawlLog($id);
                }
            }
            $controller->listCrawlLog($_POST);
            break;
Esempio n. 4
0
 /**
  * Crawl the configured search engines for a keyword and collect result URLs.
  *
  * For every search engine id listed in $keywordInfo['searchengines'] (colon
  * separated) the search URL is built from the engine template in
  * $this->seList, result pages are downloaded through $this->spider until
  * the engine's 'max_results' is reached or a request fails, and the
  * combined content is parsed with the engine's regex.
  *
  * @param array      $keywordInfo     Keyword row; reads 'url', 'name', 'searchengines',
  *                                    'lang_code', 'country_code' and 'id'.
  * @param int|string $seId            When not empty, only this search engine id is crawled.
  * @param bool       $cron            True when called from cron; with SP_MULTIPLE_CRON_EXEC
  *                                    enabled the execution is recorded via saveCronTrackInfo().
  * @param bool       $removeDuplicate Drop consecutive results of the same host for Google.
  *
  * @return array Keyed by search engine id. Each entry has 'status' (1 when the
  *               regex matched, otherwise 0) and, when it matched, 'matched': a list
  *               of arrays with 'found', 'url', 'title', 'description' and 'rank'.
  */
 function crawlKeyword($keywordInfo, $seId = '', $cron = false, $removeDuplicate = true)
 {
     $crawlResult = array();
     $websiteUrl = formatUrl($keywordInfo['url'], false);
     if (empty($websiteUrl)) {
         return $crawlResult;
     }
     if (empty($keywordInfo['name'])) {
         return $crawlResult;
     }
     // Midnight of the current day; used as the cron tracking timestamp.
     $time = mktime(0, 0, 0, date('m'), date('d'), date('Y'));
     $seList = explode(':', $keywordInfo['searchengines']);
     foreach ($seList as $seInfoId) {
         if (!empty($seId) && $seInfoId != $seId) {
             continue;
         }
         $this->seFound = 1;
         // if execution from cron check whether cron already executed
         /*if ($cron) {
         		    if (SP_MULTIPLE_CRON_EXEC && $this->isCronExecuted($keywordInfo['id'], $seInfoId, $time)) continue;
         		}*/
         // Fill the placeholders of the engine's URL template.
         $searchUrl = str_replace('[--keyword--]', urlencode(stripslashes($keywordInfo['name'])), $this->seList[$seInfoId]['url']);
         $searchUrl = str_replace('[--lang--]', $keywordInfo['lang_code'], $searchUrl);
         $searchUrl = str_replace('[--country--]', $keywordInfo['country_code'], $searchUrl);
         if (empty($keywordInfo['country_code']) && stristr($searchUrl, '&cr=country&')) {
             $searchUrl = str_replace('&cr=country&', '&cr=&', $searchUrl);
         }
         $seUrl = str_replace('[--start--]', $this->seList[$seInfoId]['start'], $searchUrl);
         // Google gets extra parameters appended to the first page URL.
         $isGoogle = false;
         if (stristr($this->seList[$seInfoId]['url'], 'google')) {
             $isGoogle = true;
             $seUrl .= "&ie=utf-8&pws=0&gl=" . $keywordInfo['country_code'];
         }
         if (!empty($this->seList[$seInfoId]['cookie_send'])) {
             $this->seList[$seInfoId]['cookie_send'] = str_replace('[--lang--]', $keywordInfo['lang_code'], $this->seList[$seInfoId]['cookie_send']);
             $this->spider->_CURLOPT_COOKIE = $this->seList[$seInfoId]['cookie_send'];
         }
         $result = $this->spider->getContent($seUrl);
         $pageContent = $this->formatPageContent($seInfoId, $result['page']);
         $crawlLogCtrl = new CrawlLogController();
         // Rebuilt for every search engine so a failure status or message from a
         // previous engine cannot leak into this engine's log entries.
         $crawlInfo = array(
             'crawl_type' => 'keyword',
             'ref_id' => empty($keywordInfo['id']) ? $keywordInfo['name'] : $keywordInfo['id'],
             'subject' => $seInfoId,
         );
         // Fetch the following result pages. A non-positive offset would never
         // advance $seStart, so pagination is skipped in that case.
         $startOffset = $this->seList[$seInfoId]['start_offset'];
         $seStart = $this->seList[$seInfoId]['start'] + $startOffset;
         while ($startOffset > 0 && empty($result['error']) && $seStart < $this->seList[$seInfoId]['max_results']) {
             // Tag the log entry of the page fetched just before this one.
             $logId = $result['log_id'];
             $crawlLogCtrl->updateCrawlLog($logId, $crawlInfo);
             sleep(SP_CRAWL_DELAY);
             $seUrl = str_replace('[--start--]', $seStart, $searchUrl);
             $result = $this->spider->getContent($seUrl);
             $pageContent .= $this->formatPageContent($seInfoId, $result['page']);
             $seStart += $startOffset;
         }
         # convert to UTF-8 when the engine declares another encoding
         if (!empty($this->seList[$seInfoId]['encoding'])) {
             $pageContent = mb_convert_encoding($pageContent, "UTF-8", $this->seList[$seInfoId]['encoding']);
         }
         $crawlStatus = 0;
         if (empty($result['error'])) {
             // Record that the report ran for this keyword on this search engine.
             if (SP_MULTIPLE_CRON_EXEC && $cron) {
                 $this->saveCronTrackInfo($keywordInfo['id'], $seInfoId, $time);
             }
             if (preg_match_all($this->seList[$seInfoId]['regex'], $pageContent, $matches)) {
                 $urlList = $matches[$this->seList[$seInfoId]['url_index']];
                 $crawlResult[$seInfoId]['matched'] = array();
                 $rank = 1;
                 $previousDomain = "";
                 foreach ($urlList as $i => $url) {
                     $url = urldecode(strip_tags($url));
                     // Special handling for baidu result URLs.
                     if (stristr($this->seList[$seInfoId]['domain'], "baidu")) {
                         $url = addHttpToUrl($url);
                         $url = str_replace("...", "", $url);
                     }
                     // Only absolute http(s) URLs count as results.
                     if (!preg_match('/^http:\\/\\/|^https:\\/\\//i', $url)) {
                         continue;
                     }
                     // Skip msn ad links.
                     if (stristr($url, 'r.msn.com')) {
                         continue;
                     }
                     // For Google, skip a result whose host equals the previous result's host.
                     if ($removeDuplicate && $isGoogle) {
                         $currentDomain = parse_url($url, PHP_URL_HOST);
                         if ($previousDomain == $currentDomain) {
                             continue;
                         }
                         $previousDomain = $currentDomain;
                     }
                     if ($this->showAll || stristr($url, $websiteUrl)) {
                         // 'found' is only set to 1 in show-all mode, marking the
                         // entries that belong to the website.
                         if ($this->showAll && stristr($url, $websiteUrl)) {
                             $matchInfo['found'] = 1;
                         } else {
                             $matchInfo['found'] = 0;
                         }
                         $matchInfo['url'] = $url;
                         $matchInfo['title'] = strip_tags($matches[$this->seList[$seInfoId]['title_index']][$i]);
                         $matchInfo['description'] = strip_tags($matches[$this->seList[$seInfoId]['description_index']][$i]);
                         $matchInfo['rank'] = $rank;
                         $crawlResult[$seInfoId]['matched'][] = $matchInfo;
                     }
                     $rank++;
                 }
                 $crawlStatus = 1;
             } else {
                 // Mark the crawl as failed in the log and say why.
                 $crawlInfo['crawl_status'] = 0;
                 $crawlInfo['log_message'] = SearchEngineController::isCaptchInSearchResults($pageContent) ? "<font class=error>Captcha found</font> in search result page" : "Regex not matched error occured while parsing search results!";
                 if (SP_DEBUG) {
                     echo "<p class='note' style='text-align:left;'>Error occured while parsing {$seUrl} " . formatErrorMsg("Regex not matched <br>\n") . "</p>";
                 }
             }
         } else {
             if (SP_DEBUG) {
                 echo "<p class='note' style='text-align:left;'>Error occured while crawling {$seUrl} " . formatErrorMsg($result['errmsg'] . "<br>\n") . "</p>";
             }
         }
         $crawlResult[$seInfoId]['status'] = $crawlStatus;
         sleep(SP_CRAWL_DELAY);
         // Tag the log entry of the last page fetched for this engine.
         $logId = $result['log_id'];
         $crawlLogCtrl->updateCrawlLog($logId, $crawlInfo);
     }
     return $crawlResult;
 }
 /**
  * Fetch the number of indexed pages a search engine reports for $this->url.
  *
  * Supported engines are 'google' and 'msn'. The result page is downloaded
  * through $this->spider and the count is extracted with a list of regular
  * expressions tried in order. The crawl log entry created by the spider is
  * then tagged as a 'saturation' crawl.
  *
  * @param string $engine Engine key; also used as index into $this->saturationUrlList.
  *
  * @return int|string Count with thousands separators removed, or 0 when in
  *                    demo mode, the engine is unsupported or nothing matched.
  */
 function __getSaturationRank($engine)
 {
     if (SP_DEMO && !empty($_SERVER['REQUEST_METHOD'])) {
         return 0;
     }
     $saturationCount = 0;
     // Initialised so an unsupported engine does not leave these undefined.
     $v = array();
     $crawlInfo = array();
     $r = array();
     switch ($engine) {
         # google
         case 'google':
             $url = $this->saturationUrlList[$engine] . urlencode($this->url);
             $v = $this->spider->getContent($url);
             $pageContent = empty($v['page']) ? '' : $v['page'];
             // Tried in order; the first pattern that matches wins.
             $patterns = array(
                 '/about ([0-9\\,]+) result/si',
                 '/<div id=resultStats>([0-9\\,]+) result/si',
                 '/([0-9\\,]+) result/si',
                 '/about <b>([0-9\\,]+)<\\/b> from/si',
                 '/of <b>([0-9\\,]+)<\\/b>/si',
             );
             $matched = false;
             foreach ($patterns as $pattern) {
                 if (preg_match($pattern, $pageContent, $r)) {
                     $matched = true;
                     break;
                 }
             }
             if (!$matched) {
                 $crawlInfo['crawl_status'] = 0;
                 $crawlInfo['log_message'] = "Regex not matched error occured while parsing search results!";
             }
             $saturationCount = !empty($r[1]) ? str_replace(',', '', $r[1]) : 0;
             break;
         # msn
         case 'msn':
             $url = $this->saturationUrlList[$engine] . urlencode(addHttpToUrl($this->url));
             $v = $this->spider->getContent($url);
             $pageContent = empty($v['page']) ? '' : $v['page'];
             // Tried in order; the first pattern that matches wins.
             $patterns = array(
                 '/([0-9\\,]+) results/si',
                 '/id="count".*?>.*?\\(([0-9\\,]+).*?\\)/si',
                 '/id="count".*?>.*?([0-9\\,]+).*?/si',
             );
             $matched = false;
             foreach ($patterns as $pattern) {
                 if (preg_match($pattern, $pageContent, $r)) {
                     $matched = true;
                     break;
                 }
             }
             if (!$matched) {
                 $crawlInfo['crawl_status'] = 0;
                 $crawlInfo['log_message'] = "Regex not matched error occured while parsing search results!";
             }
             $saturationCount = !empty($r[1]) ? str_replace(',', '', $r[1]) : 0;
             break;
     }
     // Tag the crawl log entry created by the spider; skipped when nothing
     // was crawled (unsupported engine) or no log entry exists.
     if (!empty($v['log_id'])) {
         $crawlLogCtrl = new CrawlLogController();
         $crawlInfo['crawl_type'] = 'saturation';
         $crawlInfo['ref_id'] = $this->url;
         $crawlInfo['subject'] = $engine;
         $crawlLogCtrl->updateCrawlLog($v['log_id'], $crawlInfo);
     }
     return $saturationCount;
 }
Esempio n. 6
0
    } else {
        switch ($_GET['sec']) {
            case "generate":
                $controller->routeCronJob($_GET['website_id'], $_GET['repTools']);
                break;
            case "croncommand":
                $controller->showCronCommand();
                break;
            default:
                $controller->showReportGenerationManager();
                break;
        }
    }
} else {
    # the section for generate reports using system cron job
    include_once "includes/sp-load.php";
    include_once SP_CTRLPATH . "/cron.ctrl.php";
    include_once SP_CTRLPATH . "/report.ctrl.php";
    include_once SP_CTRLPATH . "/searchengine.ctrl.php";
    include_once SP_CTRLPATH . "/keyword.ctrl.php";
    $controller = new CronController();
    $controller->timeStamp = mktime(0, 0, 0, date('m'), date('d'), date('Y'));
    $includeList = array();
    // the only included seo tools id
    $controller->executeCron($includeList);
    // delete crawl logs before 2 months
    include_once SP_CTRLPATH . "/crawllog.ctrl.php";
    $crawlLog = new CrawlLogController();
    $crawlLog->clearCrawlLog(SP_CRAWL_LOG_CLEAR_TIME);
    echo "Clearing crawl logs before " . SP_CRAWL_LOG_CLEAR_TIME . " days";
}
Esempio n. 7
0
 /**
  * Look up the Alexa popularity rank of a website.
  *
  * Requests the Alexa data feed for the URL through $this->spider, reads the
  * rank from the TEXT attribute of the popularity element and tags the crawl
  * log entry created by the spider as a 'rank' crawl for subject "alexa".
  *
  * @param string $url Website address to look up.
  *
  * @return int|string Rank as captured from the feed, or 0 in demo mode or
  *                    when no rank could be read.
  */
 function __getAlexaRank($url)
 {
     $isDemoRequest = SP_DEMO && !empty($_SERVER['REQUEST_METHOD']);
     if ($isDemoRequest) {
         return 0;
     }

     $targetSite = $url;
     $feedUrl = 'http://data.alexa.com/data?cli=10&dat=snbamz&url=' . urlencode($targetSite);
     $response = $this->spider->getContent($feedUrl);

     $alexaRank = 0;
     $logFields = array();

     // Parse the rank only when the feed returned content.
     if (!empty($response['page'])) {
         $found = preg_match('/\\<popularity url\\="(.*?)" TEXT\\="([0-9]+)"/si', $response['page'], $parts);
         if ($found) {
             if (!empty($parts[2])) {
                 $alexaRank = $parts[2];
             }
         } else {
             // Record why parsing failed: captcha page or unexpected markup.
             $logFields['crawl_status'] = 0;
             if (SearchEngineController::isCaptchInSearchResults($response['page'])) {
                 $logFields['log_message'] = "<font class=error>Captcha found</font> in search result page";
             } else {
                 $logFields['log_message'] = "Regex not matched error occured while parsing search results!";
             }
         }
     }

     // Tag the crawl log entry created by the spider.
     $logFields['crawl_type'] = 'rank';
     $logFields['ref_id'] = $targetSite;
     $logFields['subject'] = "alexa";
     $logWriter = new CrawlLogController();
     $logWriter->updateCrawlLog($response['log_id'], $logFields);

     return $alexaRank;
 }