/**
 * Query a search engine for the number of backlinks reported for $this->url.
 *
 * The result page is fetched through $this->spider and the count is read from
 * the first pattern (tried in order) that matches the page. When no pattern
 * matches, the crawl log entry identified by the spider's 'log_id' is flagged
 * with crawl_status = 0 and an explanatory log message.
 *
 * @param string $engine Engine key; 'google', 'msn' and 'alexa' are handled.
 *
 * @return int|string Backlink count: a digit string with commas stripped for
 *                    google/msn, an integer for alexa, and 0 when nothing
 *                    could be parsed, the engine is not handled, or the demo
 *                    guard is active.
 */
function __getBacklinks($engine)
 {
     // never crawl from a web request while SP_DEMO is enabled
     if (SP_DEMO && !empty($_SERVER['REQUEST_METHOD'])) {
         return 0;
     }
     $backlinkCount = 0;

     // choose the request url and the ordered list of patterns for the engine
     switch ($engine) {
         # google
         case 'google':
             $url = $this->backUrlList[$engine] . urlencode($this->url);
             $patternList = array(
                 '/about ([0-9\\,]+) result/si',
                 '/<div id=resultStats>([0-9\\,]+) result/si',
                 '/([0-9\\,]+) result/si',
                 '/about <b>([0-9\\,]+)<\\/b> linking/si',
             );
             $stripCommas = true;
             break;
         # msn
         case 'msn':
             $url = formatUrl($this->url, false);
             $url = $this->backUrlList[$engine] . urlencode(addHttpToUrl($url));
             $patternList = array(
                 '/([0-9\\,]+) results/si',
                 '/id="count".*?>.*?\\(([0-9\\,]+).*?\\)/si',
                 '/id="count".*?>.*?([0-9\\,]+).*?/si',
             );
             $stripCommas = true;
             break;
         # alexa
         case 'alexa':
             $url = 'http://data.alexa.com/data?cli=10&dat=snbamz&url=' . urlencode($this->url);
             $patternList = array('/<LINKSIN NUM="(.*?)"/si');
             $stripCommas = false;
             break;
         default:
             // unhandled engine: nothing was requested, so there is no
             // result to parse and no crawl log entry to update
             return $backlinkCount;
     }

     $v = $this->spider->getContent($url);
     $pageContent = empty($v['page']) ? '' : $v['page'];

     // stop at the first pattern that matches the result page
     $crawlInfo = array();
     $r = array();
     $matched = false;
     foreach ($patternList as $pattern) {
         if (preg_match($pattern, $pageContent, $r)) {
             $matched = true;
             break;
         }
     }

     if (!$matched) {
         $crawlInfo['crawl_status'] = 0;
         $crawlInfo['log_message'] = "Regex not matched error occured while parsing search results!";
     } elseif (!empty($r[1])) {
         // google/msn print thousands separators, alexa is cast to an integer
         $backlinkCount = $stripCommas ? str_replace(',', '', $r[1]) : intval($r[1]);
     }

     // update crawl log; skipped when the spider returned no log id to update
     if (isset($v['log_id'])) {
         $crawlLogCtrl = new CrawlLogController();
         $crawlInfo['crawl_type'] = 'backlink';
         $crawlInfo['ref_id'] = $this->url;
         $crawlInfo['subject'] = $engine;
         $crawlLogCtrl->updateCrawlLog($v['log_id'], $crawlInfo);
     }
     return $backlinkCount;
 }
 /**
  * Query a search engine for the result count reported for $this->url
  * (stored by this class as the "saturation" figure).
  *
  * The result page is fetched through $this->spider and the count is read
  * from the first pattern (tried in order) that matches the page. When no
  * pattern matches, the crawl log entry identified by the spider's 'log_id'
  * is flagged with crawl_status = 0 and an explanatory log message.
  *
  * @param string $engine Engine key; 'google' and 'msn' are handled.
  *
  * @return int|string Count as a digit string with commas stripped, or 0 when
  *                    nothing could be parsed, the engine is not handled, or
  *                    the demo guard is active.
  */
 function __getSaturationRank($engine)
 {
     // never crawl from a web request while SP_DEMO is enabled
     if (SP_DEMO && !empty($_SERVER['REQUEST_METHOD'])) {
         return 0;
     }
     $saturationCount = 0;

     // choose the request url and the ordered list of patterns for the engine
     switch ($engine) {
         # google
         case 'google':
             $url = $this->saturationUrlList[$engine] . urlencode($this->url);
             $patternList = array(
                 '/about ([0-9\\,]+) result/si',
                 '/<div id=resultStats>([0-9\\,]+) result/si',
                 '/([0-9\\,]+) result/si',
                 '/about <b>([0-9\\,]+)<\\/b> from/si',
                 '/of <b>([0-9\\,]+)<\\/b>/si',
             );
             break;
         # msn
         case 'msn':
             $url = $this->saturationUrlList[$engine] . urlencode(addHttpToUrl($this->url));
             $patternList = array(
                 '/([0-9\\,]+) results/si',
                 '/id="count".*?>.*?\\(([0-9\\,]+).*?\\)/si',
                 '/id="count".*?>.*?([0-9\\,]+).*?/si',
             );
             break;
         default:
             // unhandled engine: nothing was requested, so there is no
             // result to parse and no crawl log entry to update
             return $saturationCount;
     }

     $v = $this->spider->getContent($url);
     $pageContent = empty($v['page']) ? '' : $v['page'];

     // stop at the first pattern that matches the result page
     $crawlInfo = array();
     $r = array();
     $matched = false;
     foreach ($patternList as $pattern) {
         if (preg_match($pattern, $pageContent, $r)) {
             $matched = true;
             break;
         }
     }

     if (!$matched) {
         $crawlInfo['crawl_status'] = 0;
         $crawlInfo['log_message'] = "Regex not matched error occured while parsing search results!";
     } elseif (!empty($r[1])) {
         // drop the thousands separators printed by the engine
         $saturationCount = str_replace(',', '', $r[1]);
     }

     // update crawl log; skipped when the spider returned no log id to update
     if (isset($v['log_id'])) {
         $crawlLogCtrl = new CrawlLogController();
         $crawlInfo['crawl_type'] = 'saturation';
         $crawlInfo['ref_id'] = $this->url;
         $crawlInfo['subject'] = $engine;
         $crawlLogCtrl->updateCrawlLog($v['log_id'], $crawlInfo);
     }
     return $saturationCount;
 }
 /**
  * Crawl the result pages of the keyword's search engines and collect the
  * entries that belong to the keyword's website.
  *
  * For every search engine id listed in $keywordInfo['searchengines'] (colon
  * separated) the search url is built from the engine template in
  * $this->seList, the result pages are fetched through $this->spider and
  * concatenated, and the engine's regex is applied to the combined content.
  * A crawl log entry is updated after every page request.
  *
  * @param array      $keywordInfo     Keyword record; the keys 'url', 'name',
  *                                    'searchengines', 'lang_code',
  *                                    'country_code' and 'id' are read.
  * @param int|string $seId            When not empty, only this search engine
  *                                    id is crawled.
  * @param bool       $cron            True when called from cron; combined with
  *                                    SP_MULTIPLE_CRON_EXEC it records that the
  *                                    keyword was processed.
  * @param bool       $removeDuplicate Skip consecutive results from the same
  *                                    host (applied to google engines only).
  *
  * @return array Map of search engine id => array with 'status' (1 when the
  *               regex matched, otherwise 0) and, when the regex matched,
  *               'matched': a list of arrays with the keys 'found', 'url',
  *               'title', 'description' and 'rank'. Empty array when the
  *               keyword has no url or no name.
  */
 function crawlKeyword($keywordInfo, $seId = '', $cron = false, $removeDuplicate = true)
 {
     $crawlResult = array();
     $websiteUrl = formatUrl($keywordInfo['url'], false);
     if (empty($websiteUrl)) {
         return $crawlResult;
     }
     if (empty($keywordInfo['name'])) {
         return $crawlResult;
     }
     // midnight of the current day, used as the cron tracking timestamp
     $time = mktime(0, 0, 0, date('m'), date('d'), date('Y'));
     $seList = explode(':', $keywordInfo['searchengines']);
     foreach ($seList as $seInfoId) {
         if (!empty($seId) && $seInfoId != $seId) {
             continue;
         }
         $this->seFound = 1;

         // start every search engine with a clean log record, otherwise a
         // failure status set for one engine leaks into the log entries of
         // the engines crawled after it
         $crawlInfo = array();

         // if execution from cron check whether cron already executed
         /*if ($cron) {
         		    if (SP_MULTIPLE_CRON_EXEC && $this->isCronExecuted($keywordInfo['id'], $seInfoId, $time)) continue;
         		}*/

         // fill the placeholders of the engine's url template
         $searchUrl = str_replace('[--keyword--]', urlencode(stripslashes($keywordInfo['name'])), $this->seList[$seInfoId]['url']);
         $searchUrl = str_replace('[--lang--]', $keywordInfo['lang_code'], $searchUrl);
         $searchUrl = str_replace('[--country--]', $keywordInfo['country_code'], $searchUrl);
         if (empty($keywordInfo['country_code']) && stristr($searchUrl, '&cr=country&')) {
             $searchUrl = str_replace('&cr=country&', '&cr=&', $searchUrl);
         }
         $seUrl = str_replace('[--start--]', $this->seList[$seInfoId]['start'], $searchUrl);

         // if google add special parameters
         $isGoogle = false;
         if (stristr($this->seList[$seInfoId]['url'], 'google')) {
             $isGoogle = true;
             $seUrl .= "&ie=utf-8&pws=0&gl=" . $keywordInfo['country_code'];
         }

         if (!empty($this->seList[$seInfoId]['cookie_send'])) {
             $this->seList[$seInfoId]['cookie_send'] = str_replace('[--lang--]', $keywordInfo['lang_code'], $this->seList[$seInfoId]['cookie_send']);
             $this->spider->_CURLOPT_COOKIE = $this->seList[$seInfoId]['cookie_send'];
         }

         // first result page
         $result = $this->spider->getContent($seUrl);
         $pageContent = $this->formatPageContent($seInfoId, $result['page']);

         $crawlLogCtrl = new CrawlLogController();
         $crawlInfo['crawl_type'] = 'keyword';
         $crawlInfo['ref_id'] = empty($keywordInfo['id']) ? $keywordInfo['name'] : $keywordInfo['id'];
         $crawlInfo['subject'] = $seInfoId;

         // remaining result pages; a non-positive page step would never
         // advance $seStart, so paging is skipped in that case instead of
         // looping forever
         $pageStep = $this->seList[$seInfoId]['start_offset'];
         $seStart = $this->seList[$seInfoId]['start'] + $pageStep;
         while ($pageStep > 0 && empty($result['error']) && $seStart < $this->seList[$seInfoId]['max_results']) {
             // log the page fetched last before requesting the next one
             $logId = $result['log_id'];
             $crawlLogCtrl->updateCrawlLog($logId, $crawlInfo);
             sleep(SP_CRAWL_DELAY);
             $seUrl = str_replace('[--start--]', $seStart, $searchUrl);
             $result = $this->spider->getContent($seUrl);
             $pageContent .= $this->formatPageContent($seInfoId, $result['page']);
             $seStart += $pageStep;
         }

         # to check whether utf8 conversion needed
         if (!empty($this->seList[$seInfoId]['encoding'])) {
             $pageContent = mb_convert_encoding($pageContent, "UTF-8", $this->seList[$seInfoId]['encoding']);
         }

         $crawlStatus = 0;
         if (empty($result['error'])) {
             // to update cron that report executed for a keyword on a search engine
             if (SP_MULTIPLE_CRON_EXEC && $cron) {
                 $this->saveCronTrackInfo($keywordInfo['id'], $seInfoId, $time);
             }
             if (preg_match_all($this->seList[$seInfoId]['regex'], $pageContent, $matches)) {
                 $urlList = $matches[$this->seList[$seInfoId]['url_index']];
                 $crawlResult[$seInfoId]['matched'] = array();
                 $rank = 1;
                 $previousDomain = "";
                 foreach ($urlList as $i => $url) {
                     $url = urldecode(strip_tags($url));
                     // add special condition for baidu
                     if (stristr($this->seList[$seInfoId]['domain'], "baidu")) {
                         $url = addHttpToUrl($url);
                         $url = str_replace("...", "", $url);
                     }
                     // only absolute http(s) urls count as results
                     if (!preg_match('/^http:\\/\\/|^https:\\/\\//i', $url)) {
                         continue;
                     }
                     // check for to remove msn ad links in page
                     if (stristr($url, 'r.msn.com')) {
                         continue;
                     }
                     // check to remove duplicates from same domain if google is the search engine
                     if ($removeDuplicate && $isGoogle) {
                         $currentDomain = parse_url($url, PHP_URL_HOST);
                         if ($previousDomain == $currentDomain) {
                             continue;
                         }
                         $previousDomain = $currentDomain;
                     }
                     if ($this->showAll || stristr($url, $websiteUrl)) {
                         // 'found' is only raised in show-all mode, where it
                         // marks the entries that belong to the website
                         if ($this->showAll && stristr($url, $websiteUrl)) {
                             $matchInfo['found'] = 1;
                         } else {
                             $matchInfo['found'] = 0;
                         }
                         $matchInfo['url'] = $url;
                         $matchInfo['title'] = strip_tags($matches[$this->seList[$seInfoId]['title_index']][$i]);
                         $matchInfo['description'] = strip_tags($matches[$this->seList[$seInfoId]['description_index']][$i]);
                         $matchInfo['rank'] = $rank;
                         $crawlResult[$seInfoId]['matched'][] = $matchInfo;
                     }
                     // every accepted result advances the rank, matched or not
                     $rank++;
                 }
                 $crawlStatus = 1;
             } else {
                 // set crawl log info
                 $crawlInfo['crawl_status'] = 0;
                 $crawlInfo['log_message'] = SearchEngineController::isCaptchInSearchResults($pageContent) ? "<font class=error>Captcha found</font> in search result page" : "Regex not matched error occured while parsing search results!";
                 if (SP_DEBUG) {
                     echo "<p class='note' style='text-align:left;'>Error occured while parsing {$seUrl} " . formatErrorMsg("Regex not matched <br>\n") . "</p>";
                 }
             }
         } else {
             if (SP_DEBUG) {
                 echo "<p class='note' style='text-align:left;'>Error occured while crawling {$seUrl} " . formatErrorMsg($result['errmsg'] . "<br>\n") . "</p>";
             }
         }
         $crawlResult[$seInfoId]['status'] = $crawlStatus;
         sleep(SP_CRAWL_DELAY);

         // update crawl log
         $logId = $result['log_id'];
         $crawlLogCtrl->updateCrawlLog($logId, $crawlInfo);
     }
     return $crawlResult;
 }
Exemple #4
0
 /**
  * Read the rank reported by the Alexa data service for a website.
  *
  * The XML response is fetched through $this->spider and the TEXT attribute
  * of its popularity element is returned. When a page was returned but the
  * element could not be found, the crawl log entry is flagged as failed.
  *
  * @param string $url Website url to look up.
  *
  * @return int|string Rank as captured from the response, or 0 when it is
  *                    missing, the response was empty, or the demo guard is
  *                    active.
  */
 function __getAlexaRank($url)
 {
     $isWebRequest = !empty($_SERVER['REQUEST_METHOD']);
     if (SP_DEMO && $isWebRequest) {
         return 0;
     }

     $lookupUrl = 'http://data.alexa.com/data?cli=10&dat=snbamz&url=' . urlencode($url);
     $response = $this->spider->getContent($lookupUrl);

     $alexaRank = 0;
     $logEntry = array();

     // parse the rank from the page, if the spider returned one
     if (!empty($response['page'])) {
         $found = preg_match('/\\<popularity url\\="(.*?)" TEXT\\="([0-9]+)"/si', $response['page'], $parts);
         if ($found) {
             if (!empty($parts[2])) {
                 $alexaRank = $parts[2];
             }
         } else {
             $logEntry['crawl_status'] = 0;
             if (SearchEngineController::isCaptchInSearchResults($response['page'])) {
                 $logEntry['log_message'] = "<font class=error>Captcha found</font> in search result page";
             } else {
                 $logEntry['log_message'] = "Regex not matched error occured while parsing search results!";
             }
         }
     }

     // complete the log record and store it
     $logEntry['crawl_type'] = 'rank';
     $logEntry['ref_id'] = $url;
     $logEntry['subject'] = "alexa";
     $logWriter = new CrawlLogController();
     $logWriter->updateCrawlLog($response['log_id'], $logEntry);

     return $alexaRank;
 }