/**
 * Query a search engine for the number of backlinks reported for $this->url.
 *
 * Supported engines are 'google', 'msn' and 'alexa'. The result page is fetched
 * through $this->spider, the count is taken from the first regex that matches,
 * and the crawl log entry identified by the fetch result's 'log_id' is updated
 * with the outcome.
 *
 * @param string $engine engine key; for google/msn also the index into $this->backUrlList
 * @return int|string parsed count (comma-free digit string for google/msn, int for alexa);
 *                    0 in demo mode, for an unsupported engine, or when nothing could be parsed
 */
function __getBacklinks($engine) {

    // skip the lookup when SP_DEMO is set and the call comes from a web request
    if (SP_DEMO && !empty($_SERVER['REQUEST_METHOD'])) {
        return 0;
    }

    $backlinkCount = 0;
    $crawlInfo = array();

    // Each case only decides which URL to fetch, which patterns to try (in order)
    // and how the captured number is normalised; fetching and parsing is shared below.
    switch ($engine) {

        #google
        case 'google':
            $url = $this->backUrlList[$engine] . urlencode($this->url);
            $patterns = array(
                '/about ([0-9\\,]+) result/si',
                '/<div id=resultStats>([0-9\\,]+) result/si',
                '/([0-9\\,]+) result/si',
                '/about <b>([0-9\\,]+)<\\/b> linking/si',
            );
            $castToInt = false;
            break;

        #msn
        case 'msn':
            $url = formatUrl($this->url, false);
            $url = $this->backUrlList[$engine] . urlencode(addHttpToUrl($url));
            $patterns = array(
                '/([0-9\\,]+) results/si',
                '/id="count".*?>.*?\\(([0-9\\,]+).*?\\)/si',
                '/id="count".*?>.*?([0-9\\,]+).*?/si',
            );
            $castToInt = false;
            break;

        # alexa
        case 'alexa':
            $url = 'http://data.alexa.com/data?cli=10&dat=snbamz&url=' . urlencode($this->url);
            $patterns = array('/<LINKSIN NUM="(.*?)"/si');
            $castToInt = true;
            break;

        // Unsupported engine: no request was made, so there is no fetch result
        // whose log entry could be updated. Previously this path fell through
        // and read the undefined $v when updating the crawl log.
        default:
            return $backlinkCount;
    }

    $v = $this->spider->getContent($url);
    $pageContent = empty($v['page']) ? '' : $v['page'];

    // first matching pattern wins; $r keeps the captures of that match
    $r = array();
    $matched = false;
    foreach ($patterns as $pattern) {
        if (preg_match($pattern, $pageContent, $r)) {
            $matched = true;
            break;
        }
    }

    if (!$matched) {
        $crawlInfo['crawl_status'] = 0;
        $crawlInfo['log_message'] = "Regex not matched error occured while parsing search results!";
    }

    if (!empty($r[1])) {
        // google/msn print thousands separators, alexa delivers a plain attribute value
        $backlinkCount = $castToInt ? intval($r[1]) : str_replace(',', '', $r[1]);
    }

    // update crawl log
    $crawlLogCtrl = new CrawlLogController();
    $crawlInfo['crawl_type'] = 'backlink';
    $crawlInfo['ref_id'] = $this->url;
    $crawlInfo['subject'] = $engine;
    $crawlLogCtrl->updateCrawlLog($v['log_id'], $crawlInfo);

    return $backlinkCount;
}
/**
 * Query a search engine for the number of result pages it reports for $this->url.
 *
 * Supported engines are 'google' and 'msn'. The result page is fetched through
 * $this->spider, the count is taken from the first regex that matches, and the
 * crawl log entry identified by the fetch result's 'log_id' is updated with the
 * outcome.
 *
 * @param string $engine engine key; also the index into $this->saturationUrlList
 * @return int|string parsed count as a comma-free digit string; 0 in demo mode,
 *                    for an unsupported engine, or when nothing could be parsed
 */
function __getSaturationRank($engine) {

    // skip the lookup when SP_DEMO is set and the call comes from a web request
    if (SP_DEMO && !empty($_SERVER['REQUEST_METHOD'])) {
        return 0;
    }

    $saturationCount = 0;
    $crawlInfo = array();

    // Each case only decides which URL to fetch and which patterns to try (in order);
    // fetching and parsing is shared below.
    switch ($engine) {

        #google
        case 'google':
            $url = $this->saturationUrlList[$engine] . urlencode($this->url);
            $patterns = array(
                '/about ([0-9\\,]+) result/si',
                '/<div id=resultStats>([0-9\\,]+) result/si',
                '/([0-9\\,]+) result/si',
                '/about <b>([0-9\\,]+)<\\/b> from/si',
                '/of <b>([0-9\\,]+)<\\/b>/si',
            );
            break;

        #msn
        case 'msn':
            $url = $this->saturationUrlList[$engine] . urlencode(addHttpToUrl($this->url));
            $patterns = array(
                '/([0-9\\,]+) results/si',
                '/id="count".*?>.*?\\(([0-9\\,]+).*?\\)/si',
                '/id="count".*?>.*?([0-9\\,]+).*?/si',
            );
            break;

        // Unsupported engine: no request was made, so there is no fetch result
        // whose log entry could be updated. Previously this path fell through
        // and read the undefined $v when updating the crawl log.
        default:
            return $saturationCount;
    }

    $v = $this->spider->getContent($url);
    $pageContent = empty($v['page']) ? '' : $v['page'];

    // first matching pattern wins; $r keeps the captures of that match
    $r = array();
    $matched = false;
    foreach ($patterns as $pattern) {
        if (preg_match($pattern, $pageContent, $r)) {
            $matched = true;
            break;
        }
    }

    if (!$matched) {
        $crawlInfo['crawl_status'] = 0;
        $crawlInfo['log_message'] = "Regex not matched error occured while parsing search results!";
    }

    // strip the thousands separators the engines print in their counts
    $saturationCount = !empty($r[1]) ? str_replace(',', '', $r[1]) : 0;

    // update crawl log
    $crawlLogCtrl = new CrawlLogController();
    $crawlInfo['crawl_type'] = 'saturation';
    $crawlInfo['ref_id'] = $this->url;
    $crawlInfo['subject'] = $engine;
    $crawlLogCtrl->updateCrawlLog($v['log_id'], $crawlInfo);

    return $saturationCount;
}
/**
 * Crawl the search engines assigned to a keyword and collect the ranking hits.
 *
 * For every engine id listed in $keywordInfo['searchengines'] (colon separated)
 * the search URL is built from the engine definition in $this->seList, the result
 * pages are fetched through $this->spider, and the engine's result regex is
 * applied to the concatenated page content. The crawl log entry of each fetched
 * page is updated with the keyword reference.
 *
 * @param array $keywordInfo keyword row; keys read here: 'url', 'name', 'searchengines',
 *                           'lang_code', 'country_code' and 'id'
 * @param mixed $seId        when not empty, only the engine with this id is crawled
 * @param bool $cron         when true and SP_MULTIPLE_CRON_EXEC is set, the run is recorded
 *                           through saveCronTrackInfo()
 * @param bool $removeDuplicate for google engines, skip a result whose host equals the host
 *                           of the previously accepted result
 * @return array map of engine id => array('status' => 0|1, 'matched' => list of hits);
 *               'matched' is only present when the result regex matched. Each hit holds
 *               'found', 'url', 'title', 'description' and 'rank'. Empty array when the
 *               keyword has no usable url or name.
 */
function crawlKeyword($keywordInfo, $seId = '', $cron = false, $removeDuplicate = true) {
    $crawlResult = array();
    $websiteUrl = formatUrl($keywordInfo['url'], false);

    if (empty($websiteUrl) || empty($keywordInfo['name'])) {
        return $crawlResult;
    }

    // midnight of the current day, passed to the cron tracking call below
    $time = mktime(0, 0, 0, date('m'), date('d'), date('Y'));
    $seList = explode(':', $keywordInfo['searchengines']);

    foreach ($seList as $seInfoId) {

        if (!empty($seId) && $seInfoId != $seId) {
            continue;
        }

        $this->seFound = 1;

        // The per-engine "cron already executed" check (isCronExecuted) was
        // commented out in the original and remains disabled here.

        // Work on a local copy of the engine definition so nothing in this loop
        // can modify the shared $this->seList entry.
        $seInfo = $this->seList[$seInfoId];

        // Start every engine with a clean log payload. Without the reset, the
        // crawl_status/log_message of a failed engine leaked into the log
        // entries of all engines crawled after it.
        $crawlInfo = array();

        $searchUrl = str_replace('[--keyword--]', urlencode(stripslashes($keywordInfo['name'])), $seInfo['url']);
        $searchUrl = str_replace('[--lang--]', $keywordInfo['lang_code'], $searchUrl);
        $searchUrl = str_replace('[--country--]', $keywordInfo['country_code'], $searchUrl);
        if (empty($keywordInfo['country_code']) && stristr($searchUrl, '&cr=country&')) {
            $searchUrl = str_replace('&cr=country&', '&cr=&', $searchUrl);
        }
        $seUrl = str_replace('[--start--]', $seInfo['start'], $searchUrl);

        // if google add special parameters
        // NOTE(review): these parameters are only appended to the first page URL; the
        // follow-up pages below are built from $searchUrl without them - confirm intended.
        $isGoogle = false;
        if (stristr($seInfo['url'], 'google')) {
            $isGoogle = true;
            $seUrl .= "&ie=utf-8&pws=0&gl=" . $keywordInfo['country_code'];
        }

        // Fill the language into the cookie template without writing it back:
        // replacing the placeholder in $this->seList itself destroyed the template,
        // so later keywords with another language reused the first language.
        if (!empty($seInfo['cookie_send'])) {
            $this->spider->_CURLOPT_COOKIE = str_replace('[--lang--]', $keywordInfo['lang_code'], $seInfo['cookie_send']);
        }

        $result = $this->spider->getContent($seUrl);
        $pageContent = $this->formatPageContent($seInfoId, $result['page']);

        $crawlLogCtrl = new CrawlLogController();
        $crawlInfo['crawl_type'] = 'keyword';
        $crawlInfo['ref_id'] = empty($keywordInfo['id']) ? $keywordInfo['name'] : $keywordInfo['id'];
        $crawlInfo['subject'] = $seInfoId;

        // Fetch the follow-up result pages. The offset must be positive, otherwise
        // $seStart never advances and the loop would repeat the same request for as
        // long as the requests succeed.
        $pageStep = $seInfo['start_offset'];
        $seStart = $seInfo['start'] + $pageStep;
        while ($pageStep > 0 && empty($result['error']) && $seStart < $seInfo['max_results']) {

            // log the page fetched last before requesting the next one
            $logId = $result['log_id'];
            $crawlLogCtrl->updateCrawlLog($logId, $crawlInfo);
            sleep(SP_CRAWL_DELAY);

            $seUrl = str_replace('[--start--]', $seStart, $searchUrl);
            $result = $this->spider->getContent($seUrl);
            $pageContent .= $this->formatPageContent($seInfoId, $result['page']);
            $seStart += $pageStep;
        }

        # to check whether utf8 conversion needed
        if (!empty($seInfo['encoding'])) {
            $pageContent = mb_convert_encoding($pageContent, "UTF-8", $seInfo['encoding']);
        }

        $crawlStatus = 0;
        if (empty($result['error'])) {

            // to update cron that report executed for a keyword on a search engine
            if (SP_MULTIPLE_CRON_EXEC && $cron) {
                $this->saveCronTrackInfo($keywordInfo['id'], $seInfoId, $time);
            }

            if (preg_match_all($seInfo['regex'], $pageContent, $matches)) {

                $urlList = $matches[$seInfo['url_index']];
                $crawlResult[$seInfoId]['matched'] = array();
                $rank = 1;
                $previousDomain = "";

                foreach ($urlList as $i => $url) {
                    $url = urldecode(strip_tags($url));

                    // add special condition for baidu
                    if (stristr($seInfo['domain'], "baidu")) {
                        $url = addHttpToUrl($url);
                        $url = str_replace("...", "", $url);
                    }

                    // entries that are not absolute http(s) links do not count as results
                    if (!preg_match('/^http:\\/\\/|^https:\\/\\//i', $url)) {
                        continue;
                    }

                    // check for to remove msn ad links in page
                    if (stristr($url, 'r.msn.com')) {
                        continue;
                    }

                    // check to remove duplicates from same domain if google is the search engine
                    if ($removeDuplicate && $isGoogle) {
                        $currentDomain = parse_url($url, PHP_URL_HOST);
                        if ($previousDomain == $currentDomain) {
                            continue;
                        }
                        $previousDomain = $currentDomain;
                    }

                    $isOwnSite = stristr($url, $websiteUrl);
                    if ($this->showAll || $isOwnSite) {
                        // NOTE(review): 'found' is only raised in show-all mode; without
                        // show-all every recorded hit carries found = 0 - confirm callers
                        // rely on that.
                        $matchInfo = array();
                        $matchInfo['found'] = ($this->showAll && $isOwnSite) ? 1 : 0;
                        $matchInfo['url'] = $url;
                        $matchInfo['title'] = strip_tags($matches[$seInfo['title_index']][$i]);
                        $matchInfo['description'] = strip_tags($matches[$seInfo['description_index']][$i]);
                        $matchInfo['rank'] = $rank;
                        $crawlResult[$seInfoId]['matched'][] = $matchInfo;
                    }

                    // the rank counts every accepted result, not only the recorded ones
                    $rank++;
                }

                $crawlStatus = 1;
            } else {

                // set crawl log info
                $crawlInfo['crawl_status'] = 0;
                $crawlInfo['log_message'] = SearchEngineController::isCaptchInSearchResults($pageContent) ? "<font class=error>Captcha found</font> in search result page" : "Regex not matched error occured while parsing search results!";

                if (SP_DEBUG) {
                    echo "<p class='note' style='text-align:left;'>Error occured while parsing {$seUrl} " . formatErrorMsg("Regex not matched <br>\n") . "</p>";
                }
            }
        } else {
            if (SP_DEBUG) {
                echo "<p class='note' style='text-align:left;'>Error occured while crawling {$seUrl} " . formatErrorMsg($result['errmsg'] . "<br>\n") . "</p>";
            }
        }

        $crawlResult[$seInfoId]['status'] = $crawlStatus;
        sleep(SP_CRAWL_DELAY);

        // update crawl log of the page fetched last
        $logId = $result['log_id'];
        $crawlLogCtrl->updateCrawlLog($logId, $crawlInfo);
    }

    return $crawlResult;
}
/**
 * Read the rank Alexa reports for a website.
 *
 * The Alexa data URL is fetched through $this->spider and the TEXT attribute of
 * the popularity element is extracted. The crawl log entry identified by the
 * fetch result's 'log_id' is updated afterwards, including a failure status when
 * a non-empty page could not be parsed.
 *
 * @param string $url website url to look up
 * @return int|string the rank as captured from the page, 0 in demo mode or when no rank was found
 */
function __getAlexaRank($url) {

    // skip the lookup when SP_DEMO is set and the call comes from a web request
    $isDemoWebRequest = SP_DEMO && !empty($_SERVER['REQUEST_METHOD']);
    if ($isDemoWebRequest) {
        return 0;
    }

    $alexaUrl = 'http://data.alexa.com/data?cli=10&dat=snbamz&url=' . urlencode($url);
    $response = $this->spider->getContent($alexaUrl);

    $rank = 0;
    $failureInfo = array();

    // parse rank from the page
    if (!empty($response['page'])) {
        $hasPopularity = preg_match('/\\<popularity url\\="(.*?)" TEXT\\="([0-9]+)"/si', $response['page'], $matches);

        if ($hasPopularity) {
            if (!empty($matches[2])) {
                $rank = $matches[2];
            }
        } else {
            $failureInfo['crawl_status'] = 0;
            if (SearchEngineController::isCaptchInSearchResults($response['page'])) {
                $failureInfo['log_message'] = "<font class=error>Captcha found</font> in search result page";
            } else {
                $failureInfo['log_message'] = "Regex not matched error occured while parsing search results!";
            }
        }
    }

    // update crawl log; failure fields (if any) come first, followed by the reference data
    $logFields = array_merge($failureInfo, array(
        'crawl_type' => 'rank',
        'ref_id' => $url,
        'subject' => "alexa",
    ));
    $logController = new CrawlLogController();
    $logController->updateCrawlLog($response['log_id'], $logFields);

    return $rank;
}