Exemplo n.º 1
0
 /**
  * Fetch $url through the shared cURL handle, optionally via a random proxy, and optionally log the crawl.
  *
  * The handle is configured from the $this->_CURLOPT_* properties on every call. When a proxied request
  * fails, the proxy may be deactivated and the request retried, depending on PROXY_DEACTIVATE_CRAWL and
  * CHECK_WITH_ANOTHER_PROXY_IF_FAILED.
  *
  * @param string $url         Address to request.
  * @param bool   $enableProxy Use a proxy for this request when SP_ENABLE_PROXY is also truthy.
  * @param bool   $logCrawl    Record the request through CrawlLogController::createCrawlLog().
  *
  * @return array 'page' (result of curl_exec), 'error' (curl_errno), 'errmsg' (curl_error) and,
  *               when $logCrawl is truthy, 'log_id' (return value of createCrawlLog()).
  */
 function getContent($url, $enableProxy = true, $logCrawl = true)
 {
     $ret = array();
     // Defined up front so the logging code below can read it safely when no proxy was selected.
     $proxyInfo = array();
     curl_setopt($this->_CURL_RESOURCE, CURLOPT_URL, $url);
     curl_setopt($this->_CURL_RESOURCE, CURLOPT_FAILONERROR, $this->_CURLOPT_FAILONERROR);
     // Warnings from this option are deliberately suppressed by the original code.
     @curl_setopt($this->_CURL_RESOURCE, CURLOPT_FOLLOWLOCATION, $this->_CURLOPT_FOLLOWLOCATION);
     curl_setopt($this->_CURL_RESOURCE, CURLOPT_RETURNTRANSFER, $this->_CURLOPT_RETURNTRANSFER);
     curl_setopt($this->_CURL_RESOURCE, CURLOPT_TIMEOUT, $this->_CURLOPT_TIMEOUT);
     curl_setopt($this->_CURL_RESOURCE, CURLOPT_COOKIEJAR, $this->_CURLOPT_COOKIEJAR);
     curl_setopt($this->_CURL_RESOURCE, CURLOPT_COOKIEFILE, $this->_CURLOPT_COOKIEFILE);
     curl_setopt($this->_CURL_RESOURCE, CURLOPT_HEADER, $this->_CURLOPT_HEADER);
     if (!empty($this->_CURLOPT_COOKIE)) {
         curl_setopt($this->_CURL_RESOURCE, CURLOPT_COOKIE, $this->_CURLOPT_COOKIE);
     }
     if (!empty($this->_CURLOPT_REFERER)) {
         curl_setopt($this->_CURL_RESOURCE, CURLOPT_REFERER, $this->_CURLOPT_REFERER);
     }
     // POST is only configured when the post body is at least two bytes long.
     if (strlen($this->_CURLOPT_POSTFIELDS) > 1) {
         curl_setopt($this->_CURL_RESOURCE, CURLOPT_POST, $this->_CURLOPT_POST);
         curl_setopt($this->_CURL_RESOURCE, CURLOPT_POSTFIELDS, $this->_CURLOPT_POSTFIELDS);
     }
     // user agent assignment: the SP_USER_AGENT constant, when defined, overrides the property
     $this->_CURLOPT_USERAGENT = defined('SP_USER_AGENT') ? SP_USER_AGENT : $this->_CURLOPT_USERAGENT;
     if (strlen($this->_CURLOPT_USERAGENT) > 0) {
         curl_setopt($this->_CURL_RESOURCE, CURLOPT_USERAGENT, $this->_CURLOPT_USERAGENT);
     }
     if (strlen($this->_CURLOPT_USERPWD) > 2) {
         curl_setopt($this->_CURL_RESOURCE, CURLOPT_USERPWD, $this->_CURLOPT_USERPWD);
     }
     // to use proxy if proxy enabled
     if (SP_ENABLE_PROXY && $enableProxy) {
         $proxyCtrler = new ProxyController();
         if ($proxyInfo = $proxyCtrler->getRandomProxy()) {
             curl_setopt($this->_CURL_RESOURCE, CURLOPT_PROXY, $proxyInfo['proxy'] . ":" . $proxyInfo['port']);
             curl_setopt($this->_CURL_RESOURCE, CURLOPT_HTTPPROXYTUNNEL, CURLOPT_HTTPPROXYTUNNEL_VAL);
             if (!empty($proxyInfo['proxy_auth'])) {
                 curl_setopt($this->_CURL_RESOURCE, CURLOPT_PROXYUSERPWD, $proxyInfo['proxy_username'] . ":" . $proxyInfo['proxy_password']);
             }
         } else {
             showErrorMsg("No active proxies found!! Please check your proxy settings from Admin Panel.");
         }
     }
     $ret['page'] = curl_exec($this->_CURL_RESOURCE);
     $ret['error'] = curl_errno($this->_CURL_RESOURCE);
     $ret['errmsg'] = curl_error($this->_CURL_RESOURCE);
     // update crawl log in database for future reference
     if ($logCrawl) {
         $crawlLogCtrl = new CrawlLogController();
         $crawlInfo = array();
         $crawlInfo['crawl_status'] = $ret['error'] ? 0 : 1;
         $crawlInfo['ref_id'] = $crawlInfo['crawl_link'] = addslashes($url);
         $crawlInfo['crawl_referer'] = addslashes($this->_CURLOPT_REFERER);
         $crawlInfo['crawl_cookie'] = addslashes($this->_CURLOPT_COOKIE);
         $crawlInfo['crawl_post_fields'] = addslashes($this->_CURLOPT_POSTFIELDS);
         $crawlInfo['crawl_useragent'] = addslashes($this->_CURLOPT_USERAGENT);
         // No proxy was selected when proxies are disabled or none is available; log null in that case
         // instead of reading a missing offset.
         $crawlInfo['proxy_id'] = isset($proxyInfo['id']) ? $proxyInfo['id'] : null;
         $crawlInfo['log_message'] = addslashes($ret['errmsg']);
         $ret['log_id'] = $crawlLogCtrl->createCrawlLog($crawlInfo);
     }
     // disable proxy if not working
     if (SP_ENABLE_PROXY && $enableProxy && !empty($ret['error']) && !empty($proxyInfo['id'])) {
         // deactivate proxy
         if (PROXY_DEACTIVATE_CRAWL) {
             $proxyCtrler->__changeStatus($proxyInfo['id'], 0);
         }
         // check with another proxy
         // NOTE(review): this retry has no explicit depth limit; it stops only when a request succeeds
         // or getRandomProxy() no longer returns a proxy with an id.
         if (CHECK_WITH_ANOTHER_PROXY_IF_FAILED) {
             // Forward $logCrawl so a caller that disabled logging does not get log rows from retries.
             $ret = $this->getContent($url, $enableProxy, $logCrawl);
         }
     }
     return $ret;
 }
Exemplo n.º 2
0
 /**
  * Crawl the search engines configured for one keyword and collect the result rows for its website.
  *
  * @param array $keywordInfo     Keyword record; this method reads the keys 'url', 'name', 'searchengines',
  *                               'lang_code', 'country_code' and 'id'.
  * @param mixed $seId            When non-empty, only the search engine with this id is crawled.
  * @param bool  $cron            When truthy (and SP_MULTIPLE_CRON_EXEC is truthy) cron tracking info is
  *                               saved after a crawl without transport errors.
  * @param bool  $removeDuplicate When truthy, consecutive results from the same host are skipped for
  *                               search engines whose URL contains "google".
  *
  * @return array Map of search engine id => array with 'status' (0 or 1) and, when the result regex
  *               matched, 'matched' (list of rows with 'found', 'url', 'title', 'description', 'rank').
  *               Empty when the keyword has no 'url' or no 'name'.
  */
 function crawlKeyword($keywordInfo, $seId = '', $cron = false, $removeDuplicate = true)
 {
     $crawlResult = array();
     $websiteUrl = $keywordInfo['url'];
     if (empty($websiteUrl)) {
         return $crawlResult;
     }
     if (empty($keywordInfo['name'])) {
         return $crawlResult;
     }
     // Midnight of the current day; passed to saveCronTrackInfo() below.
     $time = mktime(0, 0, 0, date('m'), date('d'), date('Y'));
     // Becomes true when any engine ends with status 0, so the proxy retry below only runs after a failure.
     $crawlFailed = false;
     $seList = explode(':', $keywordInfo['searchengines']);
     foreach ($seList as $seInfoId) {
         // function to execute only passed search engine
         if (!empty($seId) && $seInfoId != $seId) {
             continue;
         }
         // if search engine not found continue
         if (empty($this->seList[$seInfoId])) {
             continue;
         }
         $this->seFound = 1;
         $seInfo = $this->seList[$seInfoId];
         $searchUrl = str_replace('[--keyword--]', urlencode(stripslashes($keywordInfo['name'])), $seInfo['url']);
         $searchUrl = str_replace('[--lang--]', $keywordInfo['lang_code'], $searchUrl);
         $searchUrl = str_replace('[--country--]', $keywordInfo['country_code'], $searchUrl);
         if (empty($keywordInfo['country_code']) && stristr($searchUrl, '&cr=country&')) {
             $searchUrl = str_replace('&cr=country&', '&cr=&', $searchUrl);
         }
         $seUrl = str_replace('[--start--]', $seInfo['start'], $searchUrl);
         // if google add special parameters
         $isGoogle = false;
         if (stristr($seInfo['url'], 'google')) {
             $isGoogle = true;
             $seUrl .= "&ie=utf-8&pws=0&gl=" . $keywordInfo['country_code'];
         }
         if (!empty($seInfo['cookie_send'])) {
             // Substitute into a copy: writing the result back into $this->seList would destroy the
             // [--lang--] placeholder and pin every later keyword to this keyword's language.
             $this->spider->_CURLOPT_COOKIE = str_replace('[--lang--]', $keywordInfo['lang_code'], $seInfo['cookie_send']);
         }
         $result = $this->spider->getContent($seUrl);
         $pageContent = $this->formatPageContent($seInfoId, $result['page']);
         $crawlLogCtrl = new CrawlLogController();
         // Rebuilt for every engine so a failure status/message from a previous engine cannot leak
         // into this engine's log update.
         $crawlInfo = array(
             'crawl_type' => 'keyword',
             'ref_id' => empty($keywordInfo['id']) ? $keywordInfo['name'] : $keywordInfo['id'],
             'subject' => $seInfoId,
         );
         $pageStep = $seInfo['start_offset'];
         $seStart = $seInfo['start'] + $pageStep;
         // Fetch the remaining result pages; a non-positive step would never advance, so skip paging then.
         while ($pageStep > 0 && empty($result['error']) && $seStart < $seInfo['max_results']) {
             $logId = isset($result['log_id']) ? $result['log_id'] : null;
             $crawlLogCtrl->updateCrawlLog($logId, $crawlInfo);
             sleep(SP_CRAWL_DELAY);
             $seUrl = str_replace('[--start--]', $seStart, $searchUrl);
             $result = $this->spider->getContent($seUrl);
             $pageContent .= $this->formatPageContent($seInfoId, $result['page']);
             $seStart += $pageStep;
         }
         # to check whether utf8 conversion needed
         if (!empty($seInfo['encoding'])) {
             $pageContent = mb_convert_encoding($pageContent, "UTF-8", $seInfo['encoding']);
         }
         $crawlStatus = 0;
         if (empty($result['error'])) {
             // to update cron that report executed for a keyword on a search engine
             if (SP_MULTIPLE_CRON_EXEC && $cron) {
                 $this->saveCronTrackInfo($keywordInfo['id'], $seInfoId, $time);
             }
             if (preg_match_all($seInfo['regex'], $pageContent, $matches)) {
                 $urlList = $matches[$seInfo['url_index']];
                 $crawlResult[$seInfoId]['matched'] = array();
                 $rank = 1;
                 $previousDomain = "";
                 foreach ($urlList as $i => $url) {
                     $url = urldecode(strip_tags($url));
                     // add special condition for baidu
                     if (stristr($seInfo['domain'], "baidu")) {
                         $url = addHttpToUrl($url);
                         $url = str_replace("...", "", $url);
                     }
                     // only absolute http/https links are ranked
                     if (!preg_match('/^http:\\/\\/|^https:\\/\\//i', $url)) {
                         continue;
                     }
                     // check for to remove msn ad links in page
                     if (stristr($url, 'r.msn.com')) {
                         continue;
                     }
                     // check to remove duplicates from same domain if google is the search engine
                     if ($removeDuplicate && $isGoogle) {
                         $currentDomain = parse_url($url, PHP_URL_HOST);
                         if ($previousDomain == $currentDomain) {
                             continue;
                         }
                         $previousDomain = $currentDomain;
                     }
                     if ($this->showAll || stristr($url, $websiteUrl)) {
                         // 'found' is only raised in show-all mode, where rows of other sites are listed too.
                         if ($this->showAll && stristr($url, $websiteUrl)) {
                             $matchInfo['found'] = 1;
                         } else {
                             $matchInfo['found'] = 0;
                         }
                         $matchInfo['url'] = $url;
                         $matchInfo['title'] = strip_tags($matches[$seInfo['title_index']][$i]);
                         $matchInfo['description'] = strip_tags($matches[$seInfo['description_index']][$i]);
                         $matchInfo['rank'] = $rank;
                         $crawlResult[$seInfoId]['matched'][] = $matchInfo;
                     }
                     // rank counts every accepted result, not only the ones stored above
                     $rank++;
                 }
                 $crawlStatus = 1;
             } else {
                 // set crawl log info
                 $crawlInfo['crawl_status'] = 0;
                 $crawlInfo['log_message'] = SearchEngineController::isCaptchInSearchResults($pageContent) ? "<font class=error>Captcha found</font> in search result page" : "Regex not matched error occured while parsing search results!";
                 if (SP_DEBUG) {
                     echo "<p class='note' style='text-align:left;'>Error occured while parsing {$seUrl} " . formatErrorMsg("Regex not matched <br>\n") . "</p>";
                 }
             }
         } else {
             if (SP_DEBUG) {
                 echo "<p class='note' style='text-align:left;'>Error occured while crawling {$seUrl} " . formatErrorMsg($result['errmsg'] . "<br>\n") . "</p>";
             }
         }
         $crawlResult[$seInfoId]['status'] = $crawlStatus;
         if (!$crawlStatus) {
             $crawlFailed = true;
         }
         sleep(SP_CRAWL_DELAY);
         // update crawl log
         $logId = isset($result['log_id']) ? $result['log_id'] : null;
         $crawlLogCtrl->updateCrawlLog($logId, $crawlInfo);
     }
     // if proxy enabled and the crawl failed, try again with the next proxy
     if (SP_ENABLE_PROXY && CHECK_WITH_ANOTHER_PROXY_IF_FAILED) {
         if ($crawlFailed && $this->proxyCheckCount < CHECK_MAX_PROXY_COUNT_IF_FAILED) {
             // retry only while a proxy is still available for execution
             $proxyCtrler = new ProxyController();
             if ($proxyCtrler->getRandomProxy()) {
                 $this->proxyCheckCount++;
                 sleep(SP_CRAWL_DELAY);
                 $crawlResult = $this->crawlKeyword($keywordInfo, $seId, $cron, $removeDuplicate);
             }
         } else {
             // crawl succeeded or the per-execution retry budget is used up: start counting afresh
             $this->proxyCheckCount = 1;
         }
     }
     return $crawlResult;
 }
Exemplo n.º 3
0
 /**
  * Request $url on the shared cURL handle and hand back the raw outcome.
  *
  * The handle is reconfigured from the $this->_CURLOPT_* properties on each call. When both
  * SP_ENABLE_PROXY and $enableProxy are truthy, a proxy obtained from ProxyController::getRandomProxy()
  * is applied; if none is returned, showErrorMsg() is called.
  *
  * @param string $url         Address to request.
  * @param bool   $enableProxy Allow proxy usage for this request.
  *
  * @return array 'page' (result of curl_exec), 'error' (curl_errno) and 'errmsg' (curl_error).
  */
 function getContent($url, $enableProxy = true)
 {
     $curlHandle = $this->_CURL_RESOURCE;

     // Options applied on every request, in this order.
     $baseOptions = array(
         CURLOPT_URL => $url,
         CURLOPT_FAILONERROR => $this->_CURLOPT_FAILONERROR,
         CURLOPT_FOLLOWLOCATION => $this->_CURLOPT_FOLLOWLOCATION,
         CURLOPT_RETURNTRANSFER => $this->_CURLOPT_RETURNTRANSFER,
         CURLOPT_TIMEOUT => $this->_CURLOPT_TIMEOUT,
         CURLOPT_COOKIEJAR => $this->_CURLOPT_COOKIEJAR,
         CURLOPT_COOKIEFILE => $this->_CURLOPT_COOKIEFILE,
         CURLOPT_HEADER => $this->_CURLOPT_HEADER,
     );
     foreach ($baseOptions as $option => $value) {
         if ($option === CURLOPT_FOLLOWLOCATION) {
             // Warnings raised while setting this particular option stay suppressed.
             @curl_setopt($curlHandle, $option, $value);
             continue;
         }
         curl_setopt($curlHandle, $option, $value);
     }

     // Optional headers are only sent when a value has been configured.
     if (empty($this->_CURLOPT_COOKIE) === false) {
         curl_setopt($curlHandle, CURLOPT_COOKIE, $this->_CURLOPT_COOKIE);
     }
     if (empty($this->_CURLOPT_REFERER) === false) {
         curl_setopt($curlHandle, CURLOPT_REFERER, $this->_CURLOPT_REFERER);
     }

     // A post body shorter than two bytes leaves the request as it is.
     $postBody = $this->_CURLOPT_POSTFIELDS;
     if (strlen($postBody) >= 2) {
         curl_setopt($curlHandle, CURLOPT_POST, $this->_CURLOPT_POST);
         curl_setopt($curlHandle, CURLOPT_POSTFIELDS, $postBody);
     }

     // The SP_USER_AGENT constant, when defined, replaces the configured user agent.
     if (defined('SP_USER_AGENT')) {
         $this->_CURLOPT_USERAGENT = SP_USER_AGENT;
     }
     $userAgent = $this->_CURLOPT_USERAGENT;
     if (strlen($userAgent) >= 1) {
         curl_setopt($curlHandle, CURLOPT_USERAGENT, $userAgent);
     }

     $credentials = $this->_CURLOPT_USERPWD;
     if (strlen($credentials) >= 3) {
         curl_setopt($curlHandle, CURLOPT_USERPWD, $credentials);
     }

     // Route the request through a randomly chosen proxy when proxies are switched on.
     if (SP_ENABLE_PROXY && $enableProxy) {
         $proxyCtrler = new ProxyController();
         $proxyInfo = $proxyCtrler->getRandomProxy();
         if (!$proxyInfo) {
             showErrorMsg("No active proxies found!! Please check your proxy settings from Admin Panel.");
         } else {
             $proxyAddress = $proxyInfo['proxy'] . ":" . $proxyInfo['port'];
             curl_setopt($curlHandle, CURLOPT_PROXY, $proxyAddress);
             curl_setopt($curlHandle, CURLOPT_HTTPPROXYTUNNEL, 1);
             if (!empty($proxyInfo['proxy_auth'])) {
                 $proxyLogin = $proxyInfo['proxy_username'] . ":" . $proxyInfo['proxy_password'];
                 curl_setopt($curlHandle, CURLOPT_PROXYUSERPWD, $proxyLogin);
             }
         }
     }

     // Execute first, then read the error state the transfer left on the handle.
     $page = curl_exec($curlHandle);
     return array(
         'page' => $page,
         'error' => curl_errno($curlHandle),
         'errmsg' => curl_error($curlHandle),
     );
 }
Exemplo n.º 4
0
 /**
  * Download $url with the shared cURL handle, retrying through another proxy after a proxied failure.
  *
  * Settings are taken from the $this->_CURLOPT_* properties. After a failed proxied request the proxy
  * is deactivated when PROXY_DEACTIVATE_CRAWL is truthy, and the request is repeated when
  * CHECK_WITH_ANOTHER_PROXY_IF_FAILED is truthy; both steps print a progress line.
  *
  * @param string $url         Address to request.
  * @param bool   $enableProxy Allow proxy usage (effective only when SP_ENABLE_PROXY is truthy).
  *
  * @return array 'page' (result of curl_exec), 'error' (curl_errno) and 'errmsg' (curl_error)
  *               of the last attempt.
  */
 function getContent($url, $enableProxy = true)
 {
     $ch = $this->_CURL_RESOURCE;
     $proxyWanted = SP_ENABLE_PROXY && $enableProxy;

     // --- transfer settings applied to every request ---
     curl_setopt($ch, CURLOPT_URL, $url);
     curl_setopt($ch, CURLOPT_FAILONERROR, $this->_CURLOPT_FAILONERROR);
     // warnings from this option are intentionally silenced
     @curl_setopt($ch, CURLOPT_FOLLOWLOCATION, $this->_CURLOPT_FOLLOWLOCATION);
     curl_setopt($ch, CURLOPT_RETURNTRANSFER, $this->_CURLOPT_RETURNTRANSFER);
     curl_setopt($ch, CURLOPT_TIMEOUT, $this->_CURLOPT_TIMEOUT);
     curl_setopt($ch, CURLOPT_COOKIEJAR, $this->_CURLOPT_COOKIEJAR);
     curl_setopt($ch, CURLOPT_COOKIEFILE, $this->_CURLOPT_COOKIEFILE);
     curl_setopt($ch, CURLOPT_HEADER, $this->_CURLOPT_HEADER);

     // --- settings applied only when configured ---
     if (!empty($this->_CURLOPT_COOKIE)) {
         curl_setopt($ch, CURLOPT_COOKIE, $this->_CURLOPT_COOKIE);
     }
     if (!empty($this->_CURLOPT_REFERER)) {
         curl_setopt($ch, CURLOPT_REFERER, $this->_CURLOPT_REFERER);
     }
     if (1 < strlen($this->_CURLOPT_POSTFIELDS)) {
         curl_setopt($ch, CURLOPT_POST, $this->_CURLOPT_POST);
         curl_setopt($ch, CURLOPT_POSTFIELDS, $this->_CURLOPT_POSTFIELDS);
     }

     // the SP_USER_AGENT constant wins over the configured user agent
     $this->_CURLOPT_USERAGENT = !defined('SP_USER_AGENT') ? $this->_CURLOPT_USERAGENT : SP_USER_AGENT;
     if (0 < strlen($this->_CURLOPT_USERAGENT)) {
         curl_setopt($ch, CURLOPT_USERAGENT, $this->_CURLOPT_USERAGENT);
     }
     if (2 < strlen($this->_CURLOPT_USERPWD)) {
         curl_setopt($ch, CURLOPT_USERPWD, $this->_CURLOPT_USERPWD);
     }

     // --- proxy selection ---
     if ($proxyWanted) {
         $proxyCtrler = new ProxyController();
         $proxyInfo = $proxyCtrler->getRandomProxy();
         if ($proxyInfo) {
             $endpoint = $proxyInfo['proxy'] . ":" . $proxyInfo['port'];
             curl_setopt($ch, CURLOPT_PROXY, $endpoint);
             curl_setopt($ch, CURLOPT_HTTPPROXYTUNNEL, CURLOPT_HTTPPROXYTUNNEL_VAL);
             if (!empty($proxyInfo['proxy_auth'])) {
                 $login = $proxyInfo['proxy_username'] . ":" . $proxyInfo['proxy_password'];
                 curl_setopt($ch, CURLOPT_PROXYUSERPWD, $login);
             }
         } else {
             showErrorMsg("No active proxies found!! Please check your proxy settings from Admin Panel.");
         }
     }

     // --- run the transfer and capture its outcome ---
     $ret = array();
     $ret['page'] = curl_exec($ch);
     $ret['error'] = curl_errno($ch);
     $ret['errmsg'] = curl_error($ch);

     // Nothing more to do unless a proxied request failed on a proxy that has an id.
     if (!$proxyWanted || empty($ret['error']) || empty($proxyInfo['id'])) {
         return $ret;
     }

     // take the failing proxy out of rotation
     if (PROXY_DEACTIVATE_CRAWL) {
         echo "Deactivating proxy - " . $proxyInfo['proxy'] . "....<br>\n";
         $proxyCtrler->__changeStatus($proxyInfo['id'], 0);
     }

     // repeat the request, which picks a proxy again
     if (CHECK_WITH_ANOTHER_PROXY_IF_FAILED) {
         echo "Checking with another proxy....<br>\n";
         $ret = $this->getContent($url, $enableProxy);
     }

     return $ret;
 }