Example #1
0
 /**
  * Remove a proxy from the pool and drop its entry in the failure map.
  *
  * The proxy is identified by its proxyToString() representation. If the
  * proxy is not in the pool, only the failure-map entry is cleared and the
  * pool and the current index are left untouched.
  *
  * @param mixed $proxy Proxy descriptor accepted by proxyToString().
  * @return void
  */
 public function remove($proxy)
 {
     // The string form is loop-invariant: compute it once.
     $needle = proxyToString($proxy);
     unset($this->mapFail[$needle]);

     // -1 means "not found". Without this initialisation a missing proxy left
     // the variable undefined (null), and both `null != -1` and
     // `null <= $this->iProxy` evaluate to true, so iProxy was decremented
     // even though nothing had been removed.
     $indexRemove = -1;
     $total = count($this->proxies);
     for ($i = 0; $i < $total; $i++) {
         if (proxyToString($this->proxies[$i]) === $needle) {
             // No break: as before, the last matching entry wins.
             $indexRemove = $i;
         }
     }

     if ($indexRemove !== -1) {
         unset($this->proxies[$indexRemove]);
         // Re-index so the list stays a dense 0..n-1 array.
         $this->proxies = array_values($this->proxies);
         // Entries at or before the cursor shifted left by one; follow them.
         if ($indexRemove <= $this->iProxy) {
             --$this->iProxy;
         }
     }
 }
Example #2
0
/**
 * Fetch a URL with cURL through the given proxy, backed by a file cache.
 *
 * Only responses with HTTP status 200 and a non-empty body are cached. The
 * cache file is CACHE_DIR . sha1(url) and is considered fresh while it is
 * younger than CACHE_LIFETIME hours.
 *
 * @param array $curl_opt cURL options keyed by CURLOPT_* constants; the
 *                        options from buildCurlOptions($proxy) are added
 *                        without overriding keys already present.
 * @param mixed $proxy    Proxy descriptor passed to buildCurlOptions() and
 *                        proxyToString().
 * @param bool  $usecache Whether the cache may be read and written.
 *
 * @return array|null Null when CURLOPT_URL is empty; otherwise an array with
 *                    the keys 'cache' (bool), 'data', 'status', 'cache_age'
 *                    (seconds), 'redir' and 'error'.
 */
function curl_cache_exec($curl_opt, $proxy, $usecache = true)
{
    if (empty($curl_opt[CURLOPT_URL])) {
        return null;
    }

    // Array union: caller-supplied options take precedence over proxy defaults.
    $curl_opt = $curl_opt + buildCurlOptions($proxy);
    d('Curl', "GET " . $curl_opt[CURLOPT_URL] . " via " . proxyToString($proxy) . " (mem: " . debug_memory() . ")");
    $cacheFile = CACHE_DIR . sha1($curl_opt[CURLOPT_URL]);

    if ($usecache && file_exists($cacheFile)) {
        $cacheAge = time() - filemtime($cacheFile);
        // CACHE_LIFETIME is expressed in hours.
        if ($cacheAge < CACHE_LIFETIME * 3600) {
            $cacheData = @file_get_contents($cacheFile);
            if ($cacheData !== false) {
                // HIT: only 200 responses are ever cached, so status is 200.
                $response = array(
                    'cache' => true,
                    'data' => $cacheData,
                    'status' => 200,
                    'cache_age' => $cacheAge,
                    'redir' => null,
                    'error' => null,
                );
                d('Curl', "GOT status=200 cache=HIT age=" . $response['cache_age'] . " (mem: " . debug_memory() . ")");
                return $response;
            }
            // The cache file exists but could not be read. Previously this
            // path logged and returned an undefined $response; treat it as a
            // miss and fetch from the network instead.
        }
    }

    $ch = curl_init();
    curl_setopt_array($ch, $curl_opt);
    $data = curl_exec($ch);
    $response = array(
        'cache' => false,
        'data' => $data,
        'status' => curl_getinfo($ch, CURLINFO_HTTP_CODE),
        'cache_age' => 0,
        'redir' => curl_getinfo($ch, CURLINFO_REDIRECT_URL),
        'error' => curl_error($ch),
    );
    curl_close($ch);

    // Only cache 200 OK with an actual body. curl_exec() yields a boolean on
    // failure or when the transfer is not returned as a string; never cache that.
    if ($usecache && $response['status'] == 200 && is_string($data) && strlen($data) > 0) {
        if (!file_exists(CACHE_DIR)) {
            @mkdir(CACHE_DIR);
        }
        // Best effort: a failed cache write must not fail the request.
        @file_put_contents($cacheFile, $data);
    }

    d('Curl', "GOT status=" . $response['status'] . " cache=MISS age=0 (mem: " . debug_memory() . ")");
    return $response;
}
Example #3
0
 /**
  * Look up the Google position of every site of a group for every keyword.
  *
  * For each keyword the search result pages are fetched ten results at a
  * time (start = 0, 10, ... 90) until every site of the group has a rank or
  * the 100th result is passed. Fetching goes through curl_cache_exec() with
  * the proxy supplied by the global $proxies pool; failed fetches are retried
  * on the next proxy, up to $options['general']['fetch_retry'] attempts.
  *
  * Relies on the globals $options, $proxies, $dbc and
  * $captchaBrokenCurrentRun.
  *
  * @param array $group Group definition; reads 'keywords', 'sites' and
  *                     'options' ('datacenter', 'tld', 'local', 'parameters').
  *
  * @return array Ranks indexed as [keyword key][site key] => array(0 => position,
  *               1 => result URL). The key '__have_error' is set to 1 when the
  *               run was aborted (no proxy left or too many failed fetches);
  *               ranks collected before the abort are still included.
  */
 public function check($group)
 {
     global $options;
     global $proxies;
     global $dbc;
     global $captchaBrokenCurrentRun;
     $ranks = array();
     // Host to query: explicit datacenter first, then www.google.<tld>,
     // falling back to www.google.com.
     $domain = "";
     if (!empty($group['options']['datacenter'])) {
         $domain = $group['options']['datacenter'];
     } else {
         if (!empty($group['options']['tld'])) {
             $domain = "www.google." . $group['options']['tld'];
         } else {
             $domain = "www.google.com";
         }
     }
     // NOTE(review): $curl is never read within this method.
     $curl = null;
     foreach ($group['keywords'] as $keyKW => $keyword) {
         // Pick the proxy for this keyword: rotate when configured, otherwise
         // keep using the pool's current proxy.
         if ($options['general']['proxy_auto_rotate'] === "yes") {
             $proxy = $proxies->next();
             if ($proxy == null) {
                 $this->e("No more valid proxy, aborting");
                 $ranks['__have_error'] = 1;
                 return $ranks;
             } else {
                 $this->l("Switched to proxy " . proxyToString($proxy));
             }
         } else {
             $proxy = $proxies->current();
             $this->l("Using current proxy " . proxyToString($proxy));
         }
         $this->l("Checking {$keyword} on {$domain}");
         // $pos counts results across all pages of this keyword (1-based).
         $pos = 1;
         $start_index = 0;
         // init a new session
         // TODO: should be done on proxy switch too for local search
         $this->init_session($domain, $proxy, !empty($group['options']['local']) ? $group['options']['local'] : null);
         // Page loop: one iteration per result page of the current keyword.
         do {
             if ($start_index == 0) {
                 $url = "https://{$domain}/search?q=" . urlencode($keyword);
                 $referrer = "https://{$domain}/";
             } else {
                 // The previous page URL (including any extra parameters)
                 // becomes the referrer of the next page.
                 $referrer = $url;
                 $url = "https://{$domain}/search?q=" . urlencode($keyword) . "&start=" . $start_index;
             }
             if (!empty($group['options']['parameters'])) {
                 $url .= "&" . $group['options']['parameters'];
             }
             //                print_r($opts);
             $fetchRetry = 1;
             // Retry loop: fetch the page, switching proxy after each failure.
             do {
                 $opts = array(CURLOPT_URL => $url, CURLOPT_REFERER => $referrer);
                 $curlout = curl_cache_exec($opts, $proxy, empty($group['options']['local']));
                 // don't use cache if local search
                 $data = $curlout['data'];
                 $http_status = $curlout['status'];
                 $error = false;
                 $doc = new DOMDocument();
                 // switch compares loosely, so a null status also matches `case 0`.
                 switch ($http_status) {
                     case 200:
                         break;
                     case 302:
                         // A redirect is only accepted when it points to the
                         // "/IndexRedirect?continue=" page (captcha / ban handling below).
                         $redir = $curlout['redir'];
                         if (!strstr($redir, "/IndexRedirect?continue=")) {
                             $this->w("Invalid redir");
                             $error = true;
                         } else {
                             //                                $redir = str_replace("://www.google.com/sorry/?continue=", "://".$domain."/sorry/?continue=", $redir);
                             $opts = array(CURLOPT_URL => $redir, CURLOPT_REFERER => $referrer);
                             // From here $data holds the whole response array of the
                             // redirect target (fetched without cache), not a body string.
                             $data = curl_cache_exec($opts, $proxy, false);
                             if ($data['status'] == 403) {
                                 $proxies->ban($proxy);
                                 $this->w("IP banned from google (no captcha), force proxy remove");
                                 $error = true;
                             } else {
                                 $this->w("Google captcha");
                                 if ($dbc == null) {
                                     // No captcha solver available: back off and fail this attempt.
                                     $rateLimitSleepTime = intval($options[get_class($this)]['captcha_basesleep']);
                                     $this->w("DeathByCaptcha not configured, sleeping {$rateLimitSleepTime} seconds");
                                     sleep($rateLimitSleepTime);
                                     $error = true;
                                 } else {
                                     if ($captchaBrokenCurrentRun > CAPTCHA_MAX_RUN) {
                                         // Solver budget for this run exceeded: back off instead.
                                         $rateLimitSleepTime = intval($options[get_class($this)]['captcha_basesleep']);
                                         $this->w("Broke too many captcha ({$captchaBrokenCurrentRun}), sleeping {$rateLimitSleepTime} seconds");
                                         sleep($rateLimitSleepTime);
                                         $error = true;
                                     } else {
                                         $this->w("Handling captcha with DeathByCaptcha (already solved {$captchaBrokenCurrentRun} captchas)");
                                         // $data is replaced by handleCaptcha()'s result; a null
                                         // result is treated as a failed attempt.
                                         $data = $this->handleCaptcha($data['data'], $proxy, $domain, $url);
                                         if ($data == null) {
                                             $error = true;
                                         }
                                     }
                                 }
                             }
                         }
                         // go ninja go
                         break;
                     case 0:
                         $this->w("Curl error " . $curlout['error']);
                         $error = true;
                         break;
                     default:
                         $this->w("Bad retcode " . $http_status);
                         $error = true;
                         break;
                 }
                 if (!$error) {
                     // is it really google
                     if (strstr($data, "window.google=") === FALSE) {
                         $this->w("Not a valid google SERP");
                         //                            file_put_contents("/tmp/noserp_". sha1("".time().rand(0, 10000)), $data);
                         $error = true;
                     }
                 }
                 if (!$error) {
                     // Parser warnings are suppressed with @; only a hard failure counts.
                     if (!@$doc->loadHTML($data)) {
                         $this->w("Can't parse HTML");
                         $error = true;
                     }
                 }
                 if ($error) {
                     // Drop any cached copy of this URL, record the failure, and
                     // remove the proxy once it reaches the configured number of
                     // consecutive fails (rm_bad_proxies > 0 enables removal).
                     rm_cache($url);
                     $nPrxCsfFail = $proxies->fail($proxy);
                     if (intval($options['general']['rm_bad_proxies']) > 0 && $nPrxCsfFail >= intval($options['general']['rm_bad_proxies'])) {
                         $this->w("Removing proxy " . proxyToString($proxy) . " after " . $nPrxCsfFail . " consecutives fails");
                         $proxies->remove($proxy);
                     }
                     // Always move to the next proxy after a failure, even on the
                     // last retry.
                     $proxy = $proxies->next();
                     if ($proxy == null) {
                         $this->e("No more valid proxy, aborting");
                         $ranks['__have_error'] = 1;
                         return $ranks;
                     }
                     $this->w("Previous proxy failed, switched to proxy " . proxyToString($proxy));
                     $this->init_session($domain, $proxy, !empty($group['options']['local']) ? $group['options']['local'] : null);
                 } else {
                     $proxies->success($proxy);
                 }
                 ++$fetchRetry;
             } while ($error && $fetchRetry <= intval($options['general']['fetch_retry']));
             if ($error) {
                 $this->e("Too many consecutive fail ({$fetchRetry}), aborting");
                 $ranks['__have_error'] = 1;
                 return $ranks;
             }
             // Result extraction: every <h3 class="r"> without a style attribute
             // is treated as one organic result.
             $allh3 = $doc->getElementsByTagName('h3');
             foreach ($allh3 as $h3) {
                 if (!$h3->hasAttribute("style") && $h3->getAttribute("class") == "r") {
                     try {
                         $h3_a = $h3->getElementsByTagName('a');
                         if ($h3_a == null || $h3_a->length == 0) {
                             continue;
                         }
                         $href = $h3_a->item(0)->getAttribute('href');
                         $parsed = @parse_url($href);
                         // Results whose link has no parseable host are skipped and
                         // do not advance $pos.
                         if ($parsed !== FALSE && isset($parsed['host'])) {
                             foreach ($group['sites'] as $keySite => $website) {
                                 // if we already have a rank for this keyword, continue
                                 if (isset($ranks[$keyKW][$keySite])) {
                                     continue;
                                 }
                                 // wildcard support
                                 $regex = wd_wildcard_to_preg($website);
                                 if (preg_match($regex, $parsed['host'])) {
                                     $ranks[$keyKW][$keySite][0] = $pos;
                                     $ranks[$keyKW][$keySite][1] = $href;
                                     $this->l("Rank[{$pos}] [{$website}] " . $href);
                                 }
                             }
                             $pos++;
                         }
                     } catch (Exception $e) {
                         $this->e("Parsing error (unexpected bug)");
                     }
                 }
             }
             // Stop paging early once every site of the group has a rank.
             $bAllWebsiteFound = true;
             foreach ($group['sites'] as $keySite => $website) {
                 if (!isset($ranks[$keyKW][$keySite])) {
                     $bAllWebsiteFound = false;
                 }
             }
             $start_index += 10;
             // Pause between pages; duration comes from this class's option section.
             sleep($options[get_class($this)]['page_sleep']);
         } while ($start_index < 100 && !$bAllWebsiteFound);
         $this->incrementProgressBarUnit();
     }
     return $ranks;
 }