/**
 * Remove a proxy from the pool and forget its failure counter.
 *
 * Proxies are compared through their proxyToString() representation. When
 * several entries match, the last one is the one removed (same as before).
 * After removal the list is re-indexed so it stays a gap-free list, and
 * $this->iProxy is decremented when the removed slot was at or before it
 * (presumably so the rotation cursor keeps pointing at the same proxy --
 * TODO confirm against next()/current()).
 *
 * Removing a proxy that is not in the pool only clears its failure counter.
 *
 * @param mixed $proxy Proxy descriptor accepted by proxyToString().
 *
 * @return void
 */
public function remove($proxy)
{
    $needle = proxyToString($proxy);
    unset($this->mapFail[$needle]);

    // Must start at -1: without this initialisation an unknown proxy left the
    // variable undefined, passed the "!= -1" test and shifted iProxy by mistake.
    $indexRemove = -1;
    foreach ($this->proxies as $index => $candidate) {
        if (proxyToString($candidate) === $needle) {
            $indexRemove = $index;
        }
    }

    if ($indexRemove === -1) {
        return;
    }

    unset($this->proxies[$indexRemove]);
    $this->proxies = array_values($this->proxies);

    if ($indexRemove <= $this->iProxy) {
        --$this->iProxy;
    }
}
/**
 * Run a cURL request, optionally serving from / storing into the file cache.
 *
 * The cache entry is the file CACHE_DIR . sha1(url). An entry is used only
 * when it is younger than CACHE_LIFETIME * 3600 seconds and can be read;
 * otherwise the request is sent over the network. Only responses with HTTP
 * status 200 and a non-empty body are written to the cache.
 *
 * @param array $curl_opt cURL options; CURLOPT_URL is required. Options from
 *                        buildCurlOptions($proxy) are added with the array
 *                        union operator, so keys already present here win.
 * @param mixed $proxy    Proxy descriptor passed to buildCurlOptions() and
 *                        proxyToString().
 * @param bool  $usecache Whether the cache may be read and written.
 *
 * @return array|null Null when CURLOPT_URL is empty, otherwise an array with
 *                    the keys 'cache' (bool), 'data', 'status', 'cache_age'
 *                    (seconds), 'redir' and 'error'.
 */
function curl_cache_exec($curl_opt, $proxy, $usecache = true)
{
    if (empty($curl_opt[CURLOPT_URL])) {
        return null;
    }

    $curl_opt = $curl_opt + buildCurlOptions($proxy);
    d('Curl', "GET " . $curl_opt[CURLOPT_URL] . " via " . proxyToString($proxy) . " (mem: " . debug_memory() . ")");

    $cacheFile = CACHE_DIR . sha1($curl_opt[CURLOPT_URL]);

    if ($usecache && file_exists($cacheFile)) {
        $cacheAge = time() - filemtime($cacheFile);
        if ($cacheAge < CACHE_LIFETIME * 3600) {
            $cacheData = @file_get_contents($cacheFile);
            if ($cacheData !== false) {
                $response = array(
                    'cache'     => true,
                    'data'      => $cacheData,
                    'status'    => 200, // only 200 responses are ever cached
                    'cache_age' => $cacheAge,
                    'redir'     => null,
                    'error'     => null,
                );
                d('Curl', "GOT status=200 cache=HIT age=" . $response['cache_age'] . " (mem: " . debug_memory() . ")");

                return $response;
            }
            // The entry exists but could not be read. Previously this path
            // returned an undefined $response; now it falls through to a
            // live request instead.
        }
    }

    $ch = curl_init();
    curl_setopt_array($ch, $curl_opt);
    $data = curl_exec($ch);

    $response = array(
        'cache'     => false,
        'data'      => $data,
        'status'    => curl_getinfo($ch, CURLINFO_HTTP_CODE),
        'cache_age' => 0,
        'redir'     => curl_getinfo($ch, CURLINFO_REDIRECT_URL),
        'error'     => curl_error($ch),
    );
    curl_close($ch);

    // curl_exec() returns false on failure, so make sure we really hold a body
    // before measuring or storing it.
    if ($usecache && $response['status'] == 200 && is_string($data) && $data !== '') {
        // Caching is best effort: a failure to create the directory or write
        // the file must not fail the request.
        if (!file_exists(CACHE_DIR)) {
            @mkdir(CACHE_DIR);
        }
        @file_put_contents($cacheFile, $data);
    }

    d('Curl', "GOT status=" . $response['status'] . " cache=MISS age=0 (mem: " . debug_memory() . ")");

    return $response;
}
/**
 * Look up the Google position of every site of a group, keyword by keyword.
 *
 * For each keyword the result pages are fetched ten results at a time
 * (start=0,10,...,90) until every site of the group has a rank or the 100th
 * result has been passed. Each page fetch is retried on another proxy when it
 * fails; the retry budget is $options['general']['fetch_retry'].
 *
 * Reads the globals $options and $proxies (and, through the redirect helper,
 * $dbc and $captchaBrokenCurrentRun). Messages go through $this->l(),
 * $this->w() and $this->e() -- presumably info / warning / error loggers,
 * they are defined outside this block.
 *
 * @param array $group Group definition. Keys read here: 'keywords', 'sites'
 *                     and 'options' ('datacenter', 'tld', 'local',
 *                     'parameters').
 *
 * @return array Ranks indexed as $ranks[keywordKey][siteKey] = array(0 =>
 *               position, 1 => href). The extra key '__have_error' => 1 is
 *               set when the run was aborted (no proxy left or too many
 *               consecutive failures); ranks collected so far are kept.
 */
public function check($group)
{
    global $options;
    global $proxies;

    $ranks = array();
    $domain = $this->resolveSearchDomain($group);
    $local = !empty($group['options']['local']) ? $group['options']['local'] : null;
    // Local searches depend on the session, so they never use the cache.
    $useCache = empty($group['options']['local']);

    foreach ($group['keywords'] as $keyKW => $keyword) {
        if ($options['general']['proxy_auto_rotate'] === "yes") {
            $proxy = $proxies->next();
            if ($proxy == null) {
                $this->e("No more valid proxy, aborting");
                $ranks['__have_error'] = 1;

                return $ranks;
            }
            $this->l("Switched to proxy " . proxyToString($proxy));
        } else {
            $proxy = $proxies->current();
            $this->l("Using current proxy " . proxyToString($proxy));
        }

        $this->l("Checking {$keyword} on {$domain}");

        $pos = 1;
        $start_index = 0;
        $url = null;

        // Start a new session for this keyword.
        $this->init_session($domain, $proxy, $local);

        do {
            // The first page is referred by the home page, later pages by the
            // previously requested result page.
            $referrer = ($start_index == 0) ? "https://{$domain}/" : $url;
            $url = $this->buildSearchUrl($domain, $keyword, $start_index, $group);

            $attempts = 0;
            do {
                ++$attempts;

                $curlout = curl_cache_exec(
                    array(CURLOPT_URL => $url, CURLOPT_REFERER => $referrer),
                    $proxy,
                    $useCache
                );
                $data = $curlout['data'];
                $http_status = $curlout['status'];
                $error = false;
                $doc = new DOMDocument();

                // switch (loose comparison) is kept on purpose: a missing
                // status must still land in the "case 0" branch.
                switch ($http_status) {
                    case 200:
                        break;
                    case 302:
                        $data = $this->resolveCaptchaRedirect($curlout, $proxy, $domain, $url, $referrer);
                        if ($data == null) {
                            $error = true;
                        }
                        break;
                    case 0:
                        $this->w("Curl error " . $curlout['error']);
                        $error = true;
                        break;
                    default:
                        $this->w("Bad retcode " . $http_status);
                        $error = true;
                        break;
                }

                // Sanity check: a real result page carries this marker.
                if (!$error && strpos($data, "window.google=") === false) {
                    $this->w("Not a valid google SERP");
                    $error = true;
                }

                // Real-world HTML is rarely valid; the parser warnings are
                // silenced and only a hard failure counts.
                if (!$error && !@$doc->loadHTML($data)) {
                    $this->w("Can't parse HTML");
                    $error = true;
                }

                if ($error) {
                    rm_cache($url);
                    $nPrxCsfFail = $proxies->fail($proxy);
                    $failLimit = intval($options['general']['rm_bad_proxies']);
                    if ($failLimit > 0 && $nPrxCsfFail >= $failLimit) {
                        $this->w("Removing proxy " . proxyToString($proxy) . " after " . $nPrxCsfFail . " consecutives fails");
                        $proxies->remove($proxy);
                    }

                    $proxy = $proxies->next();
                    if ($proxy == null) {
                        $this->e("No more valid proxy, aborting");
                        $ranks['__have_error'] = 1;

                        return $ranks;
                    }
                    $this->w("Previous proxy failed, switched to proxy " . proxyToString($proxy));
                    $this->init_session($domain, $proxy, $local);
                } else {
                    $proxies->success($proxy);
                }
            } while ($error && $attempts < intval($options['general']['fetch_retry']));

            if ($error) {
                // $attempts is the number of tries actually made (the old
                // counter was one too high when it was logged).
                $this->e("Too many consecutive fail ({$attempts}), aborting");
                $ranks['__have_error'] = 1;

                return $ranks;
            }

            foreach ($doc->getElementsByTagName('h3') as $h3) {
                // Only <h3 class="r"> without a style attribute is treated
                // as a result title.
                if ($h3->hasAttribute("style") || $h3->getAttribute("class") != "r") {
                    continue;
                }

                try {
                    $h3_a = $h3->getElementsByTagName('a');
                    if ($h3_a == null || $h3_a->length == 0) {
                        continue;
                    }

                    $href = $h3_a->item(0)->getAttribute('href');
                    $parsed = @parse_url($href);
                    if ($parsed === false || !isset($parsed['host'])) {
                        // Links without a host do not consume a position.
                        continue;
                    }

                    foreach ($group['sites'] as $keySite => $website) {
                        // Keep the first (best) rank found for this keyword.
                        if (isset($ranks[$keyKW][$keySite])) {
                            continue;
                        }

                        // Site patterns may contain wildcards.
                        $regex = wd_wildcard_to_preg($website);
                        if (preg_match($regex, $parsed['host'])) {
                            $ranks[$keyKW][$keySite][0] = $pos;
                            $ranks[$keyKW][$keySite][1] = $href;
                            $this->l("Rank[{$pos}] [{$website}] " . $href);
                        }
                    }
                    $pos++;
                } catch (Exception $e) {
                    $this->e("Parsing error (unexpected bug)");
                }
            }

            $bAllWebsiteFound = true;
            foreach ($group['sites'] as $keySite => $website) {
                if (!isset($ranks[$keyKW][$keySite])) {
                    $bAllWebsiteFound = false;
                    break;
                }
            }

            $start_index += 10;
            sleep($options[get_class($this)]['page_sleep']);
        } while ($start_index < 100 && !$bAllWebsiteFound);

        $this->incrementProgressBarUnit();
    }

    return $ranks;
}

/**
 * Pick the host to query: explicit datacenter, else www.google.<tld>, else www.google.com.
 *
 * @param array $group Group definition (reads 'options' => 'datacenter' / 'tld').
 *
 * @return string
 */
private function resolveSearchDomain($group)
{
    if (!empty($group['options']['datacenter'])) {
        return $group['options']['datacenter'];
    }
    if (!empty($group['options']['tld'])) {
        return "www.google." . $group['options']['tld'];
    }

    return "www.google.com";
}

/**
 * Build the search URL for one result page of a keyword.
 *
 * @param string $domain      Host to query.
 * @param string $keyword     Search terms (URL-encoded here).
 * @param int    $start_index Offset of the first result; 0 omits the "start" parameter.
 * @param array  $group       Group definition; 'options' => 'parameters' is appended verbatim.
 *
 * @return string
 */
private function buildSearchUrl($domain, $keyword, $start_index, $group)
{
    $url = "https://{$domain}/search?q=" . urlencode($keyword);
    if ($start_index != 0) {
        $url .= "&start=" . $start_index;
    }
    if (!empty($group['options']['parameters'])) {
        $url .= "&" . $group['options']['parameters'];
    }

    return $url;
}

/**
 * Deal with a 302 answer: follow the redirect and try to get past the captcha.
 *
 * Every failure path logs a warning and returns null. The proxy is banned
 * when the redirect target answers 403. The method sleeps for
 * 'captcha_basesleep' seconds when no captcha solver is configured or when
 * $captchaBrokenCurrentRun exceeds CAPTCHA_MAX_RUN.
 *
 * @param array  $curlout  Response of the request that returned 302 (reads 'redir').
 * @param mixed  $proxy    Proxy currently in use.
 * @param string $domain   Host being queried.
 * @param string $url      Search URL that triggered the redirect.
 * @param string $referrer Referrer sent with the redirected request.
 *
 * @return mixed Whatever handleCaptcha() returns on the solving path
 *               (presumably the page body -- confirm there), null otherwise.
 */
private function resolveCaptchaRedirect($curlout, $proxy, $domain, $url, $referrer)
{
    global $options;
    global $proxies;
    global $dbc;
    global $captchaBrokenCurrentRun;

    $redir = $curlout['redir'];
    if (strpos((string) $redir, "/IndexRedirect?continue=") === false) {
        $this->w("Invalid redir");

        return null;
    }

    // Never cached: the answer depends on the proxy, not just on the URL.
    $challenge = curl_cache_exec(
        array(CURLOPT_URL => $redir, CURLOPT_REFERER => $referrer),
        $proxy,
        false
    );
    if ($challenge['status'] == 403) {
        $proxies->ban($proxy);
        $this->w("IP banned from google (no captcha), force proxy remove");

        return null;
    }

    $this->w("Google captcha");

    if ($dbc == null) {
        $rateLimitSleepTime = intval($options[get_class($this)]['captcha_basesleep']);
        $this->w("DeathByCaptcha not configured, sleeping {$rateLimitSleepTime} seconds");
        sleep($rateLimitSleepTime);

        return null;
    }

    if ($captchaBrokenCurrentRun > CAPTCHA_MAX_RUN) {
        $rateLimitSleepTime = intval($options[get_class($this)]['captcha_basesleep']);
        $this->w("Broke too many captcha ({$captchaBrokenCurrentRun}), sleeping {$rateLimitSleepTime} seconds");
        sleep($rateLimitSleepTime);

        return null;
    }

    $this->w("Handling captcha with DeathByCaptcha (already solved {$captchaBrokenCurrentRun} captchas)");

    return $this->handleCaptcha($challenge['data'], $proxy, $domain, $url);
}