/**
 * Fetches a page over HTTP GET through MCurl, after consulting the item backup.
 *
 * The URL is logged via println(), then readItemBackup() is queried. A truthy
 * backup result aborts the call with an exception; otherwise the page is
 * requested through the proxy with timeout 10, up to 999 retries and a retry
 * sleep of 3 (units are defined by MCurl and are not visible here).
 *
 * NOTE(review): throwing on a backup hit looks like leftover debugging, since
 * the backup content is never returned — confirm with callers before changing.
 *
 * @param string $url Address of the page to fetch.
 * @return mixed Whatever MCurl's sendRequest() returns.
 * @throws Exception With message "hit backup" and code 1 when readItemBackup() returns a truthy value.
 */
function grabWebPage($url)
{
    println('get url: ' . $url);

    $backup = readItemBackup($url);
    if ($backup) {
        throw new Exception("hit backup", 1);
    }

    // Reaching this point means the backup lookup was falsy, so always go to the network.
    $request = MCurl::curlGetRequest($url);
    $request->setUseProxy(true);
    $request->setTimeout(10);
    $request->setRetry(999);
    $request->setRetrySleep(3);

    return $request->sendRequest();
}
<?php
# Scrape Yihaodian (yhd.com) categories.
require dirname(__FILE__) . '/script_init.php';

$dbProxy = BaseDal::getDBProxy(DBQUICKSHOP);

// Load the rows whose yhd_parent_cid is 0, ordered by sort_order.
$sql = "SELECT *\r\n FROM yhd_category\r\n WHERE yhd_parent_cid=0\r\n ORDER BY sort_order";
$ret = $dbProxy->rs2array($sql);

foreach ($ret as $category) {
    $cid = $category['yhd_cid'];
    $url = sprintf('http://www.yhd.com/header/ajaxGetGlobalLeftFloatMenuDataV10.do?categoryId=%d', $cid);

    println('sending request: ' . $url);
    $curl = MCurl::curlGetRequest($url);
    $curl->setUseProxy(true);
    $response = $curl->sendRequest();
    println('received response');

    // json_decode() returns null for an empty or malformed body; indexing that
    // result and handing null to preg_match_all() raises warnings on current
    // PHP, so only parse when a string "value" field is actually present.
    $json = json_decode((string) $response, true);
    if (is_array($json) && isset($json['value']) && is_string($json['value'])) {
        grepHtml($json['value'], $cid);
    } else {
        println('skipping category ' . $cid . ': response has no usable "value" field');
    }
    println();
}

//////////////////////////////////

/**
 * Extracts every <dl>...</dl> fragment from the markup and passes each one to grepDl().
 *
 * @param string $html       Markup to scan.
 * @param mixed  $parent_cid Category id forwarded unchanged to grepDl().
 * @return void
 */
function grepHtml($html, $parent_cid)
{
    // Non-greedy match so that each <dl> block is captured separately.
    $pattern = '/<dl[\\s\\S]*?<\\/dl>/';
    if (!preg_match_all($pattern, $html, $reg)) {
        return;
    }

    foreach ($reg[0] as $dl) {
        grepDl($dl, $parent_cid);
    }
}

// The definition below is cut off in this excerpt; its header is kept verbatim.
function grepDl($html, $parent_cid)
/**
 * Downloads a set of URLs in chunks and stores either the raw bodies or the
 * parsed API payloads into $res.
 *
 * The transport is selected by $config['pars']: 'curl' uses the project CURL
 * class, 'mcurl' uses the project MCurl class, anything else uses PHP's
 * curl_multi_* functions directly. URL keys are preserved across chunking; in
 * the built-in transport they are also the keys of the downloaded results
 * (for the CURL/MCurl transports the result keys are whatever those classes
 * produce — presumably the same keys; verify against their implementation).
 *
 * NOTE(review): $transit is received by value, so the value returned by
 * insert_stat() is only carried between iterations inside this call and is
 * not visible to the caller — confirm whether that is intended.
 *
 * @param array $inurls  Map of key => URL to download.
 * @param array $res     Output array, filled by reference (key => body or parsed data).
 * @param array $config  Reads 'pars' (transport) and 'multiget' (chunk size); also forwarded
 *                       to insert_stat() and pars_data2().
 * @param mixed $transit Passed to insert_stat() and replaced by its return value.
 * @param array $roster  Indexed by the same keys as the results; the matching entry is forwarded
 *                       to insert_stat() and pars_data2().
 * @param mixed $lang    Forwarded to pars_data2().
 * @param int   $multi   Non-zero: copy raw bodies into $res. Zero: decode each body as JSON and,
 *                       when status is 'ok' and status_code is 'NO_ERROR', parse and cache it.
 * @return void
 */
function multiget($inurls, &$res, $config, $transit, $roster, $lang, $multi = 0)
{
    global $cache;

    $timeout = 10;
    $tcurl = isset($config['pars']) ? $config['pars'] : '';

    // array_chunk() rejects a chunk size below 1, so clamp a missing or zero setting.
    $num = isset($config['multiget']) ? max(1, (int) $config['multiget']) : 1;

    foreach (array_chunk($inurls, $num, true) as $urls) {
        $result = array();

        if ($tcurl == 'curl') {
            $curl = new CURL();
            $curl->retry = 2;
            $opts = array(CURLOPT_RETURNTRANSFER => true, CURLOPT_CONNECTTIMEOUT => $timeout);
            foreach ($urls as $key => $link) {
                $curl->addSession($link, $key, $opts);
            }
            $result = $curl->exec();
            $curl->clear();
        } elseif ($tcurl == 'mcurl') {
            $curl = new MCurl();
            $curl->threads = 100;
            $curl->timeout = 15;
            $curl->sec_multiget($urls, $result);
        } else {
            $handles = array();
            $mh = curl_multi_init();
            foreach ($urls as $key => $link) {
                $h = curl_init();
                curl_setopt($h, CURLOPT_URL, $link);
                curl_setopt($h, CURLOPT_RETURNTRANSFER, 1);
                curl_setopt($h, CURLOPT_FAILONERROR, true);
                curl_setopt($h, CURLOPT_CONNECTTIMEOUT, $timeout);
                curl_setopt($h, CURLOPT_HTTPHEADER, array("X-Requested-With: XMLHttpRequest", "Accept: text/html, */*", "User-Agent: Mozilla/3.0 (compatible; easyhttp)", "Connection: Keep-Alive"));
                curl_multi_add_handle($mh, $h);
                $handles[$key] = $h;
            }

            $running = null;
            do {
                $status = curl_multi_exec($mh, $running);
                // Wait for socket activity instead of spinning the CPU; when
                // select itself fails (-1), back off briefly before retrying.
                if ($running > 0 && curl_multi_select($mh, 1.0) === -1) {
                    usleep(1000);
                }
            } while ($running > 0 && ($status === CURLM_OK || $status === CURLM_CALL_MULTI_PERFORM));

            foreach ($handles as $key => $h) {
                $result[$key] = curl_multi_getcontent($h);
                curl_multi_remove_handle($mh, $h);
                // Release each easy handle; the multi handle does not own them.
                curl_close($h);
            }
            curl_multi_close($mh);
        }

        // A transport that produced nothing iterable contributes no entries.
        if (!is_array($result)) {
            continue;
        }

        if ($multi != 0) {
            foreach ($result as $name => $val) {
                $res[$name] = $val;
            }
            continue;
        }

        foreach ($result as $name => $val) {
            // Failed transfers yield null/false/garbage; skip anything that is
            // not a JSON object carrying both status fields.
            $json = is_string($val) ? json_decode($val, true) : null;
            if (!is_array($json) || !isset($json['status'], $json['status_code'])) {
                continue;
            }
            if ($json['status'] === 'ok' && $json['status_code'] === 'NO_ERROR') {
                $transit = insert_stat($json, $roster[$name], $config, $transit);
                $res[$name] = pars_data2($json, $name, $config, $lang, $roster[$name]);
                $cache->set($name, $res[$name], ROOT_DIR . '/cache/players/');
            }
        }
    }
}
/**
 * Downloads a single URL and returns its body.
 *
 * The transport is selected by $config['pars']: 'curl' uses the project CURL
 * class (retry = 4), 'mcurl' uses the project MCurl class, anything else uses
 * a plain curl handle with a connect timeout of 2 and JSON/XHR request headers.
 *
 * @param string $link   URL to fetch.
 * @param array  $config Reads 'pars' to pick the transport.
 * @return mixed Element 0 of the transport's result; an empty string when the
 *               transfer failed or the transport produced no element 0.
 */
function get_url($link, $config)
{
    $mode = isset($config['pars']) ? $config['pars'] : '';
    $result = array();

    if ($mode == 'curl') {
        $curl = new CURL();
        $curl->retry = 4;
        $curl->addSession($link, 0, array(CURLOPT_RETURNTRANSFER => true));
        $result = $curl->exec();
        $curl->clear();
    } elseif ($mode == 'mcurl') {
        // Kept in a variable in case sec_multiget() takes the URL list by reference.
        $url = array($link);
        $curl = new MCurl();
        $curl->threads = 100;
        $curl->timeout = 15;
        $curl->sec_multiget($url, $result);
    } else {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $link);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_FAILONERROR, true);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 2);
        curl_setopt($ch, CURLOPT_HTTPHEADER, array('Accept: application/json', 'X-Requested-With: XMLHttpRequest', 'Connection: Keep-Alive'));
        $data = curl_exec($ch);
        curl_close($ch);

        // curl_exec() returns false on failure; callers get an empty body instead.
        $result[0] = ($data === false) ? '' : $data;
    }

    // Every transport reports failure the same way: an empty string rather
    // than an undefined-offset notice.
    return isset($result[0]) ? $result[0] : '';
}
/**
 * Downloads several URLs and writes the responses into $result.
 *
 * @param array  $urls   Map of key => URL; in 'curl' mode each key is passed to CURL::addSession().
 * @param mixed  $result Output, filled by reference. In 'curl' mode it is replaced by the
 *                       return value of CURL::exec(); otherwise it is reset to an empty array
 *                       and handed to MCurl::sec_multiget() to fill.
 * @param string $tcurl  'curl' selects the CURL class; any other value selects MCurl.
 * @return void
 */
function multiget($urls, &$result, $tcurl = 'curl')
{
    if ($tcurl == 'curl') {
        $curl = new CURL();
        $opts = array(CURLOPT_RETURNTRANSFER => true);
        foreach ($urls as $key => $link) {
            $curl->addSession($link, $key, $opts);
        }
        $result = $curl->exec();
        $curl->clear();
        return;
    }

    $curl = new MCurl();
    $curl->threads = 100;
    $curl->timeout = 15;

    // The original called unset($results) — a misspelled name, so nothing was
    // cleared. Assign an empty array instead of unset($result): unsetting a
    // by-reference parameter would detach it from the caller's variable.
    $result = array();
    $curl->sec_multiget($urls, $result);
}