/**
 * Collect link entries from a fetched page into the global $output buffer,
 * then bump the global $total counter and hand off to spider().
 *
 * Anchors are consumed in pairs: the first anchor of every pair is skipped
 * and the second one is reported, its text as "Name" and its href as
 * "Profile". An unpaired trailing anchor is therefore ignored.
 *
 * Globals used: $output (appended to), $total (incremented), $id (read).
 *
 * @param string $remote_html Raw HTML markup of the page to scan.
 * @return void
 */
function getPaterons($remote_html) {
    global $output, $id, $total;

    // Build a DOM object from the HTML string.
    $html = str_get_html($remote_html);

    // NOTE(review): str_get_html() is presumably simple_html_dom's, which
    // returns false for empty or oversized input - confirm. Calling ->find()
    // on a non-object is a fatal error, so stop here instead of crashing.
    if (!is_object($html)) {
        trigger_error('getPaterons: unable to parse the supplied HTML', E_USER_WARNING);
        return;
    }

    // True while the next anchor is the one to skip (first of its pair).
    $skipNext = true;

    foreach ($html->find('a') as $element) {
        if ($skipNext) {
            $skipNext = false;
            continue;
        }

        $output .= "<br>";
        $output .= 'Name: ' . $element->plaintext . '<br>';
        $output .= 'Profile: ' . $element->href . '<br>';
        $output .= "---------------------------";
        $output .= "<br>";

        $skipNext = true;
    }

    // Advance the counter and continue with the next spider() call.
    $total++;
    spider($id, $total);
}
/**
 * Entry point used for testing: runs spider() against a fixed start URL and
 * dumps whatever it returns.
 *
 * NOTE: an earlier, disabled draft of this function kept a URL queue in
 * 'url.txt' - it appended every URL returned by spider() to the file and
 * kept reading the next URL to crawl from the same file until it ran out.
 * That draft was never enabled; it iterated over $$result_url_arr (a
 * variable-variable) and wrote '\r\n' in single quotes, so it would need
 * fixing before being revived.
 *
 * @return void
 */
function main() {
    $startUrl = 'http://movie.douban.com';
    $crawlResult = spider($startUrl);

    var_dump($crawlResult);
}
break; } } $ret = deleteFiles($assetsLCL . arg("target")); if (!$ret) { $err = $ERR_FILE_PERMISSION; break; } $ret = copyFiles($assetsLCL . arg("path"), $assetsLCL . arg("target")); if (!$ret) { $err = $ERR_FILE_PERMISSION; break; } break; case "spider": $ret = spider($assetsLCL . arg("path"), $assetsLCL); if (!$ret) { $err = $ERR_FILE_NOT_FOUND; } break; case "touch": if (!in_array(auth_get_class(), array("admin", "supervisor", "dirprod"))) { $err = $ERR_PERMISSION; break; } $ret = touch($assetsLCL . arg("path")); if (!$ret) { $err = $ERR_FILE_PERMISSION; } break; #case "rm":
$header[] = "Cache-Control: no-cache"; $header[] = "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"; $header[] = "Accept-Encoding: gzip, deflate, sdch"; $header[] = "Accept-Language: zh-CN,zh;q=0.8,en;q=0.6"; $curlObj = curl_init(); curl_setopt($curlObj, CURLOPT_USERAGENT, 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36'); // curl_setopt($curlObj, CURLOPT_HTTPHEADER, $header);//带上这个 header反而不对了?? curl_setopt($curlObj, CURLOPT_URL, $url); curl_setopt($curlObj, CURLOPT_RETURNTRANSFER, 1); curl_setopt($curlObj, CURLOPT_PROXY, $proxy); curl_setopt($curlObj, CURLOPT_COOKIEJAR, $cookiefile); curl_setopt($curlObj, CURLOPT_COOKIEFILE, $cookiefile); curl_setopt($curlObj, CURLOPT_FOLLOWLOCATION, 1); //支持跳转 有才条 没有就不跳 curl_setopt($curlObj, CURLOPT_TIMEOUT, 10); $outPut = curl_exec($curlObj); curl_close($curlObj); // echo $outPut; $filename = "./really_" . $page . ".html"; // $filename = "./really_" . time() . ".html"; file_put_contents($filename, $outPut); // sleep(5);//来个变态 $i++; $page++; if ($i < 10) { $fun = __FUNCTION__; $fun(); } } spider();