/**
 * Fetch one page of search results for a single date from the Wealden
 * planning site and record every application found in the global
 * $applications array, keyed by application number.
 *
 * If the site answers with its copyright/disclaimer page instead of results,
 * the page's __VIEWSTATE value is posted back with the "Accept" button and the
 * response to that POST is parsed instead.
 *
 * For each result with a non-empty application number the keys 'AppNo',
 * 'Address', 'PostCode' (false when no postcode is found in the address) and
 * 'Info' are stored, then parse_detail() is called for that application.
 * When the pager markup contains a "Next" link the function calls itself for
 * the following page, stopping after page 5.
 *
 * @param string $date    Date placed verbatim into both StartDate and EndDate
 *                        (the format the site expects is not visible here).
 * @param int    $page_no Result page number to request.
 * @param string $AppRef  Value placed verbatim into the AppRef query parameter.
 *
 * @return void
 */
function parse_search($date, $page_no = 1, $AppRef = '')
{
    global $applications;

    $base = 'http://www.planning.wealden.gov.uk/aspxpages/';

    // The results page and the copyright page take exactly the same query string.
    $query = '?pageno=' . $page_no
        . '&QueryType=9&WeekNo=&WeekStart=&WeekEnd=&CaseNo=&Add=&ShowInd=&DocId=&AppRef=' . $AppRef
        . '&Category=DC&DateType=R&StartDate=' . $date . '&EndDate=' . $date
        . '&Agent=&ParishCode=&WardCode=&Parish=&Ward=&AdvAppNo=&AdvAdd=&AdvProposal=&DecisionCode=&Det=';

    $data = fetch_page($base . 'SearchResults.aspx' . $query);
    if (!is_string($data) || $data === '') {
        // Nothing was fetched, so there is nothing to parse.
        return;
    }

    $copyright_title = "<title>Wealden District Council's applications online - Copyright, disclaimer & personal data</title>";
    // strpos() returns false when absent and may return 0 when found, so compare strictly.
    if (strpos($data, $copyright_title) !== false) {
        // Accept their terms: pull the hidden __VIEWSTATE value out of the form.
        $pieces = explode('<input type="hidden" name="__VIEWSTATE" value="', $data, 2);
        $viewstate = isset($pieces[1]) ? $pieces[1] : '';
        $pieces = explode('" />', $viewstate, 2);
        $viewstate = $pieces[0];

        // NOTE(review): the meaning of the third fetch_page() argument (2) is
        // defined by fetch_page(), which is not visible here.
        $data = fetch_page(
            $base . 'Copyright.aspx' . $query,
            'btnCopyrightAccept=Accept&__VIEWSTATE=' . urlencode($viewstate),
            2
        );
        if (!is_string($data)) {
            return;
        }
    }

    // Keep only the markup that follows the results marker.
    $pieces = explode('<span id="lblSearchResults">', $data);
    if (!isset($pieces[1])) {
        // No results section on this page.
        return;
    }

    // Split the result list from the pager markup that follows it.
    $pieces = explode('<div id="pagenumbers">', $pieces[1]);
    $results = $pieces[0];
    $next_page = isset($pieces[1]) ? $pieces[1] : '';

    $chunks = explode('</ul>', $results);
    // Drop the chunk at index 10 - presumably the trailing markup after the
    // tenth result list; TODO confirm against the live page.
    unset($chunks[10]);

    $AppNo = '';
    foreach ($chunks as $application) {
        $application = explode('</li>', $application);
        $AppNo = extract_data($application[0]);
        if (!empty($AppNo)) {
            $applications[$AppNo]['AppNo'] = $AppNo;

            $Loc = extract_data($application[1]);
            $applications[$AppNo]['Address'] = $Loc;

            // Look for something shaped like a UK postcode inside the address.
            preg_match("/([A-Z]{1,2}[0-9][0-9A-Z]?\\s?[0-9][A-Z]{2})/", $Loc, $PostCode);
            if (isset($PostCode[1])) {
                $applications[$AppNo]['PostCode'] = $PostCode[1];
            } else {
                $applications[$AppNo]['PostCode'] = false;
            }

            $applications[$AppNo]['Info'] = extract_data($application[2]);
            parse_detail($AppNo);
        }
    }

    if (strpos($next_page, 'Next</a></div></span> <br />') !== false) {
        $page_no++;
        // Never go beyond page 5.
        if ($page_no < 6) {
            // NOTE(review): the last application number seen on this page is
            // passed as AppRef for the next request, as the original code did;
            // confirm that this is what the site's pager expects.
            parse_search($date, $page_no, $AppNo);
        }
    }
}
/**
 * Fetch one page of search results for the date held in the globals $day,
 * $month and $year, and record every application found in the global
 * $applications array, keyed by application number.
 *
 * The request URL is built from $xml['url']. For each '<div class="result">'
 * section the keys 'AppNo', 'Info', 'Address' and 'PostCode' (false when no
 * postcode is found in the address) are stored, then parse_detail() is called
 * for that application. When the last result's trailing markup contains the
 * "Next 10 applications" image the function calls itself for the next page.
 *
 * @param int|string $page Result page number to request (1-based).
 *
 * @return void
 */
function parse_search($page = 1)
{
    global $applications, $day, $month, $year, $xml;

    // Translate the page number into the START_ROW / FIRST_TEN_SHOWN pair used
    // in the query: page 1 -> row 1 / 'N', page 2 -> row 1 / 'Y',
    // page 3 -> row 11 / 'N', page 4 -> row 21 / 'N', and so on.
    $start = $page * 10 - 19;
    if ($start < 0) {
        $start = 1;
    }
    if ($page == '2') {
        $shown = 'Y';
        $start = 1;
    } else {
        $shown = 'N';
    }

    $url = $xml['url']
        . '?Controller=p2Controller&Action=FindApplicationsByDatesAction'
        . '&START_DD=' . $day . '&START_MMM=' . $month . '&START_YYYY=' . $year
        . '&END_DD=' . $day . '&END_MMM=' . $month . '&END_YYYY=' . $year
        . '&WARD=ALL&CURR=&DECSN=&START_ROW=' . $start
        . '&FIRST_TEN_SHOWN=' . $shown . '&SEARCH_DIRECTION=F';

    $html = fetch_page($url);
    if (!is_string($html)) {
        // Nothing was fetched, so there is nothing to parse.
        return;
    }

    $results = explode('<div class="result">', $html);
    // Everything before the first result marker is page chrome, not a result.
    unset($results[0]);

    // Trailing markup of the most recent result. It is tracked explicitly so
    // the pager check below is well-defined even when the page has no results.
    $last_tail = '';

    foreach ($results as $result) {
        $app = explode('</span>', $result);

        $AppNo = trim(strip_tags($app[0]));
        $applications[$AppNo]['AppNo'] = $AppNo;

        // The third '</span>' piece holds the description and the address,
        // separated by '<br/>'; either may be missing on malformed rows.
        $last_tail = isset($app[2]) ? $app[2] : '';
        $pieces = explode('<br/>', $last_tail);
        $info = $pieces[0];
        $address = isset($pieces[1]) ? $pieces[1] : '';

        $applications[$AppNo]['Info'] = trim(strip_tags($info));
        $applications[$AppNo]['Address'] = trim(strip_tags($address));

        // Look for something shaped like a UK postcode inside the address.
        preg_match("/([A-Z]{1,2}[0-9][0-9A-Z]?\\s?[0-9][A-Z]{2})/", $address, $PostCode);
        if (isset($PostCode[1])) {
            $applications[$AppNo]['PostCode'] = $PostCode[1];
        } else {
            $applications[$AppNo]['PostCode'] = false;
        }

        parse_detail($AppNo);
    }

    // strpos() returns false when absent and may return 0 when found, so compare strictly.
    if (strpos($last_tail, 'alt="Next 10 applications"') !== false) {
        parse_search($page + 1);
    }
}
} if ($tmp && strpos($tmp, 'shop-title')) { echo "[info] " . ($end - $start) . "s get " . $value . " success\n"; //红色模板 break; } sleep(1); $try_count--; } if (!$tmp) { echo "[error] get " . $value . " error \n"; continue; } if ($tmp && !strpos($tmp, 'site-nav') && !strpos($tmp, 'shop-title')) { echo "[error] get " . $value . " error \n"; continue; } file_put_contents('html/' . $city_key . '/' . substr(strrchr($value, '/'), 1) . '.html', $tmp); } echo "[info] get all success\n"; } else { exit('执行出错'); } } //初始化抓取url $url_arr = init_grap(); //生成商店url $detail_url_arr = get_details_url($url_arr, false); //生成缓存html文件 parse_detail($detail_url_arr);