} $a = array_merge($a, array('__EVENTTARGET' => $eventtarget)); $a = array_merge($a, array('__EVENTARGUMENT' => '')); return $a; } # By default, assume it is single page $dataset = $dom->find("tr[class=rgRow], tr[class=rgAltRow]"); $NumPages = count($dom->find('div[class=rgWrap rgNumPart] a')); if ($NumPages === 0) { $NumPages = 1; } for ($i = 1; $i <= $NumPages; $i++) { # If more than a single page, fetch the page if ($NumPages > 1) { $eventtarget = substr($dom->find('div[class=rgWrap rgNumPart] a', $i - 1)->href, 25, 61); $request = array('http' => array('method' => 'POST', 'header' => 'Content-Type: application/x-www-form-urlencoded\\r\\n', 'content' => http_build_query(buildformdata($dom, $eventtarget)))); $context = stream_context_create($request); $html = file_get_html($da_page, false, $context); $dataset = $html->find("tr[class=rgRow], tr[class=rgAltRow]"); echo "Scraping page {$i} of {$NumPages}\r\n"; } # The usual, look for the data set and if needed, save it foreach ($dataset as $record) { # Slow way to transform the date but it works $date_received = explode(' ', trim($record->children(2)->plaintext), 2); $date_received = explode('/', $date_received[0]); $date_received = "{$date_received['2']}-{$date_received['1']}-{$date_received['0']}"; # Get the address $address = $record->find('td', 3)->plaintext; $tokens = explode('</br>', $address); $address = trim($tokens[0]) . ", NSW";
$cookies = accept_terms_get_cookies($term_url, "Agree"); # Manually set cookie's key and get the value from array $request = array('http' => array('header' => "Cookie: ASP.NET_SessionId=" . $cookies['ASP_NET_SessionId'] . "; path=/; HttpOnly\r\n" . "{$user_agent}\r\n")); $context = stream_context_create($request); $dom = file_get_html($da_page, false, $context); # By default, assume it is single page $dataset = $dom->find("tr[class=rgRow], tr[class=rgAltRow]"); $NumPages = count($dom->find('div[class=rgWrap rgNumPart] a')); if ($NumPages === 0) { $NumPages = 1; } for ($i = 1; $i <= $NumPages; $i++) { # If more than a single page, fetch the page if ($NumPages > 1) { $eventtarget = substr($dom->find('div[class=rgWrap rgNumPart] a', $i - 1)->href, 25, 61); $request = array('http' => array('method' => "POST", 'header' => "Cookie: ASP.NET_SessionId=" . $cookies['ASP_NET_SessionId'] . "; path=/; HttpOnly\r\n" . "Content-Type: application/x-www-form-urlencoded\r\n" . "{$user_agent}\r\n", 'content' => http_build_query(buildformdata($dom, $eventtarget)))); $context = stream_context_create($request); $html = file_get_html($da_page, false, $context); $dataset = $html->find("tr[class=rgRow], tr[class=rgAltRow]"); echo "Scraping page {$i} of {$NumPages}\r\n"; } # The usual, look for the data set and if needed, save it foreach ($dataset as $record) { # Slow way to transform the date but it works $date_received = explode(' ', trim($record->find('td', 2)->plaintext)); $date_received = "{$date_received['2']}-{$date_received['1']}-{$date_received['0']}"; $date_received = date('Y-m-d', strtotime($date_received)); # Prep a bit more, ready to add these to the array $tempstr = explode('<br/>', $record->find('td', 3)->innertext); $addr = trim(html_entity_decode($tempstr[0])); $addr = explode('<strong>', $addr);
$doPostBackAry[] = array('__EVENTTARGET' => $eventTarget[1], '__EVENTARGUMENT' => $eventArgument[1]); } }

# Walk every result page. On each pass the rows currently held in $dataset
# are parsed and stored; afterwards, unless this was the last page, the next
# page is requested with a form POST and $dataset is replaced by its rows.
for ($i = 1; $i <= $NumPages; $i++) {
    echo "Scraping page {$i} of {$NumPages}\r\n";

    # The usual, look for the data set and if needed, save it
    foreach ($dataset as $record) {
        # The second cell starts with a d/m/Y date; keep the part before the
        # first space and reorder it to Y-m-d so strtotime() reads it reliably.
        $date_received = explode(' ', trim($record->find("td", 1)->plaintext));
        $date_received = explode('/', trim($date_received[0]));
        $date_received = "{$date_received['2']}-{$date_received['1']}-{$date_received['0']}";

        # The first link in the row carries the council reference; it is also
        # appended to $info_url to form both URLs below.
        $council_reference = trim($record->find('a', 0)->plaintext);

        # Put all information in an array
        $application = array(
            'council_reference' => $council_reference,
            'address'           => preg_replace('/\\s+/', ' ', trim($record->find("a", 1)->plaintext)) . ", Australia",
            'description'       => preg_replace('/\\s+/', ' ', trim($record->find("td", 2)->plaintext)),
            'info_url'          => $info_url . $council_reference,
            'comment_url'       => $info_url . $council_reference,
            'date_scraped'      => date('Y-m-d'),
            'date_received'     => date('Y-m-d', strtotime($date_received)),
        );

        # Check if record exist, if not, INSERT, else do nothing.
        # The reference is scraped text, so double any single quote before
        # embedding it in the SQL string literal.
        $escaped_reference = str_replace("'", "''", $council_reference);
        $existingRecords = scraperwiki::select("* from data where `council_reference`='" . $escaped_reference . "'");

        if (count($existingRecords) === 0) {
            print "Saving record " . $council_reference . "\n";
            # print_r ($application);
            scraperwiki::save(array('council_reference'), $application);
        } else {
            print "Skipping already saved record " . $council_reference . "\n";
        }
    }

    # If more than a single page, advance to the next page
    if ($NumPages > 1 && $i < $NumPages) {
        # The form fields are always rebuilt from $dom (the page loaded before
        # this loop), combined with the postback target recorded for this step.
        $postback = $doPostBackAry[$i - 1];
        $request = array(
            'http' => array(
                'method'  => "POST",
                'header'  => "Content-Type: application/x-www-form-urlencoded\r\n",
                'content' => http_build_query(buildformdata($dom, $postback['__EVENTTARGET'], $postback['__EVENTARGUMENT'])),
            ),
        );
        $context = stream_context_create($request);
        $html = file_get_html($da_page, false, $context);

        # Fail with a clear message instead of a fatal "member function on
        # bool" error when the page could not be fetched or parsed.
        if (!$html) {
            throw new RuntimeException("Failed to fetch page " . ($i + 1) . " of {$NumPages} from {$da_page}");
        }

        $dataset = $html->find("tr[class=normalRow], tr[class=alternateRow]");
    }
}