-
Notifications
You must be signed in to change notification settings - Fork 1
/
scraper.php
101 lines (84 loc) · 3.86 KB
/
scraper.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
<?php
### Wingecarribee Shire Council scraper - ApplicationMaster
require_once 'vendor/autoload.php';
require_once 'vendor/openaustralia/scraperwiki/scraperwiki.php';
use PGuardiario\PGBrowser;
use Sunra\PhpSimple\HtmlDomParser;
# Make PHP's date functions work in Sydney local time.
date_default_timezone_set('Australia/Sydney');

# Search window requested from the council site. The MORPH_PERIOD environment
# variable may select 'thismonth' or 'lastmonth' (handy for data recovery);
# any other value, or no value at all, falls back to 'thisweek'.
$requested_period = getenv('MORPH_PERIOD');
if (in_array($requested_period, ['thismonth', 'lastmonth'], true)) {
    $period = $requested_period;
} else {
    $period = 'thisweek';
}

# Root of the ApplicationMaster module; relative links are appended to it.
$url_base = 'https://datracker.wsc.nsw.gov.au/Modules/applicationmaster/';

# Search results URL: the chosen period is passed as parameter "1", the other
# parameters are fixed.
$da_page = "{$url_base}default.aspx?page=found&1={$period}&4a=WLUA,82AReview,CDC,DA,Mods&6=F";

# Prefix of the comment link; the council reference is appended to the subject.
$comment_base = 'mailto:mail@wsc.nsw.gov.au?subject=Development Application Enquiry: ';
# Accept the terms: open the module's landing page, set the "Agree" button
# field on its form and submit it. The response is not needed; only the
# browser object is reused for the following request.
$browser = new PGBrowser();
$terms_form = $browser->get($url_base)->form();
$terms_form->set('ctl00$cphContent$ctl01$Button2', 'Agree');
$terms_form->submit();

# Fetch the list of development applications and parse the returned HTML.
$page = $browser->get($da_page);
$dom = HtmlDomParser::str_get_html($page->html);

# Result rows of the page just loaded (regular and alternating rows). This is
# the data set used as-is when the results fit on a single page.
$dataset = $dom->find("tr[class=rgRow], tr[class=rgAltRow]");

# The pager contributes one link per page; no links at all means one page.
$NumPages = max(1, count($dom->find('div[class=rgWrap rgNumPart] a')));
# Walk every results page and save each application that is not already stored.
for ($i = 1; $i <= $NumPages; $i++) {
    echo "Scraping page $i of $NumPages\n";

    # With more than one page, every page (the first included) is loaded by
    # replaying the postback target held in its pager link. $page and $dom are
    # replaced each time, so the next link is read from the page just fetched.
    if ($NumPages > 1) {
        $doPostBack = $dom->find('div[class=rgWrap rgNumPart] a')[$i - 1]->href;
        $form = $page->form();
        $page = $form->doPostBack($doPostBack);
        $dom = HtmlDomParser::str_get_html($page->html);
        $dataset = $dom->find("tr[class=rgRow], tr[class=rgAltRow]");
    }

    foreach ($dataset as $record) {
        # Third cell holds the date. Its three space-separated parts are
        # reversed and joined with '-' before strtotime() normalises them
        # (presumably the site prints "day month year" - confirm against the page).
        $date_parts = explode(' ', trim($record->find('td', 2)->plaintext));
        $date_received = date('Y-m-d', strtotime("$date_parts[2]-$date_parts[1]-$date_parts[0]"));

        # Fourth cell is split on '<br/>': the address is the text inside the
        # <strong> element of the first part, the description is the second part.
        # Runs of whitespace are collapsed to single spaces in both.
        $tempstr = explode('<br/>', $record->find('td', 3)->innertext);
        $addr = trim(html_entity_decode($tempstr[0]));
        $addr = explode('<strong>', $addr);
        $addr = explode('</strong>', $addr[1]);
        $addr = preg_replace('/\s+/', ' ', $addr[0]);
        $desc = trim(html_entity_decode($tempstr[1]));
        $desc = preg_replace('/\s+/', ' ', $desc);

        # Second cell is the council reference, normalised the same way.
        $council_reference = trim(html_entity_decode($record->find('td', 1)->plaintext));
        $council_reference = preg_replace('/\s+/', ' ', $council_reference);

        # The href comes straight from the markup, so decode entities such as
        # '&amp;' before appending it to the base URL (no-op when none occur).
        $info_url = $url_base . html_entity_decode(trim($record->find('a', 0)->href));

        # Put all information in an array
        $application = [
            'council_reference' => $council_reference,
            'address' => $addr,
            'description' => $desc,
            'info_url' => $info_url,
            'comment_url' => $comment_base . $council_reference,
            'date_scraped' => date('Y-m-d'),
            'date_received' => $date_received
        ];

        # Only insert records that are not stored yet. The reference is scraped
        # text, so single quotes are doubled to keep it inside the SQL string
        # literal; otherwise an apostrophe would break or alter the query.
        $quoted_reference = str_replace("'", "''", $application['council_reference']);
        $existingRecords = scraperwiki::select("* from data where `council_reference`='" . $quoted_reference . "'");
        if (count($existingRecords) === 0) {
            print ("Saving record " . $application['council_reference'] . " - " . $application['address'] . "\n");
            scraperwiki::save(array('council_reference'), $application);
        } else {
            print ("Skipping already saved record " . $application['council_reference'] . "\n");
        }
    }
}
?>