}
    $a = array_merge($a, array('__EVENTTARGET' => $eventtarget));
    $a = array_merge($a, array('__EVENTARGUMENT' => ''));
    return $a;
}
# Scrape the result grid: take the row set from the already-loaded first
# response, then, when pager links exist, re-request every page through a
# form postback and process its rows.
# Relies on $dom and $da_page being set earlier in the script and on the
# buildformdata() helper. The loop bodies continue past this fragment.
# By default, assume it is single page
$dataset = $dom->find("tr[class=rgRow], tr[class=rgAltRow]");
$NumPages = count($dom->find('div[class=rgWrap rgNumPart] a'));
if ($NumPages === 0) {
    # No pager links were found, so only the already-loaded page is processed
    $NumPages = 1;
}
for ($i = 1; $i <= $NumPages; $i++) {
    # If more than a single page, fetch the page
    if ($NumPages > 1) {
        # The postback target is cut out of the pager link's href at a fixed
        # offset and length (25, 61) -- presumably the first argument of a
        # javascript:__doPostBack('...') link; TODO confirm against the live markup
        $eventtarget = substr($dom->find('div[class=rgWrap rgNumPart] a', $i - 1)->href, 25, 61);
        # The header has to be double-quoted so that \r\n is a real CRLF; inside
        # single quotes the backslashes are sent literally and corrupt the
        # Content-Type value
        $request = array('http' => array('method' => 'POST', 'header' => "Content-Type: application/x-www-form-urlencoded\r\n", 'content' => http_build_query(buildformdata($dom, $eventtarget))));
        $context = stream_context_create($request);
        $html = file_get_html($da_page, false, $context);
        $dataset = $html->find("tr[class=rgRow], tr[class=rgAltRow]");
        echo "Scraping page {$i} of {$NumPages}\r\n";
    }
    # The usual, look for the data set and if needed, save it
    foreach ($dataset as $record) {
        # Slow way to transform the date but it works: keep the text before the
        # first space of the third child cell, split it on '/', and rejoin the
        # parts in reverse order with '-' (presumably d/m/Y -> Y-m-d; confirm)
        $date_received = explode(' ', trim($record->children(2)->plaintext), 2);
        $date_received = explode('/', $date_received[0]);
        $date_received = "{$date_received['2']}-{$date_received['1']}-{$date_received['0']}";
        # Get the address: first '</br>'-separated piece of the fourth cell's
        # text, trimmed, with the ", NSW" suffix appended
        $address = $record->find('td', 3)->plaintext;
        $tokens = explode('</br>', $address);
        $address = trim($tokens[0]) . ", NSW";
示例#2
0
# Fetch the listing page with a session cookie, then loop over the result
# pages and pre-process the date and address cells of every row.
# Relies on $term_url, $user_agent, $da_page and the helpers
# accept_terms_get_cookies() / buildformdata() defined elsewhere.
# NOTE(review): presumably this call accepts the site's terms page and returns
# the cookies it was given -- confirm against the helper's definition.
$cookies = accept_terms_get_cookies($term_url, "Agree");
# Manually set cookie's key and get the value from array
# NOTE(review): "; path=/; HttpOnly" look like Set-Cookie attributes rather
# than something a Cookie request header normally carries -- confirm they are
# needed. $user_agent is appended as a header line of its own, so presumably it
# already holds a complete "User-Agent: ..." header -- TODO confirm.
$request = array('http' => array('header' => "Cookie: ASP.NET_SessionId=" . $cookies['ASP_NET_SessionId'] . "; path=/; HttpOnly\r\n" . "{$user_agent}\r\n"));
$context = stream_context_create($request);
$dom = file_get_html($da_page, false, $context);
# By default, assume it is single page
$dataset = $dom->find("tr[class=rgRow], tr[class=rgAltRow]");
# One pager link per page; zero links means only the loaded page exists
$NumPages = count($dom->find('div[class=rgWrap rgNumPart] a'));
if ($NumPages === 0) {
    $NumPages = 1;
}
for ($i = 1; $i <= $NumPages; $i++) {
    # If more than a single page, fetch the page
    if ($NumPages > 1) {
        # The postback target is taken from the pager link's href at a fixed
        # offset and length (25, 61) -- presumably the first argument of a
        # javascript:__doPostBack('...') link; TODO confirm
        $eventtarget = substr($dom->find('div[class=rgWrap rgNumPart] a', $i - 1)->href, 25, 61);
        # POST the form data built from the first response, with the same
        # cookie and user-agent header lines as the initial request
        $request = array('http' => array('method' => "POST", 'header' => "Cookie: ASP.NET_SessionId=" . $cookies['ASP_NET_SessionId'] . "; path=/; HttpOnly\r\n" . "Content-Type: application/x-www-form-urlencoded\r\n" . "{$user_agent}\r\n", 'content' => http_build_query(buildformdata($dom, $eventtarget))));
        $context = stream_context_create($request);
        $html = file_get_html($da_page, false, $context);
        $dataset = $html->find("tr[class=rgRow], tr[class=rgAltRow]");
        echo "Scraping page {$i} of {$NumPages}\r\n";
    }
    # Walk the rows of the current page (nothing is saved within the visible lines)
    foreach ($dataset as $record) {
        # Slow way to transform the date but it works: split the third cell's
        # text on spaces, rejoin parts 2-1-0 with '-', and let strtotime/date
        # normalise the result to Y-m-d (presumably the cell reads
        # "day month year" -- confirm)
        $date_received = explode(' ', trim($record->find('td', 2)->plaintext));
        $date_received = "{$date_received['2']}-{$date_received['1']}-{$date_received['0']}";
        $date_received = date('Y-m-d', strtotime($date_received));
        # Prep a bit more, ready to add these to the array: take the part of the
        # fourth cell's inner HTML before the first '<br/>', decode its HTML
        # entities, then split it on '<strong>'
        $tempstr = explode('<br/>', $record->find('td', 3)->innertext);
        $addr = trim(html_entity_decode($tempstr[0]));
        $addr = explode('<strong>', $addr);
        # NOTE(review): $eventTarget and $eventArgument are not assigned anywhere
        # in the visible lines ($eventtarget above is a different, case-sensitive
        # name), and $date_received / $addr are not used after being computed --
        # this line may belong to a different snippet; verify before relying on it.
        $doPostBackAry[] = array('__EVENTTARGET' => $eventTarget[1], '__EVENTARGUMENT' => $eventArgument[1]);
    }
}
# Walk the result pages: save every row of the current page that is not yet in
# the datastore, then post back to load the rows of the following page.
# Relies on $dataset, $NumPages, $doPostBackAry, $info_url, $dom and $da_page
# being set earlier in the script, and on the buildformdata() helper.
for ($i = 1; $i <= $NumPages; $i++) {
    echo "Scraping page {$i} of {$NumPages}\r\n";
    # The usual, look for the data set and if needed, save it
    foreach ($dataset as $record) {
        # Slow way to transform the date but it works: take the first
        # space-separated token of the second cell, split it on '/', and rejoin
        # the parts in reverse order (presumably d/m/Y -> Y-m-d; confirm)
        $date_received = explode(' ', trim($record->find("td", 1)->plaintext));
        $date_received = explode('/', trim($date_received[0]));
        $date_received = "{$date_received['2']}-{$date_received['1']}-{$date_received['0']}";
        # The first link's text is the record key and also the URL suffix, so
        # read it from the DOM once
        $council_reference = trim($record->find('a', 0)->plaintext);
        # Put all information in an array
        $application = array(
            'council_reference' => $council_reference,
            'address' => preg_replace('/\\s+/', ' ', trim($record->find("a", 1)->plaintext)) . ", Australia",
            'description' => preg_replace('/\\s+/', ' ', trim($record->find("td", 2)->plaintext)),
            'info_url' => $info_url . $council_reference,
            'comment_url' => $info_url . $council_reference,
            'date_scraped' => date('Y-m-d'),
            'date_received' => date('Y-m-d', strtotime($date_received)),
        );
        # Check if record exist, if not, INSERT, else do nothing.
        # The reference comes from scraped HTML, so it is bound as a query
        # parameter instead of being concatenated into the SQL text; a quote
        # character in the reference would otherwise break or alter the query.
        $existingRecords = scraperwiki::select("* from data where `council_reference`=?", array($council_reference));
        if (count($existingRecords) == 0) {
            print "Saving record " . $application['council_reference'] . "\n";
            # print_r ($application);
            scraperwiki::save(array('council_reference'), $application);
        } else {
            print "Skipping already saved record " . $application['council_reference'] . "\n";
        }
    }
    # If more than a single page, advance to the next page
    if ($NumPages > 1 && $i < $NumPages) {
        # Postback arguments for the next page are taken from entry $i - 1 of
        # $doPostBackAry, which is built elsewhere in the script
        $request = array('http' => array('method' => "POST", 'header' => "Content-Type: application/x-www-form-urlencoded\r\n", 'content' => http_build_query(buildformdata($dom, $doPostBackAry[$i - 1]['__EVENTTARGET'], $doPostBackAry[$i - 1]['__EVENTARGUMENT']))));
        $context = stream_context_create($request);
        $html = file_get_html($da_page, false, $context);
        $dataset = $html->find("tr[class=normalRow], tr[class=alternateRow]");
    }
}