<?php
/*
 * Scrapes the sights listings on www2.salzburg.info.
 *
 * For each listing the script walks every index page (following the "next"
 * links), fetches each linked detail page, geocodes the address found on it
 * and stores title, description, coordinates and link in the 'salzburg.info'
 * data set.
 */

define('GEOCODER', 'hybrid');

require_once '../../spider.php';

$data = new Data('salzburg.info');

// entry pages of the listings to crawl
$indexes = array(
    'http://www2.salzburg.info/sehenswertes_222.htm',
    'http://www2.salzburg.info/sehenswertes_361.htm',
);

// link to a detail page on an index page
$itemLinkPattern = '/<a href="([^"]+)" class="link_darkred"><b>more/';

// link to the following index page
$nextLinkPattern = '/<a href="([^"]+)" class="link_darkred">next/';

// the address is the first text of a table row framed by two spacer-image rows
$addressPattern = '/<tr>[\\s\\n]+<td><img src="pics\\/1\\.gif" width="1" height="8"><\\/td>[\\s\\n]+<\\/tr>[\\s\\n]+<tr>[\\s\\n]+<td>([^<]+).*?<\\/td>[\\s\\n]+<\\/tr>[\\s\\n]+<tr>[\\s\\n]+<td><img src="pics\\/1\\.gif" width="1" height="8"><\\/td>[\\s\\n]+<\\/tr>/';

$titlePattern = '/<h1>([^<]+)/';

$descriptionPattern = '/<\\/td>[\\s\\n]+<td valign="top">[\\s\\n]+<table width="160" border="0" cellspacing="0" cellpadding="0">[\\s\\n]+<tr>[\\s\\n]+<td>([^<]+)/';

foreach ($indexes as $url) {
    $indexUrl = new URL($url);

    // keep going until an index page has no "next" link
    while (true) {
        $indexPage = $indexUrl->get();

        foreach ($indexPage->match($itemLinkPattern) as $match) {
            // item links are relative to the index page they appear on
            $itemUrl = new URL($match, $indexUrl);
            $itemPage = $itemUrl->get();

            // append city and country to the scraped address before geocoding
            list($lat, $lng) = $data->geocode($itemPage->match($addressPattern) . ' Salzburg Austria');

            $data->add(array(
                'title' => $itemPage->match($titlePattern),
                'description' => $itemPage->match($descriptionPattern),
                'lat' => $lat,
                'lng' => $lng,
                'href' => $itemUrl,
            ));
        }

        $nextUrl = $indexPage->match($nextLinkPattern);

        if (!(string) $nextUrl) {
            // last page of this listing
            break;
        }

        // Resolve the "next" link against the page it was found on (not the
        // listing's first URL), the same way item links are resolved above.
        $indexUrl = new URL($nextUrl, $indexUrl);
    }
}

// we're done, close the CSV file
$data->done();
<?php
/*
 * Scrapes museums, theatres and nature sights from austria.intermaps.com.
 *
 * Each index URL returns XML listing object ids; for every id the object's
 * detail XML is fetched, its address is geocoded and the record is stored in
 * the 'austria.intermaps.com' data set together with the category ('type') of
 * the index it came from.
 */

define('GEOCODER', 'yahoo');

require_once '../../spider.php';

$data = new Data('austria.intermaps.com');

// category => index URL
$indexes = array(
    'museum' => 'http://austria.intermaps.com/soew/oewtemplate/mol.xml?subid=412&level=600&startX=25000&endX=700000&startY=0&endY=700000',
    'theatre' => 'http://austria.intermaps.com/soew/oewtemplate/mol.xml?subid=213&level=600&startX=25000&endX=700000&startY=0&endY=700000',
    'nature' => 'http://austria.intermaps.com/soew/oewtemplate/mol.xml?subid=413&level=600&startX=25000&endX=700000&startY=0&endY=700000',
);

foreach ($indexes as $type => $url) {
    $indexUrl = new URL($url);
    $indexPage = $indexUrl->get();

    foreach ($indexPage->match('/<OBJECT Object_ID="([^"]+)"/') as $match) {
        $itemUrl = new URL('http://austria.intermaps.com/soew/oewtemplate/ObjectShow.xml?objid=' . $match . '&lang=en');
        $itemPage = $itemUrl->get();

        // address parts appear under either an i_ or an s_ prefixed tag
        $city = $itemPage->match('/<[is]_city>([^<]+)/');
        $postcode = $itemPage->match('/<[is]_postcode>([^<]+)/');
        $location = $itemPage->match('/<[is]_location>([^<]+)/');

        list($lat, $lng) = $data->geocode($city . ' ' . $postcode . ' ' . $location . ' Austria');

        // Normalise the website link: add a scheme only when one is missing.
        // A plain prefix comparison against 'http://' would mangle https://
        // links and turn a missing website into the bare string 'http://'.
        $href = trim((string) $itemPage->match('/<[is]_web>([^<]+)/'));

        if ($href !== '' && !preg_match('#^https?://#i', $href)) {
            $href = 'http://' . $href;
        }

        $data->add(array(
            'title' => $itemPage->match('/<o_name>([^<]+)/'),
            'description' => $itemPage->match('/<DETAIL d_art="Beschreibung" d_language="en" d_priority="0"><\\!\\[CDATA\\[(.+?)\\]\\]>/'),
            'lat' => $lat,
            'lng' => $lng,
            'href' => $href,
            'type' => $type,
        ));
    }
}

// we're done, close the CSV file
$data->done();
/*
 * Example usage: scrape the attractions on www.worldtravelguide.net and
 * create a CSV file www.worldtravelguide.net.csv containing the scraped data.
 */

// geocode with the Yahoo geocoder only
define('GEOCODER', 'yahoo');

// pull in the spider library
require_once 'spider.php';

// container for the scraped records
$data = new Data('www.worldtravelguide.net');

// patterns for the links on the index page and on each subpage
$continentLinkPattern = '/<li><b><a href="([^"]+)/';
$attractionLinkPattern = '/<b><a href="([^"]+)/';

// patterns for the fields on an attraction page
$addressPattern = '/<h3>Contact Addresses<\\/h3>[\\r\\n\\t ]+<div class="paragraph">.+?,([^<]+)/';
$titlePattern = '/<title>(.+?) Guide/';
$descriptionPattern = '/<\\/h2>[\\r\\n\\t ]+<div class="paragraph">(.+?)<\\/div>/';

// start from the index page of the attractions
$indexUrl = new URL('http://www.worldtravelguide.net/attraction/');
$indexPage = $indexUrl->get();

// visit each subpage linked from the index
foreach ($indexPage->match($continentLinkPattern) as $continentLink) {
    $continentUrl = new URL($continentLink, $indexUrl);
    $continentPage = $continentUrl->get();

    // visit each attraction page linked from the subpage
    foreach ($continentPage->match($attractionLinkPattern) as $attractionLink) {
        $itemUrl = new URL($attractionLink, $continentUrl);
        $itemPage = $itemUrl->get();

        // geocode the address found on the page
        $address = $itemPage->match($addressPattern);
        list($lat, $lng) = $data->geocode($address);

        // strip any parenthesised part from the page title
        $title = preg_replace('/\\([^)]+\\)/', '', $itemPage->match($titlePattern));

        // store the record together with the geocoded lat/lng
        $record = array(
            'title' => $title,
            'description' => $itemPage->match($descriptionPattern),
            'lat' => $lat,
            'lng' => $lng,
            'href' => $itemUrl,
        );
        $data->add($record);
    }
}

// we're done, close the CSV file
$data->done();
<?php
/*
 * Scrapes the sights on www.aboutvienna.org: reads the links to the sight
 * pages from one sight page, geocodes each sight's address and stores title,
 * description, coordinates and link in the 'aboutvienna.org' data set.
 */

define('GEOCODER', 'hybrid');

require_once '../../spider.php';

$data = new Data('aboutvienna.org');

// links to the sight pages (the table cells optionally carry a height attribute)
$sightLinkPattern = '/<td width="50%"(?: height="11")?><a href="([^"]+)/';

// fields on a sight page
$addressPattern = '/<span class="Adressen">([^<]+)/';
$titlePattern = '/<h1 class="tdolive_gross_strich"><a name="1"><\\/a>([^<]+)/';
$descriptionPattern = '/<\\!-- google_ad_section_start -->([^<]+)/';

// this sight page serves as the index of all sights
$indexUrl = new URL('http://www.aboutvienna.org/sights/ankeruhr.htm');
$indexPage = $indexUrl->get();

foreach ($indexPage->match($sightLinkPattern) as $sightLink) {
    $itemUrl = new URL($sightLink, $indexUrl);
    $itemPage = $itemUrl->get();

    // append city and country to the scraped address before geocoding
    $address = $itemPage->match($addressPattern) . ' Vienna Austria';
    list($lat, $lng) = $data->geocode($address);

    $record = array(
        'title' => $itemPage->match($titlePattern),
        'description' => $itemPage->match($descriptionPattern),
        'lat' => $lat,
        'lng' => $lng,
        'href' => $itemUrl,
    );
    $data->add($record);
}

// we're done, close the CSV file
$data->done();