Пример #1
0
<?php

/*
 * Scrapes the sightseeing listings of www2.salzburg.info into the
 * 'salzburg.info' data set.
 *
 * For every start index page the script follows the "more" link of each
 * listed item, geocodes the address block found on the item page (with
 * ' Salzburg Austria' appended) and records title, description, lat/lng and
 * the item URL. Further index pages are reached through the "next" link.
 *
 * Data and URL are provided by spider.php. NOTE(review): match() is used here
 * both as something iterable and as something castable to a string -- confirm
 * against spider.php.
 */
// use the hybrid geocoder
define('GEOCODER', 'hybrid');
require_once '../../spider.php';
$data = new Data('salzburg.info');
$indexes = array(
    'http://www2.salzburg.info/sehenswertes_222.htm',
    'http://www2.salzburg.info/sehenswertes_361.htm',
);
foreach ($indexes as $url) {
    $indexUrl = new URL($url);
    // $indexUrl becomes null once a page has no "next" link, which ends the loop
    while ($indexUrl) {
        $indexPage = $indexUrl->get();
        foreach ($indexPage->match('/<a href="([^"]+)" class="link_darkred"><b>more/') as $match) {
            // item links are relative to the index page they were found on
            $itemUrl = new URL($match, $indexUrl);
            $itemPage = $itemUrl->get();
            // the address is the table cell enclosed by two 8px spacer rows
            $address = $itemPage->match('/<tr>[\\s\\n]+<td><img src="pics\\/1\\.gif" width="1" height="8"><\\/td>[\\s\\n]+<\\/tr>[\\s\\n]+<tr>[\\s\\n]+<td>([^<]+).*?<\\/td>[\\s\\n]+<\\/tr>[\\s\\n]+<tr>[\\s\\n]+<td><img src="pics\\/1\\.gif" width="1" height="8"><\\/td>[\\s\\n]+<\\/tr>/');
            list($lat, $lng) = $data->geocode($address . ' Salzburg Austria');
            $data->add(array(
                'title' => $itemPage->match('/<h1>([^<]+)/'),
                'description' => $itemPage->match('/<\\/td>[\\s\\n]+<td valign="top">[\\s\\n]+<table width="160" border="0" cellspacing="0" cellpadding="0">[\\s\\n]+<tr>[\\s\\n]+<td>([^<]+)/'),
                'lat' => $lat,
                'lng' => $lng,
                'href' => $itemUrl,
            ));
        }
        $nextUrl = $indexPage->match('/<a href="([^"]+)" class="link_darkred">next/');
        // Resolve the "next" link against the page it was found on (the same
        // kind of base used for the item links above), not against the start
        // URL string of this index.
        $indexUrl = (string) $nextUrl ? new URL($nextUrl, $indexUrl) : null;
    }
}
// we're done, close the CSV file
$data->done();
<?php

/*
 * Scrapes museum, theatre and nature objects from austria.intermaps.com into
 * the 'austria.intermaps.com' data set.
 *
 * Each index URL returns a list of <OBJECT Object_ID="..."> entries; for every
 * id the object's detail document is fetched, its city/postcode/location are
 * geocoded (with ' Austria' appended) and title, English description,
 * lat/lng, website and the index's type label are recorded.
 *
 * Data and URL are provided by spider.php.
 */
// use the Yahoo geocoder
define('GEOCODER', 'yahoo');
require_once '../../spider.php';
$data = new Data('austria.intermaps.com');
// type label => index URL listing the objects of that type
$indexes = array(
    'museum' => 'http://austria.intermaps.com/soew/oewtemplate/mol.xml?subid=412&level=600&startX=25000&endX=700000&startY=0&endY=700000',
    'theatre' => 'http://austria.intermaps.com/soew/oewtemplate/mol.xml?subid=213&level=600&startX=25000&endX=700000&startY=0&endY=700000',
    'nature' => 'http://austria.intermaps.com/soew/oewtemplate/mol.xml?subid=413&level=600&startX=25000&endX=700000&startY=0&endY=700000',
);
foreach ($indexes as $type => $url) {
    $indexUrl = new URL($url);
    $indexPage = $indexUrl->get();
    foreach ($indexPage->match('/<OBJECT Object_ID="([^"]+)"/') as $match) {
        $itemUrl = new URL('http://austria.intermaps.com/soew/oewtemplate/ObjectShow.xml?objid=' . $match . '&lang=en');
        $itemPage = $itemUrl->get();
        $city = $itemPage->match('/<[is]_city>([^<]+)/');
        $postcode = $itemPage->match('/<[is]_postcode>([^<]+)/');
        $location = $itemPage->match('/<[is]_location>([^<]+)/');
        list($lat, $lng) = $data->geocode($city . ' ' . $postcode . ' ' . $location . ' Austria');
        $href = trim((string) $itemPage->match('/<[is]_web>([^<]+)/'));
        // Only add a scheme when there is a link at all and it does not carry
        // one yet: an object without a website must not end up with the bare
        // link 'http://', and https:// links must not be double-prefixed.
        if ($href !== '' && !preg_match('#^https?://#i', $href)) {
            $href = 'http://' . $href;
        }
        $data->add(array(
            'title' => $itemPage->match('/<o_name>([^<]+)/'),
            'description' => $itemPage->match('/<DETAIL d_art="Beschreibung" d_language="en" d_priority="0"><\\!\\[CDATA\\[(.+?)\\]\\]>/'),
            'lat' => $lat,
            'lng' => $lng,
            'href' => $href,
            'type' => $type,
        ));
    }
}
// we're done, close the CSV file
$data->done();
Пример #3
0
/*
 * Example usage: scrape the attractions listed on www.worldtravelguide.net
 * and write them to the CSV file www.worldtravelguide.net.csv.
 */
// select the Yahoo geocoder
define('GEOCODER', 'yahoo');
// pull in the spider library
require_once 'spider.php';
// data object that collects the scraped records
$data = new Data('www.worldtravelguide.net');
// fetch the attractions index page
$indexUrl = new URL('http://www.worldtravelguide.net/attraction/');
$indexPage = $indexUrl->get();
// the index links to one subpage per list entry
$continentLinks = $indexPage->match('/<li><b><a href="([^"]+)/');
foreach ($continentLinks as $continentLink) {
    // fetch the subpage; its link is relative to the index page
    $continentUrl = new URL($continentLink, $indexUrl);
    $continentPage = $continentUrl->get();
    // every bold link on the subpage leads to an attraction page
    $attractionLinks = $continentPage->match('/<b><a href="([^"]+)/');
    foreach ($attractionLinks as $attractionLink) {
        // fetch the attraction page
        $itemUrl = new URL($attractionLink, $continentUrl);
        $itemPage = $itemUrl->get();
        // geocode the part of the contact address that follows the first comma
        $address = $itemPage->match('/<h3>Contact Addresses<\\/h3>[\\r\\n\\t ]+<div class="paragraph">.+?,([^<]+)/');
        list($lat, $lng) = $data->geocode($address);
        // the title is the page title up to " Guide", without parenthesised parts
        $title = preg_replace('/\\([^)]+\\)/', '', $itemPage->match('/<title>(.+?) Guide/'));
        // the description is the first paragraph after the heading
        $description = $itemPage->match('/<\\/h2>[\\r\\n\\t ]+<div class="paragraph">(.+?)<\\/div>/');
        // store the record together with the geocoded coordinates
        $record = array(
            'title' => $title,
            'description' => $description,
            'lat' => $lat,
            'lng' => $lng,
            'href' => $itemUrl,
        );
        $data->add($record);
    }
}
// all pages processed: close the CSV file
$data->done();
Пример #4
0
<?php

/*
 * Scrapes the sights listed on www.aboutvienna.org into the 'aboutvienna.org'
 * data set: every sight linked from the start page is fetched, its address is
 * geocoded (with ' Vienna Austria' appended) and title, description, lat/lng
 * and the page URL are recorded.
 */
// select the hybrid geocoder
define('GEOCODER', 'hybrid');
require_once '../../spider.php';
$data = new Data('aboutvienna.org');
// the start page is a sight page that also carries the links to all sights
$indexUrl = new URL('http://www.aboutvienna.org/sights/ankeruhr.htm');
$indexPage = $indexUrl->get();
$sightLinks = $indexPage->match('/<td width="50%"(?: height="11")?><a href="([^"]+)/');
foreach ($sightLinks as $sightLink) {
    // sight links are relative to the start page
    $itemUrl = new URL($sightLink, $indexUrl);
    $itemPage = $itemUrl->get();
    $address = $itemPage->match('/<span class="Adressen">([^<]+)/') . ' Vienna Austria';
    list($lat, $lng) = $data->geocode($address);
    $title = $itemPage->match('/<h1 class="tdolive_gross_strich"><a name="1"><\\/a>([^<]+)/');
    // the description is the text right after the ad-section start marker
    $description = $itemPage->match('/<\\!-- google_ad_section_start -->([^<]+)/');
    $record = array(
        'title' => $title,
        'description' => $description,
        'lat' => $lat,
        'lng' => $lng,
        'href' => $itemUrl,
    );
    $data->add($record);
}
// all sights processed: close the CSV file
$data->done();