// 4Networking member-profile scraper (fragment of a duplicated script body:
// the tail of one pasted copy, then a full second copy that ends mid-block).
// Resumes after the last processed profile number (saved var 'last') and
// stores company, phone and website per profile.
// NOTE(review): the "//*[@id='main']..." selectors are XPath-style; confirm
// the bundled simple_html_dom build actually supports them.
$company = $profile->find("//*[@id='main']/div[1]/div/div[1]/div[2]/div[1]/div[1]/span/span", 0)->plaintext;
}
$website = $profile->find("span.orange-text a", 0) ? $profile->find("span.orange-text a", 0)->href : '';
if ($profile->find("div.blue3-empty-box div.content div.word-wrap", 0)) {
    $info = $profile->find("div.blue3-empty-box div.content div.word-wrap", 0)->plaintext;
} else {
    $info = '';
}
$record = array('name' => $profile->find("//div/a/span", 1)->plaintext, 'company' => $company, 'phone' => $profile->find("strong.big-blue3-text span", 0)->plaintext, 'website' => $website);
scraperwiki::save(array('company'), $record);
//print json_encode($record) . "\n";
scraperwiki::save_var('last', $profile_no);
}
}
//scraperwiki::save_var('last', 0);
scraperwiki::attach("find_4n_profiles");
$links = scraperwiki::select("profile from find_4n_profiles.swdata");
require 'scraperwiki/simple_html_dom.php';
$profile = new simple_html_dom();
foreach ($links as $link) {
    set_time_limit(0);
    // Profile number is the numeric suffix of the member-details URL.
    $profile_no = intval(str_replace('http://www.4networking.biz/Members/Details/', '', $link['profile']));
    if ($profile_no > scraperwiki::get_var('last')) {
        $html = scraperWiki::scrape($link['profile']);
        $profile->load($html);
        // Prefer the span's title attribute for the company; fall back to its text.
        if (!($company = $profile->find("//*[@id='main']/div[1]/div/div[1]/div[2]/div[1]/div[1]/span/span", 0)->title)) {
            $company = $profile->find("//*[@id='main']/div[1]/div/div[1]/div[2]/div[1]/div[1]/span/span", 0)->plaintext;
        }
        $website = $profile->find("span.orange-text a", 0) ? $profile->find("span.orange-text a", 0)->href : '';
        if ($profile->find("div.blue3-empty-box div.content div.word-wrap", 0)) {
            $info = $profile->find("div.blue3-empty-box div.content div.word-wrap", 0)->plaintext;
// TfL bus-route aggregator (duplicated script body — the aggregation appears
// twice, with the attach/select that defines $routes between the copies).
// Collapses per-stop rows into one newline-separated "lat,lng,2357" coords
// string per route and saves them to SQLite keyed by a running id.
// Fix: the original used @-suppression to hide the undefined-index notice on
// the first coords append; initialise the accumulator explicitly instead.
$routemap = array();
foreach ($routes as $route) {
    $routemap[$route['route']]['route'] = $route['route'];
    if (!isset($routemap[$route['route']]['coords'])) {
        $routemap[$route['route']]['coords'] = '';
    }
    $routemap[$route['route']]['coords'] .= $route['latitude'] . ',' . $route['longitude'] . ',2357' . "\n";
}
$theroutes = array();
$count = 0;
foreach ($routemap as $a_route) {
    $count++;
    $theroutes[] = array('id' => $count, 'route' => $a_route['route'], 'coords' => $a_route['coords']);
}
scraperwiki::save_sqlite(array("id"), $theroutes);
//Whoops, seems that doing 600 queries in under 80 seconds isn't a smart idea. This scraper attempts to aggregate coordinates into something usable.
scraperwiki::attach("tfl_bus_routes_scraper", "src");
$routes = scraperwiki::select("route, stop_name, latitude, longitude from src.tfl_buses where run = 1 order by sequence asc");
// Second pasted copy of the same aggregation, now running on the freshly
// selected $routes.
$routemap = array();
foreach ($routes as $route) {
    $routemap[$route['route']]['route'] = $route['route'];
    if (!isset($routemap[$route['route']]['coords'])) {
        $routemap[$route['route']]['coords'] = '';
    }
    $routemap[$route['route']]['coords'] .= $route['latitude'] . ',' . $route['longitude'] . ',2357' . "\n";
}
$theroutes = array();
$count = 0;
foreach ($routemap as $a_route) {
    $count++;
    $theroutes[] = array('id' => $count, 'route' => $a_route['route'], 'coords' => $a_route['coords']);
}
scraperwiki::save_sqlite(array("id"), $theroutes);
// JSON endpoint view for the 'fys_api_1' datastore — emits {"items": ...}.
// The same three-statement script is pasted four times, so header() is called
// again after output has already been sent on the later copies (would warn).
// NOTE(review): scraperwiki::attach() is not documented to return the dataset;
// $s (and hence the JSON payload) may be null — confirm against the PHP library.
$sourcescraper = 'fys_api_1';
# scraperwiki::attach('irish-epa-licenses', 'lic');
# $licenses = scraperwiki::select("* from lic.swdata");
// $licenses = scraperwiki::getData('irish-epa-licenses');
$s = scraperwiki::attach($sourcescraper, $limit = 250);
header('Content-type: application/json');
print "{ \"items\": " . json_encode($s) . "}";
# Blank PHP
$sourcescraper = 'fys_api_1';
# scraperwiki::attach('irish-epa-licenses', 'lic');
# $licenses = scraperwiki::select("* from lic.swdata");
// $licenses = scraperwiki::getData('irish-epa-licenses');
$s = scraperwiki::attach($sourcescraper, $limit = 250);
header('Content-type: application/json');
print "{ \"items\": " . json_encode($s) . "}";
# Blank PHP
$sourcescraper = 'fys_api_1';
# scraperwiki::attach('irish-epa-licenses', 'lic');
# $licenses = scraperwiki::select("* from lic.swdata");
// $licenses = scraperwiki::getData('irish-epa-licenses');
$s = scraperwiki::attach($sourcescraper, $limit = 250);
header('Content-type: application/json');
print "{ \"items\": " . json_encode($s) . "}";
# Blank PHP
$sourcescraper = 'fys_api_1';
# scraperwiki::attach('irish-epa-licenses', 'lic');
# $licenses = scraperwiki::select("* from lic.swdata");
// $licenses = scraperwiki::getData('irish-epa-licenses');
$s = scraperwiki::attach($sourcescraper, $limit = 250);
header('Content-type: application/json');
print "{ \"items\": " . json_encode($s) . "}";
// Coast & Country cottage-detail scraper (fragment): tail of a text-cleanup
// helper, a key-lookup helper, then resume logic that skips rows already
// processed before the per-cottage loop (truncated below).
$txt = str_replace("</p>", "", $txt);
$txt = preg_replace('/\\s+/', ' ', $txt);
return $txt;
}

// Linear scan: returns the array key of the row whose COTTAGE_URL matches
// $id, or null when absent. Used to resume from the saved 'cottID' marker.
function searchForId($id, $array)
{
    foreach ($array as $key => $val) {
        if ($val['COTTAGE_URL'] === $id) {
            return $key;
        }
    }
    return null;
}

$blacklist = array();
$url = "http://www.coastandcountry.co.uk/cottage-details/";
scraperwiki::attach("coastandcountrycouk");
# get an array of the cottage data to scrape
$cottData = scraperwiki::select("COTTAGE_URL, PRICE_HIGH, PRICE_LOW from 'coastandcountrycouk'.SWDATA order by COTTAGE_URL");
$placeholder = scraperwiki::get_var("cottID");
if ($placeholder != "") {
    // Drop everything before the last-processed cottage so the run resumes.
    $index = searchForId($placeholder, $cottData);
    $cottData = array_splice($cottData, $index);
}
require 'scraperwiki/simple_html_dom.php';
$dom = new simple_html_dom();
foreach ($cottData as $value) {
    scraperwiki::save_var("cottID", $value['COTTAGE_URL']);
    // check the cottage url against the blacklist
    foreach ($blacklist as $blItem) {
        if ($value['COTTAGE_URL'] == $blItem) {
            continue 2;
// JSON/JSONP view over the last 7 days of Irish President engagements
// (jumbled script body: the output code precedes the attach/select that
// builds $alltrips).
//print "{ \"items\": ".json_encode($alltrips) ."}";
// Read the JSONP callback defensively: it is attacker-controlled, so guard
// the undefined index and restrict it to identifier characters to prevent
// reflected-script injection into the text/javascript response.
$callback = isset($_GET['callback']) ? $_GET['callback'] : '';
if (!preg_match('/^[A-Za-z0-9_.$]+$/', $callback)) {
    $callback = '';
}
if ($callback) {
    header("Content-Type: text/javascript; charset=utf8");
    echo $callback . "(" . json_encode($alltrips) . ");";
} else {
    header("Content-type: application/json");
    echo json_encode($alltrips);
}
// {label} {id} {type} {day} {date} {year} {time} {startdate} {latlng} {arasnotaras} {details} {place} {act} {issue} {constitutional} {destf} {address} {days} {destination}
?>
<?php
//$sourcescraper = 'irish_president_engagementstest';
//$s = scraperwiki::scrape($sourcescraper, $limit=250);
// = scraperwiki::attach($sourcescraper, $limit=250);
scraperwiki::attach('irish_president_engagementsjson');
$trips = scraperwiki::select("* from irish_president_engagementsjson.swdata where date > date('now','-7 day');");
$alltrips = array();
foreach ($trips as $trip) {
    // Strip parentheses from info/label: '(' removed, ')' becomes ','.
    $tripinfo = $trip["info"];
    $triplabel = $trip["label"];
    $tripinfo = str_replace('(', '', $tripinfo);
    $tripinfo = str_replace(')', ',', $tripinfo);
    $triplabel = str_replace('(', '', $triplabel);
    $triplabel = str_replace(')', ',', $triplabel);
    //print $triplabel;
    $trip["info"] = $tripinfo;
    $trip["label"] = $triplabel;
    $alltrips[] = $trip;
}
//header('Content-type: application/json');
</itunes:owner>
<?php
// Podcast RSS view over the 'exfm' datastore (jumbled script body: the <item>
// loop and channel close appear before the header/preamble below).
// NOTE(review): $item values are echoed into XML unescaped — titles containing
// '&' or '<' will yield invalid XML; loved_count doubles as the <guid>.
// .. BUILD THE ITEM LIST
foreach ($data as $item) {
    echo " <item>\n";
    echo " <title>" . $item['artist'] . " - " . $item['title'] . "</title>\n";
    echo " <enclosure url=\"" . $item['url'] . "\" type=\"audio/mpeg\" />\n";
    echo " <guid>" . $item['loved_count'] . "</guid>\n";
    echo " </item>\n";
}
?>
</channel>
</rss><?php
scraperwiki::httpresponseheader('Content-Type', 'application/atom+xml');
scraperwiki::attach("exfm");
$data = scraperwiki::select("* from exfm.swdata");
?>
<rss xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd" version="2.0">
<channel>
<title>TuMusika Evolution Podcast</title>
<link>http://www.tumusika.net/</link>
<language>es-es</language>
<itunes:owner>
<itunes:name>TuMusika Evolution</itunes:name>
<itunes:email>darkgiank@darkgiank.com</itunes:email>
</itunes:owner>
<?php
// .. BUILD THE ITEM LIST
<?php
// Extracts hotel name + URL from stored hotel-list HTML pages into the
// 'hotel_list' table, keyed on hotel name.
// NOTE(review): the whole script is pasted twice; the second plain `require`
// of simple_html_dom.php would re-declare its classes and fatal — presumably
// an artifact of how the view was saved, confirm before relying on a rerun.
require 'scraperwiki/simple_html_dom.php';
$dom = new simple_html_dom();
scraperwiki::attach("test_1_2");
$result = scraperwiki::sqliteexecute("select html from hotel_list_pages");
$hotel_list_pages_contents = $result->data;
foreach ($hotel_list_pages_contents as $contents) {
    $html = $contents[0];
    $dom->load($html);
    // One row per hotel; the first <td><h3><a> carries name and link.
    foreach ($dom->find("table.hotellist tr") as $data) {
        $tds = $data->find("td h3 a");
        $record = array('hotel' => $tds[0]->plaintext, 'url' => $tds[0]->href);
        scraperwiki::save_sqlite(array('hotel'), $record, $table_name = 'hotel_list');
    }
}
require 'scraperwiki/simple_html_dom.php';
$dom = new simple_html_dom();
scraperwiki::attach("test_1_2");
$result = scraperwiki::sqliteexecute("select html from hotel_list_pages");
$hotel_list_pages_contents = $result->data;
foreach ($hotel_list_pages_contents as $contents) {
    $html = $contents[0];
    $dom->load($html);
    foreach ($dom->find("table.hotellist tr") as $data) {
        $tds = $data->find("td h3 a");
        $record = array('hotel' => $tds[0]->plaintext, 'url' => $tds[0]->href);
        scraperwiki::save_sqlite(array('hotel'), $record, $table_name = 'hotel_list');
    }
}
// Communication-log view: lists contacts made on behalf of a person/org ($w,
// taken from the URL query) plus a per-subject summary. Jumbled script body:
// the subject-summary output precedes the attach and the definition of $w.
print '</ul>';
$data = scraperwiki::select("count(*) AS c, subject AS s FROM contact INNER JOIN contact_subject ON contact_subject.contact_id=contact.contact_id WHERE behalf='{$w}' GROUP BY subject ORDER BY subject DESC");
print "<h2>Subjects covered</h2><ul>";
foreach ($data as $row) {
    extract($row);
    ?> <li><?php echo $s; ?> (<?php echo $c; ?> )</li><?php
}
print "</ul>";
scraperwiki::attach("communication_log");
$who = $_SERVER['URLQUERY'];
// $w is interpolated straight into SQL below. The connection is read-only,
// but still double up single quotes (SQLite escaping) so quoted names can't
// break out of the string literal.
$w = str_replace("'", "''", urldecode($who));
$data = scraperwiki::select("contact.contact_id AS id, person, title, organization, uri, date_contact_h FROM contact INNER JOIN victim ON victim.contact_id=contact.contact_id WHERE behalf='{$w}' ORDER BY date_contact_c DESC");
$orgs = array();
print "<h2>Contacts on behalf of {$w}</h2><ul>";
foreach ($data as $row) {
    extract($row);
    // Collect this contact's subjects into a comma-joined string.
    $sub = scraperwiki::select("subject FROM contact_subject WHERE contact_id='{$id}' ORDER BY subject");
    $s = array();
    foreach ($sub as $sRow) {
        $s[] = $sRow['subject'];
    }
    $s = join(', ', $s);
    ?>
// Bluechip cottage scraper (fragment): end of a lookup helper, a 404-style
// content check, then resume logic before the per-cottage loop (truncated).
return $key;
}
}
return null;
}

# Check for pages with no usable data
function fourOhFour($html)
{
    // Looks for the 'Home not available' marker starting 1200 bytes in,
    // skipping the common page chrome.
    if (strpos($html, 'Home not available', 1200) !== false) {
        return true;
    }
    return false;
}

$blacklist = array();
# get an array of the cottage data to scrape
scraperwiki::attach("bluechip_summary");
$cottData = scraperwiki::select(" COTTAGE_ID, COTTAGE_URL, SLEEPS, BEDROOMS, FEATURES,COTTAGE_NAME, PRICE_LOW from 'bluechip_summary'.SWDATA order by COTTAGE_URL");
$placeholder = scraperwiki::get_var("cottURL");
if ($placeholder != "") {
    // Resume: drop rows before the last-processed cottage URL.
    $index = searchForId($placeholder, $cottData);
    $cottData = array_splice($cottData, $index);
}
require 'scraperwiki/simple_html_dom.php';
$dom = new simple_html_dom();
foreach ($cottData as $value) {
    $highPrice = "";
    $lowPrice = "";
    $found = 0;
    $count = 0;
    scraperwiki::save_var("cottURL", $value['COTTAGE_URL']);
    // check the cottage url against the blacklist
<?php
// NH General Court bill-votes view: selects vote rows newest-first (the
// MM/DD/YYYY date string is re-ordered into YYYYMMDD for sorting, then
// numeric vote number) and renders them as a Bootstrap table (truncated).
$sourcescraper = 'nh_gencourt_votes';
scraperwiki::attach('nh_gencourt_votes');
$data = scraperwiki::select("* from nh_gencourt_votes.bill_votes \n order by \n substr(date_of_vote, 7) || substr(date_of_vote, 1, 2) || substr(date_of_vote, 4, 2) desc, \n cast(vote_num as int) desc\n ");
?>
<!DOCTYPE html>
<html lang="en">
<head>
<!-- http://twitter.github.com/bootstrap/base-css.html -->
<link href="//netdna.bootstrapcdn.com/twitter-bootstrap/2.3.0/css/bootstrap-combined.min.css" rel="stylesheet">
<script src="//ajax.googleapis.com/ajax/libs/jquery/1.9.1/jquery.min.js"></script>
<script src="//netdna.bootstrapcdn.com/twitter-bootstrap/2.3.0/js/bootstrap.min.js"></script>
<style> th { white-space: nowrap; } </style>
</head>
<body>
<div class="container">
<div class="page-header"><h1>NH House Bills</h1></div>
<div class="row">
<div class="span12">
<table class="table table-striped table-bordered table-hover table-condensed">
<tr>
<th>Date of Vote</th>
<th>Vote #</th>
<th>Bill #</th>
<th>Bill Title</th>
<th>Question/Motion</th>
// Hoseasons cottage scraper (fragment): end of a lookup helper, a 404-style
// content check, then resume logic before the per-cottage loop (truncated).
return $key;
}
}
return null;
}

# Check for pages with no usable data
function fourOhFour($html)
{
    // Looks for the 'Home not available' marker starting 1200 bytes in,
    // skipping the common page chrome.
    if (strpos($html, 'Home not available', 1200) !== false) {
        return true;
    }
    return false;
}

$blacklist = array();
# get an array of the cottage data to scrape
scraperwiki::attach("hoseasons_summary");
$cottData = scraperwiki::select(" COTTAGE_ID, COTTAGE_URL, SLEEPS, BEDROOMS, PETS,COTTAGE_NAME, PRICE_LOW, PRICE_HIGH from 'hoseasons_summary'.SWDATA order by COTTAGE_URL");
$placeholder = scraperwiki::get_var("cottURL");
if ($placeholder != "") {
    // Resume: drop rows before the last-processed cottage URL.
    $index = searchForId($placeholder, $cottData);
    $cottData = array_splice($cottData, $index);
}
require 'scraperwiki/simple_html_dom.php';
$dom = new simple_html_dom();
foreach ($cottData as $value) {
    $highPrice = "";
    $lowPrice = "";
    $found = 0;
    $count = 0;
    scraperwiki::save_var("cottURL", $value['COTTAGE_URL']);
    // check the cottage url against the blacklist
// CZ public-organizations crawler (fragment): tail of a form-options parser
// (returns value/label pairs from the icoNam <select>), then setup that loads
// the saved lookup tables and resume counters for the nested
// dri/period/form/chapter/region crawl (truncated below).
//no form for given combinations
return array();
} else {
    //get dom
    $dom = new simple_html_dom();
    $dom->load($html);
    $orgs_obj = $dom->find('select[name=icoNam]', 0)->find('option');
    foreach ((array) $orgs_obj as $org) {
        $data[] = array('value' => $org->value, 'label' => trim($org->innertext));
    }
    return $data;
}
}

require 'scraperwiki/simple_html_dom.php';
//read the saved tables
scraperwiki::attach("cz_public_organizations_ufis_basics", "src");
$dris = scraperwiki::select("* from src.dri order by value");
$periods = scraperwiki::select("* from src.period order by value");
$forms = scraperwiki::select("* from src.form order by value");
$chapters = scraperwiki::select("* from src.chapter order by value");
$regions = scraperwiki::select("* from src.region order by value");
$periods = array('0' => array('value' => '12/2012')); //temp!!
//$forms = array('0' => array('value' => 50)); //temp
//scraperwiki::save_var('last_c',4); //temp
// Resume markers for each loop level of the crawl (default 0 = start over).
$d = scraperwiki::get_var('last_d', 0);
$p = scraperwiki::get_var('last_p', 0);
$f = scraperwiki::get_var('last_f', 0);
$c = scraperwiki::get_var('last_c', 0);
$r = scraperwiki::get_var('last_r', 0);
foreach ((array) $dris as $dkey => $dri) {
// CZ public-organizations detail extractor (duplicated script body): parses
// the stored HTML tables, saving each org's id, short name, chapter and the
// inner id embedded in the detail link's href.
//foreach row save info
foreach ((array) $rows as $row) {
    //inner org_id is in <a href= ... it is used for getting details from the system
    $as = $row->find("a");
    $tmp_text = $as[0]->href;
    $inner_org_id = substr($tmp_text, $len);
    //<td>
    $tds = $row->find("td");
    //save the data
    $out = array('org_id' => trim($tds[0]->plaintext), 'short_name' => trim($tds[1]->plaintext), 'inner_org_id' => $inner_org_id, 'chapter' => $data_row['chapter']);
    scraperwiki::save_sqlite(array('org_id'), $out);
}
}

require 'scraperwiki/simple_html_dom.php';
//read the data saved from downloader
scraperwiki::attach("cz_public_organizations_2_downloader", "src");
$data = scraperwiki::select("* from src.swdata");
//helper: the inner org id is everything in the href after this prefix
$len = strlen("/cgi-bin/ufisreg/detail.pl?org=");
foreach ((array) $data as $data_row) {
    //get dom from data
    $dom = new simple_html_dom();
    $dom->load($data_row['html']);
    //extract information
    $rows = $dom->find("tr");
    //first row is the header, removing it
    array_shift($rows);
    //foreach row save info
    foreach ((array) $rows as $row) {
        //inner org_id is in <a href= ... it is used for getting details from the system
        $as = $row->find("a");
// Prague assembly member-info scraper (fragment): tail of returnSubstrings(),
// a convenience wrapper, then the loop that scrapes each member's page for
// name, e-mail, party and club, saving into the 'info' table.
return $result;
}

/**
 * finds 1st substring between opening and closing markers
 * @return result 1st substring
 */
function get_first_string($text, $openingMarker, $closingMarker)
{
    $out_ar = returnSubstrings($text, $openingMarker, $closingMarker);
    $out = $out_ar[0];
    return $out;
}

//retrieves data about voting members of assembly from https://scraperwiki.com/scrapers/cz_praha_voting_records_retrieval/
//2010-2014
require 'scraperwiki/simple_html_dom.php';
scraperwiki::attach("cz_praha_voting_records_retrieval", "src");
$rows = scraperwiki::select("distinct(mp_id) from src.mp_vote");
foreach ($rows as $row) {
    $url = "http://www.praha.eu/jnp/cz/home/volene_organy/zastupitelstvo_hmp/slozeni_zastupitelstva/index.html?memberId=" . $row['mp_id'];
    $html = scraperwiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);
    // Slice out the block between the heading and the next <div>, then pick
    // e-mail / party / club out of it by marker strings.
    $part = get_first_string($html, '</h2>', '<div>');
    $name = trim($dom->find('h2', 0)->plaintext);
    $email = get_first_string($part, 'mailto:', '"');
    $party = trim(get_first_string($part, 'Strana:</span>', '<br'));
    $club = trim(get_first_string(get_first_string($part, 'Klub:</span>', '</a') . '::', '">', '::'));
    // NOTE(review): $email is extracted but not included in the saved record.
    $data[] = array('id' => $row['mp_id'], 'name' => $name, 'party' => $party, 'club' => $club);
}
scraperwiki::save_sqlite(array('id'), $data, 'info');
/**
// Attaches the named hydro datastore under an alias, dumps its tables via the
// debug helper, then runs the application updater against that alias (the
// second argument '' presumably selects all applications — confirm against
// updateHydroApplications, which is defined outside this view).
function fetchRegionHydroData($dbname, $aliasname)
{
    scraperwiki::attach($dbname, $aliasname);
    debug_tables($aliasname);
    updateHydroApplications($aliasname, '');
}
// Lichfield Cathedral events RDF view (jumbled script body: the per-item tail
// and the closing </rdf:RDF> appear before the attach/select and preamble).
// Every event is pinned to the cathedral's fixed georss:point.
?>
</ev:startdate>
<ev:enddate><?php echo $enddate; ?>
</ev:enddate>
<ev:location>Lichfield Cathedral</ev:location>
<georss:point>52.685556 -1.830556</georss:point>
</item>
<?php
}
?>
</rdf:RDF><?php
// Attach the data
scraperwiki::attach("hhhlich-lichfield-arts-events");
// Get the data
$data = scraperwiki::select("* from swdata limit 10");
//print_r($data);
echo '<?xml version="1.0" encoding="utf-8"?>';
?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:ev="http://purl.org/rss/1.0/modules/event/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:georss="http://www.georss.org/georss" xmlns:creativeCommons="http://backend.userland.com/creativeCommonsRssModule" xmlns="http://purl.org/rss/1.0/">
<rdf:Description rdf:about="http://lichfieldlive.co.uk/">
<dc:title>Lichfield What's On Importer</dc:title>
// Plain-text iCal view over the ONS release schedule (duplicated script body:
// the event loop appears before and after the query-string parsing).
// Fix: the DESCRIPTION line previously ended with a literal backslash-n
// ("\\n") before END:VEVENT, leaving the component terminator on the same
// physical line — invalid per RFC 5545. Emit a real newline instead.
// NOTE(review): DTSTAMP/DTSTART are concatenated with no ':' separator — the
// stored values presumably begin with ':' or ';PARAM=...' themselves; confirm
// against the ons_release_schedule_ical.vevents table.
$vevents = scraperwiki::select("* from ons_release_schedule_ical.vevents limit {$limit} offset {$offset}");
$icalevents = array();
foreach ($vevents as $vevent) {
    $icalevent = "BEGIN:VEVENT\nDTSTAMP" . $vevent["DTSTAMP"] . "\nDTSTART" . $vevent["DTSTART"] . "\nX-TITLE:" . $vevent["Title"] . "\nSUMMARY:" . $vevent["Summary"] . "\nDESCRIPTION:Theme: " . $vevent["Theme"] . "\n" . $vevent["Summary"] . "\nEND:VEVENT\n";
    #print_r($icalevent);
    $icalevents[] = $icalevent;
}
print "BEGIN:VCALENDAR\nMETHOD:PUBLISH\nVERSION:2.0\nX-WR-CALNAME:ONS Release Calendar\nPRODID:-//Apple Inc.//iCal 4.0.4//EN\nX-APPLE-CALENDAR-COLOR:#B027AE\nX-WR-TIMEZONE:Europe/London\nCALSCALE:GREGORIAN\n" . implode("", $icalevents) . "\nEND:VCALENDAR";
// Derive an ical string of 10 eventss
// Parse limit/offset out of the raw query string, defaulting to 10/0.
$querylist = explode("&", getenv("QUERY_STRING"));
$limit = 10;
$offset = 0;
foreach ($querylist as $queryl) {
    $ql = explode("=", $queryl);
    if ($ql[0] == "limit" && count($ql) == 2) {
        $limit = intval($ql[1]);
    }
    if ($ql[0] == "offset" && count($ql) == 2) {
        $offset = intval($ql[1]);
    }
}
scraperwiki::httpresponseheader("Content-Type", "text/plain");
scraperwiki::attach('ons_release_schedule_ical');
$vevents = scraperwiki::select("* from ons_release_schedule_ical.vevents limit {$limit} offset {$offset}");
$icalevents = array();
foreach ($vevents as $vevent) {
    $icalevent = "BEGIN:VEVENT\nDTSTAMP" . $vevent["DTSTAMP"] . "\nDTSTART" . $vevent["DTSTART"] . "\nX-TITLE:" . $vevent["Title"] . "\nSUMMARY:" . $vevent["Summary"] . "\nDESCRIPTION:Theme: " . $vevent["Theme"] . "\n" . $vevent["Summary"] . "\nEND:VEVENT\n";
    #print_r($icalevent);
    $icalevents[] = $icalevent;
}
print "BEGIN:VCALENDAR\nMETHOD:PUBLISH\nVERSION:2.0\nX-WR-CALNAME:ONS Release Calendar\nPRODID:-//Apple Inc.//iCal 4.0.4//EN\nX-APPLE-CALENDAR-COLOR:#B027AE\nX-WR-TIMEZONE:Europe/London\nCALSCALE:GREGORIAN\n" . implode("", $icalevents) . "\nEND:VCALENDAR";
<?php
// P3 podcast OPML view: one <outline> per distinct programme, each with up to
// five episode outlines (jumbled script body: the loop appears before the
// preamble and again, truncated, after it).
// NOTE(review): descriptions/urls are printed into XML attributes unescaped.
foreach ($programs as $program) {
    print '<outline text="' . $program['progId'] . '">';
    $pods = scraperwiki::select("* from sr_p3_poddar.swdata where progId=" . $program['progId'] . " limit 5");
    foreach ($pods as $pod) {
        print '<outline text="' . $pod['description'] . '" URL="' . $pod['url'] . '" type="audio" />';
    }
    print '</outline>';
}
?>
</body>
</opml>
<?php
# Blank PHP
$sourcescraper = 'sr_p3_poddar';
scraperwiki::attach("sr_p3_poddar");
$programs = scraperwiki::select("distinct progId from sr_p3_poddar.swdata");
print '<?xml version="1.0" encoding="UTF-8"?>';
?>
<opml version="1.1">
<head>
<title>Podsändningar i P3</title>
</head>
<body>
<?php
foreach ($programs as $program) {
    print '<outline text="' . $program['progId'] . '">';
    $pods = scraperwiki::select("* from sr_p3_poddar.swdata where progId=" . $program['progId'] . " limit 5");
    foreach ($pods as $pod) {
        print '<outline text="' . $pod['description'] . '" URL="' . $pod['url'] . '" type="audio" />';
    }
// Appcelerator devlink profile post-processor (fragment): tail of a per-row
// loop that cleans LinkedIn handles and de-duplicates certifications, then
// the bootstrap that resumes from the saved 'last_page' offset.
$LinkedIn = str_replace("LinkedIn:", "", $link->data);
$LinkedIn = preg_replace("/\\s+/", "", $LinkedIn);
$OBJ['linkedIn'] = $LinkedIn;
}
}
// Clean certifications
$certifications = array_unique(json_decode($row['certifications']));
$OBJ['certifications'] = json_encode($certifications);
// Geo
scraperwiki::save_sqlite(array('id', 'name', 'company', 'location', 'date', 'url', 'profile', 'twitter', 'klout', 'profile_url', 'linkedIn', 'certifications'), $OBJ);
// Persist progress after each row so an interrupted run can resume.
scraperwiki::save_var('last_page', $counter);
$counter = $counter + 1;
print_r($counter);
}

print_r("start");
scraperwiki::attach("appcelerator_devlink");
// Bootstrap variables
if (!scraperwiki::table_info($name = "swvariables")) {
    scraperwiki::save_var('last_page', 0);
}
$lastPage = scraperwiki::get_var('last_page');
if ($lastPage > 0) {
    $offset = " OFFSET " . $lastPage;
    $counter = $lastPage;
} else {
    $offset = "";
    $counter = 0;
}
print_r($offset);
$data = scraperwiki::select("* from appcelerator_devlink.swdata LIMIT 1500" . $offset);
foreach ($data as $row) {
// Lichfield Cathedral events view, clearly mid-development: dumps the first
// record and die()s, so nothing after the first iteration ever runs. The
// commented-out block sketches the intended ical/event/venue mapping. The
// whole script is pasted twice; the second copy is unreachable.
scraperwiki::attach("lichfield_cathedral_events", "cathedral");
$cathedral = scraperwiki::select("* from cathedral.swdata");
foreach (scraperwiki::select("* from cathedral.swdata") as $record) {
    var_dump($record);
    die;
}
/* $insert_ical = array(); $insert_ical['link'] = $insert['link']; $insert_ical['DTSTART'] = $strt; $insert_ical['DTEND'] = $strt+86399; $insert_ical['FREQ'] = "DAILY"; $insert_ical['BYDAY'] = ""; $insert_ical['WKST'] = "MO"; $insert_ical['COUNT'] = round(($nd-$strt)/86400); $insert_event = array(); $insert_event['name'] = ""; $insert_event['link'] = ""; $insert_venue = array(); $insert_venue['name'] = ""; $insert_venue['postcode'] = ""; $insert_venue['lat'] = ""; $insert_venue['lng'] = ""; $insert_venue['picture'] = ""; */
scraperwiki::attach("lichfield_cathedral_events", "cathedral");
$cathedral = scraperwiki::select("* from cathedral.swdata");
foreach (scraperwiki::select("* from cathedral.swdata") as $record) {
    var_dump($record);
    die;
}
// Coast & Country special-offers scraper (fragment): tail of a lookup helper,
// price-tracking state, then resume logic before the per-cottage loop.
}
return null;
}

####################################################################################################################
####################################################################################################################
####################################################################################################################
####################################################################################################################
// Per-run price/discount accumulators, reset before the cottage loop.
$originalPrice = "";
$discountedPrice = "";
$calcPercentage = "";
$discountAmount = "";
$percentage = "";
$i = 0;
$blacklist = array();
$url = "http://www.coastandcountry.co.uk/cottage-details/";
scraperwiki::attach("special_offers_coast_and_country_summary_delete");
# get an array of the cottage data to scrape
$cottData = scraperwiki::select("COTTAGE_URL from 'special_offers_coast_and_country_summary_delete'.SWDATA order by COTTAGE_URL");
$placeholder = scraperwiki::get_var("cottID");
if ($placeholder != "") {
    // Resume: drop rows before the last-processed cottage URL.
    $index = searchForId($placeholder, $cottData);
    $cottData = array_splice($cottData, $index);
}
require 'scraperwiki/simple_html_dom.php';
$dom = new simple_html_dom();
foreach ($cottData as $value) {
    scraperwiki::save_var("cottID", $value['COTTAGE_URL']);
    // check the cottage url against the blacklist
    foreach ($blacklist as $blItem) {
        if ($value['COTTAGE_URL'] == $blItem) {
            continue 2;
<style type="text/css">
body { margin: 0; padding: 0; font:0.8em/1.5em "Lucida Grande", "Lucida Sans Unicode", Helvetica, Arial, sans-serif; }
</style>
</head>
<body>
<div id="map_canvas" style="width: 100%; height: 100%;"></div>
</body>
</html>
<?php
// Google-map view of Walsall/Warwickshire food-safety inspections (jumbled
// script body: the closing HTML above precedes the PHP that builds the
// marker list, followed by the page head).
scraperwiki::attach("walsall_warwickshire_food_safety_inspections");
$data = scraperwiki::select("* from walsall_warwickshire_food_safety_inspections.swdata");
// Fix: initialise explicitly so implode() below always receives an array —
// the original left $markers undefined when no row had a longitude.
$markers = array();
// NOTE(review): the loop variable shadows the $data result array, which the
// original also did; kept as-is since $data is not reused afterwards.
foreach ($data as $data) {
    if (strlen($data['latlng_lng']) > 0) {
        $markers[] = "['<h3>" . addslashes(trim($data['name'])) . "</h3><p>" . $data['rating'] . " stars</p>'," . $data['latlng_lat'] . "," . $data['latlng_lng'] . "]";
    }
}
$markers = implode(",", $markers);
?>
<html>
<head>
<script type="text/javascript" src="http://maps.google.com/maps/api/js?sensor=false"></script>
<script src="http://ajax.googleapis.com/ajax/libs/jquery/1.5.2/jquery.min.js" type="text/javascript"></script>
<script type="text/javascript">
jQuery.noConflict();
// Forum thread scraper (fragment, resumable): pulls each saved thread link,
// fetches the page and stores every sufficiently long post body
// (base64-encoded) keyed by position + thread URL. The leading loop is the
// tail of one pasted copy; the trailing loop (truncated) is the second.
$tr = $html->find("div.postmessage div.t_msgfont");
$j = 0;
foreach ($tr as $trr) {
    $noidung = $trr->find('div', 0)->innertext;
    //$noidung = utf8_encode($noidung);
    // Only keep substantial posts; @ hides save errors (e.g. duplicate keys).
    if (mb_strlen($noidung) > 1000) {
        $j++;
        @scraperwiki::save_sqlite(array('id'), array('id' => $j . '-' . $src[0]['url'], 'title' => $src[0]['title'], 'url' => $src[0]['url'], 'content' => base64_encode($noidung), 'order' => $j, 'num' => $src[0]['num'], 'reply' => $src[0]['reply']));
    }
}
$html->clear();
unset($html);
scraperwiki::save_var('last_id', $i);
}

require 'scraperwiki/simple_html_dom.php';
scraperwiki::attach("s-in-s", "src");
//scraperwiki::save_var('last_id', 1);
//exit();
$id = scraperwiki::get_var('last_id');
for ($i = $id; $i < 1900; $i++) {
    // LIMIT i,1: process one source row per iteration, resuming at last_id.
    $src = scraperwiki::select("* from src.swdata limit {$i},1");
    $url = $src[0]['link'];
    $url = 'http://sexinsex.net/bbs/' . $url;
    $html_content = scraperwiki::scrape($url);
    $html = str_get_html($html_content);
    $data = array();
    $tr = $html->find("div.postmessage div.t_msgfont");
    $j = 0;
    foreach ($tr as $trr) {
        $noidung = $trr->find('div', 0)->innertext;
        //$noidung = utf8_encode($noidung);
// eParlimen constituency social-link scraper (fragment): tail of the per-URL
// loop that classifies found links, then the main loop that re-scrapes every
// constituency page and extracts its code (truncated below).
if (stristr($url, "twitter")) {
    $results["twitter_url"] = $url;
} else {
    $results["website_url"] = $url;
}
}
// There are max 3 urls we are interested in
// ($results also holds 'id', so 4 entries means all three links were found)
if (sizeof($results) === 4) {
    break;
}
}
scraperwiki::save_sqlite(array("id"), $results, "eparlimen_social_links");
}

require 'scraperwiki/simple_html_dom.php';
// Rebuild the links table from scratch on every run.
scraperwiki::sqliteexecute("DELETE FROM eparlimen_social_links");
scraperwiki::attach("eparlimen-constituencies", "urls");
$urls = scraperwiki::select("* FROM urls.eparlimen_constituencies_links");
foreach ($urls as $url) {
    $url = str_replace(",%20", "", $url["url"]); // A hack for a known bad link. This should be in the link scraper, but it's xmas and I have better things to do :)
    $html = scraperwiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);
    $node = $dom->find("ul.wrap_senarai li", 2);
    if (is_object($node)) {
        $code = $node->children(1)->plaintext;
    } else {
        echo "Unable to parse {$url}\n";
        continue;
    }
    $results = array("id" => $code);
// German Landkreise summary view (jumbled script body: the area/density
// tables are printed before the queries that define their data).
}
echo '</table>';
echo '<h2>Landkreise nach Fläche (km<sup>2</sup>)</h2>';
echo '<table>';
foreach ($all_by_area as $item) {
    echo '<tr><td>' . $item['name'] . '</td><td>' . $item['area'] . '</td></tr>' . "\n";
}
echo '</table>';
echo '<h2>Landkreise nach Bevölkerungsdichte (Einwohner/km<sup>2</sup>)</h2>';
echo '<table>';
foreach ($all_by_density as $item) {
    echo '<tr><td>' . $item['name'] . '</td><td>' . $item['inhab_density'] . '</td></tr>' . "\n";
}
echo '</table>';
$sourcescraper = 'german-landkreise';
scraperwiki::attach($sourcescraper);
// Four pre-aggregated orderings of the Landkreis table.
$grouped_by_state = scraperwiki::select("state, COUNT(*) AS num FROM swdata GROUP BY state");
$all_by_population = scraperwiki::select("name, inhabitants FROM swdata ORDER BY inhabitants DESC");
$all_by_area = scraperwiki::select("name, CAST(area AS NUMERIC) AS area FROM swdata ORDER BY area DESC");
$all_by_density = scraperwiki::select("name, inhab_density FROM swdata ORDER BY inhab_density DESC");
echo '<h2>Landkreise nach Bundesland</h2>';
echo '<table>';
foreach ($grouped_by_state as $item) {
    echo '<tr><td>' . $item['state'] . '</td><td>' . $item['num'] . '</td></tr>' . "\n";
}
echo '</table>';
echo '<h2>Landkreise nach Einwohnerzahl</h2>';
echo '<table>';
foreach ($all_by_population as $item) {
    echo '<tr><td>' . $item['name'] . '</td><td>' . $item['inhabitants'] . '</td></tr>' . "\n";
}
// CZ Senate voting-record parser (fragment, resumable via 'last_id'): reads
// stored vote-page HTML past the last processed id and extracts the session
// ("schůze") and vote ("hlasování") numbers from the <h1> heading.
// (The block comment closed just below was opened outside this view.)
} */
scraperwiki::save_var('last_id', $html['id']);
}
}

require 'scraperwiki/simple_html_dom.php';
//corrections:
//scraperwiki::save_var('last_id',55626); //55150
/*scraperwiki::sqliteexecute("delete from info where id>55652"); scraperwiki::sqlitecommit(); die();*/
//get last id
//scraperwiki::save_var('last_id',0);
$last_id = scraperwiki::get_var('last_id', 0);
echo $last_id;
//read the saved tables
scraperwiki::attach("cz_senate_voting_records_downloader_2", "src");
$rows = scraperwiki::select("id from src.swdata where id>{$last_id} order by id");
if (!empty($rows)) {
    foreach ($rows as $html) {
        //get dom
        $dom = new simple_html_dom();
        $html2 = scraperwiki::select("* from src.swdata where id={$html['id']}");
        $dom->load(str_replace(" ", " ", $html2[0]['html']));
        //common part
        $div = $dom->find("div[class=wysiwyg]", 0);
        //info
        $h1 = $div->find('h1', 0);
        preg_match('/([0-9]{1,}). schůze/', $h1->innertext, $matches);
        $schuze = $matches[1];
        preg_match('/([0-9]{1,}). hlasování/', $h1->innertext, $matches);
        $hlasovani = $matches[1];
// Tail of getPagerank() plus a cURL helper and the main loop that attaches
// each datastore in $scraper (defined outside this view), looks up the
// PageRank for every stored URL and saves it into the 'prank' table.
return $pagerank;
}
}

/**
 * Fetches a URL with cURL and returns the response body as a string
 * (false on failure, per curl_exec with CURLOPT_RETURNTRANSFER).
 */
function file_get_contents_curl($url)
{
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); //Set curl to return the data instead of printing it to the browser.
    // Bug fix: the original passed "http://" + CURLOPT_URL as the option id.
    // PHP's + on a non-numeric string is arithmetic (0 + CURLOPT_URL), so a
    // bogus option was set and the URL never reached the handle.
    curl_setopt($ch, CURLOPT_URL, $url);
    $data = curl_exec($ch);
    curl_close($ch);
    return $data;
}

foreach ($scraper as $scr) {
    scraperwiki::attach($scr);
    $qry = "* from " . $scr . ".swdata";
    //echo $qry;
    $arr = scraperwiki::select($qry);
    // print_r($arr);
    foreach ($arr as $d) {
        // print $d["key"];
        // print $d["site"];
        $pr = (int) getPagerank($d["url"]);
        if (1) {
            $d_key = $d["key"];
            $d_site = $d["site"];
            //print_r($d["url"]." PR is ". (string)$pr ." site is ".$d_site); // ." key is " . $d_key);
            $record = array('url' => utf8_encode($d["url"]), 'pr' => utf8_encode($pr), 'ar' => utf8_encode($d["rank"]), 'id' => utf8_encode($d_key), 'desc' => $d["site"]);
            #print_r($record);
            scraperwiki::save_sqlite(array("id"), $record, "prank");
<?php
// HobbyKing battery detail scraper: resumes from the saved 'currentId',
// fetches up to $maxPerRun product pages per run and parses the product-data
// span (truncated below — the parse/save logic continues past this view).
require 'scraperwiki.php';
require 'scraperwiki/simple_html_dom.php';
$startProductId = scraperwiki::get_var("currentId", -1);
if ($startProductId == -1) {
    print "No previous saved position found. Starting from scratch.";
} else {
    print "Resuming from product id {$startProductId}\n";
}
scraperwiki::attach("hobbyking_batteryidlist");
$batteries = scraperwiki::select("id from hobbyking_batteryidlist.data where id > {$startProductId} order by id asc");
$remainingCount = count($batteries);
print "Found {$remainingCount} batteries left to be scraped.";
$maxPerRun = 100;
$loopCount = 0;
foreach ($batteries as $bat) {
    // Cap the work per invocation so the run stays within platform time limits.
    if ($loopCount > $maxPerRun) {
        print "Ending run after {$maxPerRun} iterations.";
        break;
    }
    $productId = $bat['id'];
    print "Retrieving " . $productId . "\n";
    $html = scraperWiki::scrape("http://www.hobbyking.com/hobbyking/store/uh_viewItem.asp?idProduct={$productId}");
    //print $html . "\n";
    $dom = new simple_html_dom();
    $dom->load($html);
    // Get the product data (located in a span tag). Should only be one product data area!
    $productDataAreasDom = $dom->find("SPAN[id=prodDataArea]");
    $productDataDom = $productDataAreasDom[0];
    //print $productData . "\n";
// Synth-listing search scraper (fragment): tail of a helper, a month-name
// lookup, then run configuration pulled from the 'synthfilter_utils' vars.
}

//Returns the month as a number
// NOTE(review): zero-based (jan => 0 ... dec => 11) and returns an implicit
// null for unrecognised strings — callers must tolerate both.
function getMonthNum($monthString)
{
    $months = array('jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec');
    for ($i = 0; $i < count($months); $i++) {
        if ($months[$i] == strtolower($monthString)) {
            return $i;
        }
    }
}

// Reference list of manufacturers the search can target:
/* "Ampron","Atlantex","ARP","Access Music","Akai","Alesis","Analogue Systems","Applied Acoustics","Aries","Arturia","BOSS","BitHeadz","Bomb Factory Studios","Buchla","Casio","Chamberlin","Cheetah","Chimera Synthesis","Clavia","Con Brio","Creamware","Crumar","Dave Smith Instruments","Doepfer","E-mu Systems","EDP","EML","EMS","Electrix Pro","Electro Harmonix","Elektron","Elka","Encore Electronics","Ensoniq","FBT Electronica","Fairlight","Farfisa","Formanta","Future Retro","GForce Software","Generalmusic","Gleeman","Hammond","Hartmann","Hohner","IK Multimedia","Image Line","Jen Electronics","JoMoX","Kawai","Kenton Electronics","KeyFax Hardware","Koblo","Korg","Kurzweil","Linn Electronics","Logan Electronics","MAM","MOTU","MacBeth Studio Systems","Marion Systems","Metasonix","??Miscellaneous??","Moog Music","Mutronics","Native Instruments","New England Digital","Novation","OSC","Oberheim","Octave","PAiA","PPG","Propellerheads","Prosoniq","Quasimidi","Red Sound Systems","Rhodes","Roland" */
require 'scraperwiki/simple_html_dom.php';
scraperwiki::attach('synthfilter_utils');
$state = "Washington"; //Must be Titlecase e.g, Alabama, Michigan. or use an array separated by commas Alabama, Washington
$states = explode(",", $state);
print_r($states);
$jsonManufacturerScraper = scraperwiki::get_var('manufacturer_scraper');
//Get a unique list of synth manufacturers
$jsonManufacturerData = file_get_contents($jsonManufacturerScraper);
$manufacturerQuery = 'Digisound'; //You can choose to search only a specific manufacturer or if not, just leave it blank
$ignoreWords = explode(',', scraperwiki::get_var('iw_digisound'));
echo "Total ignored words: " . count($ignoreWords) . 
"\n";
// Search-depth knobs for the crawl (defined outside this view); 0 = no limit.
$cityDepth = 0; //Set to 0 to search all cities found in the database
$synthDepth = 0; //Set to 0 to search all synths found in the database
</item>
'; }
print '</channel> </rss> <!-- ScraperWiki insists on inserting this block, but its not valid XML, so we put it into a comment Thanks http://scraperwiki.com/views/galway-city-planning-feed <div id="scraperwikipane"/> -->';
?>
<?php
// Manchester pothole RSS view (jumbled script body: the per-item tail and
// feed close above precede the attach/select and channel preamble below).
# Blank PHP
#$sourcescraper = 'fill_that_hole';
scraperwiki::httpresponseheader('Content-Type', 'application/atom+xml');
date_default_timezone_set("Europe/London");
scraperwiki::attach("fill_that_hole_manchester");
//really this Manchester!
$data = scraperwiki::select("* from fill_that_hole_manchester.swdata order by dateadded desc limit 10");
//print_r($data);
print '<?xml version="1.0" encoding="utf-8"?> <rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom"> <channel> <atom:link href="http://scraperwikiviews.com/run/manchester_pot_holes_rss/" rel="self" type="application/rss+xml" /> <title>Potholes reported on Fill That Hole - Manchester</title> <link>http://www.fillthathole.org.uk/node/114/hazards</link> <description>RSS feed of latest pothole reports via Fill That Hole in Manchester</description> <language>en-gb</language>';
foreach ($data as $item) {
    print ' <item> <title>' . $item["road"] . '</title>