# Test scraper for PHP language.
# Should contain all our documented PHP functions.
# A fail in this scraper indicates a code failure somewhere.
#
# FIX(review): removed a stray duplicate of the "date/latlng array + metadata
# functions" section that was pasted at the very top of the file. That copy ran
# before $date and $latlng were assigned (undefined-variable notices) and
# executed the metadata get/save a second time for no reason. The canonical
# copy below, after $latlng is defined, is kept unchanged.
require 'scraperwiki/simple_html_dom.php';

# Scrape function.
# TODO: Clarify, can we send POST parameters? Does not fail.
$arr = array("foo" => "bar");
$html = scraperwiki::scrape("http://scraperwiki.com/hello_world.html", $arr);
print $html;

# Geo function.
$latlng = scraperwiki::gb_postcode_to_latlng("E1 5AW");
print $latlng[0];

# Save function including date and latlng.
$arr = array('name' => 'Fluffles', 'breed' => 'Alsatian');
scraperwiki::save(array('name'), $arr);
$date = time();
$arr = array("name", "breed", $date, $latlng);

# Metadata functions.
$latest_message = scraperwiki::get_metadata('keyname', $default = 'No message yet');
print $latest_message;
$latest_message = 'Scraper input';
scraperwiki::save_metadata('latest_message', $latest_message);
$arr = array("breed", "name");
// Scrapes IMDb episode-list pages for the shows keyed in $series_string
// (declared earlier in the file, outside this chunk).
//Lost
//$series_string[""] = ""; // Dummy
/* Specify a show to process. If none is specified, will process all shows. */
//$show = "Cali";
/* actual code begins here */
if (isset($show)) {
    // A single show was selected above.
    $scrape_url = "http://www.imdb.com/title/" . $series_string[$show] . "/episodes";
    processShow($scrape_url);
} else {
    // No show selected: process every entry in the show map.
    foreach ($series_string as $show => $temp) {
        $scrape_url = "http://www.imdb.com/title/" . $series_string[$show] . "/episodes";
        processShow($scrape_url);
    }
}
// Column list stored as dataset metadata.
// NOTE(review): 'airdate' appears twice in this list — looks like an accidental
// duplicate (an array literal can only carry the key once); confirm and drop one.
$keys = array('series_title', 'nr', 'season', 'episode', 'airdate', 'link', 'airdate', 'episode_title', 'description');
scraperwiki::save_metadata('data_columns', $keys);
/* function declarations */
/* this function will do all of the scrapping, string-matching and saving */
// Fetches one episode-list page, extracts the series title from the page
// heading, then matches every episode table row with one large regex
// (captures: year, season, episode number, /title/ link, episode title,
// air date, description) and builds one $data row per episode.
// NOTE(review): this definition continues beyond the visible chunk — the
// body below is truncated (save call and closing braces are out of view).
function processShow($url)
{
    $html = scraperwiki::scrape($url);
    // Series title from the heading: Episode list for "<title>".
    $regexp_show = '|<h1><small>Episode list for<br></small><a [^>]*>"([^&]*)"</a>|';
    preg_match($regexp_show, $html, $arr);
    $series_title = $arr[1];
    // Whole-row episode pattern; capture groups:
    // 1=year, 2=season, 3=episode, 4=/title/ link, 5=title, 6=air date, 7=description.
    $regexp = "|<div class=\"filter-all filter-year-([0-9]{4,4})\"><hr /><table cellspacing=\"0\" cellpadding=\"0\"><tr> <td valign=\"top\"><div class=\"episode_slate_container\"><div class=[^>]*></div></div></td> <td valign=\"top\"><h3>Season ([[:digit:]]*), Episode ([[:digit:]]*): <a href=\"(/title/[[:alnum:]]*/)\">([^<]*)</a></h3><span class=\"less-emphasis\">Original Air Date—<strong>([^<]*)</strong></span><br>([^<]*)[^\n]*</td></tr></table></div>|";
    preg_match_all($regexp, $html, $arr, PREG_SET_ORDER);
    // NOTE(review): same duplicated 'airdate' key as the top-level list.
    $keys = array('series_title', 'nr', 'season', 'episode', 'airdate', 'link', 'airdate', 'episode_title', 'description');
    $i = 0;
    foreach ($arr as $val) {
        $i++; // running episode counter across all seasons
        $data = array('series_title' => clean($series_title), 'nr' => $i, 'season' => clean($val[2]), 'episode' => clean($val[3]), 'airdate' => clean(date('d.m.Y', strtotime($val[6]))), 'link' => clean('http://www.imdb.com' . $val[4]), 'episode_title' => clean($val[5]), 'description' => clean($val[7]));
// Advances the persisted "datefrom" cursor by one calendar day and scrapes
// that day via mainDate(). The loop bound of 1 means exactly one day is
// processed per invocation; progress survives restarts because the cursor
// is round-tripped through scraper metadata.
function begin()
{
    $cursor = scraperwiki::get_metadata("datefrom", "1980-01-01");
    $finish = scraperwiki::get_metadata("dateto", "1980-01-01");
    $dayFrom = $cursor;
    $dayTo = $finish;
    // Intentionally iterates a single time (upper bound is 1).
    for ($step = 1; $step <= 1; $step++) {
        $base = strtotime($dayFrom);
        // Midnight of the following calendar day; mktime normalises day+1
        // across month/year boundaries. NOTE(review): date("y", ...) yields a
        // two-digit year here — mktime maps 0-69 to 2000-2069; confirm this
        // is intended for the date range being scraped.
        $nextDay = mktime(0, 0, 0, date("m", $base), date("d", $base) + 1, date("y", $base));
        $dayFrom = date('Y-m-d', $nextDay);
        mainDate($dayFrom);
        // Checkpoint both ends of the window after the day is processed.
        scraperwiki::save_metadata("datefrom", date('Y-m-d', strtotime($dayFrom)));
        scraperwiki::save_metadata("dateto", date('Y-m-d', strtotime($dayTo)));
    }
}
// Tail of scrapeTEDDataPage() — the function opens before this chunk; only
// its final clean-up, save and return are visible here.
    }
    unset($tabletrs);
    // Explicitly tear down simple_html_dom to work around its documented
    // memory leak, then drop every large local.
    $dom->__destruct();
    unset($dom);
    unset($html);
    scraperwiki::save(array('ID', 'sector'), $record);
    sleep(2); // be polite to the remote server between page fetches
    return $record;
}

// Driver: attach the 'tedscrapper' dataset, resume from the last processed
// timestamp, and re-scrape every notice at or after it.
$sourcescraper = 'tedscrapper';
$lasttime = scraperwiki::get_metadata('lasttime', -1);
scraperwiki::attach($sourcescraper);
print "Querying for data\n";
// NOTE(review): $lasttime is interpolated straight into the SQL. It only ever
// holds this scraper's own metadata here, but a parameterised query would be
// safer if the source ever changes.
$data = scraperwiki::sqliteexecute("select distinct time, sector, url from tedscrapper.swdata where time >= '" . $lasttime . "' order by time");
print count($data->data) . " items to process\n";
$count = 0;
foreach ($data->data as $ind => $item) {
    $time = $item[0];
    $sector = $item[1];
    $url = $item[2];
    $record = scrapeTEDDataPage($url, $sector);
    // Progress line: index, title field, current memory usage in MB.
    print $count++ . " " . $record['TI'] . " " . memory_get_usage() / 1024 / 1024 . "MB\n";
    // Checkpoint after every item so an interrupted run resumes here.
    scraperwiki::save_metadata('lasttime', $time);
    // Aggressively free per-item memory — this is a long-running scraper.
    unset($item);
    unset($record);
    unset($time);
    unset($sector);
    unset($url);
    unset($data->data[$ind]);
    unset($ind);
}
// Tail of a scrape routine that begins before this chunk: control arrives
// here with $code (page HTML), $name/$num/$trading and a preg_match_all
// result in $legal already in scope.
    $legal = trim($legal[1][0]);
} else {
    $legal = '';
}
// Extract the "Primary contact address" block from the page HTML.
preg_match_all('|<div class="assoc">Primary contact address</div>(.*?)<div|', $code, $primary);
if (isset($primary[1][0])) {
    $primary = trim($primary[1][0]);
} else {
    $primary = '';
}
// Turn both raw HTML fragments into structured field arrays.
$primary = parseAddress($primary);
$legal = parseAddress($legal);
if (trim($name) != '') {
    // 'num' is the unique key; every other column is a cleaned text field.
    scraperwiki::save(array('num'), array('num' => "" . clean($num), 'name' => clean($name), 'trading' => clean($trading), 'legal_address' => clean($legal['address']), 'legal_phone' => clean($legal['phone']), 'legal_fax' => clean($legal['fax']), 'legal_email' => clean($legal['email']), 'legal_web' => clean($legal['web']), 'primary_address' => clean($primary['address']), 'primary_phone' => clean($primary['phone']), 'primary_fax' => clean($primary['fax']), 'primary_email' => clean($primary['email']), 'primary_web' => clean($primary['web']), 'primary_courses' => clean($primary['courses'])));
}
// Persist the page counter so interrupted runs can resume.
scraperwiki::save_metadata('counter', $counter);
}
}

// Parses an address HTML fragment into an array of fields; each field falls
// back to '' when its pattern does not match.
// NOTE(review): this definition continues beyond the visible chunk — the
// body below is truncated (later fields and the return are out of view).
function parseAddress($val)
{
    preg_match_all('|<strong>Telephone: </strong>(.*?)<br />|', $val, $phone);
    if (isset($phone[1][0])) {
        $dat['phone'] = trim($phone[1][0]);
    } else {
        $dat['phone'] = '';
    }
    preg_match_all('|<strong>E-mail: </strong><a href="mailto:(.*?)">.*?</a><br />|', $val, $email);
    if (isset($email[1][0])) {
        $dat['email'] = trim($email[1][0]);
    } else {
        $dat['email'] = '';
// Tail of clubURL() — the function opens before this chunk with $dom
// (simple_html_dom of a club page) and $clubName already in scope.
// Collapse runs of whitespace in the club name to single spaces.
$formatClubName = trim(preg_replace('/\\s+/', ' ', $clubName));
// NOTE(review): $_GLOBAL is not a PHP superglobal ($GLOBALS is). If this line
// is inside the function body, it writes a function-local array, and the
// top-level implode() at the bottom of this chunk reads a different (likely
// undefined) variable. Confirm against the full function and fix.
$_GLOBAL['clubs'][] = $formatClubName;
echo 'running ' . $formatClubName . "\n";
// Table index 2 holds the season-by-season attendance rows.
foreach ($dom->find('table', 2)->find('tr') as $row) {
    // Only rows whose first cell is a year are data rows (skips headers).
    if (is_numeric($row->find('td', 0)->plaintext)) {
        $year = trim($row->find('td', 0)->plaintext);
        $position = trim(str_replace(' ', '', $row->find('td', 1)->plaintext));
        if (trim($position) == 'Champion') {
            $position = 1; // normalise the textual label to rank 1
        }
        $leagueLevel = trim($row->find('td', 2)->plaintext);
        $overallPosition = trim($row->find('td', 3)->plaintext);
        // Attendance figures use '.' as a thousands separator — strip it.
        $avgAttendance = trim(str_replace('.', '', $row->find('td', 4)->plaintext));
        $totalAttendance = trim(str_replace('.', '', $row->find('td', 12)->plaintext));
        $dataset = array('club' => $formatClubName, 'year' => $year, 'finishedPosition' => $position, 'league' => $leagueLevel, 'overallPosition' => $overallPosition, 'avgAttendance' => $avgAttendance, 'totalAttendance' => $totalAttendance);
        // (club, year) is the unique key for upserts.
        scraperwiki::save(array('club', 'year'), $dataset);
    }
}
/*
 * The next two lines stop a memory leak in Simple XML as per http://simplehtmldom.sourceforge.net/manual_faq.htm#memory_leak
 */
$dom->clear();
unset($dom);
}

// Crawl the front page ($frontDom, built before this chunk) for per-club
// attendance links and process each one.
foreach ($frontDom->find('a') as $link) {
    if (strpos($link->href, 'attnclub') !== FALSE) {
        clubURL('http://www.european-football-statistics.co.uk/' . $link->href);
    }
}
scraperwiki::save_metadata('Clubs', implode(',', $_GLOBAL['clubs']));