// Walks the WHO disease archive index page and archives every disease linked
// from it, resuming after the disease named in $start.
//
// $start and $counter are set outside this excerpt.
// NOTE(review): $counter is presumably advanced inside fetchGARArchive() —
// confirm; nothing in this block modifies it.
$indexHtml = scraperWiki::scrape("http://www.who.int/csr/don/archive/disease/en/index.html");

$dom = new simple_html_dom();
$dom->load($indexHtml);

// An empty checkpoint means there is nothing to skip: process from the first link.
$resumePointPassed = ($start == '');
if (!$resumePointPassed) {
    print "starting after: {$start}\n\n";
}

foreach ($dom->find("ul[@class='a_z'] li a") as $link) {
    $disease = trim($link->plaintext);
    print "fetching data for disease: {$disease}";

    if (!$resumePointPassed) {
        // Still before (or exactly at) the checkpoint: skip this link. The
        // checkpointed disease itself is skipped too; processing starts with
        // the link that follows it.
        print " - skipped\n";
        $resumePointPassed = ($disease == $start);
        continue;
    }

    print "\n";

    // Stores the entries of this disease.
    fetchGARArchive($link->href, $disease);

    // Remember the last disease that was scraped so a later run can resume after it.
    scraperWiki::save_var('disease', $disease);

    // Stop the whole script once the counter reaches the 600 limit.
    if ($counter >= 600) {
        exit;
    }
}

// Every link was visited: clear the checkpoint so the next run starts from the top.
scraperWiki::save_var('disease', '');
// NOTE(review): this excerpt opens part-way through a loop body. The loop
// header and the setup of $dom2, $tosave and $data are not visible here; the
// lone closing brace near the end of this block terminates that outer loop.

    // Text of the row at index 1 of the table at index 1, minus its
    // "PRIMARY SOURCE: " label.
    $sourceRow = $dom2->find("table", 1)->find("tr", 1);
    $tosave['PrimarySource'] = str_replace('PRIMARY SOURCE: ', '', $sourceRow->plaintext);

    // Each two-cell row of the table at index 0 becomes one field of $tosave:
    // first cell -> field name, second cell -> field value.
    foreach ($dom2->find("table", 0)->find("tr") as $row) {
        $cells = $row->find("td");
        if (count($cells) != 2) {
            continue;
        }

        // NOTE(review): this strips every space from the value, although the
        // original comment described it as "remove double spaces". Once all
        // spaces are gone the value can never equal ' ', so the blank check
        // below looks unreachable. The literals may have been altered by
        // whitespace mangling — confirm against the upstream source.
        $value = str_replace(' ', '', $cells[1]->plaintext);
        $value = ($value == ' ') ? '' : $value; // format blanks correctly

        // Earlier per-character workarounds, left disabled:
        //$add=str_replace('ë','\u00CB',$add); //fix UTF error with one school
        //$add=str_replace('Ü','\u00DC',$add); //fix UTF error
        $value = utf8_encode($value);

        // Field names are stored without spaces.
        $field = str_replace(' ', '', $cells[0]->plaintext);
        $tosave[$field] = $value;
    }

    //print_r($tosave);
    //print $data->value;

    // Save the record keyed on EMISNumber; a failed save is reported and the
    // script carries on.
    try {
        scraperwiki::save(array('EMISNumber'), $tosave);
    } catch (Exception $e) {
        $report = 'Caught exception (' . $data->value . '): ' . $e->getMessage() . "\n";
        print $report;
    }

    //break; //uncomment to just process one school
}

// All iterations finished: reset the 'place' checkpoint.
scraperWiki::save_var('place', 'none');