// Parse the header of one stored division page: sitting number, date, time,
// optional name, and the rows of the first result table.
// Runs inside an enclosing loop (not visible in this excerpt), hence `continue`.
$data = array('id' => $html['id']);
$h2s = $dom->find("h2");

// Original notes:
//   do not use for Czechoslovak parliament
//   Czechoslovak parliament has IDs between 20551 and 23571 (possibly not continuous)
// NOTE(review): the check below skips pages whose first <h2> does NOT mention the
// Czechoslovak Federal Assembly, which reads as the opposite of the first note —
// confirm which direction is intended. The direction is left unchanged here.
$cs = strpos($h2s[0]->innertext, 'shromáždění České a Slovenské Federativní republiky');
// strpos() returns false when the needle is absent and 0 when it is found at the
// very start, so test against false explicitly instead of "> 0".
if ($cs === false) {
    scraperwiki::save_var('last_id', $html['id']);
    continue;
}

// sitting ("schůze") number; the dot is escaped so that it only matches a literal
// "." and cannot swallow the last digit of the number
$bigs = $dom->find("big");
preg_match('/([0-9]{1,})\. schůze/', $bigs[0]->plaintext, $matches);
$schuze = $matches[1];

// date: "D. M. YYYY" -> "YYYY-MM-DD" (n2() is defined elsewhere in the file)
preg_match('/([0-9]{1,})\. ([0-9]{1,})\. ([0-9]{1,})/', $bigs[0]->plaintext, $matches);
$date = $matches[3] . '-' . n2($matches[2]) . '-' . n2($matches[1]);

// time: "H:MM"
preg_match('/([0-9]{1,}:[0-9]{1,})/', $bigs[0]->plaintext, $matches);
$time = $matches[1];

// name (if exists): the second <big> element is optional, so do not dereference
// it when it is missing
$name = isset($bigs[1]) ? $bigs[1]->plaintext : null;

// tables
// there is an error at the source pages - missing <table> tag, so the result
// tables are looked up nested inside the first <table>
$tables = $dom->find("table");
$tables2 = $tables[0]->find("table");

// first table: drop the first row and walk the remaining rows
$array = $tables2[0]->find("tr");
array_shift($array);
foreach ($array as $row) {
    $tds = $row->find("td");
// Load the downloaded page, stop the run when psp.cz returned an error page,
// otherwise extract the division date/time and the previous/next links and
// store the page.
$dom->load($html);

// is it valid division (or empty)
// Titles that psp.cz serves instead of a division page when something is wrong.
$error_titles = array(
    'Chyba SQW',
    'Systémová chyba SQW',
    'Error response',
    '503 Service Temporarily Unavailable',
    'Hlasování nenalezeno',
);
$titles = $dom->find("title");
// An empty or title-less response has no $titles[0]; use '' instead of
// dereferencing a missing element.
$page_title = isset($titles[0]) ? (string) $titles[0]->plaintext : '';

// The emptiness check comes first so the remaining tests never run on an empty
// body. strpos() is compared with false because a marker at offset 0 is still
// a hit.
if ($html == '' or strpos($html, 'ErrNo:') !== false or in_array($page_title, $error_titles, true)) {
    //there are problems at the psp.cz (maybe around 3am CET)
    scraperwiki::save_var('last_url', $last_ok_url);
    echo "there are problems at the psp.cz (maybe around 3am CET), stopping";
    die;
} else {
    //find date
    $h1 = $dom->find('h1', 0);
    $h1 = str_replace(' ', '', $h1->innertext);
    preg_match('/([0-9]{1,2}\\.[0-9]{1,2}\\.[0-9]{1,4})/', $h1, $dates);
    preg_match('/([0-9]{2}:[0-9]{2})/', $h1, $times);
    // "D.M.YYYY" -> "YYYY-MM-DD" (n2() is defined elsewhere in the file)
    $dates_ar = explode('.', $dates[1]);
    $iso_date = $dates_ar[2] . '-' . n2($dates_ar[1]) . '-' . n2($dates_ar[0]);

    //find previous and next
    $div = $dom->find('div[class=document-nav-x]', 0);
    if (is_object($div)) {
        $prevv = $div->find('a[class=prev]', 0);
        $nextt = $div->find('a[class=next]', 0);
        // The "prev" link is checked the same way as the "next" link below;
        // when it is absent, previous_url is stored as null.
        $previous = is_object($prevv) ? $prevv->href : null;
        // At this point $next_url still holds the URL of the page being processed.
        $data = array(
            'html' => $html,
            'url' => $next_url,
            'previous_url' => $previous,
            'date' => $iso_date,
            'time' => $times[1],
        );
        if (is_object($nextt)) {
            $next_url = $nextt->href;
            $data['next_url'] = $next_url;
        } else {
            // no "next" link: clear the $continue flag
            $continue = false;
        }
        //save it
        scraperwiki::save_sqlite(array('url'), $data);
// Extract the summary of one division from the "voting_stats_summary_full"
// block: session, term, date, time, division number, name and result.
//info
$data = array('id' => $row['id']);
$divs = $dom->find("div[class=voting_stats_summary_full]");

//session
preg_match('/CisSchodze=([0-9]{1,})/', $divs[0]->innertext, $matches);
$data['session'] = $matches[1];

//term
preg_match('/CisObdobia=([0-9]{1,})/', $divs[0]->innertext, $matches);
$data['term'] = $matches[1];

//date and time
$divs2 = $divs[0]->find("div[class=grid_4]");
$spans = $divs2[1]->find("span");
// "D. M. YYYY" -> "YYYY-MM-DD"; dots are escaped so they match only a literal "."
preg_match('/([0-9]{1,2})\. ([0-9]{1,2})\. ([0-9]{4})/', $spans[0]->innertext, $matches);
$data['date'] = $matches[3] . '-' . n2($matches[2]) . '-' . n2($matches[1]);
// "H:M" -> both parts passed through n2() (defined elsewhere in the file)
preg_match('/([0-9]{1,2}):([0-9]{1,2})/', $spans[0]->innertext, $matches);
$data['time'] = n2($matches[1]) . ':' . n2($matches[2]);

//number of division in session
$spans = $divs2[2]->find("span");
$data['division_number'] = trim($spans[0]->innertext);

//name
$divs2 = $divs[0]->find("div[class=grid_12]");
$spans = $divs2[0]->find("span");
$data['name'] = trim($spans[0]->innertext);

//results
if (isset($divs2[1])) {
    $spans = $divs2[1]->find("span");
    // Compare the span's text, not the node object: a simple_html_dom node
    // converts to its outer HTML (tags included), which never equals the bare
    // label.
    $result_text = trim($spans[0]->plaintext);
    if ($result_text == 'Návrh prešiel') {
        // was "==", a comparison whose value was discarded, so 'result' was never stored
        $data['result'] = 'y';
    } else {
        if ($result_text == 'Návrh neprešiel') {
            $data['result'] = 'n';
// Excerpt that starts in the middle of a loop body and ends inside months();
// the code line below is unchanged.
//
// First part (tail of a loop whose header is not visible here):
//   - builds $date as "year-month-day" from $in_ar2: [2] is the year, [1] a month
//     name translated by months(), [0] the day; month and day go through n2()
//     (defined elsewhere in the file);
//   - saves a row marked 'active' with 'since' and 'last_change' both set to that
//     date, using org_id/since/text as the unique keys;
//   - 'text' is $last_text converted with iconv() from UTF-8 to ASCII//IGNORE;
//   - in the else branch the element's innertext is remembered in $last_text, so
//     it becomes the 'text' of a row saved on a later iteration.
//
// Second part (loop over $spans2):
//   - splits the element's plaintext on a space; the first piece is split on ':'
//     into a label and a date, and only the label 'Zapsáno' produces a row;
//   - the second piece is parsed the same way into $date_until;
//   - saves a row marked 'historical' with 'until' and 'last_change' set to
//     $date_until; other elements only update $last_text.
//   NOTE(review): the str_replace() search strings appear as plain spaces here even
//   though the text was already split on a space — they may originally have been a
//   different whitespace character (e.g. a non-breaking space); verify against the
//   original file.
//
// months($m): the visible cases map 'ledna' to '1' and 'února' to '2'; the remaining
// cases lie outside this excerpt.
$date = trim($in_ar2[2]) . '-' . n2(months($in_ar2[1])) . '-' . n2($in_ar2[0]); $data = array('org_id' => $row[0], 'org_name' => $name, 'since' => $date, 'text' => iconv("UTF-8", "ASCII//IGNORE", $last_text), 'active' => 'active', 'last_updated' => $today, 'last_change' => $date); scraperwiki::save_sqlite(array('org_id', 'since', 'text'), $data); } else { $last_text = $s->innertext; } } foreach ($spans2 as $s) { $in_ar0 = explode(' ', $s->plaintext); $in_ar = explode(':', str_replace(' ', '.', str_replace(' ', '', $in_ar0[0]))); if ($in_ar[0] == 'Zapsáno') { $in_ar2 = explode('.', $in_ar[1]); $date = trim($in_ar2[2]) . '-' . n2(months($in_ar2[1])) . '-' . n2($in_ar2[0]); $in_ar_until = explode(':', str_replace(' ', '.', str_replace(' ', '', $in_ar0[1]))); $in_ar2_until = explode('.', $in_ar_until[1]); $date_until = trim($in_ar2_until[2]) . '-' . n2(months($in_ar2_until[1])) . '-' . n2($in_ar2_until[0]); $data = array('org_id' => $row[0], 'org_name' => $name, 'since' => $date, 'text' => iconv("UTF-8", "ASCII//IGNORE", $last_text), 'active' => 'historical', 'until' => $date_until, 'last_updated' => $today, 'last_change' => $date_until); scraperwiki::save_sqlite(array('org_id', 'since', 'text'), $data); } else { $last_text = $s->innertext; } } } function months($m) { switch ($m) { case 'ledna': return '1'; case 'února': return '2'; case 'března':
/**
 * Download the congresonacional.hn calendar page for one month and print what
 * was found in it.
 *
 * Prints "{$year}{$month}", then the outer HTML of the table with class
 * "maintable", then the first link of the first nested table with width=100%,
 * and terminates the script at that point.
 *
 * @param int|string $year  year, inserted into the URL as-is
 * @param int|string $month month; passed through n2() (defined elsewhere in the
 *                          file) before being inserted into the URL
 * @return void
 */
function scrape_month($year, $month)
{
    $month2 = n2($month);
    $url = "http://www.congresonacional.hn/index.php?option=com_jcalpro&Itemid=149&extmode=flat&date={$year}-{$month2}-01";
    $html = scraperwiki::scrape($url);

    $dom = new simple_html_dom();
    $dom->load($html);
    $table0 = $dom->find('table[class=maintable]', 0);

    echo $year . $month;

    // find() with an index returns no object when nothing matches (failed
    // download, changed layout); return instead of calling a method on null.
    if (!is_object($table0)) {
        return;
    }
    echo $table0->outertext;

    $tables = $table0->find('table[width=100%]');
    foreach ($tables as $table) {
        $a = $table->find('a', 0);
        // a nested table without any link has nothing to print
        if (is_object($a)) {
            echo $a->outertext;
        }
        // NOTE(review): stops the whole script after the first nested table —
        // looks like leftover debugging; kept so behaviour is unchanged.
        die;
    }
}