} //retrieves data about voting members of assembly from https://scraperwiki.com/scrapers/cz_praha_voting_records_retrieval/ //2010-2014 require 'scraperwiki/simple_html_dom.php'; scraperwiki::attach("cz_praha_voting_records_retrieval", "src"); $rows = scraperwiki::select("distinct(mp_id) from src.mp_vote"); foreach ($rows as $row) { $url = "http://www.praha.eu/jnp/cz/home/volene_organy/zastupitelstvo_hmp/slozeni_zastupitelstva/index.html?memberId=" . $row['mp_id']; $html = scraperwiki::scrape($url); $dom = new simple_html_dom(); $dom->load($html); $part = get_first_string($html, '</h2>', '<div>'); $name = trim($dom->find('h2', 0)->plaintext); $email = get_first_string($part, 'mailto:', '"'); $party = trim(get_first_string($part, 'Strana:</span>', '<br')); $club = trim(get_first_string(get_first_string($part, 'Klub:</span>', '</a') . '::', '">', '::')); $data[] = array('id' => $row['mp_id'], 'name' => $name, 'party' => $party, 'club' => $club); } scraperwiki::save_sqlite(array('id'), $data, 'info'); /** * finds substrings between opening and closing markers * @return result array of the substrings */ function returnSubstrings($text, $openingMarker, $closingMarker) { $openingMarkerLength = strlen($openingMarker); $closingMarkerLength = strlen($closingMarker); $result = array(); $position = 0; while (($position = strpos($text, $openingMarker, $position)) !== false) { $position += $openingMarkerLength;
$html = '<html><body>' . $r['html'] . '</body></html>'; $dom = new simple_html_dom(); $dom->load($html); $info = array('id' => $r['id'], 'decision_number' => $r['decision_number'], 'date' => $r['date'], 'document_number' => $r['document_number'], 'name' => $r['name'], 'passed' => $r['passed'], 'link' => $r['link']); $part = get_first_string($html, '</h2>', '<div>'); $info['for'] = trim(get_first_string($part, 'pro:</span>', '<br')) == '' ? 0 : trim(get_first_string($part, 'pro:</span>', '<br')); $info['against'] = trim(get_first_string($part, 'proti:</span>', '<br')) == '' ? 0 : trim(get_first_string($part, 'proti:</span>', '<br')); $info['abstain'] = trim(get_first_string($part, 'zdržel se:</span>', '<br')) == '' ? 0 : trim(get_first_string($part, 'zdržel se:</span>', '<br')); $info['number_representatives'] = trim(get_first_string($part, 'Počet zastupitelů:</span>', '<br')) == '' ? 0 : trim(get_first_string($part, 'Počet zastupitelů:</span>', '<br')); $info['present'] = trim(get_first_string($part, 'přítomno:</span>', '<br')) == '' ? 0 : trim(get_first_string($part, 'přítomno:</span>', '<br')); $trs = $dom->find("table[class=data-grid]", 0)->find("tr"); array_shift($trs); $data = array(); foreach ($trs as $tr) { $tds = $tr->find("td"); $mp_id = get_first_string($tds[0]->find("a", 0)->href . "&", "memberId=", "&"); $data[] = array('division_id' => $info['id'], 'mp_id' => $mp_id, 'vote' => trim($tds[1]->plaintext), 'mp_name' => $tds[0]->plaintext); } //one division done scraperwiki::save_sqlite(array('id'), $info, 'division'); scraperwiki::save_sqlite(array('division_id', 'mp_id'), $data, 'mp_vote'); scraperwiki::save_var('last_id', $info['id']); } /** * finds substrings between opening and closing markers * @return result array of the substrings */ function returnSubstrings($text, $openingMarker, $closingMarker) { $openingMarkerLength = strlen($openingMarker); $closingMarkerLength = strlen($closingMarker);
//preg_match('/id=([0-9]{1,})/',$as[0]->href,$matches); //simple_html_dom.php preg_match('/id=([0-9]{1,})/', $as[0], $matches); $data_mp['mp_unique_id'] = $matches[1]; //$tmp = explode ('-',$as[0]->plaintext); //simple_html_dom.php $tmp = explode('-', get_first_string($as[0], '">', '<')); $tmp2 = explode('/', trim(end($tmp))); $data_mp['state'] = $tmp2[1]; $data_mp['party'] = $tmp2[0]; array_pop($tmp); $data_mp['name'] = trim(implode('-', $tmp)); } scraperwiki::save_sqlite(array('term', 'mp_id'), $data_mp, 'mp'); //votes $data = array(); //$trs0 = $dom->find('table[class=tabela-1]',0); //simple_html_dom.php $trs0 = get_first_string($html[0]['html'], '<table class="tabela-1"', '<!--Fim Código-->'); $trs = returnSubstrings($trs0, '<tr', '</tr>'); if (count($trs) > 0) { array_shift($trs); //first row is the header foreach ($trs as $tr) { //$tds = $tr->find('td'); //simple_html_dom.php; $tds = returnSubstrings($tr, '<td', '</td>'); //if ($tr->class == 'even') { //session //simple_html_dom.php if (strpos($tr, 'even') > 0) { //session //$da = explode('/',trim($tds[0]->plaintext)); //simple_html_dom.php $da = explode('/', trim(strip_tags('<td' . $tds[0]))); $date = $da[2] . '-' . $da[1] . '-' . $da[0]; //$session = trim($tds[1]->plaintext); //simple_html_dom.php $session = trim(strip_tags('<td' . $tds[1]));
if ($number > 500) { echo $url1 . "** has more than 500 divisions -> problem -> needs to solve pagination"; die; } $trs = $dom1->find("table[class=data-grid]", 0)->find("tr"); array_shift($trs); $data = array(); foreach ($trs as $tr) { $tds = $tr->find('td'); $datum = array('decision_number' => $tds[0]->plaintext, 'date' => convert_date($tds[1]->plaintext), 'document_number' => $tds[2]->plaintext, 'name' => $tds[3]->plaintext, 'passed' => $tds[4]->plaintext, 'link' => htmlspecialchars_decode($tds[4]->find('a', 0)->href)); $datum['id'] = get_first_string($datum['link'] . "&", "votingId=", "&"); $url = "http://www.praha.eu" . $datum['link']; $html = scraperwiki::scrape($url); $dom = new simple_html_dom(); $dom->load($html); $datum['html'] = '<h1>' . get_first_string($dom->innertext, '<h1>', '</table>') . '</table>'; $data[] = $datum; } //one session done: scraperwiki::save_sqlite(array('id'), $data, 'division'); scraperwiki::save_sqlite(array('id'), $s, 'session'); } } /** * converts dates formats between Central European and ISO (ISO 8601) * @return converted date * examples: * convert_date('2010-02-15','to euro') * returns '15.2.2010; * convert_date('15.2.2010') * returns '2010-02-15'
/**
 * Downloads one division (vote) detail page from camara.cl, parses it,
 * stores the division and the individual votes through the project's MySQL
 * wrapper, and returns everything that was parsed.
 *
 * Side effects: one INSERT into `Votacion`, one `insertVoto()` call per
 * deputy found on the page, and one `updatePareosAusentes()` call.
 *
 * NOTE(review): the INSERT is still assembled by string interpolation because
 * the MySQL wrapper's API is not visible from here; values come from scraped
 * HTML, so switching to a parameterised query would be the proper fix.
 *
 * @param int|string $division_id value of the `prmID` query parameter of the
 *                                division detail page
 *
 * @return array with keys:
 *               'division_id', 'original_html',
 *               'info'  => topic, article, session, step, division_type,
 *                          quorum, result, name, date, time, enSala
 *                          (text fields except 'name' have single quotes
 *                          backslash-escaped, as before),
 *               'total' => counters yes / no / abstain / dispensed / paired,
 *               'mp'    => 'mp_<id>' => mp_id, vote (y/n/a/d/p), name, sex
 *                          (only present when at least one deputy was found),
 *               'error' => only present when a sanity check failed
 */
function cl_camara_division2array($division_id)
{
    // include_once instead of include: the function may be called for many
    // divisions in one run and the file must not be evaluated twice.
    include_once "./db.inc.php";
    $db = new MySQL();

    $url = "http://www.camara.cl/trabajamos/sala_votacion_detalle.aspx?prmID="; // e.g. ...prmID=3274

    $out = array();
    $out['division_id'] = $division_id;
    $html = Grabber($url . $division_id);
    $out['original_html'] = $html;

    // Slices of the page, one per vote group; each ends where the next heading starts.
    $a_favor_sub = get_first_string($html, 'A favor</h2>', 'En contra</h2>');
    $en_contra_sub = get_first_string($html, 'En contra</h2>', 'Abstención</h2>');
    $abstencion_sub = get_first_string($html, 'Abstención</h2>', 'Dispensados Art. 5°</h2>');
    $dispensados_sub = get_first_string($html, 'Dispensados Art. 5°</h2>', 'Pareos</h2>');
    $pareos_sub = get_first_string($html, 'Pareos</h2>', '</div>');

    // Cells of the summary table; indexes 0..3 are used below as the official
    // yes / no / abstain / dispensed counts.
    $table_sub = get_first_string($html, '<table class="tabla resumenvotacion">', '</table>');
    $table_control_number = returnSubstrings($table_sub, '<td>', '</td>');

    $fecha = trim(get_first_string($html, 'Fecha:</strong>', '</p>'));

    // Text fields: single quotes are backslash-escaped because the values are
    // interpolated into the INSERT statement further down.
    $materia = str_replace("'", "\\'", trim(get_first_string($html, 'Materia:</strong>', '</p>')));
    $out['info']['topic'] = $materia;
    if ($materia == "") {
        // No "Materia" on the page: fall back to "Observaciones".
        $materia = str_replace("'", "\\'", trim(get_first_string($html, 'Observaciones:</strong>', '</p>')));
        $out['info']['topic'] = $materia;
    }
    $articulo = str_replace("'", "\\'", trim(get_first_string($html, 'Artículo:</strong>', '</p>')));
    $out['info']['article'] = $articulo;
    $sesion = str_replace("'", "\\'", trim(get_first_string($html, 'Sesión:</strong>', '</p>')));
    $out['info']['session'] = $sesion;
    $tramite = str_replace("'", "\\'", trim(get_first_string($html, 'Trámite:</strong>', '</p>')));
    $out['info']['step'] = $tramite;
    $tipo_de_votacion = str_replace("'", "\\'", strtolower(trim(get_first_string($html, 'Tipo de votación:</strong>', '</p>'))));
    $out['info']['division_type'] = $tipo_de_votacion;
    $quorum = str_replace("'", "\\'", trim(get_first_string($html, 'Quorum:</strong>', '</p>')));
    $out['info']['quorum'] = $quorum;
    $resultado = str_replace("'", "\\'", trim(get_first_string($html, 'Resultado:</strong>', '</p>')));
    $out['info']['result'] = $resultado;

    $name_sub = trim(get_first_string($html, '<div id ="detail">', '<p>'));
    $name = trim(get_first_string($name_sub, '<h2>', '</h2>'));
    $out['info']['name'] = $name;

    // The date text is split on spaces: word 0 = day, word 2 = month token
    // (trailing dot removed, translated through the global $mes map),
    // word 4 = year, word 5 = time.
    // NOTE(review): $mes is defined outside this function - presumably a
    // month-name => month-number map; confirm.
    global $mes;
    $fecha_db_ar = explode(' ', $fecha);
    $fecha_db_date = $fecha_db_ar[4] . '-' . $mes[trim($fecha_db_ar[2], '.')] . '-' . $fecha_db_ar[0];
    $fecha_db_time = $fecha_db_ar[5];
    $out['info']['date'] = $fecha_db_date;
    $out['info']['time'] = $fecha_db_time;

    $camara = 'C.Diputados';
    $en_sala = '1';
    $out['info']['enSala'] = 'true';

    // Bulletin number: only when the title really starts with "Bolet".
    // strpos() returns false when the needle is absent and false == 0, so the
    // comparison has to be strict.
    $nro_boletin = null;
    if (strpos($name, 'Bolet') === 0) {
        $nro_boletin = substr($name, 12);
    }
    if ($nro_boletin !== null && $nro_boletin !== false && $nro_boletin !== '') {
        $id_proyecto_ley = $db->getIdProyectoLey($nro_boletin);
    } else {
        $id_proyecto_ley = 0;
    }
    $id_sesion = $db->getIdSesion($sesion);

    // Values written to the database are converted from UTF-8 to ISO-8859-1.
    // The title gets the same quote escaping as the other text fields; the
    // unescaped title stays in $out['info']['name'].
    $name_sql = utf8_decode(str_replace("'", "\\'", $name));
    $tipo_de_votacion = utf8_decode($tipo_de_votacion);
    $articulo = utf8_decode($articulo);
    $materia = utf8_decode($materia);
    $quorum = utf8_decode($quorum);

    // 'i' is minutes; 'm' would repeat the month number.
    $now = date('Y-m-d H:i:s');

    $query = "INSERT INTO Votacion (name,camara,en_sala,tipo,articulo,materia,fecha,hora,voto_si,voto_no,voto_abs,voto_disp,voto_pareos,voto_aus,resultado,quorum,id_proyecto_ley,id_sesion,id_parlamento,created_at,updated_at)"
        . " VALUES ('{$name_sql}', '{$camara}', {$en_sala}, '{$tipo_de_votacion}', '{$articulo}', '{$materia}', '{$fecha_db_date}', '{$fecha_db_time}',"
        . " {$table_control_number[0]}, {$table_control_number[1]}, {$table_control_number[2]}, {$table_control_number[3]}, 0, 0,"
        . " '{$resultado}', '{$quorum}', {$id_proyecto_ley}, {$id_sesion}, {$division_id}, '" . $now . "', '" . $now . "')";
    $id_votacion = $db->insert($query);

    // Counter key => (vote code stored in the database, page slice listing the deputies).
    $groups = array(
        'yes' => array('y', $a_favor_sub),
        'no' => array('n', $en_contra_sub),
        'abstain' => array('a', $abstencion_sub),
        'dispensed' => array('d', $dispensados_sub),
        'paired' => array('p', $pareos_sub),
    );

    $out['total'] = array('yes' => 0, 'no' => 0, 'abstain' => 0, 'dispensed' => 0, 'paired' => 0);

    foreach ($groups as $total_key => $group) {
        $vote_code = $group[0];
        $group_html = $group[1];

        // Deputy ids are whatever sits between 'ID=' and '">' in the links of the slice.
        foreach (returnSubstrings($group_html, 'ID=', '">') as $mp_id) {
            $db->insertVoto($id_votacion, $mp_id, $vote_code);

            // Link text: title, dot, name (e.g. "Sra. ..."); split on dots.
            $label = str_replace("'", "\\'", trim(get_first_string($group_html, 'prmID=' . $mp_id . '">', '</a>')));
            $label_parts = explode('.', $label);
            $first_part = isset($label_parts[1]) ? $label_parts[1] : '';
            $second_part = isset($label_parts[2]) ? $label_parts[2] : '';

            $mp_key = 'mp_' . $mp_id;
            $out['mp'][$mp_key]['mp_id'] = $mp_id;
            $out['mp'][$mp_key]['vote'] = $vote_code;
            $out['mp'][$mp_key]['name'] = trim($first_part) . '.' . $second_part;
            // Title "Sra" marks a woman; every other title is recorded as 'm'.
            $out['mp'][$mp_key]['sex'] = ($label_parts[0] === 'Sra') ? 'f' : 'm';

            $out['total'][$total_key]++;
        }
    }

    // Sanity check: the number of deputies found per group must match the
    // page's own summary table. Runs once, after every group was counted.
    $sums_match = $table_control_number[0] == $out['total']['yes']
        && $table_control_number[1] == $out['total']['no']
        && $table_control_number[2] == $out['total']['abstain']
        && $table_control_number[3] == $out['total']['dispensed'];
    if (!$sums_match) {
        $out['error'] = 'wrong sums: yes:' . $table_control_number[0] . ' vs. ' . $out['total']['yes']
            . ', no:' . $table_control_number[1] . ' vs. ' . $out['total']['no']
            . ', abstain:' . $table_control_number[2] . ' vs. ' . $out['total']['abstain']
            . ', dispensed:' . $table_control_number[3] . ' vs. ' . $out['total']['dispensed'];
    }

    // Update the Votacion row with the paired and absent counts.
    // NOTE(review): 120 is presumably the number of seats in the chamber, and
    // dispensed deputies are not subtracted - confirm both.
    $ausentes = 120 - $table_control_number[0] - $table_control_number[1] - $table_control_number[2] - $out['total']['paired'];
    $db->updatePareosAusentes($id_votacion, $out['total']['paired'], $ausentes);

    // A very short page is treated as a failed download or a non-existent id.
    if (strlen($html) < 8300) {
        $out['error'] = 'small file; might have not been downloaded correctly or wrong id';
    }

    return $out;
}
} foreach ($htmls as $key => $html) { $dom->load($html); $divs = $dom->find('div[class=votacionesResultado]'); foreach ($divs as $div) { $as = $div->find('a'); if (count($as) > 1) { $link = $div->find('a', 1)->href; $td_ar = explode("<br>", $div->find('td', 0)); $si_ar = explode(':', $td_ar[0]); $si = trim($si_ar[1]); $no_ar = explode(':', $td_ar[1]); $no = trim($no_ar[1]); $abst_ar = explode(':', $td_ar[2]); $abst = trim($abst_ar[1]); $number = get_first_string($div->innertext, 'votacion=', '&'); $url = "http://www.congreso.es" . $link; $xml = str_replace("ISO-8859-1", "UTF-8", iconv("ISO-8859-1", "UTF-8", scraperwiki::scrape($url))); $data = array('date' => $right_date->format('Y-m-d'), 'number' => $number, 'yes' => $si, 'no' => $no, 'abstain' => $abst, 'link' => $link, 'xml' => $xml); scraperwiki::save_sqlite(array('date', 'number'), $data, 'division'); } } } // /foreach $htmls } $date = $right_date->modify('+1 day'); scraperwiki::save_var('last_date', $date->format('Y-m-d')); } /*print_r($data); if (isset($data)) {
$dom = new simple_html_dom(); $dom->load($html); //selects $selects = $dom->find("select"); //options from 2nd select $regs = $selects[1]->find("option"); foreach ((array) $regs as $reg) { $regions[] = $reg->value; } //foreach region foreach ((array) $regions as $key => $region) { if ($key >= $last_region) { //get number of records $url = "http://wwwinfo.mfcr.cz/cgi-bin/ufisreg/vyber1.pl?Viewico=1&zkokraj={$region}&uzemcelek=2&Viewnao=0&useZko=0&typ=1&pocet=0"; $html = iconv("cp1250", "UTF-8//TRANSLIT", scraperwiki::scrape($url)); $total = trim(get_first_string($html, 'Celkem nalezeno', 'záznamů')); //up to number of records for ($i = $last_i; $i < $total; $i = $i + 20) { //get the html $url = "http://wwwinfo.mfcr.cz/cgi-bin/ufisreg/vyber1.pl?Viewico=1&zkokraj={$region}&uzemcelek=2&Viewnao=0&useZko=0&typ=1&pocet={$i}"; $html = iconv("cp1250", "UTF-8//TRANSLIT", scraperwiki::scrape($url)); //get dom $dom = new simple_html_dom(); $dom->load($html); //extract the table with data $tables = $dom->find("table"); $out = array('region' => $region, 'i' => $i, 'html' => $tables[2]->innertext); //save it scraperwiki::save_sqlite(array('region', 'i'), $out); scraperwiki::save_var('last_i', $i); scraperwiki::save_var('last_region', $key);
$item['interpelation_oral'] = 0; } //interpelations written $url = "http://www.psp.cz/sqw/tisky.sqw?o={$term}&pi=" . $row['id']; $html = iconv("cp1250", "UTF-8//TRANSLIT", scraperwiki::scrape($url)); if (strpos($html, 'Celkem nalezen')) { $ar = explode(' ', trim(get_first_string($html, 'Celkem nalezen', 'tisk'))); $item['interpelation_written'] = $ar[count($ar) - 1]; } else { $item['interpelation_written'] = 0; } //law proposals / návrhy zákonů $url = "http://www.psp.cz/sqw/tisky.sqw?o={$term}&nz=" . $row['id']; $html = iconv("cp1250", "UTF-8//TRANSLIT", scraperwiki::scrape($url)); if (strpos($html, 'Celkem nalezen')) { $ar = explode(' ', trim(get_first_string($html, 'Celkem nalezen', 'tisk'))); $item['proposal'] = end($ar); } else { $item['proposal'] = 0; } //speeches (number of sessions) $url = "http://www.psp.cz/eknih/2010ps/rejstrik/jmenny/{$row['id']}.html"; //********** $html = iconv("cp1250", "UTF-8//TRANSLIT", scraperwiki::scrape($url)); preg_match_all('/#sx/', $html, $matches); $item['speech_session'] = count($matches[0]); /*print_r($item); if ($i > 3) die();*/ scraperwiki::save_var('last_id', $row['id']); scraperwiki::save_sqlite(array('id'), $item);