/**
 * Extract the officials listed on an already-parsed city page and map each
 * one to the shared "official" record.
 *
 * The officials are read from the rows of the first table nested inside the
 * second cell of the 20th <tr> of $content. In each row the first cell is
 * used as the title and the second cell as the name.
 *
 * @param object $content Parsed page node exposing find() (presumably a
 *                        simple_html_dom object - confirm against the caller).
 * @param string $source  URL recorded in the record's "sources" entry.
 * @param string $city    City name, stored as government_name and address_locality.
 *
 * @return array|bool In the 'dev' run environment, the list of official
 *                    records (empty when nothing was found); otherwise true,
 *                    after each record has been saved to the 'officials' table.
 */
function get_rep_details($content, $source, $city) {
    global $run_environment;

    // Always start from a real array so the dev branch never returns an
    // undefined variable when the page yields no rows.
    $officials = array();

    // Walk down to the nested table one step at a time: find() with an index
    // returns null when nothing matches, and chaining straight through would
    // be a fatal "member function on null" on any page with a different layout.
    $outer_row  = $content->find("tr", 19);
    $outer_cell = $outer_row ? $outer_row->find("td", 1) : null;
    $table      = $outer_cell ? $outer_cell->find("table", 0) : null;
    $rows       = $table ? $table->find("tr") : array();

    foreach ($rows as $row) {
        $title_cell = $row->find("td", 0);
        $name_cell  = $row->find("td", 1);

        // A row without both cells cannot describe an official; skip it
        // instead of saving a record with a blank title and name.
        if (!$title_cell || !$name_cell) {
            continue;
        }

        $title = trim($title_cell->plaintext);

        // Skip any empty columns (often the first one).
        // NOTE(review): the filler marker below is reproduced exactly as it
        // appears in the source; confirm whether it is meant to be a
        // non-breaking space / "&nbsp;" rather than a plain space.
        if ($title === ' ') {
            continue;
        }
        $title = str_replace(':', '', $title);

        // NOTE(review): same filler marker as above is stripped from the name.
        $name = trim($name_cell->plaintext);
        $name = str_replace(' ', '', $name);

        // Map to data model.
        $official = official();
        $official['government_name']  = $city;
        $official['government_level'] = 'municipal';
        $official['type']             = null;
        $official['title']            = $title;
        $official['name_full']        = $name;
        $official['address_locality'] = $city;
        $official['address_region']   = 'TX';
        $official['address_country']  = 'USA';
        $official['sources']          = json_encode(array(array(
            'description' => null,
            'url'         => $source,
            "timestamp"   => gmdate("Y-m-d H:i:s"),
        )));

        if ($run_environment == 'dev') {
            $officials[] = $official;
        } else {
            scraperwiki::save_sqlite(array('name_full', 'title', 'government_name'), $official, 'officials');
        }
    }

    if ($run_environment == 'dev') {
        return $officials;
    }

    return true;
}
/**
 * Parse the mayors listing page and map every mayor found to the shared
 * "official" record.
 *
 * Each table inside a centered <div> is treated as one mayor. The first
 * <strong> element is parsed as "Name<br>City, ST"; the cells of the inner
 * "pagesSectionBodyTight" table are read positionally as population, phone,
 * next election, email, website and photo path.
 *
 * @param string $html Raw HTML of the listing page.
 *
 * @return array|bool In the 'dev' run environment, the list of official
 *                    records (empty when nothing was found); otherwise true,
 *                    after each record has been saved to the 'officials' table.
 */
function get_mayors($html) {
    global $run_environment;

    // Always start from a real array so the dev branch never returns an
    // undefined variable when the page yields no mayors.
    $officials = array();

    $dom = new simple_html_dom();
    $dom->load($html);

    foreach ($dom->find("div[@align=center] table") as $data) {
        $mayor = $data->find("strong", 0);
        if (!$mayor) {
            continue;
        }

        // Split "Name<br>City, ST" into its parts.
        $mayor    = $mayor->innertext;
        $name     = substr($mayor, 0, strpos($mayor, '<br>'));
        $location = substr($mayor, strpos($mayor, '<br>') + 4);
        $city     = substr($location, 0, strpos($location, ','));
        $state    = substr($location, strpos($location, ',') + 2);

        $bio_link = $data->find("a[class=pagesSectionBodyTight]", 0);
        $bio_url  = $bio_link ? $bio_link->href : null;

        // These conditions could probably be explicitly associated with each
        // piece of data, but the variation with "not available" seemed like it
        // could apply to anything, so all conditions are tested against each
        // piece of data. Later matches override earlier ones.
        $info = array();
        foreach ($data->find("table[class=pagesSectionBodyTight] td") as $row) {
            $raw_info = $row->innertext;

            // Default: keep the whole cell.
            $start = 0;
            $end   = strlen($raw_info);

            // strpos() returns 0 for a match at the very start of the cell,
            // so every "is it present?" test compares against false rather
            // than relying on truthiness.
            $bold_open       = strpos($raw_info, '<b>');
            $bold_close      = strpos($raw_info, '</b>');
            $italic_open     = strpos($raw_info, '<i>');
            $link_open       = strpos($raw_info, '<a href=');
            $link_bold_close = strpos($raw_info, '</a></B>');

            // Value wrapped in <b>...</b>.
            if ($bold_open !== false && $bold_close !== false) {
                $start = $bold_open + 3;
                $end   = $bold_close;
            }

            // This is to catch the wild card of "not available".
            if ($italic_open !== false) {
                $start = $italic_open + 3;
                $end   = strpos($raw_info, '</i>');
            }

            // <b> opened but closed with the upper-case "</a></B>" variant.
            if ($bold_open !== false && $bold_close === false) {
                $start = $bold_open + 3;
                $end   = $link_bold_close - 2;
            }

            // Plain link: take the link text.
            if ($link_open !== false && $link_bold_close === false) {
                $start = strpos($raw_info, '">') + 2;
                $end   = strpos($raw_info, '</a>');
            }

            // Photo cell: take the image path between "<img src=" and the
            // height attribute.
            if (strpos($raw_info, 'height=270 width=216') !== false) {
                $start = strpos($raw_info, '<img src=') + 9;
                $end   = strpos($raw_info, 'height=270') - 1;
            }

            $info[] = substr($raw_info, $start, $end - $start);
        }

        // Guarantee indexes 0..5 exist so a table with fewer cells yields
        // nulls instead of undefined-offset notices.
        $info = array_pad($info, 6, null);

        $url_photo = $info[5] ? 'http://www.usmayors.org' . $info[5] : null;

        // Only values starting with a digit are treated as dates, and a value
        // strtotime() cannot parse is dropped instead of being formatted as
        // the Unix epoch.
        $next_election = null;
        if (is_string($info[2]) && is_numeric(substr($info[2], 0, 1))) {
            $election_time = strtotime($info[2]);
            if ($election_time !== false) {
                $next_election = date("Y-m-d", $election_time);
            }
        }

        if (!$name) {
            continue;
        }

        // Map to data model.
        $official = official();
        $official['government_name']  = $city;
        $official['government_level'] = 'municipal';
        $official['type']             = 'executive';
        $official['title']            = 'Mayor';
        $official['name_full']        = $name;
        $official['url']              = $info[4];
        $official['url_photo']        = $url_photo;
        $official['email']            = $info[3];
        $official['phone']            = $info[1];
        $official['address_region']   = $state;
        $official['other_data']       = json_encode(array(
            'population'    => $info[0],
            'biography_url' => $bio_url,
            'next_election' => $next_election,
        ));
        $official['sources']          = json_encode(array(array(
            'description' => null,
            'url'         => 'http://usmayors.org/meetmayors/mayorsatglance.asp',
            "timestamp"   => gmdate("Y-m-d H:i:s"),
        )));

        if ($run_environment == 'dev') {
            $officials[] = $official;
        } else {
            scraperwiki::save_sqlite(array('title', 'name_full', 'government_name'), $official, 'officials');
        }
    }

    if ($run_environment == 'dev') {
        return $officials;
    }

    return true;
}
/**
 * Scrape one representative's detail page and map it to the shared
 * "official" record.
 *
 * The values are read from the second cell of the first seven rows of the
 * first table on the page: name, city, title, address line 1, address line 2,
 * a line whose last space-separated token is used as the postcode, and phone.
 * A missing row or cell yields null for that field.
 *
 * @param string $url Detail page to scrape; also recorded as the record's source.
 *
 * @return array|bool The populated official record, or false when the page
 *                    contains no table.
 */
function get_rep_details($url) {
    $html = scraperWiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);

    $table = $dom->find("table", 0);
    if (!$table) {
        return false;
    }

    $rows = $table->find("tr");

    // Which table row holds which field (the value is always the 2nd cell).
    $row_for_field = array(
        'name_full' => 0,
        'city'      => 1,
        'title'     => 2,
        'address_1' => 3,
        'address_2' => 4,
        'zip'       => 5,
        'phone'     => 6,
    );

    $rep = array();
    foreach ($row_for_field as $field => $index) {
        // Check the row exists before calling find() on it: a table shorter
        // than expected would otherwise be a fatal call on null.
        $cell = isset($rows[$index]) ? $rows[$index]->find("td", 1) : null;
        $rep[$field] = $cell ? trim($cell->plaintext) : null;
    }

    $rep['state']  = 'CA';
    $rep['source'] = $url;

    // Filter zip from string: keep only what follows the last space, or the
    // whole value when it contains no space.
    if ($rep['zip']) {
        $last_space = strrpos($rep['zip'], ' ');
        if ($last_space !== false) {
            $rep['zip'] = trim(substr($rep['zip'], $last_space));
        }
    } else {
        $rep['zip'] = null;
    }

    // Map to data model.
    $official = official();
    $official['government_name']  = $rep['city'];
    $official['government_level'] = 'municipal';
    $official['type']             = null;
    $official['title']            = $rep['title'];
    $official['name_full']        = $rep['name_full'];
    $official['address_1']        = $rep['address_1'];
    $official['address_2']        = $rep['address_2'];
    $official['address_locality'] = $rep['city'];
    $official['address_region']   = $rep['state'];
    $official['address_postcode'] = $rep['zip'];
    $official['address_country']  = 'USA';
    $official['phone']            = $rep['phone'];
    $official['sources']          = json_encode(array(array(
        'description' => null,
        'url'         => $rep['source'],
        "timestamp"   => gmdate("Y-m-d H:i:s"),
    )));

    return $official;
}