/**
 * Scrape a TED (Tenders Electronic Daily) RSS feed and save one record per item.
 *
 * @param string $url    RSS feed URL to fetch.
 * @param string $sector Sector label stored alongside each notice URL.
 * @return void
 */
function scrapeTEDRSS($url, $sector)
{
    print $url . " " . $sector . "\n";
    // $xml = scraperWiki::scrape($url);
    $curl = curl_init($url);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($curl, CURLOPT_MAXREDIRS, 10);
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($curl, CURLOPT_TIMEOUT, 20); // 20 seconds before aborting
    // try CURLOPT_CONNECTTIMEOUT (in seconds)
    // try CURLOPT_LOW_SPEED_LIMIT (to define what slow is, with):
    // curl_setopt($curl, CURLOPT_LOW_SPEED_TIME, 10); (10 seconds at low speed before aborting)
    $xml = curl_exec($curl);
    print curl_error($curl) . "\n";
    // FIX: release the curl handle — it was leaked here, while the sibling
    // scrapeTEDDataPage() closes its handle after use.
    curl_close($curl);
    $dom = new simple_html_dom();
    $dom->load($xml);
    $items = $dom->find("item");
    foreach ($items as $item) {
        $guid = $item->find("guid");
        // The GUID links to the TEXT view; the structured fields live on the DATA view.
        $noticeURL = str_replace("TEXT", "DATA", $guid[0]->plaintext);
        print $noticeURL . " " . $sector . " " . memory_get_usage() / 1000000 . "MB";
        echo "\n";
        // $record = scrapeTEDDataPage ($noticeURL, $sector);
        $record = array('time' => microtime(true), 'sector' => $sector, 'url' => $noticeURL);
        scraperwiki::save(array('sector', 'url'), $record);
        sleep(1); // throttle requests to the server
    }
    // simple_html_dom leaks memory without an explicit destruct.
    $dom->__destruct();
    unset($items);
    unset($dom);
    unset($xml);
    print memory_get_usage() / 1024 / 1024 . "MB\n";
}
/**
 * Scrape one results page of the DirectGov job search and store every vacancy row.
 *
 * @param int $page Page number appended to the search URL.
 * @return void
 */
function scrape_job_page($page)
{
    $page_html = scraperWiki::scrape("https://jobsearch.direct.gov.uk/JobSearch/PowerSearch.aspx?tm=0&pg=" . $page);
    $dom = new simple_html_dom();
    $dom->load($page_html);
    foreach ($dom->find("table tr") as $row) {
        $cells = $row->find("td");
        // Vacancy rows have exactly five cells; skip headers and layout rows.
        if (count($cells) != 5) {
            continue;
        }
        $named_anchor = $cells[0]->find('a[name]', 0);
        $job_id = intval($named_anchor->name);
        $details_url = $cells[2]->find('a', 0)->href;
        print $details_url;
        $record = array(
            'id' => $job_id,
            'posted_date' => date_create($cells[0]->plaintext),
            'job_title' => trim($cells[2]->plaintext),
            'company' => trim($cells[3]->plaintext),
            'location' => trim($cells[4]->plaintext),
            'url' => $details_url,
        );
        //print json_encode($record) . "\n";
        scraperwiki::save(array('id'), $record);
    }
    $dom->__destruct();
}
// NOTE(review): the statement and closing braces on the next line terminate a
// loop/conditional structure that begins outside this chunk — left untouched.
scraperwiki::save_sqlite(array("stock"), $record, "NSE_Stocks"); } } } $dom->__destruct(); }
//scrapping html
require 'scraperwiki/simple_html_dom.php';
// Walk the Kotak Securities P/E listing pages for every letter A-Z,
// up to 11 pages per letter, storing one row per stock.
foreach (range('A', 'Z') as $char) {
    $dom = new simple_html_dom();
    for ($pageNum = 0; $pageNum <= 10; $pageNum++) {
        $html = scraperWiki::scrape("http://www.kotaksecurities.com/stock-market-news/equity/1024/pe-ratio-NSE-All-" . $char . "/" . $pageNum);
        if ($html == NULL) {
            continue; // page failed to load; try the next page number
        }
        $dom->load($html);
        //print ("CHAR:".$char);
        foreach ($dom->find('table[class="TableBG1"]') as $table) {
            foreach ($table->find('tr[class="tabbody"]') as $tr) {
                // Columns in order: stock name, closing price, EPS, P/E ratio.
                $stock = $tr->children(0)->plaintext;
                $close = $tr->children(1)->plaintext;
                $eps = $tr->children(2)->plaintext;
                $pe = $tr->children(3)->plaintext;
                $record = array('stock' => $stock, 'close' => $close, 'eps' => $eps, 'pe' => $pe);
                //print_r($record);
                // 'stock' is the unique key for upserts into the NSE_Stocks table.
                scraperwiki::save_sqlite(array("stock"), $record, "NSE_Stocks");
            }
        }
    }
    // Free simple_html_dom memory before the next letter.
    $dom->__destruct();
}
/**
 * Scrape a single WikiCFP call-for-papers page and store one record.
 *
 * @param string $cat  Category the CFP was listed under.
 * @param string $name Event name, matched against the page's v:summary spans.
 * @param string $link Relative link to the CFP page on wikicfp.com.
 * @return bool False when the CFP was already known, true after saving.
 */
function extractCFP($cat, $name, $link)
{
    print " " . $name . " -- " . $link . "\n";
    if (alreadyKnown($cat, $name, $link)) {
        return false;
    }
    $html = scraperWiki::scrape("http://www.wikicfp.com/" . str_replace(" ", "%20", $link));
    $dom = new simple_html_dom();
    $dom->load($html);
    $spans = $dom->find("span");
    $type = "";
    $title = "";
    $link = "";
    $id = "";
    $description = "";
    $locality = "";
    $summaries = array();
    $startdates = array();
    $enddates = array();
    $sdate = "";
    $edate = "";
    $deadline = "";
    $notification = "";
    $finalversion = "";
    // WikiCFP marks its metadata up as RDFa-style spans with property/content attributes.
    foreach ($spans as $span) {
        // print_r($span);
        if (isset($span->attr['property'])) {
            // print(" ".$span->attr['property']."=".$span->attr['content']."\n");
            if (strcmp($span->attr['property'], "v:eventType") === 0) {
                $type = $span->attr['content'];
                print " type = " . $type . "\n";
            }
            if (strcmp($span->attr['property'], "dc:title") === 0) {
                $title = $span->attr['content'];
                print " title = " . $title . "\n";
            }
            if (strcmp($span->attr['property'], "dc:source") === 0) {
                $link = $span->attr['content'];
                print " link = " . $link . "\n";
            }
            if (strcmp($span->attr['property'], "dc:identifier") === 0) {
                $id = $span->attr['content'];
                print " id = " . $id . "\n";
            }
            if (strcmp($span->attr['property'], "dc:description") === 0) {
                $description = $span->attr['content'];
                print " description = " . $description . "\n";
            }
            if (strcmp($span->attr['property'], "v:locality") === 0) {
                $locality = $span->attr['content'];
                print " locality = " . $locality . "\n";
            }
            if (strcmp($span->attr['property'], "v:summary") === 0) {
                $summaries[] = $span->attr['content'];
            }
            if (strcmp($span->attr['property'], "v:startDate") === 0) {
                $startdates[] = $span->attr['content'];
            }
            if (strcmp($span->attr['property'], "v:endDate") === 0) {
                $enddates[] = $span->attr['content'];
            }
        }
    }
    // FIX: destruct the DOM once, after the loop. It was previously called on
    // every iteration, tearing the document down while still iterating its spans.
    $dom->__destruct();
    // Each v:summary span lines up by index with a v:startDate/v:endDate pair.
    foreach ($summaries as $ind => $summary) {
        if (strcmp($summary, $name) === 0) {
            $sdate = $startdates[$ind];
            $edate = $enddates[$ind];
            print " between " . $sdate . " and " . $edate . "\n";
        }
        if (strcmp($summary, "Submission Deadline") === 0) {
            $deadline = $startdates[$ind];
            print " deadline = " . $deadline . "\n";
        }
        if (strcmp($summary, "Notification Due") === 0) {
            $notification = $startdates[$ind];
            print " notification = " . $notification . "\n";
        }
        if (strcmp($summary, "Final Version Due") === 0) {
            $finalversion = $startdates[$ind];
            print " finalversion = " . $finalversion . "\n";
        }
    }
    $record = array('id' => $id, 'category' => $cat, 'type' => $type, 'title' => $title,
        'link' => $link, 'location' => $locality, 'description' => $description,
        'startdate' => $sdate, 'enddate' => $edate, 'deadline' => $deadline,
        'notification' => $notification, 'finalversion' => $finalversion);
    // FIX: the unique-key name must match the record key — the record stores
    // 'id' (lowercase), but the save call asked for 'ID'.
    scraperwiki::save(array('id', 'category'), $record);
    sleep(5); // be polite to wikicfp.com
    return true;
}
/**
 * Scrape the Florida League of Cities index page and return the city list.
 *
 * @param string $url Index page URL.
 * @return array List of arrays with 'source' (absolute URL) and 'name' keys.
 */
function get_city_list($url)
{
    global $run_environment;
    global $max_records;
    $html = scraperWiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);
    // table/tr/td/div/table/tr/td[2]/table/tr/td/table/tr[5]
    $content = $dom->find("div[id=ctl00_cphmain_pnlIndex]", 0)->find("table", 1);
    // FIX: initialise the result so a page with no links returns an empty
    // array instead of an undefined variable.
    $cities = array();
    $count = 0;
    foreach ($content->find("a") as $link) {
        if ($link->href) {
            $city['source'] = 'http://www.floridaleagueofcities.com' . $link->href;
            $city['name'] = $link->plaintext;
            $cities[] = $city;
            $count++;
        }
    }
    // Clear memory (simple_html_dom leaks without an explicit destruct)
    $dom->__destruct();
    $content->__destruct();
    return $cities;
}
/**
 * Scrape one city detail page: returns the city array in dev mode, or saves
 * it to the 'jurisdiction' table and returns true otherwise.
 *
 * @param string $url City detail page URL.
 * @return array|bool
 */
function get_city_data($url)
{
    global $run_environment;
    $html = scraperWiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);
    $count = 1; // for debugging
    if (!$dom->find("table", 0)) {
        echo $url;
        exit;
    }
    // /html/body/table/tbody/tr/td/div/section/div/table
    // $content = $dom->find("table", 0)->find("tr", 0)->find("td", 0)->find("div", 0)->find("table", 0)->find("tr", 0)->find("td", 2)->find("table", 0);
    $content = $dom->find("table", 0)->find("tr", 0)->find("td", 0)->find("div", 0)->find("section", 0)->find("div", 0)->find("table", 0);
    $city['source'] = $url;
    $city['name_full'] = $content->find("h2", 0)->plaintext;
    // "City of Foo" / "Town of Foo": split on " of " to get name and type.
    $of_pos = strpos($city['name_full'], ' of ');
    $city['name'] = substr($city['name_full'], $of_pos + 4);
    $city['type'] = strtolower(substr($city['name_full'], 0, $of_pos));
    $website_anchor = $content->find("tr", 5)->find("td", 1)->find("a", 0);
    $city['url'] = $website_anchor ? $website_anchor->href : null;
    // Rows 6-17 each hold one labelled value in the second cell, fixed order.
    $row_fields = array(
        6 => 'region',
        7 => 'county',
        8 => 'address1',
        9 => 'address2',
        10 => 'phone',
        11 => 'fax',
        12 => 'council_meeting_time',
        13 => 'year_incorporated',
        14 => 'fiscal_year_start',
        15 => 'population',
        16 => 'government_type',
        17 => 'civil_service',
    );
    foreach ($row_fields as $row_index => $field) {
        $city[$field] = trim($content->find("tr", $row_index)->find("td", 1)->plaintext);
    }
    $rep_details = get_rep_details($content, $url, $city['name']);
    // Clear memory
    $dom->__destruct();
    $content->__destruct();
    if ($run_environment == 'dev') {
        $city['reps'] = $rep_details;
        return $city;
    } else {
        scraperwiki::save_sqlite(array('name_full', 'source'), $city, $table_name = 'jurisdiction');
        return true;
    }
}
// Part categories of interest on pcpartpicker; only "monitor" is scraped below.
$url[] = "cpu";
$url[] = "motherboard";
$url[] = "memory";
$url[] = "internal-hard-drive";
$url[] = "video-card";
$url[] = "power-supply";
$url[] = "case";
$url[] = "monitor";
$html = scraperWiki::scrape("http://pcpartpicker.com/parts/monitor/");
$dom = new simple_html_dom();
$dom->load($html);
unset($html);
foreach ($dom->find("id=\"list_table\" tr") as $row) {
    $cells = $row->find("td");
    $cell_links = $row->find("td a");
    // Rows without data cells are headers/separators.
    if (empty($cells[0])) {
        continue;
    }
    // Follow the part's detail link and scrape its specification table.
    $detail_html = scraperWiki::scrape("http://pcpartpicker.com" . $cell_links[0]->href);
    $detail_dom = new simple_html_dom();
    $detail_dom->load($detail_html);
    $spec_tables = $detail_dom->find("table class=\"box-table-a\"");
    $spec_record["href"] = $cell_links[0]->href;
    // Each spec row is a label/value pair; the label becomes the column name.
    foreach ($spec_tables[0]->find("tr") as $spec_row) {
        $spec_cells = $spec_row->find("td");
        $spec_record[$spec_cells[0]->plaintext] = $spec_cells[1]->plaintext;
    }
    scraperwiki::save(array('href'), $spec_record);
    //print json_encode($spec_record) . "\n";
    $detail_dom->__destruct();
}
$dom->__destruct();
/**
 * Scrape every NYC community-board table on one borough page.
 *
 * Parses each table[class=cb_table]: board title/number, neighborhoods,
 * precinct info, and the loosely structured contact block (address, email,
 * website, chair, ...). In dev mode the parsed boards are returned; otherwise
 * each one is saved to the 'community_board' table.
 *
 * @param string $name Borough name.
 * @param string $url  Borough page URL.
 * @return array|bool
 */
function get_cb_data($name, $url)
{
    global $run_environment;
    global $max_records;
    $html = scraperWiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);
    $count = 1;
    // FIX: initialise so dev mode returns an empty array when no boards are found.
    $cbs = array();
    foreach ($dom->find("table[class=cb_table]") as $board) {
        $cb = null;
        $cb['source'] = $url;
        $cb['borough'] = $name;
        $cb['community_board'] = trim($board->find("td[class=cb_title]", 0)->plaintext);
        $cb['community_board_number'] = trim(substr($cb['community_board'], strlen('Community Board ')));
        $cb['city_id'] = get_city_id($cb['borough'], $cb['community_board_number']);
        $cb['neighborhoods'] = trim($board->find("tr", 1)->find("td", 2)->plaintext);
        $cb['precincts'] = trim($board->find("tr", 4)->find("td", 1)->plaintext);
        $cb['precinct_phones'] = trim($board->find("tr", 5)->find("td", 1)->plaintext);
        // Try to parse the unstructured contact info text
        $cb_info = trim($board->find("tr", 3)->find("td", 1)->innertext);
        $cb_info = str_replace('<strong>', '<b>', $cb_info);
        $cb_info = str_replace('</strong>', '</b>', $cb_info);
        $contacts = explode("<b>", $cb_info);
        foreach ($contacts as $val) {
            $val = str_replace("<br />", ",", $val);
            $val = trim($val);
            $val = explode("</b>", $val);
            // FIX: create_function() is deprecated since PHP 7.2 and removed
            // in PHP 8 — trim each part in place with a closure instead.
            array_walk($val, function (&$part) {
                $part = trim($part);
            });
            if (!empty($val[1])) {
                $heading = trim($val[0], ",");
                $heading = trim($heading, ":");
                $heading = strtolower(str_replace(' ', '_', $heading));
                // Clean up stray html tags
                // NOTE(review): stripos() truthiness skips a match at offset 0 — confirm intended.
                if (stripos($val[1], '<span>')) {
                    $val[1] = get_between($val[1], '<span>', '</span>');
                }
                // NOTE(review): trim()'s second argument is a character list, so
                // this strips any of '<', '/', 'p', '>' from both ends — not the
                // literal "</p>" tag. Preserved as-is.
                $val[1] = trim($val[1], '</p>');
                $val[1] = trim($val[1], ',');
                $val[1] = trim($val[1], ',');
                $cb[$heading] = $val[1];
            }
        }
        // Parse the comma-joined address text into title/street/city/state/zip.
        if (!empty($cb['address'])) {
            $cb['address'] = trim($cb['address']);
            $cb['address'] = trim($cb['address'], ',');
            $cb['address'] = str_replace(",,", ",", $cb['address']);
            $cb['address'] = trim($cb['address']);
            $lines = explode(',', $cb['address']);
            $line_num = count($lines) - 1;
            if ($line_num >= 4) {
                $cb['address_title'] = $lines[$line_num - 4];
            } else {
                $cb['address_title'] = $cb['borough'] . ' ' . $cb['community_board'];
            }
            if ($cb['address_title'] == $lines[$line_num - 3]) {
                $cb['address_1'] = $lines[$line_num - 2];
                $cb['address_2'] = null;
            } else {
                $cb['address_1'] = $lines[$line_num - 3];
                $cb['address_2'] = $lines[$line_num - 2];
            }
            $zip = trim($lines[$line_num], ', NY ');
            $cb['address_zip'] = $zip;
            $cb['address_city'] = $lines[$line_num - 1];
            $cb['address_state'] = 'NY';
        }
        // check if we have data in the email field that needs to be parsed like the website url
        if (!empty($cb['email'])) {
            $snippet = new simple_html_dom();
            $snippet->load($cb['email']);
            if ($snippet->find('a', 0)) {
                // Isolate the email address from the other html
                if (stripos($cb['email'], '<a') > 0) {
                    $cb['email'] = trim(substr($cb['email'], 0, stripos($cb['email'], '<a')));
                    if (count($emails = explode(',', $cb['email'])) > 1) {
                        $cb['all_email'] = $cb['email'];
                        $cb['email'] = trim($emails[0]);
                        $cb['email'] = trim($cb['email'], ' ');
                    }
                } else {
                    $cb['email'] = null;
                    $cb['website'] = null;
                }
                $cb['website'] = $snippet->find('a', 0)->href;
                // External URLs have a proxy URL on nyc.gov, let's parse that off
                if (stripos($cb['website'], 'exit.pl')) {
                    $cb['website'] = substr($cb['website'], stripos($cb['website'], 'exit.pl?') + 12);
                }
            } else {
                $cb['website'] = null;
            }
        } else {
            $cb['email'] = null;
        }
        // Make this field universal, even if we don't have any data
        if (empty($cb['all_email'])) {
            $cb['all_email'] = null;
        }
        // verify we didn't mix up website and email
        if (!empty($cb['website']) && stripos($cb['website'], 'mailto') !== FALSE) {
            $cb['email'] = substr($cb['website'], stripos($cb['website'], 'mailto:') + 7);
            $cb['website'] = null;
        }
        // Be sure to clear any stray commas
        if (!empty($cb['email'])) {
            $cb['email'] = trim($cb['email'], ',');
        }
        // normalize field names
        if (!empty($cb['chairperson'])) {
            $cb['chair'] = $cb['chairperson'];
            unset($cb['chairperson']);
        }
        if ($run_environment == 'dev') {
            $cbs[] = $cb;
        } else {
            scraperwiki::save_sqlite(array('source', 'borough', 'community_board_number'), $cb, $table_name = 'community_board');
        }
        $count++;
        //if ($run_environment == 'dev' && $count > $max_records) break;
        // Clear memory
        $board->__destruct();
    }
    // Clear memory
    $dom->__destruct();
    if ($run_environment == 'dev') {
        return $cbs;
    } else {
        return true;
    }
}
// NOTE(review): this chunk continues a loop over restaurant detail pages that
// opens outside the visible source; the leading assignment and the trailing
// closing braces belong to that structure — left untouched.
$email_rest = substr($datos_rest[1]->innertext, $posicion_corte); } else { $email_rest = ""; }
// The embedded map iframe src carries "ll=<lat>,<lon>&..." — cut out the pair.
$iframe = $dom_rest->find("div.TabbedPanelsContent iframe");
$posicion_corte = strpos($iframe[0]->src, "ll=") + 3;
$coords_rest = substr($iframe[0]->src, $posicion_corte);
$posicion_corte = strpos($coords_rest, "&");
$coords_rest = substr($coords_rest, 0, $posicion_corte);
$coords = explode(",", $coords_rest);
$lat_rest = $coords[0];
$lon_rest = $coords[1];
// External website link, when present.
$web_rest = $dom_rest->find("span.url a.external");
if ($web_rest) { $web_rest = $web_rest[0]->href; } else { $web_rest = ""; }
// Restaurant type is the first paragraph of the second column.
$type_rest = $dom_rest->find("div.col_02 p");
$type_rest = $type_rest[0]->innertext;
// Description follows a "Descripci..." heading; 54 is a hard-coded offset past
// that heading's markup — NOTE(review): fragile, verify against the live page.
$desc_rest = $dom_rest->find("div.col_02");
$posicion_corte = strpos($desc_rest[1]->innertext, "Descripci") + 54;
$desc_rest = strip_tags(substr($desc_rest[1]->innertext, $posicion_corte));
$desc_rest = $type_rest . " - " . $desc_rest;
// Collect one record per restaurant; saved in one batch after the loops.
$restaurante = array("nombre" => utf8_encode($nombre_rest), "direccion" => utf8_encode($direccion_rest), "telefono" => $telefono_rest, "descripcion" => utf8_encode($desc_rest), "lattitude" => $lat_rest, "longitude" => $lon_rest);
$restaurantes[] = $restaurante;
$dom_rest->__destruct(); } $dom->__destruct(); }
// 'nombre' is the unique key for the batch save.
scraperwiki::save(array('nombre'), $restaurantes);
/**
 * Convert Embedded CSS to Inline
 *
 * Extracts the first <style> block from the document, parses its rules, and
 * appends each declaration to the matching elements' inline style attribute.
 *
 * @param string $document    HTML document containing an embedded <style> block.
 * @param bool   $strip_class When true, strip class/id attributes from the output.
 * @return string|bool Inlined HTML; the unmodified document when no <style>
 *                     block exists; false when no styles could be parsed.
 */
function convert($document, $strip_class = false)
{
    // Debug mode will output selectors and styles that are detected in the embedded CSS
    $debug = false;
    // Extract the CSS
    preg_match('/<style[^>]+>(?<css>[^<]+)<\\/style>/s', $document, $matches);
    // If no CSS style
    if (empty($matches)) {
        return $document;
    }
    // Strip out extra newlines and tabs from CSS
    $css = preg_replace("/[\n\r\t]+/s", "", $matches['css']);
    // Extract each CSS declaration: index 1 is the selector list, index 2 the rule body
    preg_match_all('/([-a-zA-Z0-9_ ,#\\.]+){([^}]+)}/s', $css, $rules, PREG_SET_ORDER);
    // FIX: initialise so a style block with no parsable rules reaches the
    // final return instead of reading an undefined variable.
    $styles = array();
    foreach ($rules as $rule) {
        // A selector list may hold several comma-separated selectors; normalise
        // and process each one (a single selector is just a one-element list).
        $selector_list = explode(',', str_replace(', ', ',', $rule['1']));
        foreach ($selector_list as $selector) {
            $selector = trim($selector);
            if (!isset($styles[$selector])) {
                $styles[$selector] = '';
            }
            // FIX: when one selector appears in several rules, terminate the
            // previously collected declarations with ';' before appending, so
            // "color:red" + "font-size:1em" don't fuse into one declaration.
            if ($styles[$selector] !== '' && substr($styles[$selector], -1) !== ';') {
                $styles[$selector] .= ';';
            }
            $styles[$selector] .= trim($rule['2']);
            if ($debug) {
                echo $selector . ' { ' . trim($rule['2']) . ' }<br/>';
            }
        }
    }
    // DEBUG: Show selector and declaration
    if ($debug) {
        echo '<pre>';
        foreach ($styles as $selector => $styling) {
            echo $selector . ':<br>';
            echo $styling . '<br/><br/>';
        }
        echo '</pre><hr/>';
    }
    // For each style declaration, find the selector in the HTML and add the inline CSS
    if (!empty($styles)) {
        // Load Simple HTML DOM helper
        require_once 'simple_html_dom.php';
        $html_dom = new simple_html_dom();
        // Load in the HTML without the head and style definitions
        $html_dom->load(preg_replace('/\\<head\\>(.+?)\\<\\/head>/s', '', $document));
        foreach ($styles as $selector => $styling) {
            foreach ($html_dom->find($selector) as $element) {
                // Check to make sure the style doesn't already exist
                if (!stristr($element->style, $styling)) {
                    if (strlen($element->style) > 0 && substr(rtrim($element->style), -1) !== ';') {
                        $element->style .= ';';
                    }
                    // If there is any existing style, this will append to it
                    $element->style .= $styling;
                }
            }
        }
        $inline_css_message = $html_dom->save();
        // Strip class attribute
        if ($strip_class === true) {
            // FIX: character-class typo "[a-z0-0]" only matched 'a'-'z' and '0';
            // "[a-z0-9]" matches tags whose name contains any digit.
            $inline_css_message = preg_replace('~(<[a-z0-9][^>]*)(\\s(?:class|id)\\s*=\\s*(([\'"]).*?\\4|[^\\s]*))~usi', '\\1', $inline_css_message);
        }
        $html_dom->__destruct();
        return $inline_css_message;
    }
    return false;
}
/**
 * Scrape an index page and collect title/url pairs from its paragraph links.
 *
 * @param string $url Index page URL.
 * @return array List of records with 'title' and 'url' keys.
 */
function _scrapeIndexPage($url)
{
    $html = scraperWiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);
    $records = array();
    foreach ($dom->find("div[@id='paddingLR12'] p") as $paragraph) {
        $anchors = $paragraph->find("a");
        // hrefs are site-relative with a leading "/"; drop it before
        // prefixing the base URL constant.
        $records[] = array(
            'title' => $anchors[0]->plaintext,
            'url' => URL_BASE . substr($anchors[0]->href, 1),
        );
    }
    $dom->__destruct();
    return $records;
}
/**
 * Fetch one TED notice "DATA" page and store its field table as a record.
 *
 * @param string $url    Notice DATA page URL; the bare TED portal URL is skipped.
 * @param string $sector Sector label saved with the record.
 * @return array The saved record (empty when the URL was the portal home page).
 */
function scrapeTEDDataPage($url, $sector)
{
    $record = array();
    // The feed sometimes yields the bare portal URL; nothing to scrape there.
    if (strcmp($url, "http://ted.europa.eu/") === 0) {
        return $record;
    }
    $time = microtime(true);
    // $html = scraperWiki::scrape($url);
    $curl = curl_init($url);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($curl, CURLOPT_MAXREDIRS, 10);
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($curl, CURLOPT_TIMEOUT, 10); // 10 seconds before aborting
    // try CURLOPT_CONNECTTIMEOUT (in seconds)
    // try CURLOPT_LOW_SPEED_LIMIT (to define what slow is, with):
    // curl_setopt($curl, CURLOPT_LOW_SPEED_TIME, 10); (10 seconds at low speed before aborting)
    $html = curl_exec($curl);
    curl_close($curl);
    $dom = new simple_html_dom();
    $dom->load($html);
    print "......done in " . (microtime(true) - $time) . "s " . memory_get_usage() / 1000000 . "MB\n";
    $record = array('ID' => $url, 'sector' => $sector);
    // Each data-table row pairs a th label with its value in the second td.
    foreach ($dom->find("table[class=data] tr") as $row) {
        $labels = $row->find("th");
        $values = $row->find("td");
        $record[$labels[0]->plaintext] = $values[1]->plaintext;
        unset($labels);
        unset($values);
    }
    // Free simple_html_dom memory explicitly.
    $dom->__destruct();
    unset($dom);
    unset($html);
    scraperwiki::save(array('ID', 'sector'), $record);
    sleep(2); // throttle requests
    return $record;
}
// NOTE(review): this chunk continues a row loop that opens outside the visible
// source; the trailing closing braces belong to that structure — left untouched.
// Listing columns: registration number (set earlier), doctor, father, qualification, university.
$doc_name = trim($tr->children(1)->plaintext);
$father_name = trim($tr->children(2)->plaintext);
$quals_name = trim($tr->children(3)->plaintext);
$univ_name = trim($tr->children(4)->plaintext);
$link = $tr->find('a', 0);
if ($link != null) {
    // A detail link exists: fetch the doctor's page and pull the extra fields.
    $link_text = $link->href;
    $inner_html = scraperWiki::scrape("http://www.apmedicalcouncil.com/" . $link_text);
    $inner_dom->load($inner_html);
    $birth_date = trim($inner_dom->find('div[id="birth"]', 0)->plaintext);
    $sex = trim($inner_dom->find('div[id="sex"]', 0)->plaintext);
    $quals = trim($inner_dom->find('div[id="qualification"]', 0)->plaintext);
    $quals2 = trim($inner_dom->find('div[id="addqualification"]', 0)->plaintext);
    $add = trim($inner_dom->find('div[id="address"]', 0)->plaintext);
    $add_more = trim($inner_dom->find('div[id="address2"]', 0)->plaintext);
    $record = array('regd_num' => $regd_num, 'doc_name' => $doc_name, 'father_name' => $father_name, 'quals_name' => $quals_name, 'univ_name' => $univ_name, 'birth_date' => $birth_date, 'sex' => $sex, 'qualification' => $quals, 'advanced_quals' => $quals2, 'address' => $add, 'add_more' => $add_more);
} else {
    // No detail page: store only the listing columns.
    $record = array('regd_num' => $regd_num, 'doc_name' => $doc_name, 'father_name' => $father_name, 'quals_name' => $quals_name, 'univ_name' => $univ_name);
}
print_r($record);
$counter++;
//print_r($link);
//scraperwiki::save_sqlite(array("doc_name"),$record,"AP_Docs");
// Stop after two records — the save above is commented out, so this looks
// like a debugging run cut short on purpose.
if ($counter == 2) { break; } } } }
$inner_dom->__destruct();
$dom->__destruct();
/**
 * Scrape the NYC Council members table into a list of member records.
 *
 * @param string $url Members listing page URL.
 * @return array List of arrays with name, source, district, borough, party keys.
 */
function get_council_list($url)
{
    global $run_environment;
    global $max_records;
    $html = scraperWiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);
    // table/tr/td/div/table/tr/td[2]/table/tr/td/table/tr[5]
    $content = $dom->find("table[id=members_table]", 0);
    // FIX: initialise so an empty table returns an empty array instead of an
    // undefined variable.
    $council = array();
    $count = 1;
    foreach ($content->find("tr") as $row) {
        // Row 1 is the header; data starts on row 2.
        if ($count > 1) {
            $councilmember['name'] = $row->find("td", 0)->plaintext;
            $councilmember['source'] = 'http://council.nyc.gov' . $row->find("td", 0)->find("a", 0)->href;
            $councilmember['district'] = $row->find("td", 1)->plaintext;
            $councilmember['borough'] = $row->find("td", 2)->plaintext;
            $councilmember['party'] = $row->find("td", 3)->plaintext;
            $council[] = $councilmember;
        }
        $count++;
    }
    // Clear memory
    $dom->__destruct();
    $content->__destruct();
    return $council;
}
/**
 * Scrape an Open311 wiki endpoint table into a list of endpoint records.
 *
 * @param string $url Wiki page URL containing the endpoint table.
 * @return array List of arrays with jurisdiction, services, base_url, discovery keys.
 */
function get_link_list($url)
{
    global $run_environment;
    global $max_records;
    $html = scraperWiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);
    $content = $dom->find("table[class=wikitable]", 0);
    $count = 1;
    $endpoints = array();
    foreach ($content->find("tr") as $row) {
        // Skip the header row.
        if ($count == 1) {
            $count++;
            continue;
        }
        $jurisdiction_cell = $row->find("td", 0);
        $services_anchor = $row->find("td", 6)->find("a", 0);
        $endpoint = array();
        $endpoint['jurisdiction'] = $jurisdiction_cell ? trim($jurisdiction_cell->plaintext) : null;
        $endpoint['services'] = $services_anchor ? $services_anchor->href : null;
        // The base URL is everything before the trailing "services.xml".
        $endpoint['base_url'] = $endpoint['services'] ? substr($endpoint['services'], 0, strpos($endpoint['services'], 'services.xml')) : null;
        // Make sure we get the correct URL for discovery: prefer the link
        // whose text reads "xml" (the last one wins when several match).
        if ($row->find("td", 4)->find("a", 0)) {
            foreach ($row->find("td", 4)->find("a") as $link) {
                if (strtolower($link->plaintext) == 'xml') {
                    $endpoint['discovery'] = $link->href;
                }
            }
        }
        if (empty($endpoint['discovery'])) {
            $endpoint['discovery'] = null;
        }
        $endpoints[] = $endpoint;
        unset($endpoint);
        $count++;
    }
    // Clear memory
    $dom->__destruct();
    $content->__destruct();
    return $endpoints;
}
/**
 * Scrape a Europe Direct centre detail page and merge its fields into $result.
 *
 * Extracts the logo URL from the featured box's inline style, then splits the
 * address block (after the </h3> heading) on <br> into street / zip+city /
 * country followed by optional tel: / fax: / mailto lines, and finally
 * collects external links from the #euCenter area into 'seeAlso'.
 *
 * @param string $url    Detail page URL (fetched via the _getHTML helper).
 * @param array  $result Partially filled record to extend.
 * @return array The (possibly unchanged) $result — early returns occur when
 *               the page layout does not match expectations.
 */
function _handleDetailPage($url, $result)
{
    $html = _getHTML($url);
    $dom = new simple_html_dom();
    $dom->load($html);
    $divBoxes = $dom->find("div[@class='featured_box margin_top_fb']");
    if (count($divBoxes) < 1) {
        return $result; // layout mismatch: no featured box on this page
    }
    // The logo URL sits inside the box's inline style, between single quotes.
    $style = $divBoxes[0]->style;
    $styleParts = explode("'", $style);
    $imageURL = EUROPA_URL_BASE . $styleParts[1];
    $result['logoURL'] = $imageURL;
    // Address lines follow the </h3> heading, separated by <br> tags.
    $addressText = $divBoxes[0]->xmltext;
    $addressTextParts = explode('</h3>', $addressText);
    if (count($addressTextParts) !== 2) {
        return $result;
    }
    $addressText = $addressTextParts[1];
    $addressText = str_replace('<br />', '<br>', $addressText);
    $addressTextParts = explode('<br>', $addressText);
    if (count($addressTextParts) < 3) {
        return $result;
    }
    $street = trim($addressTextParts[0]);
    $matches = array();
    $curPos = 1;
    // If the first line holds no house number, the street continues on line 2;
    // $curPos then tracks where zip/city and country start.
    // NOTE(review): $street is built but never copied into $result — verify
    // whether a 'street' field was intended.
    preg_match('/^.*[0-9]+.*$/', $street, $matches);
    if (count($matches) === 0) {
        $street .= ' ' . trim($addressTextParts[$curPos++]);
    }
    $result['zipCity'] = trim($addressTextParts[$curPos++]);
    $result['country'] = trim($addressTextParts[$curPos++]);
    // Remaining lines: tel:, fax:, or an <a href="mailto:..."> contact link.
    for ($i = $curPos; $i < count($addressTextParts); ++$i) {
        $val = strtolower(trim($addressTextParts[$i]));
        if (substr($val, 0, 4) === 'tel:') {
            $result['tel'] = trim(substr($val, 4));
        } else {
            if (substr($val, 0, 4) === 'fax:') {
                $result['fax'] = trim(substr($val, 4));
            } else {
                if (substr($val, 0, 2) === '<a') {
                    // The href is the second quote-delimited token of the anchor tag.
                    $parts = explode('"', $val);
                    $result['mailto'] = trim($parts[1]);
                }
            }
        }
    }
    // Collect absolute links from the centre's content area.
    // NOTE(review): only 'http://' is matched — https links are dropped; confirm intended.
    $links = array();
    $aElements = $dom->find("div[@id='euCenter'] a");
    foreach ($aElements as $a) {
        if (strpos($a->href, 'http://') !== false) {
            $links[] = $a->href;
        }
    }
    if (count($links) > 0) {
        // Append to any seeAlso entries collected by earlier processing.
        if (isset($result['seeAlso'])) {
            $result['seeAlso'] = array_merge($result['seeAlso'], $links);
        } else {
            $result['seeAlso'] = $links;
        }
    }
    $dom->__destruct();
    return $result;
}