/**
 * Scrapes one page of the GEIPAN case search and saves one record per case row.
 *
 * The search form is submitted by POST; every <tr> carrying an "onclick"
 * attribute is treated as a case row and saved keyed on ('ID', 'maj').
 *
 * @param int|string $page Value sent in the "page" form field.
 */
function scrapPage($page)
{
    print "Scraping page " . $page;

    $url = "http://www.geipan.fr/index.php?id=202";
    $fields_string = "&no_cache=1&"
        . "tx_geipansearch_pi1%5Bsubmit_form%5D=1&"
        . "tx_geipansearch_pi1%5Btexte_resume%5D=&"
        . "tx_geipansearch_pi1%5Bdate_debut%5D=&"
        . "tx_geipansearch_pi1%5Bdate_fin%5D=&"
        . "no_cache=1&"
        . "tx_geipansearch_pi1%5Bclasse_cas%5D=tous&"
        . "tx_geipansearch_pi1%5Bregion%5D=&"
        . "page=" . $page . "&"
        . "order_by=&"
        . "sens=";

    $curl = curl_init($url);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($curl, CURLOPT_MAXREDIRS, 10);
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($curl, CURLOPT_TIMEOUT, 20);
    // CURLOPT_POST is an on/off switch, not a field count.
    curl_setopt($curl, CURLOPT_POST, true);
    curl_setopt($curl, CURLOPT_POSTFIELDS, $fields_string);
    $html = curl_exec($curl);
    print curl_error($curl) . "\n";
    curl_close($curl);

    // curl_exec() returns false on failure; there is nothing to parse then.
    if ($html === false || $html === '') {
        return;
    }

    $dom = new simple_html_dom();
    $dom->load($html);

    foreach ($dom->find("tr") as $tr) {
        if (!isset($tr->attr['onclick'])) {
            continue;
        }

        // The case ID is the 13 characters following "cas=" in the handler.
        $onclick = $tr->attr['onclick'];
        $casPos = strpos($onclick, "cas=");
        if ($casPos === false) {
            continue;
        }
        $ID = substr($onclick, $casPos + 4, 13);
        print $ID . "\n";

        $tds = $tr->find("td");
        if (count($tds) < 5) {
            continue;
        }

        $title = utf8_encode($tds[0]->plaintext);

        // The city is the title text before " (": drop the "(" and the one
        // character preceding it. Without a "(" the whole title is the city
        // (previously the last character was silently chopped off).
        $parenPos = strpos($title, "(");
        if ($parenPos === false) {
            $city = $title;
        } else {
            $city = substr($title, 0, max(0, $parenPos - 1));
        }

        $record = array(
            'ID' => $ID,
            'title' => $title,
            'date' => $tds[1]->plaintext,
            'departement' => utf8_encode($tds[2]->plaintext),
            'classe' => $tds[3]->plaintext,
            'maj' => $tds[4]->plaintext,
            'city' => $city,
        );
        scraperwiki::save(array('ID', 'maj'), $record);
    }

    // Release the parser's circular references (simple_html_dom memory leak).
    $dom->clear();
    unset($dom);
}
/**
 * Scrapes one club page and saves one record per season row.
 *
 * The club name is read from the third row of the first table; season rows
 * come from the third table and are recognised by a numeric first cell.
 * Records are keyed on ('club', 'year'). The formatted club name is also
 * appended to the global 'clubs' list.
 *
 * @param string $url Club page URL.
 */
function clubURL($url)
{
    $html = scraperwiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);

    $infoTable = $dom->find('table', 0);
    $nameRow = $infoTable ? $infoTable->find('tr', 2) : null;
    $seasonTable = $dom->find('table', 2);
    if (!$nameRow || !$seasonTable) {
        // Page does not have the expected layout; nothing to record.
        $dom->clear();
        unset($dom);
        return;
    }

    // NOTE(review): the search string below may originally have been an HTML
    // entity such as "&nbsp;" that was decoded somewhere along the way —
    // confirm against the upstream scraper.
    $clubName = trim(str_replace(' ', '', $nameRow->plaintext));
    $formatClubName = trim(preg_replace('/\\s+/', ' ', $clubName));

    // $_GLOBAL is not a superglobal, so the old assignment only filled a
    // throw-away local variable; $GLOBALS reaches the real global list.
    $GLOBALS['clubs'][] = $formatClubName;
    echo 'running ' . $formatClubName . "\n";

    foreach ($seasonTable->find('tr') as $row) {
        $cells = $row->find('td');
        if (!isset($cells[0]) || !is_numeric($cells[0]->plaintext)) {
            continue;
        }

        // Missing cells read as empty strings instead of dereferencing null.
        $text = array();
        foreach (array(0, 1, 2, 3, 4, 12) as $idx) {
            $text[$idx] = isset($cells[$idx]) ? $cells[$idx]->plaintext : '';
        }

        $position = trim(str_replace(' ', '', $text[1]));
        if ($position == 'Champion') {
            $position = 1;
        }

        $dataset = array(
            'club' => $formatClubName,
            'year' => trim($text[0]),
            'finishedPosition' => $position,
            'league' => trim($text[2]),
            'overallPosition' => trim($text[3]),
            // Dots are thousands separators in the attendance figures.
            'avgAttendance' => trim(str_replace('.', '', $text[4])),
            'totalAttendance' => trim(str_replace('.', '', $text[12])),
        );
        scraperwiki::save(array('club', 'year'), $dataset);
    }

    /*
     * The next two lines stop a memory leak in Simple HTML DOM as per
     * http://simplehtmldom.sourceforge.net/manual_faq.htm#memory_leak
     */
    $dom->clear();
    unset($dom);
}
/**
 * Downloads a TED RSS feed and saves one record per <item>.
 *
 * Each item's <guid> is turned into a notice URL by replacing "TEXT" with
 * "DATA"; the record (time, sector, url) is keyed on ('sector', 'url').
 * The function sleeps one second after every save.
 *
 * @param string $url    Feed URL.
 * @param string $sector Sector label stored with every record.
 */
function scrapeTEDRSS($url, $sector)
{
    print $url . " " . $sector . "\n";

    $curl = curl_init($url);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($curl, CURLOPT_MAXREDIRS, 10);
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
    // Abort the whole transfer after 20 seconds.
    curl_setopt($curl, CURLOPT_TIMEOUT, 20);
    $xml = curl_exec($curl);
    print curl_error($curl) . "\n";
    curl_close($curl);

    // curl_exec() returns false on failure; there is no feed to parse then.
    if ($xml === false || $xml === '') {
        return;
    }

    $dom = new simple_html_dom();
    $dom->load($xml);

    foreach ($dom->find("item") as $item) {
        $guid = $item->find("guid");
        if (!isset($guid[0])) {
            continue;
        }

        $noticeURL = str_replace("TEXT", "DATA", $guid[0]->plaintext);
        print $noticeURL . " " . $sector . " " . memory_get_usage() / 1000000 . "MB";
        echo "\n";

        $record = array(
            'time' => microtime(true),
            'sector' => $sector,
            'url' => $noticeURL,
        );
        scraperwiki::save(array('sector', 'url'), $record);
        sleep(1);
    }

    // clear() is the documented way to free the parser; calling the
    // destructor by hand is not.
    $dom->clear();
    unset($dom);
    unset($xml);

    print memory_get_usage() / 1024 / 1024 . "MB\n";
}
/**
 * Stores one joke under the next sequential id.
 *
 * Increments the global $joke_count and saves the text keyed on 'JOKE_ID'.
 *
 * @param string $txt Joke text.
 */
function saveIt($txt)
{
    global $joke_count;

    // Advance the shared counter first; the new value is this joke's id.
    ++$joke_count;

    $row = array();
    $row['JOKE_ID'] = $joke_count;
    $row['JOKE_TEXT'] = $txt;

    scraperwiki::save(array('JOKE_ID'), $row);
}
/**
 * Follows the second "callout" link of a search-result page and saves every
 * ingredient listed on the page it leads to.
 *
 * "reviews/" is stripped from the link's href before it is fetched from
 * www.foodnetwork.com. Each <li class="ingredient"> is saved keyed on 'ing'.
 *
 * @param string $html HTML of the page containing the callout links.
 */
function getIngredients($html)
{
    $dom = new simple_html_dom();
    $dom->load($html);

    $callout = $dom->find('a[class=callout]', 1);
    if (!$callout) {
        // No second callout link: nothing to follow.
        $dom->clear();
        unset($dom);
        return;
    }

    $res = str_replace("reviews/", "", $callout->href);
    $dom->clear();
    unset($dom);

    echo "http://www.foodnetwork.com" . $res;
    $html1 = scraperwiki::scrape("http://www.foodnetwork.com" . $res);
    $h = str_get_html($html1);
    echo "\n\n";

    // str_get_html() returns false for empty or oversized input.
    if (!$h) {
        return;
    }

    // The saved 'href' is taken from the document object, exactly as before,
    // but now defaults to null instead of being an undefined variable.
    // NOTE(review): the check is loop-invariant and looks like it never
    // matches — confirm which link was meant to be stored.
    $href = null;
    if (isset($h->href)) {
        $href = $h->href;
    }

    foreach ($h->find('li[class=ingredient]') as $data) {
        scraperwiki::save(
            array('ing'),
            array('ing' => $data->plaintext, 'href' => $href)
        );
    }

    $h->clear();
    unset($h);
}
/**
 * Scrapes translation pairs from a page and saves one record per pair.
 *
 * Every <ul class="trans_sent"> contributes its first <li> as the user input
 * and its second <li> as the translated output, together with the scrape
 * timestamp, the page URL and the language label.
 *
 * @param string $url  Page to scrape.
 * @param string $lang Language label stored with every record.
 */
function gazelangs($url, $lang)
{
    $html = scraperWiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);

    foreach ($dom->find("ul[@class='trans_sent']") as $data) {
        $tds = $data->find("li");

        // A pair needs both items; skip malformed lists instead of
        // dereferencing a missing element.
        if (count($tds) < 2) {
            continue;
        }

        $record = array(
            'user_input' => $tds[0]->plaintext,
            'babelfish_output' => $tds[1]->plaintext,
            'timestamp_scrape' => date("Y-m-d H:i:s"),
            'page' => $url,
            'language' => $lang,
        );
        scraperwiki::save(
            array('user_input', 'babelfish_output', 'timestamp_scrape', 'page', 'language'),
            $record
        );
    }

    // Release the parser's circular references (simple_html_dom memory leak).
    $dom->clear();
    unset($dom);
}
/**
 * Scrapes the IdeeLab "uudis" category page and saves each published post.
 *
 * For every div.status-publish the first matching title heading and the
 * first div.entry are stored, keyed on ('title', 'newsbody').
 */
function scrapeIdeeLab()
{
    $markup = scraperWiki::scrape("http://ideelab.wordpress.com/category/uudis/");

    $page = new simple_html_dom();
    $page->load($markup);

    $posts = $page->find('div.status-publish');
    foreach ($posts as $post) {
        $headings = $post->find('div.posttitle h2.pagetitle');
        $bodies = $post->find('div.entry');

        scraperwiki::save(
            array('title', 'newsbody'),
            array(
                'title' => $headings[0]->plaintext,
                'newsbody' => $bodies[0]->plaintext,
            )
        );
    }
}
/**
 * Scrapes a proxy-list page and saves every decoded IP address.
 *
 * Only rows of #tbl_proxy_list with exactly seven cells are used; the first
 * cell's markup is passed to decode_ip() and the result saved keyed on 'ip'.
 *
 * @param string $url Proxy-list page URL.
 */
function grab($url)
{
    $markup = scraperWiki::scrape($url);

    $listing = new simple_html_dom();
    $listing->load($markup);

    foreach ($listing->find("#tbl_proxy_list tr") as $tableRow) {
        $cells = $tableRow->find("td");
        if (count($cells) != 7) {
            continue;
        }

        $address = decode_ip((string) $cells[0]);
        scraperwiki::save(array('ip'), array('ip' => $address));
    }
}
/**
 * Extracts link, title, description and date from one HTML fragment and
 * saves them keyed on 'title'.
 *
 * Link, title and description all come from the same anchor (the first
 * 'li[class="first last"] a'). The date is parsed as d/m/Y from the last ten
 * characters of the first span.date-display-single.
 *
 * @param string $value HTML fragment for a single entry.
 */
function extract_data($value)
{
    $fragment = str_get_html($value);

    // One lookup serves all three fields: they read the same element.
    $anchor = $fragment->find('li[class="first last"] a', 0);
    $dateSpan = $fragment->find('span[class="date-display-single"]', 0);

    $rawDate = substr($dateSpan->plaintext, -10);
    $when = date_create_from_format('d/m/Y', $rawDate);
    print_date($when);

    $row = array(
        'link' => $anchor->href,
        'title' => $anchor->plaintext,
        'description' => $anchor->plaintext,
        'date' => $when,
    );
    scraperwiki::save(array('title'), $row);
}
/**
 * Fetches one excuse page, saves its first <h2> and moves on to the next URL.
 *
 * The parsed page is left in the global $html and the global $count is
 * incremented to form the record id, before goToNextURL() is called.
 *
 * @param string $extension Path of the excuse page, appended to the site root.
 */
function getExcuse($extension)
{
    global $html, $count;

    $site = "http://www.goodexcuses.co.uk";
    $html = file_get_html($site . $extension);

    // The excuse text is the first heading on the page.
    $heading = $html->find('h2', 0);
    $excuse = $heading->innertext;
    echo $excuse . "\n";

    // Save to DB under the next sequential id.
    ++$count;
    $row = array(
        'EXCUSE_ID' => $count,
        'EXCUSE_TEXT' => $excuse,
        'EXCUSE_URL' => $extension,
    );
    scraperwiki::save(array('EXCUSE_ID'), $row);

    // Continue with the next page.
    goToNextURL();
}
/**
 * Scrapes a gallery detail page and saves one record per div#contentDetail1.
 *
 * Name comes from the first <h3>; contact, address and phone numbers from
 * the ".adres" elements; URL and e-mail from the first two anchors inside
 * them; artists from the first ".artists" element. Every field is part of
 * the unique key.
 *
 * @param string $url Gallery page URL.
 */
function loadPageGallery($url)
{
    $markup = scraperWiki::scrape($url);

    $page = new simple_html_dom();
    $page->load($markup);

    $uniqueKeys = array('name', 'contact', 'url', 'email', 'address', 'tel1', 'tel2', 'artists');

    foreach ($page->find("div#contentDetail1") as $detail) {
        $headings = $detail->find("h3");
        $addressBlocks = $detail->find('.adres');
        $anchors = $detail->find('.adres a');
        $artistBlocks = $detail->find('.artists');

        // First line of the first address block is the contact name.
        $contactLines = explode("\n", $addressBlocks[0]->plaintext);
        list($contactNameGallery) = $contactLines;

        // The fifth address block holds up to two phone numbers, one per line.
        $phoneLines = explode("\n", $addressBlocks[4]->plaintext);
        list($tel1, $tel2) = $phoneLines;

        $row = array(
            'name' => $headings[0]->plaintext,
            'contact' => $contactNameGallery,
            'url' => $anchors[0]->plaintext,
            'email' => $anchors[1]->plaintext,
            'address' => $addressBlocks[1]->plaintext,
            'tel1' => $tel1,
            'tel2' => $tel2,
            'artists' => $artistBlocks[0]->plaintext,
        );
        scraperwiki::save($uniqueKeys, $row);
    }
}
/**
 * Scrapes one page of job-search results and saves one record per job row.
 *
 * Only table rows with exactly five cells are used. The id is the integer
 * value of the named anchor in the first cell; the detail URL is the first
 * link in the third cell. Records are keyed on 'id'.
 *
 * @param int|string $page Page number appended to the search URL.
 */
function scrape_job_page($page)
{
    $page_html = scraperWiki::scrape("https://jobsearch.direct.gov.uk/JobSearch/PowerSearch.aspx?tm=0&pg=" . $page);
    $dom = new simple_html_dom();
    $dom->load($page_html);

    foreach ($dom->find("table tr") as $data) {
        $tds = $data->find("td");
        if (count($tds) != 5) {
            continue;
        }

        $id_hyperlink = $tds[0]->find('a[name]', 0);
        $more_info_anchor = $tds[2]->find('a', 0);

        // Rows without both anchors are not job rows; previously they were
        // either fatal or saved under id 0.
        if (!$id_hyperlink || !$more_info_anchor) {
            continue;
        }

        $more_info_hyperlink = $more_info_anchor->href;
        print $more_info_hyperlink;

        $record = array(
            'id' => intval($id_hyperlink->name),
            'posted_date' => date_create($tds[0]->plaintext),
            'job_title' => trim($tds[2]->plaintext),
            'company' => trim($tds[3]->plaintext),
            'location' => trim($tds[4]->plaintext),
            'url' => $more_info_hyperlink,
        );
        scraperwiki::save(array('id'), $record);
    }

    // clear() is the documented way to free the parser; calling the
    // destructor by hand is not.
    $dom->clear();
    unset($dom);
}
/**
 * Scrapes the municipality list of one ATO and saves (comune, id) pairs.
 *
 * For every table row containing a link, the municipality id is what remains
 * of the href after removing "dettaglio_trasmissione.php?IdComune=" and
 * everything from the last "&" onwards. Records are keyed on 'id'.
 *
 * @param string $p_atoCODE ATO code placed in the "ato" query parameter.
 */
function getCitieListByATO($p_atoCODE = "")
{
    $html = scraperWiki::scrape("http://www.rifiutiebonifica.puglia.it/dettaglio_differenziata.php?ato=" . $p_atoCODE . "&data=12");
    $dom = new simple_html_dom();
    $dom->load($html);

    foreach ($dom->find("table tr") as $data) {
        $tds = $data->find("td");
        $a = $data->find("a");
        if (!isset($a[0]) || !isset($tds[0])) {
            continue;
        }

        $link = str_replace("dettaglio_trasmissione.php?IdComune=", "", $a[0]->href);

        // Cut the trailing parameters. Without an "&" the whole remainder is
        // the id (previously strrpos() === false produced an empty id).
        $position = strrpos($link, "&");
        $id = ($position === false) ? $link : substr($link, 0, $position);

        $comuni = array('comune' => $tds[0]->plaintext, 'id' => $id);
        scraperwiki::save(array('id'), $comuni);
    }

    // Release the parser's circular references (simple_html_dom memory leak).
    $dom->clear();
    unset($dom);
}
/**
 * Scrapes the first 20 Alexa "top sites" pages for Italy and saves every
 * site label, keyed on 'site'. Prints the number of labels saved.
 */
function topSites()
{
    $country = 'IT';
    $limit = 20;
    $count = 0;

    for ($page = 0; $page < $limit; ++$page) {
        $html = scraperWiki::scrape("http://www.alexa.com/topsites/countries;" . $page . "/" . $country);
        $dom = new simple_html_dom();
        $dom->load($html);

        foreach ($dom->find("span[class=topsites-label]") as $data) {
            $record = array('site' => $data->plaintext);
            scraperwiki::save(array('site'), $record);
            $count++;
        }

        // A new parser is created per page; free the old one so memory does
        // not grow with every iteration (simple_html_dom memory leak).
        $dom->clear();
        unset($dom);
    }

    print $count;
}
/**
 * Scrapes an overview page and saves one record per listed entry.
 *
 * Each nested table row is expected to hold an image cell followed by a text
 * cell. The record id is "<type>--<name lower-cased, spaces as dashes>";
 * link and thumbnail fall back to empty strings when absent.
 *
 * @param string $url  Overview page URL.
 * @param string $type Type label stored with, and prefixed to the id of,
 *                     every record.
 */
function data_from_overview_page($url, $type)
{
    $base_url = 'http://www.dnr.state.mn.us';

    $html = scraperWiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);

    foreach ($dom->find(".paddingbig table tr table tr") as $rows) {
        $data = $rows->find("td");

        // Both cells are required; rows without them used to trigger a
        // method call on null.
        if (count($data) < 2) {
            continue;
        }

        $link_image = $data[0]->find("a");
        $image = $data[0]->find("img");
        $link_text = $data[1]->find("a");
        $name = isset($link_text[0]) ? $link_text[0]->plaintext : null;

        if (!empty($data[0]->plaintext)) {
            $record = array(
                'id' => $type . '--' . strtolower(str_replace(' ', '-', $name)),
                'type' => $type,
                'name' => $name,
                'link' => !empty($link_image[0]->href) ? $base_url . $link_image[0]->href : '',
                'thumb_url' => !empty($image[0]->src) ? $image[0]->src : '',
                'timestamp' => time(),
            );
            scraperwiki::save(array('id'), $record);
        }
    }

    // Release the parser's circular references (simple_html_dom memory leak).
    $dom->clear();
    unset($dom);
}
/**
 * Scrapes one result page and saves every "h2 a" link for a category.
 *
 * Each href is echoed and saved keyed on ('cat', 'url'); the number of links
 * found is echoed at the end. Only the requested page is processed — the
 * function does not continue to the following page.
 *
 * @param string     $url     Base listing URL (without the page parameter).
 * @param int|string $pagenum Page number sent as "?page=".
 * @param string     $cat     Category label stored with every link.
 */
function getPageOfResults($url, $pagenum, $cat)
{
    $markup = scraperWiki::scrape($url . "?page=" . $pagenum);

    $listing = new simple_html_dom();
    $listing->load($markup);

    $found = 0;
    foreach ($listing->find("h2 a") as $anchor) {
        $target = $anchor->href;
        echo $target . "\n";

        $found++;
        scraperwiki::save(
            array('cat', 'url'),
            array('cat' => $cat, 'url' => $target)
        );
    }

    echo "got " . $found . " results\n";
}
/**
 * Scrapes one page of the parliament document listing, saves every document
 * row, then follows the "Next" link(s) recursively.
 *
 * Document rows are those whose first cell has height 30. Name, date, house
 * and language come from cells 0, 2, 4 and 6; the link and the file-type
 * icon from cell 8. Records are keyed on 'url'.
 *
 * @param string $url Listing page URL.
 */
function scrapepage($url)
{
    $baseUrl = 'http://www.parliament.gov.za/live/';

    $html = scraperWiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);

    foreach ($dom->find("table table table table table table tr") as $row) {
        $tds = $row->find('td');

        // Cells 0..8 are read below; shorter rows cannot be document rows.
        if (count($tds) < 9 || $tds[0]->height != 30) {
            continue;
        }

        $link = $tds[8]->find('a');
        if (!isset($link[0])) {
            // No download link means no usable unique key.
            continue;
        }
        $img = $tds[8]->find('img');
        $iconSrc = isset($img[0]) ? $img[0]->src : '';

        // Reset per row so an unknown icon no longer inherits the previous
        // row's type (or an undefined variable on the first row).
        $type = '';
        if ($iconSrc == 'images/icon_word.gif') {
            $type = '.doc';
        } elseif ($iconSrc == 'images/icon_pdf.gif') {
            $type = '.pdf';
        }

        $document = array();
        $document['name'] = $tds[0]->plaintext;
        if ($tds[2]->plaintext == '-') {
            $document['date'] = '';
        } else {
            $document['date'] = get_date(date_create($tds[2]->plaintext));
        }
        $document['house'] = $tds[4]->plaintext;
        $document['language'] = $tds[6]->plaintext;
        $document['url'] = $baseUrl . $link[0]->href;
        $document['type'] = $type;

        scraperwiki::save(array('url'), $document);
    }

    // Find the next page(s) to scrape. Targets are de-duplicated so that a
    // page showing the same "Next" link more than once is followed only once.
    $nextUrls = array();
    foreach ($dom->find("table[style=height:26px] a") as $link) {
        if ($link->plaintext == 'Next') {
            $nextUrls[$baseUrl . $link->href] = true;
        }
    }

    // Free this page before recursing so parsers do not pile up along the
    // whole pagination chain (simple_html_dom memory leak).
    $dom->clear();
    unset($dom);

    foreach (array_keys($nextUrls) as $nextUrl) {
        scrapepage($nextUrl);
    }
}
/**
 * Scrapes the first 20 Alexa "top sites" pages for the US and saves every
 * site description, keyed on 'site'. Prints the number of records saved.
 *
 * A description that is exactly one space is replaced by the placeholder
 * "none<page><count>".
 */
function topSites()
{
    $country = 'US';
    $limit = 20;
    $count = 0;

    for ($page = 0; $page < $limit; ++$page) {
        $html = scraperWiki::scrape("http://www.alexa.com/topsites/countries;" . $page . "/" . $country);
        $dom = new simple_html_dom();
        $dom->load($html);

        foreach ($dom->find("div[class=description]") as $data) {
            $foobar = 'none' . $page . $count;
            if ($data->plaintext != " ") {
                $foobar = $data->plaintext;
            }

            $record = array('site' => $foobar);
            scraperwiki::save(array('site'), $record);
            $count++;
        }

        // A new parser is created per page; free the old one so memory does
        // not grow with every iteration (simple_html_dom memory leak).
        $dom->clear();
        unset($dom);
    }

    print $count;
}
/**
 * Scrapes a GCIS contact list and saves one record per "Mail address:" row.
 *
 * Two-cell rows are read as label/value pairs. Minister, portfolio, telephone
 * and fax values are remembered as they are encountered; a "Mail address:"
 * row completes the record, which is saved keyed on 'minister'. Remembered
 * values persist from row to row until overwritten by a later row.
 *
 * @param int|string $id   Value of the "id" query parameter.
 * @param string     $type Value of the "heading" query parameter.
 */
function scrape_ministers($id, $type)
{
    $html = scraperWiki::scrape("http://apps.gcis.gov.za/gcis/InternetIncludes/gcis_list.jsp?id={$id}&heading={$type}");
    $dom = new simple_html_dom();
    $dom->load($html);

    // Row label => record field it fills.
    $labelToField = array(
        "Minister:" => 'minister',
        "Portfolio:" => 'portfolio',
        "Tel no:" => 'tel_no',
        "Fax no:" => 'fax_no',
    );

    // Defined up front so a "Mail address:" row that precedes the other
    // labels no longer reads undefined variables.
    $current = array(
        'minister' => null,
        'portfolio' => null,
        'tel_no' => null,
        'fax_no' => null,
    );

    foreach ($dom->find("tr") as $data) {
        $tds = $data->find("td");
        if (count($tds) != 2) {
            continue;
        }

        $label = $tds[0]->plaintext;
        $value = $tds[1]->plaintext;

        if (isset($labelToField[$label])) {
            $current[$labelToField[$label]] = $value;
        } elseif ($label == "Mail address:") {
            // 'minister' is the unique key; without it there is nothing
            // meaningful to save.
            if ($current['minister'] === null) {
                continue;
            }
            $record = $current;
            $record['mail address'] = $value;
            scraperwiki::save(array('minister'), $record);
        }
    }

    // Release the parser's circular references (simple_html_dom memory leak).
    $dom->clear();
    unset($dom);
}
/**
 * Parses the main school-listing table and saves one enriched record per row.
 *
 * Every child of the third <table> after the first is split into fields; the
 * second field (the MPI code) is used to fetch the school's detail page,
 * from which telephone, fax, e-mail, web address and study tracks are read.
 * Records with a 'denominazione' longer than one character are saved keyed
 * on ('denominazione', 'codiceMPI').
 *
 * @param string $html HTML of the listing page.
 */
function create_dataset2($html)
{
    // Position of every listing field inside the split row, and whether the
    // raw text goes through html_entity_decode() before being trimmed.
    $listingFields = array(
        'denominazione' => array(0, true),
        'codiceMPI' => array(1, false),
        'tipologia' => array(2, true),
        'tipologiaIIgrado' => array(3, true),
        'descrizione' => array(4, true),
        'indirizzo' => array(5, true),
        'località' => array(6, true),
        'cap' => array(7, false),
        'comune' => array(8, true),
        'provincia' => array(9, true),
        'regione' => array(10, true),
        'codIstitutoComprensivo' => array(11, true),
    );

    $dom = new simple_html_dom();
    $dom->load($html);

    // Make sure the table really exists before walking it.
    $table = $dom->find('table', 2);
    if (!$table) {
        $dom->clear();
        unset($dom);
        return;
    }

    $i = 0;
    foreach ($table->children() as $data) {
        echo "parsing info tabella principale";

        $res = ($data != null) ? trim($data->plaintext) : '';

        // The first child is skipped, as are empty rows.
        if ($i > 0 && strlen($res) > 0) {
            // The search string had been reduced to a bare quote (a syntax
            // error); the numeric apostrophe entity is the evident original.
            $res = str_replace('&#039;', "'", $res);

            // Split the row into its fields. split() no longer exists;
            // explode() is equivalent for a literal delimiter.
            // NOTE(review): the delimiter may originally have been an HTML
            // entity (e.g. "&nbsp;") — confirm against the live page.
            $array_result = explode(' ', $res);

            // Keep the MPI code; it identifies the detail page.
            $codMPI = isset($array_result[1]) ? trim($array_result[1]) : '';
            $url_MPI = "http://www.trampi.istruzione.it/ricScu/dettaglio.do?cod=" . $codMPI;

            $detailHtml = scraperwiki::scrape($url_MPI);
            $dom_mpi = new simple_html_dom();
            $dom_mpi->load($detailHtml);

            $tel = "";
            $fax = "";
            $email = "";
            $web = "";
            $indS = "";

            foreach ($dom_mpi->find('table[cellspacing=1] tr') as $data_mpi) {
                // Each detail row reads "label: value".
                $values = explode(':', $data_mpi->plaintext . "\n");
                $label = $values[0];
                if (strlen($label) == 0) {
                    continue;
                }
                $value = isset($values[1]) ? $values[1] : '';

                if (stripos($label, 'tel') !== false) {
                    $tel = trim($value);
                } elseif (stripos($label, 'fax') !== false) {
                    $fax = trim($value);
                } elseif (stripos($label, 'e-mail') !== false) {
                    $email = trim($value);
                } elseif (stripos($label, 'web') !== false) {
                    // A URL contains ':' itself, so re-join everything after
                    // the label. (The old loop used each() and "$key = 2",
                    // an assignment, and glued on the wrong piece.)
                    $web = trim(implode(':', array_slice($values, 1)));
                } elseif (stripos($label, 'studio') !== false) {
                    $indS = str_replace('</td>', '', $value);
                    $indS = str_replace('</tr>', '', $indS);
                    $indS = str_replace(array("\r", "\t", "\n"), '', $indS);
                    $indS = trim($indS);
                }
            }

            $dom_mpi->clear();
            unset($dom_mpi);

            $dataset = array();
            foreach ($listingFields as $fieldName => $spec) {
                list($index, $decode) = $spec;
                // Missing columns become empty strings.
                $raw = isset($array_result[$index]) ? $array_result[$index] : '';
                $dataset[$fieldName] = trim($decode ? html_entity_decode($raw) : $raw);
            }
            $dataset['telefono'] = $tel;
            $dataset['fax'] = $fax;
            $dataset['email'] = $email;
            $dataset['web'] = $web;
            $dataset['IndirizziStudio'] = trim(html_entity_decode($indS));

            if (strlen($dataset['denominazione']) > 1) {
                scraperwiki::save(array('denominazione', 'codiceMPI'), $dataset);
            }
        }

        $i = $i + 1;
    }

    // Free the DOM, otherwise the scraper runs out of memory.
    $dom->clear();
    unset($dom);
}
$terminate = $json->count; if ($terminate == 0) { $t_count = 1; break; } $list = $html2->find('div[class=fk-srch-item fk-inf-scroll-item]'); if (!empty($list)) { foreach ($list as $src) { $k = $k + 1; $linki = $src->find('h2', 0); $link = $linki->first_child()->href; $title1 = $linki->first_child()->plaintext; $author = $linki->next_sibling()->plaintext; $info = $src->first_child()->children(2)->plaintext; $record = array('id' => $k, 'link' => $link, 'title' => $title1, 'author' => $author, 'info' => $info, 'search_word' => $title); scraperwiki::save(array('id'), $record); $linki = "NA"; $link = "NA"; $title1 = "NA"; $author = "NA"; $info = "NA"; } } $html2->clear(); //if loop } $k1 = $k1 + 1; //break; //while loop } // break;
// Listing pages to scrape. Each table row on a listing page that links to a
// venue is followed, and the venue's profile page is saved keyed on 'name'.
$urls[] = 'http://www.opentable.com/chicago-graduation-party-places';
$urls[] = 'http://www.opentable.com/chicago-graduation-party-places';
$urls[] = 'http://www.opentable.com/chicago-wedding-reception-venues';

// The list contains a repeated entry; scrape every listing page only once.
foreach (array_unique($urls) as $url) {
    $html = scraperWiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);

    foreach ($dom->find("tr") as $row) {
        // Rows without a venue link used to trigger a scrape of an empty URL.
        $venueLinks = $row->find('a.GDRNmLk');
        if (!isset($venueLinks[0])) {
            continue;
        }

        $venueHtml = scraperWiki::scrape($venueLinks[0]->href);
        $domInner = new simple_html_dom();
        $domInner->load($venueHtml);

        $name = $domInner->find('h1.RestProfileTitle span');
        $description = $domInner->find('#RestaurantProfile_RestProfileGroupDiningTab_lblPrivateDiningContent, #RestaurantProfile_RestaurantProfileInfo_lblDescription');
        $location = $domInner->find('div.RestProfileAddress span');

        if (isset($name[0]) && isset($location[0])) {
            // The address block is split on <br/>: street, optional second
            // line, then "City, ST 12345".
            $parts = explode("<br/>", $location[0]->innertext);
            $addr2 = '';
            $city_state = isset($parts[1]) ? $parts[1] : '';
            if (count($parts) == 4) {
                $addr2 = $parts[1];
                $city_state = $parts[2];
            }

            // City, state and ZIP stay null when the line does not match,
            // instead of being filled from missing list() offsets.
            $arr = array('city' => null, 'state' => null, 'zip' => null);
            if (preg_match("/([^,]+),\\s*(\\w+)\\s*(\\d{5}(?:-\\d{4})?)/", $city_state, $matches)) {
                $arr['city'] = $matches[1];
                $arr['state'] = $matches[2];
                $arr['zip'] = $matches[3];
            }

            $record = array(
                'name' => $name[0]->innertext,
                'address1' => $parts[0],
                'address2' => $addr2,
                'city' => $arr['city'],
                'state' => $arr['state'],
                'zip' => $arr['zip'],
                // Stored value kept as-is so existing rows stay comparable.
                'type' => 'Restaruant',
                'description' => isset($description[0]) ? $description[0]->innertext : null,
            );
            scraperwiki::save(array('name'), $record);
        }

        // One parser per venue: free it right away (simple_html_dom leak).
        $domInner->clear();
        unset($domInner);
    }

    $dom->clear();
    unset($dom);
}
$dom = new simple_html_dom(); $dom->load($html); foreach ($dom->find("div.item-content tr") as $row) { $headers = $row->find("th"); $columns = $row->find("td"); if (preg_match("/Postcode/", $headers[0]->plaintext)) { echo $code . ": " . $columns[0]->plaintext; scraperwiki::save(array("code"), array("code" => $code, "postcode" => $columns[0]->plaintext)); break; } } } require 'scraperwiki/simple_html_dom.php'; // The following is a list of Exchange SAUIDs gleaned from SamKnows $codes = array("SWLAK", "SWLAS", "SWLCA", "SWLDR", "SWLDV", "SWLGC", "SWLHY", "SWLJ", "SWLJV", "SWLJZ", "SWLKB", "SWLKD", "SWLKY", "SWLLD", "SWLLF", "SWLLG", "SWLLM", "SWLLO", "SWLLP", "SWLLR", "SWLLU", "SWLLW", "SWLNI", "SWLNN", "SWLPI", "SWLQW", "SWLY", "SWLYA", "SWLYJ", "SWLYW", "SWMAD", "SWMAL", "SWMDE", "SWMDX", "SWMES", "SWMF", "SWMGR", "SWMGX", "SWMLZ", "SWMMN", "SWMMV", "SWMNF", "SWMT_EX", "SWMU", "SWMWY", "SWMYE", "SWMYG", "SWMYS", "SWMYU", "SWNB", "SWNBI", "SWNDO", "SWNDU", "SWNE_CH", "SWNE_EX", "SWNEN", "SWNES", "SWNM", "SWNNA", "SWNSN", "SWNTD", "SWOAG", "SWPBL", "SWPBM", "SWPDU", "SWPDW", "SWPEC", "SWPEK", "SWPEU", "SWPEV", "SWPHX", "SWPM", "SWPMQ", "SWPN", "SWPND", "SWPOM", "SWPP", "SWPQS", "SWPRU", "SWPTB", "SWPTH", "SWPTM", "SWPTY", "SWPUN", "SWPYH", "SWRAG", "SWRDA", "SWRDX", "SWRHA", "SWRHR", "SWRLS", "SWRRY", "SWRSO", "SWRSV", "SWRTH", "SWRVH", "SWRWI", "SWSAS", "SWSAW", "SWSDV", "SWSFJ", "SWSKJ", "SWSKU", "SWSMX", "SWSNI", "SWSSQ", "SWSVB", "SWSX", "SWSZX", "SWTAF", "SWTAJ", "SWTAT", "SWTB", "SWTDE", "SWTDU", "SWTEK", "SWTEZ", "SWTFA", "SWTFS", "SWTLL", "SWTLU", "SWTR", "SWTRF", "SWTRH", "SWTSA", "SWTUC", "SWUAH", "SWUAZ", "SWUCW", "SWUGI", "SWUGU", "SWUHN", "SWUTK", "SWUWN", "SWVLD", "SWVVW", "SWWCP", "SWWHT", "SWWJK", "SWWXC", "SWXNH", "SWXSX", "SWXTP", "SWXUU", "SWYBL", "SWYDU", "SWYRO", "SWYYN", "THEAR", "THH", "THHC", "THHD", "THHDY", "THHE", "THHF", "THHH", "THHM", "THHN", "THHRJ", "THHS", "THHT", "THHW", "THIN", "THIP", "THKB", "THKC", "THKE", "THLG", "THLL", "THLM", 
"THLP", "THLSN", "THM", "THMD", "THML", "THMO", "THMS", "THMSD", "THNB", "THNE", "THNL", "THNU", "THOH", "THOK", "THOL", "THOV", "THP", "THPC", "THPM", "THPS", "THRD", "THRG", "THRO", "THS", "THSBN", "THSCR", "THSE", "THSL", "THSL_UD", "THSPD", "THT", "THTAD", "THTF", "THTG", "THTH", "THTI", "THTT", "THTV", "THUB", "THWA", "THWDY", "THWI", "THWK", "THWL", "THWM", "THWN", "THWO", "THWP", "THWR", "THWT", "THWTH", "THWY", "THY", "WEWHAM", "WEWLOR", "WEWMAI", "WEWMAR", "WEWMAY", "WEWNPN", "WEWPAD", "WEWPRI", "WEWSOH", "WMCIT", "WMHAM", "WMHAN", "WMHAR", "WMHAS", "WMHAV", "WMHCH", "WMHIL", "WMHIM", "WMHOL", "WMHSW", "WMHX", "WMINK", "WMIPN", "WMIPS", "WMKD", "WMKDG", "WMKEM", "WMKLT", "WMKNI", "WMLEE", "WMLEI", "WMLIT", "WMLON", "WMLOW", "WMMAD", "WMMAL", "WMMFD", "WMMIC", "WMMTL", "WMNAN", "WMNEW", "WMOAK", "WMOMB", "WMONE", "WMPAX", "WMPEO", "WMPER", "WMPKR", "WMPOW", "WMRCR", "WMRID", "WMRJ", "WMROK", "WMROM", "WMRUD", "WMRUG", "WMSAN", "WMSBH", "WMSEI", "WMSEV", "WMSHB", "WMSMA", "WMSPA", "WMSPE", "WMSRK", "WMSTA", "WMSTD", "WMSTJ", "WMSTK", "WMSTO", "WMSTP", "WMSTU", "WMSUC", "WMSWY", "WMTEA", "WMTRE", "WMUPS", "WMUSN", "WMUTT", "WMUUS", "WMWAR", "WMWAT", "WMWES", "WMWET", "WMWHE", "WMWHS", "WMWIC", "WMWLN", "WMWLY", "WMWOO", "WMWR", "WMWRK", "WMWTM", "WMWYB", "WMWYC", "WMYAR", "WNHAE", "WNHAM", "WNHAN", "WNHAR", "WNHAT", "WNHAW", "WNHAY", "WNHCP", "WNHER", "WNHL", "WNHLN", "WNHMR", "WNHOD", "WNHOL", "WNHR", "WNHUN", "WNHUX", "WNHW", "WNIB", "WNIV", "WNKEL", "WNKER", "WNKIN", "WNKNG", "WNKNI", "WNKNO", "WNKT", "WNKYR", "WNLAD", "WNLAN", "WNLAR", "WNLBD", "WNLBG", "WNLBH", "WNLBR", "WNLBW", "WNLC", "WNLDA", "WNLDC", "WNLDD", "WNLDF", "WNLDG", "WNLDO", "WNLEA", "WNLED", "WNLEI", "WNLEO", "WNLEY", "WNLFF", "WNLFN", "WNLFS", "WNLFU", "WNLGD", "WNLGF", "WNLGG", "WNLGL", "WNLGN", "WNLGO", "WNLGW", "WNLGY", "WNLIN", "WNLIT", "WNLMD", "WNLMR", "WNLMY", "WNLN", "WNLNF", "WNLNO", "WNLNS", "WNLNY", "WNLON", "WNLR", "WNLRD", "WNLSF", "WNLSN", "WNLST", "WNLTH", "WNLTN", "WNLU", 
"WNLUD", "WNLVL", "WNLW", "WNLWA", "WNLWN", "WNLWW", "WNLYD", "WNLYI", "WNLYO", "WNM", "WNMAC", "WNMAN", "WNMAP", "WNMAR", "WNMB", "WNMD", "WNMDL", "WNMEI", "WNMFB", "WNMIC", "WNMM", "WNMOC", "WNMOE", "WNMON", "WNMOS", "WNMSB", "WNMSL", "WNMT", "WNMUC", "WNMUN", "WNMW", "WNNAN", "WNNBG", "WNNBR", "WNNCL", "WNNEB", "WNNEF", "WNNN", "WNNOR", "WNNOW", "WNNP", "WNNR", "WNNTP", "WNOAK", "WNOC", "WNOOD", "WNOSW", "WNPAI", "WNPAN", "WNPBK", "WNPCH", "WNPCO", "WNPDD", "WNPEB", "WNPEF", "WNPEG", "WNPEM", "WNPEN", "WNPG", "WNPIP", "WNPMN", "WNPNL", "WNPNN", "WNPNR", "WNPON", "WNPOR", "WNPRD", "WNPRE", "WNPRG", "WNPRL", "WNPRS", "WNPTD", "WNPTW", "WNPWL", "WNQH", "WNRAY", "WNRC", "WNRE", "WNRHD", "WNRHU", "WNRIW", "WNRM", "WNRNR", "WNROS", "WNROW", "WNRST", "WNRUA", "WNRUT", "WNRWX", "WNRYT", "WNSA", "WNSAM", "WNSAU", "WNSEA", "WNSEI", "WNSHA", "WNSHI", "WNSSM", "WNSSN", "WNSTE", "WNSTI", "WNSW", "WNSY", "WNTAL", "WNTAR", "WNTFG", "WNTH", "WNTHL", "WNTHR", "WNTIL", "WNTRA", "WNTRB", "WNTRG", "WNTRN", "WNTRR", "WNTRU", "WNTRW", "WNTRY", "WNTUD", "WNTV", "WNTW", "WNTYG", "WNTYN", "WNTYW", "WNUB", "WNUM", "WNUP", "WNVAL", "WNWA", "WNWCH", "WNWEL", "WNWEM", "WNWEO", "WNWET", "WNWIG", "WNWIT", "WNWOM", "WNWOR", "WNWPL", "WNWTN", "WNWUL", "WNWX", "WNWXL", "WNWXN", "WNYA", "WNYO", "WRKGDN", "WRNELMS", "WRPGRN", "WRPIM", "WRSKEN", "WRSLO", "WRSTHBK", "WRVAUX", "WRWHI", "WRWKEN", "WRWMIN", "WSHAM", "WSHAU", "WSHEL", "WSHOL", "WSIBR", "WSINN", "WSINS", "WSINV", "WSIRS", "WSIRV", "WSJOB", "WSJOH", "WSJOP", "WSJUR", "WSKBN", "WSKET", "WSKGE", "WSKIA", "WSKIB", "WSKIC", "WSKID", "WSKIE", "WSKIF", "WSKIG", "WSKII", "WSKIK", "WSKIL", "WSKIM", "WSKIN", "WSKIO", "WSKIP", "WSKIR", "WSKIU", "WSKIW", "WSKIY", "WSKKC", "WSKKD", "WSKKE", "WSKKF", "WSKKL", "WSKKN", "WSKKO", "WSKKR", "WSKKT", "WSKKZ", "WSKLM", "WSKLN", "WSKRK", "WSLAA", "WSLAB", "WSLAH", "WSLAK", "WSLAL", "WSLAM", "WSLAN", "WSLAR", "WSLAU", "WSLEA", "WSLED", "WSLEN", "WSLES", "WSLEW", "WSLEX", "WSLID", "WSLIS", "WSLOA", "WSLOC", 
"WSLOD", "WSLOE", "WSLOG", "WSLOH", "WSLOI", "WSLON", "WSLOS", "WSLOT", "WSLUI", "WSLUS", "WSMAB", "WSMAC", "WSMAH", "WSMAR", "WSMAU", "WSMAY", "WSMER", "WSMIL", "WSMIN", "WSMIT", "WSMOC", "WSMOD", "WSMOF", "WSMON", "WSMOS", "WSMOT", "WSMOU", "WSMUI", "WSNEA", "WSNEB", "WSNEC", "WSNEG", "WSNEL", "WSNES", "WSNEW", "WSOBA", "WSOCH", "WSOLD", "WSORM", "WSPAI", "WSPAL", "WSPAN", "WSPAR", "WSPAT", "WSPEN", "WSPIN", "WSPIR", "WSPOA", "WSPOC", "WSPOE", "WSPOL", "WSPOP", "WSPOR", "WSPOS", "WSPOW", "WSPRE", "WSPRO", "WSPTH", "WSPTN", "WSREN", "WSRHU", "WSRIN", "WSROC", "WSROT", "WSRUT", "WSSAL", "WSSAN", "WSSAQ", "WSSCA", "WSSCO", "WSSHE", "WSSHI", "WSSKI", "WSSKL", "WSSLI", "WSSOE", "WSSOK", "WSSOR", "WSSPR", "WSSTD", "WSSTE", "WSSTN", "WSSTO", "WSSTR", "WSSTT", "WSSTU", "WSSTW", "WSSYM", "WSTAB", "WSTAH", "WSTAR", "WSTAT", "WSTAY", "WSTHL", "WSTHO", "WSTIG", "WSTIN", "WSTIR", "WSTOB", "WSTOD", "WSTOR", "WSTOW", "WSTRO", "WSTUR", "WSTWE", "WSTWY", "WSTYN", "WSUDD", "WSULV", "WSUPL", "WSWAT", "WSWEK", "WSWEM", "WSWES", "WSWHB", "WSWHH", "WSWHI", "WSWIG", "WSWIS", "WWHARB", "WWHART", "WWHATH", "WWHAWK", "WWHAYL", "WWHBCK", "WWHBCM", "WWHCRX", "WWHELE", "WWHELS", "WWHEMY", "WWHENL", "WWHOLB", "WWHOLF", "WWHOLN", "WWHOLS", "WWHONI", "WWHTOR", "WWILCH", "WWILFR", "WWILMI", "WWINST", "WWIPPL", "WWISLE", "WWIVYB", "WWKENN", "WWKENT", "WWKGWR", "WWKILK", "WWKKWL", "WWKNGB", "WWKSTM", "WWLAND", "WWLANR", "WWLAPF", "WWLAUN", "WWLDOW", "WWLEED", "WWLIFT", "WWLISK", "WWLLAW", "WWLODD", "WWLOOE", "WWLOST", "WWLPRT", "WWLSTL", "WWLSUT", "WWLTRE", "WWLUPP", "WWLVET", "WWLWDN", "WWLYDF", "WWLYME", "WWLYNT", "WWMABT", "WWMARA", "WWMARK", "WWMART", "WWMAWG", "WWMBSH", "WWMCAN", "WWMDAM", "WWMEVA", "WWMILV", "WWMINE", "WWMITC", "WWMLBK", "WWMMAG", "WWMODY", "WWMORT", "WWMORW", "WWMOUS", "WWMPRT", "WWMSMT", "WWMTON", "WWMTVY", "WWMULL", "WWNABB", "WWNANP", "WWNCAD", "WWNCUR", "WWNCYR", "WWNETH", "WWNEWQ", "WWNFER", "WWNMOL", "WWNPTN", "WWNPWI", "WWNTAM", "WWNTAW", "WWNTCY", "WWOAKF", 
"WWOKEH", "WWOSMY", "WWOSTN", "WWPADS", "WWPAIG", "WWPAR", "WWPCMB", "WWPENZ", "WWPERR", "WWPINH", "WWPIPE", "WWPISA", "WWPLRN", "WWPOLP", "WWPORL", "WWPOST", "WWPOUN", "WWPRAZ", "WWPREA", "WWPRIN", "WWPRYN", "WWPSCO", "WWPSTK", "WWPTON", "WWPTRE", "WWPTWN", "WWPURI", "WWPYTH", "WWRACK", "WWREDR", "WWRILL", "WWROBO", "WWROCH", "WWRUMF", "WWSAGN", "WWSALC", "WWSALT", "WWSAMP", "WWSAUS", "WWSBNT", "WWSBUD", "WWSBUR", "WWSCAN", "WWSCHD", "WWSCIL", "WWSCLM", "WWSCOL", "WWSDAY", "WWSDOM", "WWSEAT", "WWSENN", "WWSFLM", "WWSGAB", "WWSGEN", "WWSGER", "WWSHAL", "WWSHAU", "WWSHEB", "WWSHER", "WWSHIP", "WWSHIR", "WWSIDB", "WWSIDM", "WWSILV", "WWSIVE", "WWSJUS", "WWSKEV", "WWSMAB", "WWSMAR", "WWSMER", "WWSMOL", "WWSMWG", "WWSMWS", "WWSOME", "WWSOWT", "WWSPAX", "WWSPET", "WWSTAL", "WWSTAR", "WWSTAV", "WWSTEN", "WWSTIC", "WWSTIT", "WWSTOC", "WWSTOG", "WWSTUD", "WWSUTT", "WWSWIM", "WWTAUN", "WWTAVI", "WWTEDB", "WWTEIG", "WWTEMP", "WWTHRE", "WWTIMB", "WWTINT", "WWTIVE", "WWTLIZ", "WWTOPS", "WWTORQ", "WWTORR", "WWTORX", "WWTOTN", "WWTPNT", "WWTREB", "WWTREG", "WWTRES", "WWTRUR", "WWUPOT", "WWVERY", "WWWADE", "WWWASH", "WWWBAY", "WWWCKR", "WWWDGT", "WWWDWN", "WWWEEK", "WWWELL", "WWWEMB", "WWWFRD", "WWWHEA", "WWWHIM", "WWWILL", "WWWILM", "WWWINC", "WWWITH", "WWWIVE", "WWWKLH", "WWWMON", "WWWMOR", "WWWOOD", "WWWOOL", "WWWSHM", "WWWZOY", "WWYEAL", "WWYELV", "WWYEOV", "WWYETM", "WWZELA"); foreach ($codes as $code) { echo "Loading " . $code . "..."; $html = scraperWiki::scrape("http://www.samknows.com/broadband/exchange/" . $code); echo "Loaded"; $dom = new simple_html_dom(); $dom->load($html); foreach ($dom->find("div.item-content tr") as $row) { $headers = $row->find("th"); $columns = $row->find("td"); if (preg_match("/Postcode/", $headers[0]->plaintext)) { echo $code . ": " . $columns[0]->plaintext; scraperwiki::save(array("code"), array("code" => $code, "postcode" => $columns[0]->plaintext)); break; } } }
$b += $inclusive ? 0 : strlen($start); $e = empty($stop) ? strlen($src) : strpos(strtolower($src), strtolower($stop), $b); $e += $inclusive ? strlen($stop) : 0; $e = $e > strlen($src) ? strlen($src) : $e; if ($e > $b) { return trim(substr($src, $b, $e - $b)); } } } $URL = 'http://en.wikipedia.org/wiki/ISO_3166-1_alpha-3'; $x = scraperWiki::scrape($URL); if (!empty($x)) { $x = partof($x, 'Officially assigned code elements</span></h3>', '<h3>'); $x = partof($x, '<table', null, true); $x = explode('</table>', $x); if (count($x) > 0) { foreach ($x as $y) { $y = explode('</tr>', $y); if (count($y) > 0) { foreach ($y as $z) { if (preg_match_all('/<td(.*?)>(.*?)<\\/td>/iu', $z, $m)) { if (count($m[2]) == 2) { $d = array('code' => trim(strip_tags($m[2][0])), 'label' => trim(strip_tags($m[2][1]))); scraperwiki::save(array('code'), $d); } } } } } } }
$dropDownList = $dom->find("#fachlicheZuordnung", 0); $max = 200000; foreach ($dropDownList->children() as $option) { if ($option->tag != 'option') { continue; } if (!preg_match('/^\\d+$/', $option->value, $m)) { continue; } if ($max-- < 1) { break; } $discipline = array(); $discipline['id'] = (int) $option->value; $discipline['title'] = $option->plaintext; scraperwiki::save(array('id'), $discipline); } function load_html($url, $parameters) { $ch = curl_init(); curl_setopt($ch, CURLOPT_URL, $url); curl_setopt($ch, CURLOPT_POST, count(explode('&', $parameters))); curl_setopt($ch, CURLOPT_POSTFIELDS, 'task=copyRequestParametersToSession&' . $parameters); curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); curl_setopt($ch, CURLOPT_HEADER, 1); $result = curl_exec($ch); curl_close($ch); preg_match_all('|Set-Cookie: (.*?);|U', $result, $m); $cookies = implode(';', $m[1]); echo $cookies . "\n"; $ch = curl_init();
scraperwiki::save(array('Link'), array('Link' => $name)); } } require 'scraperwiki/simple_html_dom.php'; //MUSEUM /* //museum for($i=764; $i<=49; $i++){ print $i."\n"; $html = scraperwiki::scrape("http://www.mamilade.de/kinder/2006700-4---1317074400-$i-1324941496.html"); # Use the PHP Simple HTML DOM Parser to extract <td> tags $dom = new simple_html_dom(); $dom->load($html); */ //gastro for ($i = 1; $i <= 765; $i++) { print $i . "\n"; $html = scraperwiki::scrape("http://www.mamilade.de/gastronomie/2024700-4---1317074400-{$i}-1324976513.html"); # Use the PHP Simple HTML DOM Parser to extract <td> tags $dom = new simple_html_dom(); $dom->load($html); //LINK foreach ($dom->find('a.headline400') as $name) { # Store data in the datastore $name = $name->href; //print $name. "\n"; scraperwiki::save(array('Link'), array('Link' => $name)); } }
// For every page in $pages_to_scrape: fetch it, extract a term (heading) and
// its pipe-separated definitions, and save them as one Term/Definition record.
foreach ($pages_to_scrape as $page) {
    $html = scraperwiki::scrape($base_url . $page);
    $sections_dom = new simple_html_dom();
    $sections_dom->load($html);

    // Term: text of the "h2 span.azpisarrera" element. When several match,
    // the last one wins (each match is still printed for tracing).
    $datah2X = '';
    foreach ($sections_dom->find('h2 span.azpisarrera') as $datah2) {
        $datah2X = utf8_encode($datah2->plaintext);
        print "h2: " . $datah2X . "\n";
    }

    // Fallback: no usable h2 term was found, so try the "h1 span" elements.
    if ($datah2X === '') {
        foreach ($sections_dom->find('h1 span') as $datah2) {
            $datah2X = utf8_encode($datah2->plaintext);
            print "h1: " . $datah2X . "\n";
        }
    }

    // Definitions: every "dt.ordaina strong" text, joined with "|"
    // (no trailing separator, empty string when nothing matches).
    $definitions = array();
    foreach ($sections_dom->find('dt.ordaina strong') as $data) {
        $definitions[] = utf8_encode($data->plaintext);
    }
    $alldata = implode('|', $definitions);

    print "data: " . $alldata . "\n----------------\n";

    $entry['Term'] = $datah2X;
    $entry['Definition'] = $alldata;
    scraperwiki::save(array('Definition'), $entry);

    // Release the parsed document before the next iteration; simple_html_dom
    // leaks memory when DOM objects are created in a loop without clear().
    $sections_dom->clear();
    unset($sections_dom);
}
<?php
// Scrapes the HUD New York family-shelter listing and stores one record per
// paragraph found in the page's content area, keyed on the 'shelter' field.
//
// The original script body was pasted twice; the second plain `require` of
// simple_html_dom.php would re-declare the library's symbols, and the second
// pass could only re-save the same keyed records. A single pass with
// require_once is kept.
require_once 'scraperwiki/simple_html_dom.php';

$html = scraperWiki::scrape("http://www.hud.gov/local/ny/homeless/familiesshelters.cfm");
//print $html . "\n";

$dom = new simple_html_dom();
$dom->load($html);

foreach ($dom->find("td#content-area p") as $data) {
    // The node is converted to its string form and split on <br>; the
    // segments are expected to be shelter name, address, city and phone.
    // NOTE(review): the string form presumably still contains the enclosing
    // <p> markup -- confirm whether the stored values should be cleaned.
    $rows = explode("<br>", (string) $data);
    # print_r($rows);
    # print count($rows);

    // Paragraphs with fewer than four segments previously triggered
    // undefined-offset notices; pad so missing fields are stored as null.
    $rows = array_pad($rows, 4, null);

    $record = array(
        'shelter' => $rows[0],
        'address' => $rows[1],
        'city'    => $rows[2],
        'phone'   => $rows[3],
    );
    print_r($record);
    scraperwiki::save(array('shelter'), $record);
}

// Free the parsed document (simple_html_dom keeps circular references).
$dom->clear();
unset($dom);
$gg_url = 'http://www.google.com.au/search?&num=100&tbm=plcs&hl=en&q=' . urlencode($query) . '&start='; */
// Walks result pages $start .. $npages-1 of the search URL in $gg_url,
// extracts name/address/suburb/phone/url for every listing with one regular
// expression, and saves each listing as its own record.
// $gg_url, $start and $npages are expected to be defined earlier in the script.

$i = 0;    // number of listings saved (reported at the end)
$size = 0; // total number of bytes downloaded

$options = array(
    CURLOPT_RETURNTRANSFER => true,
    CURLOPT_HEADER => false,
    CURLOPT_FOLLOWLOCATION => true,
    CURLOPT_ENCODING => "",
    CURLOPT_AUTOREFERER => true,
    CURLOPT_CONNECTTIMEOUT => 120,
    CURLOPT_TIMEOUT => 120,
    CURLOPT_MAXREDIRS => 10,
    CURLOPT_COOKIEFILE => "cookie.txt",
    CURLOPT_COOKIEJAR => "cookie.txt",
    CURLOPT_USERAGENT => "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.3) Gecko/2008092417 Firefox/3.0.3",
    CURLOPT_REFERER => "http://www.google.com/",
);

// All five columns together form the unique key of a saved record.
$keys = array('name', 'address', 'suburb', 'phone', 'url');

for ($page = $start; $page < $npages; $page++) {
    // The "start" offset is the page number followed by a literal "0".
    $ch = curl_init($gg_url . $page . '0');
    curl_setopt_array($ch, $options);
    $scraped = curl_exec($ch);

    if ($scraped === false) {
        // Report the transfer error instead of silently treating it as an
        // empty page, then move on to the next page.
        print curl_error($ch) . "\n";
        curl_close($ch);
        continue;
    }
    curl_close($ch);

    // Capture groups: 1 = address, 2 = suburb, 3 = phone, 4 = url, 5 = name.
    $results = array();
    preg_match_all('@line-height:1.24" valign="top">([^"]+)<br>([^"]+)<br>.<nobr>([^"]+)</nobr></table>.*<h3\\s*class="r">\\s*<a[^<>]*href="([^<>]*)"[^<>]*>(.*)</a>\\s*</h3>@siU', $scraped, $results);
    $address = $results[1];
    $suburb = $results[2];
    $phone = $results[3];
    $url = $results[4];
    $name = $results[5];

    // Save every listing exactly once. The previous version re-saved the
    // whole accumulated list on each iteration, which was quadratic.
    $found = count($results[0]);
    for ($zf = 0; $zf < $found; $zf++) {
        $record = array(
            'name' => $name[$zf],
            'address' => $address[$zf],
            'suburb' => $suburb[$zf],
            'phone' => $phone[$zf],
            'url' => $url[$zf],
        );
        scraperwiki::save($keys, $record);
        //echo $address[$zf]." : ".$suburb[$zf]." : ".$phone[$zf]." : ".$url[$zf]." : ".$name[$zf]." \n";
    }

    $i += $found;
    $size += strlen($scraped);
}

//fclose($fp);
echo "Number of results: {$i} Total KB read: " . $size / 1024.0;
print "Done.";
$link_number += 1; print "Brace yourselfs...\n"; } print "Foreach-loop made it through the end. Wow.\n\nBrace yourselfs...\n"; } print "For-loop made it through the end. Phew.\n"; print "\n"; print "First URL was \"" . $allthelinks[0] . "\"\n"; print "Last URL was \"" . $attribute_href . "\"\n"; print $link_number . " URLs found initially.\n"; $allthelinks = array_unique($allthelinks); // Throwing out those double entries $allthelinks = array_values($allthelinks); // Making new index $link_number = count($allthelinks); // Counting what's left print $link_number . " unique URLs found.\n"; $countdat = 0; foreach ($allthelinks as $datlink) { if (stripos($datlink, $victimstrace) !== FALSE) { $alltheuserlinks = array($countdat, $datlink); $countdat += 1; } } print $countdat . " URLs found belong to " . $thevictim . ".\n\n"; print "Here it goes... nggghhh\n"; var_dump($alltheuserlinks); // And now taking a dump in your face //print "scraperwiki::save says: "; scraperwiki::save(0, var_dump($allthelinks));