/**
 * Scrapes asuntojen.hintatiedot.fi apartment-price listings and stores
 * each data row into a per-run SQLite table.
 *
 * Query parameters (c, s, r, amin, amax, z) and the running row counter
 * come from $GLOBALS, matching the original script's design. A full page
 * is 50 rows, so as long as exactly 50 rows are found the next page
 * (z+1) is fetched; the original recursed for this, which risked hitting
 * the call-stack on large result sets — a loop is equivalent and safer.
 */
function scrape_page() {
    do {
        $row = 0;
        $html = scraperWiki::scrape(
            "http://asuntojen.hintatiedot.fi/haku/?c=" . $GLOBALS['c']
            . "&s=" . $GLOBALS['s']
            . "&r=" . $GLOBALS['r']
            . "&amin=" . $GLOBALS['amin']
            . "&amax=" . $GLOBALS['amax']
            . "&z=" . $GLOBALS['z']
        );
        $dom = new simple_html_dom();
        $dom->load($html);
        foreach ($dom->find("tr") as $data) {
            $tds = $data->find("td");
            // Data rows have more than 8 columns; headers/fillers do not.
            if (count($tds) > 8) {
                $row++;
                $GLOBALS['rowTotal']++;
                $apt = array(
                    "Uniikkiavain" => $GLOBALS['rowTotal'],
                    "Kaupunginosa" => $tds[0]->plaintext,
                    "Myyntihinta" => $tds[3]->plaintext,
                    "Neliohinta" => $tds[4]->plaintext,
                    "Tyyppi" => $tds[1]->plaintext,
                    "Koko" => $tds[2]->plaintext
                );
                // Table name = city code + run timestamp. (The original
                // hid this assignment inside the argument list.)
                $table_name = $GLOBALS['c'] . " " . $GLOBALS['time'];
                scraperwiki::save_sqlite(null, $apt, $table_name);
                print $GLOBALS['rowTotal'] . "\n";
                print $row . ". Sijainti: " . $tds[0]->plaintext . " Hinta: " . $tds[3]->plaintext . " Tyyppi: " . $tds[1]->plaintext . " Koko: " . $tds[2]->plaintext . " Neliöhinta: " . $tds[4]->plaintext . "€" . "\n";
            }
        }
        // Exactly 50 rows means the page was full: advance to the next.
        if ($row == 50) {
            print "Vielä jatkuu, haetaan seuraava sivu..." . "\n";
            $GLOBALS['z']++;
        }
    } while ($row == 50);
    print "Skrääpiminen suoritettu." . "\n";
    print "Sivuja yhteensä: " . $GLOBALS['z'] . "\n";
    print "Rivejä yhteensä: " . $GLOBALS['rowTotal'] . "\n";
}
/**
 * Scrapes one yellowpages.co.id browse-by-letter listing page and echoes
 * the raw directory-list markup.
 *
 * When no explicit position is supplied, the last letter/page stored in
 * the scraperwiki variable store is used (falling back to 'a', page 1).
 * The unused `$alphabet = range('a', 'z')` local from the original has
 * been removed — it was never read.
 *
 * @param string $last_alphabet letter to browse ('' = resume from store)
 * @param string $last_page     page number ('' = resume from store)
 */
function scrap_yp($last_alphabet = '', $last_page = '') {
    // Resume from the saved letter when the caller did not supply one.
    if (is_null($last_alphabet) || $last_alphabet == '') {
        $temp_alphabet = scraperwiki::get_var('last_alphabet_loaded');
        if (!is_null($temp_alphabet)) {
            $last_alphabet = $temp_alphabet;
        } else {
            $last_alphabet = 'a';
        }
    }
    // Likewise resume from the saved page number, defaulting to page 1.
    if (is_null($last_page) || $last_page == '') {
        $temp_page = scraperwiki::get_var('last_page_loaded');
        if (!is_null($temp_page)) {
            $last_page = $temp_page;
        } else {
            $last_page = 1;
        }
    }
    $yp_base_url = 'http://www.yellowpages.co.id/browse/letter/' . $last_alphabet . '?page=' . $last_page;
    $html = scraperWiki::scrape($yp_base_url);
    $dom = new simple_html_dom();
    $dom->load($html);
    // Echo the directory list; nothing is persisted here.
    foreach ($dom->find("ul.directory-list") as $data) {
        echo $data;
    }
}
/**
 * Scrapes one page of musiklegal.com search results, saves every song
 * row keyed by 'No', then follows the 'Next' link recursively until no
 * further page exists (at which point the script exits, as the original
 * did).
 *
 * Fix: $tmp_a is initialised before the link loop, so a page without a
 * 'Next' anchor no longer raises an undefined-variable notice.
 *
 * @param int|string $q_num result-page offset to fetch
 */
function run_ml($q_num = 0) {
    $html = scraperWiki::scrape("http://musiklegal.com/search/result/a/" . $q_num);
    $dom = new simple_html_dom();
    $dom->load($html);
    foreach ($dom->find("tr") as $data) {
        $tds = $data->find("td");
        // The title cell embeds the detail link; strip the wrapping
        // markup away and split the remainder into code + title.
        $temp_data = explode('">', str_replace('</<strong>a</strong>>', '', str_replace('<<strong>a</strong> href="http://musiklegal.com/song/detail/', '', $tds[1]->plaintext)));
        $record = array(
            'No' => str_replace('.', '', $tds[0]->plaintext),
            'Code' => $temp_data[0],
            'Song Title' => $temp_data[1],
            'Artist' => $tds[2]->plaintext,
            'Album' => $tds[3]->plaintext
        );
        // Stores results.
        scraperwiki::save_sqlite(array("No"), $record);
        unset($temp_data);
    }
    // Default to 0 so pages without a 'Next' link terminate cleanly.
    $tmp_a = 0;
    foreach ($dom->find("a") as $a) {
        if ($a->plaintext == 'Next') {
            // The 'Next' href ends in the offset of the following page.
            $next = str_replace('http://musiklegal.com/search/result/a/', '', $a->href);
            if ($next > 0) {
                $tmp_a = $next;
            }
        }
    }
    if ((int) $tmp_a != 0) {
        run_ml($tmp_a);
    } else {
        exit;
    }
}
/**
 * Fetches a URL and returns its content parsed as a simple_html_dom
 * document.
 *
 * @param string $url page to fetch
 * @return simple_html_dom parsed document
 */
function get_dom($url) {
    $document = new simple_html_dom();
    $document->load(scraperWiki::scrape($url));
    return $document;
}
/**
 * Scrapes the Norwegian.no low-fare flight table and saves one record
 * per flight row via saveData().
 *
 * NOTE(review): $param is never used — the search URL is hard-coded.
 * NOTE(review): a second scrapeHTML() definition appears later in this
 * file; PHP forbids redeclaring a function, so only one of the two can
 * be loaded per run — verify which one is live.
 *
 * @param mixed  $param unused (see note above)
 * @param string $type  stored verbatim as 'flight_type'
 */
function scrapeHTML($param, $type) {
    $html = scraperWiki::scrape("http://www.norwegian.no/fly/lavpris/?D_City=CPH&A_City=DUB&TripType=2&D_Day=1&D_Month=201104&R_Day=1&R_Month=201104&AdultCount=1&ChildCount=0&InfantCount=0");
    $dom = new simple_html_dom();
    $dom->load($html);
    // Iterate over table rows and get flight details.
    foreach ($dom->find("TR[@HEIGHT='25']") as $data) {
        // Flight details.
        $tds = $data->find("div");
        $airline = removeSpaces($tds[0]->plaintext);
        $flight_type = $type;
        $flight_num = removeSpaces($tds[1]->plaintext);
        $destination = removeSpaces($tds[2]->plaintext);
        $time = removeSpaces($tds[3]->plaintext);
        $gate = removeSpaces($tds[4]->plaintext);
        $remarks = removeSpaces($tds[5]->plaintext);
        // Skip header row. Cheesy, but effective.
        if ($airline == "Airline") {
            continue;
        }
        // Set the date.
        $date = date("m.d.y");
        // Build up record to store.
        $flight_data = array("date" => $date, "airline" => $airline, "flight_type" => $flight_type, "flight_num" => $flight_num, "destination" => $destination, "time" => $time, "gate" => $gate, "remarks" => $remarks);
        // Save the record, keyed on date + airline + type + flight number.
        saveData(array("date", "airline", "flight_type", "flight_num"), $flight_data);
    }
    $dom->clear();
}
/**
 * Recursively scrapes a goonmetrics market-group page: descends into
 * every child market group (guarded against revisits by the global
 * $visitedIds list), then regex-parses the item rows on the current page
 * and saves itemId / name / weekVol / k6Stock into 'eve_goonmetrics'.
 *
 * NOTE(review): parsing HTML with regexes is fragile — any markup change
 * on the site silently breaks the capture-group indexes used below.
 *
 * @param string $url market-group page to scrape
 */
function scrapeMarketGroup($url) {
    global $visitedIds;
    $html = scraperWiki::scrape($url);
    // Flatten to one line so the dot-matches in the row regex behave.
    $html = str_replace("\n", "", $html);
    // Collect links to child market groups (capture 1 = group id).
    preg_match_all("|<a href=\"/importing/61000746/marketgroup/(\\d+?)/\">(.+?)</a>|s", $html, $matches, PREG_SET_ORDER);
    foreach ($matches as $match) {
        $groupId = $match[1];
        $groupName = html_entity_decode($match[2]);
        //echo $groupName."\n";
        // Recurse only into groups not seen before (cycle guard).
        if (!in_array($groupId, $visitedIds)) {
            $visitedIds[] = $groupId;
            scrapeMarketGroup("http://goonmetrics.com/importing/61000746/marketgroup/" . $groupId . "/");
        }
    }
    // Match item rows; captures used: 4 = itemId, 7 = item name,
    // 11 = weekly volume, 17 = stock level.
    preg_match_all("|<tr(.*?)>(.*?)<td(.*?)><a href=\"http://games.chruker.dk/eve_online/item.php\\?type_id=(.+?)\" target=\"_blank\">(.*?)<span class=\"dot\" onclick=\"CCPEVE.showMarketDetails\\((.*?)\\)\">(.+?)</span>(.*?)</td>(.*?)<td(.*?)>(.+?)</td>(.*?)<td(.*?)>(.*?)</td>(.*?)<td(.*?)>(.+?)</td>(.*?)<td(.*?)>(.*?)</td>(.*?)<td(.*?)>(.*?)</td>(.*?)<td(.*?)>(.*?)</td>(.*?)<td(.*?)>(.*?)</td>(.*?)<td(.*?)>(.*?)</td>(.*?)</tr>|s", $html, $matches, PREG_SET_ORDER);
    foreach ($matches as $match) {
        // Re-encode to UTF-8 only when the captured bytes are not already valid UTF-8.
        $item = array("itemId" => trim($match[4]), "name" => trim(mb_check_encoding($match[7], 'UTF-8') ? $match[7] : utf8_encode($match[7])), "weekVol" => trim(mb_check_encoding($match[11], 'UTF-8') ? $match[11] : utf8_encode($match[11])), "k6Stock" => trim(mb_check_encoding($match[17], 'UTF-8') ? $match[17] : utf8_encode($match[17])));
        // Strip thousands separators before saving.
        $item['weekVol'] = str_replace(",", "", $item['weekVol']);
        $item['k6Stock'] = str_replace(",", "", $item['k6Stock']);
        // Retry the save for up to ~100 minutes (600 x 10s) on failure;
        // errors are suppressed and swallowed deliberately (best effort).
        $saved = false;
        $delay = 0;
        while (!$saved && $delay < 600) {
            try {
                @scraperwiki::save_sqlite(array('itemId'), $item, 'eve_goonmetrics');
                $saved = true;
            } catch (Exception $e) {
                sleep(10);
                $delay++;
            }
        }
    }
}
/**
 * Scrapes the flight information table at $url and rebuilds $table_name
 * in SQLite with one row per flight, keyed by (date, row counter).
 *
 * @param string $url        page containing the #flight_info_area table
 * @param string $table_name SQLite table to (re)create
 */
function grep_munich($url, $table_name) {
    $page = scraperWiki::scrape($url);
    $rowIndex = 0;
    $parser = new simple_html_dom();
    $parser->load($page);
    // Start from a clean slate: drop any previous version of the table.
    scraperwiki::sqliteexecute("drop table if exists " . $table_name);
    scraperwiki::sqlitecommit();
    $infoArea = $parser->getElementById('flight_info_area');
    foreach ($infoArea->find('tr') as $tableRow) {
        $cells = $tableRow->find("td");
        // Header rows (and other short rows) lack the full column set.
        if (sizeof($cells) < 7) {
            continue;
        }
        $record = array(
            "date" => date("Y-m-d"),
            "count" => $rowIndex,
            "flightnr" => $cells[1]->plaintext,
            "from" => $cells[2]->plaintext,
            "time" => $cells[3]->plaintext,
            "expected_time" => $cells[4]->plaintext
        );
        // Persist this flight, keyed on date + position in the table.
        scraperwiki::save_sqlite(array("date", "count"), $record, $table_name);
        $rowIndex = $rowIndex + 1;
    }
}
/**
 * Scrapes a single card page from the Gatherer (gatherer.wizards.com)
 * and saves the card's fields (name, mana cost, type, text, flavor,
 * number, artist, rarity, set, image URL) into SQLite.
 *
 * The original repeated the same long selector prefix and the same
 * iconv() transcoding call for every field; both are factored into local
 * closures so each field is one line. Behavior is unchanged — values are
 * still transcoded UTF-8 -> ISO-8859-1//TRANSLIT exactly as before.
 *
 * @param string $url Gatherer card-detail URL
 */
function getCardInfo($url) {
    $baseURL = 'http://gatherer.wizards.com/Pages/Card/';
    $html = scraperWiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);
    // Every detail row's element id shares this ASP.NET control prefix.
    $prefix = 'ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_';
    // Transcode a value to Latin-1, as the original did for each field.
    // (Elements are coerced to strings by iconv, same as before.)
    $latin1 = function ($value) {
        return iconv("UTF-8", "ISO-8859-1//TRANSLIT", $value);
    };
    // Locate the value cell of a named detail row.
    $valueOf = function ($row) use ($dom, $prefix) {
        return $dom->find('div[id=' . $prefix . $row . '] div[class=value]', 0);
    };
    $cardImage = $dom->find('img[id=' . $prefix . 'cardImage]', 0)->src;
    $cardImage = str_replace("amp;", "", $cardImage);
    $imgURL = $baseURL . $cardImage;
    $name = $latin1($valueOf('nameRow')->plaintext);
    // Mana cost is a sequence of symbol images; the first letter of each
    // image's alt text encodes the symbol.
    $mana = "";
    $manaImages = $dom->find('div[id=' . $prefix . 'manaRow] div[class=value] img');
    foreach ($manaImages as $manaItem) {
        $mana .= substr($manaItem->alt, 0, 1);
    }
    $mana = $latin1($mana);
    $cmc = $latin1($valueOf('cmcRow'));
    $type = $latin1($valueOf('typeRow'));
    $text = $latin1($valueOf('textRow'));
    $flavor = $latin1($valueOf('flavorRow'));
    $cardNumber = $latin1($valueOf('numberRow'));
    $artist = $latin1($valueOf('artistRow'));
    $rarity = $latin1($valueOf('rarityRow'));
    $set = $latin1($valueOf('setRow'));
    scraperwiki::save_sqlite(array("card"), array("Card" => trim($name), "Image" => $imgURL, "Mana" => trim($mana), "CMC" => trim($cmc), "Type" => trim($type), "Card Text" => trim($text), "Flavor Text" => trim($flavor), "Artist" => trim($artist), "Card Number" => trim($cardNumber), "Rarity" => trim($rarity), "Expansion" => trim($set)));
}
/**
 * Finds the median asking price of an auto24.ee used-vehicle search.
 *
 * Loads the first results page, reads the total match count from the
 * paginator, and — when the median item falls beyond the first page —
 * re-queries with the offset parameter 'ak' so the page holding the
 * median is the one fetched. For an even-sized result set the median is
 * the rounded mean of the two middle rows' prices.
 *
 * Relies on the externally defined RESULTS_PER_PAGE constant and the
 * build_query()/get_row_data() helpers.
 *
 * @param array  $params query parameters for the listing search
 * @param string $url    listing endpoint
 * @return array array('n' => total results, 'val' => median price)
 */
function read_listing($params, $url = 'http://www.auto24.ee/kasutatud/nimekiri.php') {
    $endpoint = build_query($url, $params);
    $html = scraperWiki::scrape($endpoint);
    $dom = new simple_html_dom();
    $dom->load($html);
    // Total number of matches, as shown in the paginator's range label.
    $totalResultsEl = $dom->find('.paginator .current-range strong');
    $totalResults = $totalResultsEl[0]->plaintext;
    // 1-based (possibly fractional) position of the median item.
    $medianItem = ($totalResults + 1) / 2;
    if ($medianItem > RESULTS_PER_PAGE) {
        // Median not on page one: jump to the page containing it and
        // re-express the median position relative to that page.
        $listingOffset = floor($medianItem / RESULTS_PER_PAGE) * RESULTS_PER_PAGE;
        $params['ak'] = $listingOffset;
        $medianItem -= $listingOffset;
        $endpoint = build_query($url, $params);
        $html = scraperWiki::scrape($endpoint);
        $dom = new simple_html_dom();
        $dom->load($html);
    }
    $rows = $dom->find("[@id=usedVehiclesSearchResult] .result-row");
    // Row indexes (0-based) bracketing the median position.
    $lPoint = floor($medianItem) - 1;
    $hPoint = ceil($medianItem) - 1;
    $a24ksi = 0;
    if ($lPoint == $hPoint) {
        // Odd count: the median is a single row's price.
        $rowData = get_row_data($rows[$lPoint]);
        $a24ksi = $rowData['price'];
    } else {
        // Even count: average the two middle rows' prices.
        $lRowData = get_row_data($rows[$lPoint]);
        $hRowData = get_row_data($rows[$hPoint]);
        $a24ksi = round(($lRowData['price'] + $hRowData['price']) / 2);
    }
    return array('n' => $totalResults, 'val' => $a24ksi);
}
/**
 * Scrapes the flight table at BASE_URL for the given query type and
 * saves one record per flight row via saveData().
 *
 * @param mixed  $param appended to BASE_URL as the 'type' query value
 * @param string $type  stored verbatim as 'flight_type'
 */
function scrapeHTML($param, $type) {
    $page = scraperWiki::scrape(BASE_URL . "?type={$param}");
    $dom = new simple_html_dom();
    $dom->load($page);
    // Each flight sits in a 25px-high table row.
    foreach ($dom->find("TR[@HEIGHT='25']") as $row) {
        $cells = $row->find("div");
        $airline = removeSpaces($cells[0]->plaintext);
        $flight_type = $type;
        $flight_num = removeSpaces($cells[1]->plaintext);
        $destination = removeSpaces($cells[2]->plaintext);
        $time = removeSpaces($cells[3]->plaintext);
        $gate = removeSpaces($cells[4]->plaintext);
        $remarks = removeSpaces($cells[5]->plaintext);
        // The caption row repeats the column headings — skip it.
        if ($airline == "Airline") {
            continue;
        }
        $record = array(
            "date" => date("m.d.y"),
            "airline" => $airline,
            "flight_type" => $flight_type,
            "flight_num" => $flight_num,
            "destination" => $destination,
            "time" => $time,
            "gate" => $gate,
            "remarks" => $remarks
        );
        // Keyed on date + airline + type + flight number.
        saveData(array("date", "airline", "flight_type", "flight_num"), $record);
    }
    $dom->clear();
}
/**
 * Scrapes a news article page, concatenates its paragraphs (each ended
 * with a " #<n># " marker), builds teaser texts via word_teaser(), and
 * saves a record keyed by URL.
 *
 * Fixes from the original: $art_title/$art_subtitle are initialised so
 * pages without a headline do not raise undefined-variable notices, and
 * the large commented-out scratch code plus unused locals were removed.
 *
 * @param string $art_url article URL
 * @return array the saved record ('TeaserM', 'URL')
 */
function scrape_NG_news_article($art_url) {
    $html = scraperWiki::scrape($art_url);
    require_once 'scraperwiki/simple_html_dom.php';
    $dom = new simple_html_dom();
    $dom->load($html);
    // Default to empty strings in case the selectors match nothing.
    $art_title = "";
    $art_subtitle = "";
    foreach ($dom->find("div#page_head h1") as $data) {
        $art_title = $data->innertext;
    }
    foreach ($dom->find("div#page_head h2") as $data) {
        $art_subtitle = $data->innertext;
    }
    // Concatenate the article paragraphs, marking each boundary #<n>#.
    $art_paragraph_count = 0;
    $art_text_full = "";
    foreach ($dom->find("div#content div.article_text p") as $data) {
        $art_paragraph_count++;
        $tmp = str_get_html($data)->plaintext;
        $art_text_full .= $tmp . " #" . $art_paragraph_count . "# ";
    }
    // Short (60-word) and medium (120-word) teasers; only the medium one
    // is persisted, matching the original behavior.
    $art_teaserS = word_teaser($art_text_full, 60);
    $art_teaserM = word_teaser($art_text_full, 120);
    $record = array("TeaserM" => $art_teaserM, "URL" => $art_url);
    scraperwiki::save(array('URL'), $record);
    return $record;
}
/**
 * Populates $htmlDOM with the content of $src_link, using a local SQLite
 * table ('sources') as a page cache so the live site is not re-scraped
 * on every development run.
 *
 * On a cache miss (or when $upd_flag forces a refresh) the page is
 * scraped, loaded into the DOM, and its serialised form stored with a
 * timestamp; on a hit the cached dump is loaded instead.
 *
 * @param simple_html_dom $htmlDOM  DOM instance to load into (mutated)
 * @param string          $src_link URL being cached/scraped
 * @param bool            $upd_flag true = bypass cache and re-scrape
 * @return simple_html_dom the same, now-populated DOM instance
 */
function populateDOM($htmlDOM, $src_link, $upd_flag = false) {
    scraperwiki::sqliteexecute("CREATE TABLE IF NOT EXISTS sources (src_link TEXT PRIMARY KEY, timestamp DATETIME, src_dump TEXT)");
    echo "Checking local cache...<br>\n";
    $result = scraperwiki::sqliteexecute("SELECT src_link, timestamp, src_dump FROM sources WHERE src_link = :slnk", array("slnk" => $src_link));
    // Cache miss (no stored dump) or forced refresh: scrape live.
    if (empty($result->data[0][2]) || $upd_flag == true) {
        echo "No Cache for this site (or force-update flag given), scraping live site for local cache...<br>\n";
        // Load the site and save it locally so that we dont end up crawling their site a million times during development
        $source = scraperWiki::scrape($src_link);
        $htmlDOM->load($source);
        // Serialise the parsed DOM (not the raw fetch) into the cache.
        $save_source = $htmlDOM->save();
        echo "Scrape complete, storing into cache...<br>\n";
        scraperwiki::sqliteexecute("INSERT OR REPLACE INTO sources VALUES (:slnk, :stime, :sdmp)", array("slnk" => $src_link, "stime" => time(), "sdmp" => $save_source));
        scraperwiki::sqlitecommit();
        echo "Cache saved.<br>\n";
        echo "Populate DOM Complete.";
        return $htmlDOM;
    } else {
        // Cache hit: load the stored dump, reporting its age.
        echo "Using local cache, as cached data exists from '" . date(DATE_RFC822, $result->data[0][1]) . ".'<br>\n";
        echo "Loading...<br>\n";
        $htmlDOM->load($result->data[0][2]);
        echo "Populate DOM Complete.";
        return $htmlDOM;
    }
}
/**
 * Scrapes a product-listing page and echoes a link line for each product
 * detail anchor found in the products-list table.
 *
 * Fixes from the original: the scraped HTML (not the URL string) is fed
 * to the DOM parser — the original overwrote $html with the parser
 * object and then called load() on the URL itself — and the loop
 * variable is used consistently instead of the undefined $product_link.
 *
 * @param string $url listing page URL
 */
function scrapepage($url) {
    $html = scraperWiki::scrape($url);
    $dom = new simple_html_dom();
    // Parse the downloaded markup.
    $dom->load($html);
    foreach ($dom->find("table[@class='products-list'] tr td h2 a") as $menu_link) {
        echo "Link to Details: " . $menu_link->href . "<br>";
    }
}
/**
 * Looks up a team's next opponent on its fixtures page and records it in
 * the global $teams array under $teams[$team]['nextOpponent'].
 *
 * @param string $url  fixtures page URL
 * @param string $team key of the team inside the global $teams array
 */
function getDetails($url, $team) {
    global $teams;
    $dom = new simple_html_dom();
    $dom->load(scraperWiki::scrape($url));
    // The active row's title link names the upcoming opponent.
    foreach ($dom->find("tr.active") as $activeRow) {
        $titleLinks = $activeRow->find("td.title a");
        $teams[$team]['nextOpponent'] = $titleLinks[0]->plaintext;
    }
}
/**
 * Scrapes club names (anchors inside td.cw cells) from the page at $url
 * and saves each into SQLite keyed by club name.
 *
 * @param string $url page listing the clubs
 */
function scrapeTeams($url) {
    $dom = new simple_html_dom();
    $dom->load(scraperWiki::scrape($url));
    foreach ($dom->find('td.cw a') as $link) {
        $record = array('club' => $link->plaintext);
        scraperWiki::save_sqlite(array('club'), $record);
    }
}
/**
 * Scrapes the proxy-list table at $url, decodes each IP cell via
 * decode_ip(), and saves the addresses keyed by 'ip'.
 *
 * @param string $url proxy-list page URL
 */
function grab($url) {
    $dom = new simple_html_dom();
    $dom->load(scraperWiki::scrape($url));
    foreach ($dom->find("#tbl_proxy_list tr") as $row) {
        $cells = $row->find("td");
        // Only complete data rows carry exactly 7 columns.
        if (count($cells) != 7) {
            continue;
        }
        $address = decode_ip((string) $cells[0]);
        scraperwiki::save(array('ip'), array('ip' => $address));
    }
}
/**
 * Scrapes translated sentence pairs from the page at $url and saves each
 * pair with a scrape timestamp, source page and language tag.
 *
 * Each ul.trans_sent list holds two <li> items: the user's input
 * followed by the babelfish output. The original's dead `$michi`
 * string concatenation has been removed — it was never used.
 *
 * @param string $url  page to scrape
 * @param string $lang language tag stored with each record
 */
function gazelangs($url, $lang) {
    $html = scraperWiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);
    foreach ($dom->find("ul[@class='trans_sent']") as $data) {
        $tds = $data->find("li");
        $record = array(
            'user_input' => $tds[0]->plaintext,
            'babelfish_output' => $tds[1]->plaintext,
            'timestamp_scrape' => date("Y-m-d H:i:s"),
            'page' => $url,
            'language' => $lang
        );
        // print json_encode($record) . "\n";
        scraperwiki::save(array('user_input', 'babelfish_output', 'timestamp_scrape', 'page', 'language'), $record);
    }
}
/**
 * Scrapes name/count pairs from a results table and saves each row that
 * carries a non-empty count, keyed by name.
 *
 * @param string $url page containing td.nom / td.compte cells
 */
function scrapePage($url) {
    $dom = new simple_html_dom();
    $dom->load(scraperWiki::scrape($url));
    foreach ($dom->find('td.nom') as $nameCell) {
        $name = $nameCell->find('a', 0)->plaintext;
        // The count lives in a sibling cell of the same table row.
        $countCell = $nameCell->parent()->find('td.compte', 0);
        $count = $countCell->plaintext;
        // Rows with an empty/falsy count are skipped.
        if ($count) {
            scraperWiki::save_sqlite(array('name'), array('name' => $name, 'count' => $count));
        }
    }
}
/**
 * Scrapes a ticket page and saves origin / destination / 'pisah' fields
 * via saveData(), also publishing them through globals.
 *
 * NOTE(review): $id is reset to 0 and then incremented once on every
 * call, so every saved row gets id 1 — presumably meant to be a running
 * counter; verify against the caller.
 */
function getLinks($page) {
    global $destination, $id, $from_city, $pisah;
    $id = 0;
    $source = scraperWiki::scrape($page);
    $html = new simple_html_dom();
    $html->load($source);
    $id = $id + 1;
    // Cells with class 'ticketvalue' hold the ticket's field values;
    // indexes 0 / 5 / 2 are read as origin, destination and 'pisah'.
    $ticketvalues = $html->find("td[@class='ticketvalue']");
    $from_city = $ticketvalues[0]->plaintext;
    $destination = $ticketvalues[5]->plaintext;
    $pisah = $ticketvalues[2]->plaintext;
    $railway = array("id" => $id, "from_city" => $from_city, "destination" => $destination, "pisah" => $pisah);
    // Save the record.
    saveData(array("from_city", "destination", "pisah"), $railway);
}
/**
 * Scrapes parliamentary document listings (table #16 on the page),
 * saving each document's date, house, URL and type, then follows the
 * 'Next' pagination link recursively.
 *
 * NOTE(review): this file defines scrapepage() twice; PHP forbids
 * redeclaration, so only one definition can be loaded — verify which.
 * NOTE(review): $type is only assigned when an icon matches, so a row
 * with an unrecognised icon inherits the previous row's type (and the
 * first such row would use an undefined $type) — confirm intended.
 */
function scrapepage($url) {
    $html = scraperWiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);
    // The document listing is the 16th table on the page.
    $table = $dom->find("table");
    $rows = $table[15]->find("tr");
    foreach ($rows as $row) {
        $tds = $row->find('td');
        //print "\nnewrow\n";
        //print $row->plaintext;
        //print "\n";
        //print $tds[0]->height;
        //print "\nendrow\n";
        if (isset($tds[1])) {
            // Data rows are recognised by their 30px cell height.
            if ($tds[1]->height == 30) {
                //print $row->plaintext;
                $document = array();
                $document['date'] = get_date(date_create($tds[1]->plaintext));
                //if ($tds[1]->plaintext=='-') $document['date']='';
                //else $document['date']=get_date(date_create($tds[2]->plaintext));
                $document['house'] = $tds[3]->plaintext;
                //$document['language']=$tds[6]->plaintext;
                // Cell 5 holds the download link and a format icon.
                $link = $tds[5]->find('a');
                $img = $tds[5]->find('img');
                $document['url'] = 'http://www.parliament.gov.za/live/' . $link[0]->href;
                // Map the icon image to a file extension.
                if ($img[0]->src == 'images/icon_word.gif') {
                    $type = '.doc';
                }
                if ($img[0]->src == 'images/icon_pdf.gif') {
                    $type = '.pdf';
                }
                $document['type'] = $type;
                scraperwiki::save(array('url'), $document);
                //print_r($document);
                //print $row->plaintext;
            }
        }
    }
    //find next page to scrape
    $links = $dom->find("table[style=height:26px] a");
    foreach ($links as $link) {
        if ($link->plaintext == 'Next') {
            scrapepage('http://www.parliament.gov.za/live/' . $link->href);
        }
    }
}
/**
 * Scrapes a state's dashboard page into an associative record: each
 * table row contributes heading => value (headings are lower-cased and
 * underscored; unlabeled rows get synthetic 'extra_field_N' headings).
 * Links in selected rows are captured as *_url / email fields, and phone
 * numbers are extracted from the 'information' row.
 *
 * @param string $url dashboard page URL
 * @return array|string the assembled record, or the literal string
 *                      'hello' when no rows were found at all
 */
function get_state_data($url) {
    $html = scraperWiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);
    $count = 1;
    foreach ($dom->find("div[id=dashboard_main_content] tr") as $data) {
        $ths = $data->find("th");
        $tds = $data->find("td");
        $heading = $ths[0]->plaintext;
        $object = $tds[0];
        $value = $object->plaintext;
        // Rows with no <th> label get a synthetic numbered heading.
        if (!$heading) {
            $heading = "Extra Field {$count}:";
            $count++;
        }
        // Normalise "Some Label:" -> "some_label".
        $heading = substr($heading, 0, strpos($heading, ":"));
        $clean_heading = str_replace(' ', '_', strtolower($heading));
        $record[$clean_heading] = $value;
        // Selected rows may also carry links worth capturing.
        if ($clean_heading == 'official_name' || $clean_heading == 'information' || $clean_heading == 'governor') {
            foreach ($object->find("a") as $link) {
                $link = $link->href;
                $new_heading = $clean_heading . "_url";
                // mailto: links become the record's email address.
                if (substr($link, 0, 7) == 'mailto:') {
                    $record['email'] = substr($link, 7, strlen($link) - 7);
                } else {
                    $heading = "{$heading} URL";
                    $record[$new_heading] = $link;
                }
            }
        }
        // Pull up to two phone numbers out of the information text.
        if ($clean_heading == 'information') {
            $phone_numbers = get_phone_numbers($value);
            $record['phone_primary'] = $phone_numbers[0] ? $phone_numbers[0] : '';
            $record['phone_secondary'] = $phone_numbers[1] ? $phone_numbers[1] : '';
        }
    }
    if (!isset($record['email'])) {
        $record['email'] = null;
    }
    if (isset($record)) {
        return $record;
    } else {
        return 'hello';
    }
}
/**
 * Scrapes the PC(USA) mid-councils directory page and prints elements
 * related to named anchors.
 *
 * NOTE(review): the selector "<a" is not a normal simple_html_dom
 * selector, and iterating `foreach ($tds as $y)` over a single element
 * looks unintended — this function appears to be exploratory/debugging
 * code; confirm before relying on its output. Nothing is persisted.
 */
function scrape_pby_office() {
    $url = "http://oga.pcusa.org/section/departments/mid-councils/directory-a-m/";
    print "URL=" . $url . "\n";
    $html = scraperWiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);
    foreach ($dom->find("<a") as $tds) {
        // Only elements whose markup starts with a named anchor tag.
        if (substr($tds, 0, 8) == "<a name=") {
            foreach ($tds as $y) {
                print $y . "\n";
            }
            //print "tds=" . $tds . "\n";
            //break;
        }
    }
}
/**
 * Scrapes the specification page of each motherboard and saves the
 * interesting details (status, form factor, socket, and which video
 * outputs — HDMI/VGA/DisplayPort — the board exposes) into SQLite.
 *
 * Fix: $output is initialised up front, so an empty input list returns
 * an empty array instead of referencing an undefined variable.
 *
 * @param array $motherboards rows with at least 'Name' and 'URI' keys
 * @return array the detail records that were saved, in input order
 */
function ProductInfo($motherboards) {
    $output = array();
    foreach ($motherboards as $mobo) {
        $html = scraperWiki::scrape($mobo['URI']);
        $dom = new simple_html_dom();
        $dom->load($html);
        $specs = $dom->find('div#specifications');
        // The graphics-output cell lists the available video connectors.
        $video = $specs[0]->find('tr#GraphicsOutput td', 1)->plaintext;
        $hdmi = preg_match('/hdmi/', strtolower($video));
        $vga = preg_match('/vga/', strtolower($video));
        $dp = preg_match('/dp|displayport|display[ ]port/', strtolower($video));
        $details = array(
            'Name' => $mobo['Name'],
            'URI' => $mobo['URI'],
            'Status' => $specs[0]->find('div#infosectionessentials tr', 1)->find('td', 1)->plaintext,
            'Form factor' => $specs[0]->find('tr#FormFactor td', 1)->plaintext,
            'Socket' => $specs[0]->find('tr#SupportedCPUSocket td', 1)->plaintext,
            'HDMI' => $hdmi,
            'VGA' => $vga,
            'DP' => $dp
        );
        scraperwiki::save_sqlite(array('Name'), $details);
        $output[] = $details;
    }
    return $output;
}
/**
 * Scrapes one page of the DirectGov job-search results and saves each
 * advert keyed by its numeric id.
 *
 * @param int|string $page results page number to fetch
 */
function scrape_job_page($page) {
    $dom = new simple_html_dom();
    $dom->load(scraperWiki::scrape("https://jobsearch.direct.gov.uk/JobSearch/PowerSearch.aspx?tm=0&pg=" . $page));
    foreach ($dom->find("table tr") as $row) {
        $cells = $row->find("td");
        // Job rows have exactly five columns; skip everything else.
        if (count($cells) != 5) {
            continue;
        }
        // The first cell's named anchor carries the advert's numeric id.
        $anchor = $cells[0]->find('a[name]', 0);
        $detailsUrl = $cells[2]->find('a', 0)->href;
        print $detailsUrl;
        $record = array(
            'id' => intval($anchor->name),
            'posted_date' => date_create($cells[0]->plaintext),
            'job_title' => trim($cells[2]->plaintext),
            'company' => trim($cells[3]->plaintext),
            'location' => trim($cells[4]->plaintext),
            'url' => $detailsUrl
        );
        //print json_encode($record) . "\n";
        scraperwiki::save(array('id'), $record);
    }
    $dom->__destruct();
}
/**
 * Walks the first 20 Alexa top-sites pages for Italy, saving each site
 * label keyed by 'site' and printing how many records were stored.
 */
function topSites() {
    $country = 'IT';
    $limit = 20;
    $count = 0;
    for ($page = 0; $page < $limit; $page++) {
        $dom = new simple_html_dom();
        $dom->load(scraperWiki::scrape("http://www.alexa.com/topsites/countries;" . $page . "/" . $country));
        // Each ranked site is labelled with this span class.
        foreach ($dom->find("span[class=topsites-label]") as $label) {
            scraperwiki::save(array('site'), array('site' => $label->plaintext));
            $count++;
        }
    }
    print $count;
}
/**
 * Scrapes the per-municipality waste-separation table for one ATO code
 * and saves each municipality's name with its numeric id.
 *
 * @param string $p_atoCODE ATO area code appended to the query string
 */
function getCitieListByATO($p_atoCODE = "") {
    $dom = new simple_html_dom();
    $dom->load(scraperWiki::scrape("http://www.rifiutiebonifica.puglia.it/dettaglio_differenziata.php?ato=" . $p_atoCODE . "&data=12"));
    foreach ($dom->find("table tr") as $row) {
        $cells = $row->find("td");
        $anchors = $row->find("a");
        // Rows without a link carry no municipality entry.
        if (!isset($anchors[0])) {
            continue;
        }
        // The id sits in the link's query string, before the last '&'.
        $link = str_replace("dettaglio_trasmissione.php?IdComune=", "", $anchors[0]->href);
        $id = substr($link, 0, strrpos($link, "&"));
        $record = array('comune' => $cells[0]->plaintext, 'id' => $id);
        scraperwiki::save(array('id'), $record);
    }
}
/**
 * Scrapes a gallery detail page and saves the gallery's contact record
 * (name, contact person, url, email, address, phones, artists).
 *
 * @param string $url gallery detail page URL
 */
function loadPageGallery($url) {
    $domGallery = new simple_html_dom();
    $domGallery->load(scraperWiki::scrape($url));
    foreach ($domGallery->find("div#contentDetail1") as $detail) {
        $titles = $detail->find("h3");
        $addressLines = $detail->find('.adres');
        $links = $detail->find('.adres a');
        $artistList = $detail->find('.artists');
        // The first address block's first line is the contact name.
        $nameParts = explode("\n", $addressLines[0]->plaintext);
        $contactNameGallery = $nameParts[0];
        // The fifth address block carries up to two phone numbers.
        $phoneParts = explode("\n", $addressLines[4]->plaintext);
        $tel1 = $phoneParts[0];
        $tel2 = $phoneParts[1];
        $record = array(
            'name' => $titles[0]->plaintext,
            'contact' => $contactNameGallery,
            'url' => $links[0]->plaintext,
            'email' => $links[1]->plaintext,
            'address' => $addressLines[1]->plaintext,
            'tel1' => $tel1,
            'tel2' => $tel2,
            'artists' => $artistList[0]->plaintext
        );
        scraperwiki::save(array('name', 'contact', 'url', 'email', 'address', 'tel1', 'tel2', 'artists'), $record);
    }
}
/**
 * Scrapes the Arengufond news listing and prints each item's title.
 * Body cells are located but not printed or persisted (the save call
 * was disabled in the original).
 */
function scrapeArengufond() {
    $dom = new simple_html_dom();
    $dom->load(scraperWiki::scrape("http://www.arengufond.ee/news/"));
    foreach ($dom->find('div.template') as $item) {
        $headings = $item->find('h2');
        // Only print items that actually carry a headline.
        if (count($headings) > 0) {
            print $headings[0]->plaintext . "\n";
        }
        $newsBody = $item->find('td');
    }
}
/**
 * Fetches one page of results for a category, saving every h2-heading
 * link keyed by (cat, url), and reports how many were found.
 *
 * @param string $url     results endpoint (page appended as ?page=N)
 * @param int    $pagenum page number to fetch
 * @param string $cat     category tag stored with each link
 */
function getPageOfResults($url, $pagenum, $cat) {
    $dom = new simple_html_dom();
    $dom->load(scraperWiki::scrape($url . "?page=" . $pagenum));
    $count = 0;
    foreach ($dom->find("h2 a") as $link) {
        echo $link->href . "\n";
        // if (alreadyKnown($cat, $link->href)) return;
        $count++;
        scraperwiki::save(array('cat', 'url'), array('cat' => $cat, 'url' => $link->href));
    }
    echo "got " . $count . " results\n";
    if ($count === 0) {
        return;
    }
    // Recursion into the next page is intentionally disabled.
    // getPageOfResults($url, $pagenum+1, $cat);
}
/**
 * Scrapes a Minnesota DNR overview listing and saves an id / name /
 * link / thumbnail record for every populated row, keyed by id.
 *
 * @param string $url  overview page URL
 * @param string $type type slug used as the id prefix and stored field
 */
function data_from_overview_page($url, $type) {
    $dom = new simple_html_dom();
    $dom->load(scraperWiki::scrape($url));
    $base_url = 'http://www.dnr.state.mn.us';
    foreach ($dom->find(".paddingbig table tr table tr") as $tr) {
        $cells = $tr->find("td");
        $imageLinks = $cells[0]->find("a");
        $thumbs = $cells[0]->find("img");
        $textLinks = $cells[1]->find("a");
        $name = $textLinks[0]->plaintext;
        // Skip filler rows whose first cell is empty.
        if (empty($cells[0]->plaintext)) {
            continue;
        }
        $record = array(
            // Slugged id: "<type>--<lower-cased-hyphenated-name>".
            'id' => $type . '--' . strtolower(str_replace(' ', '-', $name)),
            'type' => $type,
            'name' => $name,
            'link' => !empty($imageLinks[0]->href) ? $base_url . $imageLinks[0]->href : '',
            'thumb_url' => !empty($thumbs[0]->src) ? $thumbs[0]->src : '',
            'timestamp' => time()
        );
        scraperwiki::save(array('id'), $record);
    }
}