function parseModelsPage($brandId, $brandName, $page) {
    // Scrape one GSMArena model-listing page for a brand, saving each model,
    // then follow the "Next page" link recursively.
    $html_content = scraperwiki::scrape($page);
    $this->html = str_get_html($html_content);

    // Each anchor under div.makers is one phone model.
    foreach ($this->html->find("div.makers a") as $anchor) {
        $thumb = $anchor->find('img', 0);
        $hrefParts = explode('-', $anchor->href);

        $model = array();
        $model['name'] = $brandName . ' ' . $anchor->find('strong', 0)->innertext;
        $model['img'] = $thumb->src;
        $model['link'] = 'http://www.gsmarena.com/' . $anchor->href;
        $model['desc'] = $thumb->title;
        // href looks like "<slug>-<id>.php"; drop the trailing ".php" to get the id.
        $model['id'] = (int) substr($hrefParts[1], 0, -4);
        $model['brand_id'] = $brandId;

        scraperwiki::save_sqlite(array("id" => $model['id']), $model, "cell_model");
        $this->models++;
    }

    // Recurse into the next page of results, if the pager offers one.
    $pagination = $this->html->find("div.nav-pages", 0);
    if ($pagination) {
        $next = $pagination->lastChild();
        if ($next && $next->title == "Next page") {
            $this->parseModelsPage($brandId, $brandName, 'http://www.gsmarena.com/' . $next->href);
        }
    }

    // Explicit destruct works around simple_html_dom's circular-reference leak.
    $this->html->__destruct();
}
function scrapeTEDRSS($url, $sector) {
    // Download the TED RSS feed for one sector and save one record per <item>.
    print $url . " " . $sector . "\n";
    // Manual cURL fetch (instead of scraperWiki::scrape) so redirects,
    // SSL verification and the timeout can be controlled explicitly.
    $curl = curl_init($url);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($curl, CURLOPT_MAXREDIRS, 10);
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($curl, CURLOPT_TIMEOUT, 20); // abort after 20 seconds
    $xml = curl_exec($curl);
    print curl_error($curl) . "\n";
    // BUG FIX: the cURL handle was never released; close it once the
    // body and error message have been read.
    curl_close($curl);
    $dom = new simple_html_dom();
    $dom->load($xml);
    $items = $dom->find("item");
    foreach ($items as $item) {
        $guid = $item->find("guid");
        // The GUID links to the TEXT view; the DATA view is what we record.
        $noticeURL = str_replace("TEXT", "DATA", $guid[0]->plaintext);
        print $noticeURL . " " . $sector . " " . memory_get_usage() / 1000000 . "MB";
        echo "\n";
        $record = array('time' => microtime(true), 'sector' => $sector, 'url' => $noticeURL);
        scraperwiki::save(array('sector', 'url'), $record);
        sleep(1); // be polite to the server
    }
    // Explicit teardown works around simple_html_dom memory leaks.
    $dom->__destruct();
    unset($items);
    unset($dom);
    unset($xml);
    print memory_get_usage() / 1024 / 1024 . "MB\n";
}
function saveIt($txt) {
    // Persist one joke, keyed by a monotonically increasing global counter.
    global $joke_count;
    $joke_count++;
    scraperwiki::save(
        array('JOKE_ID'),
        array('JOKE_ID' => $joke_count, 'JOKE_TEXT' => $txt)
    );
}
function scrapeMarketGroup($url) {
    // Recursively scrape a goonmetrics.com market-group page: first descend
    // into every sub-group link not yet visited (tracked in $visitedIds),
    // then parse the item rows on this page into the eve_goonmetrics table.
    global $visitedIds;
    $html = scraperWiki::scrape($url);
    // Collapse newlines so the row regexes below can match across lines.
    $html = str_replace("\n", "", $html);
    // Sub-group links look like /importing/61000746/marketgroup/<id>/ .
    preg_match_all("|<a href=\"/importing/61000746/marketgroup/(\\d+?)/\">(.+?)</a>|s", $html, $matches, PREG_SET_ORDER);
    foreach ($matches as $match) {
        $groupId = $match[1];
        $groupName = html_entity_decode($match[2]);
        //echo $groupName."\n";
        // Skip groups already visited (the page tree contains repeats).
        if (!in_array($groupId, $visitedIds)) {
            $visitedIds[] = $groupId;
            scrapeMarketGroup("http://goonmetrics.com/importing/61000746/marketgroup/" . $groupId . "/");
        }
    }
    // One regex per item <tr>; capture groups consumed below:
    //   4 = item type id, 7 = item name, 11 = weekly volume, 17 = stock level.
    preg_match_all("|<tr(.*?)>(.*?)<td(.*?)><a href=\"http://games.chruker.dk/eve_online/item.php\\?type_id=(.+?)\" target=\"_blank\">(.*?)<span class=\"dot\" onclick=\"CCPEVE.showMarketDetails\\((.*?)\\)\">(.+?)</span>(.*?)</td>(.*?)<td(.*?)>(.+?)</td>(.*?)<td(.*?)>(.*?)</td>(.*?)<td(.*?)>(.+?)</td>(.*?)<td(.*?)>(.*?)</td>(.*?)<td(.*?)>(.*?)</td>(.*?)<td(.*?)>(.*?)</td>(.*?)<td(.*?)>(.*?)</td>(.*?)<td(.*?)>(.*?)</td>(.*?)</tr>|s", $html, $matches, PREG_SET_ORDER);
    foreach ($matches as $match) {
        // Re-encode captured text as UTF-8 only when it is not already valid.
        // NOTE(review): utf8_encode() assumes ISO-8859-1 input and is
        // deprecated as of PHP 8.2 — consider mb_convert_encoding().
        $item = array("itemId" => trim($match[4]), "name" => trim(mb_check_encoding($match[7], 'UTF-8') ? $match[7] : utf8_encode($match[7])), "weekVol" => trim(mb_check_encoding($match[11], 'UTF-8') ? $match[11] : utf8_encode($match[11])), "k6Stock" => trim(mb_check_encoding($match[17], 'UTF-8') ? $match[17] : utf8_encode($match[17])));
        // Strip thousands separators before saving.
        $item['weekVol'] = str_replace(",", "", $item['weekVol']);
        $item['k6Stock'] = str_replace(",", "", $item['k6Stock']);
        // Retry failed saves every 10s, up to 600 attempts (~100 minutes).
        // NOTE(review): the @ operator suppresses warnings from the save; the
        // catch only fires on thrown exceptions, so suppressed errors are lost.
        $saved = false;
        $delay = 0;
        while (!$saved && $delay < 600) {
            try {
                @scraperwiki::save_sqlite(array('itemId'), $item, 'eve_goonmetrics');
                $saved = true;
            } catch (Exception $e) {
                sleep(10);
                $delay++;
            }
        }
    }
}
function do_day($rec) {
    // Fetch the day page given by $rec['url'] and print the text lines found
    // in the parent of the <a name=discs> anchor.
    $html = scraperwiki::scrape($rec['url']);
    $dom = new simple_html_dom();
    $dom->load($html);
    $cell = $dom->find('a[name=discs]');
    $lines = $cell[0]->parent->find('text');
    print $lines[10] . "\n";
    print count($lines) . "\n";
    // Iterate by index rather than foreach: null entries would end a foreach early.
    $n = 0;
    for ($line_no = 0; $line_no < count($lines); $line_no++) {
        $line = $lines[$line_no];
        // The DOM object crashes on 3-character rows, so skip them.
        if (strlen($line) == 3) {
            continue;
        }
        print $line_no . " " . strlen($line) . "\n";
        $n = $n + 1;
        print $line . "\n";
    }
}
function scrapPage($page) {
    // POST the GEIPAN case-search form for the given result page number and
    // save every case row found in the returned table.
    print "Scraping page " . $page;
    $url = "http://www.geipan.fr/index.php?id=202";
    $fields_string = "&no_cache=1&" . "tx_geipansearch_pi1%5Bsubmit_form%5D=1&" . "tx_geipansearch_pi1%5Btexte_resume%5D=&" . "tx_geipansearch_pi1%5Bdate_debut%5D=&" . "tx_geipansearch_pi1%5Bdate_fin%5D=&" . "no_cache=1&" . "tx_geipansearch_pi1%5Bclasse_cas%5D=tous&" . "tx_geipansearch_pi1%5Bregion%5D=&" . "page=" . $page . "&" . "order_by=&" . "sens=";
    $curl = curl_init($url);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($curl, CURLOPT_MAXREDIRS, 10);
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($curl, CURLOPT_TIMEOUT, 20);
    // BUG FIX: CURLOPT_POST expects a boolean; it was being set to 11.
    curl_setopt($curl, CURLOPT_POST, true);
    curl_setopt($curl, CURLOPT_POSTFIELDS, $fields_string);
    $html = curl_exec($curl);
    print curl_error($curl) . "\n";
    // BUG FIX: release the cURL handle (it was leaked on every call).
    curl_close($curl);
    $dom = new simple_html_dom();
    $dom->load($html);
    $trs = $dom->find("tr");
    foreach ($trs as $tr) {
        // Case rows carry an onclick containing "cas=<13-char-ID>".
        if (isset($tr->attr['onclick'])) {
            $ID = substr($tr->attr['onclick'], strpos($tr->attr['onclick'], "cas=") + 4, 13);
            print $ID . "\n";
            $tds = $tr->find("td");
            $title = utf8_encode($tds[0]->plaintext);
            $date = $tds[1]->plaintext;
            $departement = utf8_encode($tds[2]->plaintext);
            $classe = $tds[3]->plaintext;
            $maj = $tds[4]->plaintext;
            // City is the title text up to (but excluding) the opening parenthesis.
            $city = substr($title, 0, strpos($title, "(") - 1);
            $record = array('ID' => $ID, 'title' => $title, 'date' => $date, 'departement' => $departement, 'classe' => $classe, 'maj' => $maj, 'city' => $city);
            scraperwiki::save(array('ID', 'maj'), $record);
        }
    }
}
function scrape_page() {
    // Fetch one result page of asuntojen.hintatiedot.fi using the global
    // search parameters, save every data row, and recurse while pages are full.
    $rowsOnPage = 0;
    $html = scraperWiki::scrape("http://asuntojen.hintatiedot.fi/haku/?c=" . $GLOBALS['c'] . "&s=" . $GLOBALS['s'] . "&r=" . $GLOBALS['r'] . "&amin=" . $GLOBALS['amin'] . "&amax=" . $GLOBALS['amax'] . "&z=" . $GLOBALS['z']);
    $dom = new simple_html_dom();
    $dom->load($html);

    foreach ($dom->find("tr") as $tr) {
        $tds = $tr->find("td");
        // Data rows have more than 8 columns; header/filler rows do not.
        if (count($tds) > 8) {
            $rowsOnPage++;
            $GLOBALS['rowTotal']++;
            $apt = array(
                "Uniikkiavain" => $GLOBALS['rowTotal'],
                "Kaupunginosa" => $tds[0]->plaintext,
                "Myyntihinta" => $tds[3]->plaintext,
                "Neliohinta" => $tds[4]->plaintext,
                "Tyyppi" => $tds[1]->plaintext,
                "Koko" => $tds[2]->plaintext,
            );
            scraperwiki::save_sqlite(null, $apt, $table_name = $GLOBALS['c'] . " " . $GLOBALS['time']);
            print $GLOBALS['rowTotal'] . "\n";
            print $rowsOnPage . ". Sijainti: " . $tds[0]->plaintext . " Hinta: " . $tds[3]->plaintext . " Tyyppi: " . $tds[1]->plaintext . " Koko: " . $tds[2]->plaintext . " Neliöhinta: " . $tds[4]->plaintext . "€" . "\n";
        }
    }

    // A full page holds exactly 50 rows; if full, advance the page counter
    // and recurse, otherwise print the final totals.
    if ($rowsOnPage == 50) {
        print "Vielä jatkuu, haetaan seuraava sivu..." . "\n";
        $GLOBALS['z']++;
        scrape_page();
    } else {
        print "Skrääpiminen suoritettu." . "\n";
        print "Sivuja yhteensä: " . $GLOBALS['z'] . "\n";
        print "Rivejä yhteensä: " . $GLOBALS['rowTotal'] . "\n";
    }
}
function clubURL($url) {
    // Scrape a club's season-by-season attendance table and save one record
    // per year; also appends the club's name to the global club list.
    $html = scraperwiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);
    $clubName = trim(str_replace(' ', '', $dom->find('table', 0)->find('tr', 2)->plaintext));
    $formatClubName = trim(preg_replace('/\\s+/', ' ', $clubName));
    // BUG FIX: the superglobal is $GLOBALS; "$_GLOBAL" silently created an
    // unrelated local array so the club list was never actually populated.
    $GLOBALS['clubs'][] = $formatClubName;
    echo 'running ' . $formatClubName . "\n";
    foreach ($dom->find('table', 2)->find('tr') as $row) {
        // Season rows begin with a numeric year cell.
        if (is_numeric($row->find('td', 0)->plaintext)) {
            $year = trim($row->find('td', 0)->plaintext);
            $position = trim(str_replace(' ', '', $row->find('td', 1)->plaintext));
            if (trim($position) == 'Champion') {
                $position = 1;
            }
            $leagueLevel = trim($row->find('td', 2)->plaintext);
            $overallPosition = trim($row->find('td', 3)->plaintext);
            // Attendance figures use '.' as a thousands separator.
            $avgAttendance = trim(str_replace('.', '', $row->find('td', 4)->plaintext));
            $totalAttendance = trim(str_replace('.', '', $row->find('td', 12)->plaintext));
            $dataset = array('club' => $formatClubName, 'year' => $year, 'finishedPosition' => $position, 'league' => $leagueLevel, 'overallPosition' => $overallPosition, 'avgAttendance' => $avgAttendance, 'totalAttendance' => $totalAttendance);
            scraperwiki::save(array('club', 'year'), $dataset);
        }
    }
    /*
     * The next two lines stop a memory leak in Simple HTML DOM as per
     * http://simplehtmldom.sourceforge.net/manual_faq.htm#memory_leak
     */
    $dom->clear();
    unset($dom);
}
function grep_munich($url, $table_name) {
    // Scrape the Munich airport flight board at $url and rebuild $table_name
    // from scratch with one row per flight.
    $html = scraperWiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);

    // Start from a clean slate: drop any previous snapshot of this table.
    scraperwiki::sqliteexecute("drop table if exists " . $table_name);
    scraperwiki::sqlitecommit();

    $count = 0;
    $table = $dom->getElementById('flight_info_area');
    foreach ($table->find('tr') as $row) {
        $tds = $row->find("td");
        // Rows with fewer than 7 cells are headers/separators — skip them.
        if (sizeof($tds) < 7) {
            continue;
        }
        // Key each flight by today's date plus its position in the table.
        $flight_data = array(
            "date" => date("Y-m-d"),
            "count" => $count,
            "flightnr" => $tds[1]->plaintext,
            "from" => $tds[2]->plaintext,
            "time" => $tds[3]->plaintext,
            "expected_time" => $tds[4]->plaintext,
        );
        scraperwiki::save_sqlite(array("date", "count"), $flight_data, $table_name);
        $count = $count + 1;
    }
}
function scrap_yp($last_alphabet = '', $last_page = '') {
    // Resume scraping yellowpages.co.id from the last saved letter/page
    // (stored via scraperwiki vars), defaulting to letter 'a', page 1.
    $alphabet = range('a', 'z');
    if (is_null($last_alphabet) || $last_alphabet == '') {
        $stored = scraperwiki::get_var('last_alphabet_loaded');
        $last_alphabet = is_null($stored) ? 'a' : $stored;
    }
    if (is_null($last_page) || $last_page == '') {
        $stored = scraperwiki::get_var('last_page_loaded');
        $last_page = is_null($stored) ? 1 : $stored;
    }
    $listing_url = 'http://www.yellowpages.co.id/browse/letter/' . $last_alphabet . '?page=' . $last_page;
    $dom = new simple_html_dom();
    $dom->load(scraperWiki::scrape($listing_url));
    // Echo each directory list found on the page.
    foreach ($dom->find("ul.directory-list") as $listing) {
        echo $listing;
    }
}
function getIngredients($html) {
    // From a Food Network search-result page ($html), follow the second
    // a.callout link to the recipe page and save every listed ingredient.
    $dom = new simple_html_dom();
    $dom->load($html);
    $res = $dom->find('a[class=callout]', 1)->href;
    // The link points at the reviews page; strip "reviews/" to reach the recipe.
    $res = str_replace("reviews/", "", $res);
    echo "http://www.foodnetwork.com" . $res;
    $html1 = scraperwiki::scrape("http://www.foodnetwork.com" . $res);
    $h = str_get_html($html1);
    echo "\n\n";
    // BUG FIX: $href was read before any assignment when the first iteration
    // found no href; initialise it so the save never references an undefined
    // variable. (Also removed the unused $i counter and $domFoods object.)
    $href = null;
    foreach ($h->find('li[class=ingredient]') as $data) {
        $ingredient = $data->plaintext;
        if (isset($h->href)) {
            $href = $h->href;
        }
        scraperwiki::save(array('ing'), array('ing' => $ingredient, 'href' => $href));
    }
}
function run_ml($q_num = 0) {
    // Scrape one result page of musiklegal.com's search, save each row to
    // sqlite, then recurse into the next page number taken from the "Next"
    // pagination link.
    $html = scraperWiki::scrape("http://musiklegal.com/search/result/a/" . $q_num);
    $dom = new simple_html_dom();
    $dom->load($html);
    foreach ($dom->find("tr") as $data) {
        $tds = $data->find("td");
        // Column 1 mixes anchor markup into the plaintext; strip the anchor
        // pieces and split on '">' to separate the song code from the title.
        $temp_data = explode('">', str_replace('</<strong>a</strong>>', '', str_replace('<<strong>a</strong> href="http://musiklegal.com/song/detail/', '', $tds[1]->plaintext)));
        $record = array('No' => str_replace('.', '', $tds[0]->plaintext), 'Code' => $temp_data[0], 'Song Title' => $temp_data[1], 'Artist' => $tds[2]->plaintext, 'Album' => $tds[3]->plaintext);
        /*
         * Stores results
         */
        scraperwiki::save_sqlite(array("No"), $record);
        unset($temp_data);
    }
    // Find the "Next" pagination link and extract its target page number.
    // NOTE(review): 'continue' only advances to the next anchor, so $tmp_a
    // keeps the last matching link's number. If no "Next" link exists,
    // $tmp_a is undefined below and the function falls through to exit.
    foreach ($dom->find("a") as $a) {
        if ($a->plaintext == 'Next') {
            $tmp_a = $a->href;
            $tmp_a = str_replace('http://musiklegal.com/search/result/a/', '', $tmp_a);
            if ($tmp_a > 0) {
                continue;
            }
        }
    }
    // Recurse while a positive next-page number was found; otherwise stop
    // the whole script.
    if ((int) $tmp_a != 0) {
        run_ml($tmp_a);
    } else {
        exit;
    }
}
function getCategories($u) {
    // Recursively walk category pages, writing each category's name, path and
    // URL either to the CSV handle $f (when $local is truthy) or to sqlite.
    global $baseurl, $f;
    $path = "";
    $d = new simple_html_dom();
    $d->load(scraperwiki::scrape($u));
    echo "Loaded URL: " . $u . "\n";
    // Only pages with the category-facet container hold sub-categories.
    if ($d->find('div[id=ctl00_cphContent_gsaCatFacetContainer]')) {
        // Build the breadcrumb path "A/B/.../leaf" for this page.
        $breadcrumb = $d->find('div[id=breadcrumb]', 0);
        //foreach($breadcrumb as $b) {
        //echo "Breadcrumb = " . $b;}
        if (!is_null($breadcrumb)) {
            foreach ($breadcrumb->children() as $crumb) {
                $path .= trim($crumb->innertext) . "/";
            }
            // Append the text after the last ">" (the current page's name).
            $path .= trim(strrchr($breadcrumb->innertext, ">"), "> ");
        }
        foreach ($d->find('div[id=ctl00_cphContent_gsaCatFacetContainer]', 0)->find('div[class=S2refinementsContainer]', 0)->children() as $div) {
            // Each child div holds one sub-category link; the name precedes
            // a parenthesised count.
            $name = trim(strstr($div->children(0)->innertext, "(", true));
            $url = $baseurl . $div->children(0)->href;
            $data = array("Name" => $name, "Path" => $path, "URL" => $url);
            echo $path . "/" . $name . "\n";
            // NOTE(review): $local is not in the global statement above, so it
            // is always undefined (falsy) here and the sqlite branch is always
            // taken. Add it to "global" if CSV output is intended.
            if ($local) {
                fputcsv($f, array($name, $path, $url));
            } else {
                scraperwiki::save_sqlite(array("URL"), $data);
            }
            getCategories($url);
        }
    }
}
function scrapeDetails($ngo) {
    // Load the NGO's detail page ($ngo["url"]) and pull out website/email
    // plus a fixed set of labelled facts; returns the enriched $ngo array.
    $dom = new simple_html_dom();
    $dom->load(scraperwiki::scrape($ngo["url"]));

    // Labels still to be located on the page.
    $infosWeWant = array('Telefon', 'Rechtsform', 'Steuerstatus', 'Weltanschauliche Ausrichtung', 'Anzahl Mitarbeiter', 'Gesamteinnahmen:', 'Davon Sammlungseinnahmen', 'Bezugsjahr:');

    foreach ($dom->find('p') as $p) {
        // Website and email live in anchors inside their paragraphs.
        if (strstr($p->plaintext, "Website")) {
            $ngo["website"] = $p->find('a', 0)->href;
        }
        if (strstr($p->plaintext, "Email")) {
            $ngo["email"] = $p->find('a', 0)->plaintext;
        }
        foreach ($infosWeWant as $key => $info) {
            $res = extractInfo($p, $info);
            if ($res) {
                $ngo[$info] = $res;
                // Found — don't search for this label on later paragraphs.
                unset($infosWeWant[$key]);
            }
        }
    }
    print_r($ngo);
    return $ngo;
}
function getProducts($u, $cat) {
    // Write every product on this listing page to the global CSV handle $o,
    // then follow the "next" pagination link recursively.
    global $o;
    $d = new simple_html_dom();
    $d->load(scraperwiki::scrape($u));
    $items = $d->find('li.grid-item');
    if (count($items) > 0) {
        foreach ($items as $item) {
            $link = $item->find('p.product-name > a', 0);
            $prodname = trim($link->innertext);
            // Products that show a "minimal price" are flagged with type 1.
            $prodtype = is_null($item->find('p.minimal-price', 0)) ? 0 : 1;
            fputcsv($o, array($prodname, $prodtype, $cat, $link->href));
            echo $prodname . "\n";
        }
        $next = $d->find('p.next', 0);
        if (!is_null($next)) {
            getProducts($next->href, $cat);
        }
    }
}
function kcci($uuid) {
    // Build a member profile from the second table on the KCCI profile page:
    // its cells alternate label, value, label, value, ...
    $html = file_get_html('http://www.kcci.com.pk/UserProfile/tabid/42/userId/' . $uuid . '/Default.aspx');
    $table = $html->find('table', 1);

    // Flatten every cell's text into a single list.
    $profile = array();
    foreach ($table->find('td') as $td) {
        array_push($profile, $td->plaintext);
    }

    // Pair up label/value cells into the record, keyed by the UUID.
    $record['UUID'] = $uuid;
    for ($i = 0; $i < count($profile); $i += 2) {
        $record[$profile[$i]] = $profile[$i + 1];
    }
    ksort($record);
    $unique_keys = array('UUID');
    scraperwiki::save_sqlite($unique_keys, $record, $table_name = "kcci", $verbose = 2);

    // Free DOM objects to avoid simple_html_dom memory leaks.
    unset($record);
    unset($profile);
    $td->clear();
    unset($td);
    $table->clear();
    unset($table);
    $html->clear();
    unset($html);
}
function alreadyKnown($cat, $url) {
    // Return true (and log) when this (cat, url) pair already exists in
    // swdata; false otherwise.
    // SECURITY FIX: the query was built by raw concatenation, so a value
    // containing a single quote broke the SQL (and allowed injection).
    // Double embedded quotes per the SQL standard before interpolating.
    $sql = "select distinct id from swdata where cat='"
        . str_replace("'", "''", $cat)
        . "' and url='"
        . str_replace("'", "''", $url)
        . "'";
    $data = scraperwiki::sqliteexecute($sql);
    if (count($data->data) === 0) {
        return false;
    }
    echo "already known : " . $url . " in " . $cat . "\n";
    return true;
}
function scraper($url_search, $country_id) {
    // Scrape one EURES job-search result page: for each result table, pull
    // the job URL/id, description and source, then insert a row into the
    // MySQL "job" table unless the URL was already stored.
    // NOTE(review): mysql_* functions are removed as of PHP 7; the INSERT is
    // built by string interpolation with hand-escaped quotes — injection-prone.
    // Consider migrating to PDO prepared statements.
    $has_next = false;
    $base_url = "http://ec.europa.eu/eures/eures-searchengine/servlet";
    $html = scraperwiki::scrape($url_search);
    $dom = new simple_html_dom();
    $dom->load($html);
    foreach ($dom->find('table[class=JResult]') as $result) {
        // The job link's onclick holds the servlet path in its first
        // single-quoted argument; strip the leading char to get the path.
        foreach ($result->find('td[class=JRTitle] a') as $job_page) {
            $chars = explode("'", $job_page->onclick);
            $url_job = $base_url . substr($chars[1], 1);
            $url_id = strstr($url_job, 'uniqueJvId=');
            $url_id = str_replace('uniqueJvId=', "", $url_id);
            echo "JOB: " . $url_job . "<br />";
        }
        // Description and source sit in <th>-labelled rows; the value is the
        // label's next sibling cell.
        foreach ($result->find('th') as $data) {
            $text = trim($data->plaintext);
            if ($text == 'Description:') {
                $description = trim($data->next_sibling()->plaintext);
                echo "DESCRIPTION: " . $description . "<br />";
            }
            if ($text == 'Source:') {
                $source = trim($data->next_sibling()->plaintext);
                $source = str_replace("'", "\\'", $source);
                if ($source != '' && $source != ' ') {
                    $source_id = insert_name('source', $source);
                    echo "SOURCE: " . $source . "<br /><br />";
                }
            }
        }
        // Escape quotes and strip stray </BR> tags before the raw SQL below.
        $description = str_replace("'", "\\'", $description);
        $description = str_replace("</BR>", "", $description);
        $sql = mysql_query("SELECT * FROM job WHERE url = '{$url_job}'");
        $cont = mysql_num_rows($sql);
        if ($cont == 0) {
            mysql_query("INSERT INTO job SET \n\t\t\t\t\turl = '{$url_job}', \n\t\t\t\t\turl_id = '{$url_id}', \n\t\t\t\t\tdescription = '{$description}', \n\t\t\t\t\tsource_id = '{$source_id}', \n\t\t\t\t\turl_search = '{$url_search}', \n\t\t\t\t\tcountry_id='{$country_id}',\n\t\t\t\t\turl_scraper_date = SYSDATE(),\t \n\t\t\t\t\turl_scraper_hour = SYSDATE()");
        } else {
            echo "Job URL already extracted: " . $url_job . "<br /><br />";
        }
    }
    // Detect the "Next page" pagination link for the (currently disabled)
    // recursive crawl below.
    foreach ($dom->find('div[class=prevNext] a') as $next_page) {
        $text = $next_page->plaintext;
        if ($text == "Next page") {
            $url_next = substr($next_page->href, 1);
            $url_next = $base_url . $url_next;
            $has_next = true;
            print "<br /><br />NEXT: " . $url_next . "<br /><br />";
        }
    }
    unset($html, $dom, $result, $job_page, $data, $next_page, $text, $url_id, $url_job, $description, $source, $source_id, $url_search);
    //Comment this for tests, uncomment this to get all data
    // if ($has_next == true){
    //     sleep(1);
    //     scraper($url_next, $country_id);
    // }
}
function get_codes($dom) {
    // Every <option> inside every <select> on the page is one stock
    // code/symbol pair; save each keyed on the code.
    foreach ($dom->find("select") as $select) {
        foreach ($select->find("option") as $option) {
            $record = array(
                'stockCode' => $option->value,
                'stockSymbol' => $option->plaintext,
            );
            $message = scraperwiki::save_sqlite(array("stockCode"), $record);
        }
    }
}
function get_codes($dom) {
    // Each "list_row" table row holds an item name followed by four prices
    // (buy/sell in CND and US); save one record per row, keyed on the item.
    foreach ($dom->find('tr[class^="list_row"]') as $row) {
        $cells = $row->find("td");
        $record = array(
            'item' => $cells[0]->plaintext,
            'BUY_CND' => $cells[1]->plaintext,
            'SELL_CND' => $cells[2]->plaintext,
            'BUY_US' => $cells[3]->plaintext,
            'SELL_US' => $cells[4]->plaintext,
        );
        scraperwiki::save_sqlite(array("item"), $record);
        print_r($record);
    }
}
function scrape($source) {
    // Append UTM tracking parameters to every link on the page at $source,
    // then print the rewritten HTML.
    // BUG FIX: $source was also listed in the global statement; PHP then
    // binds the name to the global variable, silently discarding the
    // caller's argument. Only the UTM settings belong in the global list.
    global $utmSource, $utmMedium, $utmTerm, $utmContent, $utmCampaign;
    $link = scraperwiki::scrape($source);
    $html = str_get_html($link);
    foreach ($html->find('a[href]') as $a) {
        $href = $a->href;
        $a->href = $href . '#utm_source=' . $utmSource . '&utm_medium=' . $utmMedium . '&utm_term=' . $utmTerm . '&utm_content=' . $utmContent . '&utm_campaign=' . $utmCampaign;
    }
    print $html;
}
function ripByPage($page) {
    // Fetch one page (10 rows) of the Sanandaj cemetery burial-search API
    // (JSON) and save one record per deceased person.
    $pathToDetails = 'http://aramestan.e-sanandaj.ir/BurialRequest/DeadSearch?keyword=&firstName=&lastName=&fatherName=&partNo=0&rowNo=&graveNo=&deathDateFrom=&deathDateTo=&bornDateFrom=&bornDateTo=&page=' . $page;
    $output = scraperwiki::scrape($pathToDetails);
    $resultingJsonObject = json_decode($output);
    for ($id = 0; $id <= 9; $id++) {
        $row = $resultingJsonObject->{'result'}[$id];
        $entry = array(
            'id' => $row->{'Id'},
            'fullname' => strval($row->{'DeadFullName'}),
            'fathername' => strval($row->{'DeadFatherName'}),
            'birthdate' => strval($row->{'BornDate'}),
            'deathdate' => strval($row->{'DeathDate'}),
            'partno' => strval($row->{'PartNo'}),
            'rowno' => strval($row->{'RowNo'}),
            'graveno' => strval($row->{'GraveNo'}),
            'gender' => strval($row->{'Gender'}),
            'identitycode' => strval($row->{'IdentityCode'}),
        );
        // BUG FIX: the unique-key list must name a column present in the
        // record; array('data') referenced no such column. Key on 'id'.
        scraperwiki::save_sqlite(array('id'), $entry);
    }
}
function errorHandler($errno, $errstr, $errfile, $errline) {
    // Custom error handler: format the error via errorParserStack() and dump
    // it through scraperwiki. Returning true marks the error as handled.
    // Respect the @ suppression operator — error_reporting() reads 0 inside it
    // (see http://php.net/manual/en/function.set-error-handler.php).
    if (error_reporting() == 0) {
        return;
    }
    global $script;
    $formatted = errorParserStack($errno, $errstr, $script);
    scraperwiki::sw_dumpMessage($formatted);
    return true;
}
function grab($url) {
    // Harvest proxy IPs from the proxy-list table at $url, decoding each
    // first cell and saving it keyed on the ip.
    $dom = new simple_html_dom();
    $dom->load(scraperWiki::scrape($url));
    foreach ($dom->find("#tbl_proxy_list tr") as $row) {
        $cells = $row->find("td");
        // Data rows have exactly 7 cells; the first holds the encoded IP.
        if (count($cells) == 7) {
            $ip = decode_ip((string) $cells[0]);
            scraperwiki::save(array('ip'), array('ip' => $ip));
        }
    }
}
function getLangs() {
    // Return the list of language codes linked from the DBpedia mappings
    // statistics index page (each anchor's href is of the form "/<lang>/").
    $url = "http://mappings.dbpedia.org/server/statistics/";
    $html = scraperwiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);
    // Removed the unused $i counter and redundant intermediate variable.
    $langs = array();
    foreach ($dom->find('/html/body/p/a') as $result) {
        // Strip the surrounding slashes to get the bare language code.
        $langs[] = str_replace("/", "", trim($result->href));
    }
    return $langs;
}
function scrapeIdeeLab() {
    // Scrape news posts from the IdeeLab "uudis" category page and store
    // one title/body record per published post.
    $dom = new simple_html_dom();
    $dom->load(scraperWiki::scrape("http://ideelab.wordpress.com/category/uudis/"));
    foreach ($dom->find('div.status-publish') as $post) {
        $titles = $post->find('div.posttitle h2.pagetitle');
        $bodies = $post->find('div.entry');
        scraperwiki::save(
            array('title', 'newsbody'),
            array('title' => $titles[0]->plaintext, 'newsbody' => $bodies[0]->plaintext)
        );
    }
}
function gazelangs($url, $lang) {
    // Scrape translated sentence pairs (user input vs. Babelfish output) from
    // the page at $url and save them along with scrape metadata.
    // Removed the dead $michi variable, which was assigned and never used.
    $html = scraperWiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);
    foreach ($dom->find("ul[@class='trans_sent']") as $data) {
        // Each list holds two <li>s: the user's input and the translation.
        $tds = $data->find("li");
        $record = array(
            'user_input' => $tds[0]->plaintext,
            'babelfish_output' => $tds[1]->plaintext,
            'timestamp_scrape' => date("Y-m-d H:i:s"),
            'page' => $url,
            'language' => $lang,
        );
        scraperwiki::save(array('user_input', 'babelfish_output', 'timestamp_scrape', 'page', 'language'), $record);
    }
}
function scrapeIndex($url) {
    // Collect a name/url pair for every NGO headline (h2 > a) on the index
    // page, persist each to the "ngos" table, and return the collected list.
    $dom = new simple_html_dom();
    $dom->load(scraperwiki::scrape($url));
    $ngos = array();
    foreach ($dom->find('h2') as $heading) {
        $name = str_replace("–", "-", html_entity_decode($heading->plaintext));
        $link = $heading->find('a', 0);
        $row = array("name" => $name, "url" => $link->href);
        $ngos[] = $row;
        scraperwiki::save_sqlite(array("name"), $row, "ngos");
    }
    print_r($ngos);
    return $ngos;
}
function crawlAgents($pageUrl, $domObj) {
    // Parse the user-agent table (6th table on the page) into the UserAgents
    // sqlite table, skipping the header row (which contains " String ").
    $html = scraperwiki::scrape($pageUrl);
    $domObj->load($html);
    $html = null; // release the raw page early
    $table = $domObj->find('/html/body/table[5]');
    foreach ($table[0]->find('tr') as $trs) {
        // BUG FIX: strpos() returns 0 (loosely == false) when the needle
        // matches at offset 0, so the header test needs strict ===.
        if (strpos($trs->firstChild()->plaintext, " String ") === false) {
            $tds = $trs->find('td');
            $agentstring = str_replace(' ', '', $tds[0]->plaintext);
            $agentdescription = str_replace(' ', '', $tds[1]->plaintext);
            $agenttype = str_replace(' ', '', $tds[2]->plaintext);
            $record = array('agent' => $agentstring, 'description' => $agentdescription, 'agent_type' => $agenttype);
            scraperwiki::save_sqlite(array('agent'), $record, $table_name = "UserAgents");
        }
    }
}
function getExcuse($extension) {
    // Fetch one excuse page, save its text with a sequential id, then jump
    // to the next excuse via goToNextURL().
    global $html;
    global $count;
    $root = "http://www.goodexcuses.co.uk";
    $html = file_get_html($root . $extension);
    // The excuse text is the first <h2> on the page.
    $excuse = $html->find('h2', 0)->innertext;
    echo $excuse . "\n";
    $count++;
    scraperwiki::save(
        array('EXCUSE_ID'),
        array('EXCUSE_ID' => $count, 'EXCUSE_TEXT' => $excuse, 'EXCUSE_URL' => $extension)
    );
    goToNextURL();
}