function parseModelsPage($brandId, $brandName, $page) {
    // Scrape one GSMArena brand listing page, save every phone model on it,
    // then recurse into the "Next page" link until pagination ends.
    // NOTE(review): uses $this->html / $this->models, so this is a method of
    // an enclosing class not visible in this chunk.
    $html_content = scraperwiki::scrape($page);
    $this->html = str_get_html($html_content);
    foreach ($this->html->find("div.makers a") as $el) {
        $img = $el->find('img', 0);
        $m['name'] = $brandName . ' ' . $el->find('strong', 0)->innertext;
        $m['img'] = $img->src;
        $m['link'] = 'http://www.gsmarena.com/' . $el->href;
        $m['desc'] = $img->title;
        // Model id is embedded in the href, e.g. "brand_model-1234.php".
        $temp = explode('-', $el->href);
        $m['id'] = (int) substr($temp[1], 0, -4); // strip trailing ".php"
        $m['brand_id'] = $brandId;
        scraperwiki::save_sqlite(array("id" => $m['id']), $m, "cell_model");
        $this->models++; // running total across recursive calls
    }
    // Pagination: the last child of div.nav-pages is the "Next page" anchor.
    $pagination = $this->html->find("div.nav-pages", 0);
    if ($pagination) {
        $nextPageLink = $pagination->lastChild();
        if ($nextPageLink && $nextPageLink->title == "Next page") {
            $this->parseModelsPage($brandId, $brandName, 'http://www.gsmarena.com/' . $nextPageLink->href);
        }
    }
    // Explicitly free the DOM; simple_html_dom leaks memory otherwise.
    $this->html->__destruct();
}
function kcci($uuid) {
    // Scrape one KCCI member-profile page and save it keyed by UUID.
    // Create DOM from URL or file.
    $html = file_get_html('http://www.kcci.com.pk/UserProfile/tabid/42/userId/' . $uuid . '/Default.aspx');
    // The second table holds the profile as alternating label/value cells.
    $table = $html->find('table', 1);
    $profile = array();
    foreach ($table->find('td') as $td) {
        array_push($profile, $td->plaintext);
    }
    $record['UUID'] = $uuid;
    // Pair up label/value cells. Fix: the original indexed $profile[$i + 1]
    // unconditionally, emitting a notice and storing a bogus value when the
    // cell count was odd; it also called clear() on $td, which is undefined
    // when the table has no cells.
    for ($i = 0; $i + 1 < count($profile); $i += 2) {
        $record[$profile[$i]] = $profile[$i + 1];
    }
    // Save the record with stable column ordering.
    ksort($record);
    $unique_keys = array('UUID');
    scraperwiki::save_sqlite($unique_keys, $record, $table_name = "kcci", $verbose = 2);
    // Clean up: simple_html_dom needs explicit clear() to release memory.
    unset($record);
    unset($profile);
    $table->clear();
    unset($table);
    $html->clear();
    unset($html);
}
function grep_munich($url, $table_name) {
    // Scrape the Munich airport arrivals board at $url and rebuild
    // $table_name from scratch with one row per flight.
    $html = scraperWiki::scrape($url);
    $count = 0;
    // Use the PHP Simple HTML DOM Parser to extract the flight table rows.
    $dom = new simple_html_dom();
    $dom->load($html);
    // Drop all old information by dropping the table.
    scraperwiki::sqliteexecute("drop table if exists " . $table_name);
    scraperwiki::sqlitecommit();
    $table = $dom->getElementById('flight_info_area');
    // Fix: guard against a missing flight table (layout change / error page)
    // instead of fatally calling find() on a non-object.
    if (!$table) {
        return;
    }
    foreach ($table->find('tr') as $data) {
        // Flight details: header/separator rows have fewer than 7 columns.
        $tds = $data->find("td");
        if (sizeof($tds) < 7) {
            continue;
        }
        $flightnr = $tds[1]->plaintext;
        $from = $tds[2]->plaintext;
        $time = $tds[3]->plaintext;
        $expected_time = $tds[4]->plaintext;
        // Scrape date; together with the row counter it forms the unique key.
        $date = date("Y-m-d");
        $flight_data = array("date" => $date, "count" => $count, "flightnr" => $flightnr, "from" => $from, "time" => $time, "expected_time" => $expected_time);
        // Save the information of one flight.
        scraperwiki::save_sqlite(array("date", "count"), $flight_data, $table_name);
        $count = $count + 1;
    }
}
function run_ml($q_num = 0) {
    // Scrape one musiklegal search-results page, save each song row, then
    // follow the "Next" pagination link recursively.
    $html = scraperWiki::scrape("http://musiklegal.com/search/result/a/" . $q_num);
    $dom = new simple_html_dom();
    $dom->load($html);
    foreach ($dom->find("tr") as $data) {
        $tds = $data->find("td");
        // Cell 1 contains a mangled anchor; strip the markup and split the
        // remainder into (code, title).
        $temp_data = explode('">', str_replace('</<strong>a</strong>>', '', str_replace('<<strong>a</strong> href="http://musiklegal.com/song/detail/', '', $tds[1]->plaintext)));
        $record = array('No' => str_replace('.', '', $tds[0]->plaintext), 'Code' => $temp_data[0], 'Song Title' => $temp_data[1], 'Artist' => $tds[2]->plaintext, 'Album' => $tds[3]->plaintext);
        /*
         * Stores results
         */
        scraperwiki::save_sqlite(array("No"), $record);
        unset($temp_data);
    }
    // Fix: $tmp_a was undefined when no "Next" link exists, producing a
    // notice and an unpredictable recursion test below.
    $tmp_a = 0;
    foreach ($dom->find("a") as $a) {
        if ($a->plaintext == 'Next') {
            // Extract the next page number from the link URL.
            $tmp_a = $a->href;
            $tmp_a = str_replace('http://musiklegal.com/search/result/a/', '', $tmp_a);
            if ($tmp_a > 0) {
                continue;
            }
        }
    }
    if ((int) $tmp_a != 0) {
        run_ml($tmp_a);
    } else {
        exit;
    }
}
function scrapeMarketGroup($url) {
    // Recursively scrape a goonmetrics market-group page: descend into every
    // child market-group link first (deduplicated via the global
    // $visitedIds), then parse the item rows on this page and save them.
    global $visitedIds;
    $html = scraperWiki::scrape($url);
    $html = str_replace("\n", "", $html); // regexes below assume single-line HTML
    preg_match_all("|<a href=\"/importing/61000746/marketgroup/(\\d+?)/\">(.+?)</a>|s", $html, $matches, PREG_SET_ORDER);
    foreach ($matches as $match) {
        $groupId = $match[1];
        $groupName = html_entity_decode($match[2]);
        //echo $groupName."\n";
        if (!in_array($groupId, $visitedIds)) {
            $visitedIds[] = $groupId;
            scrapeMarketGroup("http://goonmetrics.com/importing/61000746/marketgroup/" . $groupId . "/");
        }
    }
    // Item rows. Capture groups used below: 4 = type id, 7 = item name,
    // 11 = weekly volume, 17 = stock; the rest are layout filler.
    preg_match_all("|<tr(.*?)>(.*?)<td(.*?)><a href=\"http://games.chruker.dk/eve_online/item.php\\?type_id=(.+?)\" target=\"_blank\">(.*?)<span class=\"dot\" onclick=\"CCPEVE.showMarketDetails\\((.*?)\\)\">(.+?)</span>(.*?)</td>(.*?)<td(.*?)>(.+?)</td>(.*?)<td(.*?)>(.*?)</td>(.*?)<td(.*?)>(.+?)</td>(.*?)<td(.*?)>(.*?)</td>(.*?)<td(.*?)>(.*?)</td>(.*?)<td(.*?)>(.*?)</td>(.*?)<td(.*?)>(.*?)</td>(.*?)<td(.*?)>(.*?)</td>(.*?)</tr>|s", $html, $matches, PREG_SET_ORDER);
    foreach ($matches as $match) {
        // Re-encode captured text as UTF-8 only when it is not already valid.
        $item = array("itemId" => trim($match[4]), "name" => trim(mb_check_encoding($match[7], 'UTF-8') ? $match[7] : utf8_encode($match[7])), "weekVol" => trim(mb_check_encoding($match[11], 'UTF-8') ? $match[11] : utf8_encode($match[11])), "k6Stock" => trim(mb_check_encoding($match[17], 'UTF-8') ? $match[17] : utf8_encode($match[17])));
        // Strip thousands separators so the values store as plain numbers.
        $item['weekVol'] = str_replace(",", "", $item['weekVol']);
        $item['k6Stock'] = str_replace(",", "", $item['k6Stock']);
        // Retry the save (10 s apart, up to 600 attempts) — the SQLite store
        // may be temporarily locked by concurrent runs.
        $saved = false;
        $delay = 0;
        while (!$saved && $delay < 600) {
            try {
                @scraperwiki::save_sqlite(array('itemId'), $item, 'eve_goonmetrics');
                $saved = true;
            } catch (Exception $e) {
                sleep(10);
                $delay++;
            }
        }
    }
}
function scrape_page() {
    // Fetch one result page of asuntojen.hintatiedot.fi using the search
    // parameters held in $GLOBALS and store every apartment row. Pages hold
    // at most 50 rows, so a full page triggers a recursive fetch of the next
    // page (via $GLOBALS['z']).
    $rowsOnPage = 0;
    $pageHtml = scraperWiki::scrape("http://asuntojen.hintatiedot.fi/haku/?c=" . $GLOBALS['c'] . "&s=" . $GLOBALS['s'] . "&r=" . $GLOBALS['r'] . "&amin=" . $GLOBALS['amin'] . "&amax=" . $GLOBALS['amax'] . "&z=" . $GLOBALS['z']);
    $parser = new simple_html_dom();
    $parser->load($pageHtml);
    foreach ($parser->find("tr") as $tableRow) {
        $cells = $tableRow->find("td");
        // Data rows have more than 8 columns; skip header/filler rows.
        if (count($cells) > 8) {
            $rowsOnPage++;
            $GLOBALS['rowTotal']++;
            $apartment = array(
                "Uniikkiavain" => $GLOBALS['rowTotal'],
                "Kaupunginosa" => $cells[0]->plaintext,
                "Myyntihinta" => $cells[3]->plaintext,
                "Neliohinta" => $cells[4]->plaintext,
                "Tyyppi" => $cells[1]->plaintext,
                "Koko" => $cells[2]->plaintext,
            );
            scraperwiki::save_sqlite(null, $apartment, $table_name = $GLOBALS['c'] . " " . $GLOBALS['time']);
            print $GLOBALS['rowTotal'] . "\n";
            print $rowsOnPage . ". Sijainti: " . $cells[0]->plaintext . " Hinta: " . $cells[3]->plaintext . " Tyyppi: " . $cells[1]->plaintext . " Koko: " . $cells[2]->plaintext . " Neliöhinta: " . $cells[4]->plaintext . "€" . "\n";
        }
    }
    if ($rowsOnPage == 50) {
        // A full page means more results remain; advance to the next page.
        print "Vielä jatkuu, haetaan seuraava sivu..." . "\n";
        $GLOBALS['z']++;
        scrape_page();
    } else {
        print "Skrääpiminen suoritettu." . "\n";
        print "Sivuja yhteensä: " . $GLOBALS['z'] . "\n";
        print "Rivejä yhteensä: " . $GLOBALS['rowTotal'] . "\n";
    }
}
function getCategories($u) {
    // Recursively walk a category facet tree: save (or CSV-dump) each
    // category's name, breadcrumb path and URL, then descend into it.
    // Fix: $local was referenced below but never imported into function
    // scope, so the CSV branch could never fire; pull it from globals
    // alongside $baseurl and the CSV handle $f.
    global $baseurl, $f, $local;
    $path = "";
    $d = new simple_html_dom();
    $d->load(scraperwiki::scrape($u));
    echo "Loaded URL: " . $u . "\n";
    if ($d->find('div[id=ctl00_cphContent_gsaCatFacetContainer]')) {
        // Build the category path from the breadcrumb trail, ending with the
        // text after the final ">" separator.
        $breadcrumb = $d->find('div[id=breadcrumb]', 0);
        if (!is_null($breadcrumb)) {
            foreach ($breadcrumb->children() as $crumb) {
                $path .= trim($crumb->innertext) . "/";
            }
            $path .= trim(strrchr($breadcrumb->innertext, ">"), "> ");
        }
        foreach ($d->find('div[id=ctl00_cphContent_gsaCatFacetContainer]', 0)->find('div[class=S2refinementsContainer]', 0)->children() as $div) {
            // Facet text looks like "Name (count)"; keep only the name.
            $name = trim(strstr($div->children(0)->innertext, "(", true));
            $url = $baseurl . $div->children(0)->href;
            $data = array("Name" => $name, "Path" => $path, "URL" => $url);
            echo $path . "/" . $name . "\n";
            if ($local) {
                fputcsv($f, array($name, $path, $url));
            } else {
                scraperwiki::save_sqlite(array("URL"), $data);
            }
            // Recurse into the sub-category.
            getCategories($url);
        }
    }
}
function ripById($id) {
    // Scrape one burial record from beheshtezahra.tehran.ir by search-result
    // id and save the extracted fields.
    $pathToDetails = 'http://beheshtezahra.tehran.ir/Default.aspx?tabid=92&ctl=SearchDetails&mid=653&srid=' . $id;
    $output = scraperwiki::scrape($pathToDetails);
    // Each field lives in a <span> with a predictable server-generated id.
    // Mapping replaces seven near-identical copy/paste preg_match blocks.
    $patterns = array(
        'firstname' => '/<span id="dnn_ctr653_SearchDetails_dtlDetail_lblNameBound_0"><b>(.*)<\\//smiU',
        'surname' => '/<span id="dnn_ctr653_SearchDetails_dtlDetail_lblLastNameBound_0"><b>(.*)<\\//smiU',
        'fathername' => '/<span id="dnn_ctr653_SearchDetails_dtlDetail_lblFatherNameBound_0"><b>(.*)<\\//smiU',
        'birthdate' => '/<span id="dnn_ctr653_SearchDetails_dtlDetail_lblBirthDateBound_0"><b>(.*)<\\//smiU',
        'deathdate' => '/<span id="dnn_ctr653_SearchDetails_dtlDetail_lblDafnDateBound_0"><b>(.*)<\\//smiU',
        'deathplace' => '/<span id="dnn_ctr653_SearchDetails_dtlDetail_lblDeastTownshipTitle_0"><b>(.*)<\\//smiU',
        'graveplace' => '/<span id="dnn_ctr653_SearchDetails_dtlDetail_lblDafnPlace_0"><b>(.*)<\\//smiU',
    );
    $record = array('id' => $id);
    foreach ($patterns as $field => $pattern) {
        // Missing fields are stored as empty strings, as before.
        preg_match($pattern, $output, $temp);
        $record[$field] = isset($temp[1]) ? $temp[1] : '';
    }
    // NOTE(review): the unique key 'data' is not a column of $record — this
    // matches the original call but looks like it should be array('id');
    // confirm against the existing table before changing.
    scraperwiki::save_sqlite(array('data'), $record);
}
function get_codes($dom) {
    // Every <option> inside every <select> on the page is one stock
    // code/symbol pair; save each keyed on the code.
    foreach ($dom->find("select") as $selectEl) {
        foreach ($selectEl->find("option") as $optionEl) {
            $record = array('stockCode' => $optionEl->value, 'stockSymbol' => $optionEl->plaintext);
            $message = scraperwiki::save_sqlite(array("stockCode"), $record);
            #print_r($message);
        }
    }
}
function get_codes($dom) {
    // Each "list_row" table row holds an item with four CAD/USD buy/sell
    // prices; save one record per row keyed on the item name.
    foreach ($dom->find('tr[class^="list_row"]') as $row) {
        $cells = $row->find("td");
        $record = array(
            'item' => $cells[0]->plaintext,
            'BUY_CND' => $cells[1]->plaintext,
            'SELL_CND' => $cells[2]->plaintext,
            'BUY_US' => $cells[3]->plaintext,
            'SELL_US' => $cells[4]->plaintext,
        );
        scraperwiki::save_sqlite(array("item"), $record);
        print_r($record);
    }
}
function ripByPage($page) {
    // Scrape one page of the Sanandaj cemetery JSON search API and save each
    // returned burial record.
    $pathToDetails = 'http://aramestan.e-sanandaj.ir/BurialRequest/DeadSearch?keyword=&firstName=&lastName=&fatherName=&partNo=0&rowNo=&graveNo=&deathDateFrom=&deathDateTo=&bornDateFrom=&bornDateTo=&page=' . $page;
    $output = scraperwiki::scrape($pathToDetails);
    $resultingJsonObject = json_decode($output);
    // Fix: iterate the actual result list instead of assuming exactly 10
    // rows per page — the fixed 0..9 loop emitted notices and saved null
    // records on a short final page. (The unused $pagecount local is gone.)
    foreach ($resultingJsonObject->{'result'} as $row) {
        $entry = array(
            'id' => $row->{'Id'},
            'fullname' => strVal($row->{'DeadFullName'}),
            'fathername' => strVal($row->{'DeadFatherName'}),
            'birthdate' => strVal($row->{'BornDate'}),
            'deathdate' => strVal($row->{'DeathDate'}),
            'partno' => strVal($row->{'PartNo'}),
            'rowno' => strVal($row->{'RowNo'}),
            'graveno' => strVal($row->{'GraveNo'}),
            'gender' => strVal($row->{'Gender'}),
            'identitycode' => strVal($row->{'IdentityCode'}),
        );
        // NOTE(review): unique key 'data' is not a column of $entry — kept
        // to match the original call, but probably meant array('id').
        scraperwiki::save_sqlite(array('data'), $entry);
    }
}
function grab($url) {
    // Pull the proxy-list table at $url and store each decoded IP address.
    $pageHtml = scraperWiki::scrape($url);
    $parser = new simple_html_dom();
    $parser->load($pageHtml);
    foreach ($parser->find("#tbl_proxy_list tr") as $row) {
        $cells = $row->find("td");
        // Data rows have exactly 6 columns; anything else is table chrome.
        if (count($cells) != 6) {
            continue;
        }
        // The IP cell is obfuscated; decode_ip() (defined elsewhere)
        // recovers the dotted-quad string.
        $ip = decode_ip((string) $cells[0]);
        scraperwiki::save_sqlite(array("ip"), array("ip" => $ip));
    }
}
function scrapeIndex($url) {
    // Collect (name, url) pairs from every <h2> heading link on the index
    // page, persist them to the "ngos" table, and return the full list.
    $dom = new simple_html_dom();
    $dom->load(scraperwiki::scrape($url));
    $ngos = array();
    foreach ($dom->find('h2') as $heading) {
        // Normalise en-dashes in names so they compare consistently.
        $name = str_replace("–", "-", html_entity_decode($heading->plaintext));
        $anchor = $heading->find('a', 0);
        $link = $anchor->href;
        $ngos[] = array("name" => $name, "url" => $link);
        scraperwiki::save_sqlite(array("name"), array("name" => $name, "url" => $link), "ngos");
    }
    print_r($ngos);
    return $ngos;
}
function crawlAgents($pageUrl, $domObj) {
    // Scrape a user-agent listing page: table[5] holds rows of
    // (agent string, description, type); the header row contains " String ".
    $html = scraperwiki::scrape($pageUrl);
    $domObj->load($html);
    $html = null; // release the raw page early
    $table = $domObj->find('/html/body/table[5]');
    foreach ($table[0]->find('tr') as $trs) {
        // Fix: strpos() returns 0 for a match at offset 0, and 0 == false,
        // so the original loose comparison could misclassify rows; use the
        // identity comparison to test "substring not found" correctly.
        if (strpos($trs->firstChild()->plaintext, " String ") === false) {
            $tds = $trs->find('td');
            // Strip the space-like padding characters around each value.
            $agentstring = str_replace(' ', '', $tds[0]->plaintext);
            $agentdescription = str_replace(' ', '', $tds[1]->plaintext);
            $agenttype = str_replace(' ', '', $tds[2]->plaintext);
            $record = array('agent' => $agentstring, 'description' => $agentdescription, 'agent_type' => $agenttype);
            scraperwiki::save_sqlite(array('agent'), $record, $table_name = "UserAgents");
        }
    }
}
function ProductInfo($motherboards) {
    // For each motherboard stub (Name, URI) scrape its spec page and extract
    // status, form factor, socket and per-connector video-output flags.
    // Returns the list of saved detail records.
    // Fix: $output was never initialised, so an empty $motherboards list
    // made the function return an undefined variable (notice + null).
    $output = array();
    foreach ($motherboards as $mobo) {
        $html = scraperWiki::scrape($mobo['URI']);
        $dom = new simple_html_dom();
        $dom->load($html);
        $specs = $dom->find('div#specifications');
        // Video output cell, e.g. "HDMI, VGA"; derive one 0/1 flag per
        // connector with case-insensitive matching.
        $video = $specs[0]->find('tr#GraphicsOutput td', 1)->plaintext;
        $hdmi = preg_match('/hdmi/', strtolower($video));
        $vga = preg_match('/vga/', strtolower($video));
        $dp = preg_match('/dp|displayport|display[ ]port/', strtolower($video));
        $details = array('Name' => $mobo['Name'], 'URI' => $mobo['URI'], 'Status' => $specs[0]->find('div#infosectionessentials tr', 1)->find('td', 1)->plaintext, 'Form factor' => $specs[0]->find('tr#FormFactor td', 1)->plaintext, 'Socket' => $specs[0]->find('tr#SupportedCPUSocket td', 1)->plaintext, 'HDMI' => $hdmi, 'VGA' => $vga, 'DP' => $dp);
        scraperwiki::save_sqlite(array('Name'), $details);
        $output[] = $details;
    }
    return $output;
}
function ProductInfo($motherboards) {
    // Generic variant: scrape every two-column row of the spec table into a
    // keyed record (label => value) and save it under the product name.
    // Fix: initialise $output so an empty input returns an empty array
    // instead of an undefined variable.
    $output = array();
    foreach ($motherboards as $mobo) {
        $html = scraperWiki::scrape($mobo['URI']);
        $dom = new simple_html_dom();
        $dom->load($html);
        $specs = $dom->find('div#specifications', 0);
        $details = array();
        $details['Name'] = $mobo['Name'];
        // Each spec table row is (label, value); keep only well-formed pairs.
        foreach ($specs->find('tbody tr') as $row) {
            $tds = $row->find('td');
            if (count($tds) == 2) {
                $details[$tds[0]->plaintext] = $tds[1]->plaintext;
            }
        }
        scraperwiki::save_sqlite(array('Name'), $details);
        $output[] = $details;
    }
    return $output;
}
function listPage($host, $searchURL) {
    // Scrape a property search-results page: save each listed property, then
    // follow the "Suivante" (next) pagination link recursively.
    $html_content = scraperwiki::scrape($host . $searchURL);
    $html = str_get_html($html_content);
    $pageCount = 0;
    foreach ($html->find("div.photo ul.thumb a") as $el) {
        // Thumbnail links are relative ("../…"); normalise the prefix.
        $propPage = str_replace("../", "/", $el->href);
        // listProperty() (defined elsewhere) fetches and parses the detail page.
        $property = listProperty($host, $propPage);
        scraperwiki::save_sqlite(array('property'), $property);
        // NOTE(review): this exit stops the whole run after the very first
        // property — looks like a debugging leftover; confirm before removing.
        exit;
    }
    foreach ($html->find("a.pageResults") as $el) {
        if (trim($el->plaintext) == "Suivante") {
            $nextPage = $el->href;
            listPage($host, $nextPage);
            break;
        }
    }
}
function save_generic($category, $title) {
    // Persist a placeholder "generic" product entry for the given category
    // and title, keyed by a slug derived from the title (make_key is defined
    // elsewhere in this project).
    $row = array(
        "key" => make_key($title . "-generic"),
        "name" => $title,
        "manufacturer" => "N/A",
        "url" => "",
        "description" => "",
        "category" => make_key($category) . "/" . make_key($title),
    );
    scraperwiki::save_sqlite(array("key"), $row);
}
function file_get_contents_curl($url) {
    // Fetch $url via cURL and return the response body as a string.
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); // return data instead of printing it
    // Fix: the original passed "http://" + CURLOPT_URL as the option id —
    // string/constant arithmetic that is not a valid option and left the
    // request URL unset. Set CURLOPT_URL directly.
    curl_setopt($ch, CURLOPT_URL, $url);
    $data = curl_exec($ch);
    curl_close($ch);
    return $data;
}
// For every attached scraper in $scraper (defined elsewhere), re-rank each
// stored URL via getPagerank() and save the result into the "prank" table.
foreach ($scraper as $scr) {
    scraperwiki::attach($scr);
    $qry = "* from " . $scr . ".swdata";
    $arr = scraperwiki::select($qry);
    foreach ($arr as $d) {
        $pr = (int) getPagerank($d["url"]);
        if (1) {
            $d_key = $d["key"];
            $d_site = $d["site"];
            $record = array('url' => utf8_encode($d["url"]), 'pr' => utf8_encode($pr), 'ar' => utf8_encode($d["rank"]), 'id' => utf8_encode($d_key), 'desc' => $d["site"]);
            #print_r($record);
            scraperwiki::save_sqlite(array("id"), $record, "prank");
        }
    }
}
// NOTE(review): this first aggregation block runs BEFORE $routes/$routemap
// are populated (the attach/select happens further down), so it iterates an
// undefined $routes and saves an empty result — it looks like a duplicated
// paste of the working block below; confirm before removing.
foreach ($routes as $route) {
    $routemap[$route['route']]['route'] = $route['route'];
    // Append "lat,lng,2357" per stop; @ suppresses the notice on first append.
    @($routemap[$route['route']]['coords'] .= $route['latitude'] . ',' . $route['longitude'] . ',2357' . "\n");
}
$theroutes = array();
$count = 0;
foreach ($routemap as $a_route) {
    $count++;
    $r = $a_route['route'];
    $c = $a_route['coords'];
    $theroutes[] = array('id' => $count, 'route' => $r, 'coords' => $c);
}
scraperwiki::save_sqlite(array("id"), $theroutes);
// Whoops, seems that doing 600 queries in under 80 seconds isn't a smart
// idea. This scraper attempts to aggregate coordinates into something usable.
scraperwiki::attach("tfl_bus_routes_scraper", "src");
$routes = scraperwiki::select("route, stop_name, latitude, longitude from src.tfl_buses where run = 1 order by sequence asc");
$routemap = array();
// Group stop coordinates by route number into one newline-separated string.
foreach ($routes as $route) {
    $routemap[$route['route']]['route'] = $route['route'];
    @($routemap[$route['route']]['coords'] .= $route['latitude'] . ',' . $route['longitude'] . ',2357' . "\n");
}
// Flatten the map into numbered rows for storage.
$theroutes = array();
$count = 0;
foreach ($routemap as $a_route) {
    $count++;
    $r = $a_route['route'];
    $c = $a_route['coords'];
    $theroutes[] = array('id' => $count, 'route' => $r, 'coords' => $c);
}
scraperwiki::save_sqlite(array("id"), $theroutes);
function getContent($page, $params, $cnt, $tag_counts) {
    // Walk the Flickr REST API results page by page: save photos, tag-user
    // and tag-photo pairs, and accumulate tag co-occurrence counts that are
    // written to "tag_tag" once the final page has been processed.
    $photo_cnt = $cnt;
    if ($page > 0) {
        $params['page'] = $page;
    }
    $encoded_params = array();
    foreach ($params as $k => $v) {
        $encoded_params[] = urlencode($k) . '=' . urlencode($v);
    }
    $url = "http://api.flickr.com/services/rest/?" . implode('&', $encoded_params);
    $rsp = file_get_contents($url);
    $rsp_obj = unserialize($rsp);
    if ($rsp_obj['stat'] == 'ok') {
        // Fix: the accumulators were initialised as $rows_u/$rows_p but the
        // code below appends to and saves $row_u/$row_p, which stayed
        // undefined on a page with no tags; initialise the names actually
        // used.
        $row_u = array();
        $row_p = array();
        $photo_rows = array();
        foreach ($rsp_obj['photos']['photo'] as $photo) {
            $photo_rows[] = $photo;
            $photo_cnt++;
            $tags = explode(' ', trim($photo['tags']));
            foreach ($tags as $t) {
                if ($t != '') {
                    $row_u[] = array('tag' => $t, 'user' => $photo['owner']);
                    $row_p[] = array('tag' => $t, 'photo' => $photo['id']);
                    // Count unordered tag pairs; the "a<>b" label keeps the
                    // first-seen ordering for subsequent hits.
                    foreach ($tags as $t2) {
                        if ($t != $t2) {
                            $label = "{$t}<>{$t2}";
                            $entry = false;
                            if (isset($tag_counts[$label])) {
                                $entry = $tag_counts[$label];
                            } else {
                                if (isset($tag_counts["{$t2}<>{$t}"])) {
                                    $label = "{$t2}<>{$t}";
                                    $entry = $tag_counts[$label];
                                }
                            }
                            if (!$entry) {
                                $entry = array("tag1" => $t, "tag2" => $t2, "count" => 1);
                            } else {
                                $entry['count']++;
                            }
                            $tag_counts[$label] = $entry;
                        }
                    }
                }
            }
        }
        scraperwiki::save_sqlite(array('id'), $photo_rows, $table_name = "photos");
        scraperwiki::save_sqlite(array(), $row_u, $table_name = "tag_user");
        scraperwiki::save_sqlite(array(), $row_p, $table_name = "tag_photo");
        if ($page < $rsp_obj['photos']['pages']) {
            getContent($page + 1, $params, $photo_cnt, $tag_counts);
        } else {
            // Final page: flush the accumulated co-occurrence counts.
            scraperwiki::save_sqlite(array('tag1', 'tag2'), array_values($tag_counts), $table_name = "tag_tag");
            print "photos: {$photo_cnt}\n";
        }
    } else {
        echo "Call failed: {$page}!";
    }
}
static function save_var($name, $value) {
    // Persist a named variable into the "swvariables" table. Ints and
    // doubles are stored as-is; every other type is JSON-encoded, with the
    // original PHP type recorded alongside for later restoration.
    if (is_int($value) || is_double($value)) {
        $jvalue = $value;
    } else {
        $jvalue = json_encode($value);
    }
    $data = array("name" => $name, "value_blob" => $jvalue, "type" => gettype($value));
    scraperwiki::save_sqlite(array("name"), $data, "swvariables");
}
// NOTE(review): this chunk starts mid-structure — the tokens up to the
// second "}" below are the tail of a loop whose beginning is outside this
// view (an apparent duplicate of the loop body further down).
}
$html->clear();
unset($html);
scraperwiki::save_var('last_id', $i);
}
require 'scraperwiki/simple_html_dom.php';
scraperwiki::attach("s-in-s", "src");
//scraperwiki::save_var('last_id', 1);
//exit();
// Resume from the last processed row index (checkpointed after each page).
$id = scraperwiki::get_var('last_id');
for ($i = $id; $i < 1900; $i++) {
    // Fetch the i-th source row and build the absolute thread URL.
    $src = scraperwiki::select("* from src.swdata limit {$i},1");
    $url = $src[0]['link'];
    $url = 'http://sexinsex.net/bbs/' . $url;
    $html_content = scraperwiki::scrape($url);
    $html = str_get_html($html_content);
    $data = array();
    $tr = $html->find("div.postmessage div.t_msgfont");
    $j = 0;
    foreach ($tr as $trr) {
        $noidung = $trr->find('div', 0)->innertext;
        //$noidung = utf8_encode($noidung);
        // Keep only substantial posts (> 1000 chars); content is stored
        // base64-encoded, keyed "order-in-thread + source url".
        if (mb_strlen($noidung) > 1000) {
            $j++;
            @scraperwiki::save_sqlite(array('id'), array('id' => $j . '-' . $src[0]['url'], 'title' => $src[0]['title'], 'url' => $src[0]['url'], 'content' => base64_encode($noidung), 'order' => $j, 'num' => $src[0]['num'], 'reply' => $src[0]['reply']));
        }
    }
    // Free DOM memory and checkpoint progress for restartability.
    $html->clear();
    unset($html);
    scraperwiki::save_var('last_id', $i);
}
// All that matters is that your final data is written to an SQLite database
// called "data.sqlite" in the current working directory which has at least a
// table called "data".
require 'scraperwiki.php';

// POST helper (kept for parity with sibling scrapers); the main flow below
// uses a stream context instead of cURL.
function scrapePOST($url) {
    $curl = curl_init($url);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($curl, CURLOPT_MAXREDIRS, 10);
    curl_setopt($curl, CURLOPT_POST, 1);
    // disable SSL checking to match behaviour in Python/Ruby.
    // ideally would be fixed by configuring curl to use a proper
    // reverse SSL proxy, and making our http proxy support that.
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
    $res = curl_exec($curl);
    curl_close($curl);
    return $res;
}

// An empty search POST returns the full burial-records listing.
$url = "http://motavafian.beheshtm.ir/peaplesearch.php";
$postdata = http_build_query(array('Family' => '', 'FName' => '', 'NationalCode' => '', 'Shn' => '', 'by' => '', 'ey' => '', 'submit' => 'جستجو'));
$opts = array('http' => array('method' => "POST", 'content' => $postdata));
$context = stream_context_create($opts);
$content = file_get_contents($url, false, $context, -1);
// Capture groups per result row: id, fullname, fathername, national code,
// death date, block/part/row/grave numbers, neighbour.
preg_match_all("/<td class=\"alt2\" align=\"right\">(\\d*)<\\/td>.*<td class=\"alt1\" align=\"right\">(.*)<\\/td>.*<td class=\"alt2\" align=\"right\">(.*)<\\/td>.*<td class=\"alt1\" align=\"right\">(.*)<\\/td>.*<td class=\"alt2\" align=\"right\">(.*)<\\/td>.*<td class=\"alt2\" align=\"right\">(\\d*)<\\/td>.*<td class=\"alt1\" align=\"right\">(\\d*)<\\/td>.*<td class=\"alt2\" align=\"right\">(\\d*)<\\/td>.*<td class=\"alt1\" align=\"right\">(\\d*)<\\/td>.*<td class=\"alt2\" align=\"right\">(.*)<\\/td>/Usmi", $content, $output_array);
$amount = count($output_array[1]);
print $amount;
// Fix: the loop bound was "<=", which indexed one element past the end of
// every capture group and saved a final record of nulls.
for ($i = 0; $i < $amount; $i++) {
    $record = array('id' => $output_array[1][$i], 'fullname' => $output_array[2][$i], 'fathername' => $output_array[3][$i], 'codemelli' => $output_array[4][$i], 'deathdate' => $output_array[5][$i], 'blockno' => $output_array[6][$i], 'partno' => $output_array[7][$i], 'rowno' => $output_array[8][$i], 'graveno' => $output_array[9][$i], 'nextto' => $output_array[10][$i]);
    // NOTE(review): unique key 'data' is not a column of $record — kept to
    // match the original call, but probably meant array('id').
    scraperwiki::save_sqlite(array('data'), $record);
}
<?php
// Static lookup of English/Welsh regions (mySociety MapIt area ids, label,
// population in millions). The 'geo' field is filled with the serialized
// decoded GeoJSON boundary fetched from MapIt, then the whole set is saved
// keyed on id.
// Fix: the original contained this entire fetch-and-save sequence twice,
// verbatim, doubling every network request for an identical end state; it
// now runs once.
$regions = array(0 => array('id' => 11812, 'label' => 'North East', 'pop' => '2.6', 'geo' => ''), 1 => array('id' => 11807, 'label' => 'North West', 'pop' => '7.1', 'geo' => ''), 2 => array('id' => 11810, 'label' => 'Yorkshire and the Humber', 'pop' => '5.3', 'geo' => ''), 3 => array('id' => 11805, 'label' => 'East Midlands', 'pop' => '4.5', 'geo' => ''), 4 => array('id' => 11809, 'label' => 'West Midlands', 'pop' => '5.6', 'geo' => ''), 5 => array('id' => 11804, 'label' => 'Eastern England', 'pop' => '5.8', 'geo' => ''), 6 => array('id' => 11806, 'label' => 'London', 'pop' => '8.2', 'geo' => ''), 7 => array('id' => 11811, 'label' => 'South East', 'pop' => '8.6', 'geo' => ''), 8 => array('id' => 11814, 'label' => 'South West', 'pop' => '5.3', 'geo' => ''), 9 => array('id' => 11813, 'label' => 'Wales', 'pop' => '3.1', 'geo' => ''));
// Get GeoJSON of regions.
foreach ($regions as $key => $region) {
    $regions[$key]['geo'] = serialize(json_decode(file_get_contents('http://mapit.mysociety.org/area/' . $region['id'] . '.geojson')));
}
scraperwiki::save_sqlite(array("id"), $regions);
static function save_var($name, $value) {
    // Persist a named variable into the "swvariables" table as a string,
    // recording its original PHP type. Non-scalar values are flattened via
    // strval(), with a console warning since that is usually lossy.
    $vtype = gettype($value);
    $scalarTypes = array("integer", "string", "double", "NULL");
    if (!in_array($vtype, $scalarTypes)) {
        print_r("*** object of type {$vtype} converted to string\n");
    }
    $row = array("name" => $name, "value_blob" => strval($value), "type" => $vtype);
    scraperwiki::save_sqlite(array("name"), $row, "swvariables");
}
// // // Find something on the page using css selectors
// $dom = new simple_html_dom();
// $dom->load($html);
// print_r($dom->find("table.list"));
// // // Write out to the sqlite database using scraperwiki library
// scraperwiki::save_sqlite(array('name'), array('name' => 'susan', 'occupation' => 'software developer'));
// // // An arbitrary query against the database
// scraperwiki::select("* from data where 'name'='peter'")
// You don't have to do things with the ScraperWiki library. You can use whatever is installed
// on Morph for PHP (See https://github.com/openaustralia/morph-docker-php) and all that matters
// is that your final data is written to an Sqlite database called data.sqlite in the current working directory which
// has at least a table called data.
require 'scraperwiki.php';
require 'scraperwiki/simple_html_dom.php';
// Read in one eBay search-results page (American Revolutionary War listings).
$html = scraperwiki::scrape("http://www.ebay.com/sch/i.html?_from=R40&_trksid=p2050601.m570.l1313.TR0.TRC0.H0.XAmerican+Revolutionary+War&_nkw=American+Revolutionary+War&_sacat=0");
// Find listing-title links using css selectors and dump them to stdout.
$dom = new simple_html_dom();
$dom->load($html);
print_r($dom->find("h3[class='lvtitle'] a"));
// NOTE(review): this save writes the Morph template's placeholder row, not
// the scraped listings — the scraper appears unfinished.
scraperwiki::save_sqlite(array('name'), array('name' => 'susan', 'occupation' => 'software developer'));
// // // An arbitrary query against the database
// scraperwiki::select("* from data where 'name'='peter'")
// NOTE(review): this chunk is the interior of an enclosing structure — the
// stray "}" after the closing </div> append belongs to a loop/branch that
// begins outside this view; $obj and $title are also defined elsewhere.
$record = "<div class=\"kiji\">";
// Build the article body: strip comments, <p> wrappers, anchors and all
// remaining tags from each node, skipping empty lines, headings and
// date/cl divs.
foreach ($obj->nodes as $v) {
    $v = preg_replace("/<!\\-\\-.*?\\-\\->/", "", $v->outertext());
    $v = preg_replace("/(<p>|<\\/p>)/", "", $v);
    $v = preg_replace("/<a .*?<\\/a>/", "", $v);
    $v = strip_tags($v);
    if (strlen(trim($v)) == 0) {
        continue;
    }
    if (preg_match("/<h1/", $v)) {
        continue;
    }
    if (preg_match("/<div +class=(\"|')date/", $v)) {
        continue;
    }
    if (preg_match("/<div +class=(\"|')cl/", $v)) {
        continue;
    }
    $record .= $v . "\n";
}
$record .= "</div>";
}
// Convert from Shift-JIS to UTF-8 and prepend the title before saving the
// single aggregated news record (fixed id "1", overwritten each run).
$title = mb_convert_encoding($title, "utf8", "sjis-win");
$record = mb_convert_encoding($record, "utf8", "sjis-win");
$record = $title . $record;
//echo $title;
//echo "\n";
//echo $record;
$date = date('Y/m/d H:i:s');
scraperwiki::save_sqlite(array("id"), array("id" => "1", "date" => $date, "news" => $record));
// NOTE(review): chunk interior — $html comes from an enclosing scope not
// visible here, and the trailing "}" closes a structure that begins outside
// this view. The repeated str_replace(' ', ...) calls strip different
// space-like padding characters (&nbsp; variants) from the scraped text.
$name = $html->find('b', 5)->innertext;
$name = strip_tags($name);
$name = str_replace('"', "", $name);
$name = str_replace(' ', "", $name);
$name = str_replace('&', "", $name);
$name = str_replace(' ', "", $name);
$name = str_replace(' ', "", $name);
$name = str_replace(' ', "", $name);
$name = strip_tags($name);
// Roll number: drop the label prefix, then the same padding cleanup.
$number = $html->find('b', 6)->plaintext;
$number = trim(str_replace("Roll Number :", "", $number));
$number = strip_tags($number);
$number = str_replace('"', "", $number);
$number = str_replace(' ', "", $number);
$number = str_replace('&', "", $number);
$number = str_replace(' ', "", $number);
$number = str_replace(' ', "", $number);
$number = str_replace(' ', "", $number);
$number = strip_tags($number);
$number = trim(str_replace(" ", "", $number));
// Result status and the five subject-mark cells.
$res = $html->find('b', 7)->plaintext;
$res = trim(str_replace("Result :", "", $res));
$bee = $html->find('td', 11)->plaintext;
$bmec = $html->find('td', 13)->plaintext;
$be = $html->find('td', 15)->plaintext;
$faa = $html->find('td', 17)->plaintext;
$total = $html->find('td', 19)->plaintext;
// Save only when a roll number was actually extracted.
if ($number) {
    $message = scraperwiki::save_sqlite(array("number"), array("number" => $number, "name" => $name, "bee" => $bee, "bmec" => $bmec, "be" => $be, "faa" => $faa, "result" => $res, "total" => $total), $table_name = "swdata");
}
}
// NOTE(review): this script appears twice back-to-back (the second copy is
// preceded by the require of simple_html_dom, which the first copy relies on
// having been loaded earlier) — looks like a duplicated paste; the end state
// is identical either way. Confirm before deduplicating.
$html = file_get_html('http://nadaguides.com/Cars/1996/Lincoln/Continental-V8/Sedan-4D/Values');
// Find table that contains prices.
$table = $html->find('table[class]');
// Find price last cell (20th <td> of the first matching table).
$td = $table[0]->find('td');
$data = $td[19]->plaintext;
// Page title (vehicle name) from the first <h1>.
$h1 = $html->find('h1');
$h1clean = $h1[0]->plaintext;
// Remove the dollar sign and thousands separators from the price.
$data2 = str_replace("\$", "", $data);
$data2 = str_replace(",", "", $data2);
$message = scraperwiki::save_sqlite(array("price"), array("price" => $data2, "title" => $h1clean));
require 'scraperwiki/simple_html_dom.php';
// Create DOM from URL or file (duplicate run — see note above).
$html = file_get_html('http://nadaguides.com/Cars/1996/Lincoln/Continental-V8/Sedan-4D/Values');
// Find table that contains prices.
$table = $html->find('table[class]');
// Find price last cell.
$td = $table[0]->find('td');
$data = $td[19]->plaintext;
// Page title from the first <h1>.
$h1 = $html->find('h1');
$h1clean = $h1[0]->plaintext;
// Remove special characters.
$data2 = str_replace("\$", "", $data);
$data2 = str_replace(",", "", $data2);
$message = scraperwiki::save_sqlite(array("price"), array("price" => $data2, "title" => $h1clean));