$info['against'] = trim(get_first_string($part, 'proti:</span>', '<br')) == '' ? 0 : trim(get_first_string($part, 'proti:</span>', '<br')); $info['abstain'] = trim(get_first_string($part, 'zdržel se:</span>', '<br')) == '' ? 0 : trim(get_first_string($part, 'zdržel se:</span>', '<br')); $info['number_representatives'] = trim(get_first_string($part, 'Počet zastupitelů:</span>', '<br')) == '' ? 0 : trim(get_first_string($part, 'Počet zastupitelů:</span>', '<br')); $info['present'] = trim(get_first_string($part, 'přítomno:</span>', '<br')) == '' ? 0 : trim(get_first_string($part, 'přítomno:</span>', '<br')); $trs = $dom->find("table[class=data-grid]", 0)->find("tr"); array_shift($trs); $data = array(); foreach ($trs as $tr) { $tds = $tr->find("td"); $mp_id = get_first_string($tds[0]->find("a", 0)->href . "&", "memberId=", "&"); $data[] = array('division_id' => $info['id'], 'mp_id' => $mp_id, 'vote' => trim($tds[1]->plaintext), 'mp_name' => $tds[0]->plaintext); } //one division done scraperwiki::save_sqlite(array('id'), $info, 'division'); scraperwiki::save_sqlite(array('division_id', 'mp_id'), $data, 'mp_vote'); scraperwiki::save_var('last_id', $info['id']); } /** * finds substrings between opening and closing markers * @return result array of the substrings */ function returnSubstrings($text, $openingMarker, $closingMarker) { $openingMarkerLength = strlen($openingMarker); $closingMarkerLength = strlen($closingMarker); $result = array(); $position = 0; while (($position = strpos($text, $openingMarker, $position)) !== false) { $position += $openingMarkerLength; if (($closingMarkerPosition = strpos($text, $closingMarker, $position)) !== false) { $result[] = substr($text, $position, $closingMarkerPosition - $position);
/**
 * Persists a named metadata value via the ScraperWiki key/value store.
 *
 * @param string $metadata_name key under which the value is stored
 * @param mixed  $value         value to persist
 * @return mixed whatever scraperwiki::save_var() returns
 */
static function save_metadata($metadata_name, $value)
{
    $result = scraperwiki::save_var($metadata_name, $value);
    return $result;
}
// Collect the href of every anchor in the content table; these are the
// section pages to visit next.  NOTE(review): $dom must already hold a
// parsed page from earlier in the script — not visible in this chunk.
foreach ($dom->find("table.contenttable a") as $data) {
    $sectionlinks[] = $data->href;
}
$i = 0;
// Fetch each section page and harvest its "next page" navigation links.
foreach ($sectionlinks as $link) {
    $sections[] = scraperWiki::scrape("http://www.ruralfinance.org/" . $link);
    // Re-parse the page just fetched (end() returns the last element pushed).
    $dom = new simple_html_dom();
    $dom->load(end($sections));
    // NOTE(review): simple_html_dom uses CSS-style selectors; the
    // XPath-flavoured "div[@id='navNext']" may not match — confirm.
    foreach ($dom->find("div[@id='navNext'] a") as $data) {
        $pagetwolinks[] = htmlspecialchars_decode($data->href);
    }
}
$i = 0;
// Visit each second-level page and accumulate discussion links from its table.
foreach ($pagetwolinks as $link) {
    $it = "";
    $pagetwos[] = scraperWiki::scrape("http://www.ruralfinance.org/" . $link);
    $dom = new simple_html_dom();
    $dom->load(end($pagetwos));
    $j = 0;
    foreach ($dom->find("table.ab tr") as $data) {
        // Skip the header row ($j == 0).
        if ($j != 0) {
            $as = $data->find("td a");
            // NOTE(review): $as[0] is a DOM element coerced to its HTML string
            // by strpos(); "> 0" also rejects a match at position 0 —
            // presumably fine because the URL sits inside an <a ...> tag.
            if (strpos($as[0], "http://www.ruralfinance.org/discussion/") > 0) {
                $it .= $as[0] . "<br>";
            }
        }
        $j++;
    }
    // Persist the accumulated link HTML for this page under a numeric key.
    scraperwiki::save_var($i, $it);
    $i++;
}
// $key ends up holding the text of the LAST node matched by the earlier
// XPath query; prior matches are overwritten on each iteration.
foreach ($entries as $entry) {
    $key = $entry->nodeValue;
}
$num = 1;
// Pull every 4-digit run (presumably years — confirm) out of the captured
// text and store each under an increasing numeric key.
preg_match_all('/(\\d{4})/', $key, $matches);
foreach ($matches[0] as $val) {
    scraperwiki::save_var($num, $val);
    //print scraperwiki::get_var('last_page');
    $num++;
}
// Second table: same 4-digit extraction, continuing the same counter.
$xpathExpr2 = '/html/body/form/table/tbody/tr[3]/td[1]/table/tr/td[2]/table[3]';
$entries2 = $xpath->query($xpathExpr2);
foreach ($entries2 as $entry2) {
    $key2 = $entry2->nodeValue;
}
preg_match_all('/(\\d{4})/', $key2, $matches2);
foreach ($matches2[0] as $val) {
    scraperwiki::save_var($num, $val);
    $num++;
}
// Third table: split its text on spaces to pick out day and date fields.
$xpathExpr3 = '/html/body/form/table/tbody/tr[3]/td[1]/table/tr/td[2]/table[1]';
$entries3 = $xpath->query($xpathExpr3);
foreach ($entries3 as $entry3) {
    $key3 = $entry3->nodeValue;
}
$matches3 = explode(" ", $key3);
// NOTE(review): index positions 2 / 1 below depend on the exact whitespace
// layout of the source page — verify against a live response.
scraperwiki::save_var("day", $matches3[2]);
$match = explode("\n", $matches3[1]);
scraperwiki::save_var("date", $match[2]);
//scraperwiki::save_var($num, $val);
//$num++;
return null; } $blacklist = array(); $url = "http://www.coastandcountry.co.uk/cottage-details/"; scraperwiki::attach("coastandcountrycouk"); # get an array of the cottage data to scrape $cottData = scraperwiki::select("COTTAGE_URL, PRICE_HIGH, PRICE_LOW from 'coastandcountrycouk'.SWDATA order by COTTAGE_URL"); $placeholder = scraperwiki::get_var("cottID"); if ($placeholder != "") { $index = searchForId($placeholder, $cottData); $cottData = array_splice($cottData, $index); } require 'scraperwiki/simple_html_dom.php'; $dom = new simple_html_dom(); foreach ($cottData as $value) { scraperwiki::save_var("cottID", $value['COTTAGE_URL']); // check the cottage url against the blacklist foreach ($blacklist as $blItem) { if ($value['COTTAGE_URL'] == $blItem) { continue 2; } } //load the page into the scraper $html = scraperWiki::scrape($url . $value['COTTAGE_URL']); $dom->load($html); $feature = ""; $image = ""; $imgURL = ""; $xtraFeatures = ""; /* Get the Data */ // get Cottage name
} $i++; } require_once 'scraperwiki/simple_html_dom.php'; scraperwiki::save_var('dummy', 0); $maxpages = 46; $i = 1; $dom = new simple_html_dom(); $placeholder = scraperwiki::get_var("page"); if ($placeholder) { $i = $placeholder; } $j = 0; $counter = 0; while ($i <= $maxpages) { scraperwiki::save_var("page", $i); $url = "http://www.norfolkcottages.co.uk/cottage-search/amount-10/page-" . $i; // Load HTML from a URL $html = file_get_html($url); $dom->load($html); // get the list of cottages #echo $dom; // page foreach ($dom->find('div[id=search-results-container]') as $page) { // cottage foreach ($dom->find('div[class=property-result-container]') as $page) { foreach ($page->find('div[class=middle-container]') as $cottage) { $cottage_name = ""; # cottage name, URL and ID foreach ($cottage->find('h2') as $cottageNameData) { foreach ($cottageNameData->find('a') as $cottageURLData) {
$mp = trim(substr($td->innertext, 1)); $club = $h2s[$key]->innertext; $votes[] = array('division_id' => $html['id'], 'vote' => $vote, 'mp' => $mp, 'club' => $club); } } //print_r($votes);die(); scraperwiki::save_sqlite(array('division_id', 'mp'), $votes, 'vote'); /*$tds = $table->find('td'); if (count($tds) > 0) { foreach ($tds as $td) { //echo $td->outertext; $h3 = $td->find('h3',0); if ($h3 != '') { $party = $h3->innertext; } else { $vote = substr($td->innertext,0,1); $mp = trim(substr($td->innertext,1)); if ($mp != '') $votes[] = array( 'division_id' => $html['id'], 'vote' => $vote, 'mp' => $mp, 'club' => $party, ); } } scraperwiki::save_sqlite(array('division_id','mp'),$votes,'vote'); } */ scraperwiki::save_var('last_id', $html['id']); } }
/**
 * Scrapes paginated cottage search results and saves each cottage's URL
 * plus its high/low weekly prices into the ScraperWiki datastore.
 *
 * Progress is bookmarked in the 'page' variable so an interrupted run
 * resumes at the page it stopped on.
 *
 * @param string $url             base search URL; the page number is appended
 * @param int    $CurrentMaxPages last page number to scrape (inclusive)
 * @return void
 */
function get_details($url, $CurrentMaxPages)
{
    // ScraperWiki DOM helper; one reusable parser instance.
    require_once 'scraperwiki/simple_html_dom.php';
    $dom = new simple_html_dom();

    // Resume from the bookmarked page when one was saved, else start at 1.
    // Strict check avoids treating a legitimate stored value loosely.
    $getPage = scraperwiki::get_var("page");
    $page = ($getPage !== null && $getPage !== "") ? $getPage : 1;

    while ($page <= $CurrentMaxPages) {
        // Bookmark progress before doing the work for this page.
        scraperwiki::save_var("page", $page);

        // Fetch and parse this results page.
        $html = scraperWiki::scrape($url . $page);
        $dom->load($html);

        // Each results page carries up to 12 numbered result slots.
        for ($i = 0; $i < 12; $i++) {
            // Reset per-slot fields so a missing element cannot leak the
            // previous slot's values into this record (bug in original:
            // $cotturl / prices were never re-initialised per iteration).
            $cotturl = "";
            $price_low = "";
            $price_high = "";

            // Cottage URL: strip the common "/cottages/" prefix.
            foreach ($dom->find('a[id=SearchResult1_linkTo_' . $i . ']') as $data) {
                $cotturl = str_replace("/cottages/", "", $data->getAttribute('href'));
            }

            // High/low prices: strip the label text, then split "low-high".
            foreach ($dom->find('span[id=featureBoxPropertyWasPricePoundPr_' . $i . ']') as $data) {
                $prices = str_replace("Prices from ", "", $data->plaintext);
                $prices = str_replace(" based on available 7 nights", "", $prices);
                $prices = str_replace("£", "", $prices);
                $prices = explode("-", $prices);
                $price_low = $prices[0];
                // Guard: some listings show a single price with no "-".
                $price_high = isset($prices[1]) ? $prices[1] : $prices[0];
            }

            // Only save slots that actually contained a result.
            if ($cotturl !== "") {
                $record = array(
                    'COTTAGE_URL' => trim($cotturl),
                    'PRICE_HIGH' => trim($price_high),
                    'PRICE_LOW' => trim($price_low),
                );
                scraperwiki::save(array('COTTAGE_URL'), $record);
            }
        }

        // Move on to the next results page.
        $page++;
    }
}
scraperwiki::save_sqlite(array('date'), $data, 'info'); //print_r($data); /*charts*/ $data_chart = array(); //html $url = 'http://www.ceskatelevize.cz' . $charts_link; $html = scraperwiki::scrape($url); //get dom $dom = new simple_html_dom(); $dom->load($html); $script = $dom->find('script', 2); $ar1 = explode(']]', $script->innertext); //chart 1 $ar2 = explode('[[[', str_replace("\t", '', str_replace("\n", '', str_replace(' ', '', $ar1[0])))); $ar3 = explode('],[', trim(trim($ar2[1]), ']')); foreach ($ar3 as $row) { $ar4 = explode(',', $row); $data_chart[] = array('date' => $date->format('Y-m-d'), 'chart' => '1', 'minute' => $ar4[0], 'value' => $ar4[1]); } //chart 2 $ar2 = explode('[[[', str_replace("\t", '', str_replace("\n", '', str_replace(' ', '', $ar1[1])))); $ar3 = explode('],[', trim(trim($ar2[1]), ']')); foreach ($ar3 as $row) { $ar4 = explode(',', $row); $data_chart[] = array('date' => $date->format('Y-m-d'), 'chart' => '2', 'minute' => $ar4[0], 'value' => $ar4[1]); } scraperwiki::save_sqlite(array('date', 'chart', 'minute'), $data_chart, 'chart'); //print_r($data_chart); scraperwiki::save_var('last_date', $date->format('Y-m-d')); $date->add(new DateInterval('P1D')); }
$i++; } } //print_r($data); scraperwiki::save_var("date", $data[0]); scraperwiki::save_var("1yearcmt", $data[6]); $html->__destruct(); # Blank PHP print "Testing getting 1 year CMT from Bankrate.\n"; $html_content = scraperWiki::scrape("http://www.bankrate.com/rates/interest-rates/1-year-cmt.aspx"); //print $html . "\n"; require 'scraperwiki/simple_html_dom.php'; $html = str_get_html($html_content); $data[0] = ""; foreach ($html->find("div.interactivetopaction ") as $el) { //print $el . "\n"; //print "1 year CMT ". $el->innertext . "\n"; $data[0] = $el->innertext; } $i = 1; foreach ($html->find("div.boxcontent") as $box) { foreach ($box->find("td") as $el) { //print $el->innertext . "\n"; $data[$i] = $el->innertext; $i++; } } //print_r($data); scraperwiki::save_var("date", $data[0]); scraperwiki::save_var("1yearcmt", $data[6]); $html->__destruct();
/**
 * Convenience wrapper around the ScraperWiki key/value store.
 *
 * @param string $key   name to store the value under
 * @param mixed  $value value to persist
 * @return void
 */
function setVariable($key, $value)
{
    scraperwiki::save_var($key, $value);
}
scraperwiki::save_var('cno_max', $MaxCoNo); //$MaxCoNo = 10; $MaxDbCoNo = scraperwiki::get_var('cno_dbmax'); scraperwiki::save_var('cno_dbmax_old', $MaxDbCoNo); //$MaxDbCoNo = 1; // RUN for first time. $MaxCoNo = 50000; //it's workaround. echo $MaxDbCoNo . "->" . $MaxCoNo . "\r\n"; for ($iCom = $MaxDbCoNo; $iCom <= $MaxCoNo; $iCom++) { unset($ComData); $ComData = GetCommunityData($iCom); echo $ComData['cid'] . "/" . $ComData['ctype'] . "\r\n"; //var_dump($ComData); SaveCommunityData($ComData); if ($iCom % 50 == 0) { scraperwiki::save_var('cno_dbmax', $iCom); } //break; } exit; function SaveCommunityData($ArrComData) { if (!array_key_exists("cid", $ArrComData)) { return; } scraperwiki::save_sqlite(array("cid"), $ArrComData); } function GetMaxCoNumber() { $html = scraperWiki::scrape(URL_NEW); $dom = new simple_html_dom();
/**
 * Walks every church row in the local `churches` table and scrapes the
 * statistics page for each one, resuming from a saved index.
 *
 * Progress is checkpointed in the 'IndxStat' variable after every church so
 * an interrupted run restarts where it left off; the counter is reset to 0
 * once the whole table has been processed.
 */
function scrape_stats()
{
    global $debug;
    $CHI = 0;
    $badCnt = 0;
    // Resume index.  NOTE(review): scraperwiki::get_var() takes (name, default);
    // the third argument "0" looks spurious — confirm intended signature.
    $CHI = scraperwiki::get_var('IndxStat', $CHI, "0");
    $churches = scraperwiki::select("church_id from churches");
    //print "Scrape Church Stats @" . $CHI . " of " . sizeof($churches). " " . intval($CHI / sizeof($churches) * 100) . "%\n" ;
    while ($CHI < sizeof($churches)) {
        $church = $churches[$CHI];
        $cid = $church['church_id'];
        // Progress line every 100 churches (always when debugging).
        if ($debug || $CHI % 100 == 0) {
            print "Stats church Empty=" . $badCnt . " " . $CHI . " of " . sizeof($churches) . " " . intval($CHI / sizeof($churches) * 100) . "%\n";
        }
        // A 0 return is counted as an empty/bad result ("Empty=" above);
        // exact semantics live in scrape_one_stat(), defined elsewhere.
        if (scrape_one_stat($cid) == 0) {
            $badCnt++;
        }
        //print "Stats@" . $CHI . "/" . $cid . "\n";
        $CHI++;
        // Checkpoint after every church so a crash loses at most one.
        scraperwiki::save_var('IndxStat', $CHI);
    }
    print "Scrape Stats Done Churches=" . $CHI . " empty=" . $badCnt . "\n";
    // Reset so the next run starts from the beginning.
    scraperwiki::save_var('IndxStat', 0);
}
scraperwiki::save_sqlite(array('match_id', 'home_visitor', 'rank'), $goals_visitor, 'goal'); //cards $tr = $table->find('tr', 1); $tds = $tr->find('td'); $yellow_home = str2action($tds[0]->innertext, 'home', $match_id, 'yellow'); scraperwiki::save_sqlite(array('match_id', 'home_visitor', 'card', 'rank'), $yellow_home, 'card'); $yellow_visitor = str2action($tds[2]->innertext, 'visitor', $match_id, 'yellow'); scraperwiki::save_sqlite(array('match_id', 'home_visitor', 'card', 'rank'), $yellow_visitor, 'card'); $tr = $table->find('tr', 2); $tds = $tr->find('td'); $red_home = str2action($tds[0]->innertext, 'home', $match_id, 'red'); scraperwiki::save_sqlite(array('match_id', 'home_visitor', 'card', 'rank'), $red_home, 'card'); $red_visitor = str2action($tds[2]->innertext, 'visitor', $match_id, 'red'); scraperwiki::save_sqlite(array('match_id', 'home_visitor', 'card', 'rank'), $red_visitor, 'card'); } scraperwiki::save_var('last_id', $match_id); } function str2action($str, $home_visitor, $match_id, $card = null) { if ($str != '-') { $out = array(); $rank = 1; $goals_ar = explode('<br>', $str); foreach ($goals_ar as $goalstr) { $fake_dom = new simple_html_dom(); $fake_dom->load('<html><body>' . $goalstr . '</body></html>'); if (is_object($fake_dom->find('a', 0))) { $min_ar = explode('<', $goalstr); $minute = trim(trim($min_ar[0]), '.'); $player = $fake_dom->find('a', 0)->plaintext; $player_link = $fake_dom->find('a', 0)->href;
// used for debugging if ($run_environment == 'dev') { if (empty($records)) { $records = get_mayors($result); } else { $records = array_merge($records, get_mayors($result)); } } else { get_mayors($result); // sleep(10); // this might be needed on scraperwiki.com } // reset the progress bookmark scraperwiki::save_var('last_state', ''); } // Set state of scraper to complete so we know it didn't crash part way thru scraperwiki::save_var('scraper_state', 'complete'); // if testing if ($run_environment == 'dev') { header('Content-type: application/json'); print json_encode($records); } function get_post_response($url, $fields) { $fields_string = ''; //url-ify the data for the POST foreach ($fields as $key => $value) { $fields_string .= $key . '=' . $value . '&'; } rtrim($fields_string, '&'); //open connection $ch = curl_init();
} if ($k < 50) { $tempstr = $temparr2[1] . ";1;" . $arrName[0] . ";" . $strCat . ";" . $intRetail . ";30;" . $intPrice . ";;;;;;" . $temparr2[1] . ";" . $temparr2[1] . ";rem;" . ";;;;0.25;999;" . $strBrah . $arrBrand[0] . " " . $arrName[0] . " (" . $temparr2[3] . ");" . $arrDescStr . ";" . ";;;;" . $arrName[0] . ";;;1;;1;..//img/sun/" . $temparr2[1] . ".jpg;;;;"; } /* $tempstr = $temparr2[$i][1] . "; 1;" . $arrName[0] . ";;;;;;;;;;" . $temparr2[$i][1] . "; " . $temparr2[$i][1] . "; " . "rem; " . ";;;;" . "0.25; 999;" . $arrBrand[0] . " " . $arrName[0] . " (" . $temparr2[$i][3] . ");" . $arrDescStr . ";" . ";;;;" . $arrName[0] . ";" . ";;1;;1;..//img/" . $temparr2[$i][0] . ";" . ";;;"; */ //Saving data for Products.csv // $strVarName = $arrBrand[0] . " " . $arrName[0] . $temparr2[3]; // scraperwiki::save_var($strVarName, $tempstr); //print_r($i); //print_r($strVarName); //print_r($strVarName); // print_r($temparr2); //Saving images $image_url = "http://www.remeyewear.com/showimage.aspx?img=" . $temparr2[0] . ".jpg&sku=" . $temparr2[1] . "&w=667"; $image_local = $temparr2[1] . ".jpg,"; scraperwiki::save_var($image_local, $image_url); //copy( $image_url, '/tmp/' . $image_local ); } //end for count($arr) //print_r($arrofarr); //print_r($arrofarr[2][2]); //print_r($temparr2); /* $str = $arrofarr[0][1] . "; 1;" . $arrName[0] . ";;;;;;;;;;" . $arrofarr[0][1] . "; " . $arrofarr[0][1] . "; " . "rem; " . ";;;;" . "0.25; 999;" . $arrBrand[0] . " " . $arrName[0] . " (" . $arrofarr[0][3] . ");" . $arrDescStr . ";" . ";;;;" . $arrName[0] . ";" . ";;1;;1;..//img/" . $arrofarr[0][0] . ";" . ";;;"; echo $str ; echo "****************************" ; $str1 = $arrofarr[1][1] . "; 1;" . $arrName[0] . ";;;;;;;;;;" . $arrofarr[1][1] . "; " . $arrofarr[1][1] . "; " . "rem; " . ";;;;" . "0.25; 999;" . $arrBrand[0] . " " . $arrName[0] . " (" . $arrofarr[1][3] . ");" . $arrDescStr . ";" . ";;;;" . $arrName[0] . ";" . ";;1;;1;..//img/" . $arrofarr[1][0] . ";" . ";;;";
# Determine the URL of the linked page (e.g. the info for 'WRK' $href = html_entity_decode($row->find('a', 0)->href); $url = $domain . $href; # Get the postcode from the linked page $html2 = scraperWiki::scrape($url); $dom2 = new simple_html_dom(); $dom2->load($html2); $address = trim($dom2->find('address', 0)->plaintext); $lines = explode("\n", $address); $postcode = trim(array_pop($lines)); # Convert postcode to lat/lon list($lat, $lng) = scraperwiki::gb_postcode_to_latlng($postcode); //$postcodeTrimmed = str_replace (' ', '', $postcode); //$latlng = scraperwiki::select("* from ukp.swdata where postcode='{$postcodeTrimmed}';"); # Assemble the record $station = array('code' => trim($cols[1]->plaintext), 'name' => html_entity_decode(trim($cols[0]->plaintext)), 'postcode' => $postcode, 'latitude' => $lat, 'longitude' => $lng, 'url' => $url); # Save the record scraperwiki::save(array('code'), $station); print_r($record); # Limit while testing //if ($i == 10) {break;} # Save the current position scraperwiki::save_var('run_first', $i); } /* Useful pages: http://scraperwiki.com/scrapers/swansea_food_safety_inspections_1/edit/ http://scraperwiki.com/docs/php/php_intro_tutorial/ http://scraperwiki.com/docs/php/php_help_documentation/ http://scraperwiki.com/views/postcode_and_geo_cheat_sheet/edit/ */
//$presence = str_replace(' ','',trim($tds[2]->plaintext)); //simple_html_dom.php $presence = str_replace(' ', '', trim(strip_tags('<td' . $tds[2]))); } else { //vote $d = array('date' => $date, 'session' => $session, 'presence' => $presence, 'name' => trim(strip_tags('<td' . $tds[1])), 'vote' => trim(strip_tags('<td' . $tds[3])), 'mp_id' => $html[0]['mp_id'], 'term' => $html[0]['term']); $data[] = $d; } } scraperwiki::save_sqlite(array('term', 'mp_id', 'date', 'name'), $data, 'vote'); } scraperwiki::save_var('last_term', $html[0]['term']); scraperwiki::save_var('last_mp_id', $html[0]['mp_id']); } } scraperwiki::save_var('last_term', 0); scraperwiki::save_var('last_mp_id', 0); /** * finds substrings between opening and closing markers * @return result array of the substrings */ function returnSubstrings($text, $openingMarker, $closingMarker) { $openingMarkerLength = strlen($openingMarker); $closingMarkerLength = strlen($closingMarker); $result = array(); $position = 0; while (($position = strpos($text, $openingMarker, $position)) !== false) { $position += $openingMarkerLength; if (($closingMarkerPosition = strpos($text, $closingMarker, $position)) !== false) { $result[] = substr($text, $position, $closingMarkerPosition - $position); $position = $closingMarkerPosition + $closingMarkerLength;
$options = $divs[0]->find('option[selected=selected]'); $term = $options[0]->value; //compare saved term $saved_term = scraperwiki::get_var('current_term'); $info = scraperwiki::show_tables(); if ($term != $saved_term) { if (isset($info['club'])) { scraperwiki::sqliteexecute("delete from club"); scraperwiki::sqlitecommit(); } if (isset($info['membership'])) { scraperwiki::sqliteexecute("delete from membership"); scraperwiki::sqlitecommit(); } } scraperwiki::save_var('current_term', $term); //current clubs foreach ($ids as $i) { $url = "http://www.nrsr.sk/web/Default.aspx?sid=poslanci/kluby/klub&ID=" . $i; $html = scraperwiki::scrape($url); //get dom $dom = new simple_html_dom(); $dom->load($html); //is it a valid club (or empty) $h2s = $dom->find("h2"); if (trim($h2s[0]->plaintext) == 'Neočakávaná chyba!') { } else { //club if (isset($info['club'])) { scraperwiki::sqliteexecute("delete from club where id = '{$i}'"); scraperwiki::sqlitecommit();
if (strpos($html, 'Para esta zona no hay elección') > 0) { continue; } $data = get_data($dom, $house, $sex, $year, $number - 400); scraperwiki::save_sqlite(array('year', 'house', 'district', 'i', 'sex'), $data); } scraperwiki::save_var('last_number', 400); $last_number = 400; } scraperwiki::save_var('last_year', 2009); $last_year = 2009; } scraperwiki::save_var('last_sex', 0); $last_sex = 0; } scraperwiki::save_var('last_house', 0); $last_house = 0; function get_data($dom, $house, $sex, $year, $district) { $data = array(); $i = 1; $table = $dom->find('table[CellPadding=4]', 0); $trs = $table->find('tr'); array_shift($trs); array_pop($trs); foreach ($trs as $tr) { $row = array('year' => $year, 'house' => $house, 'sex' => $sex, 'district' => $district); $tds = $tr->find('td'); $row['name'] = $tds[0]->plaintext; $row['party'] = $tds[1]->plaintext; $row['votes'] = str_replace('.', '', trim($tds[2]->plaintext));
$value = intval($columns[1]->plaintext); // Some rows are empty, and we should exclude them. if (strlen($attribute) > 0) { //print $attribute . ": "; //print $value . "\n"; $data["id"] = $productId; $data[$attribute] = $value; } } // Get the price. $priceDom = $dom->find("td[background*=add_cart_bgd02.jpg]"); $price = floatval(str_replace('$', "", $priceDom[0]->plaintext)); $data["price"] = $price; // Calculate out a few extra fields: $cells = $data["Config(s)"]; $capacity = $data["Capacity(mAh)"]; $energy = $cells * 3.7 * $capacity / 1000; $value = $energy / $price; $data["Energy (Wh)"] = $energy; $data["Value (Wh/\$)"] = $value; //print_r($data); scraperwiki::save(array("id"), $data); scraperwiki::save_var("currentId", $productId); $loopCount++; } // Check to see if we have scraped everything - if so, start again! $lastBattery = end($batteries); if ($lastBattery['id'] == $productId) { print "All known batteries processed. Clearing progress marker so scraper can start again."; scraperwiki::save_var("currentId", -1); }
scraperwiki::save(array('VOTOS'), $record); $saved_counter++; scraperwiki::save_var("LastRunCounter", $saved_counter); } } scraperwiki::save_var("MesaCounter", $mesa_counter); } $mesa_counter++; } // Mesa scraperwiki::save_var("CentroCounter", $centro_counter); } $centro_counter++; } // Centro scraperwiki::save_var("ParroquiaCounter", $parroquia_counter); } $parroquia_counter++; } // Parroquia scraperwiki::save_var("MunicipioCounter", $municipio_counter); } $municipio_counter++; } // Municipio scraperwiki::save_var("EstadoCounter", $estado_counter); } $estado_counter++; } // Estado # print $html . "\n";
$twitter = str_replace("@", "", $link->data); $twitter = preg_replace("/\\s+/", "", $twitter); $twitter = str_replace("Twitter:", "", $twitter); $OBJ['twitter'] = $twitter; // Klout (based on Twitter) $klout = scraperWiki::scrape('http://api.klout.com/v2/identity.json/twitter?screenName=' . $twitter . '&key=v23b2ddvdf8n5fvap95kk56r'); $klout = json_decode($klout); $klout = scraperWiki::scrape('http://api.klout.com/v2/user.json/' . $klout->id . '?key=v23b2ddvdf8n5fvap95kk56r'); $klout = json_decode($klout); $OBJ['klout'] = $klout->score->score; } if ($link->label == 'Profile URL:') { $profile_url = str_replace("Profile URL:", "", $link->data); $profile_url = preg_replace("/\\s+/", "", $profile_url); $OBJ['profile_url'] = $profile_url; } if ($link->label == 'LinkedIn:') { $LinkedIn = str_replace("LinkedIn:", "", $link->data); $LinkedIn = preg_replace("/\\s+/", "", $LinkedIn); $OBJ['linkedIn'] = $LinkedIn; } } // Clean certifications $certifications = array_unique(json_decode($row['certifications'])); $OBJ['certifications'] = json_encode($certifications); // Geo scraperwiki::save_sqlite(array('id', 'name', 'company', 'location', 'date', 'url', 'profile', 'twitter', 'klout', 'profile_url', 'linkedIn', 'certifications'), $OBJ); scraperwiki::save_var('last_page', $counter); $counter = $counter + 1; print_r($counter); }
scraperwiki::save(array('company'), $record); //print json_encode($record) . "\n"; scraperwiki::save_var('last', $profile_no); } } //scraperwiki::save_var('last', 0); scraperwiki::attach("find_4n_profiles"); $links = scraperwiki::select("profile from find_4n_profiles.swdata"); require 'scraperwiki/simple_html_dom.php'; $profile = new simple_html_dom(); foreach ($links as $link) { set_time_limit(0); $profile_no = intval(str_replace('http://www.4networking.biz/Members/Details/', '', $link['profile'])); if ($profile_no > scraperwiki::get_var('last')) { $html = scraperWiki::scrape($link['profile']); $profile->load($html); if (!($company = $profile->find("//*[@id='main']/div[1]/div/div[1]/div[2]/div[1]/div[1]/span/span", 0)->title)) { $company = $profile->find("//*[@id='main']/div[1]/div/div[1]/div[2]/div[1]/div[1]/span/span", 0)->plaintext; } $website = $profile->find("span.orange-text a", 0) ? $profile->find("span.orange-text a", 0)->href : ''; if ($profile->find("div.blue3-empty-box div.content div.word-wrap", 0)) { $info = $profile->find("div.blue3-empty-box div.content div.word-wrap", 0)->plaintext; } else { $info = ''; } $record = array('name' => $profile->find("//div/a/span", 1)->plaintext, 'company' => $company, 'phone' => $profile->find("strong.big-blue3-text span", 0)->plaintext, 'website' => $website); scraperwiki::save(array('company'), $record); //print json_encode($record) . "\n"; scraperwiki::save_var('last', $profile_no); } }
foreach ($result as $element) { $val = $element->innertext; $messages[] = $element; //print json_encode($galleria) . "\n"; } //Description Details li $result = $html->find('ul[id=description_details]/li'); $inputs = array(); foreach ($result as $element) { $key = $element->find('span.property', 0)->plaintext; $val = $element->find('span.value', 0)->plaintext; $inputs[$key] = $val; //print json_encode($inputs) . "\n"; } //Other Page Attributes $attributes = array('id' => $hostingid, 'title' => $html->getElementsByTagName('title')->plaintext, 'saved_count' => $html->find('div.saved_count span', 0)->plaintext, 'price' => $html->find('h2[id=price_amount]', 0)->plaintext, 'description' => $html->find('div[id=description_text_wrapper]', 0)->plaintext, 'images' => implode(' ^^', $galleria), 'review_count' => $html->find('span[itemprop=reviewCount]', 0)->plaintext, 'stars' => $html->find('meta[property=airbedandbreakfast:rating]', 0)->content, 'og_image' => $html->find('meta[property=og:image]', 0)->content, 'postal_code' => $html->find('meta[property=airbedandbreakfast:postal-code]', 0)->content, 'locality' => $html->find('meta[property=airbedandbreakfast:locality]', 0)->content, 'region' => isset($el->find('meta[property=airbedandbreakfast:region]', 0)->plaintext) ? $el->find('meta[property=airbedandbreakfast:region]', 0)->plaintext : null, 'country_name' => $html->find('meta[property=airbedandbreakfast:country-name]', 0)->content, 'city' => $html->find('meta[property=airbedandbreakfast:city]', 0)->content, 'neighborhood' => $neigh, 'lat' => $html->find('meta[property=airbedandbreakfast:location:latitude]', 0)->content, 'lat' => $html->find('meta[property=airbedandbreakfast:location:latitude]', 0)->content, 'currency' => $html->find('div[id=pricing]/meta', 0)->content, 'messages' => implode(' ^^', $messages)); $dbdata = $attributes + $inputs; //$dbdata = mb_check_encoding($dbdata, 'UTF-8') ? $dbdata : utf8_encode($dbdata); //print json_encode($dbdata) . 
"\n"; scraperwiki::save(array('id'), $dbdata); $html->__destruct(); } //End single room load } // End the listingnum loop } // End if scraperwiki::save_var('last_page', $i); $html->__destruct(); } // End the for loop
$data = retrieve($url); //echo $url; print_r($data); save($data, $dri, $period, $form, 'chapter', $chapter); //echo '*'; } scraperwiki::save_var('last_c', 0); $c = 0; } } scraperwiki::save_var('last_f', 0); $f = 0; } scraperwiki::save_var('last_p', 0); $p = 0; } scraperwiki::save_var('last_d', 0); $d = 0; function save($data, $dri, $period, $form, $type, $type_value) { foreach ((array) $data as $da) { $d = array('org_id' => $da['value'], 'dri' => $dri['value'], 'period' => $period['value'], 'form' => $form['value']); $o = array('id' => $da['value'], 'name' => $da['label']); switch ($type) { case 'chapter': $o['chapter'] = $type_value['value']; $d['chapter'] = $type_value['value']; break; case 'region': $o['region'] = $type_value['value']; $d['region'] = $type_value['value']; break;
} $html->clear(); unset($html); scraperwiki::save_var('last_id', $i); } require 'scraperwiki/simple_html_dom.php'; scraperwiki::attach("s-in-s", "src"); //scraperwiki::save_var('last_id', 1); //exit(); $id = scraperwiki::get_var('last_id'); for ($i = $id; $i < 1900; $i++) { $src = scraperwiki::select("* from src.swdata limit {$i},1"); $url = $src[0]['link']; $url = 'http://sexinsex.net/bbs/' . $url; $html_content = scraperwiki::scrape($url); $html = str_get_html($html_content); $data = array(); $tr = $html->find("div.postmessage div.t_msgfont"); $j = 0; foreach ($tr as $trr) { $noidung = $trr->find('div', 0)->innertext; //$noidung = utf8_encode($noidung); if (mb_strlen($noidung) > 1000) { $j++; @scraperwiki::save_sqlite(array('id'), array('id' => $j . '-' . $src[0]['url'], 'title' => $src[0]['title'], 'url' => $src[0]['url'], 'content' => base64_encode($noidung), 'order' => $j, 'num' => $src[0]['num'], 'reply' => $src[0]['reply'])); } } $html->clear(); unset($html); scraperwiki::save_var('last_id', $i); }
// Pull previously-scraped synth rows (manufacturer, url, name) from the
// current_synths datastore via the ScraperWiki HTTP API.
$synthList3 = file_get_contents("https://api.scraperwiki.com/api/1.0/datastore/sqlite?format=jsondict&name=current_synths&query=select%20DISTINCT%20manufacturer%2C%20url%2C%20name%20from%20%60swdata%60");
if (!empty($synthList3)) {
    $synthList3 = json_decode($synthList3);
}
// Merge all three source lists, then dedupe whole rows (the serialize /
// unserialize round-trip makes objects comparable for array_unique).
$synths = array();
$synths = traverseList($synthList1);
$synths = array_merge(traverseList($synthList2), $synths);
$synths = array_merge(traverseList($synthList3), $synths);
$synths = array_map('unserialize', array_unique(array_map('serialize', $synths)));
echo "Total synths: " . count($synths) . "\n";
//var_dump($synths);
if (!empty($synths)) {
    //$dbName = "vintagesynth-scrape-".$today = date("m-d-Y");
    $saveMessage = scraperWiki::save_sqlite(array('manufacturer', 'name', 'url'), $synths);
    //print strval($saveMessage);
    scraperwiki::save_var('total_results', count($synths));
    print scraperWiki::get_var('total_results');
}

/**
 * Strips HTML tags from every field of every item in a result list.
 *
 * @param array|object|null $list rows as stdClass objects (e.g. json_decode output)
 * @return array cleaned rows; empty array when $list is not traversable
 */
function traverseList($list)
{
    $dataList = array();
    // Robustness fix: a failed file_get_contents()/json_decode() hands this
    // function false/null/a raw string — bail out instead of warning on foreach.
    if (!is_array($list) && !is_object($list)) {
        return $dataList;
    }
    foreach ($list as $item) {
        // Clean up the data: remove HTML tags from each field.
        foreach ($item as $key => $value) {
            // Bug fix: the original pattern "/<*.>/" deleted any character
            // that preceded a ">" (even in plain text); "/<[^>]*>/" removes
            // well-formed tags only.
            $item->{$key} = preg_replace("/<[^>]*>/", "", $value);
            //echo $item->$key."\n";
        }
        $dataList[] = $item;
    }
    return $dataList;
}
/**
 * Stores a metadata value in the ScraperWiki key/value store.
 *
 * @param string $metadata_name key under which the value is stored
 * @param mixed  $value         value to persist
 * @return mixed whatever scraperwiki::save_var() returns
 */
static function save_metadata($metadata_name, $value)
{
    return scraperwiki::save_var($metadata_name, $value);
    // Retained alternative implementation from an earlier metadata client:
    //return SW_MetadataClient::create()->save($metadata_name, $value);
}
} else { $primary = ''; } #echo "primary \n"; #echo json_encode($primary); #echo "\n"; $primary = parseAddress($primary); $legal = parseAddress($legal); if (trim($name) != '') { scraperwiki::save_sqlite(array('ukprn'), array('ukprn' => clean($num), 'instname' => clean($name), 'trading' => clean($trading)), "data"); } scraperwiki::save_var('counter', $counter); } $counter++; if ($counter >= $max) { scraperwiki::save_var('counter', 10000000); $i = 1001; } } function parseAddress($val) { preg_match_all('|<strong>Telephone: </strong>(.*?)<br />|', $val, $phone); if (isset($phone[1][0])) { $dat['phone'] = trim($phone[1][0]); } else { $dat['phone'] = ''; } preg_match_all('|<strong>E-mail: </strong><a href="mailto:(.*?)">.*?</a><br />|', $val, $email); if (isset($email[1][0])) { $dat['email'] = trim($email[1][0]); } else {