$info['against'] = trim(get_first_string($part, 'proti:</span>', '<br')) == '' ? 0 : trim(get_first_string($part, 'proti:</span>', '<br'));
    $info['abstain'] = trim(get_first_string($part, 'zdržel se:</span>', '<br')) == '' ? 0 : trim(get_first_string($part, 'zdržel se:</span>', '<br'));
    $info['number_representatives'] = trim(get_first_string($part, 'Počet zastupitelů:</span>', '<br')) == '' ? 0 : trim(get_first_string($part, 'Počet zastupitelů:</span>', '<br'));
    $info['present'] = trim(get_first_string($part, 'přítomno:</span>', '<br')) == '' ? 0 : trim(get_first_string($part, 'přítomno:</span>', '<br'));
    $trs = $dom->find("table[class=data-grid]", 0)->find("tr");
    array_shift($trs);
    $data = array();
    foreach ($trs as $tr) {
        $tds = $tr->find("td");
        $mp_id = get_first_string($tds[0]->find("a", 0)->href . "&", "memberId=", "&");
        $data[] = array('division_id' => $info['id'], 'mp_id' => $mp_id, 'vote' => trim($tds[1]->plaintext), 'mp_name' => $tds[0]->plaintext);
    }
    //one division done
    scraperwiki::save_sqlite(array('id'), $info, 'division');
    scraperwiki::save_sqlite(array('division_id', 'mp_id'), $data, 'mp_vote');
    scraperwiki::save_var('last_id', $info['id']);
}
/**
 * finds substrings between opening and closing markers
 * @return result array of the substrings
 */
function returnSubstrings($text, $openingMarker, $closingMarker)
{
    $openingMarkerLength = strlen($openingMarker);
    $closingMarkerLength = strlen($closingMarker);
    $result = array();
    $position = 0;
    while (($position = strpos($text, $openingMarker, $position)) !== false) {
        $position += $openingMarkerLength;
        if (($closingMarkerPosition = strpos($text, $closingMarker, $position)) !== false) {
            $result[] = substr($text, $position, $closingMarkerPosition - $position);
 /**
  * Persists a named metadata value in the ScraperWiki key/value store.
  *
  * Thin wrapper around scraperwiki::save_var() so callers can use a
  * domain-specific name for metadata writes.
  *
  * @param string $metadata_name key to store the value under
  * @param mixed  $value         value to persist
  * @return mixed whatever scraperwiki::save_var() returns
  */
 static function save_metadata($metadata_name, $value)
 {
     return scraperwiki::save_var($metadata_name, $value);
 }
// Collect the href of every anchor inside the content table; these are the
// relative section links scraped on the following pass.
foreach ($dom->find("table.contenttable a") as $data) {
    $sectionlinks[] = $data->href;
}
$i = 0;
// Fetch each section page and harvest the "next page" links from it.
// NOTE(review): $sections accumulates the raw HTML of every page fetched so
// far; only the most recent entry (end($sections)) is ever parsed.
foreach ($sectionlinks as $link) {
    $sections[] = scraperWiki::scrape("http://www.ruralfinance.org/" . $link);
    $dom = new simple_html_dom();
    $dom->load(end($sections));
    // Selector mixes XPath-style [@id=...] with CSS — presumably accepted by
    // this simple_html_dom build; verify it actually matches div#navNext.
    foreach ($dom->find("div[@id='navNext'] a") as $data) {
        // Decode entities so the stored link is a usable URL fragment.
        $pagetwolinks[] = htmlspecialchars_decode($data->href);
    }
}
// Walk each second-level page, collect anchors that point at the discussion
// area from the "table.ab" listing (skipping the header row), and save the
// accumulated HTML snippet under a numeric key, one saved variable per page.
$i = 0;
foreach ($pagetwolinks as $link) {
    $it = "";
    $pagetwos[] = scraperWiki::scrape("http://www.ruralfinance.org/" . $link);
    $dom = new simple_html_dom();
    $dom->load(end($pagetwos));
    $j = 0;
    foreach ($dom->find("table.ab tr") as $data) {
        // Skip the first (header) row of the table.
        if ($j != 0) {
            $as = $data->find("td a");
            // Keep only anchors pointing at the discussion area. FIX: use a
            // strict !== false test instead of the original "> 0" — strpos()
            // returns 0 for a match at offset 0, which "> 0" silently drops.
            // (Here the haystack is the anchor's outer HTML, which always
            // starts with "<a", so the two forms happen to agree today; the
            // strict form stays correct if the markup ever changes.)
            if (strpos($as[0], "http://www.ruralfinance.org/discussion/") !== false) {
                $it .= $as[0] . "<br>";
            }
        }
        $j++;
    }
    // One saved variable per page, keyed by its ordinal position.
    scraperwiki::save_var($i, $it);
    $i++;
}
Example no. 4
0
// $entries/$xpath come from earlier in the (spliced) script. Only the LAST
// matched node's text survives this loop — presumably the table of interest
// is the final match; verify against the source page.
foreach ($entries as $entry) {
    $key = $entry->nodeValue;
}
$num = 1;
// Pull every 4-digit run (years) out of the node text and store each under
// an incrementing numeric key.
preg_match_all('/(\\d{4})/', $key, $matches);
foreach ($matches[0] as $val) {
    scraperwiki::save_var($num, $val);
    //print scraperwiki::get_var('last_page');
    $num++;
}
// Second table: same extraction, continuing the same $num key sequence.
$xpathExpr2 = '/html/body/form/table/tbody/tr[3]/td[1]/table/tr/td[2]/table[3]';
$entries2 = $xpath->query($xpathExpr2);
foreach ($entries2 as $entry2) {
    $key2 = $entry2->nodeValue;
}
preg_match_all('/(\\d{4})/', $key2, $matches2);
foreach ($matches2[0] as $val) {
    scraperwiki::save_var($num, $val);
    $num++;
}
// Third table: whitespace-split to pick out the day and date tokens.
// NOTE(review): $matches3[2]/$match[2] index fixed token positions — this
// breaks silently if the page's text layout shifts.
$xpathExpr3 = '/html/body/form/table/tbody/tr[3]/td[1]/table/tr/td[2]/table[1]';
$entries3 = $xpath->query($xpathExpr3);
foreach ($entries3 as $entry3) {
    $key3 = $entry3->nodeValue;
}
$matches3 = explode(" ", $key3);
scraperwiki::save_var("day", $matches3[2]);
$match = explode("\n", $matches3[1]);
scraperwiki::save_var("date", $match[2]);
//scraperwiki::save_var($num, $val);
//$num++;
    return null;
}
// Resumable scrape of coastandcountry.co.uk cottage detail pages, driven by
// URLs previously stored by the attached "coastandcountrycouk" scraper.
// NOTE(review): the loop body is truncated by a splice after "get Cottage
// name" — the braces from L131 onward do not balance the visible code, and
// $i is incremented without being initialised here.
$blacklist = array();
$url = "http://www.coastandcountry.co.uk/cottage-details/";
scraperwiki::attach("coastandcountrycouk");
# get an array of the cottage data to scrape
$cottData = scraperwiki::select("COTTAGE_URL, PRICE_HIGH, PRICE_LOW from 'coastandcountrycouk'.SWDATA order by COTTAGE_URL");
// Resume support: skip ahead to the bookmarked cottage ID, if one was saved.
$placeholder = scraperwiki::get_var("cottID");
if ($placeholder != "") {
    $index = searchForId($placeholder, $cottData);
    $cottData = array_splice($cottData, $index);
}
require 'scraperwiki/simple_html_dom.php';
$dom = new simple_html_dom();
foreach ($cottData as $value) {
    // Bookmark progress before scraping so a crash resumes at this cottage.
    scraperwiki::save_var("cottID", $value['COTTAGE_URL']);
    // check the cottage url against the blacklist
    foreach ($blacklist as $blItem) {
        if ($value['COTTAGE_URL'] == $blItem) {
            continue 2;
        }
    }
    //load the page into the scraper
    $html = scraperWiki::scrape($url . $value['COTTAGE_URL']);
    $dom->load($html);
    // Per-cottage accumulators, reset for every iteration.
    $feature = "";
    $image = "";
    $imgURL = "";
    $xtraFeatures = "";
    /*  Get the Data  */
    // get Cottage name
    }
    $i++;
}
// Paged scrape of norfolkcottages.co.uk search results (46 pages, 10 per
// page), resumable via the "page" bookmark variable.
// NOTE(review): truncated by a splice after the innermost foreach — the
// cottage-extraction body is missing from this view.
require_once 'scraperwiki/simple_html_dom.php';
scraperwiki::save_var('dummy', 0);
$maxpages = 46;
$i = 1;
$dom = new simple_html_dom();
// Resume from the bookmarked page if one was saved.
$placeholder = scraperwiki::get_var("page");
if ($placeholder) {
    $i = $placeholder;
}
$j = 0;
$counter = 0;
while ($i <= $maxpages) {
    // Bookmark before fetching so an interrupted run restarts here.
    scraperwiki::save_var("page", $i);
    $url = "http://www.norfolkcottages.co.uk/cottage-search/amount-10/page-" . $i;
    // Load HTML from a URL
    $html = file_get_html($url);
    $dom->load($html);
    // get the list of cottages
    #echo $dom;
    // page
    foreach ($dom->find('div[id=search-results-container]') as $page) {
        // cottage
        // NOTE(review): this inner loop reuses $page, clobbering the outer
        // loop variable — works only because both iterate the whole $dom,
        // but it is a latent shadowing bug.
        foreach ($dom->find('div[class=property-result-container]') as $page) {
            foreach ($page->find('div[class=middle-container]') as $cottage) {
                $cottage_name = "";
                # cottage name, URL and ID
                foreach ($cottage->find('h2') as $cottageNameData) {
                    foreach ($cottageNameData->find('a') as $cottageURLData) {
                $mp = trim(substr($td->innertext, 1));
                $club = $h2s[$key]->innertext;
                $votes[] = array('division_id' => $html['id'], 'vote' => $vote, 'mp' => $mp, 'club' => $club);
            }
        }
        //print_r($votes);die();
        scraperwiki::save_sqlite(array('division_id', 'mp'), $votes, 'vote');
        /*$tds = $table->find('td');
            if (count($tds) > 0) {
              foreach ($tds as $td) {
        //echo $td->outertext;
                $h3 = $td->find('h3',0);
                if ($h3 != '') {
                  $party = $h3->innertext;
                } else {
                  $vote = substr($td->innertext,0,1);
                  $mp = trim(substr($td->innertext,1));
                  if ($mp != '') 
                      $votes[] = array(
                        'division_id' => $html['id'],
                        'vote' => $vote,
                        'mp' => $mp,
                        'club' => $party,
                      );
                }
              }
              scraperwiki::save_sqlite(array('division_id','mp'),$votes,'vote');
            } */
        scraperwiki::save_var('last_id', $html['id']);
    }
}
/**
 * Scrapes cottage listing pages $url.$page for pages 1..$CurrentMaxPages,
 * extracting each result slot's cottage URL plus high/low prices, and saves
 * one record per cottage keyed on COTTAGE_URL.
 *
 * Resumes from the page bookmarked in the ScraperWiki variable "page".
 *
 * @param string $url             base listing URL; the page number is appended
 * @param int    $CurrentMaxPages last page number to scrape (inclusive)
 * @return void  records are written via scraperwiki::save()
 */
function get_details($url, $CurrentMaxPages)
{
    // get the scraperwiki methods and create a new instance
    require_once 'scraperwiki/simple_html_dom.php';
    $dom = new simple_html_dom();
    // Get the bookmarked page if there is one, else start at 1.
    $getPage = scraperwiki::get_var("page");
    $page = 1;
    if ($getPage != "") {
        $page = $getPage;
    }
    while ($page <= $CurrentMaxPages) {
        // Bookmark progress so an interrupted run restarts at this page.
        scraperwiki::save_var("page", $page);
        // Load the page into the scraper.
        $html = scraperWiki::scrape($url . $page);
        $dom->load($html);
        // Each results page exposes up to 12 numbered result slots.
        $i = 0;
        while ($i < 12) {
            // FIX: reset per-slot values each pass. Previously a slot whose
            // selectors matched nothing silently re-saved the PREVIOUS
            // slot's URL/prices (or raised undefined-variable notices on
            // the first slot).
            $cotturl = '';
            $price_low = '';
            $price_high = '';
            // Result URL (relative, with the "/cottages/" prefix stripped).
            foreach ($dom->find('a[id=SearchResult1_linkTo_' . $i . ']') as $data) {
                $cotturl = $data->getAttribute('href');
                $cotturl = str_replace("/cottages/", "", $cotturl);
            }
            // High/low prices, e.g. "Prices from £123-£456 based on available 7 nights".
            foreach ($dom->find('span[id=featureBoxPropertyWasPricePoundPr_' . $i . ']') as $data) {
                $prices = str_replace("Prices from ", "", $data->plaintext);
                $prices = str_replace(" based on available 7 nights", "", $prices);
                $prices = str_replace("£", "", $prices);
                $prices = explode("-", $prices);
                $price_low = $prices[0];
                // Guard against a price string with no "-" separator.
                $price_high = isset($prices[1]) ? $prices[1] : '';
            }
            // Only save slots that actually produced a URL (pages may have
            // fewer than 12 results).
            if ($cotturl !== '') {
                $record = array('COTTAGE_URL' => trim($cotturl), 'PRICE_HIGH' => trim($price_high), 'PRICE_LOW' => trim($price_low));
                # save the data
                scraperwiki::save(array('COTTAGE_URL'), $record);
            }
            $i++;
        }
        // move on to the next page
        $page++;
    }
}
    scraperwiki::save_sqlite(array('date'), $data, 'info');
    //print_r($data);
    /*charts*/
    $data_chart = array();
    //html
    $url = 'http://www.ceskatelevize.cz' . $charts_link;
    $html = scraperwiki::scrape($url);
    //get dom
    $dom = new simple_html_dom();
    $dom->load($html);
    $script = $dom->find('script', 2);
    $ar1 = explode(']]', $script->innertext);
    //chart 1
    $ar2 = explode('[[[', str_replace("\t", '', str_replace("\n", '', str_replace(' ', '', $ar1[0]))));
    $ar3 = explode('],[', trim(trim($ar2[1]), ']'));
    foreach ($ar3 as $row) {
        $ar4 = explode(',', $row);
        $data_chart[] = array('date' => $date->format('Y-m-d'), 'chart' => '1', 'minute' => $ar4[0], 'value' => $ar4[1]);
    }
    //chart 2
    $ar2 = explode('[[[', str_replace("\t", '', str_replace("\n", '', str_replace(' ', '', $ar1[1]))));
    $ar3 = explode('],[', trim(trim($ar2[1]), ']'));
    foreach ($ar3 as $row) {
        $ar4 = explode(',', $row);
        $data_chart[] = array('date' => $date->format('Y-m-d'), 'chart' => '2', 'minute' => $ar4[0], 'value' => $ar4[1]);
    }
    scraperwiki::save_sqlite(array('date', 'chart', 'minute'), $data_chart, 'chart');
    //print_r($data_chart);
    scraperwiki::save_var('last_date', $date->format('Y-m-d'));
    $date->add(new DateInterval('P1D'));
}
        $i++;
    }
}
//print_r($data);
scraperwiki::save_var("date", $data[0]);
scraperwiki::save_var("1yearcmt", $data[6]);
$html->__destruct();
# Blank PHP
// Scrapes the Bankrate 1-year CMT rate page: the headline block supplies the
// date/label, the rate table cells are stored positionally, and two values
// are persisted ("date" and the 7th cell as "1yearcmt").
print "Testing getting 1 year CMT from Bankrate.\n";
$html_content = scraperWiki::scrape("http://www.bankrate.com/rates/interest-rates/1-year-cmt.aspx");
//print $html . "\n";
require 'scraperwiki/simple_html_dom.php';
$html = str_get_html($html_content);
// $data[0] gets the headline block text (last match wins if several).
$data[0] = "";
foreach ($html->find("div.interactivetopaction ") as $el) {
    //print $el . "\n";
    //print "1 year CMT ". $el->innertext . "\n";
    $data[0] = $el->innertext;
}
// Flatten every <td> of every div.boxcontent into $data[1..n], in document
// order. NOTE(review): $data[6] below assumes the layout keeps the 1-year
// CMT in the sixth cell — verify against the live page.
$i = 1;
foreach ($html->find("div.boxcontent") as $box) {
    foreach ($box->find("td") as $el) {
        //print $el->innertext . "\n";
        $data[$i] = $el->innertext;
        $i++;
    }
}
//print_r($data);
scraperwiki::save_var("date", $data[0]);
scraperwiki::save_var("1yearcmt", $data[6]);
$html->__destruct();
/**
 * Stores a key/value pair in the ScraperWiki variable store.
 *
 * @param string $key   variable name
 * @param mixed  $value value to persist
 * @return void the underlying save_var() return value is discarded
 */
function setVariable($key, $value)
{
    scraperwiki::save_var($key, $value);
}
// Driver loop: iterates community IDs from the last database bookmark up to
// a fixed ceiling, scraping and saving each one. Progress is checkpointed
// every 50 IDs in 'cno_dbmax' so a crashed run resumes near where it stopped.
scraperwiki::save_var('cno_max', $MaxCoNo);
//$MaxCoNo = 10;
$MaxDbCoNo = scraperwiki::get_var('cno_dbmax');
// Keep the previous bookmark for inspection/debugging.
scraperwiki::save_var('cno_dbmax_old', $MaxDbCoNo);
//$MaxDbCoNo = 1; // RUN for first time.
// Hard ceiling on IDs to try; overrides whatever $MaxCoNo held above.
$MaxCoNo = 50000;
//it's workaround.
echo $MaxDbCoNo . "->" . $MaxCoNo . "\r\n";
for ($iCom = $MaxDbCoNo; $iCom <= $MaxCoNo; $iCom++) {
    unset($ComData);
    $ComData = GetCommunityData($iCom);
    echo $ComData['cid'] . "/" . $ComData['ctype'] . "\r\n";
    //var_dump($ComData);
    SaveCommunityData($ComData);
    // Checkpoint every 50th ID (not every one, to limit store writes).
    if ($iCom % 50 == 0) {
        scraperwiki::save_var('cno_dbmax', $iCom);
    }
    //break;
}
exit;
/**
 * Persists one scraped community record, keyed on its "cid" field.
 * Records that carry no "cid" key (e.g. failed scrapes) are ignored.
 *
 * @param array $ArrComData associative record produced by GetCommunityData()
 * @return void
 */
function SaveCommunityData($ArrComData)
{
    if (array_key_exists("cid", $ArrComData)) {
        scraperwiki::save_sqlite(array("cid"), $ArrComData);
    }
}
function GetMaxCoNumber()
{
    $html = scraperWiki::scrape(URL_NEW);
    $dom = new simple_html_dom();
/**
 * Scrapes the stats page for every church in the "churches" table, resuming
 * from the index bookmarked in the ScraperWiki variable 'IndxStat'.
 *
 * Progress is printed every 100 churches (or on every church when $debug is
 * set); churches whose scrape returns 0 are counted as "empty". The bookmark
 * is advanced after each church and reset to 0 once the full list is done.
 *
 * @return void
 */
function scrape_stats()
{
    global $debug;
    $emptyCount = 0;
    // Resume position: bookmarked index, defaulting to 0 on a fresh run.
    $index = scraperwiki::get_var('IndxStat', 0, "0");
    $churches = scraperwiki::select("church_id from churches");
    $total = sizeof($churches);
    //print "Scrape Church Stats @" . $index . " of " . $total . " " . intval($index / $total * 100) . "%\n" ;
    while ($index < $total) {
        $cid = $churches[$index]['church_id'];
        // Periodic progress line (every church when debugging).
        if ($debug || $index % 100 == 0) {
            print "Stats church Empty=" . $emptyCount . " " . $index . " of " . $total . " " . intval($index / $total * 100) . "%\n";
        }
        // scrape_one_stat() returns 0 when the church page had no stats.
        if (scrape_one_stat($cid) == 0) {
            $emptyCount++;
        }
        //print "Stats@" . $index . "/" . $cid . "\n";
        $index++;
        // Bookmark after every church so a crash resumes exactly here.
        scraperwiki::save_var('IndxStat', $index);
    }
    print "Scrape Stats Done Churches=" . $index . " empty=" . $emptyCount . "\n";
    // Reset the bookmark so the next run starts from the beginning.
    scraperwiki::save_var('IndxStat', 0);
}
        scraperwiki::save_sqlite(array('match_id', 'home_visitor', 'rank'), $goals_visitor, 'goal');
        //cards
        $tr = $table->find('tr', 1);
        $tds = $tr->find('td');
        $yellow_home = str2action($tds[0]->innertext, 'home', $match_id, 'yellow');
        scraperwiki::save_sqlite(array('match_id', 'home_visitor', 'card', 'rank'), $yellow_home, 'card');
        $yellow_visitor = str2action($tds[2]->innertext, 'visitor', $match_id, 'yellow');
        scraperwiki::save_sqlite(array('match_id', 'home_visitor', 'card', 'rank'), $yellow_visitor, 'card');
        $tr = $table->find('tr', 2);
        $tds = $tr->find('td');
        $red_home = str2action($tds[0]->innertext, 'home', $match_id, 'red');
        scraperwiki::save_sqlite(array('match_id', 'home_visitor', 'card', 'rank'), $red_home, 'card');
        $red_visitor = str2action($tds[2]->innertext, 'visitor', $match_id, 'red');
        scraperwiki::save_sqlite(array('match_id', 'home_visitor', 'card', 'rank'), $red_visitor, 'card');
    }
    scraperwiki::save_var('last_id', $match_id);
}
function str2action($str, $home_visitor, $match_id, $card = null)
{
    if ($str != '-') {
        $out = array();
        $rank = 1;
        $goals_ar = explode('<br>', $str);
        foreach ($goals_ar as $goalstr) {
            $fake_dom = new simple_html_dom();
            $fake_dom->load('<html><body>' . $goalstr . '</body></html>');
            if (is_object($fake_dom->find('a', 0))) {
                $min_ar = explode('<', $goalstr);
                $minute = trim(trim($min_ar[0]), '.');
                $player = $fake_dom->find('a', 0)->plaintext;
                $player_link = $fake_dom->find('a', 0)->href;
    // used for debugging
    if ($run_environment == 'dev') {
        if (empty($records)) {
            $records = get_mayors($result);
        } else {
            $records = array_merge($records, get_mayors($result));
        }
    } else {
        get_mayors($result);
        // sleep(10); // this might be needed on scraperwiki.com
    }
    // reset the progress bookmark
    scraperwiki::save_var('last_state', '');
}
// Set state of scraper to complete so we know it didn't crash part way thru
scraperwiki::save_var('scraper_state', 'complete');
// if testing
if ($run_environment == 'dev') {
    header('Content-type: application/json');
    print json_encode($records);
}
/**
 * Sends a POST request to $url with the given fields via cURL.
 * NOTE(review): truncated by a splice after curl_init() — the remainder of
 * the cURL setup/exec is not visible here.
 *
 * @param string $url    target URL
 * @param array  $fields key => value pairs to POST
 */
function get_post_response($url, $fields)
{
    $fields_string = '';
    //url-ify the data for the POST
    // NOTE(review): values are not urlencode()d — fields containing '&' or
    // '=' will corrupt the body.
    foreach ($fields as $key => $value) {
        $fields_string .= $key . '=' . $value . '&';
    }
    // BUG(review): rtrim() returns a new string; its result is discarded
    // here, so the trailing '&' is never actually removed. Should be
    // $fields_string = rtrim($fields_string, '&');
    rtrim($fields_string, '&');
    //open connection
    $ch = curl_init();
     }
     if ($k < 50) {
         $tempstr = $temparr2[1] . ";1;" . $arrName[0] . ";" . $strCat . ";" . $intRetail . ";30;" . $intPrice . ";;;;;;" . $temparr2[1] . ";" . $temparr2[1] . ";rem;" . ";;;;0.25;999;" . $strBrah . $arrBrand[0] . " " . $arrName[0] . " (" . $temparr2[3] . ");" . $arrDescStr . ";" . ";;;;" . $arrName[0] . ";;;1;;1;..//img/sun/" . $temparr2[1] . ".jpg;;;;";
     }
     /* $tempstr = $temparr2[$i][1] . "; 1;" . $arrName[0] . ";;;;;;;;;;" . $temparr2[$i][1] . "; " . $temparr2[$i][1] . "; " . "rem; " . ";;;;" . "0.25; 999;" . $arrBrand[0] . " " . $arrName[0] . " (" . $temparr2[$i][3] . ");" . $arrDescStr . ";" . ";;;;" . $arrName[0] . ";" . ";;1;;1;..//img/" . $temparr2[$i][0] . ";" . ";;;";  */
     //Saving data for Products.csv
     //  $strVarName = $arrBrand[0] . " " . $arrName[0] . $temparr2[3];
     // scraperwiki::save_var($strVarName, $tempstr);
     //print_r($i);
     //print_r($strVarName);
     //print_r($strVarName);
     // print_r($temparr2);
     //Saving images
     $image_url = "http://www.remeyewear.com/showimage.aspx?img=" . $temparr2[0] . ".jpg&sku=" . $temparr2[1] . "&w=667";
     $image_local = $temparr2[1] . ".jpg,";
     scraperwiki::save_var($image_local, $image_url);
     //copy( $image_url, '/tmp/' . $image_local );
 }
 //end for count($arr)
 //print_r($arrofarr);
 //print_r($arrofarr[2][2]);
 //print_r($temparr2);
 /*
 
 $str = $arrofarr[0][1] . "; 1;" . $arrName[0] . ";;;;;;;;;;" . $arrofarr[0][1] . "; " . $arrofarr[0][1] . "; " . "rem; " . ";;;;" . "0.25; 999;" . $arrBrand[0] . " " . $arrName[0] . " (" . $arrofarr[0][3] . ");" . $arrDescStr . ";" . ";;;;" . $arrName[0] . ";" . ";;1;;1;..//img/" . $arrofarr[0][0] . ";" . ";;;";
 
 echo $str ;
 echo "****************************" ;
 
 
 $str1 = $arrofarr[1][1] . "; 1;" . $arrName[0] . ";;;;;;;;;;" . $arrofarr[1][1] . "; " . $arrofarr[1][1] . "; " . "rem; " . ";;;;" . "0.25; 999;" . $arrBrand[0] . " " . $arrName[0] . " (" . $arrofarr[1][3] . ");" . $arrDescStr . ";" . ";;;;" . $arrName[0] . ";" . ";;1;;1;..//img/" . $arrofarr[1][0] . ";" . ";;;";
    # Determine the URL of the linked page (e.g. the info for 'WRK'
    $href = html_entity_decode($row->find('a', 0)->href);
    $url = $domain . $href;
    # Get the postcode from the linked page
    $html2 = scraperWiki::scrape($url);
    $dom2 = new simple_html_dom();
    $dom2->load($html2);
    # The postcode is assumed to be the last line of the <address> block.
    $address = trim($dom2->find('address', 0)->plaintext);
    $lines = explode("\n", $address);
    $postcode = trim(array_pop($lines));
    # Convert postcode to lat/lon
    list($lat, $lng) = scraperwiki::gb_postcode_to_latlng($postcode);
    //$postcodeTrimmed = str_replace (' ', '', $postcode);
    //$latlng = scraperwiki::select("* from ukp.swdata where postcode='{$postcodeTrimmed}';");
    # Assemble the record
    $station = array('code' => trim($cols[1]->plaintext), 'name' => html_entity_decode(trim($cols[0]->plaintext)), 'postcode' => $postcode, 'latitude' => $lat, 'longitude' => $lng, 'url' => $url);
    # Save the record
    scraperwiki::save(array('code'), $station);
    # BUG(review): $record is undefined in this scope — this was almost
    # certainly meant to be print_r($station);
    print_r($record);
    # Limit while testing
    //if ($i == 10) {break;}
    # Save the current position
    scraperwiki::save_var('run_first', $i);
}
/*
    Useful pages:
    http://scraperwiki.com/scrapers/swansea_food_safety_inspections_1/edit/
    http://scraperwiki.com/docs/php/php_intro_tutorial/
    http://scraperwiki.com/docs/php/php_help_documentation/
    http://scraperwiki.com/views/postcode_and_geo_cheat_sheet/edit/
*/
                    //$presence = str_replace('&nbsp;','',trim($tds[2]->plaintext));  //simple_html_dom.php
                    $presence = str_replace('&nbsp;', '', trim(strip_tags('<td' . $tds[2])));
                } else {
                    //vote
                    $d = array('date' => $date, 'session' => $session, 'presence' => $presence, 'name' => trim(strip_tags('<td' . $tds[1])), 'vote' => trim(strip_tags('<td' . $tds[3])), 'mp_id' => $html[0]['mp_id'], 'term' => $html[0]['term']);
                    $data[] = $d;
                }
            }
            scraperwiki::save_sqlite(array('term', 'mp_id', 'date', 'name'), $data, 'vote');
        }
        scraperwiki::save_var('last_term', $html[0]['term']);
        scraperwiki::save_var('last_mp_id', $html[0]['mp_id']);
    }
}
scraperwiki::save_var('last_term', 0);
scraperwiki::save_var('last_mp_id', 0);
/**
* finds substrings between opening and closing markers
* @return result array of the substrings
*/
function returnSubstrings($text, $openingMarker, $closingMarker)
{
    $openingMarkerLength = strlen($openingMarker);
    $closingMarkerLength = strlen($closingMarker);
    $result = array();
    $position = 0;
    while (($position = strpos($text, $openingMarker, $position)) !== false) {
        $position += $openingMarkerLength;
        if (($closingMarkerPosition = strpos($text, $closingMarker, $position)) !== false) {
            $result[] = substr($text, $position, $closingMarkerPosition - $position);
            $position = $closingMarkerPosition + $closingMarkerLength;
// Current parliamentary term is read from the selected <option> of the term
// dropdown ($divs comes from earlier in the spliced script).
$options = $divs[0]->find('option[selected=selected]');
$term = $options[0]->value;
//compare saved term
// If the term changed since the last run, wipe the club/membership tables so
// they are rebuilt from scratch for the new term.
$saved_term = scraperwiki::get_var('current_term');
$info = scraperwiki::show_tables();
if ($term != $saved_term) {
    if (isset($info['club'])) {
        scraperwiki::sqliteexecute("delete from club");
        scraperwiki::sqlitecommit();
    }
    if (isset($info['membership'])) {
        scraperwiki::sqliteexecute("delete from membership");
        scraperwiki::sqlitecommit();
    }
}
scraperwiki::save_var('current_term', $term);
//current clubs
// NOTE(review): truncated by a splice inside the else branch below.
foreach ($ids as $i) {
    $url = "http://www.nrsr.sk/web/Default.aspx?sid=poslanci/kluby/klub&ID=" . $i;
    $html = scraperwiki::scrape($url);
    //get dom
    $dom = new simple_html_dom();
    $dom->load($html);
    //is it a valid club (or empty)
    // The site renders an "Unexpected error" heading for nonexistent clubs.
    $h2s = $dom->find("h2");
    if (trim($h2s[0]->plaintext) == 'Neočakávaná chyba!') {
    } else {
        //club
        // Remove any stale row for this club before re-inserting.
        // NOTE(review): $i is interpolated into SQL directly — safe only
        // while $ids holds trusted numeric IDs.
        if (isset($info['club'])) {
            scraperwiki::sqliteexecute("delete from club where id = '{$i}'");
            scraperwiki::sqlitecommit();
                if (strpos($html, 'Para esta zona no hay elección') > 0) {
                    continue;
                }
                $data = get_data($dom, $house, $sex, $year, $number - 400);
                scraperwiki::save_sqlite(array('year', 'house', 'district', 'i', 'sex'), $data);
            }
            scraperwiki::save_var('last_number', 400);
            $last_number = 400;
        }
        scraperwiki::save_var('last_year', 2009);
        $last_year = 2009;
    }
    scraperwiki::save_var('last_sex', 0);
    $last_sex = 0;
}
scraperwiki::save_var('last_house', 0);
$last_house = 0;
function get_data($dom, $house, $sex, $year, $district)
{
    $data = array();
    $i = 1;
    $table = $dom->find('table[CellPadding=4]', 0);
    $trs = $table->find('tr');
    array_shift($trs);
    array_pop($trs);
    foreach ($trs as $tr) {
        $row = array('year' => $year, 'house' => $house, 'sex' => $sex, 'district' => $district);
        $tds = $tr->find('td');
        $row['name'] = $tds[0]->plaintext;
        $row['party'] = $tds[1]->plaintext;
        $row['votes'] = str_replace('.', '', trim($tds[2]->plaintext));
Example no. 21
0
        $value = intval($columns[1]->plaintext);
        // Some rows are empty, and we should exclude them.
        if (strlen($attribute) > 0) {
            //print $attribute . ": ";
            //print $value . "\n";
            $data["id"] = $productId;
            $data[$attribute] = $value;
        }
    }
    // Get the price.
    $priceDom = $dom->find("td[background*=add_cart_bgd02.jpg]");
    $price = floatval(str_replace('$', "", $priceDom[0]->plaintext));
    $data["price"] = $price;
    // Calculate out a few extra fields:
    // energy (Wh) = cells * 3.7 V nominal * capacity (mAh) / 1000.
    // NOTE(review): division by $price below fails if the price parses as 0;
    // consider guarding before computing value-per-dollar.
    $cells = $data["Config(s)"];
    $capacity = $data["Capacity(mAh)"];
    $energy = $cells * 3.7 * $capacity / 1000;
    $value = $energy / $price;
    $data["Energy (Wh)"] = $energy;
    $data["Value (Wh/\$)"] = $value;
    //print_r($data);
    scraperwiki::save(array("id"), $data);
    // Bookmark the last processed product so the next run resumes after it.
    scraperwiki::save_var("currentId", $productId);
    $loopCount++;
}
// Check to see if we have scraped everything - if so, start again!
$lastBattery = end($batteries);
if ($lastBattery['id'] == $productId) {
    print "All known batteries processed. Clearing progress marker so scraper can start again.";
    scraperwiki::save_var("currentId", -1);
}
                                                scraperwiki::save(array('VOTOS'), $record);
                                                $saved_counter++;
                                                scraperwiki::save_var("LastRunCounter", $saved_counter);
                                            }
                                        }
                                        scraperwiki::save_var("MesaCounter", $mesa_counter);
                                    }
                                    $mesa_counter++;
                                }
                                // Mesa
                                scraperwiki::save_var("CentroCounter", $centro_counter);
                            }
                            $centro_counter++;
                        }
                        // Centro
                        scraperwiki::save_var("ParroquiaCounter", $parroquia_counter);
                    }
                    $parroquia_counter++;
                }
                // Parroquia
                scraperwiki::save_var("MunicipioCounter", $municipio_counter);
            }
            $municipio_counter++;
        }
        // Municipio
        scraperwiki::save_var("EstadoCounter", $estado_counter);
    }
    $estado_counter++;
}
// Estado
# print $html . "\n";
            $twitter = str_replace("@", "", $link->data);
            $twitter = preg_replace("/\\s+/", "", $twitter);
            $twitter = str_replace("Twitter:", "", $twitter);
            $OBJ['twitter'] = $twitter;
            // Klout (based on Twitter)
            $klout = scraperWiki::scrape('http://api.klout.com/v2/identity.json/twitter?screenName=' . $twitter . '&key=v23b2ddvdf8n5fvap95kk56r');
            $klout = json_decode($klout);
            $klout = scraperWiki::scrape('http://api.klout.com/v2/user.json/' . $klout->id . '?key=v23b2ddvdf8n5fvap95kk56r');
            $klout = json_decode($klout);
            $OBJ['klout'] = $klout->score->score;
        }
        if ($link->label == 'Profile URL:') {
            $profile_url = str_replace("Profile URL:", "", $link->data);
            $profile_url = preg_replace("/\\s+/", "", $profile_url);
            $OBJ['profile_url'] = $profile_url;
        }
        if ($link->label == 'LinkedIn:') {
            $LinkedIn = str_replace("LinkedIn:", "", $link->data);
            $LinkedIn = preg_replace("/\\s+/", "", $LinkedIn);
            $OBJ['linkedIn'] = $LinkedIn;
        }
    }
    // Clean certifications
    $certifications = array_unique(json_decode($row['certifications']));
    $OBJ['certifications'] = json_encode($certifications);
    // Geo
    scraperwiki::save_sqlite(array('id', 'name', 'company', 'location', 'date', 'url', 'profile', 'twitter', 'klout', 'profile_url', 'linkedIn', 'certifications'), $OBJ);
    scraperwiki::save_var('last_page', $counter);
    $counter = $counter + 1;
    print_r($counter);
}
        scraperwiki::save(array('company'), $record);
        //print json_encode($record) . "\n";
        scraperwiki::save_var('last', $profile_no);
    }
}
//scraperwiki::save_var('last', 0);
// Scrapes member detail pages listed by the attached "find_4n_profiles"
// scraper. Profiles are processed in ascending numeric ID order relative to
// the 'last' bookmark, so already-scraped profiles are skipped on re-runs.
scraperwiki::attach("find_4n_profiles");
$links = scraperwiki::select("profile from find_4n_profiles.swdata");
require 'scraperwiki/simple_html_dom.php';
$profile = new simple_html_dom();
foreach ($links as $link) {
    // Long-running scrape: disable PHP's execution time limit per profile.
    set_time_limit(0);
    // Numeric profile ID is the tail of the profile URL.
    $profile_no = intval(str_replace('http://www.4networking.biz/Members/Details/', '', $link['profile']));
    if ($profile_no > scraperwiki::get_var('last')) {
        $html = scraperWiki::scrape($link['profile']);
        $profile->load($html);
        // Company name: prefer the span's title attribute, fall back to its text.
        if (!($company = $profile->find("//*[@id='main']/div[1]/div/div[1]/div[2]/div[1]/div[1]/span/span", 0)->title)) {
            $company = $profile->find("//*[@id='main']/div[1]/div/div[1]/div[2]/div[1]/div[1]/span/span", 0)->plaintext;
        }
        $website = $profile->find("span.orange-text a", 0) ? $profile->find("span.orange-text a", 0)->href : '';
        // NOTE(review): $info is collected but never stored in $record below.
        if ($profile->find("div.blue3-empty-box div.content div.word-wrap", 0)) {
            $info = $profile->find("div.blue3-empty-box div.content div.word-wrap", 0)->plaintext;
        } else {
            $info = '';
        }
        $record = array('name' => $profile->find("//div/a/span", 1)->plaintext, 'company' => $company, 'phone' => $profile->find("strong.big-blue3-text span", 0)->plaintext, 'website' => $website);
        scraperwiki::save(array('company'), $record);
        //print json_encode($record) . "\n";
        // Bookmark the highest processed profile ID.
        scraperwiki::save_var('last', $profile_no);
    }
}
                foreach ($result as $element) {
                    $val = $element->innertext;
                    $messages[] = $element;
                    //print json_encode($galleria) . "\n";
                }
                //Description Details li
                $result = $html->find('ul[id=description_details]/li');
                $inputs = array();
                foreach ($result as $element) {
                    $key = $element->find('span.property', 0)->plaintext;
                    $val = $element->find('span.value', 0)->plaintext;
                    $inputs[$key] = $val;
                    //print json_encode($inputs) . "\n";
                }
                //Other Page Attributes
                $attributes = array('id' => $hostingid, 'title' => $html->getElementsByTagName('title')->plaintext, 'saved_count' => $html->find('div.saved_count span', 0)->plaintext, 'price' => $html->find('h2[id=price_amount]', 0)->plaintext, 'description' => $html->find('div[id=description_text_wrapper]', 0)->plaintext, 'images' => implode(' ^^', $galleria), 'review_count' => $html->find('span[itemprop=reviewCount]', 0)->plaintext, 'stars' => $html->find('meta[property=airbedandbreakfast:rating]', 0)->content, 'og_image' => $html->find('meta[property=og:image]', 0)->content, 'postal_code' => $html->find('meta[property=airbedandbreakfast:postal-code]', 0)->content, 'locality' => $html->find('meta[property=airbedandbreakfast:locality]', 0)->content, 'region' => isset($el->find('meta[property=airbedandbreakfast:region]', 0)->plaintext) ? $el->find('meta[property=airbedandbreakfast:region]', 0)->plaintext : null, 'country_name' => $html->find('meta[property=airbedandbreakfast:country-name]', 0)->content, 'city' => $html->find('meta[property=airbedandbreakfast:city]', 0)->content, 'neighborhood' => $neigh, 'lat' => $html->find('meta[property=airbedandbreakfast:location:latitude]', 0)->content, 'lat' => $html->find('meta[property=airbedandbreakfast:location:latitude]', 0)->content, 'currency' => $html->find('div[id=pricing]/meta', 0)->content, 'messages' => implode(' ^^', $messages));
                $dbdata = $attributes + $inputs;
                //$dbdata = mb_check_encoding($dbdata, 'UTF-8') ? $dbdata : utf8_encode($dbdata);
                //print json_encode($dbdata) . "\n";
                scraperwiki::save(array('id'), $dbdata);
                $html->__destruct();
            }
            //End single room load
        }
        // End the listingnum loop
    }
    // End if
    scraperwiki::save_var('last_page', $i);
    $html->__destruct();
}
// End the  for loop
                        $data = retrieve($url);
                        //echo $url; print_r($data);
                        save($data, $dri, $period, $form, 'chapter', $chapter);
                        //echo '*';
                    }
                    scraperwiki::save_var('last_c', 0);
                    $c = 0;
            }
        }
        scraperwiki::save_var('last_f', 0);
        $f = 0;
    }
    scraperwiki::save_var('last_p', 0);
    $p = 0;
}
scraperwiki::save_var('last_d', 0);
$d = 0;
/**
 * Builds per-organisation rows from a list of scraped <option> entries.
 *
 * @param array|mixed $data       list of option arrays with 'value'/'label' keys
 * @param array       $dri        current DRI option ('value' key is stored)
 * @param array       $period     current period option ('value' key is stored)
 * @param array       $form       current form option ('value' key is stored)
 * @param string      $type       extra dimension to record: 'chapter' or 'region'
 * @param array       $type_value option for that extra dimension
 *
 * NOTE(review): this function appears truncated/corrupted — the foreach
 * builds $d and $o but no visible code persists them, the switch has no
 * default case, the brace balance does not match the opening, and the
 * trailing $html/$i statements reference variables never defined in this
 * scope (they look like lines from a different scraper spliced in during
 * concatenation). Verify against the original source before relying on it.
 */
function save($data, $dri, $period, $form, $type, $type_value)
{
    // Cast guards against a scalar/null $data (single option vs. list).
    foreach ((array) $data as $da) {
        // Relation row: which org filed which form for which period/DRI.
        $d = array('org_id' => $da['value'], 'dri' => $dri['value'], 'period' => $period['value'], 'form' => $form['value']);
        // Organisation row: id plus human-readable label.
        $o = array('id' => $da['value'], 'name' => $da['label']);
        // Attach the extra dimension to both rows, keyed by $type.
        switch ($type) {
            case 'chapter':
                $o['chapter'] = $type_value['value'];
                $d['chapter'] = $type_value['value'];
                break;
            case 'region':
                $o['region'] = $type_value['value'];
                $d['region'] = $type_value['value'];
                break;
    }
    // NOTE(review): $html and $i are undefined here — concatenation artifact?
    $html->clear();
    unset($html);
    scraperwiki::save_var('last_id', $i);
}
require 'scraperwiki/simple_html_dom.php';
scraperwiki::attach("s-in-s", "src");
//scraperwiki::save_var('last_id', 1);
//exit();
// Resume from the offset saved on the previous run. get_var() returns
// null when 'last_id' has never been saved, so default to 0 explicitly
// instead of letting null leak into the loop counter.
$id = (int) scraperwiki::get_var('last_id');
for ($i = $id; $i < 1900; $i++) {
    // Fetch exactly one source row at offset $i from the attached scraper.
    $src = scraperwiki::select("* from src.swdata limit {$i},1");
    // Guard: stop cleanly when the source table runs out of rows — the
    // original indexed $src[0]['link'] unconditionally and would fatal here.
    if (empty($src[0]['link'])) {
        scraperwiki::save_var('last_id', $i);
        break;
    }
    $url = 'http://sexinsex.net/bbs/' . $src[0]['link'];
    $html_content = scraperwiki::scrape($url);
    $html = str_get_html($html_content);
    // str_get_html() returns false on unparseable/oversized input; the
    // original would fatal calling ->find() on false. Skip the row instead.
    if ($html === false) {
        scraperwiki::save_var('last_id', $i);
        continue;
    }
    $data = array();
    $tr = $html->find("div.postmessage div.t_msgfont");
    $j = 0;
    foreach ($tr as $trr) {
        $noidung = $trr->find('div', 0)->innertext;
        //$noidung = utf8_encode($noidung);
        // Presumably filters out short/noise posts (signatures, quotes);
        // the 1000-char threshold keeps only substantial content — TODO confirm.
        if (mb_strlen($noidung) > 1000) {
            $j++;
            // Content is base64-encoded so the raw HTML survives storage intact.
            @scraperwiki::save_sqlite(array('id'), array('id' => $j . '-' . $src[0]['url'], 'title' => $src[0]['title'], 'url' => $src[0]['url'], 'content' => base64_encode($noidung), 'order' => $j, 'num' => $src[0]['num'], 'reply' => $src[0]['reply']));
        }
    }
    $html->clear();
    unset($html);
    // Checkpoint progress so an interrupted run resumes at this offset.
    scraperwiki::save_var('last_id', $i);
}
// Pull the distinct synth rows already stored in the 'current_synths'
// scraper via the public ScraperWiki SQLite API (jsondict format).
$synthList3 = file_get_contents("https://api.scraperwiki.com/api/1.0/datastore/sqlite?format=jsondict&name=current_synths&query=select%20DISTINCT%20manufacturer%2C%20url%2C%20name%20from%20%60swdata%60");
// file_get_contents() returns false on network failure and json_decode()
// returns null on invalid JSON; the original left a raw string/false in
// $synthList3 in those cases and traverseList() would then iterate a
// non-array. Normalise every failure mode to an empty list.
$synthList3 = $synthList3 === false ? null : json_decode($synthList3);
if (!is_array($synthList3)) {
    $synthList3 = array();
}
$synths = array();
// $synthList1 / $synthList2 are built earlier in the file (out of view here).
$synths = traverseList($synthList1);
$synths = array_merge(traverseList($synthList2), $synths);
$synths = array_merge(traverseList($synthList3), $synths);
// Deduplicate structured rows: serialize each one so array_unique can
// compare them as strings, then restore the originals.
$synths = array_map('unserialize', array_unique(array_map('serialize', $synths)));
echo "Total synths: " . count($synths) . "\n";
//var_dump($synths);
if (!empty($synths)) {
    //$dbName = "vintagesynth-scrape-".$today = date("m-d-Y");
    // Composite key prevents duplicate rows across repeated runs.
    $saveMessage = scraperWiki::save_sqlite(array('manufacturer', 'name', 'url'), $synths);
    //print strval($saveMessage);
    scraperwiki::save_var('total_results', count($synths));
    print scraperWiki::get_var('total_results');
}
/**
 * Normalises a decoded JSON list of synth records by stripping HTML
 * markup from every field value.
 *
 * BUG FIX: the original pattern "/<*.>/" matched ANY character followed
 * by '>' (the '<' was optional), so plain text such as "A > B" was
 * mangled to "A B". strip_tags() removes only real markup runs.
 *
 * @param iterable|null $list list of stdClass items (as from json_decode);
 *                            items are mutated in place, as before
 * @return array the same items with markup removed from each field
 */
function traverseList($list)
{
    $dataList = array();
    // Cast guards against null/false from a failed fetch upstream.
    foreach ((array) $list as $item) {
        //Clean up the data
        foreach ($item as $key => $value) {
            $item->{$key} = strip_tags((string) $value);
            //echo $item->$key."\n";
        }
        $dataList[] = $item;
    }
    return $dataList;
}
 /**
  * Stores a named metadata value in the scraper's key/value store.
  * Thin wrapper that delegates to scraperwiki::save_var(); the
  * commented-out line shows the older metadata-client path it replaced.
  *
  * @param string $metadata_name key under which the value is stored
  * @param mixed  $value         value to persist
  * @return mixed whatever scraperwiki::save_var() returns
  */
 static function save_metadata($metadata_name, $value)
 {
     return scraperwiki::save_var($metadata_name, $value);
     //return SW_MetadataClient::create()->save($metadata_name, $value);
 }
Esempio n. 30
0
        } else {
            $primary = '';
        }
        #echo "primary \n";
        #echo json_encode($primary);
        #echo "\n";
        $primary = parseAddress($primary);
        $legal = parseAddress($legal);
        if (trim($name) != '') {
            scraperwiki::save_sqlite(array('ukprn'), array('ukprn' => clean($num), 'instname' => clean($name), 'trading' => clean($trading)), "data");
        }
        scraperwiki::save_var('counter', $counter);
    }
    $counter++;
    if ($counter >= $max) {
        scraperwiki::save_var('counter', 10000000);
        $i = 1001;
    }
}
function parseAddress($val)
{
    preg_match_all('|<strong>Telephone: </strong>(.*?)<br />|', $val, $phone);
    if (isset($phone[1][0])) {
        $dat['phone'] = trim($phone[1][0]);
    } else {
        $dat['phone'] = '';
    }
    preg_match_all('|<strong>E-mail: </strong><a href="mailto:(.*?)">.*?</a><br />|', $val, $email);
    if (isset($email[1][0])) {
        $dat['email'] = trim($email[1][0]);
    } else {