Exemple #1
2
 function parseModelsPage($brandId, $brandName, $page)
 {
     // Fetch one GSMArena brand listing page and save every model found,
     // then follow the "Next page" pagination link recursively.
     $this->html = str_get_html(scraperwiki::scrape($page));
     foreach ($this->html->find("div.makers a") as $anchor) {
         $thumb = $anchor->find('img', 0);
         // The model id is embedded in the href, e.g. "phone-1234.php".
         $hrefParts = explode('-', $anchor->href);
         $model = array(
             'name' => $brandName . ' ' . $anchor->find('strong', 0)->innertext,
             'img' => $thumb->src,
             'link' => 'http://www.gsmarena.com/' . $anchor->href,
             'desc' => $thumb->title,
             'id' => (int) substr($hrefParts[1], 0, -4),
             'brand_id' => $brandId,
         );
         scraperwiki::save_sqlite(array("id" => $model['id']), $model, "cell_model");
         $this->models++;
     }
     // The last pagination child is the "Next page" link when more pages exist.
     $pagination = $this->html->find("div.nav-pages", 0);
     if ($pagination) {
         $next = $pagination->lastChild();
         if ($next && $next->title == "Next page") {
             $this->parseModelsPage($brandId, $brandName, 'http://www.gsmarena.com/' . $next->href);
         }
     }
     // Free the DOM before recursion unwinds.
     $this->html->__destruct();
 }
function kcci($uuid)
{
    // Scrape a KCCI member profile page and store it keyed by UUID.
    // Create DOM from URL or file
    $html = file_get_html('http://www.kcci.com.pk/UserProfile/tabid/42/userId/' . $uuid . '/Default.aspx');
    // Extract member profile from the second table on the page
    $table = $html->find('table', 1);
    $profile = array();
    foreach ($table->find('td') as $td) {
        array_push($profile, $td->plaintext);
    }
    $record['UUID'] = $uuid;
    // Cells alternate label/value. BUG FIX: guard the value lookup so an odd
    // number of cells no longer reads past the end of $profile.
    for ($i = 0; $i < count($profile); $i += 2) {
        if (isset($profile[$i + 1])) {
            $record[$profile[$i]] = $profile[$i + 1];
        }
    }
    // Save the record
    ksort($record);
    $unique_keys = array('UUID');
    scraperwiki::save_sqlite($unique_keys, $record, $table_name = "kcci", $verbose = 2);
    // Clean up parser objects to release memory. (The original also called
    // clear() on the dangling foreach variable $td, which was pointless.)
    unset($record);
    unset($profile);
    $table->clear();
    unset($table);
    $html->clear();
    unset($html);
}
function grep_munich($url, $table_name)
{
    // Scrape the Munich airport flight board at $url into $table_name.
    # Use the PHP Simple HTML DOM Parser to extract <td> tags
    $dom = new simple_html_dom();
    $dom->load(scraperWiki::scrape($url));
    // Drop all stale information by recreating the table from scratch.
    scraperwiki::sqliteexecute("drop table if exists " . $table_name);
    scraperwiki::sqlitecommit();
    $rowIndex = 0;
    $infoArea = $dom->getElementById('flight_info_area');
    foreach ($infoArea->find('tr') as $row) {
        $cells = $row->find("td");
        // Rows with fewer than 7 cells are headers/separators — skip them.
        if (sizeof($cells) < 7) {
            continue;
        }
        // Build one flight record: flight number, origin, scheduled and
        // expected times, stamped with today's date and a row counter.
        $flight_data = array(
            "date" => date("Y-m-d"),
            "count" => $rowIndex,
            "flightnr" => $cells[1]->plaintext,
            "from" => $cells[2]->plaintext,
            "time" => $cells[3]->plaintext,
            "expected_time" => $cells[4]->plaintext,
        );
        scraperwiki::save_sqlite(array("date", "count"), $flight_data, $table_name);
        $rowIndex++;
    }
}
function run_ml($q_num = 0)
{
    // Scrape one musiklegal.com results page, save every song row, then
    // recurse into the next page (the "Next" link) until none remains.
    $html = scraperWiki::scrape("http://musiklegal.com/search/result/a/" . $q_num);
    $dom = new simple_html_dom();
    $dom->load($html);
    foreach ($dom->find("tr") as $data) {
        $tds = $data->find("td");
        // The second cell mixes the song code and title into one mangled
        // link; strip the markup and split code from title on '">'.
        $temp_data = explode('">', str_replace('</<strong>a</strong>>', '', str_replace('<<strong>a</strong> href="http://musiklegal.com/song/detail/', '', $tds[1]->plaintext)));
        $record = array('No' => str_replace('.', '', $tds[0]->plaintext), 'Code' => $temp_data[0], 'Song Title' => $temp_data[1], 'Artist' => $tds[2]->plaintext, 'Album' => $tds[3]->plaintext);
        /*
         *  Stores results
         */
        scraperwiki::save_sqlite(array("No"), $record);
        unset($temp_data);
    }
    // BUG FIX: $tmp_a was undefined when the page had no "Next" link,
    // raising a notice before the (int) cast; default it to 0 so the
    // recursion terminates cleanly.
    $tmp_a = 0;
    foreach ($dom->find("a") as $a) {
        if ($a->plaintext == 'Next') {
            $tmp_a = $a->href;
            $tmp_a = str_replace('http://musiklegal.com/search/result/a/', '', $tmp_a);
            if ($tmp_a > 0) {
                continue;
            }
        }
    }
    if ((int) $tmp_a != 0) {
        run_ml($tmp_a);
    } else {
        exit;
    }
}
function scrapeMarketGroup($url)
{
    // Recursively scrape a goonmetrics.com market-group page: depth-first
    // visit every not-yet-seen sub-group, then save the item rows found on
    // this page. $visitedIds is shared across calls to avoid revisiting.
    global $visitedIds;
    $html = scraperWiki::scrape($url);
    // Collapse the page onto one line so the row regex can span "lines".
    $html = str_replace("\n", "", $html);
    // Sub-group links look like /importing/61000746/marketgroup/<id>/.
    preg_match_all("|<a href=\"/importing/61000746/marketgroup/(\\d+?)/\">(.+?)</a>|s", $html, $matches, PREG_SET_ORDER);
    foreach ($matches as $match) {
        $groupId = $match[1];
        $groupName = html_entity_decode($match[2]);
        //echo $groupName."\n";
        if (!in_array($groupId, $visitedIds)) {
            $visitedIds[] = $groupId;
            scrapeMarketGroup("http://goonmetrics.com/importing/61000746/marketgroup/" . $groupId . "/");
        }
    }
    // Item rows: capture group 4 = EVE type id, 7 = item name, 11 = weekly
    // volume, 17 = stock level; the other groups swallow layout noise.
    preg_match_all("|<tr(.*?)>(.*?)<td(.*?)><a href=\"http://games.chruker.dk/eve_online/item.php\\?type_id=(.+?)\" target=\"_blank\">(.*?)<span class=\"dot\" onclick=\"CCPEVE.showMarketDetails\\((.*?)\\)\">(.+?)</span>(.*?)</td>(.*?)<td(.*?)>(.+?)</td>(.*?)<td(.*?)>(.*?)</td>(.*?)<td(.*?)>(.+?)</td>(.*?)<td(.*?)>(.*?)</td>(.*?)<td(.*?)>(.*?)</td>(.*?)<td(.*?)>(.*?)</td>(.*?)<td(.*?)>(.*?)</td>(.*?)<td(.*?)>(.*?)</td>(.*?)</tr>|s", $html, $matches, PREG_SET_ORDER);
    foreach ($matches as $match) {
        // Repair non-UTF-8 bytes before saving (these pages mix encodings).
        $item = array("itemId" => trim($match[4]), "name" => trim(mb_check_encoding($match[7], 'UTF-8') ? $match[7] : utf8_encode($match[7])), "weekVol" => trim(mb_check_encoding($match[11], 'UTF-8') ? $match[11] : utf8_encode($match[11])), "k6Stock" => trim(mb_check_encoding($match[17], 'UTF-8') ? $match[17] : utf8_encode($match[17])));
        // Drop thousands separators so the values parse as numbers.
        $item['weekVol'] = str_replace(",", "", $item['weekVol']);
        $item['k6Stock'] = str_replace(",", "", $item['k6Stock']);
        // Retry the save (up to 600 attempts, 10 s apart) to ride out
        // transient SQLite "database locked" errors; @ hides warning noise.
        $saved = false;
        $delay = 0;
        while (!$saved && $delay < 600) {
            try {
                @scraperwiki::save_sqlite(array('itemId'), $item, 'eve_goonmetrics');
                $saved = true;
            } catch (Exception $e) {
                sleep(10);
                $delay++;
            }
        }
    }
}
function scrape_page()
{
    // Scrape one result page from asuntojen.hintatiedot.fi. Globals carry
    // the search parameters ($GLOBALS: c, s, r, amin, amax, z) and running
    // totals (rowTotal) between recursive calls.
    $rowsOnPage = 0;
    $html = scraperWiki::scrape("http://asuntojen.hintatiedot.fi/haku/?c=" . $GLOBALS['c'] . "&s=" . $GLOBALS['s'] . "&r=" . $GLOBALS['r'] . "&amin=" . $GLOBALS['amin'] . "&amax=" . $GLOBALS['amax'] . "&z=" . $GLOBALS['z']);
    $dom = new simple_html_dom();
    $dom->load($html);
    foreach ($dom->find("tr") as $row) {
        $tds = $row->find("td");
        // Apartment rows have more than 8 cells; shorter rows are headers.
        if (count($tds) > 8) {
            $rowsOnPage++;
            $GLOBALS['rowTotal']++;
            $apt = array("Uniikkiavain" => $GLOBALS['rowTotal'], "Kaupunginosa" => $tds[0]->plaintext, "Myyntihinta" => $tds[3]->plaintext, "Neliohinta" => $tds[4]->plaintext, "Tyyppi" => $tds[1]->plaintext, "Koko" => $tds[2]->plaintext);
            scraperwiki::save_sqlite(null, $apt, $table_name = $GLOBALS['c'] . " " . $GLOBALS['time']);
            print $GLOBALS['rowTotal'] . "\n";
            print $rowsOnPage . ". Sijainti: " . $tds[0]->plaintext . " Hinta: " . $tds[3]->plaintext . " Tyyppi: " . $tds[1]->plaintext . " Koko: " . $tds[2]->plaintext . " Neliöhinta: " . $tds[4]->plaintext . "€" . "\n";
        }
    }
    // A full page holds exactly 50 rows; in that case fetch the next page.
    if ($rowsOnPage == 50) {
        print "Vielä jatkuu, haetaan seuraava sivu..." . "\n";
        $GLOBALS['z']++;
        scrape_page();
    } else {
        print "Skrääpiminen suoritettu." . "\n";
        print "Sivuja yhteensä: " . $GLOBALS['z'] . "\n";
        print "Rivejä yhteensä: " . $GLOBALS['rowTotal'] . "\n";
    }
}
Exemple #7
0
function getCategories($u)
{
    // Recursively walk the category facet tree starting at URL $u, recording
    // each category's name, breadcrumb path and URL.
    // BUG FIX: $local was read below but never imported into function scope,
    // so the CSV branch was unreachable; declare it global alongside $f.
    global $baseurl, $f, $local;
    $path = "";
    $d = new simple_html_dom();
    $d->load(scraperwiki::scrape($u));
    echo "Loaded URL: " . $u . "\n";
    if ($d->find('div[id=ctl00_cphContent_gsaCatFacetContainer]')) {
        // Build the breadcrumb path, e.g. "Home/Tools/Drills".
        $breadcrumb = $d->find('div[id=breadcrumb]', 0);
        if (!is_null($breadcrumb)) {
            foreach ($breadcrumb->children() as $crumb) {
                $path .= trim($crumb->innertext) . "/";
            }
            // Append the final crumb (the text after the last ">").
            $path .= trim(strrchr($breadcrumb->innertext, ">"), "> ");
        }
        foreach ($d->find('div[id=ctl00_cphContent_gsaCatFacetContainer]', 0)->find('div[class=S2refinementsContainer]', 0)->children() as $div) {
            // Facet label looks like "Name (count)"; keep only the name.
            $name = trim(strstr($div->children(0)->innertext, "(", true));
            $url = $baseurl . $div->children(0)->href;
            $data = array("Name" => $name, "Path" => $path, "URL" => $url);
            echo $path . "/" . $name . "\n";
            if ($local) {
                // Local run: append to the CSV handle $f instead of SQLite.
                fputcsv($f, array($name, $path, $url));
            } else {
                scraperwiki::save_sqlite(array("URL"), $data);
            }
            getCategories($url);
        }
    }
}
function ripById($id)
{
    // Scrape one burial record from beheshtezahra.tehran.ir by search id and
    // save it keyed on that id.
    $pathToDetails = 'http://beheshtezahra.tehran.ir/Default.aspx?tabid=92&ctl=SearchDetails&mid=653&srid=' . $id;
    $output = scraperwiki::scrape($pathToDetails);
    // Each field lives in a <span> with a predictable DNN control id; the
    // capture grabs the bolded text inside it.
    $patterns = array(
        'firstname' => '/<span id="dnn_ctr653_SearchDetails_dtlDetail_lblNameBound_0"><b>(.*)<\\//smiU',
        'surname' => '/<span id="dnn_ctr653_SearchDetails_dtlDetail_lblLastNameBound_0"><b>(.*)<\\//smiU',
        'fathername' => '/<span id="dnn_ctr653_SearchDetails_dtlDetail_lblFatherNameBound_0"><b>(.*)<\\//smiU',
        'birthdate' => '/<span id="dnn_ctr653_SearchDetails_dtlDetail_lblBirthDateBound_0"><b>(.*)<\\//smiU',
        'deathdate' => '/<span id="dnn_ctr653_SearchDetails_dtlDetail_lblDafnDateBound_0"><b>(.*)<\\//smiU',
        'deathplace' => '/<span id="dnn_ctr653_SearchDetails_dtlDetail_lblDeastTownshipTitle_0"><b>(.*)<\\//smiU',
        'graveplace' => '/<span id="dnn_ctr653_SearchDetails_dtlDetail_lblDafnPlace_0"><b>(.*)<\\//smiU',
    );
    $record = array('id' => $id);
    foreach ($patterns as $field => $pattern) {
        // Missing fields are stored as empty strings.
        preg_match($pattern, $output, $temp);
        $record[$field] = isset($temp[1]) ? $temp[1] : '';
    }
    // BUG FIX: the unique-key column was array('data'), which does not exist
    // in the record; key on the actual 'id' column instead.
    scraperwiki::save_sqlite(array('id'), $record);
}
function get_codes($dom)
{
    // Record every <option> of every <select> on the page as a
    // stockCode/stockSymbol pair, keyed on the code.
    foreach ($dom->find("select") as $select) {
        foreach ($select->find("option") as $option) {
            $record = array('stockCode' => $option->value, 'stockSymbol' => $option->plaintext);
            $message = scraperwiki::save_sqlite(array("stockCode"), $record);
            #print_r($message);
        }
    }
}
function get_codes($dom)
{
    // Save one price record per listing row: item name plus CND and US
    // buy/sell prices, keyed on the item.
    foreach ($dom->find('tr[class^="list_row"]') as $row) {
        $cells = $row->find("td");
        $record = array(
            'item' => $cells[0]->plaintext,
            'BUY_CND' => $cells[1]->plaintext,
            'SELL_CND' => $cells[2]->plaintext,
            'BUY_US' => $cells[3]->plaintext,
            'SELL_US' => $cells[4]->plaintext,
        );
        scraperwiki::save_sqlite(array("item"), $record);
        print_r($record);
    }
}
Exemple #11
0
function ripByPage($page)
{
    // Fetch one JSON page of burial records (10 per page) from the Sanandaj
    // cemetery search endpoint and save each entry keyed on its id.
    $pathToDetails = 'http://aramestan.e-sanandaj.ir/BurialRequest/DeadSearch?keyword=&firstName=&lastName=&fatherName=&partNo=0&rowNo=&graveNo=&deathDateFrom=&deathDateTo=&bornDateFrom=&bornDateTo=&page=' . $page;
    $output = scraperwiki::scrape($pathToDetails);
    $resultingJsonObject = json_decode($output);
    for ($id = 0; $id <= 9; $id++) {
        $row = $resultingJsonObject->result[$id];
        $entry = array(
            'id' => $row->Id,
            'fullname' => strval($row->DeadFullName),
            'fathername' => strval($row->DeadFatherName),
            'birthdate' => strval($row->BornDate),
            'deathdate' => strval($row->DeathDate),
            'partno' => strval($row->PartNo),
            'rowno' => strval($row->RowNo),
            'graveno' => strval($row->GraveNo),
            'gender' => strval($row->Gender),
            'identitycode' => strval($row->IdentityCode),
        );
        // BUG FIX: the unique key was array('data'), a column that does not
        // exist in $entry; key on the real 'id' column. (An unused
        // $pagecount local was also removed.)
        scraperwiki::save_sqlite(array('id'), $entry);
    }
}
function grab($url)
{
    // Harvest proxy IP addresses from the proxy-list table at $url.
    $html = scraperWiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);
    foreach ($dom->find("#tbl_proxy_list tr") as $row) {
        $tds = $row->find("td");
        // Data rows have exactly 6 cells; skip headers and separators.
        if (count($tds) == 6) {
            // decode_ip() is an external helper that de-obfuscates the cell.
            $ip = decode_ip((string) $tds[0]);
            // (A duplicate, unused $record array was removed here.)
            scraperwiki::save_sqlite(array("ip"), array("ip" => $ip));
        }
    }
}
function scrapeIndex($url)
{
    // Collect the name and link of every NGO heading (<h2>) on the index
    // page, saving each to the "ngos" table and returning the full list.
    $dom = new simple_html_dom();
    $dom->load(scraperwiki::scrape($url));
    $ngos = array();
    foreach ($dom->find('h2') as $heading) {
        // Normalise the en-dash entity and decode the rest.
        $name = str_replace("&#8211;", "-", html_entity_decode($heading->plaintext));
        $link = $heading->find('a', 0);
        $entry = array("name" => $name, "url" => $link->href);
        $ngos[] = $entry;
        scraperwiki::save_sqlite(array("name"), $entry, "ngos");
    }
    print_r($ngos);
    return $ngos;
}
function crawlAgents($pageUrl, $domObj)
{
    // Scrape a user-agent listing table into the UserAgents table:
    // agent string, description and type, with &nbsp; padding removed.
    $html = scraperwiki::scrape($pageUrl);
    $domObj->load($html);
    $html = null;
    $table = $domObj->find('/html/body/table[5]');
    foreach ($table[0]->find('tr') as $trs) {
        // BUG FIX: strpos() was compared with == false, which also treats a
        // match at position 0 as "not found"; use the strict === comparison
        // so header rows containing " String " are reliably skipped.
        if (strpos($trs->firstChild()->plaintext, " String ") === false) {
            $tds = $trs->find('td');
            $agentstring = str_replace('&nbsp;', '', $tds[0]->plaintext);
            $agentdescription = str_replace('&nbsp;', '', $tds[1]->plaintext);
            $agenttype = str_replace('&nbsp;', '', $tds[2]->plaintext);
            $record = array('agent' => $agentstring, 'description' => $agentdescription, 'agent_type' => $agenttype);
            scraperwiki::save_sqlite(array('agent'), $record, $table_name = "UserAgents");
        }
    }
}
function ProductInfo($motherboards)
{
    // Scrape each board's spec page, extracting status, form factor, socket
    // and which video outputs (HDMI/VGA/DisplayPort) it exposes.
    // BUG FIX: $output was never initialised, so an empty input raised a
    // notice on return; start from an empty array.
    $output = array();
    foreach ($motherboards as $mobo) {
        $html = scraperWiki::scrape($mobo['URI']);
        $dom = new simple_html_dom();
        $dom->load($html);
        $specs = $dom->find('div#specifications');
        // Detect video outputs from the GraphicsOutput spec row.
        $video = $specs[0]->find('tr#GraphicsOutput td', 1)->plaintext;
        $hdmi = preg_match('/hdmi/', strtolower($video));
        $vga = preg_match('/vga/', strtolower($video));
        $dp = preg_match('/dp|displayport|display[ ]port/', strtolower($video));
        $details = array('Name' => $mobo['Name'], 'URI' => $mobo['URI'], 'Status' => $specs[0]->find('div#infosectionessentials tr', 1)->find('td', 1)->plaintext, 'Form factor' => $specs[0]->find('tr#FormFactor td', 1)->plaintext, 'Socket' => $specs[0]->find('tr#SupportedCPUSocket td', 1)->plaintext, 'HDMI' => $hdmi, 'VGA' => $vga, 'DP' => $dp);
        scraperwiki::save_sqlite(array('Name'), $details);
        $output[] = $details;
    }
    return $output;
}
function ProductInfo($motherboards)
{
    // Scrape each board's spec page, storing every two-column spec row
    // (label => value) alongside the board name.
    // BUG FIX: $output was never initialised, so an empty input raised a
    // notice on return; start from an empty array.
    $output = array();
    foreach ($motherboards as $mobo) {
        $html = scraperWiki::scrape($mobo['URI']);
        $dom = new simple_html_dom();
        $dom->load($html);
        $specs = $dom->find('div#specifications', 0);
        $details = array();
        $details['Name'] = $mobo['Name'];
        foreach ($specs->find('tbody tr') as $row) {
            $tds = $row->find('td');
            // Only label/value pairs; skip section headers and odd rows.
            if (count($tds) == 2) {
                $details[$tds[0]->plaintext] = $tds[1]->plaintext;
            }
        }
        scraperwiki::save_sqlite(array('Name'), $details);
        $output[] = $details;
    }
    return $output;
}
function listPage($host, $searchURL)
{
    // Scrape one search-results page: scrape and save each listed property,
    // then follow the "Suivante" (next page) link recursively.
    $html_content = scraperwiki::scrape($host . $searchURL);
    $html = str_get_html($html_content);
    $pageCount = 0; // NOTE(review): never incremented or read — dead variable.
    foreach ($html->find("div.photo ul.thumb a") as $el) {
        // Thumbnail links are relative ("../..."); rebase them to the root.
        $propPage = str_replace("../", "/", $el->href);
        // listProperty() is an external helper that scrapes one listing page.
        $property = listProperty($host, $propPage);
        scraperwiki::save_sqlite(array('property'), $property);
        // NOTE(review): this exit terminates the whole script after the first
        // property — looks like a debugging leftover; confirm before removal.
        exit;
    }
    // Pagination: recurse into the next results page, if any.
    foreach ($html->find("a.pageResults") as $el) {
        if (trim($el->plaintext) == "Suivante") {
            $nextPage = $el->href;
            listPage($host, $nextPage);
            break;
        }
    }
}
function save_generic($category, $title)
{
    // Store a generic (manufacturer-less) catalogue entry keyed by a slug of
    // the title; make_key() is an external slug helper.
    $row = array(
        "key" => make_key($title . "-generic"),
        "name" => $title,
        "manufacturer" => "N/A",
        "url" => "",
        "description" => "",
        "category" => make_key($category) . "/" . make_key($title),
    );
    scraperwiki::save_sqlite(array("key"), $row);
}
function file_get_contents_curl($url)
{
    // Fetch $url with cURL and return the response body as a string
    // (false on failure, matching curl_exec()).
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_HEADER, 0);
    // Set curl to return the data instead of printing it to the browser.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    // BUG FIX: the option constant was mangled into "http://" + CURLOPT_URL
    // (string + int), so the request URL was never actually set.
    curl_setopt($ch, CURLOPT_URL, $url);
    $data = curl_exec($ch);
    curl_close($ch);
    return $data;
}
foreach ($scraper as $scr) {
    // Attach each source scraper and re-rank every URL stored in its swdata.
    scraperwiki::attach($scr);
    $qry = "* from " . $scr . ".swdata";
    $arr = scraperwiki::select($qry);
    foreach ($arr as $d) {
        // getPagerank() is an external helper returning Google PageRank.
        $pr = (int) getPagerank($d["url"]);
        // (A dead `if (1)` wrapper and unused $d_key/$d_site locals removed.)
        $record = array('url' => utf8_encode($d["url"]), 'pr' => utf8_encode($pr), 'ar' => utf8_encode($d["rank"]), 'id' => utf8_encode($d["key"]), 'desc' => $d["site"]);
        scraperwiki::save_sqlite(array("id"), $record, "prank");
    }
}
foreach ($routes as $route) {
    // Group coordinate strings per route; @ silences the notice raised on
    // the first concatenation, when 'coords' is not yet set.
    $routemap[$route['route']]['route'] = $route['route'];
    @($routemap[$route['route']]['coords'] .= $route['latitude'] . ',' . $route['longitude'] . ',2357' . "\n");
}
// Flatten the per-route map into sequentially numbered rows for SQLite.
$theroutes = array();
$count = 0;
foreach ($routemap as $a_route) {
    $count++;
    $theroutes[] = array('id' => $count, 'route' => $a_route['route'], 'coords' => $a_route['coords']);
}
scraperwiki::save_sqlite(array("id"), $theroutes);
//Whoops, seems that doing 600 queries in under 80 seconds isn't a smart idea. This scraper attempts to aggregate coordinates into something usable.
scraperwiki::attach("tfl_bus_routes_scraper", "src");
$routes = scraperwiki::select("route, stop_name, latitude, longitude from src.tfl_buses where run = 1 order by sequence asc");
$routemap = array();
foreach ($routes as $route) {
    // Accumulate "lat,lng,2357" lines per route; @ hides the notice raised
    // on the first append, before 'coords' exists.
    $routemap[$route['route']]['route'] = $route['route'];
    @($routemap[$route['route']]['coords'] .= $route['latitude'] . ',' . $route['longitude'] . ',2357' . "\n");
}
// Flatten into rows with a synthetic integer id.
$theroutes = array();
$rowId = 0;
foreach ($routemap as $aggregated) {
    $rowId++;
    $theroutes[] = array('id' => $rowId, 'route' => $aggregated['route'], 'coords' => $aggregated['coords']);
}
scraperwiki::save_sqlite(array("id"), $theroutes);
function getContent($page, $params, $cnt, $tag_counts)
{
    // Walk the Flickr REST API page by page: save each photo row, each
    // tag->user and tag->photo pair, and accumulate tag co-occurrence counts
    // that are flushed once the final page has been processed.
    $photo_cnt = $cnt;
    if ($page > 0) {
        $params['page'] = $page;
    }
    // Build the query string from the parameter map.
    $encoded_params = array();
    foreach ($params as $k => $v) {
        $encoded_params[] = urlencode($k) . '=' . urlencode($v);
    }
    $url = "http://api.flickr.com/services/rest/?" . implode('&', $encoded_params);
    $rsp = file_get_contents($url);
    // NOTE(review): unserialize() on remote data is unsafe in general; this
    // relies on Flickr's php_serial response format — consider json instead.
    $rsp_obj = unserialize($rsp);
    if ($rsp_obj['stat'] == 'ok') {
        // BUG FIX: these accumulators were initialised as $rows_p / $rows_u
        // but appended to (and saved) as $row_p / $row_u, so a page whose
        // photos carried no tags crashed on undefined variables at the save
        // calls below. Use one consistent name.
        $row_p = array();
        $row_u = array();
        $photo_rows = array();
        foreach ($rsp_obj['photos']['photo'] as $photo) {
            $photo_rows[] = $photo;
            $photo_cnt++;
            $tags = explode(' ', trim($photo['tags']));
            foreach ($tags as $t) {
                if ($t != '') {
                    $row_u[] = array('tag' => $t, 'user' => $photo['owner']);
                    $row_p[] = array('tag' => $t, 'photo' => $photo['id']);
                    // Count unordered tag pairs; an existing pair may be
                    // stored under either "a<>b" or "b<>a".
                    foreach ($tags as $t2) {
                        if ($t != $t2) {
                            $label = "{$t}<>{$t2}";
                            $entry = false;
                            if (isset($tag_counts[$label])) {
                                $entry = $tag_counts[$label];
                            } else {
                                if (isset($tag_counts["{$t2}<>{$t}"])) {
                                    $label = "{$t2}<>{$t}";
                                    $entry = $tag_counts[$label];
                                }
                            }
                            if (!$entry) {
                                $entry = array("tag1" => $t, "tag2" => $t2, "count" => 1);
                            } else {
                                $entry['count']++;
                            }
                            $tag_counts[$label] = $entry;
                        }
                    }
                }
            }
        }
        scraperwiki::save_sqlite(array('id'), $photo_rows, $table_name = "photos");
        scraperwiki::save_sqlite(array(), $row_u, $table_name = "tag_user");
        scraperwiki::save_sqlite(array(), $row_p, $table_name = "tag_photo");
        if ($page < $rsp_obj['photos']['pages']) {
            // More pages remain: recurse, carrying counts forward.
            getContent($page + 1, $params, $photo_cnt, $tag_counts);
        } else {
            // Last page: flush the accumulated co-occurrence table.
            scraperwiki::save_sqlite(array('tag1', 'tag2'), array_values($tag_counts), $table_name = "tag_tag");
            print "photos: {$photo_cnt}\n";
        }
    } else {
        echo "Call failed: {$page}!";
    }
}
 static function save_var($name, $value)
 {
     // Persist a named variable in the swvariables table: ints and doubles
     // are stored raw, everything else is JSON-encoded into the value blob.
     if (is_int($value) || is_double($value)) {
         $jvalue = $value;
     } else {
         $jvalue = json_encode($value);
     }
     $data = array("name" => $name, "value_blob" => $jvalue, "type" => gettype($value));
     scraperwiki::save_sqlite(array("name"), $data, "swvariables");
 }
    }
    $html->clear();
    unset($html);
    scraperwiki::save_var('last_id', $i);
}
require 'scraperwiki/simple_html_dom.php';
scraperwiki::attach("s-in-s", "src");
// Resume from the last processed row index (persisted between runs).
$id = scraperwiki::get_var('last_id');
for ($i = $id; $i < 1900; $i++) {
    // Pull one source row at offset $i and scrape its thread page.
    $src = scraperwiki::select("* from src.swdata limit {$i},1");
    $url = 'http://sexinsex.net/bbs/' . $src[0]['link'];
    $html_content = scraperwiki::scrape($url);
    $html = str_get_html($html_content);
    $data = array();
    $posts = $html->find("div.postmessage div.t_msgfont");
    $j = 0;
    foreach ($posts as $post) {
        $noidung = $post->find('div', 0)->innertext;
        // Only keep substantial posts (more than 1000 characters).
        if (mb_strlen($noidung) > 1000) {
            $j++;
            @scraperwiki::save_sqlite(array('id'), array('id' => $j . '-' . $src[0]['url'], 'title' => $src[0]['title'], 'url' => $src[0]['url'], 'content' => base64_encode($noidung), 'order' => $j, 'num' => $src[0]['num'], 'reply' => $src[0]['reply']));
        }
    }
    $html->clear();
    unset($html);
    // Checkpoint progress so a restart resumes here.
    scraperwiki::save_var('last_id', $i);
}
Exemple #24
0
// All that matters is that your final data is written to an SQLite database
// called "data.sqlite" in the current working directory which has at least a table
// called "data".
require 'scraperwiki.php';
function scrapePOST($url)
{
    // Issue an empty-bodied POST to $url and return the response text
    // (false on failure, matching curl_exec()).
    $curl = curl_init($url);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
    // Follow up to 10 redirects.
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($curl, CURLOPT_MAXREDIRS, 10);
    curl_setopt($curl, CURLOPT_POST, 1);
    // disable SSL checking to match behaviour in Python/Ruby.
    // ideally would be fixed by configuring curl to use a proper
    // reverse SSL proxy, and making our http proxy support that.
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
    $response = curl_exec($curl);
    curl_close($curl);
    return $response;
}
$url = "http://motavafian.beheshtm.ir/peaplesearch.php";
// Submit an empty search form (POST) so the site lists every record.
$postdata = http_build_query(array('Family' => '', 'FName' => '', 'NationalCode' => '', 'Shn' => '', 'by' => '', 'ey' => '', 'submit' => 'جستجو'));
$opts = array('http' => array('method' => "POST", 'content' => $postdata));
$context = stream_context_create($opts);
$content = file_get_contents($url, false, $context, -1);
// Each result row alternates alt1/alt2 cells: id, fullname, father name,
// national code, death date, block/part/row/grave numbers, neighbour.
preg_match_all("/<td class=\"alt2\" align=\"right\">(\\d*)<\\/td>.*<td class=\"alt1\" align=\"right\">(.*)<\\/td>.*<td class=\"alt2\" align=\"right\">(.*)<\\/td>.*<td class=\"alt1\" align=\"right\">(.*)<\\/td>.*<td class=\"alt2\" align=\"right\">(.*)<\\/td>.*<td class=\"alt2\" align=\"right\">(\\d*)<\\/td>.*<td class=\"alt1\" align=\"right\">(\\d*)<\\/td>.*<td class=\"alt2\" align=\"right\">(\\d*)<\\/td>.*<td class=\"alt1\" align=\"right\">(\\d*)<\\/td>.*<td class=\"alt2\" align=\"right\">(.*)<\\/td>/Usmi", $content, $output_array);
$amount = count($output_array[1]);
print $amount;
// BUG FIX: the loop ran with $i <= $amount, reading one element past the end
// of every match array on the final iteration; use a strict < bound.
for ($i = 0; $i < $amount; $i++) {
    $record = array('id' => $output_array[1][$i], 'fullname' => $output_array[2][$i], 'fathername' => $output_array[3][$i], 'codemelli' => $output_array[4][$i], 'deathdate' => $output_array[5][$i], 'blockno' => $output_array[6][$i], 'partno' => $output_array[7][$i], 'rowno' => $output_array[8][$i], 'graveno' => $output_array[9][$i], 'nextto' => $output_array[10][$i]);
    // BUG FIX: key on the existing 'id' column, not the nonexistent 'data'.
    scraperwiki::save_sqlite(array('id'), $record);
}
<?php

// English/Welsh regions: MapIt area id, label, population (millions), and a
// 'geo' slot filled in below.
$regions = array(0 => array('id' => 11812, 'label' => 'North East', 'pop' => '2.6', 'geo' => ''), 1 => array('id' => 11807, 'label' => 'North West', 'pop' => '7.1', 'geo' => ''), 2 => array('id' => 11810, 'label' => 'Yorkshire and the Humber', 'pop' => '5.3', 'geo' => ''), 3 => array('id' => 11805, 'label' => 'East Midlands', 'pop' => '4.5', 'geo' => ''), 4 => array('id' => 11809, 'label' => 'West Midlands', 'pop' => '5.6', 'geo' => ''), 5 => array('id' => 11804, 'label' => 'Eastern England', 'pop' => '5.8', 'geo' => ''), 6 => array('id' => 11806, 'label' => 'London', 'pop' => '8.2', 'geo' => ''), 7 => array('id' => 11811, 'label' => 'South East', 'pop' => '8.6', 'geo' => ''), 8 => array('id' => 11814, 'label' => 'South West', 'pop' => '5.3', 'geo' => ''), 9 => array('id' => 11813, 'label' => 'Wales', 'pop' => '3.1', 'geo' => ''));
// Fetch each region's boundary GeoJSON from MapIt and store it serialized.
foreach ($regions as $idx => $regionInfo) {
    $regions[$idx]['geo'] = serialize(json_decode(file_get_contents('http://mapit.mysociety.org/area/' . $regionInfo['id'] . '.geojson')));
}
scraperwiki::save_sqlite(array("id"), $regions);
// Same region table as above: MapIt area id, label, population (millions).
$regions = array(0 => array('id' => 11812, 'label' => 'North East', 'pop' => '2.6', 'geo' => ''), 1 => array('id' => 11807, 'label' => 'North West', 'pop' => '7.1', 'geo' => ''), 2 => array('id' => 11810, 'label' => 'Yorkshire and the Humber', 'pop' => '5.3', 'geo' => ''), 3 => array('id' => 11805, 'label' => 'East Midlands', 'pop' => '4.5', 'geo' => ''), 4 => array('id' => 11809, 'label' => 'West Midlands', 'pop' => '5.6', 'geo' => ''), 5 => array('id' => 11804, 'label' => 'Eastern England', 'pop' => '5.8', 'geo' => ''), 6 => array('id' => 11806, 'label' => 'London', 'pop' => '8.2', 'geo' => ''), 7 => array('id' => 11811, 'label' => 'South East', 'pop' => '8.6', 'geo' => ''), 8 => array('id' => 11814, 'label' => 'South West', 'pop' => '5.3', 'geo' => ''), 9 => array('id' => 11813, 'label' => 'Wales', 'pop' => '3.1', 'geo' => ''));
// Fill the 'geo' slot of each region with its serialized MapIt GeoJSON.
foreach ($regions as $idx => $regionInfo) {
    $regions[$idx]['geo'] = serialize(json_decode(file_get_contents('http://mapit.mysociety.org/area/' . $regionInfo['id'] . '.geojson')));
}
scraperwiki::save_sqlite(array("id"), $regions);
 static function save_var($name, $value)
 {
     // Persist a named scalar in the swvariables table; non-scalar values
     // are coerced to string after printing a console warning.
     $vtype = gettype($value);
     $isScalar = in_array($vtype, array("integer", "string", "double", "NULL"));
     if (!$isScalar) {
         print_r("*** object of type {$vtype} converted to string\n");
     }
     $data = array("name" => $name, "value_blob" => strval($value), "type" => $vtype);
     scraperwiki::save_sqlite(array("name"), $data, "swvariables");
 }
//
// // Find something on the page using css selectors
// $dom = new simple_html_dom();
// $dom->load($html);
// print_r($dom->find("table.list"));
//
// // Write out to the sqlite database using scraperwiki library
// scraperwiki::save_sqlite(array('name'), array('name' => 'susan', 'occupation' => 'software developer'));
//
// // An arbitrary query against the database
// scraperwiki::select("* from data where 'name'='peter'")
// You don't have to do things with the ScraperWiki library. You can use whatever is installed
// on Morph for PHP (See https://github.com/openaustralia/morph-docker-php) and all that matters
// is that your final data is written to an Sqlite database called data.sqlite in the current working directory which
// has at least a table called data.
require 'scraperwiki.php';
require 'scraperwiki/simple_html_dom.php';
// Read in an eBay search-results page.
$html = scraperwiki::scrape("http://www.ebay.com/sch/i.html?_from=R40&_trksid=p2050601.m570.l1313.TR0.TRC0.H0.XAmerican+Revolutionary+War&_nkw=American+Revolutionary+War&_sacat=0");
// Dump the listing title links found via a CSS selector.
$dom = new simple_html_dom();
$dom->load($html);
print_r($dom->find("h3[class='lvtitle'] a"));
// Write a sample row to the sqlite database using the scraperwiki library.
scraperwiki::save_sqlite(array('name'), array('name' => 'susan', 'occupation' => 'software developer'));
// An arbitrary query against the database would look like:
// scraperwiki::select("* from data where 'name'='peter'")
    // Build a cleaned article body: keep only plain-text paragraphs from the
    // scraped nodes, dropping comments, anchors and unwanted fragments.
    $record = "<div class=\"kiji\">";
    foreach ($obj->nodes as $v) {
        // Strip HTML comments, <p> tags and whole anchors, then all markup.
        $v = preg_replace("/<!\\-\\-.*?\\-\\->/", "", $v->outertext());
        $v = preg_replace("/(<p>|<\\/p>)/", "", $v);
        $v = preg_replace("/<a .*?<\\/a>/", "", $v);
        $v = strip_tags($v);
        // Skip fragments that are empty after cleaning.
        if (strlen(trim($v)) == 0) {
            continue;
        }
        // NOTE(review): these tag checks run after strip_tags(), so they can
        // only match if stripping failed — presumably defensive; confirm.
        if (preg_match("/<h1/", $v)) {
            continue;
        }
        if (preg_match("/<div +class=(\"|')date/", $v)) {
            continue;
        }
        if (preg_match("/<div +class=(\"|')cl/", $v)) {
            continue;
        }
        $record .= $v . "\n";
    }
    $record .= "</div>";
}
// Convert title and body from Shift_JIS (Windows variant) to UTF-8, then
// store the combined article under a fixed id with a timestamp.
$title = mb_convert_encoding($title, "utf8", "sjis-win");
$record = mb_convert_encoding($record, "utf8", "sjis-win");
$record = $title . $record;
$date = date('Y/m/d H:i:s');
scraperwiki::save_sqlite(array("id"), array("id" => "1", "date" => $date, "news" => $record));
    // Candidate name from the 6th <b> element, scrubbed of markup, quotes,
    // doubled spaces and HTML entities.
    $name = $html->find('b', 5)->innertext;
    $name = strip_tags($name);
    $name = str_replace('"', "", $name);
    $name = str_replace('  ', "", $name);
    $name = str_replace('&amp;', "", $name);
    $name = str_replace('&nbsp;', "", $name);
    $name = str_replace('&nbsp;&nbsp;', "", $name);
    $name = str_replace(' &nbsp;&nbsp;', "", $name);
    $name = strip_tags($name);
    // Roll number from the 7th <b> element: remove the label, apply the same
    // entity scrubbing, then strip all remaining spaces.
    $number = $html->find('b', 6)->plaintext;
    $number = trim(str_replace("Roll Number :", "", $number));
    $number = strip_tags($number);
    $number = str_replace('"', "", $number);
    $number = str_replace('  ', "", $number);
    $number = str_replace('&amp;', "", $number);
    $number = str_replace('&nbsp;', "", $number);
    $number = str_replace('&nbsp;&nbsp;', "", $number);
    $number = str_replace(' &nbsp;&nbsp;', "", $number);
    $number = strip_tags($number);
    $number = trim(str_replace(" ", "", $number));
    // Result verdict plus per-subject marks taken from fixed cell positions.
    $res = $html->find('b', 7)->plaintext;
    $res = trim(str_replace("Result :", "", $res));
    $bee = $html->find('td', 11)->plaintext;
    $bmec = $html->find('td', 13)->plaintext;
    $be = $html->find('td', 15)->plaintext;
    $faa = $html->find('td', 17)->plaintext;
    $total = $html->find('td', 19)->plaintext;
    // Save keyed on roll number; rows without a number are skipped.
    if ($number) {
        $message = scraperwiki::save_sqlite(array("number"), array("number" => $number, "name" => $name, "bee" => $bee, "bmec" => $bmec, "be" => $be, "faa" => $faa, "result" => $res, "total" => $total), $table_name = "swdata");
    }
}
$html = file_get_html('http://nadaguides.com/Cars/1996/Lincoln/Continental-V8/Sedan-4D/Values');
// Locate the pricing table (the only table with a class attribute).
$priceTables = $html->find('table[class]');
// The 20th cell (index 19) holds the price we want.
$cells = $priceTables[0]->find('td');
$data = $cells[19]->plaintext;
// The page's <h1> identifies the vehicle.
$headings = $html->find('h1');
$h1clean = $headings[0]->plaintext;
// Strip the dollar sign, then the thousands separators.
$data2 = str_replace("\$", "", $data);
$data2 = str_replace(",", "", $data2);
$message = scraperwiki::save_sqlite(array("price"), array("price" => $data2, "title" => $h1clean));
require 'scraperwiki/simple_html_dom.php';
// Create DOM from the NADA Guides values page.
$html = file_get_html('http://nadaguides.com/Cars/1996/Lincoln/Continental-V8/Sedan-4D/Values');
// Find the table that contains prices (the only table with a class).
$table = $html->find('table[class]');
// Grab the price from the 20th cell (index 19).
$td = $table[0]->find('td');
$data = $td[19]->plaintext;
// Vehicle title from the page's <h1>.
$h1 = $html->find('h1');
$h1clean = $h1[0]->plaintext;
// Remove "$" and "," so the price is a plain number.
$data2 = str_replace(array("\$", ","), "", $data);
$message = scraperwiki::save_sqlite(array("price"), array("price" => $data2, "title" => $h1clean));