コード例 #1
0
/**
 * Scrapes one page of the GEIPAN case-search results and stores every case row.
 *
 * POSTs the search form to geipan.fr for the requested result page, then saves
 * each table row that carries an `onclick` attribute as one record keyed on
 * (ID, maj).
 *
 * @param int|string $page Result page number, sent as the `page` form field.
 * @return void
 */
function scrapPage($page)
{
    print "Scraping page " . $page;
    $url = "http://www.geipan.fr/index.php?id=202";
    $fields_string = "&no_cache=1&"
        . "tx_geipansearch_pi1%5Bsubmit_form%5D=1&"
        . "tx_geipansearch_pi1%5Btexte_resume%5D=&"
        . "tx_geipansearch_pi1%5Bdate_debut%5D=&"
        . "tx_geipansearch_pi1%5Bdate_fin%5D=&"
        . "no_cache=1&"
        . "tx_geipansearch_pi1%5Bclasse_cas%5D=tous&"
        . "tx_geipansearch_pi1%5Bregion%5D=&"
        . "page=" . $page . "&"
        . "order_by=&"
        . "sens=";

    $curl = curl_init($url);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($curl, CURLOPT_MAXREDIRS, 10);
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($curl, CURLOPT_TIMEOUT, 20);
    // CURLOPT_POST is an on/off switch, not a field count.
    curl_setopt($curl, CURLOPT_POST, true);
    curl_setopt($curl, CURLOPT_POSTFIELDS, $fields_string);
    $html = curl_exec($curl);
    print curl_error($curl) . "\n";
    curl_close($curl);
    if ($html === false) {
        // Transfer failed (the error text was printed above): nothing to parse.
        return;
    }

    $dom = new simple_html_dom();
    $dom->load($html);
    foreach ($dom->find("tr") as $tr) {
        if (!isset($tr->attr['onclick'])) {
            continue;
        }
        $onclick = $tr->attr['onclick'];
        $casPos = strpos($onclick, "cas=");
        if ($casPos === false) {
            // No case identifier in the handler: not a result row.
            continue;
        }
        // The case ID is the 13 characters following "cas=".
        $ID = substr($onclick, $casPos + 4, 13);
        print $ID . "\n";
        $tds = $tr->find("td");
        if (count($tds) < 5) {
            continue;
        }
        $title = utf8_encode($tds[0]->plaintext);
        $date = $tds[1]->plaintext;
        $departement = utf8_encode($tds[2]->plaintext);
        $classe = $tds[3]->plaintext;
        $maj = $tds[4]->plaintext;
        // City is the part of the title before "("; without a parenthesis the
        // whole title is used instead of silently dropping its last character.
        $parenPos = strpos($title, "(");
        $city = ($parenPos === false) ? $title : rtrim(substr($title, 0, $parenPos));
        $record = array(
            'ID' => $ID,
            'title' => $title,
            'date' => $date,
            'departement' => $departement,
            'classe' => $classe,
            'maj' => $maj,
            'city' => $city,
        );
        scraperwiki::save(array('ID', 'maj'), $record);
    }
    // Release the parser's internal references.
    $dom->clear();
    unset($dom);
}
コード例 #2
0
/**
 * Scrapes one club page and stores a record per season row.
 *
 * The club name is read from the third row of the first table and appended to
 * the global `$clubs` list; season rows are read from the third table. Only
 * rows whose first cell is numeric (the year) are saved, keyed on (club, year).
 *
 * @param string $url Address of the club page.
 * @return void
 */
function clubURL($url)
{
    $html = scraperwiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);

    $headerTable = $dom->find('table', 0);
    $nameRow = $headerTable ? $headerTable->find('tr', 2) : null;
    $seasonTable = $dom->find('table', 2);
    if (!$nameRow || !$seasonTable) {
        // Page does not have the expected layout; nothing to record.
        $dom->clear();
        unset($dom);
        return;
    }

    $clubName = trim(str_replace(' ', '', $nameRow->plaintext));
    $formatClubName = trim(preg_replace('/\\s+/', ' ', $clubName));
    // $GLOBALS is the real superglobal; "$_GLOBAL" only created a local array
    // that was discarded when the function returned.
    $GLOBALS['clubs'][] = $formatClubName;
    echo 'running ' . $formatClubName . "\n";

    foreach ($seasonTable->find('tr') as $row) {
        $yearCell = $row->find('td', 0);
        if ($yearCell === null || !is_numeric($yearCell->plaintext)) {
            // Header rows and separators have no numeric year cell.
            continue;
        }
        $year = trim($yearCell->plaintext);
        $position = trim(str_replace(' ', '', $row->find('td', 1)->plaintext));
        if ($position == 'Champion') {
            $position = 1;
        }
        $leagueLevel = trim($row->find('td', 2)->plaintext);
        $overallPosition = trim($row->find('td', 3)->plaintext);
        // Dots are stripped from the attendance figures before storing them.
        $avgAttendance = trim(str_replace('.', '', $row->find('td', 4)->plaintext));
        $totalAttendance = trim(str_replace('.', '', $row->find('td', 12)->plaintext));
        $dataset = array(
            'club' => $formatClubName,
            'year' => $year,
            'finishedPosition' => $position,
            'league' => $leagueLevel,
            'overallPosition' => $overallPosition,
            'avgAttendance' => $avgAttendance,
            'totalAttendance' => $totalAttendance,
        );
        scraperwiki::save(array('club', 'year'), $dataset);
    }
    /*
     * The next two lines stop a memory leak in Simple HTML DOM as per
     * http://simplehtmldom.sourceforge.net/manual_faq.htm#memory_leak
     */
    $dom->clear();
    unset($dom);
}
コード例 #3
0
/**
 * Downloads an RSS feed and stores one (sector, url) record per feed item.
 *
 * Each item's guid has "TEXT" replaced by "DATA" to obtain the notice URL.
 * The function pauses one second after every saved item and prints memory
 * usage as it goes.
 *
 * @param string $url    Address of the RSS feed.
 * @param string $sector Sector label stored with every notice URL.
 * @return void
 */
function scrapeTEDRSS($url, $sector)
{
    print $url . " " . $sector . "\n";
    $curl = curl_init($url);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($curl, CURLOPT_MAXREDIRS, 10);
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
    // Abort the whole transfer after 20 seconds.
    // Other knobs to try: CURLOPT_CONNECTTIMEOUT (seconds), and
    // CURLOPT_LOW_SPEED_LIMIT together with CURLOPT_LOW_SPEED_TIME.
    curl_setopt($curl, CURLOPT_TIMEOUT, 20);
    $xml = curl_exec($curl);
    print curl_error($curl) . "\n";
    curl_close($curl);
    if ($xml === false) {
        // Transfer failed (the error text was printed above): nothing to parse.
        return;
    }

    $dom = new simple_html_dom();
    $dom->load($xml);
    $items = $dom->find("item");
    foreach ($items as $item) {
        $guid = $item->find("guid");
        if (!isset($guid[0])) {
            // An item without a guid has no notice URL to record.
            continue;
        }
        $noticeURL = str_replace("TEXT", "DATA", $guid[0]->plaintext);
        print $noticeURL . " " . $sector . " " . memory_get_usage() / 1000000 . "MB";
        echo "\n";
        $record = array('time' => microtime(true), 'sector' => $sector, 'url' => $noticeURL);
        scraperwiki::save(array('sector', 'url'), $record);
        sleep(1);
    }
    // clear() is the parser's public clean-up call; __destruct() is for the engine.
    $dom->clear();
    unset($items);
    unset($dom);
    unset($xml);
    print memory_get_usage() / 1024 / 1024 . "MB\n";
}
コード例 #4
0
/**
 * Stores one joke under the next sequential ID.
 *
 * Increments the global `$joke_count` and uses the new value as `JOKE_ID`.
 *
 * @param string $txt Joke text to store.
 * @return void
 */
function saveIt($txt)
{
    global $joke_count;

    $joke_count++;
    $row = array(
        'JOKE_ID' => $joke_count,
        'JOKE_TEXT' => $txt,
    );
    scraperwiki::save(array('JOKE_ID'), $row);
}
コード例 #5
0
/**
 * Follows the second "callout" link of a search-result page to its recipe and
 * stores every ingredient listed there.
 *
 * The link has "reviews/" removed from its href and is resolved against
 * http://www.foodnetwork.com. Each `li.ingredient` on the recipe page is saved
 * keyed on `ing`.
 *
 * @param string $html Markup of the search-result page.
 * @return void
 */
function getIngredients($html)
{
    $dom = new simple_html_dom();
    $dom->load($html);
    $callout = $dom->find('a[class=callout]', 1);
    if ($callout === null) {
        // No second callout link: there is no recipe page to follow.
        $dom->clear();
        return;
    }
    $res = str_replace("reviews/", "", $callout->href);
    $dom->clear();
    unset($dom);

    echo "http://www.foodnetwork.com" . $res;
    $html1 = scraperwiki::scrape("http://www.foodnetwork.com" . $res);
    $h = str_get_html($html1);
    echo "\n\n";
    if (!$h) {
        // str_get_html() returns false for empty or oversized input.
        return;
    }

    // The original read $h->href inside the loop and used $href even when it was
    // never assigned. The value does not depend on the ingredient, so it is
    // resolved once with an explicit null default.
    // NOTE(review): a per-ingredient link was probably intended here - confirm.
    $href = isset($h->href) ? $h->href : null;
    foreach ($h->find('li[class=ingredient]') as $data) {
        $ingredient = $data->plaintext;
        scraperwiki::save(array('ing'), array('ing' => $ingredient, 'href' => $href));
    }
    $h->clear();
    unset($h);
}
コード例 #6
0
/**
 * Scrapes a page of translation pairs and stores each pair.
 *
 * Every `ul` with class `trans_sent` is expected to hold the user input in its
 * first `li` and the translated output in its second. Records carry the scrape
 * timestamp, the page URL and the language.
 *
 * @param string $url  Page to scrape.
 * @param string $lang Language label stored with every record.
 * @return void
 */
function gazelangs($url, $lang)
{
    $html = scraperWiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);
    foreach ($dom->find("ul[@class='trans_sent']") as $data) {
        $tds = $data->find("li");
        if (count($tds) < 2) {
            // Not an input/output pair; skip instead of reading a missing item.
            continue;
        }
        $record = array(
            'user_input' => $tds[0]->plaintext,
            'babelfish_output' => $tds[1]->plaintext,
            'timestamp_scrape' => date("Y-m-d H:i:s"),
            'page' => $url,
            'language' => $lang,
        );
        scraperwiki::save(array('user_input', 'babelfish_output', 'timestamp_scrape', 'page', 'language'), $record);
    }
    // Release the parser's internal references.
    $dom->clear();
    unset($dom);
}
コード例 #7
0
/**
 * Scrapes the "uudis" category of ideelab.wordpress.com and stores the title
 * and body text of each published post, keyed on (title, newsbody).
 *
 * @return void
 */
function scrapeIdeeLab()
{
    $pageHtml = scraperWiki::scrape("http://ideelab.wordpress.com/category/uudis/");
    $page = new simple_html_dom();
    $page->load($pageHtml);

    $posts = $page->find('div.status-publish');
    foreach ($posts as $post) {
        $headings = $post->find('div.posttitle h2.pagetitle');
        $bodies = $post->find('div.entry');
        scraperwiki::save(
            array('title', 'newsbody'),
            array(
                'title' => $headings[0]->plaintext,
                'newsbody' => $bodies[0]->plaintext,
            )
        );
    }
}
コード例 #8
0
/**
 * Scrapes a proxy-list page and stores the decoded IP of every 7-cell row
 * found in `#tbl_proxy_list`.
 *
 * @param string $url Page to scrape.
 * @return void
 */
function grab($url)
{
    $pageHtml = scraperWiki::scrape($url);
    $page = new simple_html_dom();
    $page->load($pageHtml);

    foreach ($page->find("#tbl_proxy_list tr") as $row) {
        $cells = $row->find("td");
        if (count($cells) != 7) {
            continue;
        }
        // The first cell's markup is handed to decode_ip() as a string.
        $ip = decode_ip((string) $cells[0]);
        scraperwiki::save(array('ip'), array('ip' => $ip));
    }
}
コード例 #9
0
/**
 * Extracts link, title, description and date from one listing fragment and
 * stores them keyed on `title`.
 *
 * Link, title and description are all read from the same anchor
 * (`li[class="first last"] a`). The date is parsed as d/m/Y from the last ten
 * characters of the `date-display-single` span.
 *
 * @param string $value HTML fragment of one listing entry.
 * @return void
 */
function extract_data($value)
{
    $fragment = str_get_html($value);

    // One lookup serves link, title and description: the selector is the same.
    $anchor = $fragment->find('li[class="first last"] a', 0);
    $dateNode = $fragment->find('span[class="date-display-single"]', 0);

    $rawDate = substr($dateNode->plaintext, -10);
    $when = date_create_from_format('d/m/Y', $rawDate);
    print_date($when);

    $row = array(
        'link' => $anchor->href,
        'title' => $anchor->plaintext,
        'description' => $anchor->plaintext,
        'date' => $when,
    );
    scraperwiki::save(array('title'), $row);
}
コード例 #10
0
/**
 * Fetches one excuse page, stores its headline, then moves on to the next URL.
 *
 * Loads the page into the global `$html`, increments the global `$count` to
 * obtain the record ID, and finally calls goToNextURL().
 *
 * @param string $extension Path of the excuse page, appended to the site root.
 * @return void
 */
function getExcuse($extension)
{
    global $html, $count;

    $pageUrl = "http://www.goodexcuses.co.uk" . $extension;
    $html = file_get_html($pageUrl);

    // The excuse text is the first <h2> on the page.
    $excuse = $html->find('h2', 0)->innertext;
    echo $excuse . "\n";

    $count++;
    scraperwiki::save(
        array('EXCUSE_ID'),
        array(
            'EXCUSE_ID' => $count,
            'EXCUSE_TEXT' => $excuse,
            'EXCUSE_URL' => $extension,
        )
    );

    goToNextURL();
}
コード例 #11
0
/**
 * Scrapes a gallery detail page and stores its contact data.
 *
 * Reads name, contact, URL, e-mail, address, up to two phone lines and the
 * artist list from each `div#contentDetail1` block.
 *
 * @param string $url Address of the gallery page.
 * @return void
 */
function loadPageGallery($url)
{
    $htmlGallery = scraperWiki::scrape($url);
    $domGallery = new simple_html_dom();
    $domGallery->load($htmlGallery);
    foreach ($domGallery->find("div#contentDetail1") as $data) {
        $title = $data->find("h3");
        $adressclass = $data->find('.adres');
        $urlandemail = $data->find('.adres a');
        $artists = $data->find('.artists');
        // The contact name is the first line of the first ".adres" element.
        $contactName = explode("\n", $adressclass[0]->plaintext);
        $contactNameGallery = $contactName[0];
        // Pad to two entries so a single phone line yields a null tel2 instead
        // of an undefined-offset notice.
        $tels = array_pad(explode("\n", $adressclass[4]->plaintext), 2, null);
        list($tel1, $tel2) = $tels;
        $record = array(
            'name' => $title[0]->plaintext,
            'contact' => $contactNameGallery,
            'url' => $urlandemail[0]->plaintext,
            'email' => $urlandemail[1]->plaintext,
            'address' => $adressclass[1]->plaintext,
            'tel1' => $tel1,
            'tel2' => $tel2,
            'artists' => $artists[0]->plaintext,
        );
        scraperwiki::save(array('name', 'contact', 'url', 'email', 'address', 'tel1', 'tel2', 'artists'), $record);
    }
    // Release the parser's internal references.
    $domGallery->clear();
    unset($domGallery);
}
コード例 #12
0
/**
 * Scrapes one page of the job-search results and stores each 5-cell table row
 * as a job record keyed on `id`.
 *
 * The id is the integer value of the named anchor in the first cell; the
 * posted date is date_create() of that cell's text.
 *
 * @param int|string $page Result page number appended to the search URL.
 * @return void
 */
function scrape_job_page($page)
{
    $listingUrl = "https://jobsearch.direct.gov.uk/JobSearch/PowerSearch.aspx?tm=0&pg=" . $page;
    $page_html = scraperWiki::scrape($listingUrl);
    $dom = new simple_html_dom();
    $dom->load($page_html);

    foreach ($dom->find("table tr") as $row) {
        $cells = $row->find("td");
        if (count($cells) != 5) {
            continue;
        }

        $namedAnchor = $cells[0]->find('a[name]', 0);
        $jobId = intval($namedAnchor->name);
        $detailLink = $cells[2]->find('a', 0)->href;
        print $detailLink;

        scraperwiki::save(
            array('id'),
            array(
                'id' => $jobId,
                'posted_date' => date_create($cells[0]->plaintext),
                'job_title' => trim($cells[2]->plaintext),
                'company' => trim($cells[3]->plaintext),
                'location' => trim($cells[4]->plaintext),
                'url' => $detailLink,
            )
        );
    }

    $dom->__destruct();
}
コード例 #13
0
/**
 * Scrapes the list of municipalities for one ATO code and stores each
 * municipality's name and id, keyed on `id`.
 *
 * The id is taken from the row's first link: the
 * "dettaglio_trasmissione.php?IdComune=" prefix is removed and everything from
 * the last "&" onwards is dropped.
 *
 * @param string $p_atoCODE ATO code sent as the `ato` query parameter.
 * @return void
 */
function getCitieListByATO($p_atoCODE = "")
{
    $html = scraperWiki::scrape("http://www.rifiutiebonifica.puglia.it/dettaglio_differenziata.php?ato=" . $p_atoCODE . "&data=12");
    $dom = new simple_html_dom();
    $dom->load($html);
    foreach ($dom->find("table tr") as $data) {
        $tds = $data->find("td");
        $a = $data->find("a");
        if (!isset($a[0])) {
            continue;
        }
        $link = str_replace("dettaglio_trasmissione.php?IdComune=", "", $a[0]->href);
        // Without a trailing "&..." part the whole remainder is the id; the
        // original passed strrpos()'s false as a length and got an empty id.
        $position = strrpos($link, "&");
        $id = ($position === false) ? $link : substr($link, 0, $position);
        $comuni = array(
            'comune' => isset($tds[0]) ? $tds[0]->plaintext : null,
            'id' => $id,
        );
        scraperwiki::save(array('id'), $comuni);
    }
    // Release the parser's internal references.
    $dom->clear();
    unset($dom);
}
コード例 #14
0
/**
 * Walks the first 20 pages (0-19) of the Alexa top-sites listing for Italy and
 * stores every site label, keyed on `site`. Prints the number of labels saved.
 *
 * @return void
 */
function topSites()
{
    $country = 'IT';
    $limit = 20;
    $count = 0;

    for ($page = 0; $page < $limit; $page++) {
        $listingHtml = scraperWiki::scrape("http://www.alexa.com/topsites/countries;" . $page . "/" . $country);
        $listing = new simple_html_dom();
        $listing->load($listingHtml);

        foreach ($listing->find("span[class=topsites-label]") as $label) {
            scraperwiki::save(array('site'), array('site' => $label->plaintext));
            $count++;
        }
    }

    print $count;
}
コード例 #15
0
/**
 * Scrapes an overview page and stores one record per listed entry.
 *
 * Each nested table row is expected to hold a thumbnail link in its first cell
 * and a text link in its second. Rows whose first cell has no text are
 * skipped. Records are keyed on `id`, built from the type and the lower-cased,
 * hyphenated name.
 *
 * @param string $url  Overview page to scrape.
 * @param string $type Entry type, stored and used as the id prefix.
 * @return void
 */
function data_from_overview_page($url, $type)
{
    $html = scraperWiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);
    $base_url = 'http://www.dnr.state.mn.us';
    foreach ($dom->find(".paddingbig table tr table tr") as $rows) {
        $data = $rows->find("td");
        if (count($data) < 2) {
            // Rows without both cells used to abort the run with a method call
            // on null; they carry no entry, so skip them.
            continue;
        }
        if (empty($data[0]->plaintext)) {
            continue;
        }
        $link_image = $data[0]->find("a");
        $image = $data[0]->find("img");
        $link_text = $data[1]->find("a");
        $name = isset($link_text[0]) ? $link_text[0]->plaintext : null;
        $record = array(
            'id' => $type . '--' . strtolower(str_replace(' ', '-', $name)),
            'type' => $type,
            'name' => $name,
            'link' => !empty($link_image[0]->href) ? $base_url . $link_image[0]->href : '',
            'thumb_url' => !empty($image[0]->src) ? $image[0]->src : '',
            'timestamp' => time(),
        );
        scraperwiki::save(array('id'), $record);
    }
    // Release the parser's internal references.
    $dom->clear();
    unset($dom);
}
コード例 #16
0
/**
 * Scrapes one results page and stores the href of every `h2 a` link together
 * with its category, keyed on (cat, url). Echoes each href and a final count.
 *
 * @param string     $url     Listing URL without the page parameter.
 * @param int|string $pagenum Page number appended as `?page=`.
 * @param string     $cat     Category label stored with every link.
 * @return void
 */
function getPageOfResults($url, $pagenum, $cat)
{
    $pageHtml = scraperWiki::scrape($url . "?page=" . $pagenum);
    $page = new simple_html_dom();
    $page->load($pageHtml);

    $found = 0;
    foreach ($page->find("h2 a") as $anchor) {
        echo $anchor->href . "\n";
        // if (alreadyKnown($cat, $anchor->href)) return;
        $found++;
        scraperwiki::save(array('cat', 'url'), array('cat' => $cat, 'url' => $anchor->href));
    }
    echo "got " . $found . " results\n";

    // Following on to the next page is disabled; when enabled it should stop
    // on an empty page:
    // if ($found !== 0) getPageOfResults($url, $pagenum + 1, $cat);
}
/**
 * Scrapes a parliament.gov.za document listing and every page reachable
 * through its "Next" link, storing one record per document keyed on `url`.
 *
 * Document rows are recognised by a first cell with height 30. Name, date,
 * house, language and link come from cells 0, 2, 4, 6 and 8; the file type is
 * derived from the icon image in cell 8.
 *
 * @param string $url Address of the first listing page.
 * @return void
 */
function scrapepage($url)
{
    $base = 'http://www.parliament.gov.za/live/';
    // Pages are walked in a loop rather than by recursion: each page's DOM is
    // released before the next one loads, and a page with several "Next" links
    // (e.g. top and bottom pagers) is not followed more than once.
    do {
        $html = scraperWiki::scrape($url);
        $dom = new simple_html_dom();
        $dom->load($html);
        foreach ($dom->find("table table table table table table tr") as $row) {
            $tds = $row->find('td');
            // A document row needs all nine cells; shorter rows would fail on
            // the link lookup below.
            if (!isset($tds[8]) || $tds[0]->height != 30) {
                continue;
            }
            $link = $tds[8]->find('a');
            if (!isset($link[0])) {
                // No link means no URL, which is the record's unique key.
                continue;
            }
            $img = $tds[8]->find('img');
            $icon = isset($img[0]) ? $img[0]->src : '';
            // Start from an empty type on every row so an unknown icon cannot
            // inherit the previous row's type.
            $type = '';
            if ($icon == 'images/icon_word.gif') {
                $type = '.doc';
            } elseif ($icon == 'images/icon_pdf.gif') {
                $type = '.pdf';
            }
            $document = array();
            $document['name'] = $tds[0]->plaintext;
            if ($tds[2]->plaintext == '-') {
                $document['date'] = '';
            } else {
                $document['date'] = get_date(date_create($tds[2]->plaintext));
            }
            $document['house'] = $tds[4]->plaintext;
            $document['language'] = $tds[6]->plaintext;
            $document['url'] = $base . $link[0]->href;
            $document['type'] = $type;
            scraperwiki::save(array('url'), $document);
        }
        // Find the next page to scrape.
        $nextUrl = null;
        foreach ($dom->find("table[style=height:26px] a") as $pagerLink) {
            if ($pagerLink->plaintext == 'Next') {
                $nextUrl = $base . $pagerLink->href;
                break;
            }
        }
        $dom->clear();
        unset($dom);
        $url = $nextUrl;
    } while ($url !== null);
}
コード例 #18
0
/**
 * Walks the first 20 pages (0-19) of the Alexa top-sites listing for the US
 * and stores the text of every `div.description`, keyed on `site`.
 *
 * A description whose text equals " " is replaced by a placeholder built from
 * "none", the page number and the running count. Prints the number of
 * descriptions saved.
 *
 * @return void
 */
function topSites()
{
    $country = 'US';
    $limit = 20;
    $count = 0;

    for ($page = 0; $page < $limit; $page++) {
        $listingHtml = scraperWiki::scrape("http://www.alexa.com/topsites/countries;" . $page . "/" . $country);
        $listing = new simple_html_dom();
        $listing->load($listingHtml);

        foreach ($listing->find("div[class=description]") as $node) {
            $site = ($node->plaintext != " ")
                ? $node->plaintext
                : 'none' . $page . $count;
            scraperwiki::save(array('site'), array('site' => $site));
            $count++;
        }
    }

    print $count;
}
コード例 #19
0
/**
 * Scrapes a GCIS contact list and stores one record per minister, keyed on
 * `minister`.
 *
 * The page is a sequence of two-cell rows (label, value). Values are collected
 * until a "Mail address:" row is seen, which completes and saves the record.
 *
 * @param int|string $id   List id sent as the `id` query parameter.
 * @param string     $type Heading sent as the `heading` query parameter.
 * @return void
 */
function scrape_ministers($id, $type)
{
    $html = scraperWiki::scrape("http://apps.gcis.gov.za/gcis/InternetIncludes/gcis_list.jsp?id={$id}&heading={$type}");
    $dom = new simple_html_dom();
    $dom->load($html);

    // Row label => record field it fills.
    $fieldByLabel = array(
        "Minister:" => 'minister',
        "Portfolio:" => 'portfolio',
        "Tel no:" => 'tel_no',
        "Fax no:" => 'fax_no',
    );
    // Every record starts from blank fields, so a missing row can neither read
    // an undefined variable nor inherit the previous minister's value.
    $blank = array('minister' => '', 'portfolio' => '', 'tel_no' => '', 'fax_no' => '');
    $current = $blank;

    foreach ($dom->find("tr") as $data) {
        $tds = $data->find("td");
        if (count($tds) != 2) {
            continue;
        }
        $label = $tds[0]->plaintext;
        $value = $tds[1]->plaintext;
        if (isset($fieldByLabel[$label])) {
            $current[$fieldByLabel[$label]] = $value;
        } elseif ($label == "Mail address:") {
            // The mail address is the last row of a record: save and reset.
            $record = $current;
            $record['mail address'] = $value;
            scraperwiki::save(array('minister'), $record);
            $current = $blank;
        }
    }
    // Release the parser's internal references.
    $dom->clear();
    unset($dom);
}
コード例 #20
0
/**
 * Parses a school listing page and stores one record per school, enriched with
 * contact details from the school's detail page on trampi.istruzione.it.
 *
 * The listing is the third table of the page and its first child is skipped as
 * the header. Each remaining row's text is split into up to 12 fields; the
 * second field (the MPI code) selects the detail page, from which telephone,
 * fax, e-mail, web site and study tracks are read as "label: value" rows.
 * Records are keyed on (denominazione, codiceMPI).
 *
 * @param string $html Markup of the listing page.
 * @return void
 */
function create_dataset2($html)
{
    $dom = new simple_html_dom();
    $dom->load($html);
    // Check that the table really exists before walking it.
    $table = $dom->find('table', 2);
    if ($table === null) {
        $dom->clear();
        unset($dom);
        return;
    }

    $i = 0;
    foreach ($table->children() as $data) {
        echo "parsing info tabella principale";
        $res = trim($data->plaintext);
        $isHeader = ($i === 0);
        $i = $i + 1;
        if ($isHeader || strlen($res) === 0) {
            continue;
        }

        // Turn encoded apostrophes back into plain ones. The original search
        // string was a bare ''' (a syntax error), apparently an entity literal
        // that had been decoded in place.
        $res = str_replace(array('&#039;', '&#39;', '&apos;'), "'", $res);
        // Split the row into fields. explode() replaces split(), which no
        // longer exists in PHP 7+. Padding to 12 keeps every index below defined.
        // NOTE(review): the delimiter is copied verbatim from the original and
        // may be a non-breaking space rather than a plain one - confirm.
        $array_result = array_pad(explode(' ', $res), 12, '');

        // Keep the MPI code: it selects the detail page.
        $codMPI = trim($array_result[1]);
        $url_MPI = "http://www.trampi.istruzione.it/ricScu/dettaglio.do?cod=" . $codMPI;
        $detailHtml = scraperwiki::scrape($url_MPI);
        $dom_mpi = new simple_html_dom();
        $dom_mpi->load($detailHtml);

        $tel = "";
        $fax = "";
        $email = "";
        $web = "";
        $indS = "";
        foreach ($dom_mpi->find('table[cellspacing=1] tr') as $data_mpi) {
            $values = explode(':', $data_mpi->plaintext . "\n");
            $label = $values[0];
            if (strlen($label) === 0 || !isset($values[1])) {
                // Not a "label: value" row.
                continue;
            }
            if (stripos($label, 'tel') !== false) {
                $tel = trim($values[1]);
            } elseif (stripos($label, 'fax') !== false) {
                $fax = trim($values[1]);
            } elseif (stripos($label, 'e-mail') !== false) {
                $email = trim($values[1]);
            } elseif (stripos($label, 'web') !== false) {
                // A URL contains ":" itself, so re-join everything after the
                // label. The original used each() (removed in PHP 8) with
                // "$key = 2", an assignment that was always true.
                $web = trim(implode(':', array_slice($values, 1)));
            } elseif (stripos($label, 'studio') !== false) {
                $indS = trim(str_replace(array('</td>', '</tr>', "\r", "\t", "\n"), '', $values[1]));
            }
        }
        $dom_mpi->clear();
        unset($dom_mpi);

        $dataset = array(
            'denominazione' => trim(html_entity_decode($array_result[0])),
            'codiceMPI' => $codMPI,
            'tipologia' => trim(html_entity_decode($array_result[2])),
            'tipologiaIIgrado' => trim(html_entity_decode($array_result[3])),
            'descrizione' => trim(html_entity_decode($array_result[4])),
            'indirizzo' => trim(html_entity_decode($array_result[5])),
            'località' => trim(html_entity_decode($array_result[6])),
            'cap' => trim($array_result[7]),
            'comune' => trim(html_entity_decode($array_result[8])),
            'provincia' => trim(html_entity_decode($array_result[9])),
            'regione' => trim(html_entity_decode($array_result[10])),
            'codIstitutoComprensivo' => trim(html_entity_decode($array_result[11])),
            'telefono' => $tel,
            'fax' => $fax,
            'email' => $email,
            'web' => $web,
            'IndirizziStudio' => trim(html_entity_decode($indS)),
        );
        // Only rows with a real name are stored.
        if (strlen($dataset['denominazione']) > 1) {
            scraperwiki::save(array('denominazione', 'codiceMPI'), $dataset);
        }
    }
    // Free the DOM, otherwise the scraper runs out of memory.
    $dom->clear();
    unset($dom);
}
コード例 #21
0
         // NOTE(review): this excerpt starts inside loops/conditionals opened
         // earlier; $json, $html2, $k, $k1, $title and $t_count are defined there.
         // When $json->count is zero, flag it in $t_count and leave the
         // enclosing loop.
         $terminate = $json->count;
         if ($terminate == 0) {
             $t_count = 1;
             break;
         }
         // One result item per matching div.
         $list = $html2->find('div[class=fk-srch-item fk-inf-scroll-item]');
         if (!empty($list)) {
             foreach ($list as $src) {
                 // $k is the running record id.
                 $k = $k + 1;
                 // Link and title come from the first child of the item's <h2>;
                 // the author is the element right after that <h2>.
                 $linki = $src->find('h2', 0);
                 $link = $linki->first_child()->href;
                 $title1 = $linki->first_child()->plaintext;
                 $author = $linki->next_sibling()->plaintext;
                 // Info text is the third child (index 2) of the item's first child.
                 $info = $src->first_child()->children(2)->plaintext;
                 $record = array('id' => $k, 'link' => $link, 'title' => $title1, 'author' => $author, 'info' => $info, 'search_word' => $title);
                 scraperwiki::save(array('id'), $record);
                 // Reset the per-item values to "NA" after saving; each one is
                 // assigned again at the top of the next iteration.
                 $linki = "NA";
                 $link = "NA";
                 $title1 = "NA";
                 $author = "NA";
                 $info = "NA";
             }
         }
         // Release the parser's internal references for this page.
         $html2->clear();
         // closes the "if" opened above this excerpt
     }
     $k1 = $k1 + 1;
     //break;
     // closes the "while" loop opened above this excerpt
 }
 // break;
コード例 #22
0
// Listing pages to crawl. The original queued the first URL twice, which only
// repeated the same crawl (records are keyed on name).
$urls[] = 'http://www.opentable.com/chicago-graduation-party-places';
$urls[] = 'http://www.opentable.com/chicago-wedding-reception-venues';

// For every listing page, follow each row's venue link and store the venue's
// name, address parts and description, keyed on `name`.
foreach ($urls as $url) {
    $html = scraperWiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);
    foreach ($dom->find("tr") as $row) {
        // Kept in its own variable: the original reused $url here and
        // overwrote the outer loop variable.
        $venueLinks = $row->find('a.GDRNmLk');
        if (!isset($venueLinks[0])) {
            // Row without a venue link: nothing to follow.
            continue;
        }
        $venueHtml = scraperWiki::scrape($venueLinks[0]->href);
        $domInner = new simple_html_dom();
        $domInner->load($venueHtml);
        $name = $domInner->find('h1.RestProfileTitle span');
        $description = $domInner->find('#RestaurantProfile_RestProfileGroupDiningTab_lblPrivateDiningContent, #RestaurantProfile_RestaurantProfileInfo_lblDescription');
        $location = $domInner->find('div.RestProfileAddress span');
        if (!isset($name[0]) || !isset($location[0])) {
            // Not a venue profile page (no title or no address block).
            $domInner->clear();
            unset($domInner);
            continue;
        }

        // The address block is "<br/>"-separated: street, optional second
        // line, then "City, ST 12345".
        $parts = explode("<br/>", $location[0]->innertext);
        $addr2 = '';
        $city_state = isset($parts[1]) ? $parts[1] : '';
        if (count($parts) == 4) {
            $addr2 = $parts[1];
            $city_state = $parts[2];
        }
        // Defaults keep the record well-formed when the pattern does not match.
        $arr = array('addr' => '', 'city' => '', 'state' => '', 'zip' => '');
        if (preg_match("/([^,]+),\\s*(\\w+)\\s*(\\d{5}(?:-\\d{4})?)/", $city_state, $matches)) {
            list($arr['addr'], $arr['city'], $arr['state'], $arr['zip']) = $matches;
        }

        // 'Restaruant' (sic) is left as written so new rows match stored ones.
        $record = array(
            'name' => $name[0]->innertext,
            'address1' => $parts[0],
            'address2' => $addr2,
            'city' => $arr['city'],
            'state' => $arr['state'],
            'zip' => $arr['zip'],
            'type' => 'Restaruant',
            'description' => isset($description[0]) ? $description[0]->innertext : null,
        );
        scraperwiki::save(array('name'), $record);
        $domInner->clear();
        unset($domInner);
    }
    // Release the listing page before loading the next one.
    $dom->clear();
    unset($dom);
}
    // NOTE(review): the enclosing function's header is outside this excerpt;
    // $html and $code are assigned there.
    $dom = new simple_html_dom();
    $dom->load($html);
    // Walk the rows of the item table looking for the "Postcode" entry.
    foreach ($dom->find("div.item-content tr") as $row) {
        $headers = $row->find("th");
        $columns = $row->find("td");
        if (preg_match("/Postcode/", $headers[0]->plaintext)) {
            // Print and store the first data cell of that row against the
            // code, then stop: only the first matching row is used.
            echo $code . ": " . $columns[0]->plaintext;
            scraperwiki::save(array("code"), array("code" => $code, "postcode" => $columns[0]->plaintext));
            break;
        }
    }
}
require 'scraperwiki/simple_html_dom.php';
//  The following is a list of Exchange SAUIDs gleaned from SamKnows
$codes = array("SWLAK", "SWLAS", "SWLCA", "SWLDR", "SWLDV", "SWLGC", "SWLHY", "SWLJ", "SWLJV", "SWLJZ", "SWLKB", "SWLKD", "SWLKY", "SWLLD", "SWLLF", "SWLLG", "SWLLM", "SWLLO", "SWLLP", "SWLLR", "SWLLU", "SWLLW", "SWLNI", "SWLNN", "SWLPI", "SWLQW", "SWLY", "SWLYA", "SWLYJ", "SWLYW", "SWMAD", "SWMAL", "SWMDE", "SWMDX", "SWMES", "SWMF", "SWMGR", "SWMGX", "SWMLZ", "SWMMN", "SWMMV", "SWMNF", "SWMT_EX", "SWMU", "SWMWY", "SWMYE", "SWMYG", "SWMYS", "SWMYU", "SWNB", "SWNBI", "SWNDO", "SWNDU", "SWNE_CH", "SWNE_EX", "SWNEN", "SWNES", "SWNM", "SWNNA", "SWNSN", "SWNTD", "SWOAG", "SWPBL", "SWPBM", "SWPDU", "SWPDW", "SWPEC", "SWPEK", "SWPEU", "SWPEV", "SWPHX", "SWPM", "SWPMQ", "SWPN", "SWPND", "SWPOM", "SWPP", "SWPQS", "SWPRU", "SWPTB", "SWPTH", "SWPTM", "SWPTY", "SWPUN", "SWPYH", "SWRAG", "SWRDA", "SWRDX", "SWRHA", "SWRHR", "SWRLS", "SWRRY", "SWRSO", "SWRSV", "SWRTH", "SWRVH", "SWRWI", "SWSAS", "SWSAW", "SWSDV", "SWSFJ", "SWSKJ", "SWSKU", "SWSMX", "SWSNI", "SWSSQ", "SWSVB", "SWSX", "SWSZX", "SWTAF", "SWTAJ", "SWTAT", "SWTB", "SWTDE", "SWTDU", "SWTEK", "SWTEZ", "SWTFA", "SWTFS", "SWTLL", "SWTLU", "SWTR", "SWTRF", "SWTRH", "SWTSA", "SWTUC", "SWUAH", "SWUAZ", "SWUCW", "SWUGI", "SWUGU", "SWUHN", "SWUTK", "SWUWN", "SWVLD", "SWVVW", "SWWCP", "SWWHT", "SWWJK", "SWWXC", "SWXNH", "SWXSX", "SWXTP", "SWXUU", "SWYBL", "SWYDU", "SWYRO", "SWYYN", "THEAR", "THH", "THHC", "THHD", "THHDY", "THHE", "THHF", "THHH", "THHM", "THHN", "THHRJ", "THHS", "THHT", "THHW", "THIN", "THIP", "THKB", "THKC", "THKE", "THLG", "THLL", "THLM", "THLP", "THLSN", "THM", "THMD", "THML", "THMO", "THMS", "THMSD", "THNB", "THNE", "THNL", "THNU", "THOH", "THOK", "THOL", "THOV", "THP", "THPC", "THPM", "THPS", "THRD", "THRG", "THRO", "THS", "THSBN", "THSCR", "THSE", "THSL", "THSL_UD", "THSPD", "THT", "THTAD", "THTF", "THTG", "THTH", "THTI", "THTT", "THTV", "THUB", "THWA", "THWDY", "THWI", "THWK", "THWL", "THWM", "THWN", "THWO", "THWP", "THWR", "THWT", "THWTH", "THWY", "THY", "WEWHAM", "WEWLOR", "WEWMAI", "WEWMAR", "WEWMAY", 
"WEWNPN", "WEWPAD", "WEWPRI", "WEWSOH", "WMCIT", "WMHAM", "WMHAN", "WMHAR", "WMHAS", "WMHAV", "WMHCH", "WMHIL", "WMHIM", "WMHOL", "WMHSW", "WMHX", "WMINK", "WMIPN", "WMIPS", "WMKD", "WMKDG", "WMKEM", "WMKLT", "WMKNI", "WMLEE", "WMLEI", "WMLIT", "WMLON", "WMLOW", "WMMAD", "WMMAL", "WMMFD", "WMMIC", "WMMTL", "WMNAN", "WMNEW", "WMOAK", "WMOMB", "WMONE", "WMPAX", "WMPEO", "WMPER", "WMPKR", "WMPOW", "WMRCR", "WMRID", "WMRJ", "WMROK", "WMROM", "WMRUD", "WMRUG", "WMSAN", "WMSBH", "WMSEI", "WMSEV", "WMSHB", "WMSMA", "WMSPA", "WMSPE", "WMSRK", "WMSTA", "WMSTD", "WMSTJ", "WMSTK", "WMSTO", "WMSTP", "WMSTU", "WMSUC", "WMSWY", "WMTEA", "WMTRE", "WMUPS", "WMUSN", "WMUTT", "WMUUS", "WMWAR", "WMWAT", "WMWES", "WMWET", "WMWHE", "WMWHS", "WMWIC", "WMWLN", "WMWLY", "WMWOO", "WMWR", "WMWRK", "WMWTM", "WMWYB", "WMWYC", "WMYAR", "WNHAE", "WNHAM", "WNHAN", "WNHAR", "WNHAT", "WNHAW", "WNHAY", "WNHCP", "WNHER", "WNHL", "WNHLN", "WNHMR", "WNHOD", "WNHOL", "WNHR", "WNHUN", "WNHUX", "WNHW", "WNIB", "WNIV", "WNKEL", "WNKER", "WNKIN", "WNKNG", "WNKNI", "WNKNO", "WNKT", "WNKYR", "WNLAD", "WNLAN", "WNLAR", "WNLBD", "WNLBG", "WNLBH", "WNLBR", "WNLBW", "WNLC", "WNLDA", "WNLDC", "WNLDD", "WNLDF", "WNLDG", "WNLDO", "WNLEA", "WNLED", "WNLEI", "WNLEO", "WNLEY", "WNLFF", "WNLFN", "WNLFS", "WNLFU", "WNLGD", "WNLGF", "WNLGG", "WNLGL", "WNLGN", "WNLGO", "WNLGW", "WNLGY", "WNLIN", "WNLIT", "WNLMD", "WNLMR", "WNLMY", "WNLN", "WNLNF", "WNLNO", "WNLNS", "WNLNY", "WNLON", "WNLR", "WNLRD", "WNLSF", "WNLSN", "WNLST", "WNLTH", "WNLTN", "WNLU", "WNLUD", "WNLVL", "WNLW", "WNLWA", "WNLWN", "WNLWW", "WNLYD", "WNLYI", "WNLYO", "WNM", "WNMAC", "WNMAN", "WNMAP", "WNMAR", "WNMB", "WNMD", "WNMDL", "WNMEI", "WNMFB", "WNMIC", "WNMM", "WNMOC", "WNMOE", "WNMON", "WNMOS", "WNMSB", "WNMSL", "WNMT", "WNMUC", "WNMUN", "WNMW", "WNNAN", "WNNBG", "WNNBR", "WNNCL", "WNNEB", "WNNEF", "WNNN", "WNNOR", "WNNOW", "WNNP", "WNNR", "WNNTP", "WNOAK", "WNOC", "WNOOD", "WNOSW", "WNPAI", "WNPAN", "WNPBK", "WNPCH", "WNPCO", "WNPDD", "WNPEB", 
"WNPEF", "WNPEG", "WNPEM", "WNPEN", "WNPG", "WNPIP", "WNPMN", "WNPNL", "WNPNN", "WNPNR", "WNPON", "WNPOR", "WNPRD", "WNPRE", "WNPRG", "WNPRL", "WNPRS", "WNPTD", "WNPTW", "WNPWL", "WNQH", "WNRAY", "WNRC", "WNRE", "WNRHD", "WNRHU", "WNRIW", "WNRM", "WNRNR", "WNROS", "WNROW", "WNRST", "WNRUA", "WNRUT", "WNRWX", "WNRYT", "WNSA", "WNSAM", "WNSAU", "WNSEA", "WNSEI", "WNSHA", "WNSHI", "WNSSM", "WNSSN", "WNSTE", "WNSTI", "WNSW", "WNSY", "WNTAL", "WNTAR", "WNTFG", "WNTH", "WNTHL", "WNTHR", "WNTIL", "WNTRA", "WNTRB", "WNTRG", "WNTRN", "WNTRR", "WNTRU", "WNTRW", "WNTRY", "WNTUD", "WNTV", "WNTW", "WNTYG", "WNTYN", "WNTYW", "WNUB", "WNUM", "WNUP", "WNVAL", "WNWA", "WNWCH", "WNWEL", "WNWEM", "WNWEO", "WNWET", "WNWIG", "WNWIT", "WNWOM", "WNWOR", "WNWPL", "WNWTN", "WNWUL", "WNWX", "WNWXL", "WNWXN", "WNYA", "WNYO", "WRKGDN", "WRNELMS", "WRPGRN", "WRPIM", "WRSKEN", "WRSLO", "WRSTHBK", "WRVAUX", "WRWHI", "WRWKEN", "WRWMIN", "WSHAM", "WSHAU", "WSHEL", "WSHOL", "WSIBR", "WSINN", "WSINS", "WSINV", "WSIRS", "WSIRV", "WSJOB", "WSJOH", "WSJOP", "WSJUR", "WSKBN", "WSKET", "WSKGE", "WSKIA", "WSKIB", "WSKIC", "WSKID", "WSKIE", "WSKIF", "WSKIG", "WSKII", "WSKIK", "WSKIL", "WSKIM", "WSKIN", "WSKIO", "WSKIP", "WSKIR", "WSKIU", "WSKIW", "WSKIY", "WSKKC", "WSKKD", "WSKKE", "WSKKF", "WSKKL", "WSKKN", "WSKKO", "WSKKR", "WSKKT", "WSKKZ", "WSKLM", "WSKLN", "WSKRK", "WSLAA", "WSLAB", "WSLAH", "WSLAK", "WSLAL", "WSLAM", "WSLAN", "WSLAR", "WSLAU", "WSLEA", "WSLED", "WSLEN", "WSLES", "WSLEW", "WSLEX", "WSLID", "WSLIS", "WSLOA", "WSLOC", "WSLOD", "WSLOE", "WSLOG", "WSLOH", "WSLOI", "WSLON", "WSLOS", "WSLOT", "WSLUI", "WSLUS", "WSMAB", "WSMAC", "WSMAH", "WSMAR", "WSMAU", "WSMAY", "WSMER", "WSMIL", "WSMIN", "WSMIT", "WSMOC", "WSMOD", "WSMOF", "WSMON", "WSMOS", "WSMOT", "WSMOU", "WSMUI", "WSNEA", "WSNEB", "WSNEC", "WSNEG", "WSNEL", "WSNES", "WSNEW", "WSOBA", "WSOCH", "WSOLD", "WSORM", "WSPAI", "WSPAL", "WSPAN", "WSPAR", "WSPAT", "WSPEN", "WSPIN", "WSPIR", "WSPOA", "WSPOC", "WSPOE", "WSPOL", "WSPOP", "WSPOR", 
"WSPOS", "WSPOW", "WSPRE", "WSPRO", "WSPTH", "WSPTN", "WSREN", "WSRHU", "WSRIN", "WSROC", "WSROT", "WSRUT", "WSSAL", "WSSAN", "WSSAQ", "WSSCA", "WSSCO", "WSSHE", "WSSHI", "WSSKI", "WSSKL", "WSSLI", "WSSOE", "WSSOK", "WSSOR", "WSSPR", "WSSTD", "WSSTE", "WSSTN", "WSSTO", "WSSTR", "WSSTT", "WSSTU", "WSSTW", "WSSYM", "WSTAB", "WSTAH", "WSTAR", "WSTAT", "WSTAY", "WSTHL", "WSTHO", "WSTIG", "WSTIN", "WSTIR", "WSTOB", "WSTOD", "WSTOR", "WSTOW", "WSTRO", "WSTUR", "WSTWE", "WSTWY", "WSTYN", "WSUDD", "WSULV", "WSUPL", "WSWAT", "WSWEK", "WSWEM", "WSWES", "WSWHB", "WSWHH", "WSWHI", "WSWIG", "WSWIS", "WWHARB", "WWHART", "WWHATH", "WWHAWK", "WWHAYL", "WWHBCK", "WWHBCM", "WWHCRX", "WWHELE", "WWHELS", "WWHEMY", "WWHENL", "WWHOLB", "WWHOLF", "WWHOLN", "WWHOLS", "WWHONI", "WWHTOR", "WWILCH", "WWILFR", "WWILMI", "WWINST", "WWIPPL", "WWISLE", "WWIVYB", "WWKENN", "WWKENT", "WWKGWR", "WWKILK", "WWKKWL", "WWKNGB", "WWKSTM", "WWLAND", "WWLANR", "WWLAPF", "WWLAUN", "WWLDOW", "WWLEED", "WWLIFT", "WWLISK", "WWLLAW", "WWLODD", "WWLOOE", "WWLOST", "WWLPRT", "WWLSTL", "WWLSUT", "WWLTRE", "WWLUPP", "WWLVET", "WWLWDN", "WWLYDF", "WWLYME", "WWLYNT", "WWMABT", "WWMARA", "WWMARK", "WWMART", "WWMAWG", "WWMBSH", "WWMCAN", "WWMDAM", "WWMEVA", "WWMILV", "WWMINE", "WWMITC", "WWMLBK", "WWMMAG", "WWMODY", "WWMORT", "WWMORW", "WWMOUS", "WWMPRT", "WWMSMT", "WWMTON", "WWMTVY", "WWMULL", "WWNABB", "WWNANP", "WWNCAD", "WWNCUR", "WWNCYR", "WWNETH", "WWNEWQ", "WWNFER", "WWNMOL", "WWNPTN", "WWNPWI", "WWNTAM", "WWNTAW", "WWNTCY", "WWOAKF", "WWOKEH", "WWOSMY", "WWOSTN", "WWPADS", "WWPAIG", "WWPAR", "WWPCMB", "WWPENZ", "WWPERR", "WWPINH", "WWPIPE", "WWPISA", "WWPLRN", "WWPOLP", "WWPORL", "WWPOST", "WWPOUN", "WWPRAZ", "WWPREA", "WWPRIN", "WWPRYN", "WWPSCO", "WWPSTK", "WWPTON", "WWPTRE", "WWPTWN", "WWPURI", "WWPYTH", "WWRACK", "WWREDR", "WWRILL", "WWROBO", "WWROCH", "WWRUMF", "WWSAGN", "WWSALC", "WWSALT", "WWSAMP", "WWSAUS", "WWSBNT", "WWSBUD", "WWSBUR", "WWSCAN", "WWSCHD", "WWSCIL", "WWSCLM", "WWSCOL", "WWSDAY", 
"WWSDOM", "WWSEAT", "WWSENN", "WWSFLM", "WWSGAB", "WWSGEN", "WWSGER", "WWSHAL", "WWSHAU", "WWSHEB", "WWSHER", "WWSHIP", "WWSHIR", "WWSIDB", "WWSIDM", "WWSILV", "WWSIVE", "WWSJUS", "WWSKEV", "WWSMAB", "WWSMAR", "WWSMER", "WWSMOL", "WWSMWG", "WWSMWS", "WWSOME", "WWSOWT", "WWSPAX", "WWSPET", "WWSTAL", "WWSTAR", "WWSTAV", "WWSTEN", "WWSTIC", "WWSTIT", "WWSTOC", "WWSTOG", "WWSTUD", "WWSUTT", "WWSWIM", "WWTAUN", "WWTAVI", "WWTEDB", "WWTEIG", "WWTEMP", "WWTHRE", "WWTIMB", "WWTINT", "WWTIVE", "WWTLIZ", "WWTOPS", "WWTORQ", "WWTORR", "WWTORX", "WWTOTN", "WWTPNT", "WWTREB", "WWTREG", "WWTRES", "WWTRUR", "WWUPOT", "WWVERY", "WWWADE", "WWWASH", "WWWBAY", "WWWCKR", "WWWDGT", "WWWDWN", "WWWEEK", "WWWELL", "WWWEMB", "WWWFRD", "WWWHEA", "WWWHIM", "WWWILL", "WWWILM", "WWWINC", "WWWITH", "WWWIVE", "WWWKLH", "WWWMON", "WWWMOR", "WWWOOD", "WWWOOL", "WWWSHM", "WWWZOY", "WWYEAL", "WWYELV", "WWYEOV", "WWYETM", "WWZELA");
// Fetch the SamKnows page of every exchange code and store the value cell of
// the first details row whose header mentions "Postcode", keyed by the code.
foreach ($codes as $code) {
    echo "Loading " . $code . "...";
    $html = scraperWiki::scrape("http://www.samknows.com/broadband/exchange/" . $code);
    echo "Loaded";
    $dom = new simple_html_dom();
    $dom->load($html);
    foreach ($dom->find("div.item-content tr") as $row) {
        $headers = $row->find("th");
        $columns = $row->find("td");
        // A row needs both a header and a value cell to hold the postcode;
        // skipping the others avoids dereferencing a missing element.
        if (empty($headers) || empty($columns)) {
            continue;
        }
        if (preg_match("/Postcode/", $headers[0]->plaintext)) {
            echo $code . ": " . $columns[0]->plaintext;
            scraperwiki::save(array("code"), array("code" => $code, "postcode" => $columns[0]->plaintext));
            break;
        }
    }
    // Release the parsed tree before the next page; the code list is long and
    // the parser keeps circular references that are not freed on reassignment.
    $dom->clear();
    unset($dom);
}
        $b += $inclusive ? 0 : strlen($start);
        $e = empty($stop) ? strlen($src) : strpos(strtolower($src), strtolower($stop), $b);
        $e += $inclusive ? strlen($stop) : 0;
        $e = $e > strlen($src) ? strlen($src) : $e;
        if ($e > $b) {
            return trim(substr($src, $b, $e - $b));
        }
    }
}
// Scrape the "Officially assigned code elements" section of the Wikipedia
// ISO 3166-1 alpha-3 article and save every two-cell table row as
// code => label, keyed by code.
$URL = 'http://en.wikipedia.org/wiki/ISO_3166-1_alpha-3';
$pageHtml = scraperWiki::scrape($URL);
if (!empty($pageHtml)) {
    // Narrow the page to the section, then to everything from its first table on.
    $sectionHtml = partof($pageHtml, 'Officially assigned code elements</span></h3>', '<h3>');
    $sectionHtml = partof($sectionHtml, '<table', null, true);
    foreach (explode('</table>', $sectionHtml) as $tableHtml) {
        foreach (explode('</tr>', $tableHtml) as $rowHtml) {
            if (!preg_match_all('/<td(.*?)>(.*?)<\\/td>/iu', $rowHtml, $cells)) {
                continue;
            }
            // Only rows made of exactly two cells (code, label) are data rows.
            if (count($cells[2]) != 2) {
                continue;
            }
            list($codeCell, $labelCell) = $cells[2];
            $entryRow = array(
                'code' => trim(strip_tags($codeCell)),
                'label' => trim(strip_tags($labelCell)),
            );
            scraperwiki::save(array('code'), $entryRow);
        }
    }
}
// Save every numeric <option> of the #fachlicheZuordnung drop-down as a
// discipline record (id => option value, title => option text), keyed by id.
$dropDownList = $dom->find("#fachlicheZuordnung", 0);
// Upper bound on the number of records stored in one run.
$max = 200000;
if (!$dropDownList) {
    // Without the guard a page lacking the drop-down would end the script
    // with a fatal error on ->children().
    print "Drop-down #fachlicheZuordnung not found; nothing saved.\n";
} else {
    foreach ($dropDownList->children() as $option) {
        if ($option->tag != 'option') {
            continue;
        }
        // Only options whose value is purely digits carry a discipline id.
        if (!preg_match('/^\\d+$/', $option->value)) {
            continue;
        }
        if ($max-- < 1) {
            break;
        }
        $discipline = array();
        $discipline['id'] = (int) $option->value;
        $discipline['title'] = $option->plaintext;
        scraperwiki::save(array('id'), $discipline);
    }
}
function load_html($url, $parameters)
{
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_POST, count(explode('&', $parameters)));
    curl_setopt($ch, CURLOPT_POSTFIELDS, 'task=copyRequestParametersToSession&' . $parameters);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_HEADER, 1);
    $result = curl_exec($ch);
    curl_close($ch);
    preg_match_all('|Set-Cookie: (.*?);|U', $result, $m);
    $cookies = implode(';', $m[1]);
    echo $cookies . "\n";
    $ch = curl_init();
コード例 #26
0
        scraperwiki::save(array('Link'), array('Link' => $name));
    }
}
require 'scraperwiki/simple_html_dom.php';
//MUSEUM
/*

//museum
for($i=764; $i<=49; $i++){
    print $i."\n";
 $html = scraperwiki::scrape("http://www.mamilade.de/kinder/2006700-4---1317074400-$i-1324941496.html");

    # Use the PHP Simple HTML DOM Parser to extract <td> tags
    $dom = new simple_html_dom();
    $dom->load($html);
*/
// Gastronomy listings: visit result pages 1..765 and store the href of every
// headline link, keyed by the link itself.
$i = 1;
while ($i <= 765) {
    print "{$i}\n";
    $html = scraperwiki::scrape("http://www.mamilade.de/gastronomie/2024700-4---1317074400-{$i}-1324976513.html");
    $dom = new simple_html_dom();
    $dom->load($html);
    foreach ($dom->find('a.headline400') as $anchor) {
        $name = $anchor->href;
        scraperwiki::save(array('Link'), array('Link' => $name));
    }
    $i++;
}
コード例 #27
0
// For each page: take the term from the last "h2 span.azpisarrera" heading
// (falling back to the last "h1 span"), join every "dt.ordaina strong" text
// with "|" as the definition, and save the pair keyed by the definition.
foreach ($pages_to_scrape as $page) {
    $sections_dom = new simple_html_dom();
    $sections_dom->load(scraperwiki::scrape($base_url . $page));

    $datah2X = '';
    foreach ($sections_dom->find('h2 span.azpisarrera') as $heading) {
        $datah2X = utf8_encode($heading->plaintext);
        print "h2: " . $datah2X . "\n";
    }
    if ($datah2X === '') {
        // No usable h2 heading on this page: use the h1 span instead.
        foreach ($sections_dom->find('h1 span') as $heading) {
            $datah2X = utf8_encode($heading->plaintext);
            print "h1: " . $datah2X . "\n";
        }
    }

    // Collect the pieces first, then join them; this yields the same string
    // as appending "|" after every element except the last.
    $pieces = array();
    foreach ($sections_dom->find('dt.ordaina strong') as $node) {
        $pieces[] = utf8_encode($node->plaintext);
    }
    $alldata = implode("|", $pieces);
    print "data: " . $alldata . "\n----------------\n";

    $entry['Term'] = $datah2X;
    $entry['Definition'] = $alldata;
    scraperwiki::save(array('Definition'), $entry);
}
コード例 #28
0
<?php

// Scrape HUD's New York family-shelter page: every paragraph of the content
// cell is split on <br> into shelter / address / city / phone and saved,
// keyed by shelter.
//
// The script body used to appear twice verbatim. The second copy re-ran a
// plain require of the parser file, which PHP rejects once the file's
// functions and classes are already declared, so the scrape now runs once
// and the parser is loaded with require_once.
require_once 'scraperwiki/simple_html_dom.php';
$html = scraperWiki::scrape("http://www.hud.gov/local/ny/homeless/familiesshelters.cfm");
//print $html . "\n";
$dom = new simple_html_dom();
$dom->load($html);
foreach ($dom->find("td#content-area p") as $data) {
    // The node is split via its string form, as before.
    // NOTE(review): that form presumably includes the surrounding <p> markup - confirm.
    // Padding to four entries keeps short paragraphs from reading missing offsets.
    $rows = array_pad(explode("<br>", (string) $data), 4, null);
    $record = array('shelter' => $rows[0], 'address' => $rows[1], 'city' => $rows[2], 'phone' => $rows[3]);
    print_r($record);
    scraperwiki::save(array('shelter'), $record);
}
コード例 #29
0
 $gg_url = 'http://www.google.com.au/search?&num=100&tbm=plcs&hl=en&q=' . urlencode($query) . '&start=';
*/
// Page through Google places results from $start up to (not including)
// $npages, extract address / suburb / phone / url / name from each hit with
// one regular expression, and save the records of every page.
// NOTE(review): $i starts at 1 and grows once per page, yet the final
// message labels it "Number of results" - confirm which count is wanted.
$i = 1;
$size = 0;
$options = array(
    CURLOPT_RETURNTRANSFER => true,
    CURLOPT_HEADER => false,
    CURLOPT_FOLLOWLOCATION => true,
    CURLOPT_ENCODING => "",
    CURLOPT_AUTOREFERER => true,
    CURLOPT_CONNECTTIMEOUT => 120,
    CURLOPT_TIMEOUT => 120,
    CURLOPT_MAXREDIRS => 10,
    CURLOPT_COOKIEFILE => "cookie.txt",
    CURLOPT_COOKIEJAR => "cookie.txt",
    CURLOPT_USERAGENT => "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.3) Gecko/2008092417 Firefox/3.0.3",
    CURLOPT_REFERER => "http://www.google.com/",
);
for ($page = $start; $page < $npages; $page++) {
    // The result offset is the page number with a trailing zero appended.
    $ch = curl_init($gg_url . $page . '0');
    curl_setopt_array($ch, $options);
    $scraped = "";
    // A failed transfer yields false, which concatenates as an empty string.
    $scraped .= curl_exec($ch);
    curl_close($ch);
    $results = array();
    $save = array();
    $keys = array('name', 'address', 'suburb', 'phone', 'url');
    preg_match_all('@line-height:1.24" valign="top">([^"]+)<br>([^"]+)<br>.<nobr>([^"]+)</nobr></table>.*<h3\\s*class="r">\\s*<a[^<>]*href="([^<>]*)"[^<>]*>(.*)</a>\\s*</h3>@siU', $scraped, $results);
    $address = $results[1];
    $suburb = $results[2];
    $phone = $results[3];
    $url = $results[4];
    $name = $results[5];
    for ($zf = 0; $zf < count($results[0]); $zf++) {
        $save[] = array('name' => $name[$zf], 'address' => $address[$zf], 'suburb' => $suburb[$zf], 'phone' => $phone[$zf], 'url' => $url[$zf]);
        //echo $address[$zf]." : ".$suburb[$zf]." : ".$phone[$zf]." : ".$url[$zf]." : ".$name[$zf]." \n";
    }
    // Save the page's records once. Saving inside the loop re-submitted the
    // whole growing list for every hit and ended in the same stored rows.
    if (!empty($save)) {
        scraperwiki::save($keys, $save);
    }
    $size += strlen($scraped);
    $i++;
}
//fclose($fp);
echo "Number of results: {$i} Total KB read: " . $size / 1024.0;
print "Done.";
コード例 #30
0
        $link_number += 1;
        print "Brace yourselfs...\n";
    }
    print "Foreach-loop made it through the end. Wow.\n\nBrace yourselfs...\n";
}
// Summarise the crawl: report first/last URL and counts, de-duplicate the
// collected links, pick out those containing $victimstrace, dump both lists
// and store the unique links.
print "For-loop made it through the end. Phew.\n";
print "\n";
print "First URL was \"" . $allthelinks[0] . "\"\n";
print "Last URL was \"" . $attribute_href . "\"\n";
print $link_number . " URLs found initially.\n";
$allthelinks = array_unique($allthelinks);
// Throwing out those double entries
$allthelinks = array_values($allthelinks);
// Making new index
$link_number = count($allthelinks);
// Counting what's left
print $link_number . " unique URLs found.\n";
$countdat = 0;
// Start from an empty list so the dump below is defined even without matches.
$alltheuserlinks = array();
foreach ($allthelinks as $datlink) {
    if (stripos($datlink, $victimstrace) !== FALSE) {
        // Append the match; assigning a fresh two-element array here kept
        // only the last matching link.
        $alltheuserlinks[$countdat] = $datlink;
        $countdat += 1;
    }
}
print $countdat . " URLs found belong to " . $thevictim . ".\n\n";
print "Here it goes... nggghhh\n";
var_dump($alltheuserlinks);
// And now taking a dump in your face
//print "scraperwiki::save says: ";
var_dump($allthelinks);
// var_dump() only prints and returns nothing, so feeding its result to save()
// stored no data. Each unique link is now saved as its own record.
// NOTE(review): the 'url' column name is a reviewer's choice - confirm the
// desired schema.
foreach ($allthelinks as $datlink) {
    scraperwiki::save(array('url'), array('url' => $datlink));
}