function parseModelsPage($brandId, $brandName, $page) {
    // Scrape one GSMArena model-listing page for a brand, saving each model,
    // then follow the "Next page" link recursively.
    $html_content = scraperwiki::scrape($page);
    $this->html = str_get_html($html_content);

    // Each anchor under div.makers is one phone model.
    foreach ($this->html->find("div.makers a") as $anchor) {
        $thumb = $anchor->find('img', 0);
        $hrefParts = explode('-', $anchor->href);

        $model = array();
        $model['name'] = $brandName . ' ' . $anchor->find('strong', 0)->innertext;
        $model['img'] = $thumb->src;
        $model['link'] = 'http://www.gsmarena.com/' . $anchor->href;
        $model['desc'] = $thumb->title;
        // href looks like "<slug>-<id>.php"; drop the trailing ".php" to get the id.
        $model['id'] = (int) substr($hrefParts[1], 0, -4);
        $model['brand_id'] = $brandId;

        scraperwiki::save_sqlite(array("id" => $model['id']), $model, "cell_model");
        $this->models++;
    }

    // Recurse into the next page of results, if the pager offers one.
    $pagination = $this->html->find("div.nav-pages", 0);
    if ($pagination) {
        $next = $pagination->lastChild();
        if ($next && $next->title == "Next page") {
            $this->parseModelsPage($brandId, $brandName, 'http://www.gsmarena.com/' . $next->href);
        }
    }

    // Explicit destruct works around simple_html_dom's circular-reference leak.
    $this->html->__destruct();
}
function scrapeTEDRSS($url, $sector) {
    // Download the TED RSS feed for one sector and save one record per <item>.
    print $url . " " . $sector . "\n";
    // Manual cURL fetch (instead of scraperWiki::scrape) so redirects,
    // SSL verification and the timeout can be controlled explicitly.
    $curl = curl_init($url);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($curl, CURLOPT_MAXREDIRS, 10);
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($curl, CURLOPT_TIMEOUT, 20); // abort after 20 seconds
    $xml = curl_exec($curl);
    print curl_error($curl) . "\n";
    // BUG FIX: the cURL handle was never released; close it once the
    // body and error message have been read.
    curl_close($curl);
    $dom = new simple_html_dom();
    $dom->load($xml);
    $items = $dom->find("item");
    foreach ($items as $item) {
        $guid = $item->find("guid");
        // The GUID links to the TEXT view; the DATA view is what we record.
        $noticeURL = str_replace("TEXT", "DATA", $guid[0]->plaintext);
        print $noticeURL . " " . $sector . " " . memory_get_usage() / 1000000 . "MB";
        echo "\n";
        $record = array('time' => microtime(true), 'sector' => $sector, 'url' => $noticeURL);
        scraperwiki::save(array('sector', 'url'), $record);
        sleep(1); // be polite to the server
    }
    // Explicit teardown works around simple_html_dom memory leaks.
    $dom->__destruct();
    unset($items);
    unset($dom);
    unset($xml);
    print memory_get_usage() / 1024 / 1024 . "MB\n";
}
function saveIt($txt) {
    // Persist one joke, keyed by a monotonically increasing global counter.
    global $joke_count;
    $joke_count++;
    scraperwiki::save(
        array('JOKE_ID'),
        array('JOKE_ID' => $joke_count, 'JOKE_TEXT' => $txt)
    );
}
function scrapeMarketGroup($url) {
    // Recursively scrape a goonmetrics.com market-group page: first descend
    // into every sub-group link not yet visited (tracked in $visitedIds),
    // then parse the item rows on this page into the eve_goonmetrics table.
    global $visitedIds;
    $html = scraperWiki::scrape($url);
    // Collapse newlines so the row regexes below can match across lines.
    $html = str_replace("\n", "", $html);
    // Sub-group links look like /importing/61000746/marketgroup/<id>/ .
    preg_match_all("|<a href=\"/importing/61000746/marketgroup/(\\d+?)/\">(.+?)</a>|s", $html, $matches, PREG_SET_ORDER);
    foreach ($matches as $match) {
        $groupId = $match[1];
        $groupName = html_entity_decode($match[2]);
        //echo $groupName."\n";
        // Skip groups already visited (the page tree contains repeats).
        if (!in_array($groupId, $visitedIds)) {
            $visitedIds[] = $groupId;
            scrapeMarketGroup("http://goonmetrics.com/importing/61000746/marketgroup/" . $groupId . "/");
        }
    }
    // One regex per item <tr>; capture groups consumed below:
    //   4 = item type id, 7 = item name, 11 = weekly volume, 17 = stock level.
    preg_match_all("|<tr(.*?)>(.*?)<td(.*?)><a href=\"http://games.chruker.dk/eve_online/item.php\\?type_id=(.+?)\" target=\"_blank\">(.*?)<span class=\"dot\" onclick=\"CCPEVE.showMarketDetails\\((.*?)\\)\">(.+?)</span>(.*?)</td>(.*?)<td(.*?)>(.+?)</td>(.*?)<td(.*?)>(.*?)</td>(.*?)<td(.*?)>(.+?)</td>(.*?)<td(.*?)>(.*?)</td>(.*?)<td(.*?)>(.*?)</td>(.*?)<td(.*?)>(.*?)</td>(.*?)<td(.*?)>(.*?)</td>(.*?)<td(.*?)>(.*?)</td>(.*?)</tr>|s", $html, $matches, PREG_SET_ORDER);
    foreach ($matches as $match) {
        // Re-encode captured text as UTF-8 only when it is not already valid.
        // NOTE(review): utf8_encode() assumes ISO-8859-1 input and is
        // deprecated as of PHP 8.2 — consider mb_convert_encoding().
        $item = array("itemId" => trim($match[4]), "name" => trim(mb_check_encoding($match[7], 'UTF-8') ? $match[7] : utf8_encode($match[7])), "weekVol" => trim(mb_check_encoding($match[11], 'UTF-8') ? $match[11] : utf8_encode($match[11])), "k6Stock" => trim(mb_check_encoding($match[17], 'UTF-8') ? $match[17] : utf8_encode($match[17])));
        // Strip thousands separators before saving.
        $item['weekVol'] = str_replace(",", "", $item['weekVol']);
        $item['k6Stock'] = str_replace(",", "", $item['k6Stock']);
        // Retry failed saves every 10s, up to 600 attempts (~100 minutes).
        // NOTE(review): the @ operator suppresses warnings from the save; the
        // catch only fires on thrown exceptions, so suppressed errors are lost.
        $saved = false;
        $delay = 0;
        while (!$saved && $delay < 600) {
            try {
                @scraperwiki::save_sqlite(array('itemId'), $item, 'eve_goonmetrics');
                $saved = true;
            } catch (Exception $e) {
                sleep(10);
                $delay++;
            }
        }
    }
}
function do_day($rec) {
    // Fetch the day page given by $rec['url'] and print the text lines found
    // in the parent of the <a name=discs> anchor.
    $html = scraperwiki::scrape($rec['url']);
    $dom = new simple_html_dom();
    $dom->load($html);
    $cell = $dom->find('a[name=discs]');
    $lines = $cell[0]->parent->find('text');
    print $lines[10] . "\n";
    print count($lines) . "\n";
    // Iterate by index rather than foreach: null entries would end a foreach early.
    $n = 0;
    for ($line_no = 0; $line_no < count($lines); $line_no++) {
        $line = $lines[$line_no];
        // The DOM object crashes on 3-character rows, so skip them.
        if (strlen($line) == 3) {
            continue;
        }
        print $line_no . " " . strlen($line) . "\n";
        $n = $n + 1;
        print $line . "\n";
    }
}
function scrapPage($page) {
    // POST the GEIPAN case-search form for the given result page number and
    // save every case row found in the returned table.
    print "Scraping page " . $page;
    $url = "http://www.geipan.fr/index.php?id=202";
    $fields_string = "&no_cache=1&" . "tx_geipansearch_pi1%5Bsubmit_form%5D=1&" . "tx_geipansearch_pi1%5Btexte_resume%5D=&" . "tx_geipansearch_pi1%5Bdate_debut%5D=&" . "tx_geipansearch_pi1%5Bdate_fin%5D=&" . "no_cache=1&" . "tx_geipansearch_pi1%5Bclasse_cas%5D=tous&" . "tx_geipansearch_pi1%5Bregion%5D=&" . "page=" . $page . "&" . "order_by=&" . "sens=";
    $curl = curl_init($url);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($curl, CURLOPT_MAXREDIRS, 10);
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($curl, CURLOPT_TIMEOUT, 20);
    // BUG FIX: CURLOPT_POST expects a boolean; it was being set to 11.
    curl_setopt($curl, CURLOPT_POST, true);
    curl_setopt($curl, CURLOPT_POSTFIELDS, $fields_string);
    $html = curl_exec($curl);
    print curl_error($curl) . "\n";
    // BUG FIX: release the cURL handle (it was leaked on every call).
    curl_close($curl);
    $dom = new simple_html_dom();
    $dom->load($html);
    $trs = $dom->find("tr");
    foreach ($trs as $tr) {
        // Case rows carry an onclick containing "cas=<13-char-ID>".
        if (isset($tr->attr['onclick'])) {
            $ID = substr($tr->attr['onclick'], strpos($tr->attr['onclick'], "cas=") + 4, 13);
            print $ID . "\n";
            $tds = $tr->find("td");
            $title = utf8_encode($tds[0]->plaintext);
            $date = $tds[1]->plaintext;
            $departement = utf8_encode($tds[2]->plaintext);
            $classe = $tds[3]->plaintext;
            $maj = $tds[4]->plaintext;
            // City is the title text up to (but excluding) the opening parenthesis.
            $city = substr($title, 0, strpos($title, "(") - 1);
            $record = array('ID' => $ID, 'title' => $title, 'date' => $date, 'departement' => $departement, 'classe' => $classe, 'maj' => $maj, 'city' => $city);
            scraperwiki::save(array('ID', 'maj'), $record);
        }
    }
}
function scrape_page() {
    // Fetch one result page of asuntojen.hintatiedot.fi using the global
    // search parameters, save every data row, and recurse while pages are full.
    $rowsOnPage = 0;
    $html = scraperWiki::scrape("http://asuntojen.hintatiedot.fi/haku/?c=" . $GLOBALS['c'] . "&s=" . $GLOBALS['s'] . "&r=" . $GLOBALS['r'] . "&amin=" . $GLOBALS['amin'] . "&amax=" . $GLOBALS['amax'] . "&z=" . $GLOBALS['z']);
    $dom = new simple_html_dom();
    $dom->load($html);

    foreach ($dom->find("tr") as $tr) {
        $tds = $tr->find("td");
        // Data rows have more than 8 columns; header/filler rows do not.
        if (count($tds) > 8) {
            $rowsOnPage++;
            $GLOBALS['rowTotal']++;
            $apt = array(
                "Uniikkiavain" => $GLOBALS['rowTotal'],
                "Kaupunginosa" => $tds[0]->plaintext,
                "Myyntihinta" => $tds[3]->plaintext,
                "Neliohinta" => $tds[4]->plaintext,
                "Tyyppi" => $tds[1]->plaintext,
                "Koko" => $tds[2]->plaintext,
            );
            scraperwiki::save_sqlite(null, $apt, $table_name = $GLOBALS['c'] . " " . $GLOBALS['time']);
            print $GLOBALS['rowTotal'] . "\n";
            print $rowsOnPage . ". Sijainti: " . $tds[0]->plaintext . " Hinta: " . $tds[3]->plaintext . " Tyyppi: " . $tds[1]->plaintext . " Koko: " . $tds[2]->plaintext . " Neliöhinta: " . $tds[4]->plaintext . "€" . "\n";
        }
    }

    // A full page holds exactly 50 rows; if full, advance the page counter
    // and recurse, otherwise print the final totals.
    if ($rowsOnPage == 50) {
        print "Vielä jatkuu, haetaan seuraava sivu..." . "\n";
        $GLOBALS['z']++;
        scrape_page();
    } else {
        print "Skrääpiminen suoritettu." . "\n";
        print "Sivuja yhteensä: " . $GLOBALS['z'] . "\n";
        print "Rivejä yhteensä: " . $GLOBALS['rowTotal'] . "\n";
    }
}
function clubURL($url) {
    // Scrape a club's season-by-season attendance table and save one record
    // per year; also appends the club's name to the global club list.
    $html = scraperwiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);
    $clubName = trim(str_replace(' ', '', $dom->find('table', 0)->find('tr', 2)->plaintext));
    $formatClubName = trim(preg_replace('/\\s+/', ' ', $clubName));
    // BUG FIX: the superglobal is $GLOBALS; "$_GLOBAL" silently created an
    // unrelated local array so the club list was never actually populated.
    $GLOBALS['clubs'][] = $formatClubName;
    echo 'running ' . $formatClubName . "\n";
    foreach ($dom->find('table', 2)->find('tr') as $row) {
        // Season rows begin with a numeric year cell.
        if (is_numeric($row->find('td', 0)->plaintext)) {
            $year = trim($row->find('td', 0)->plaintext);
            $position = trim(str_replace(' ', '', $row->find('td', 1)->plaintext));
            if (trim($position) == 'Champion') {
                $position = 1;
            }
            $leagueLevel = trim($row->find('td', 2)->plaintext);
            $overallPosition = trim($row->find('td', 3)->plaintext);
            // Attendance figures use '.' as a thousands separator.
            $avgAttendance = trim(str_replace('.', '', $row->find('td', 4)->plaintext));
            $totalAttendance = trim(str_replace('.', '', $row->find('td', 12)->plaintext));
            $dataset = array('club' => $formatClubName, 'year' => $year, 'finishedPosition' => $position, 'league' => $leagueLevel, 'overallPosition' => $overallPosition, 'avgAttendance' => $avgAttendance, 'totalAttendance' => $totalAttendance);
            scraperwiki::save(array('club', 'year'), $dataset);
        }
    }
    /*
     * The next two lines stop a memory leak in Simple HTML DOM as per
     * http://simplehtmldom.sourceforge.net/manual_faq.htm#memory_leak
     */
    $dom->clear();
    unset($dom);
}
function grep_munich($url, $table_name) {
    // Scrape the Munich airport flight board at $url and rebuild $table_name
    // from scratch with one row per flight.
    $html = scraperWiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);

    // Start from a clean slate: drop any previous snapshot of this table.
    scraperwiki::sqliteexecute("drop table if exists " . $table_name);
    scraperwiki::sqlitecommit();

    $count = 0;
    $table = $dom->getElementById('flight_info_area');
    foreach ($table->find('tr') as $row) {
        $tds = $row->find("td");
        // Rows with fewer than 7 cells are headers/separators — skip them.
        if (sizeof($tds) < 7) {
            continue;
        }
        // Key each flight by today's date plus its position in the table.
        $flight_data = array(
            "date" => date("Y-m-d"),
            "count" => $count,
            "flightnr" => $tds[1]->plaintext,
            "from" => $tds[2]->plaintext,
            "time" => $tds[3]->plaintext,
            "expected_time" => $tds[4]->plaintext,
        );
        scraperwiki::save_sqlite(array("date", "count"), $flight_data, $table_name);
        $count = $count + 1;
    }
}
function scrap_yp($last_alphabet = '', $last_page = '') {
    // Resume scraping yellowpages.co.id from the last saved letter/page
    // (stored via scraperwiki vars), defaulting to letter 'a', page 1.
    $alphabet = range('a', 'z');
    if (is_null($last_alphabet) || $last_alphabet == '') {
        $stored = scraperwiki::get_var('last_alphabet_loaded');
        $last_alphabet = is_null($stored) ? 'a' : $stored;
    }
    if (is_null($last_page) || $last_page == '') {
        $stored = scraperwiki::get_var('last_page_loaded');
        $last_page = is_null($stored) ? 1 : $stored;
    }
    $listing_url = 'http://www.yellowpages.co.id/browse/letter/' . $last_alphabet . '?page=' . $last_page;
    $dom = new simple_html_dom();
    $dom->load(scraperWiki::scrape($listing_url));
    // Echo each directory list found on the page.
    foreach ($dom->find("ul.directory-list") as $listing) {
        echo $listing;
    }
}
function getIngredients($html) {
    // From a Food Network search-result page ($html), follow the second
    // a.callout link to the recipe page and save every listed ingredient.
    $dom = new simple_html_dom();
    $dom->load($html);
    $res = $dom->find('a[class=callout]', 1)->href;
    // The link points at the reviews page; strip "reviews/" to reach the recipe.
    $res = str_replace("reviews/", "", $res);
    echo "http://www.foodnetwork.com" . $res;
    $html1 = scraperwiki::scrape("http://www.foodnetwork.com" . $res);
    $h = str_get_html($html1);
    echo "\n\n";
    // BUG FIX: $href was read before any assignment when the first iteration
    // found no href; initialise it so the save never references an undefined
    // variable. (Also removed the unused $i counter and $domFoods object.)
    $href = null;
    foreach ($h->find('li[class=ingredient]') as $data) {
        $ingredient = $data->plaintext;
        if (isset($h->href)) {
            $href = $h->href;
        }
        scraperwiki::save(array('ing'), array('ing' => $ingredient, 'href' => $href));
    }
}
function run_ml($q_num = 0) {
    // Scrape one result page of musiklegal.com's search, save each row to
    // sqlite, then recurse into the next page number taken from the "Next"
    // pagination link.
    $html = scraperWiki::scrape("http://musiklegal.com/search/result/a/" . $q_num);
    $dom = new simple_html_dom();
    $dom->load($html);
    foreach ($dom->find("tr") as $data) {
        $tds = $data->find("td");
        // Column 1 mixes anchor markup into the plaintext; strip the anchor
        // pieces and split on '">' to separate the song code from the title.
        $temp_data = explode('">', str_replace('</<strong>a</strong>>', '', str_replace('<<strong>a</strong> href="http://musiklegal.com/song/detail/', '', $tds[1]->plaintext)));
        $record = array('No' => str_replace('.', '', $tds[0]->plaintext), 'Code' => $temp_data[0], 'Song Title' => $temp_data[1], 'Artist' => $tds[2]->plaintext, 'Album' => $tds[3]->plaintext);
        /*
         * Stores results
         */
        scraperwiki::save_sqlite(array("No"), $record);
        unset($temp_data);
    }
    // Find the "Next" pagination link and extract its target page number.
    // NOTE(review): 'continue' only advances to the next anchor, so $tmp_a
    // keeps the last matching link's number. If no "Next" link exists,
    // $tmp_a is undefined below and the function falls through to exit.
    foreach ($dom->find("a") as $a) {
        if ($a->plaintext == 'Next') {
            $tmp_a = $a->href;
            $tmp_a = str_replace('http://musiklegal.com/search/result/a/', '', $tmp_a);
            if ($tmp_a > 0) {
                continue;
            }
        }
    }
    // Recurse while a positive next-page number was found; otherwise stop
    // the whole script.
    if ((int) $tmp_a != 0) {
        run_ml($tmp_a);
    } else {
        exit;
    }
}
function getCategories($u) {
    // Recursively walk category pages, writing each category's name, path and
    // URL either to the CSV handle $f (when $local is truthy) or to sqlite.
    global $baseurl, $f;
    $path = "";
    $d = new simple_html_dom();
    $d->load(scraperwiki::scrape($u));
    echo "Loaded URL: " . $u . "\n";
    // Only pages with the category-facet container hold sub-categories.
    if ($d->find('div[id=ctl00_cphContent_gsaCatFacetContainer]')) {
        // Build the breadcrumb path "A/B/.../leaf" for this page.
        $breadcrumb = $d->find('div[id=breadcrumb]', 0);
        //foreach($breadcrumb as $b) {
        //echo "Breadcrumb = " . $b;}
        if (!is_null($breadcrumb)) {
            foreach ($breadcrumb->children() as $crumb) {
                $path .= trim($crumb->innertext) . "/";
            }
            // Append the text after the last ">" (the current page's name).
            $path .= trim(strrchr($breadcrumb->innertext, ">"), "> ");
        }
        foreach ($d->find('div[id=ctl00_cphContent_gsaCatFacetContainer]', 0)->find('div[class=S2refinementsContainer]', 0)->children() as $div) {
            // Each child div holds one sub-category link; the name precedes
            // a parenthesised count.
            $name = trim(strstr($div->children(0)->innertext, "(", true));
            $url = $baseurl . $div->children(0)->href;
            $data = array("Name" => $name, "Path" => $path, "URL" => $url);
            echo $path . "/" . $name . "\n";
            // NOTE(review): $local is not in the global statement above, so it
            // is always undefined (falsy) here and the sqlite branch is always
            // taken. Add it to "global" if CSV output is intended.
            if ($local) {
                fputcsv($f, array($name, $path, $url));
            } else {
                scraperwiki::save_sqlite(array("URL"), $data);
            }
            getCategories($url);
        }
    }
}
function scrapeDetails($ngo) {
    // Load the NGO's detail page ($ngo["url"]) and pull out website/email
    // plus a fixed set of labelled facts; returns the enriched $ngo array.
    $dom = new simple_html_dom();
    $dom->load(scraperwiki::scrape($ngo["url"]));

    // Labels still to be located on the page.
    $infosWeWant = array('Telefon', 'Rechtsform', 'Steuerstatus', 'Weltanschauliche Ausrichtung', 'Anzahl Mitarbeiter', 'Gesamteinnahmen:', 'Davon Sammlungseinnahmen', 'Bezugsjahr:');

    foreach ($dom->find('p') as $p) {
        // Website and email live in anchors inside their paragraphs.
        if (strstr($p->plaintext, "Website")) {
            $ngo["website"] = $p->find('a', 0)->href;
        }
        if (strstr($p->plaintext, "Email")) {
            $ngo["email"] = $p->find('a', 0)->plaintext;
        }
        foreach ($infosWeWant as $key => $info) {
            $res = extractInfo($p, $info);
            if ($res) {
                $ngo[$info] = $res;
                // Found — don't search for this label on later paragraphs.
                unset($infosWeWant[$key]);
            }
        }
    }
    print_r($ngo);
    return $ngo;
}
function getProducts($u, $cat) {
    // Write every product on this listing page to the global CSV handle $o,
    // then follow the "next" pagination link recursively.
    global $o;
    $d = new simple_html_dom();
    $d->load(scraperwiki::scrape($u));
    $items = $d->find('li.grid-item');
    if (count($items) > 0) {
        foreach ($items as $item) {
            $link = $item->find('p.product-name > a', 0);
            $prodname = trim($link->innertext);
            // Products that show a "minimal price" are flagged with type 1.
            $prodtype = is_null($item->find('p.minimal-price', 0)) ? 0 : 1;
            fputcsv($o, array($prodname, $prodtype, $cat, $link->href));
            echo $prodname . "\n";
        }
        $next = $d->find('p.next', 0);
        if (!is_null($next)) {
            getProducts($next->href, $cat);
        }
    }
}
function kcci($uuid) {
    // Build a member profile from the second table on the KCCI profile page:
    // its cells alternate label, value, label, value, ...
    $html = file_get_html('http://www.kcci.com.pk/UserProfile/tabid/42/userId/' . $uuid . '/Default.aspx');
    $table = $html->find('table', 1);

    // Flatten every cell's text into a single list.
    $profile = array();
    foreach ($table->find('td') as $td) {
        array_push($profile, $td->plaintext);
    }

    // Pair up label/value cells into the record, keyed by the UUID.
    $record['UUID'] = $uuid;
    for ($i = 0; $i < count($profile); $i += 2) {
        $record[$profile[$i]] = $profile[$i + 1];
    }
    ksort($record);
    $unique_keys = array('UUID');
    scraperwiki::save_sqlite($unique_keys, $record, $table_name = "kcci", $verbose = 2);

    // Free DOM objects to avoid simple_html_dom memory leaks.
    unset($record);
    unset($profile);
    $td->clear();
    unset($td);
    $table->clear();
    unset($table);
    $html->clear();
    unset($html);
}
function alreadyKnown($cat, $url) {
    // Return true (and log) when this (cat, url) pair already exists in
    // swdata; false otherwise.
    // SECURITY FIX: the query was built by raw concatenation, so a value
    // containing a single quote broke the SQL (and allowed injection).
    // Double embedded quotes per the SQL standard before interpolating.
    $sql = "select distinct id from swdata where cat='"
        . str_replace("'", "''", $cat)
        . "' and url='"
        . str_replace("'", "''", $url)
        . "'";
    $data = scraperwiki::sqliteexecute($sql);
    if (count($data->data) === 0) {
        return false;
    }
    echo "already known : " . $url . " in " . $cat . "\n";
    return true;
}
function scraper($url_search, $country_id) {
    // Scrape one EURES job-search result page: for each result table, pull
    // the job URL/id, description and source, then insert a row into the
    // MySQL "job" table unless the URL was already stored.
    // NOTE(review): mysql_* functions are removed as of PHP 7; the INSERT is
    // built by string interpolation with hand-escaped quotes — injection-prone.
    // Consider migrating to PDO prepared statements.
    $has_next = false;
    $base_url = "http://ec.europa.eu/eures/eures-searchengine/servlet";
    $html = scraperwiki::scrape($url_search);
    $dom = new simple_html_dom();
    $dom->load($html);
    foreach ($dom->find('table[class=JResult]') as $result) {
        // The job link's onclick holds the servlet path in its first
        // single-quoted argument; strip the leading char to get the path.
        foreach ($result->find('td[class=JRTitle] a') as $job_page) {
            $chars = explode("'", $job_page->onclick);
            $url_job = $base_url . substr($chars[1], 1);
            $url_id = strstr($url_job, 'uniqueJvId=');
            $url_id = str_replace('uniqueJvId=', "", $url_id);
            echo "JOB: " . $url_job . "<br />";
        }
        // Description and source sit in <th>-labelled rows; the value is the
        // label's next sibling cell.
        foreach ($result->find('th') as $data) {
            $text = trim($data->plaintext);
            if ($text == 'Description:') {
                $description = trim($data->next_sibling()->plaintext);
                echo "DESCRIPTION: " . $description . "<br />";
            }
            if ($text == 'Source:') {
                $source = trim($data->next_sibling()->plaintext);
                $source = str_replace("'", "\\'", $source);
                if ($source != '' && $source != ' ') {
                    $source_id = insert_name('source', $source);
                    echo "SOURCE: " . $source . "<br /><br />";
                }
            }
        }
        // Escape quotes and strip stray </BR> tags before the raw SQL below.
        $description = str_replace("'", "\\'", $description);
        $description = str_replace("</BR>", "", $description);
        $sql = mysql_query("SELECT * FROM job WHERE url = '{$url_job}'");
        $cont = mysql_num_rows($sql);
        if ($cont == 0) {
            mysql_query("INSERT INTO job SET \n\t\t\t\t\turl = '{$url_job}', \n\t\t\t\t\turl_id = '{$url_id}', \n\t\t\t\t\tdescription = '{$description}', \n\t\t\t\t\tsource_id = '{$source_id}', \n\t\t\t\t\turl_search = '{$url_search}', \n\t\t\t\t\tcountry_id='{$country_id}',\n\t\t\t\t\turl_scraper_date = SYSDATE(),\t \n\t\t\t\t\turl_scraper_hour = SYSDATE()");
        } else {
            echo "Job URL already extracted: " . $url_job . "<br /><br />";
        }
    }
    // Detect the "Next page" pagination link for the (currently disabled)
    // recursive crawl below.
    foreach ($dom->find('div[class=prevNext] a') as $next_page) {
        $text = $next_page->plaintext;
        if ($text == "Next page") {
            $url_next = substr($next_page->href, 1);
            $url_next = $base_url . $url_next;
            $has_next = true;
            print "<br /><br />NEXT: " . $url_next . "<br /><br />";
        }
    }
    unset($html, $dom, $result, $job_page, $data, $next_page, $text, $url_id, $url_job, $description, $source, $source_id, $url_search);
    //Comment this for tests, uncomment this to get all data
    // if ($has_next == true){
    //     sleep(1);
    //     scraper($url_next, $country_id);
    // }
}
function get_codes($dom) {
    // Every <option> inside every <select> on the page is one stock
    // code/symbol pair; save each keyed on the code.
    foreach ($dom->find("select") as $select) {
        foreach ($select->find("option") as $option) {
            $record = array(
                'stockCode' => $option->value,
                'stockSymbol' => $option->plaintext,
            );
            $message = scraperwiki::save_sqlite(array("stockCode"), $record);
        }
    }
}
function get_codes($dom) {
    // Each "list_row" table row holds an item name followed by four prices
    // (buy/sell in CND and US); save one record per row, keyed on the item.
    foreach ($dom->find('tr[class^="list_row"]') as $row) {
        $cells = $row->find("td");
        $record = array(
            'item' => $cells[0]->plaintext,
            'BUY_CND' => $cells[1]->plaintext,
            'SELL_CND' => $cells[2]->plaintext,
            'BUY_US' => $cells[3]->plaintext,
            'SELL_US' => $cells[4]->plaintext,
        );
        scraperwiki::save_sqlite(array("item"), $record);
        print_r($record);
    }
}
function scrape($source) {
    // Append UTM tracking parameters to every link on the page at $source,
    // then print the rewritten HTML.
    // BUG FIX: $source was also listed in the global statement; PHP then
    // binds the name to the global variable, silently discarding the
    // caller's argument. Only the UTM settings belong in the global list.
    global $utmSource, $utmMedium, $utmTerm, $utmContent, $utmCampaign;
    $link = scraperwiki::scrape($source);
    $html = str_get_html($link);
    foreach ($html->find('a[href]') as $a) {
        $href = $a->href;
        $a->href = $href . '#utm_source=' . $utmSource . '&utm_medium=' . $utmMedium . '&utm_term=' . $utmTerm . '&utm_content=' . $utmContent . '&utm_campaign=' . $utmCampaign;
    }
    print $html;
}
function ripByPage($page) {
    // Fetch one page (10 rows) of the Sanandaj cemetery burial-search API
    // (JSON) and save one record per deceased person.
    $pathToDetails = 'http://aramestan.e-sanandaj.ir/BurialRequest/DeadSearch?keyword=&firstName=&lastName=&fatherName=&partNo=0&rowNo=&graveNo=&deathDateFrom=&deathDateTo=&bornDateFrom=&bornDateTo=&page=' . $page;
    $output = scraperwiki::scrape($pathToDetails);
    $resultingJsonObject = json_decode($output);
    for ($id = 0; $id <= 9; $id++) {
        $row = $resultingJsonObject->{'result'}[$id];
        $entry = array(
            'id' => $row->{'Id'},
            'fullname' => strval($row->{'DeadFullName'}),
            'fathername' => strval($row->{'DeadFatherName'}),
            'birthdate' => strval($row->{'BornDate'}),
            'deathdate' => strval($row->{'DeathDate'}),
            'partno' => strval($row->{'PartNo'}),
            'rowno' => strval($row->{'RowNo'}),
            'graveno' => strval($row->{'GraveNo'}),
            'gender' => strval($row->{'Gender'}),
            'identitycode' => strval($row->{'IdentityCode'}),
        );
        // BUG FIX: the unique-key list must name a column present in the
        // record; array('data') referenced no such column. Key on 'id'.
        scraperwiki::save_sqlite(array('id'), $entry);
    }
}
function errorHandler($errno, $errstr, $errfile, $errline) {
    // Custom error handler: format the error via errorParserStack() and dump
    // it through scraperwiki. Returning true marks the error as handled.
    // Respect the @ suppression operator — error_reporting() reads 0 inside it
    // (see http://php.net/manual/en/function.set-error-handler.php).
    if (error_reporting() == 0) {
        return;
    }
    global $script;
    $formatted = errorParserStack($errno, $errstr, $script);
    scraperwiki::sw_dumpMessage($formatted);
    return true;
}
function grab($url) {
    // Harvest proxy IPs from the proxy-list table at $url, decoding each
    // first cell and saving it keyed on the ip.
    $dom = new simple_html_dom();
    $dom->load(scraperWiki::scrape($url));
    foreach ($dom->find("#tbl_proxy_list tr") as $row) {
        $cells = $row->find("td");
        // Data rows have exactly 7 cells; the first holds the encoded IP.
        if (count($cells) == 7) {
            $ip = decode_ip((string) $cells[0]);
            scraperwiki::save(array('ip'), array('ip' => $ip));
        }
    }
}
function getLangs() {
    // Return the list of language codes linked from the DBpedia mappings
    // statistics index page (each anchor's href is of the form "/<lang>/").
    $url = "http://mappings.dbpedia.org/server/statistics/";
    $html = scraperwiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);
    // Removed the unused $i counter and redundant intermediate variable.
    $langs = array();
    foreach ($dom->find('/html/body/p/a') as $result) {
        // Strip the surrounding slashes to get the bare language code.
        $langs[] = str_replace("/", "", trim($result->href));
    }
    return $langs;
}
function scrapeIdeeLab() {
    // Scrape news posts from the IdeeLab "uudis" category page and store
    // one title/body record per published post.
    $dom = new simple_html_dom();
    $dom->load(scraperWiki::scrape("http://ideelab.wordpress.com/category/uudis/"));
    foreach ($dom->find('div.status-publish') as $post) {
        $titles = $post->find('div.posttitle h2.pagetitle');
        $bodies = $post->find('div.entry');
        scraperwiki::save(
            array('title', 'newsbody'),
            array('title' => $titles[0]->plaintext, 'newsbody' => $bodies[0]->plaintext)
        );
    }
}
function gazelangs($url, $lang) {
    // Scrape translated sentence pairs (user input vs. Babelfish output) from
    // the page at $url and save them along with scrape metadata.
    // Removed the dead $michi variable, which was assigned and never used.
    $html = scraperWiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);
    foreach ($dom->find("ul[@class='trans_sent']") as $data) {
        // Each list holds two <li>s: the user's input and the translation.
        $tds = $data->find("li");
        $record = array(
            'user_input' => $tds[0]->plaintext,
            'babelfish_output' => $tds[1]->plaintext,
            'timestamp_scrape' => date("Y-m-d H:i:s"),
            'page' => $url,
            'language' => $lang,
        );
        scraperwiki::save(array('user_input', 'babelfish_output', 'timestamp_scrape', 'page', 'language'), $record);
    }
}
function scrapeIndex($url) {
    // Collect a name/url pair for every NGO headline (h2 > a) on the index
    // page, persist each to the "ngos" table, and return the collected list.
    $dom = new simple_html_dom();
    $dom->load(scraperwiki::scrape($url));
    $ngos = array();
    foreach ($dom->find('h2') as $heading) {
        $name = str_replace("–", "-", html_entity_decode($heading->plaintext));
        $link = $heading->find('a', 0);
        $row = array("name" => $name, "url" => $link->href);
        $ngos[] = $row;
        scraperwiki::save_sqlite(array("name"), $row, "ngos");
    }
    print_r($ngos);
    return $ngos;
}
function crawlAgents($pageUrl, $domObj) {
    // Parse the user-agent table (6th table on the page) into the UserAgents
    // sqlite table, skipping the header row (which contains " String ").
    $html = scraperwiki::scrape($pageUrl);
    $domObj->load($html);
    $html = null; // release the raw page early
    $table = $domObj->find('/html/body/table[5]');
    foreach ($table[0]->find('tr') as $trs) {
        // BUG FIX: strpos() returns 0 (loosely == false) when the needle
        // matches at offset 0, so the header test needs strict ===.
        if (strpos($trs->firstChild()->plaintext, " String ") === false) {
            $tds = $trs->find('td');
            $agentstring = str_replace(' ', '', $tds[0]->plaintext);
            $agentdescription = str_replace(' ', '', $tds[1]->plaintext);
            $agenttype = str_replace(' ', '', $tds[2]->plaintext);
            $record = array('agent' => $agentstring, 'description' => $agentdescription, 'agent_type' => $agenttype);
            scraperwiki::save_sqlite(array('agent'), $record, $table_name = "UserAgents");
        }
    }
}
function getExcuse($extension) {
    // Fetch one excuse page, save its text with a sequential id, then jump
    // to the next excuse via goToNextURL().
    global $html;
    global $count;
    $root = "http://www.goodexcuses.co.uk";
    $html = file_get_html($root . $extension);
    // The excuse text is the first <h2> on the page.
    $excuse = $html->find('h2', 0)->innertext;
    echo $excuse . "\n";
    $count++;
    scraperwiki::save(
        array('EXCUSE_ID'),
        array('EXCUSE_ID' => $count, 'EXCUSE_TEXT' => $excuse, 'EXCUSE_URL' => $extension)
    );
    goToNextURL();
}