load() public method

Loads HTML from a string.
public load ( $str, $lowercase = true, $stripRN = true, $defaultBRText = DEFAULT_BR_TEXT )
Ejemplo n.º 1
0
/**
 * Converts markup to plain text by running the configured plaintext steps.
 *
 * The steps ('selectors', 'regex', 'textile', 'striptags') are executed in the
 * order given by the comma-separated rexsearch plaintext "order" setting;
 * unknown step names are ignored.
 *
 * @param string $_text   Text to convert.
 * @param mixed  $_remove Passed to simple_html_dom::remove() in the 'selectors' step.
 *
 * @return mixed The text after every configured step has run.
 */
function a587_getPlaintext($_text, $_remove)
{
    global $REX;
    $settings = $REX['ADDON']['rexsearch_plugins']['rexsearch']['plaintext']['settings'];
    foreach (explode(',', $settings['order']) as $step) {
        if ($step === 'selectors') {
            // Drop the elements matched by the given selectors, then re-parse
            // the remaining markup and keep only its text.
            $dom = new simple_html_dom();
            $dom->load($_text);
            $dom->remove($_remove);
            $dom->load($dom->outertext);
            $_text = $dom->plaintext;
        } elseif ($step === 'regex') {
            if (!empty($settings['regex'])) {
                // Non-empty lines alternate: pattern, replacement, pattern, ...
                $patterns = array();
                $replacements = array();
                $position = 0;
                foreach (explode("\n", $settings['regex']) as $line) {
                    if ($line == '') {
                        continue;
                    }
                    if ($position % 2 === 0) {
                        $patterns[] = trim($line);
                    } else {
                        $replacements[] = $line;
                    }
                    $position++;
                }
                $_text = preg_replace($patterns, $replacements, $_text);
            }
        } elseif ($step === 'textile') {
            // Run the text through the textile addon when it is enabled and installed.
            if (!empty($settings['textile']) && function_exists('rex_a79_textile')) {
                $_text = rex_a79_textile($_text);
            }
        } elseif ($step === 'striptags') {
            // Strip HTML tags.
            if (!empty($settings['striptags'])) {
                $_text = strip_tags($_text);
            }
        }
    }
    return $_text;
}
/**
 * Scrapes one result page of asuntojen.hintatiedot.fi and stores each sale row.
 *
 * Search parameters are read from the globals c, s, r, amin, amax and z (page).
 * Rows with more than 8 cells are saved to a table named "<c> <time>" and
 * echoed. When a page yields exactly 50 rows, z is incremented and the next
 * page is scraped recursively; otherwise a summary is printed.
 */
function scrape_page()
{
    $url = "http://asuntojen.hintatiedot.fi/haku/?c=" . $GLOBALS['c']
        . "&s=" . $GLOBALS['s']
        . "&r=" . $GLOBALS['r']
        . "&amin=" . $GLOBALS['amin']
        . "&amax=" . $GLOBALS['amax']
        . "&z=" . $GLOBALS['z'];
    $dom = new simple_html_dom();
    $dom->load(scraperWiki::scrape($url));

    $rowsOnPage = 0;
    foreach ($dom->find("tr") as $tr) {
        $cells = $tr->find("td");
        if (count($cells) <= 8) {
            // Not a data row.
            continue;
        }
        $rowsOnPage++;
        $GLOBALS['rowTotal']++;

        $district = $cells[0]->plaintext;
        $type = $cells[1]->plaintext;
        $size = $cells[2]->plaintext;
        $price = $cells[3]->plaintext;
        $pricePerSqm = $cells[4]->plaintext;

        $apt = array(
            "Uniikkiavain" => $GLOBALS['rowTotal'],
            "Kaupunginosa" => $district,
            "Myyntihinta" => $price,
            "Neliohinta" => $pricePerSqm,
            "Tyyppi" => $type,
            "Koko" => $size,
        );
        $table_name = $GLOBALS['c'] . " " . $GLOBALS['time'];
        scraperwiki::save_sqlite(null, $apt, $table_name);

        print $GLOBALS['rowTotal'] . "\n";
        print $rowsOnPage . ". Sijainti: " . $district . " Hinta: " . $price . " Tyyppi: " . $type . " Koko: " . $size . " Neliöhinta: " . $pricePerSqm . "€" . "\n";
    }

    if ($rowsOnPage != 50) {
        print "Skrääpiminen suoritettu." . "\n";
        print "Sivuja yhteensä: " . $GLOBALS['z'] . "\n";
        print "Rivejä yhteensä: " . $GLOBALS['rowTotal'] . "\n";
        return;
    }

    // A full page: assume there is another one.
    print "Vielä jatkuu, haetaan seuraava sivu..." . "\n";
    $GLOBALS['z']++;
    scrape_page();
}
/**
 * Scrapes one musiklegal.com result page, stores its rows, and follows "Next".
 *
 * Every table row with at least four cells is saved (unique key "No"). The
 * offset of the last "Next" link on the page is then used to recurse; when
 * there is no usable "Next" link the script terminates via exit.
 *
 * @param int|string $q_num Result offset appended to the search URL.
 */
function run_ml($q_num = 0)
{
    $html = scraperWiki::scrape("http://musiklegal.com/search/result/a/" . $q_num);
    $dom = new simple_html_dom();
    $dom->load($html);
    foreach ($dom->find("tr") as $data) {
        $tds = $data->find("td");
        if (count($tds) < 4) {
            // Header or spacer row: the cells read below do not exist.
            continue;
        }
        // NOTE(review): the "<strong>" fragments in these needles look like
        // highlighting residue from wherever this snippet was copied; they are
        // left untouched because the intended needles cannot be confirmed here.
        $temp_data = explode('">', str_replace('</<strong>a</strong>>', '', str_replace('<<strong>a</strong> href="http://musiklegal.com/song/detail/', '', $tds[1]->plaintext)));
        $record = array(
            'No' => str_replace('.', '', $tds[0]->plaintext),
            'Code' => $temp_data[0],
            // The split yields a single element when the separator is absent.
            'Song Title' => isset($temp_data[1]) ? $temp_data[1] : null,
            'Artist' => $tds[2]->plaintext,
            'Album' => $tds[3]->plaintext,
        );
        /*
         *  Stores results
         */
        scraperwiki::save_sqlite(array("No"), $record);
    }

    // Defaults to 0 so the check below is well-defined on the last page.
    $tmp_a = 0;
    foreach ($dom->find("a") as $a) {
        if ($a->plaintext == 'Next') {
            $tmp_a = str_replace('http://musiklegal.com/search/result/a/', '', $a->href);
        }
    }

    if ((int) $tmp_a != 0) {
        run_ml($tmp_a);
    } else {
        exit;
    }
}
Ejemplo n.º 4
0
 /**
  * Fetches the raw HTML of a page with cURL and loads it into $this->_html.
  *
  * The target is self::POPURL followed by $this->_trailUrl when that property
  * is set, otherwise self::IF18. The response body is kept in $this->_response.
  *
  * @param bool $usepost Send $this->_postParams as a POST request when exactly true.
  *
  * @return bool False when curl_exec() yields a falsy result, true otherwise.
  */
 private function getUrl($usepost = false)
 {
     $target = isset($this->_trailUrl) ? self::POPURL . $this->_trailUrl : self::IF18;
     $ch = curl_init($target);

     if ($usepost === true) {
         curl_setopt($ch, CURLOPT_CUSTOMREQUEST, "POST");
         curl_setopt($ch, CURLOPT_POST, 1);
         curl_setopt($ch, CURLOPT_POSTFIELDS, $this->_postParams);
     }

     curl_setopt_array($ch, array(
         CURLOPT_RETURNTRANSFER => 1,
         CURLOPT_HEADER => 0,
         CURLOPT_VERBOSE => 0,
         CURLOPT_FOLLOWLOCATION => true,
         CURLOPT_USERAGENT => "Firefox/2.0.0.1",
         CURLOPT_FAILONERROR => 1,
     ));

     // Read and write cookies from the same file when one is configured.
     if (isset($this->cookie)) {
         curl_setopt($ch, CURLOPT_COOKIEJAR, $this->cookie);
         curl_setopt($ch, CURLOPT_COOKIEFILE, $this->cookie);
     }

     curl_setopt_array($ch, newznab\utility\Utility::curlSslContextOptions());

     $this->_response = curl_exec($ch);
     curl_close($ch);

     if (!$this->_response) {
         return false;
     }

     $this->_html->load($this->_response);
     return true;
 }
Ejemplo n.º 5
0
/**
 * Recursively walks a category tree and records every category found.
 *
 * For the page at $u, the breadcrumb is turned into a path and each entry of
 * the refinements container is written either to the CSV handle $f (when the
 * global $local flag is truthy) or to the scraperwiki datastore, after which
 * the entry's own URL is crawled the same way.
 *
 * Uses the globals $baseurl (prefix for relative links), $f (CSV file handle)
 * and $local (output switch).
 *
 * @param string $u URL of the category page to scrape.
 */
function getCategories($u)
{
    // $local was previously read without being imported, so it was always
    // undefined inside the function and the CSV branch could never run.
    global $baseurl, $f, $local;
    $path = "";
    $d = new simple_html_dom();
    $d->load(scraperwiki::scrape($u));
    echo "Loaded URL: " . $u . "\n";

    $facets = $d->find('div[id=ctl00_cphContent_gsaCatFacetContainer]', 0);
    if (is_null($facets)) {
        // No facet container on this page: nothing to descend into.
        return;
    }

    $breadcrumb = $d->find('div[id=breadcrumb]', 0);
    if (!is_null($breadcrumb)) {
        foreach ($breadcrumb->children() as $crumb) {
            $path .= trim($crumb->innertext) . "/";
        }
        // The current category is the text after the last ">" in the breadcrumb.
        $path .= trim(strrchr($breadcrumb->innertext, ">"), "> ");
    }

    $refinements = $facets->find('div[class=S2refinementsContainer]', 0);
    if (is_null($refinements)) {
        // Facet container without a refinements list: no sub-categories.
        return;
    }

    foreach ($refinements->children() as $div) {
        $link = $div->children(0);
        // The link text looks like "Name (count)"; keep the part before "(".
        $name = trim(strstr($link->innertext, "(", true));
        $url = $baseurl . $link->href;
        $data = array("Name" => $name, "Path" => $path, "URL" => $url);
        echo $path . "/" . $name . "\n";
        if ($local) {
            fputcsv($f, array($name, $path, $url));
        } else {
            scraperwiki::save_sqlite(array("URL"), $data);
        }
        getCategories($url);
    }
}
Ejemplo n.º 6
0
 /**
  * Parses the fetched response into a list of order records.
  *
  * @return mixed The raw response when it is empty or shorter than 100 bytes
  *               (an empty response indicates the cookie has expired), the
  *               string 'no_order' when no trade numbers are found, otherwise
  *               an array of records with the keys time, title, trade, name
  *               and amount.
  */
 public function parse()
 {
     require_once dirname(__FILE__) . '/simple_html_dom.php';
     $data = $this->requestURL();
     // The length check used to read strlen($data < 100), which measured the
     // string form of a boolean instead of the response itself.
     if (empty($data) || strlen($data) < 100) {
         // An empty response means the cookie is no longer valid.
         return $data;
     }
     $html = new simple_html_dom();
     $html->load($data);
     $ymd = $html->find('.time-d');
     $his = $html->find('.time-h');
     $title = $html->find('.consume-title a');
     $trade = $html->find('td.tradeNo p');
     $name = $html->find('p.name');
     $amount = $html->find('td.amount span');
     if (!$trade) {
         $html->clear();
         return 'no_order';
     }
     $info = array();
     foreach ($ymd as $key => $value) {
         // Keep only the numeric part of the order number.
         preg_match('/\\d+/', $trade[$key]->innertext, $tradeNo);
         // Extra checks can be added here, e.g. looking the order up in a
         // database to see whether it has already been notified successfully.
         $info[] = array(
             'time' => trim($ymd[$key]->innertext) . ' ' . trim($his[$key]->innertext),
             'title' => trim($title[$key]->innertext),
             'trade' => isset($tradeNo[0]) ? trim($tradeNo[0]) : '',
             'name' => trim($name[$key]->innertext),
             'amount' => trim(str_replace('+', '', $amount[$key]->innertext)),
         );
     }
     $html->clear();
     return $info;
 }
Ejemplo n.º 7
0
 /**
  * Fetches the raw HTML of a page with cURL and loads it into $this->_html.
  *
  * The target is $this->_whichSite[$site] followed by $this->_trailUrl when
  * that property is set, otherwise self::IF18. The response body is kept in
  * $this->_response.
  *
  * @param bool   $usepost Send $this->_postParams as a POST request when exactly true.
  * @param string $site    Key into $this->_whichSite selecting the base URL.
  *
  * @return bool False when curl_exec() yields a falsy result, true otherwise.
  */
 private function getUrl($usepost = false, $site = "straight")
 {
     $target = isset($this->_trailUrl) ? $this->_whichSite[$site] . $this->_trailUrl : self::IF18;
     $ch = curl_init($target);

     if ($usepost === true) {
         curl_setopt($ch, CURLOPT_CUSTOMREQUEST, "POST");
         curl_setopt($ch, CURLOPT_POST, 1);
         curl_setopt($ch, CURLOPT_POSTFIELDS, $this->_postParams);
     }

     curl_setopt_array($ch, array(
         CURLOPT_RETURNTRANSFER => 1,
         CURLOPT_HEADER => 0,
         CURLOPT_VERBOSE => 0,
         CURLOPT_FOLLOWLOCATION => true,
         CURLOPT_USERAGENT => "Firefox/2.0.0.1",
         CURLOPT_FAILONERROR => 1,
     ));

     // Read and write cookies from the same file when one is configured.
     if (isset($this->cookie)) {
         curl_setopt($ch, CURLOPT_COOKIEJAR, $this->cookie);
         curl_setopt($ch, CURLOPT_COOKIEFILE, $this->cookie);
     }

     $this->_response = curl_exec($ch);
     curl_close($ch);

     if (!$this->_response) {
         return false;
     }

     $this->_html->load($this->_response);
     return true;
 }
/**
 * Scrapes a fixed norwegian.no low-fare page and saves one record per row.
 *
 * Rows are the TR elements with HEIGHT='25'; the first six div elements of a
 * row supply airline, flight number, destination, time, gate and remarks.
 *
 * @param mixed $param Not used by this function.
 * @param mixed $type  Stored as "flight_type" on every record.
 */
function scrapeHTML($param, $type)
{
    $dom = new simple_html_dom();
    $dom->load(scraperWiki::scrape("http://www.norwegian.no/fly/lavpris/?D_City=CPH&A_City=DUB&TripType=2&D_Day=1&D_Month=201104&R_Day=1&R_Month=201104&AdultCount=1&ChildCount=0&InfantCount=0"));

    foreach ($dom->find("TR[@HEIGHT='25']") as $row) {
        $cells = $row->find("div");

        // Clean the six columns in order.
        $columns = array();
        for ($i = 0; $i < 6; $i++) {
            $columns[] = removeSpaces($cells[$i]->plaintext);
        }
        list($airline, $flight_num, $destination, $time, $gate, $remarks) = $columns;

        // The header row carries the literal column title.
        if ($airline == "Airline") {
            continue;
        }

        $flight_data = array(
            "date" => date("m.d.y"),
            "airline" => $airline,
            "flight_type" => $type,
            "flight_num" => $flight_num,
            "destination" => $destination,
            "time" => $time,
            "gate" => $gate,
            "remarks" => $remarks,
        );
        saveData(array("date", "airline", "flight_type", "flight_num"), $flight_data);
    }

    $dom->clear();
}
/**
 * Rebuilds a flight table from the rows of the element with id "flight_info_area".
 *
 * The target table is dropped first, then every row with at least seven cells
 * is stored with today's date and a running counter as its unique key.
 *
 * @param string $url        Page to scrape.
 * @param string $table_name Datastore table to drop and refill.
 */
function grep_munich($url, $table_name)
{
    $html = scraperWiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);

    // Start from a clean slate: remove whatever an earlier run stored.
    scraperwiki::sqliteexecute("drop table if exists " . $table_name);
    scraperwiki::sqlitecommit();

    $area = $dom->getElementById('flight_info_area');
    $position = 0;
    foreach ($area->find('tr') as $row) {
        $cells = $row->find("td");
        // Rows with fewer than 7 columns are not flight rows.
        if (count($cells) < 7) {
            continue;
        }
        $flight_data = array(
            "date" => date("Y-m-d"),
            "count" => $position,
            "flightnr" => $cells[1]->plaintext,
            "from" => $cells[2]->plaintext,
            "time" => $cells[3]->plaintext,
            "expected_time" => $cells[4]->plaintext,
        );
        scraperwiki::save_sqlite(array("date", "count"), $flight_data, $table_name);
        $position++;
    }
}
Ejemplo n.º 10
0
/**
 * Downloads a TED RSS feed and stores the data-page URL of every item.
 *
 * Each item's guid has "TEXT" replaced by "DATA" and is saved together with
 * the sector and a timestamp (unique key: sector + url). Progress and memory
 * usage are printed along the way, and the function sleeps one second after
 * each item.
 *
 * @param string $url    Feed URL.
 * @param string $sector Sector label stored with every record.
 */
function scrapeTEDRSS($url, $sector)
{
    print $url . " " . $sector . "\n";
    $curl = curl_init($url);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($curl, CURLOPT_MAXREDIRS, 10);
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
    // Abort the whole transfer after 20 seconds.
    curl_setopt($curl, CURLOPT_TIMEOUT, 20);
    $xml = curl_exec($curl);
    print curl_error($curl) . "\n";
    // Release the handle; it was previously left open on every call.
    curl_close($curl);
    if ($xml === false) {
        // The transfer failed (the error was printed above): nothing to parse.
        return;
    }

    $dom = new simple_html_dom();
    $dom->load($xml);
    $items = $dom->find("item");
    foreach ($items as $item) {
        $guid = $item->find("guid");
        $noticeURL = str_replace("TEXT", "DATA", $guid[0]->plaintext);
        print $noticeURL . " " . $sector . " " . memory_get_usage() / 1000000 . "MB";
        echo "\n";
        $record = array('time' => microtime(true), 'sector' => $sector, 'url' => $noticeURL);
        scraperwiki::save(array('sector', 'url'), $record);
        sleep(1);
    }

    // Free the parsed tree through the parser's public API rather than by
    // invoking the destructor by hand.
    $dom->clear();
    unset($items);
    unset($dom);
    unset($xml);
    print memory_get_usage() / 1024 / 1024 . "MB\n";
}
Ejemplo n.º 11
0
 /**
  * Compiles a template file: reads it, converts its DOM with
  * {@see convert()} and applies macros with {@see transform()}.
  *
  * When the template contains an element with a j:main attribute, only that
  * element is converted; otherwise every top-level node (elements and text)
  * is converted in document order.
  *
  * @param string $template Template file path.
  * @return string PHP template content.
  * @throws InvalidTemplateException If the template cannot be read or parsed.
  */
 public function compile($template)
 {
     $dom = new \simple_html_dom();
     $this->currentTemplate = $template;

     $source = file_get_contents($template);
     if ($source === false) {
         throw new InvalidTemplateException(tr('Could not read template: %1', $template));
     }
     // Lowercase tag names, but keep line breaks intact.
     if (!$dom->load($source, true, false)) {
         throw new InvalidTemplateException(tr('Could not parse template: %1', $template));
     }

     $root = new InternalNode();
     $main = $dom->find('[j:main]', 0);
     if ($main !== null) {
         $root->append($this->convert($main));
     } else {
         foreach ($dom->find('*, text') as $node) {
             // Only direct children of the document root are converted here.
             if ($node->parent->tag == 'root') {
                 $root->append($this->convert($node));
             }
         }
     }

     $this->transform($root);
     return $root->__toString();
 }
Ejemplo n.º 12
0
/**
 * Scrapes one Gatherer card page and stores the card's details.
 *
 * All text values are converted from UTF-8 to ISO-8859-1 (with
 * transliteration) and trimmed before being saved.
 *
 * @param string $url Card detail page URL.
 */
function getCardInfo($url)
{
    $baseURL = 'http://gatherer.wizards.com/Pages/Card/';
    $dom = new simple_html_dom();
    $dom->load(scraperWiki::scrape($url));

    $idPrefix = 'ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_';

    // First "value" div inside the named detail row (e.g. "cmcRow").
    $valueOf = function ($row) use ($dom, $idPrefix) {
        return $dom->find('div[id=' . $idPrefix . $row . '] div[class=value]', 0);
    };
    // UTF-8 -> ISO-8859-1 with transliteration.
    $toLatin1 = function ($value) {
        return iconv("UTF-8", "ISO-8859-1//TRANSLIT", $value);
    };

    $cardImage = $dom->find('img[id=' . $idPrefix . 'cardImage]', 0)->src;
    $imgURL = $baseURL . str_replace("amp;", "", $cardImage);

    $name = $toLatin1($valueOf('nameRow')->plaintext);

    // The mana cost is the first letter of each mana symbol's alt text.
    $mana = "";
    foreach ($dom->find('div[id=' . $idPrefix . 'manaRow] div[class=value] img') as $symbol) {
        $mana .= substr($symbol->alt, 0, 1);
    }
    $mana = $toLatin1($mana);

    // These rows are converted from the node itself, not from its plaintext.
    $cmc = $toLatin1($valueOf('cmcRow'));
    $type = $toLatin1($valueOf('typeRow'));
    $text = $toLatin1($valueOf('textRow'));
    $flavor = $toLatin1($valueOf('flavorRow'));
    $cardNumber = $toLatin1($valueOf('numberRow'));
    $artist = $toLatin1($valueOf('artistRow'));
    $rarity = $toLatin1($valueOf('rarityRow'));
    $set = $toLatin1($valueOf('setRow'));

    // NOTE(review): the unique key is spelled "card" while the data key is
    // "Card" — confirm the datastore treats these as the same column.
    scraperwiki::save_sqlite(array("card"), array(
        "Card" => trim($name),
        "Image" => $imgURL,
        "Mana" => trim($mana),
        "CMC" => trim($cmc),
        "Type" => trim($type),
        "Card Text" => trim($text),
        "Flavor Text" => trim($flavor),
        "Artist" => trim($artist),
        "Card Number" => trim($cardNumber),
        "Rarity" => trim($rarity),
        "Expansion" => trim($set),
    ));
}
/**
 * Prints the text nodes that share a parent with the "discs" anchor.
 *
 * Fetches $rec['url'], prints the 11th text node and the total count, then
 * prints the index, length and content of every text node whose string
 * length is not exactly 3.
 *
 * @param array $rec Record with a 'url' entry.
 */
function do_day($rec)
{
    $dom = new simple_html_dom();
    $dom->load(scraperwiki::scrape($rec['url']));

    $anchors = $dom->find('a[name=discs]');
    $lines = $anchors[0]->parent->find('text');
    print $lines[10] . "\n";
    print count($lines) . "\n";

    // Walk by index rather than foreach: per the original author, null lines
    // stop a foreach.
    $n = 0;
    $line_no = 0;
    while ($line_no < count($lines)) {
        $line = $lines[$line_no];
        // Rows of length 3 are skipped because the DOM object crashes on them.
        if (strlen($line) != 3) {
            print $line_no . " " . strlen($line) . "\n";
            $n = $n + 1;
            print $line . "\n";
        }
        $line_no++;
    }
}
Ejemplo n.º 14
0
 /**
  * Rewrites the URLs inside an HTML fragment so they are absolute.
  *
  * Covers the href of a/link elements, the src of img/script elements and
  * url(...) references inside style elements. Elements that do not carry the
  * attribute are left untouched; previously every matching element had the
  * attribute written, so e.g. an inline script gained a src attribute.
  *
  * @param string $sBaseUrl Base URL that relative URLs are resolved against.
  * @param string $sHtml    HTML to process.
  *
  * @return string The rewritten HTML.
  */
 public static function absolutizeHtml($sBaseUrl, $sHtml)
 {
     $oHtml = new simple_html_dom();
     $oHtml->load($sHtml);

     // Tag name => URL-bearing attribute.
     $aUrlAttributes = array('a' => 'href', 'img' => 'src', 'script' => 'src', 'link' => 'href');
     foreach ($aUrlAttributes as $sTagName => $sAttribute) {
         // The [attribute] selector matches only elements that have the attribute.
         foreach ($oHtml->find($sTagName . '[' . $sAttribute . ']') as $oTag) {
             $oTag->$sAttribute = self::absolutizeUrl($sBaseUrl, $oTag->$sAttribute);
         }
     }

     // Parse url() in inline css
     foreach ($oHtml->find('style') as $oTag) {
         $oTag->innertext = preg_replace_callback('|url\\s*\\(\\s*[\'"]?([^\'"\\)]+)[\'"]?\\s*\\)|', function ($aMatches) use($sBaseUrl) {
             return 'url("' . trim(self::absolutizeUrl($sBaseUrl, $aMatches[1])) . '")';
         }, $oTag->innertext);
     }

     return (string) $oHtml;
 }
Ejemplo n.º 15
0
/**
 * Scrapes one page of the geipan.fr case search and stores every case row.
 *
 * The search form is POSTed with the requested page number. Each table row
 * that has an onclick attribute is treated as a case: its ID is the 13
 * characters following "cas=" in that attribute, and its first five cells
 * supply title, date, departement, classe and last-update values.
 *
 * @param int|string $page Result page number to request.
 */
function scrapPage($page)
{
    print "Scraping page " . $page;
    $url = "http://www.geipan.fr/index.php?id=202";
    $fields_string = "&no_cache=1&" . "tx_geipansearch_pi1%5Bsubmit_form%5D=1&" . "tx_geipansearch_pi1%5Btexte_resume%5D=&" . "tx_geipansearch_pi1%5Bdate_debut%5D=&" . "tx_geipansearch_pi1%5Bdate_fin%5D=&" . "no_cache=1&" . "tx_geipansearch_pi1%5Bclasse_cas%5D=tous&" . "tx_geipansearch_pi1%5Bregion%5D=&" . "page=" . $page . "&" . "order_by=&" . "sens=";
    $curl = curl_init($url);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($curl, CURLOPT_MAXREDIRS, 10);
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($curl, CURLOPT_TIMEOUT, 20);
    // CURLOPT_POST is an on/off switch, not a field count.
    curl_setopt($curl, CURLOPT_POST, true);
    curl_setopt($curl, CURLOPT_POSTFIELDS, $fields_string);
    $html = curl_exec($curl);
    print curl_error($curl) . "\n";
    // Release the handle; it was previously left open on every call.
    curl_close($curl);
    if ($html === false) {
        // The transfer failed (the error was printed above): nothing to parse.
        return;
    }

    $dom = new simple_html_dom();
    $dom->load($html);
    foreach ($dom->find("tr") as $tr) {
        if (!isset($tr->attr['onclick'])) {
            continue;
        }
        $onclick = $tr->attr['onclick'];
        $ID = substr($onclick, strpos($onclick, "cas=") + 4, 13);
        print $ID . "\n";
        $tds = $tr->find("td");
        $title = utf8_encode($tds[0]->plaintext);
        $date = $tds[1]->plaintext;
        $departement = utf8_encode($tds[2]->plaintext);
        $classe = $tds[3]->plaintext;
        $maj = $tds[4]->plaintext;

        // The city is the title up to (not including) the character before
        // "(". Without a "(" the whole title is used; the old arithmetic on a
        // false strpos() result silently dropped the title's last character.
        $parenPos = strpos($title, "(");
        if ($parenPos === false) {
            $city = $title;
        } else {
            $city = substr($title, 0, max($parenPos - 1, 0));
        }

        $record = array('ID' => $ID, 'title' => $title, 'date' => $date, 'departement' => $departement, 'classe' => $classe, 'maj' => $maj, 'city' => $city);
        scraperwiki::save(array('ID', 'maj'), $record);
    }
}
Ejemplo n.º 16
0
 /**
  * Extracts image, title and price from each search result in the given HTML.
  *
  * @param string $input HTML of a result page.
  *
  * @return array|null The result list (also stored in $this->aResult), keyed
  *                    by result position; nothing is returned or stored when
  *                    no result item is found.
  */
 protected function parsing($input)
 {
     include_once "inc/simple_html_dom.php";
     # Create a DOM parser object
     $html = new simple_html_dom();
     # Parse the HTML from Amazon.
     $html->load($input);
     $result = [];
     # Iterate over all result items
     foreach ($html->find('li[class=s-result-item]') as $key => $innerData) {
         // Start every item from scratch: without this reset, a field missing
         // on one item silently kept the value of the previous item.
         $atmp = [];
         //image
         foreach ($innerData->find('img[class=s-access-image]') as $img) {
             $atmp['image'] = $img->getAttribute('src');
         }
         //title
         foreach ($innerData->find('h2[class=s-access-title]') as $title) {
             $atmp['title'] = $title->innertext();
         }
         //price
         foreach ($innerData->find('span[class=s-price]') as $price) {
             $price = $price->innertext();
             $atmp['price'] = $price;
             // Numeric form: drop the first character and the thousands separators.
             $atmp['numPrice'] = str_replace(",", '', substr($price, 1));
         }
         $result[$key] = $atmp;
     }
     if (!empty($result)) {
         return $this->aResult = $result;
     }
 }
Ejemplo n.º 17
0
 /**
  * Fetches the lottery draw page and extracts the latest draw into $this->data.
  *
  * Terminates the script via exit when the page cannot be fetched or when the
  * draw has not been published yet. On success $this->data holds the keys
  * qihao (issue number), kjsj (current timestamp) and code (winning number).
  */
 private function get_data()
 {
     include_once 'simplehtmldom_1_5/simple_html_dom.php';
     $simple_html_dom = new \simple_html_dom();
     // Fetch through the zlib wrapper so a compressed response is inflated.
     $data = @file_get_contents("compress.zlib://" . self::URL);
     if (!$data) {
         $this->setLog(false, '重庆时时彩-开奖数据抓取失败');
         exit('重庆时时彩-数据抓取失败,请尽快联系网站管理员' . "\r\n");
     }
     // Convert to UTF-8.
     $encode = mb_detect_encoding($data, array('ASCII', 'UTF-8', 'GB2312', "GBK", 'BIG5'));
     $content = iconv($encode, 'UTF-8', $data);
     $simple_html_dom->load($content);

     $aside = $simple_html_dom->find('div[class=aside]', 0);
     // Issue number of the draw.
     $qihao = $aside->find('h3', 0)->find('em', 0)->plaintext;

     // Second row of the draw table: cell 1 is the winning number, cell 2 the status.
     $drawRow = $aside->find('div[class=mod-aside mod-aside-xssckj]', 0)->find('div[class=bd]', 0)->find('div[class=kpkjcode]', 0)->find('table', 0)->find('tr', 1);

     // Winning number.
     $code = $drawRow->find('td', 1)->plaintext;
     if ($code == '--') {
         exit('重庆时时彩-等待开奖...' . "\r\n");
     }
     $isKaiJiang = $drawRow->find('td', 2)->plaintext;
     // Either marker means the draw is still pending. The two comparisons were
     // previously joined with &&, which can never be true for a single value.
     if ($isKaiJiang == '--' || $isKaiJiang == '开奖中') {
         exit('重庆时时彩-等待开奖...' . "\r\n");
     }
     $simple_html_dom->clear();
     // Remove the spaces between the digits of the winning number.
     $code = str_replace(" ", '', $code);
     // Draw time (taken as "now").
     $kjsj = date('Y-m-d H:i:s');
     $this->data = ['qihao' => $qihao, 'kjsj' => $kjsj, 'code' => $code];
 }
/**
 * Downloads a page and returns it parsed as a simple_html_dom document.
 *
 * @param string $url Page to fetch.
 *
 * @return simple_html_dom The loaded document.
 */
function get_dom($url)
{
    $document = new simple_html_dom();
    $document->load(scraperWiki::scrape($url));
    return $document;
}
/**
 * Enriches an NGO record with details scraped from its detail page.
 *
 * Every paragraph of the page at $ngo["url"] is inspected for a website link,
 * an email link and a fixed list of labelled facts (via extractInfo()). Each
 * fact is stored under its label and not searched for again once found.
 *
 * @param array $ngo Record with at least a "url" entry.
 *
 * @return array The record with the discovered fields added.
 */
function scrapeDetails($ngo)
{
    $dom = new simple_html_dom();
    $dom->load(scraperwiki::scrape($ngo["url"]));

    $pending = array('Telefon', 'Rechtsform', 'Steuerstatus', 'Weltanschauliche Ausrichtung', 'Anzahl Mitarbeiter', 'Gesamteinnahmen:', 'Davon Sammlungseinnahmen', 'Bezugsjahr:');

    foreach ($dom->find('p') as $paragraph) {
        $text = $paragraph->plaintext;
        if (strpos($text, "Website") !== false) {
            $ngo["website"] = $paragraph->find('a', 0)->href;
        }
        if (strpos($paragraph->plaintext, "Email") !== false) {
            $ngo["email"] = $paragraph->find('a', 0)->plaintext;
        }
        foreach ($pending as $index => $label) {
            $value = extractInfo($paragraph, $label);
            if (!$value) {
                continue;
            }
            $ngo[$label] = $value;
            // Found: do not look for this label in later paragraphs.
            unset($pending[$index]);
        }
    }

    print_r($ngo);
    return $ngo;
}
Ejemplo n.º 20
0
 /**
  * Scrapes every ".textbox" entry of a listing page.
  *
  * @param string $url Listing page URL; its host is used to absolutize links.
  *
  * @return array Items (title, url, date, pages, thumb, tags) in reverse page
  *               order. Tags are serialized as "#tag1#tag2#".
  */
 private function scrap_page($url)
 {
     $base_url = 'http://' . parse_url($url, PHP_URL_HOST);
     $p = new Page($url);
     $h = new simple_html_dom();
     $h->load($p->content());
     $boxes = $h->find('.textbox');
     $result = array();
     foreach ($boxes as $box) {
         // image/url
         $content = $box->find('.textbox-content', 0);
         $itemUrl = $base_url . $content->find('a', 0)->href;
         $thumb = $base_url . $content->find('img', 0)->src;
         // other data
         $label = $box->find('.webcss-label', 0);
         $title = $label->find('p', 0)->find('a', 0)->innertext;
         $title = html_entity_decode($title, ENT_COMPAT, 'UTF-8');
         $h2 = $label->find('h2', 0);
         $date = Text::create($h2->innertext)->cut_after('>:')->to_s();
         $h5 = $label->find('h5', 0);
         $tags = Text::create($h5->innertext)->strip_tags()->cut_after(':')->to_s();
         // Trim each tag, then drop the empty ones. The previous
         // array_filter(..., 'trim') only filtered: the surviving tags kept
         // their surrounding whitespace inside the "#tag#" string.
         $tags = array_filter(array_map('trim', explode(',', $tags)), 'strlen');
         $view = $label->find('.webcss_view', 0);
         // First run of digits in the view label.
         $m = Text::create($view->innertext)->regex_match('/(\\d+)/');
         $pages = $m[1];
         $item = array('title' => $title, 'url' => $itemUrl, 'date' => $date, 'pages' => $pages, 'thumb' => $thumb, 'tags' => '#' . implode('#', $tags) . '#');
         $result[] = $item;
     }
     return array_reverse($result);
 }
Ejemplo n.º 21
0
/**
 * Returns the number of listings for a query and the median listing price.
 *
 * The total is read from the paginator of the first result page. The median
 * is the price of the middle row, or the rounded mean of the two middle rows
 * when the total is even. Result pages are requested through the 'ak' offset
 * parameter and each page is fetched at most once.
 *
 * Unlike the previous implementation, the page holding each middle row is
 * derived from that row's own position, so a median that is the last row of
 * a page, or that straddles two pages, no longer indexes row -1.
 *
 * @param array  $params Query parameters for the listing.
 * @param string $url    Listing endpoint.
 *
 * @return array 'n' => total result count as scraped, 'val' => median price
 *               (0 when there are no results).
 */
function read_listing($params, $url = 'http://www.auto24.ee/kasutatud/nimekiri.php')
{
    $rowSelector = "[@id=usedVehiclesSearchResult] .result-row";

    // Fetches and parses one listing page for the given query parameters.
    $loadPage = function ($query) use ($url) {
        $dom = new simple_html_dom();
        $dom->load(scraperWiki::scrape(build_query($url, $query)));
        return $dom;
    };

    $firstPage = $loadPage($params);
    $totalResultsEl = $firstPage->find('.paginator .current-range strong');
    $totalResults = $totalResultsEl[0]->plaintext;
    if ((int) $totalResults < 1) {
        // No rows at all: there is no median to read.
        return array('n' => $totalResults, 'val' => 0);
    }

    // 1-based positions of the middle row(s); equal when the total is odd.
    $medianItem = ($totalResults + 1) / 2;
    $lowItem = (int) floor($medianItem);
    $highItem = (int) ceil($medianItem);

    // Parsed pages keyed by their row offset; the first page is offset 0.
    $pages = array(0 => $firstPage);

    // Price of the row at the given 1-based overall position.
    $priceOf = function ($item) use (&$pages, $params, $loadPage, $rowSelector) {
        $offset = (int) floor(($item - 1) / RESULTS_PER_PAGE) * RESULTS_PER_PAGE;
        if (!isset($pages[$offset])) {
            $params['ak'] = $offset;
            $pages[$offset] = $loadPage($params);
        }
        $rows = $pages[$offset]->find($rowSelector);
        $rowData = get_row_data($rows[$item - 1 - $offset]);
        return $rowData['price'];
    };

    if ($lowItem == $highItem) {
        $a24ksi = $priceOf($lowItem);
    } else {
        $a24ksi = round(($priceOf($lowItem) + $priceOf($highItem)) / 2);
    }
    return array('n' => $totalResults, 'val' => $a24ksi);
}
/**
 * Scrapes one news article and stores a medium-length teaser for it.
 *
 * The article paragraphs are concatenated, each followed by " #<n># " where
 * n is the paragraph number, and a 120-word teaser of that text is saved
 * under the article URL.
 *
 * @param string $art_url Article URL.
 *
 * @return array The saved record ("TeaserM" and "URL").
 */
function scrape_NG_news_article($art_url)
{
    $html = scraperWiki::scrape($art_url);
    require_once 'scraperwiki/simple_html_dom.php';
    $dom = new simple_html_dom();
    $dom->load($html);

    // Title and subtitle are read (last match wins) but are not part of the
    // record that is currently saved.
    foreach ($dom->find("div#page_head h1") as $heading) {
        $art_title = $heading->innertext;
    }
    foreach ($dom->find("div#page_head h2") as $heading) {
        $art_subtitle = $heading->innertext;
    }

    $art_text_full = "";
    $paragraphNo = 0;
    foreach ($dom->find("div#content div.article_text p") as $paragraph) {
        $paragraphNo++;
        // Re-parse the paragraph markup and take its plain text.
        $plain = str_get_html($paragraph)->plaintext;
        $art_text_full .= $plain . " #" . $paragraphNo . "# ";
    }

    // The short teaser is computed but only the medium one is stored.
    $art_teaserS = word_teaser($art_text_full, 60);
    $art_teaserM = word_teaser($art_text_full, 120);

    $record = array("TeaserM" => $art_teaserM, "URL" => $art_url);
    scraperwiki::save(array('URL'), $record);
    return $record;
}
Ejemplo n.º 23
0
/**
 * Writes every product of a category listing to the CSV handle $o.
 *
 * Each "li.grid-item" yields a row of name, type flag (1 when the item has a
 * "p.minimal-price" element, else 0), category and product URL. When the page
 * has a "p.next" element, the listing it points to is processed the same way.
 *
 * @param string $u   Listing page URL.
 * @param mixed  $cat Category value written with every row.
 */
function getProducts($u, $cat)
{
    global $o;
    $page = new simple_html_dom();
    $page->load(scraperwiki::scrape($u));

    $items = $page->find('li.grid-item');
    if (count($items) <= 0) {
        // Empty page: nothing to write and no pagination to follow.
        return;
    }

    foreach ($items as $item) {
        $link = $item->find('p.product-name > a', 0);
        $prodname = trim($link->innertext);
        $prodtype = is_null($item->find('p.minimal-price', 0)) ? 0 : 1;
        fputcsv($o, array($prodname, $prodtype, $cat, $link->href));
        echo $prodname . "\n";
    }

    $next = $page->find('p.next', 0);
    if (!is_null($next)) {
        getProducts($next->href, $cat);
    }
}
Ejemplo n.º 24
0
 /**
  * Parses an HTML fragment and returns the child nodes of its <html> wrapper.
  *
  * The fragment is wrapped in <html><body>…</body></html> before parsing.
  *
  * @param string $html HTML fragment.
  * @return array Children of the wrapper's html element, returned by reference.
  */
 private function &_getHtmlDomArray($html)
 {
     $wrapped = '<html><body>' . $html . '</body></html>';
     $parser = new simple_html_dom();
     $parser->load($wrapped);
     $root = $parser->find('html', 0);
     // A variable is required here because the method returns by reference.
     $children = $root->children();
     return $children;
 }
Ejemplo n.º 25
0
 /**
  * Parses the form's print model, caches its elements and stores the result.
  *
  * The "ic" elements of the model are handed to refactor() together with the
  * item configuration; the resulting markup is saved as "printmodelshort" and,
  * when the item counter differs from $this->itemmax, as "itemmax" too.
  *
  * @param bool $isUpdate When true, data-id attributes are stripped from the
  *                       model first and the item counter starts at 0 instead
  *                       of the current itemmax.
  */
 public function parse($isUpdate = false)
 {
     Ibos::import("application.extensions.simple_html_dom", true);

     $max = $isUpdate ? 0 : intval($this->itemmax);
     $model = $isUpdate
         ? preg_replace("/\\s+data-id\\s?=\\s?\"?\\d+\"?/i", "", $this->printmodel)
         : $this->printmodel;

     $elements = array();
     $doc = new simple_html_dom();
     // NOTE(review): CHARSET is passed as the fourth argument of load();
     // confirm the bundled parser expects a charset in that position.
     $doc->load($model, true, true, CHARSET);
     $items = $doc->find("ic");
     $config = $this->getItemConfig();
     if (!empty($items) && !empty($config)) {
         $this->refactor($items, $config, $max, $elements);
     }

     $html = $doc->save();
     $this->_cache = $elements;
     CacheUtil::set("form_" . $this->ID, $elements);

     $form = array("printmodelshort" => $html);
     if ($max != $this->itemmax) {
         $form["itemmax"] = $max;
     }

     $doc->clear();
     FlowFormType::model()->modify($this->ID, $form);
 }
/**
 * Scrapes one page of the yellowpages.co.id browse-by-letter listing and
 * echoes every "ul.directory-list" element found on it.
 *
 * When $last_alphabet or $last_page is null or an empty string, the value is
 * read from the ScraperWiki variables 'last_alphabet_loaded' and
 * 'last_page_loaded'; if the stored value is null, 'a' and 1 are used.
 *
 * @param string     $last_alphabet Letter to browse
 * @param string|int $last_page     Page number to request
 * @return void
 */
function scrap_yp($last_alphabet = '', $last_page = '')
{
    if (is_null($last_alphabet) || $last_alphabet == '') {
        $stored_alphabet = scraperwiki::get_var('last_alphabet_loaded');
        $last_alphabet = is_null($stored_alphabet) ? 'a' : $stored_alphabet;
    }
    if (is_null($last_page) || $last_page == '') {
        $stored_page = scraperwiki::get_var('last_page_loaded');
        $last_page = is_null($stored_page) ? 1 : $stored_page;
    }
    $yp_base_url = 'http://www.yellowpages.co.id/browse/letter/' . $last_alphabet . '?page=' . $last_page;
    $html = scraperWiki::scrape($yp_base_url);
    $dom = new simple_html_dom();
    $dom->load($html);
    foreach ($dom->find("ul.directory-list") as $data) {
        echo $data;
    }
    // Release the parser's node graph once the output has been written
    $dom->clear();
}
Ejemplo n.º 27
0
/**
 * Scrapes the flight table for one listing type and saves every flight row.
 *
 * @param string $param Value sent as the "type" query parameter of BASE_URL
 * @param string $type  Value stored as "flight_type" on each saved record
 * @return void
 */
function scrapeHTML($param, $type)
{
    $page = scraperWiki::scrape(BASE_URL . "?type={$param}");
    $dom = new simple_html_dom();
    $dom->load($page);

    // Record fields in the order their cells appear in a table row
    $columns = array("airline", "flight_num", "destination", "time", "gate", "remarks");

    foreach ($dom->find("TR[@HEIGHT='25']") as $row) {
        $cells = $row->find("td");
        $values = array();
        foreach ($columns as $position => $column) {
            $values[$column] = removeSpaces($cells[$position]->plaintext);
        }

        // The header row carries the column title instead of an airline name
        if ($values["airline"] == "Airline") {
            continue;
        }

        $record = array(
            "date" => date("m.d.y"),
            "airline" => $values["airline"],
            "flight_type" => $type,
            "flight_num" => $values["flight_num"],
            "destination" => $values["destination"],
            "time" => $values["time"],
            "gate" => $values["gate"],
            "remarks" => $values["remarks"],
        );
        // The first argument lists the fields passed to saveData() as the record's keys
        saveData(array("date", "airline", "flight_type", "flight_num"), $record);
    }
    $dom->clear();
}
Ejemplo n.º 28
0
 /**
  * Extracts the result items from a scraped search-results page.
  *
  * Every <li> matched by li[class=s-result-item] yields one record holding
  * whichever of 'image', 'title', 'price' and 'numPrice' were found in it,
  * plus 'totalPage' when the page contains a span[class=pagnDisabled].
  * Items that produce no fields at all are left out. The result is also
  * stored in $this->aResult.
  *
  * @param string $scrappedData Raw HTML of the results page
  * @return array Records keyed by the position of the item in the result list
  */
 private function parsing($scrappedData)
 {
     $result = [];
     //Create a DOM parser object
     $html = new simple_html_dom();
     //Parse the scraped HTML
     $html->load($scrappedData);

     // The page count is searched for in the whole document, not inside a
     // result item, so look it up once; the last matching span wins.
     $totalPage = null;
     foreach ($html->find('span[class=pagnDisabled]') as $maxPage) {
         $totalPage = $maxPage->innertext();
     }

     foreach ($html->find('li[class=s-result-item]') as $key => $innerData) {
         // Start from an empty record so that a field missing from this item
         // is not inherited from the previous one.
         $atmp = [];
         //image
         foreach ($innerData->find('img[class=s-access-image]') as $img) {
             $atmp['image'] = $img->getAttribute('src');
         }
         //title
         foreach ($innerData->find('h2[class=s-access-title]') as $title) {
             $atmp['title'] = $title->innertext();
         }
         //price
         foreach ($innerData->find('span[class=s-price]') as $price) {
             $priceText = $price->innertext();
             $atmp['price'] = $priceText;
             // Drop the first byte and the thousands separators
             // NOTE(review): assumes a one-byte currency symbol leads the price - confirm
             $atmp['numPrice'] = str_replace(",", '', substr($priceText, 1));
         }
         //total page
         if ($totalPage !== null) {
             $atmp['totalPage'] = $totalPage;
         }
         if (!empty($atmp)) {
             $result[$key] = $atmp;
         }
     }
     // Only strings were copied out of the DOM, so it can be released now
     $html->clear();
     return $this->aResult = $result;
 }
Ejemplo n.º 29
0
 /**
  * Fetches a URL with cURL and loads the response into the HTML parser.
  *
  * @param string $fetchURL URL to request; when it is not set, the existing
  *                         cURL handle in $this->_ch is used as-is
  * @param bool   $usePost  Send $this->_postParams as a POST request when strictly true
  *
  * @return bool False when curl_exec() yields a falsy result, true once the
  *              response has been loaded into $this->_html
  */
 private function getUrl($fetchURL, $usePost = false)
 {
     if (isset($fetchURL)) {
         $this->_ch = curl_init($fetchURL);
     }
     $handle = $this->_ch;

     if ($usePost === true) {
         curl_setopt($handle, CURLOPT_POST, 1);
         curl_setopt($handle, CURLOPT_POSTFIELDS, $this->_postParams);
     }

     // Options applied to every request, in this order
     $fixedOptions = array(
         CURLOPT_RETURNTRANSFER => 1,
         CURLOPT_HEADER => 0,
         CURLOPT_VERBOSE => 0,
         CURLOPT_USERAGENT => "Firefox/2.0.0.1",
         CURLOPT_FAILONERROR => 1,
     );
     foreach ($fixedOptions as $option => $value) {
         curl_setopt($handle, $option, $value);
     }

     // The same cookie file is used for both reading and writing cookies
     if (isset($this->cookie)) {
         foreach (array(CURLOPT_COOKIEJAR, CURLOPT_COOKIEFILE) as $cookieOption) {
             curl_setopt($handle, $cookieOption, $this->cookie);
         }
     }
     curl_setopt_array($handle, Misc::curlSslContextOptions());

     $this->_response = curl_exec($handle);
     // The handle is closed on both the failure and the success path
     curl_close($handle);
     if (!$this->_response) {
         return false;
     }
     $this->_html->load($this->_response);
     return true;
 }
Ejemplo n.º 30
0
 /**
  * Converts an HTML fragment into a Word 2007 document and writes it to disk.
  *
  * The document is written to "<dir>wordtemp/<unix timestamp>.docx"; when the
  * "wordtemp/" folder is missing, $this->createFolders() is called for it first.
  *
  * @param string $html HTML fragment to convert
  * @param string $dir  Base directory; "wordtemp/" is appended directly, so it
  *                     is expected to end with a path separator
  * @return string Path of the written .docx file
  */
 public function save($html, $dir)
 {
     import("@.ORG.htmltodocx.documentation.support_functions");
     $phpword_object = new PHPWord();
     $section = $phpword_object->createSection();
     // HTML Dom object:
     $html_dom = new simple_html_dom();
     // Note, the html is nested in a couple of dummy elements.
     $html_dom->load('<html><body>' . $html . '</body></html>');
     // Create the dom array of elements which we are going to work on:
     $html_dom_array = $html_dom->find('html', 0)->children();
     // We need this for setting base_root and base_path in the initial_state array
     // (below). We are using a function here (derived from Drupal) to create these
     // paths automatically - you may want to do something different in your
     // implementation. This function is in the included file
     // documentation/support_functions.inc.
     $paths = htmltodocx_paths();
     // Provide some initial settings:
     $initial_state = array(
         'phpword_object' => &$phpword_object,
         'base_root' => $paths['base_root'],
         'base_path' => $paths['base_path'],
         'current_style' => array('size' => '11'),
         'parents' => array(0 => 'body'),
         'list_depth' => 0,
         'context' => 'section',
         'pseudo_list' => TRUE,
         'pseudo_list_indicator_font_name' => 'Wingdings',
         'pseudo_list_indicator_font_size' => '7',
         'pseudo_list_indicator_character' => 'l ',
         'table_allowed' => TRUE,
         'treat_div_as_paragraph' => TRUE,
         'style_sheet' => htmltodocx_styles_example(),
     );
     // Convert the HTML and put it into the PHPWord object
     htmltodocx_insert_html($section, $html_dom_array[0]->nodes, $initial_state);
     // Clear the HTML dom object:
     $html_dom->clear();
     unset($html_dom);
     // Save File
     $target_dir = $dir . "wordtemp/";
     $h2d_file_uri = $target_dir . time() . ".docx";
     // Check whether the target folder exists before writing into it
     if (!file_exists($target_dir)) {
         $this->createFolders($target_dir);
     }
     $objWriter = PHPWord_IOFactory::createWriter($phpword_object, 'Word2007');
     $objWriter->save($h2d_file_uri);
     return $h2d_file_uri;
 }