These examples show PHP's simple_html_dom::load() in use. Two notes carried over from the library's changelog: $size was changed from protected to public so it can be accessed easily, and a ForceTagsClosed flag was added to the constructor to indicate whether the input HTML is trusted to have properly closed tags; by default it is NOT trusted.
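A minimal sketch of those constructor flags in practice, assuming the argument order of simple_html_dom 1.5 ($str, $lowercase, $forceTagsClosed, ...); forks vary, so check the bundled copy:

 // Sketch only, not from the examples below.
 $dom = new simple_html_dom(null, true, false); // trusted markup: don't force tags closed
 $dom->load('<div id="a"><p>hello</p></div>');
 echo $dom->find('div#a p', 0)->plaintext;      // "hello"
 $dom->clear();                                 // release internal node references
 unset($dom);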
Example #1
 function handleDocumentInfo($DocInfo)
 {
     echo "Page requested: " . $DocInfo->url . " (" . $DocInfo->http_status_code . ")" . PHP_EOL;
     if ($DocInfo->http_status_code == '200' and $DocInfo->received and $DocInfo->content_type == 'text/html' and isset($DocInfo->content)) {
         $html = $DocInfo->content;
         $host = $DocInfo->host;
         $urlPosted = $DocInfo->url;
         $htmldom = new simple_html_dom();
         $htmldom->load($html);
         $data = array();
         $images = $htmldom->find('ul#ListViewInner li img');
         echo 'Total images: ' . count($images) . PHP_EOL;
         $i = 0;
         foreach ($images as $raw_links) {
             echo $raw_links->alt . "','1','" . $raw_links->src . "\n";
             $filename = '_e__' . $i . '.jpg';
             // request the 900px eBay image in place of the 225px thumbnail
             copy(str_replace("l225", "l900", $raw_links->src), 'downloads/' . $filename);
             $i++;
         }
         // note: $data is never populated in this variant (compare Example #14)
         echo json_encode($data) . PHP_EOL;
         //$writer->writeRow(json_encode($data));
         unset($data);
         unset($htmldom);
     }
 }
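Examples #1 and #14 read like handleDocumentInfo() overrides from the PHPCrawl library. A minimal harness under that assumption (PHPCrawler and its methods come from PHPCrawl, not from the example above):

 // Hypothetical wiring, assuming the PHPCrawl library is installed.
 class ImageCrawler extends PHPCrawler
 {
     function handleDocumentInfo($DocInfo)
     {
         // ... body as in Example #1 ...
     }
 }
 $crawler = new ImageCrawler();
 $crawler->setURL('http://www.example.com/');
 $crawler->addContentTypeReceiveRule('#text/html#'); // receive content only for HTML pages
 $crawler->go();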
Example #2
 public function parse($isUpdate = false)
 {
     Ibos::import("application.extensions.simple_html_dom", true);
     if ($isUpdate) {
         $model = preg_replace("/\\s+data-id\\s?=\\s?\"?\\d+\"?/i", "", $this->printmodel);
         $max = 0;
     } else {
         $model = $this->printmodel;
         $max = intval($this->itemmax);
     }
     $elements = array();
     $doc = new simple_html_dom();
     $doc->load($model, true, true, CHARSET);
     $items = $doc->find("ic");
     $config = $this->getItemConfig();
     if (!empty($items) && !empty($config)) {
         $this->refactor($items, $config, $max, $elements);
     }
     $html = $doc->save();
     $this->_cache = $elements;
     CacheUtil::set("form_" . $this->ID, $elements);
     $form["printmodelshort"] = $html;
     if ($max != $this->itemmax) {
         $form["itemmax"] = $max;
     }
     $doc->clear();
     FlowFormType::model()->modify($this->ID, $form);
 }
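parse() above edits the template in place: find() returns live nodes, and save() serializes the modified DOM back to a string. A stripped-down sketch of that round-trip on made-up markup:

 // Sketch of the find/modify/save round-trip parse() relies on.
 $doc = new simple_html_dom();
 $doc->load('<p><ic>first</ic> and <ic>second</ic></p>');
 foreach ($doc->find('ic') as $i => $item) {
     // assigning outertext swaps the whole node in the serialized output
     $item->outertext = '<span data-id="' . $i . '">' . $item->innertext . '</span>';
 }
 $html = $doc->save(); // serialized DOM, replacements included
 $doc->clear();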
Example #3
function run_ml($q_num = 0)
{
    $html = scraperWiki::scrape("http://musiklegal.com/search/result/a/" . $q_num);
    $dom = new simple_html_dom();
    $dom->load($html);
    foreach ($dom->find("tr") as $data) {
        $tds = $data->find("td");
        $temp_data = explode('">', str_replace('</a>', '', str_replace('<a href="http://musiklegal.com/song/detail/', '', $tds[1]->plaintext)));
        $record = array('No' => str_replace('.', '', $tds[0]->plaintext), 'Code' => $temp_data[0], 'Song Title' => $temp_data[1], 'Artist' => $tds[2]->plaintext, 'Album' => $tds[3]->plaintext);
        /*
         *  Stores results
         */
        scraperwiki::save_sqlite(array("No"), $record);
        unset($temp_data);
    }
    foreach ($dom->find("a") as $a) {
        if ($a->plaintext == 'Next') {
            $tmp_a = $a->href;
            $tmp_a = str_replace('http://musiklegal.com/search/result/a/', '', $tmp_a);
            if ($tmp_a > 0) {
                break; // found the next page number; stop scanning links
            }
        }
    }
    if ((int) $tmp_a != 0) {
        run_ml($tmp_a);
    } else {
        exit;
    }
}
Example #4
 /**
  * Collect the video URLs from the page
  *
  * @param  simple_html_dom $html
  * @return array
  **/
 public function getMoviesUrl($html)
 {
     $query = 'div.entryBody div.topmore a img';
     $movies_els = $html->find($query);
     $movie_data = array();
     $manager = new UriManager();
     // Grab the link behind the "video here" (動画はこちら) text
     foreach ($movies_els as $movies_el) {
         if (!preg_match('/^動画.+/', $movies_el->getAttribute('alt'))) {
             continue;
         }
         // Get the link from the parent <a> tag
         $parent_el = $next_el = $movies_el->parentNode();
         $i = 0;
         while ($i < 3) {
             $next_el = $next_el->nextSibling();
             if (is_null($next_el)) {
                 break;
             }
             $i++;
         }
         if (!is_null($next_el) && $next_el->nodeName() == 'span') {
             $movie_data = [];
             break;
         }
         if ($parent_el->nodeName() == 'a') {
             $movie_data[] = $manager->resolve($parent_el->getAttribute('href'));
         }
     }
     return $movie_data;
 }
Example #5
function getProducts($u, $cat)
{
    global $o; // CSV output handle opened by the caller
    $d = new simple_html_dom();
    $d->load(scraperwiki::scrape($u));
    //echo "Loaded URL: " . $u . "\n";
    $items = $d->find('li.grid-item');
    if (count($items) > 0) {
        foreach ($items as $p) {
            $prod = $p->find('p.product-name > a', 0);
            $prodname = trim($prod->innertext);
            $prodURL = $prod->href;
            if (!is_null($p->find('p.minimal-price', 0))) {
                $prodtype = 1;
            } else {
                $prodtype = 0;
            }
            fputcsv($o, array($prodname, $prodtype, $cat, $prodURL));
            echo $prodname . "\n";
        }
        if (!is_null($d->find('p.next', 0))) {
            getProducts($d->find('p.next', 0)->href, $cat);
        }
    }
}
Example #6
 /**
  * Fetch the lottery draw data
  */
 private function get_data()
 {
     include_once 'simplehtmldom_1_5/simple_html_dom.php';
     $simple_html_dom = new \simple_html_dom();
     // Decompress with zlib and convert the encoding
     $data = false;
     $data = @file_get_contents("compress.zlib://" . self::URL);
     if (!$data) {
         $this->setLog(false, '重庆时时彩-开奖数据抓取失败'); // "CQSSC: draw-data fetch failed"
         exit('重庆时时彩-数据抓取失败,请尽快联系网站管理员' . "\r\n"); // "fetch failed, please contact the site admin"
     }
     // Convert to UTF-8
     $encode = mb_detect_encoding($data, array('ASCII', 'UTF-8', 'GB2312', "GBK", 'BIG5'));
     $content = iconv($encode, 'UTF-8', $data);
     $simple_html_dom->load($content);
     // Draw number
     $qihao = $simple_html_dom->find('div[class=aside]', 0)->find('h3', 0)->find('em', 0)->plaintext;
     // Winning numbers
     $code = $simple_html_dom->find('div[class=aside]', 0)->find('div[class=mod-aside mod-aside-xssckj]', 0)->find('div[class=bd]', 0)->find('div[class=kpkjcode]', 0)->find('table', 0)->find('tr', 1)->find('td', 1)->plaintext;
     if ($code == '--') {
         exit('重庆时时彩-等待开奖...' . "\r\n"); // "waiting for the draw..."
     }
     $isKaiJiang = $simple_html_dom->find('div[class=aside]', 0)->find('div[class=mod-aside mod-aside-xssckj]', 0)->find('div[class=bd]', 0)->find('div[class=kpkjcode]', 0)->find('table', 0)->find('tr', 1)->find('td', 2)->plaintext;
     if ($isKaiJiang == '--' || $isKaiJiang == '开奖中') { // '开奖中' = "drawing in progress"
         exit('重庆时时彩-等待开奖...' . "\r\n"); // "waiting for the draw..."
     }
     $simple_html_dom->clear();
     // Remove the spaces inside the winning-number string
     $code = str_replace(" ", '', $code);
     // Draw time
     $kjsj = date('Y-m-d H:i:s');
     $this->data = ['qihao' => $qihao, 'kjsj' => $kjsj, 'code' => $code];
 }
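The detect-then-convert step above is a reusable pattern; a small helper along the same lines (a sketch using the same mb_detect_encoding/iconv calls, not part of the original class):

 // Sketch: normalize a raw byte string to UTF-8 before parsing.
 function load_utf8(simple_html_dom $dom, $raw)
 {
     $encode = mb_detect_encoding($raw, array('ASCII', 'UTF-8', 'GB2312', 'GBK', 'BIG5'));
     if ($encode !== false && $encode !== 'UTF-8') {
         $raw = iconv($encode, 'UTF-8', $raw); // convert to UTF-8
     }
     return $dom->load($raw);
 }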
Example #7
 /**
  * Collect the video URLs from the page
  *
  * @param  simple_html_dom $html
  * @return array
  **/
 public function getMoviesUrl($html)
 {
     $query = 'div.ently_body div.ently_text div.video-container iframe';
     $movies_els = $html->find($query);
     $movie_data = array();
     $manager = new UriManager();
     // Grab the links behind the "video here" (動画はこちら) text
     foreach ($movies_els as $movies_el) {
         if ($movies_el->hasAttribute('src')) {
             $url = $manager->resolve($movies_el->getAttribute('src'));
             if ($url !== false) {
                 $movie_data[] = $url;
             }
         }
     }
     $query = 'div.ently_outline div.ently_body a';
     $movies_els = $html->find($query);
     foreach ($movies_els as $movies_el) {
         $text = $movies_el->plaintext;
         if (preg_match('/リンク(/', $text) && $movies_el->hasAttribute('href')) {
             $resolve_url = $manager->resolve($movies_el->getAttribute('href'));
             if ($resolve_url) {
                 $movie_data[] = $resolve_url;
             }
         }
     }
     return $movie_data;
 }
Example #8
function do_day($rec)
{
    $html = scraperwiki::scrape($rec['url']);
    $dom = new simple_html_dom();
    $dom->load($html);
    $cell = $dom->find('a[name=discs]');
    $lines = $cell[0]->parent->find('text');
    print $lines[10] . "\n";
    print count($lines) . "\n";
    # loop by number, as null lines stop a foreach
    $n = 0;
    for ($line_no = 0; $line_no < count($lines); $line_no++) {
        $line = $lines[$line_no];
        if (strlen($line) == 3) {
            # the DOM object crashes on this row, so ignore
            continue;
        }
        #if (preg_match("#^" . $n . "#", $line, $matches)) {
        print $line_no . " " . strlen($line) . "\n";
        $n = $n + 1;
        print $line . "\n";
        #}
    }
    #scraperwiki::save(array('data'), array('data' => $data->plaintext));
}
Example #9
 /**
  * Compile a template file by reading it, converting the DOM using
  * {@see convert()}, then applying macros using {@see transform()}.
  * @param string $template Template file path.
  * @return string PHP template content. 
  * @throws InvalidTemplateException If template is inaccessible or invalid.
  */
 public function compile($template)
 {
     $dom = new \simple_html_dom();
     $this->currentTemplate = $template;
     $file = file_get_contents($template);
     if ($file === false) {
         throw new InvalidTemplateException(tr('Could not read template: %1', $template));
     }
     if (!$dom->load($file, true, false)) {
         throw new InvalidTemplateException(tr('Could not parse template: %1', $template));
     }
     $root = new InternalNode();
     $main = $dom->find('[j:main]', 0);
     if (isset($main)) {
         $root->append($this->convert($main));
     } else {
         foreach ($dom->find('*, text') as $html) {
             if ($html->parent->tag != 'root') {
                 continue;
             }
             $root->append($this->convert($html));
         }
     }
     $this->transform($root);
     return $root->__toString();
 }
Example #10
function scrapPage($page)
{
    print "Scraping page " . $page;
    $url = "http://www.geipan.fr/index.php?id=202";
    $fields_string = "&no_cache=1&" . "tx_geipansearch_pi1%5Bsubmit_form%5D=1&" . "tx_geipansearch_pi1%5Btexte_resume%5D=&" . "tx_geipansearch_pi1%5Bdate_debut%5D=&" . "tx_geipansearch_pi1%5Bdate_fin%5D=&" . "no_cache=1&" . "tx_geipansearch_pi1%5Bclasse_cas%5D=tous&" . "tx_geipansearch_pi1%5Bregion%5D=&" . "page=" . $page . "&" . "order_by=&" . "sens=";
    $curl = curl_init($url);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($curl, CURLOPT_MAXREDIRS, 10);
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($curl, CURLOPT_TIMEOUT, 20);
    curl_setopt($curl, CURLOPT_POST, true); // CURLOPT_POST takes a boolean
    curl_setopt($curl, CURLOPT_POSTFIELDS, $fields_string);
    $html = curl_exec($curl);
    print curl_error($curl) . "\n";
    //      print($html);
    $dom = new simple_html_dom();
    $dom->load($html);
    $trs = $dom->find("tr");
    foreach ($trs as $tr) {
        if (isset($tr->attr['onclick'])) {
            $ID = substr($tr->attr['onclick'], strpos($tr->attr['onclick'], "cas=") + 4, 13);
            print $ID . "\n";
            $tds = $tr->find("td");
            $title = utf8_encode($tds[0]->plaintext);
            $date = $tds[1]->plaintext;
            $departement = utf8_encode($tds[2]->plaintext);
            $classe = $tds[3]->plaintext;
            $maj = $tds[4]->plaintext;
            $city = substr($title, 0, strpos($title, "(") - 1);
            $record = array('ID' => $ID, 'title' => $title, 'date' => $date, 'departement' => $departement, 'classe' => $classe, 'maj' => $maj, 'city' => $city);
            scraperwiki::save(array('ID', 'maj'), $record);
        }
    }
}
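Hand-assembling $fields_string is easy to get wrong; http_build_query() produces the same application/x-www-form-urlencoded body. A sketch of the equivalent setup, with the field list trimmed for brevity:

// Sketch: the same POST body via http_build_query instead of manual concatenation.
$fields = array(
    'no_cache' => 1,
    'tx_geipansearch_pi1[submit_form]' => 1,
    'tx_geipansearch_pi1[texte_resume]' => '',
    'tx_geipansearch_pi1[classe_cas]' => 'tous',
    'page' => $page,
);
curl_setopt($curl, CURLOPT_POST, true);
curl_setopt($curl, CURLOPT_POSTFIELDS, http_build_query($fields));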
Example #11
 public function getSearchResults(simple_html_dom $dom)
 {
     $result = array();
     $count = count($dom->find('div.srg'));
     if ($count) {
         // div.srg found. On the first results page there are two such divs;
         // the first holds irrelevant links, so use the second there.
         $c = $count > 1 ? 1 : 0;
         $d = $dom->find('div.srg', $c);
     } else {
         // no div.srg found; search the whole page
         $d = $dom;
     }
     foreach ($d->find('div.rc') as $div) {
         $a = $div->find('h3.r a', 0);
         // get link to the website
         //Get original image url
         $originalImg = $div->find('div.th a', 0);
         preg_match('/imgurl=(.+?)&/', $originalImg->href, $matches);
         $result[] = array(htmlspecialchars_decode($a->plaintext, ENT_QUOTES), $a->href, $matches[1]);
     }
     return $result;
 }
Example #12
function scrape_page()
{
    $row = 0;
    $html = scraperWiki::scrape("http://asuntojen.hintatiedot.fi/haku/?c=" . $GLOBALS['c'] . "&s=" . $GLOBALS['s'] . "&r=" . $GLOBALS['r'] . "&amin=" . $GLOBALS['amin'] . "&amax=" . $GLOBALS['amax'] . "&z=" . $GLOBALS['z']);
    $dom = new simple_html_dom();
    $dom->load($html);
    foreach ($dom->find("tr") as $data) {
        $tds = $data->find("td");
        if (count($tds) > 8) {
            $row++;
            $GLOBALS['rowTotal']++;
            $apt = array("Uniikkiavain" => $GLOBALS['rowTotal'], "Kaupunginosa" => $tds[0]->plaintext, "Myyntihinta" => $tds[3]->plaintext, "Neliohinta" => $tds[4]->plaintext, "Tyyppi" => $tds[1]->plaintext, "Koko" => $tds[2]->plaintext);
            scraperwiki::save_sqlite(null, $apt, $table_name = $GLOBALS['c'] . " " . $GLOBALS['time']);
            print $GLOBALS['rowTotal'] . "\n";
            print $row . ". Sijainti: " . $tds[0]->plaintext . " Hinta: " . $tds[3]->plaintext . " Tyyppi: " . $tds[1]->plaintext . " Koko: " . $tds[2]->plaintext . " Neliöhinta: " . $tds[4]->plaintext . "€" . "\n";
        }
    }
    if ($row == 50) {
        print "Vielä jatkuu, haetaan seuraava sivu..." . "\n";
        $GLOBALS['z']++;
        scrape_page();
    } else {
        print "Skrääpiminen suoritettu." . "\n";
        print "Sivuja yhteensä: " . $GLOBALS['z'] . "\n";
        print "Rivejä yhteensä: " . $GLOBALS['rowTotal'] . "\n";
    }
}
Example #13
 public function save($html, $dir)
 {
     import("@.ORG.htmltodocx.documentation.support_functions");
     $phpword_object = new PHPWord();
     $section = $phpword_object->createSection();
     // HTML Dom object:
     $html_dom = new simple_html_dom();
     $html_dom->load('<html><body>' . $html . '</body></html>');
     // Note, we needed to nest the html in a couple of dummy elements.
     // Create the dom array of elements which we are going to work on:
     $html_dom_array = $html_dom->find('html', 0)->children();
     // We need this for setting base_root and base_path in the initial_state array
     // (below). We are using a function here (derived from Drupal) to create these
     // paths automatically - you may want to do something different in your
     // implementation. This function is in the included file
     // documentation/support_functions.inc.
     $paths = htmltodocx_paths();
     // Provide some initial settings:
     $initial_state = array('phpword_object' => &$phpword_object, 'base_root' => $paths['base_root'], 'base_path' => $paths['base_path'], 'current_style' => array('size' => '11'), 'parents' => array(0 => 'body'), 'list_depth' => 0, 'context' => 'section', 'pseudo_list' => TRUE, 'pseudo_list_indicator_font_name' => 'Wingdings', 'pseudo_list_indicator_font_size' => '7', 'pseudo_list_indicator_character' => 'l ', 'table_allowed' => TRUE, 'treat_div_as_paragraph' => TRUE, 'style_sheet' => htmltodocx_styles_example());
     // Convert the HTML and put it into the PHPWord object
     htmltodocx_insert_html($section, $html_dom_array[0]->nodes, $initial_state);
     // Clear the HTML dom object:
     $html_dom->clear();
     unset($html_dom);
     // Save File
     $h2d_file_uri = $dir . "wordtemp/" . time() . ".docx";
     if (!file_exists($dir . "wordtemp/")) {
         $this->createFolders($dir . "wordtemp/");
         // the target folder did not exist; create it
     }
     $objWriter = PHPWord_IOFactory::createWriter($phpword_object, 'Word2007');
     $objWriter->save($h2d_file_uri);
     return $h2d_file_uri;
 }
Example #14
 function handleDocumentInfo($DocInfo)
 {
     //	global $writer;
     echo "Page requested: " . $DocInfo->url . " (" . $DocInfo->http_status_code . ")" . PHP_EOL;
     if ($DocInfo->http_status_code == '200' and $DocInfo->received and $DocInfo->content_type == 'text/html' and isset($DocInfo->content)) {
         $html = $DocInfo->content;
         $host = $DocInfo->host;
         $urlPosted = $DocInfo->url;
         $htmldom = new simple_html_dom();
         $htmldom->load($html);
         $data = array();
         $images = $htmldom->find('.pinHolder img');
         echo 'Total Images ' . count($images) . PHP_EOL;
         $i = intval(date("YmdHis")); // seed the filename counter with a timestamp
         foreach ($images as $raw_links) {
             $data['items'][] = array("title" => $raw_links->alt, "img" => $raw_links->src);
             copy($raw_links->src, 'downloads/' . $i . '.jpg');
             $i++;
         }
         echo json_encode($data) . PHP_EOL;
         //$writer->writeRow(json_encode($data));
         unset($data);
         unset($htmldom);
     }
 }
Example #15
 private function parsing($scrappedData)
 {
     $result = [];
     //Create a DOM parser object
     $html = new simple_html_dom();
     //Parse the HTML from Amazon.
     $html->load($scrappedData);
     # Iterate over all the <li> result tags
     foreach ($html->find('li[class=s-result-item]') as $key => $innerData) {
         $atmp = array(); // reset per item so stale fields don't leak between results
         //image
         foreach ($innerData->find('img[class=s-access-image]') as $img) {
             $atmp['image'] = $img->getAttribute('src');
         }
         //title
         foreach ($innerData->find('h2[class=s-access-title]') as $title) {
             $atmp['title'] = $title->innertext();
         }
         //price
         foreach ($innerData->find('span[class=s-price]') as $price) {
             $price = $price->innertext();
             $atmp['price'] = $price;
             $atmp['numPrice'] = str_replace(",", '', substr($price, 1));
         }
         //total page
         foreach ($html->find('span[class=pagnDisabled]') as $maxPage) {
             $atmp['totalPage'] = $maxPage->innertext();
         }
         # Keep whatever fields were extracted for this item
         if (!empty($atmp)) {
             $result[$key] = $atmp;
         }
     }
     return $this->aResult = $result;
 }
Example #16
function scrapeTEDRSS($url, $sector)
{
    print $url . " " . $sector . "\n";
    // $xml = scraperWiki::scrape($url);
    $curl = curl_init($url);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($curl, CURLOPT_MAXREDIRS, 10);
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($curl, CURLOPT_TIMEOUT, 20);
    // 20 seconds before aborting.
    // Alternatives: CURLOPT_CONNECTTIMEOUT (in seconds), or
    // CURLOPT_LOW_SPEED_LIMIT to define what "slow" is, combined with
    // curl_setopt($curl, CURLOPT_LOW_SPEED_TIME, 10); // 10 seconds at low speed before aborting
    $xml = curl_exec($curl);
    print curl_error($curl) . "\n";
    $dom = new simple_html_dom();
    $dom->load($xml);
    $items = $dom->find("item");
    foreach ($items as $item) {
        $guid = $item->find("guid");
        $noticeURL = str_replace("TEXT", "DATA", $guid[0]->plaintext);
        print $noticeURL . " " . $sector . " " . memory_get_usage() / 1000000 . "MB";
        echo "\n";
        // $record = scrapeTEDDataPage ($noticeURL, $sector);
        $record = array('time' => microtime(true), 'sector' => $sector, 'url' => $noticeURL);
        scraperwiki::save(array('sector', 'url'), $record);
        sleep(1);
    }
    $dom->__destruct();
    unset($items);
    unset($dom);
    unset($xml);
    print memory_get_usage() / 1024 / 1024 . "MB\n";
}
Example #17
 protected function parsing($input)
 {
     include_once "inc/simple_html_dom.php";
     # Create a DOM parser object
     $html = new simple_html_dom();
     # Parse the HTML from Amazon.
     $html->load($input);
     $result = [];
     # Iterate over all the <li> result tags
     foreach ($html->find('li[class=s-result-item]') as $key => $innerData) {
         $atmp = array(); // reset per item so stale fields don't leak between results
         //image
         foreach ($innerData->find('img[class=s-access-image]') as $img) {
             $atmp['image'] = $img->getAttribute('src');
         }
         //title
         foreach ($innerData->find('h2[class=s-access-title]') as $title) {
             $atmp['title'] = $title->innertext();
         }
         //price
         foreach ($innerData->find('span[class=s-price]') as $price) {
             $price = $price->innertext();
             $atmp['price'] = $price;
             $atmp['numPrice'] = str_replace(",", '', substr($price, 1));
         }
         # Keep whatever fields were extracted for this item
         if (!empty($atmp)) {
             $result[$key] = $atmp;
         }
     }
     if (!empty($result)) {
         return $this->aResult = $result;
     }
 }
Example #18
function grep_munich($url, $table_name)
{
    $html = scraperWiki::scrape($url);
    $count = 0;
    # Use the PHP Simple HTML DOM Parser to extract <td> tags
    $dom = new simple_html_dom();
    $dom->load($html);
    // Drop all old information by dropping the table
    scraperwiki::sqliteexecute("drop table if exists " . $table_name);
    scraperwiki::sqlitecommit();
    $table = $dom->getElementById('flight_info_area');
    foreach ($table->find('tr') as $data) {
        // Flight details. Read tds or ths
        $tds = $data->find("td");
        // If there are fewer than 7 columns, skip to the next row
        if (sizeof($tds) < 7) {
            continue;
        }
        //print $data->plaintext . "\n";
        $flightnr = $tds[1]->plaintext;
        $from = $tds[2]->plaintext;
        $time = $tds[3]->plaintext;
        $expected_time = $tds[4]->plaintext;
        //Create date
        $date = date("Y-m-d");
        // Build an array of flight information
        $flight_data = array("date" => $date, "count" => $count, "flightnr" => $flightnr, "from" => $from, "time" => $time, "expected_time" => $expected_time);
        // Save the information for one flight
        scraperwiki::save_sqlite(array("date", "count"), $flight_data, $table_name);
        $count = $count + 1;
    }
}
Example #19
function scrapeHTML($param, $type)
{
    $html = scraperWiki::scrape("http://www.norwegian.no/fly/lavpris/?D_City=CPH&A_City=DUB&TripType=2&D_Day=1&D_Month=201104&R_Day=1&R_Month=201104&AdultCount=1&ChildCount=0&InfantCount=0");
    $dom = new simple_html_dom();
    $dom->load($html);
    // Iterate over table rows and get flight details.
    foreach ($dom->find("TR[@HEIGHT='25']") as $data) {
        // Flight details.
        $tds = $data->find("div");
        $airline = removeSpaces($tds[0]->plaintext);
        $flight_type = $type;
        $flight_num = removeSpaces($tds[1]->plaintext);
        $destination = removeSpaces($tds[2]->plaintext);
        $time = removeSpaces($tds[3]->plaintext);
        $gate = removeSpaces($tds[4]->plaintext);
        $remarks = removeSpaces($tds[5]->plaintext);
        // Skip header row. Cheesy, but effective.
        if ($airline == "Airline") {
            continue;
        }
        // Set the date.
        $date = date("m.d.y");
        // Build up record to store.
        $flight_data = array("date" => $date, "airline" => $airline, "flight_type" => $flight_type, "flight_num" => $flight_num, "destination" => $destination, "time" => $time, "gate" => $gate, "remarks" => $remarks);
        // Save the record.
        saveData(array("date", "airline", "flight_type", "flight_num"), $flight_data);
    }
    $dom->clear();
}
Example #20
function get_dom($url)
{
    $html = scraperWiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);
    return $dom;
}
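A usage sketch for get_dom(); pairing it with clear() matters in long scrapes, because simple_html_dom's circular parent/child references are not freed promptly otherwise (example.com stands in for a real target):

$dom = get_dom('http://www.example.com/');
foreach ($dom->find('a') as $a) {
    echo $a->href . "\n";
}
$dom->clear(); // break circular node references before fetching the next page
unset($dom);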
Example #21
function scrapeDetails($ngo)
{
    $html_content = scraperwiki::scrape($ngo["url"]);
    $dom = new simple_html_dom();
    $dom->load($html_content);
    $infosWeWant = array('Telefon', 'Rechtsform', 'Steuerstatus', 'Weltanschauliche Ausrichtung', 'Anzahl Mitarbeiter', 'Gesamteinnahmen:', 'Davon Sammlungseinnahmen', 'Bezugsjahr:');
    // Scrape Details from all paragraphs
    $paragraphs = $dom->find('p');
    foreach ($paragraphs as $p) {
        if (strstr($p->plaintext, "Website")) {
            $ngo["website"] = $p->find('a', 0)->href;
        }
        if (strstr($p->plaintext, "Email")) {
            $ngo["email"] = $p->find('a', 0)->plaintext;
        }
        foreach ($infosWeWant as $key => $info) {
            $res = extractInfo($p, $info);
            if ($res) {
                $ngo[$info] = $res;
                //Do not search for this info again
                unset($infosWeWant[$key]);
            }
        }
    }
    print_r($ngo);
    return $ngo;
}
Example #22
 private function scrap_page($url)
 {
     $base_url = 'http://' . parse_url($url, PHP_URL_HOST);
     $p = new Page($url);
     $h = new simple_html_dom();
     $h->load($p->content());
     $boxes = $h->find('.textbox');
     $result = array();
     foreach ($boxes as $box) {
         // image/url
         $content = $box->find('.textbox-content', 0);
         $url = $base_url . $content->find('a', 0)->href;
         $thumb = $base_url . $content->find('img', 0)->src;
         // other data
         $label = $box->find('.webcss-label', 0);
         $title = $label->find('p', 0)->find('a', 0)->innertext;
         $title = html_entity_decode($title, ENT_COMPAT, 'UTF-8');
         $h2 = $label->find('h2', 0);
         $date = Text::create($h2->innertext)->cut_after('>:')->to_s();
         $h5 = $label->find('h5', 0);
         $tags = Text::create($h5->innertext)->strip_tags()->cut_after(':')->to_s();
          $tags = array_filter(array_map('trim', explode(',', $tags))); // split, trim, drop empties
         $view = $label->find('.webcss_view', 0);
         $m = Text::create($view->innertext)->regex_match('/(\\d+)/');
         $pages = $m[1];
         $item = array('title' => $title, 'url' => $url, 'date' => $date, 'pages' => $pages, 'thumb' => $thumb, 'tags' => '#' . implode('#', $tags) . '#');
         $result[] = $item;
     }
     return array_reverse($result);
 }
Example #23
function read_listing($params, $url = 'http://www.auto24.ee/kasutatud/nimekiri.php')
{
    $endpoint = build_query($url, $params);
    $html = scraperWiki::scrape($endpoint);
    $dom = new simple_html_dom();
    $dom->load($html);
    $totalResultsEl = $dom->find('.paginator .current-range strong');
    $totalResults = $totalResultsEl[0]->plaintext;
    $medianItem = ($totalResults + 1) / 2;
    if ($medianItem > RESULTS_PER_PAGE) {
        $listingOffset = floor($medianItem / RESULTS_PER_PAGE) * RESULTS_PER_PAGE;
        $params['ak'] = $listingOffset;
        $medianItem -= $listingOffset;
        $endpoint = build_query($url, $params);
        $html = scraperWiki::scrape($endpoint);
        $dom = new simple_html_dom();
        $dom->load($html);
    }
    $rows = $dom->find("[@id=usedVehiclesSearchResult] .result-row");
    $lPoint = floor($medianItem) - 1;
    $hPoint = ceil($medianItem) - 1;
    $a24ksi = 0;
    if ($lPoint == $hPoint) {
        $rowData = get_row_data($rows[$lPoint]);
        $a24ksi = $rowData['price'];
    } else {
        $lRowData = get_row_data($rows[$lPoint]);
        $hRowData = get_row_data($rows[$hPoint]);
        $a24ksi = round(($lRowData['price'] + $hRowData['price']) / 2);
    }
    return array('n' => $totalResults, 'val' => $a24ksi);
}
Example #24
function scrape_NG_news_article($art_url)
{
    $html = scraperWiki::scrape($art_url);
    require_once 'scraperwiki/simple_html_dom.php';
    $dom = new simple_html_dom();
    $dom->load($html);
    foreach ($dom->find("div#page_head h1") as $data) {
        $art_title = $data->innertext;
    }
    foreach ($dom->find("div#page_head h2") as $data) {
        $art_subtitle = $data->innertext;
    }
    $art_text_array = array();
    $art_paragraph_count = 0;
    $art_text_full = "";
    $art_teaser50 = "";
    $art_teaser100 = "";
    foreach ($dom->find("div#content div.article_text p") as $data) {
        $art_paragraph_count++;
        $tmp = str_get_html($data)->plaintext;
        //        $art_text_array[$art_paragraph_count] = $tmp;
        $art_text_full .= $tmp . " #" . $art_paragraph_count . "# ";
        //if ($art_paragraph_count == 1) $art_teaser = $tmp;
    }
    $art_teaserS = word_teaser($art_text_full, 60);
    $art_teaserM = word_teaser($art_text_full, 120);
    /*  print $art_text_full;
        show_article($art_title, $art_subtitle, $art_text_array);
        for ($i = 0; $i < count($art_text_array); $i++) { $art_text_full .= $art_text_array[$i] . " #" . $i . "# "; }
        $art_text_full = $art_text_full->plaintext;
        $art_teaser = $art_text_array[0]->plaintext;  */
    // $record = array("Title" => $art_title, "Subtitle" => $art_subtitle, "TeaserS" => $art_teaserS, "TeaserM" => $art_teaserM, "Text" => $art_text_full, "URL" => $art_url);
    $record = array("TeaserM" => $art_teaserM, "URL" => $art_url);
    scraperwiki::save(array('URL'), $record);
    return $record;
}
Example #25
function getCategories($u)
{
    global $baseurl, $f, $local; // $local selects CSV vs. sqlite output
    $path = "";
    $d = new simple_html_dom();
    $d->load(scraperwiki::scrape($u));
    echo "Loaded URL: " . $u . "\n";
    if ($d->find('div[id=ctl00_cphContent_gsaCatFacetContainer]')) {
        $breadcrumb = $d->find('div[id=breadcrumb]', 0);
        //foreach($breadcrumb as $b) {
        //echo "Breadcrumb = " . $b;}
        if (!is_null($breadcrumb)) {
            foreach ($breadcrumb->children() as $crumb) {
                $path .= trim($crumb->innertext) . "/";
            }
            $path .= trim(strrchr($breadcrumb->innertext, ">"), "> ");
        }
        foreach ($d->find('div[id=ctl00_cphContent_gsaCatFacetContainer]', 0)->find('div[class=S2refinementsContainer]', 0)->children() as $div) {
            $name = trim(strstr($div->children(0)->innertext, "(", true));
            $url = $baseurl . $div->children(0)->href;
            $data = array("Name" => $name, "Path" => $path, "URL" => $url);
            echo $path . "/" . $name . "\n";
            if ($local) {
                fputcsv($f, array($name, $path, $url));
            } else {
                scraperwiki::save_sqlite(array("URL"), $data);
            }
            getCategories($url);
        }
    }
}
Example #26
 /**
  * Build a simple_html_dom object with the given HTML loaded into it and
  * return the children of the wrapping <html> element
  * @param string $html
  * @return array Array of simple_html_dom nodes
  */
 private function &_getHtmlDomArray($html)
 {
     $html_dom = new simple_html_dom();
     $html_dom->load('<html><body>' . $html . '</body></html>');
     $html_dom_array = $html_dom->find('html', 0)->children();
     return $html_dom_array;
 }
Example #27
function scrapeHTML($param, $type)
{
    $html = scraperWiki::scrape(BASE_URL . "?type={$param}");
    $dom = new simple_html_dom();
    $dom->load($html);
    // Iterate over table rows and get flight details.
    foreach ($dom->find("TR[@HEIGHT='25']") as $data) {
        // Flight details.
        $tds = $data->find("td");
        $airline = removeSpaces($tds[0]->plaintext);
        $flight_type = $type;
        $flight_num = removeSpaces($tds[1]->plaintext);
        $destination = removeSpaces($tds[2]->plaintext);
        $time = removeSpaces($tds[3]->plaintext);
        $gate = removeSpaces($tds[4]->plaintext);
        $remarks = removeSpaces($tds[5]->plaintext);
        // Skip header row. Cheesy, but effective.
        if ($airline == "Airline") {
            continue;
        }
        // Set the date.
        $date = date("m.d.y");
        // Build up record to store.
        $flight_data = array("date" => $date, "airline" => $airline, "flight_type" => $flight_type, "flight_num" => $flight_num, "destination" => $destination, "time" => $time, "gate" => $gate, "remarks" => $remarks);
        // Save the record.
        saveData(array("date", "airline", "flight_type", "flight_num"), $flight_data);
    }
    $dom->clear();
}
Example #28
function scrap_yp($last_alphabet = '', $last_page = '')
{
    $alphabet = range('a', 'z');
    if (is_null($last_alphabet) || $last_alphabet == '') {
        $temp_alphabet = scraperwiki::get_var('last_alphabet_loaded');
        if (!is_null($temp_alphabet)) {
            $last_alphabet = $temp_alphabet;
        } else {
            $last_alphabet = 'a';
        }
    }
    if (is_null($last_page) || $last_page == '') {
        $temp_page = scraperwiki::get_var('last_page_loaded');
        if (!is_null($temp_page)) {
            $last_page = $temp_page;
        } else {
            $last_page = 1;
        }
    }
    $yp_base_url = 'http://www.yellowpages.co.id/browse/letter/' . $last_alphabet . '?page=' . $last_page;
    $html = scraperWiki::scrape($yp_base_url);
    $dom = new simple_html_dom();
    $dom->load($html);
    foreach ($dom->find("ul.directory-list") as $data) {
        echo $data;
    }
}
Example #29
function getIngredients($html)
{
    $dom = new simple_html_dom();
    $dom->load($html);
    //foreach($dom->find('result-item',1)->href as $data)
    //{
    // if ($data != null)
    //$res = trim($data->plaintext);
    $res = $dom->find('a[class=callout]', 1)->href;
    $res = str_replace("reviews/", "", $res);
    echo "http://www.foodnetwork.com" . $res;
    $html1 = scraperwiki::scrape("http://www.foodnetwork.com" . $res);
    $h = str_get_html($html1); // parse the recipe page
    echo "\n\n";
    $href = ''; // default when an ingredient has no link
    foreach ($h->find('li[class=ingredient]') as $data) {
        $ingredient = $data->plaintext;
        if (isset($data->href)) { // link attached to this ingredient row, if any
            $href = $data->href;
        }
        //foreach($domFoods->find('ul[class=kv-ingred-list1]',1)->children() as $data){
        //echo $data->plaintext;
        scraperwiki::save(array('ing'), array('ing' => $ingredient, 'href' => $href));
    }
}
Example #30
 protected function getDomParser($content = null)
 {
     $dom_parser = new simple_html_dom();
     if ($content) {
         $dom_parser->load($content);
     }
     return $dom_parser;
 }
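A usage sketch for getDomParser(); it is protected, so this assumes a caller inside the same class. Because $content is optional, load() can also be called later:

 // Hypothetical caller in the same class.
 $parser = $this->getDomParser('<ul><li>a</li><li>b</li></ul>');
 echo count($parser->find('li')); // 2
 $empty = $this->getDomParser();  // no content yet
 $empty->load('<p>later</p>');    // load on demand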