function a587_getPlaintext($_text, $_remove)
{
    // Convert indexed markup to plain text by running the conversion steps
    // configured for the rexsearch plugin, in the configured order.
    global $REX;
    $settings = $REX['ADDON']['rexsearch_plugins']['rexsearch']['plaintext']['settings'];
    foreach (explode(',', $settings['order']) as $step) {
        switch ($step) {
            case 'selectors':
                // Drop elements matched by the configured CSS selectors.
                $html = new simple_html_dom();
                $html->load($_text);
                $html->remove($_remove);
                $html->load($html->outertext);
                $_text = $html->plaintext;
                break;
            case 'regex':
                // Apply configured search/replace pairs: patterns on odd
                // lines, replacements on even lines (blank lines ignored).
                if (!empty($settings['regex'])) {
                    $patterns = array();
                    $replacements = array();
                    $pairIndex = 0;
                    foreach (explode("\n", $settings['regex']) as $line) {
                        if ($line != '') {
                            if ($pairIndex % 2 == 0) {
                                $patterns[] = trim($line);
                            } else {
                                $replacements[] = $line;
                            }
                            $pairIndex++;
                        }
                    }
                    $_text = preg_replace($patterns, $replacements, $_text);
                }
                break;
            case 'textile':
                // Render with textile when the addon's parser is available.
                if (!empty($settings['textile']) and function_exists('rex_a79_textile')) {
                    $_text = rex_a79_textile($_text);
                }
                break;
            case 'striptags':
                // Remove remaining HTML tags.
                if (!empty($settings['striptags'])) {
                    $_text = strip_tags($_text);
                }
                break;
        }
    }
    return $_text;
}
// Scrapes one result page of asuntojen.hintatiedot.fi (Finnish apartment
// price listings) and saves each data row to SQLite. All query parameters
// (c, s, r, amin, amax, z = page) and the running row total live in
// $GLOBALS. Recurses onto the next page whenever a full page (50 rows)
// was returned.
function scrape_page()
{
    $row = 0;
    $html = scraperWiki::scrape("http://asuntojen.hintatiedot.fi/haku/?c=" . $GLOBALS['c'] . "&s=" . $GLOBALS['s'] . "&r=" . $GLOBALS['r'] . "&amin=" . $GLOBALS['amin'] . "&amax=" . $GLOBALS['amax'] . "&z=" . $GLOBALS['z']);
    $dom = new simple_html_dom();
    $dom->load($html);
    foreach ($dom->find("tr") as $data) {
        $tds = $data->find("td");
        // Data rows have more than 8 cells; anything shorter is a header or
        // filler row and is skipped.
        if (count($tds) > 8) {
            $row++;
            $GLOBALS['rowTotal']++;
            // Record keyed by a running unique id; column names are Finnish
            // (district, sale price, price per m2, type, size).
            $apt = array("Uniikkiavain" => $GLOBALS['rowTotal'], "Kaupunginosa" => $tds[0]->plaintext, "Myyntihinta" => $tds[3]->plaintext, "Neliohinta" => $tds[4]->plaintext, "Tyyppi" => $tds[1]->plaintext, "Koko" => $tds[2]->plaintext);
            // Table name combines the city parameter and a run timestamp.
            scraperwiki::save_sqlite(null, $apt, $table_name = $GLOBALS['c'] . " " . $GLOBALS['time']);
            print $GLOBALS['rowTotal'] . "\n";
            print $row . ". Sijainti: " . $tds[0]->plaintext . " Hinta: " . $tds[3]->plaintext . " Tyyppi: " . $tds[1]->plaintext . " Koko: " . $tds[2]->plaintext . " Neliöhinta: " . $tds[4]->plaintext . "€" . "\n";
        }
    }
    // Exactly 50 rows implies another page exists: bump the page counter (z)
    // and recurse; otherwise print a summary and finish.
    if ($row == 50) {
        print "Vielä jatkuu, haetaan seuraava sivu..." . "\n";
        $GLOBALS['z']++;
        scrape_page();
    } else {
        print "Skrääpiminen suoritettu." . "\n";
        print "Sivuja yhteensä: " . $GLOBALS['z'] . "\n";
        print "Rivejä yhteensä: " . $GLOBALS['rowTotal'] . "\n";
    }
}
// Scrapes one result page of musiklegal.com (page id $q_num), stores each
// song row in SQLite, then follows the "Next" pagination link recursively
// until there is none, at which point the script exits.
function run_ml($q_num = 0)
{
    $html = scraperWiki::scrape("http://musiklegal.com/search/result/a/" . $q_num);
    $dom = new simple_html_dom();
    $dom->load($html);
    foreach ($dom->find("tr") as $data) {
        $tds = $data->find("td");
        // The second cell embeds a song-detail link; strip the surrounding
        // markup, leaving 'CODE">TITLE', then split code and title apart.
        $temp_data = explode('">', str_replace('</<strong>a</strong>>', '', str_replace('<<strong>a</strong> href="http://musiklegal.com/song/detail/', '', $tds[1]->plaintext)));
        $record = array('No' => str_replace('.', '', $tds[0]->plaintext), 'Code' => $temp_data[0], 'Song Title' => $temp_data[1], 'Artist' => $tds[2]->plaintext, 'Album' => $tds[3]->plaintext);
        /*
         * Stores results
         */
        scraperwiki::save_sqlite(array("No"), $record);
        unset($temp_data);
    }
    // Extract the next page number from the "Next" pagination link.
    // Bug fix: $tmp_a is now initialised (it was undefined when no "Next"
    // link existed, producing a notice), and we break out once a valid page
    // number is found — the original "continue" was a no-op.
    $tmp_a = 0;
    foreach ($dom->find("a") as $a) {
        if ($a->plaintext == 'Next') {
            $tmp_a = str_replace('http://musiklegal.com/search/result/a/', '', $a->href);
            if ($tmp_a > 0) {
                break;
            }
        }
    }
    if ((int) $tmp_a != 0) {
        run_ml($tmp_a);
    } else {
        exit;
    }
}
/**
 * Get Raw html of webpage
 *
 * Requests either self::POPURL . $this->_trailUrl (when a trail URL is set)
 * or the self::IF18 constant URL via cURL and, on success, loads the
 * response body into $this->_html.
 *
 * @param bool $usepost When true, send the request as a POST carrying
 *                      $this->_postParams.
 *
 * @return bool True when a response was received and loaded, false otherwise.
 */
private function getUrl($usepost = false)
{
    if (isset($this->_trailUrl)) {
        $ch = curl_init(self::POPURL . $this->_trailUrl);
    } else {
        $ch = curl_init(self::IF18);
    }
    if ($usepost === true) {
        curl_setopt($ch, CURLOPT_CUSTOMREQUEST, "POST");
        curl_setopt($ch, CURLOPT_POST, 1);
        curl_setopt($ch, CURLOPT_POSTFIELDS, $this->_postParams);
    }
    // Return body as a string, follow redirects, and fail on HTTP errors.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_VERBOSE, 0);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_USERAGENT, "Firefox/2.0.0.1");
    curl_setopt($ch, CURLOPT_FAILONERROR, 1);
    // Reuse the cookie jar across requests when one is configured.
    if (isset($this->cookie)) {
        curl_setopt($ch, CURLOPT_COOKIEJAR, $this->cookie);
        curl_setopt($ch, CURLOPT_COOKIEFILE, $this->cookie);
    }
    curl_setopt_array($ch, newznab\utility\Utility::curlSslContextOptions());
    $this->_response = curl_exec($ch);
    if (!$this->_response) {
        curl_close($ch);
        return false;
    }
    curl_close($ch);
    $this->_html->load($this->_response);
    return true;
}
// Recursively walks the category facet tree starting at URL $u, saving each
// category (name, breadcrumb path, absolute URL) either to the global CSV
// handle $f (when $local is truthy) or to ScraperWiki's SQLite store.
// Bug fix: $local was referenced below but never declared global, so it was
// always unset inside this function and the CSV branch could never run.
function getCategories($u)
{
    global $baseurl, $f, $local;
    $path = "";
    $d = new simple_html_dom();
    $d->load(scraperwiki::scrape($u));
    echo "Loaded URL: " . $u . "\n";
    if ($d->find('div[id=ctl00_cphContent_gsaCatFacetContainer]')) {
        // Build the breadcrumb path ("A/B/C") for this page.
        $breadcrumb = $d->find('div[id=breadcrumb]', 0);
        if (!is_null($breadcrumb)) {
            foreach ($breadcrumb->children() as $crumb) {
                $path .= trim($crumb->innertext) . "/";
            }
            // Append the trailing crumb that follows the last ">" separator.
            $path .= trim(strrchr($breadcrumb->innertext, ">"), "> ");
        }
        foreach ($d->find('div[id=ctl00_cphContent_gsaCatFacetContainer]', 0)->find('div[class=S2refinementsContainer]', 0)->children() as $div) {
            // Facet text looks like "Name (count)"; keep just the name.
            $name = trim(strstr($div->children(0)->innertext, "(", true));
            $url = $baseurl . $div->children(0)->href;
            $data = array("Name" => $name, "Path" => $path, "URL" => $url);
            echo $path . "/" . $name . "\n";
            if ($local) {
                fputcsv($f, array($name, $path, $url));
            } else {
                scraperwiki::save_sqlite(array("URL"), $data);
            }
            getCategories($url);
        }
    }
}
/**
 * Parse the fetched account page into order records.
 *
 * @return string|array 'no_order' when no trade rows are present; the raw
 *                      response when it is empty or suspiciously short
 *                      (the cookie has likely expired); otherwise an array
 *                      of rows with time/title/trade/name/amount keys.
 */
public function parse()
{
    require_once dirname(__FILE__) . '/simple_html_dom.php';
    $data = $this->requestURL();
    // Bug fix: was strlen($data < 100), which evaluated the comparison first
    // and made this guard always true for a non-empty response. A short or
    // empty body indicates the session cookie has expired.
    if (empty($data) || strlen($data) < 100) {
        return $data;
    }
    $html = new simple_html_dom();
    $html->load($data);
    $ymd = $html->find('.time-d');
    $his = $html->find('.time-h');
    $title = $html->find('.consume-title a');
    $trade = $html->find('td.tradeNo p');
    $name = $html->find('p.name');
    $amount = $html->find('td.amount span');
    if (!$trade) {
        return 'no_order';
    }
    $info = array();
    foreach ($ymd as $key => $value) {
        // Keep only the numeric portion of the trade number.
        preg_match('/\\d+/', $trade[$key]->innertext, $tradeNo);
        // Extra logic could go here, e.g. skipping orders that were already
        // recorded/notified in the database.
        $info[] = array('time' => trim($ymd[$key]->innertext) . ' ' . trim($his[$key]->innertext), 'title' => trim($title[$key]->innertext), 'trade' => trim($tradeNo[0]), 'name' => trim($name[$key]->innertext), 'amount' => trim(str_replace('+', '', $amount[$key]->innertext)));
    }
    $html->clear();
    return $info;
}
/**
 * Get Raw html of webpage
 *
 * Requests $this->_whichSite[$site] . $this->_trailUrl (when a trail URL is
 * set) or the self::IF18 constant URL via cURL and, on success, loads the
 * response body into $this->_html.
 *
 * @param bool   $usepost When true, send the request as a POST carrying
 *                        $this->_postParams.
 * @param string $site    Key into $this->_whichSite selecting the base URL.
 *
 * @return bool True when a response was received and loaded, false otherwise.
 */
private function getUrl($usepost = false, $site = "straight")
{
    if (isset($this->_trailUrl)) {
        $ch = curl_init($this->_whichSite[$site] . $this->_trailUrl);
    } else {
        $ch = curl_init(self::IF18);
    }
    if ($usepost === true) {
        curl_setopt($ch, CURLOPT_CUSTOMREQUEST, "POST");
        curl_setopt($ch, CURLOPT_POST, 1);
        curl_setopt($ch, CURLOPT_POSTFIELDS, $this->_postParams);
    }
    // Return body as a string, follow redirects, and fail on HTTP errors.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_VERBOSE, 0);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_USERAGENT, "Firefox/2.0.0.1");
    curl_setopt($ch, CURLOPT_FAILONERROR, 1);
    // Reuse the cookie jar across requests when one is configured.
    if (isset($this->cookie)) {
        curl_setopt($ch, CURLOPT_COOKIEJAR, $this->cookie);
        curl_setopt($ch, CURLOPT_COOKIEFILE, $this->cookie);
    }
    $this->_response = curl_exec($ch);
    if (!$this->_response) {
        curl_close($ch);
        return false;
    }
    curl_close($ch);
    $this->_html->load($this->_response);
    return true;
}
// Scrapes the Norwegian low-fare flight table and saves one record per row.
// NOTE(review): $param is accepted but unused — the search URL is hard-coded
// here, unlike the sibling scrapeHTML() that builds it from BASE_URL and
// $param. Confirm whether that is intentional.
function scrapeHTML($param, $type)
{
    $html = scraperWiki::scrape("http://www.norwegian.no/fly/lavpris/?D_City=CPH&A_City=DUB&TripType=2&D_Day=1&D_Month=201104&R_Day=1&R_Month=201104&AdultCount=1&ChildCount=0&InfantCount=0");
    $dom = new simple_html_dom();
    $dom->load($html);
    // Iterate over table rows and get flight details.
    foreach ($dom->find("TR[@HEIGHT='25']") as $data) {
        // Flight details live in the row's <div> cells, in fixed order.
        $tds = $data->find("div");
        $airline = removeSpaces($tds[0]->plaintext);
        $flight_type = $type;
        $flight_num = removeSpaces($tds[1]->plaintext);
        $destination = removeSpaces($tds[2]->plaintext);
        $time = removeSpaces($tds[3]->plaintext);
        $gate = removeSpaces($tds[4]->plaintext);
        $remarks = removeSpaces($tds[5]->plaintext);
        // Skip header row. Cheesy, but effective.
        if ($airline == "Airline") {
            continue;
        }
        // Set the date.
        $date = date("m.d.y");
        // Build up record to store.
        $flight_data = array("date" => $date, "airline" => $airline, "flight_type" => $flight_type, "flight_num" => $flight_num, "destination" => $destination, "time" => $time, "gate" => $gate, "remarks" => $remarks);
        // Save the record keyed on date + airline + type + flight number.
        saveData(array("date", "airline", "flight_type", "flight_num"), $flight_data);
    }
    $dom->clear();
}
# Scrape the flight information table at $url and rebuild $table_name from
# scratch, saving one row per flight keyed on (date, count).
function grep_munich($url, $table_name)
{
    $page = scraperWiki::scrape($url);
    # Parse the page with PHP Simple HTML DOM.
    $parser = new simple_html_dom();
    $parser->load($page);
    // Drop any previous version of the table so stale rows disappear.
    scraperwiki::sqliteexecute("drop table if exists " . $table_name);
    scraperwiki::sqlitecommit();
    $count = 0;
    $table = $parser->getElementById('flight_info_area');
    foreach ($table->find('tr') as $rowNode) {
        // Flight details sit in <td> cells; header rows use <th>.
        $cells = $rowNode->find("td");
        // Rows with fewer than 7 cells carry no flight data.
        if (count($cells) < 7) {
            continue;
        }
        $flight_data = array(
            "date" => date("Y-m-d"),
            "count" => $count,
            "flightnr" => $cells[1]->plaintext,
            "from" => $cells[2]->plaintext,
            "time" => $cells[3]->plaintext,
            "expected_time" => $cells[4]->plaintext,
        );
        // Persist this flight's record.
        scraperwiki::save_sqlite(array("date", "count"), $flight_data, $table_name);
        $count = $count + 1;
    }
}
// Fetches a TED (Tenders Electronic Daily) RSS feed for one sector and saves
// a (time, sector, notice URL) record per item. Memory usage is printed and
// the DOM explicitly destroyed because this runs in a long batch loop.
function scrapeTEDRSS($url, $sector)
{
    print $url . " " . $sector . "\n";
    // $xml = scraperWiki::scrape($url);
    $curl = curl_init($url);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($curl, CURLOPT_MAXREDIRS, 10);
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($curl, CURLOPT_TIMEOUT, 20);
    // 10 second before aborting
    // try CURLOPT_CONNECTTIMEOUT (in seconds)
    // try CURLOPT_LOW_SPEED_LIMIT (to define what slow is, with):
    // curl_setopt($curl, CURLOPT_LOW_SPEED_TIME, 10); (10 second at low speed before aborting
    $xml = curl_exec($curl);
    print curl_error($curl) . "\n";
    $dom = new simple_html_dom();
    $dom->load($xml);
    $items = $dom->find("item");
    foreach ($items as $item) {
        $guid = $item->find("guid");
        // The guid points at the TEXT view; the DATA view holds the fields
        // a detail scraper would need.
        $noticeURL = str_replace("TEXT", "DATA", $guid[0]->plaintext);
        print $noticeURL . " " . $sector . " " . memory_get_usage() / 1000000 . "MB";
        echo "\n";
        // $record = scrapeTEDDataPage ($noticeURL, $sector);
        $record = array('time' => microtime(true), 'sector' => $sector, 'url' => $noticeURL);
        scraperwiki::save(array('sector', 'url'), $record);
        // Be polite to the server between items.
        sleep(1);
    }
    // Free the DOM aggressively; simple_html_dom leaks if left to the GC.
    $dom->__destruct();
    unset($items);
    unset($dom);
    unset($xml);
    print memory_get_usage() / 1024 / 1024 . "MB\n";
}
/**
 * Compile a template file by reading it, converting the DOM using
 * {@see convert()}, then applying macros using {@see transform()}.
 *
 * When the template declares a [j:main] element, only that subtree is
 * compiled; otherwise every top-level element and text node is converted.
 *
 * @param string $template Template file path.
 * @return string PHP template content.
 * @throws InvalidTemplateException If template is inaccessible or invalid.
 */
public function compile($template)
{
    $dom = new \simple_html_dom();
    $this->currentTemplate = $template;
    $file = file_get_contents($template);
    if ($file === false) {
        throw new InvalidTemplateException(tr('Could not read template: %1', $template));
    }
    if (!$dom->load($file, true, false)) {
        throw new InvalidTemplateException(tr('Could not parse template: %1', $template));
    }
    $root = new InternalNode();
    // Prefer an explicit [j:main] entry point when the template has one.
    $main = $dom->find('[j:main]', 0);
    if (isset($main)) {
        $root->append($this->convert($main));
    } else {
        // No main element: convert all direct children of the document root.
        foreach ($dom->find('*, text') as $html) {
            if ($html->parent->tag != 'root') {
                continue;
            }
            $root->append($this->convert($html));
        }
    }
    $this->transform($root);
    return $root->__toString();
}
// Scrapes a single card page from the Wizards "Gatherer" card database and
// saves one SQLite row per card. Each attribute is read from a fixed
// ASP.NET control id and converted from UTF-8 to Latin-1.
// NOTE(review): several values ($cmc, $type, $text, ...) are element
// objects, not strings; iconv() relies on their string conversion —
// presumably yielding the element's HTML. Verify against simple_html_dom.
function getCardInfo($url)
{
    $baseURL = 'http://gatherer.wizards.com/Pages/Card/';
    $html = scraperWiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);
    // Card image: relative src on the card image control.
    $cardImage = $dom->find('img[id=ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_cardImage]', 0)->src;
    $cardImage = str_replace("amp;", "", $cardImage);
    $imgURL = $baseURL . $cardImage;
    $name = $dom->find('div[id=ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_nameRow] div[class=value]', 0)->plaintext;
    $name = iconv("UTF-8", "ISO-8859-1//TRANSLIT", $name);
    // Mana cost: first letter of each mana symbol's alt text, concatenated.
    $mana = "";
    $manaImages = $dom->find('div[id=ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_manaRow] div[class=value] img');
    foreach ($manaImages as $manaItem) {
        $mana .= substr($manaItem->alt, 0, 1);
    }
    $mana = iconv("UTF-8", "ISO-8859-1//TRANSLIT", $mana);
    // Remaining attributes, each from its labelled row's value <div>.
    $cmc = $dom->find('div[id=ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_cmcRow] div[class=value]', 0);
    $cmc = iconv("UTF-8", "ISO-8859-1//TRANSLIT", $cmc);
    $type = $dom->find('div[id=ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_typeRow] div[class=value]', 0);
    $type = iconv("UTF-8", "ISO-8859-1//TRANSLIT", $type);
    $text = $dom->find('div[id=ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_textRow] div[class=value]', 0);
    $text = iconv("UTF-8", "ISO-8859-1//TRANSLIT", $text);
    $flavor = $dom->find('div[id=ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_flavorRow] div[class=value]', 0);
    $flavor = iconv("UTF-8", "ISO-8859-1//TRANSLIT", $flavor);
    $cardNumber = $dom->find('div[id=ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_numberRow] div[class=value]', 0);
    $cardNumber = iconv("UTF-8", "ISO-8859-1//TRANSLIT", $cardNumber);
    $artist = $dom->find('div[id=ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_artistRow] div[class=value]', 0);
    $artist = iconv("UTF-8", "ISO-8859-1//TRANSLIT", $artist);
    $rarity = $dom->find('div[id=ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_rarityRow] div[class=value]', 0);
    $rarity = iconv("UTF-8", "ISO-8859-1//TRANSLIT", $rarity);
    $set = $dom->find('div[id=ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_setRow] div[class=value]', 0);
    $set = iconv("UTF-8", "ISO-8859-1//TRANSLIT", $set);
    // Persist the card keyed on the "card" column.
    scraperwiki::save_sqlite(array("card"), array("Card" => trim($name), "Image" => $imgURL, "Mana" => trim($mana), "CMC" => trim($cmc), "Type" => trim($type), "Card Text" => trim($text), "Flavor Text" => trim($flavor), "Artist" => trim($artist), "Card Number" => trim($cardNumber), "Rarity" => trim($rarity), "Expansion" => trim($set)));
}
// Debug/exploration pass over one day's page ($rec['url']): locates the
// anchor named "discs", then prints the sibling text lines around it.
// Several workarounds exist because the DOM parser misbehaves on this page.
function do_day($rec)
{
    $html = scraperwiki::scrape($rec['url']);
    $dom = new simple_html_dom();
    $dom->load($html);
    $cell = $dom->find('a[name=discs]');
    // Text nodes of the anchor's parent element.
    $lines = $cell[0]->parent->find('text');
    print $lines[10] . "\n";
    print count($lines) . "\n";
    # loop by number, as null lines stop a foreach
    $n = 0;
    for ($line_no = 0; $line_no < count($lines); $line_no++) {
        $line = $lines[$line_no];
        if (strlen($line) == 3) {
            # the DOM object crashes on this row, so ignore
            continue;
        }
        #if (preg_match("#^" . $n . "#", $line, $matches)) {
        print $line_no . " " . strlen($line) . "\n";
        $n = $n + 1;
        print $line . "\n";
        #}
    }
    #scraperwiki::save(array('data'), array('data' => $data->plaintext));
}
public static function absolutizeHtml($sBaseUrl, $sHtml)
{
    // Rewrite every relative URL found in the HTML fragment so that it is
    // absolute with respect to $sBaseUrl, and return the resulting HTML.
    $oHtml = new simple_html_dom();
    $oHtml->load($sHtml);
    // Tag/attribute pairs whose values carry URLs, processed in order.
    $aUrlAttributes = array(
        array('a', 'href'),
        array('img', 'src'),
        array('script', 'src'),
        array('link', 'href'),
    );
    foreach ($aUrlAttributes as $aPair) {
        list($sTagName, $sAttrName) = $aPair;
        foreach ($oHtml->find($sTagName) as $oNode) {
            $oNode->{$sAttrName} = self::absolutizeUrl($sBaseUrl, $oNode->{$sAttrName});
        }
    }
    // Also rewrite url(...) references inside inline <style> blocks.
    foreach ($oHtml->find('style') as $oNode) {
        $oNode->innertext = preg_replace_callback(
            '|url\\s*\\(\\s*[\'"]?([^\'"\\)]+)[\'"]?\\s*\\)|',
            function ($aMatches) use ($sBaseUrl) {
                return 'url("' . trim(self::absolutizeUrl($sBaseUrl, $aMatches[1])) . '")';
            },
            $oNode->innertext
        );
    }
    return $oHtml . '';
}
// Scrapes one result page of the GEIPAN UFO case-search (POSTed form) and
// saves one record per case row, keyed on (ID, maj).
function scrapPage($page)
{
    print "Scraping page " . $page;
    $url = "http://www.geipan.fr/index.php?id=202";
    // URL-encoded search form body; only the page number varies per call.
    $fields_string = "&no_cache=1&" . "tx_geipansearch_pi1%5Bsubmit_form%5D=1&" . "tx_geipansearch_pi1%5Btexte_resume%5D=&" . "tx_geipansearch_pi1%5Bdate_debut%5D=&" . "tx_geipansearch_pi1%5Bdate_fin%5D=&" . "no_cache=1&" . "tx_geipansearch_pi1%5Bclasse_cas%5D=tous&" . "tx_geipansearch_pi1%5Bregion%5D=&" . "page=" . $page . "&" . "order_by=&" . "sens=";
    $curl = curl_init($url);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($curl, CURLOPT_MAXREDIRS, 10);
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($curl, CURLOPT_TIMEOUT, 20);
    // NOTE(review): CURLOPT_POST expects a boolean; 11 is merely truthy here.
    curl_setopt($curl, CURLOPT_POST, 11);
    curl_setopt($curl, CURLOPT_POSTFIELDS, $fields_string);
    $html = curl_exec($curl);
    print curl_error($curl) . "\n";
    // print($html);
    $dom = new simple_html_dom();
    $dom->load($html);
    $trs = $dom->find("tr");
    foreach ($trs as $tr) {
        // Case rows carry an onclick handler containing "cas=<ID>".
        if (isset($tr->attr['onclick'])) {
            // The case ID is the 13 characters following "cas=".
            $ID = substr($tr->attr['onclick'], strpos($tr->attr['onclick'], "cas=") + 4, 13);
            print $ID . "\n";
            $tds = $tr->find("td");
            $title = utf8_encode($tds[0]->plaintext);
            $date = $tds[1]->plaintext;
            $departement = utf8_encode($tds[2]->plaintext);
            $classe = $tds[3]->plaintext;
            $maj = $tds[4]->plaintext;
            // City name precedes the "(departement)" suffix in the title.
            $city = substr($title, 0, strpos($title, "(") - 1);
            $record = array('ID' => $ID, 'title' => $title, 'date' => $date, 'departement' => $departement, 'classe' => $classe, 'maj' => $maj, 'city' => $city);
            scraperwiki::save(array('ID', 'maj'), $record);
        }
    }
}
# Parses an Amazon search-result HTML page into an array of items with
# image/title/price fields, stored in $this->aResult and returned.
# Returns null when nothing was extracted (preserves original behaviour).
protected function parsing($input)
{
    include_once "inc/simple_html_dom.php";
    # Create a DOM parser object
    $html = new simple_html_dom();
    # Parse the HTML from Amazon.
    $html->load($input);
    $result = [];
    # Iterate over all the result items.
    foreach ($html->find('li[class=s-result-item]') as $key => $innerData) {
        # Bug fix: reset per item. Previously $atmp kept fields from earlier
        # items (stale data leaked into items missing a field) and was
        # undefined for the first item when nothing matched.
        $atmp = [];
        //image
        foreach ($innerData->find('img[class=s-access-image]') as $img) {
            $atmp['image'] = $img->getAttribute('src');
        }
        //title
        foreach ($innerData->find('h2[class=s-access-title]') as $title) {
            $atmp['title'] = $title->innertext();
        }
        //price
        foreach ($innerData->find('span[class=s-price]') as $price) {
            $price = $price->innertext();
            $atmp['price'] = $price;
            # Numeric price: drop the currency symbol and thousands commas.
            $atmp['numPrice'] = str_replace(",", '', substr($price, 1));
        }
        # Only record items that yielded at least one field — consistent
        # with the sibling parsing() implementation.
        if (!empty($atmp)) {
            $result[$key] = $atmp;
        }
    }
    if (!empty($result)) {
        return $this->aResult = $result;
    }
}
/**
 * Fetch the latest lottery draw data over HTTP.
 *
 * Downloads self::URL (gzip-compressed) through the zlib stream wrapper,
 * converts the page to UTF-8, extracts the draw number and winning code,
 * and stores them in $this->data. Exits early while the draw is pending.
 */
private function get_data()
{
    include_once 'simplehtmldom_1_5/simple_html_dom.php';
    $simple_html_dom = new \simple_html_dom();
    // Fetch through compress.zlib:// to transparently gunzip the body.
    $data = false;
    $data = @file_get_contents("compress.zlib://" . self::URL);
    if (!$data) {
        $this->setLog(false, '重庆时时彩-开奖数据抓取失败');
        exit('重庆时时彩-数据抓取失败,请尽快联系网站管理员' . "\r\n");
    }
    // Detect the source encoding and convert the page to UTF-8.
    $encode = mb_detect_encoding($data, array('ASCII', 'UTF-8', 'GB2312', "GBK", 'BIG5'));
    $content = iconv($encode, 'UTF-8', $data);
    $simple_html_dom->load($content);
    // Draw number (issue id).
    $qihao = $simple_html_dom->find('div[class=aside]', 0)->find('h3', 0)->find('em', 0)->plaintext;
    // Winning code; "--" means the draw has not been published yet.
    $code = $simple_html_dom->find('div[class=aside]', 0)->find('div[class=mod-aside mod-aside-xssckj]', 0)->find('div[class=bd]', 0)->find('div[class=kpkjcode]', 0)->find('table', 0)->find('tr', 1)->find('td', 1)->plaintext;
    if ($code == '--') {
        exit('重庆时时彩-等待开奖...' . "\r\n");
    }
    $isKaiJiang = $simple_html_dom->find('div[class=aside]', 0)->find('div[class=mod-aside mod-aside-xssckj]', 0)->find('div[class=bd]', 0)->find('div[class=kpkjcode]', 0)->find('table', 0)->find('tr', 1)->find('td', 2)->plaintext;
    // Bug fix: was &&, which can never be true for a single value; either
    // marker ("--" or "开奖中") means the draw is still in progress.
    if ($isKaiJiang == '--' || $isKaiJiang == '开奖中') {
        exit('重庆时时彩-等待开奖...' . "\r\n");
    }
    $simple_html_dom->clear();
    // Remove the spaces between the digits of the winning code.
    $code = str_replace(" ", '', $code);
    // Record the draw timestamp as "now".
    $kjsj = date('Y-m-d H:i:s');
    $this->data = ['qihao' => $qihao, 'kjsj' => $kjsj, 'code' => $code];
}
// Fetch the page at $url and return it parsed as a simple_html_dom tree.
function get_dom($url)
{
    $parser = new simple_html_dom();
    $parser->load(scraperWiki::scrape($url));
    return $parser;
}
// Enrich an NGO record with contact details and labelled facts scraped from
// its detail page ($ngo["url"]); prints and returns the augmented record.
function scrapeDetails($ngo)
{
    $dom = new simple_html_dom();
    $dom->load(scraperwiki::scrape($ngo["url"]));
    // Labels whose values have not yet been found.
    $wanted = array('Telefon', 'Rechtsform', 'Steuerstatus', 'Weltanschauliche Ausrichtung', 'Anzahl Mitarbeiter', 'Gesamteinnahmen:', 'Davon Sammlungseinnahmen', 'Bezugsjahr:');
    // Every fact lives in some paragraph; scan them all.
    foreach ($dom->find('p') as $paragraph) {
        $text = $paragraph->plaintext;
        if (strstr($text, "Website")) {
            $ngo["website"] = $paragraph->find('a', 0)->href;
        }
        if (strstr($text, "Email")) {
            $ngo["email"] = $paragraph->find('a', 0)->plaintext;
        }
        foreach ($wanted as $idx => $label) {
            $value = extractInfo($paragraph, $label);
            if ($value) {
                $ngo[$label] = $value;
                // Found — stop searching for this label in later paragraphs.
                unset($wanted[$idx]);
            }
        }
    }
    print_r($ngo);
    return $ngo;
}
// Scrapes one gallery listing page into an array of items (title, url, date,
// pages, thumb, tags), returned oldest-first. Relies on the project's Page
// and Text helper classes.
private function scrap_page($url)
{
    // Relative links on the page are resolved against the page's host.
    $base_url = 'http://' . parse_url($url, PHP_URL_HOST);
    $p = new Page($url);
    $h = new simple_html_dom();
    $h->load($p->content());
    $boxes = $h->find('.textbox');
    $result = array();
    foreach ($boxes as $box) {
        // image/url
        $content = $box->find('.textbox-content', 0);
        $url = $base_url . $content->find('a', 0)->href;
        $thumb = $base_url . $content->find('img', 0)->src;
        // other data
        $label = $box->find('.webcss-label', 0);
        $title = $label->find('p', 0)->find('a', 0)->innertext;
        $title = html_entity_decode($title, ENT_COMPAT, 'UTF-8');
        // Date is the text after ">:" in the <h2> heading.
        $h2 = $label->find('h2', 0);
        $date = Text::create($h2->innertext)->cut_after('>:')->to_s();
        // Tags are a comma list after ":" in the <h5>, markup stripped.
        $h5 = $label->find('h5', 0);
        $tags = Text::create($h5->innertext)->strip_tags()->cut_after(':')->to_s();
        $tags = array_filter(explode(',', $tags), 'trim');
        // Page count is the first number in the view-counter element.
        $view = $label->find('.webcss_view', 0);
        $m = Text::create($view->innertext)->regex_match('/(\\d+)/');
        $pages = $m[1];
        // Tags are stored in "#a#b#c#" form.
        $item = array('title' => $title, 'url' => $url, 'date' => $date, 'pages' => $pages, 'thumb' => $thumb, 'tags' => '#' . implode('#', $tags) . '#');
        $result[] = $item;
    }
    return array_reverse($result);
}
// Computes the median asking price of an auto24.ee used-vehicle search.
// Fetches the first result page to learn the total result count, jumps to
// the page containing the median row when needed, then reads the median
// price (averaging the two middle rows for an even count).
// Returns array('n' => total results, 'val' => median price).
function read_listing($params, $url = 'http://www.auto24.ee/kasutatud/nimekiri.php')
{
    $endpoint = build_query($url, $params);
    $html = scraperWiki::scrape($endpoint);
    $dom = new simple_html_dom();
    $dom->load($html);
    // Total result count shown in the paginator, e.g. "1-25 of N".
    $totalResultsEl = $dom->find('.paginator .current-range strong');
    $totalResults = $totalResultsEl[0]->plaintext;
    // 1-based position of the median row across all pages.
    $medianItem = ($totalResults + 1) / 2;
    if ($medianItem > RESULTS_PER_PAGE) {
        // Median lies on a later page: re-request with the 'ak' offset and
        // rebase the median position onto that page.
        $listingOffset = floor($medianItem / RESULTS_PER_PAGE) * RESULTS_PER_PAGE;
        $params['ak'] = $listingOffset;
        $medianItem -= $listingOffset;
        $endpoint = build_query($url, $params);
        $html = scraperWiki::scrape($endpoint);
        $dom = new simple_html_dom();
        $dom->load($html);
    }
    $rows = $dom->find("[@id=usedVehiclesSearchResult] .result-row");
    // For an odd count both indexes coincide; for an even count they are the
    // two middle rows, whose prices get averaged.
    $lPoint = floor($medianItem) - 1;
    $hPoint = ceil($medianItem) - 1;
    $a24ksi = 0;
    if ($lPoint == $hPoint) {
        $rowData = get_row_data($rows[$lPoint]);
        $a24ksi = $rowData['price'];
    } else {
        $lRowData = get_row_data($rows[$lPoint]);
        $hRowData = get_row_data($rows[$hPoint]);
        $a24ksi = round(($lRowData['price'] + $hRowData['price']) / 2);
    }
    return array('n' => $totalResults, 'val' => $a24ksi);
}
// Scrapes one news article page: extracts title, subtitle, and body text
// (paragraphs joined with "#<n>#" markers), builds word-count teasers, and
// saves a record keyed on the article URL. Returns the saved record.
function scrape_NG_news_article($art_url)
{
    $html = scraperWiki::scrape($art_url);
    require_once 'scraperwiki/simple_html_dom.php';
    $dom = new simple_html_dom();
    $dom->load($html);
    foreach ($dom->find("div#page_head h1") as $data) {
        $art_title = $data->innertext;
    }
    foreach ($dom->find("div#page_head h2") as $data) {
        $art_subtitle = $data->innertext;
    }
    $art_text_array = array();
    $art_paragraph_count = 0;
    $art_text_full = "";
    $art_teaser50 = "";
    $art_teaser100 = "";
    // Concatenate paragraph plaintexts, separating them with "#<n>#".
    foreach ($dom->find("div#content div.article_text p") as $data) {
        $art_paragraph_count++;
        $tmp = str_get_html($data)->plaintext;
        // $art_text_array[$art_paragraph_count] = $tmp;
        $art_text_full .= $tmp . " #" . $art_paragraph_count . "# ";
        //if ($art_paragraph_count == 1) $art_teaser = $tmp;
    }
    // Short and medium teasers (60 / 120 words).
    $art_teaserS = word_teaser($art_text_full, 60);
    $art_teaserM = word_teaser($art_text_full, 120);
    /*
    print $art_text_full;
    show_article($art_title,$art_subtitle,$art_text_array);
    for($i=0;$i<count($art_text_array);$i++) {
        $art_text_full .= $art_text_array[$i]." #".$i."# ";
    }
    $art_text_full = $art_text_full->plaintext;
    $art_teaser = $art_text_array[0]->plaintext;
    */
    // Only the medium teaser and URL are currently persisted.
    // $record = array("Title" => $art_title, "Subtitle" => $art_subtitle, "TeaserS" => $art_teaserS, "TeaserM" => $art_teaserM, "Text" => $art_text_full, "URL" => $art_url);
    $record = array("TeaserM" => $art_teaserM, "URL" => $art_url);
    scraperwiki::save(array('URL'), $record);
    return $record;
}
// Scrape all products on listing page $u (and subsequent pages via the
// "next" link) into the global CSV handle $o, tagged with category $cat.
function getProducts($u, $cat)
{
    global $o;
    $dom = new simple_html_dom();
    $dom->load(scraperwiki::scrape($u));
    //echo "Loaded URL: " . $u . "\n";
    $gridItems = $dom->find('li.grid-item');
    if (count($gridItems) > 0) {
        foreach ($gridItems as $item) {
            $link = $item->find('p.product-name > a', 0);
            $prodname = trim($link->innertext);
            $prodURL = $link->href;
            // Products with a "minimal price" block are flagged as type 1.
            $prodtype = is_null($item->find('p.minimal-price', 0)) ? 0 : 1;
            fputcsv($o, array($prodname, $prodtype, $cat, $prodURL));
            echo $prodname . "\n";
        }
        // Follow pagination when a "next" link exists.
        $next = $dom->find('p.next', 0);
        if (!is_null($next)) {
            getProducts($next->href, $cat);
        }
    }
}
/**
 * Parse an HTML fragment and return the top-level simple_html_dom nodes.
 *
 * The fragment is wrapped in dummy <html><body> elements so the parser
 * always has a single, predictable root.
 *
 * @param type $html HTML fragment to parse.
 * @return $html_dom_array Array of simple_html_dom tags.
 */
private function &_getHtmlDomArray($html)
{
    $dom = new simple_html_dom();
    $dom->load('<html><body>' . $html . '</body></html>');
    $children = $dom->find('html', 0)->children();
    return $children;
}
// Parses this flow form's print model: expands <ic> item placeholders via
// refactor(), caches the resulting element list, and persists the rewritten
// model (and item counter) back to the FlowFormType record.
public function parse($isUpdate = false)
{
    Ibos::import("application.extensions.simple_html_dom", true);
    if ($isUpdate) {
        // On update, strip any previously assigned data-id attributes so
        // items get renumbered from scratch.
        $model = preg_replace("/\\s+data-id\\s?=\\s?\"?\\d+\"?/i", "", $this->printmodel);
        $max = 0;
    } else {
        $model = $this->printmodel;
        $max = intval($this->itemmax);
    }
    $elements = array();
    $doc = new simple_html_dom();
    $doc->load($model, true, true, CHARSET);
    // <ic> tags mark form-item placeholders to be expanded.
    $items = $doc->find("ic");
    $config = $this->getItemConfig();
    if (!empty($items) && !empty($config)) {
        $this->refactor($items, $config, $max, $elements);
    }
    $html = $doc->save();
    // Cache the parsed element list in memory and in the shared cache.
    $this->_cache = $elements;
    CacheUtil::set("form_" . $this->ID, $elements);
    $form["printmodelshort"] = $html;
    // Persist the new item counter only when refactoring changed it.
    if ($max != $this->itemmax) {
        $form["itemmax"] = $max;
    }
    $doc->clear();
    FlowFormType::model()->modify($this->ID, $form);
}
// Scrape one yellowpages.co.id browse page and echo its directory listings.
// When $last_alphabet / $last_page are omitted, resume from the positions
// saved in ScraperWiki vars (defaulting to letter 'a', page 1).
function scrap_yp($last_alphabet = '', $last_page = '')
{
    $alphabet = range('a', 'z');
    if (is_null($last_alphabet) || $last_alphabet == '') {
        // Resume from the last letter processed, if any.
        $saved_alphabet = scraperwiki::get_var('last_alphabet_loaded');
        $last_alphabet = is_null($saved_alphabet) ? 'a' : $saved_alphabet;
    }
    if (is_null($last_page) || $last_page == '') {
        // Resume from the last page processed, if any.
        $saved_page = scraperwiki::get_var('last_page_loaded');
        $last_page = is_null($saved_page) ? 1 : $saved_page;
    }
    $yp_base_url = 'http://www.yellowpages.co.id/browse/letter/' . $last_alphabet . '?page=' . $last_page;
    $dom = new simple_html_dom();
    $dom->load(scraperWiki::scrape($yp_base_url));
    foreach ($dom->find("ul.directory-list") as $listing) {
        echo $listing;
    }
}
// Scrape the flight table for the given board type and save one record per
// flight row, keyed on (date, airline, flight_type, flight_num).
function scrapeHTML($param, $type)
{
    $dom = new simple_html_dom();
    $dom->load(scraperWiki::scrape(BASE_URL . "?type={$param}"));
    // Each flight row is a <TR HEIGHT='25'> whose cells are <div>s.
    foreach ($dom->find("TR[@HEIGHT='25']") as $row) {
        $cells = $row->find("div");
        $airline = removeSpaces($cells[0]->plaintext);
        $flight_type = $type;
        $flight_num = removeSpaces($cells[1]->plaintext);
        $destination = removeSpaces($cells[2]->plaintext);
        $time = removeSpaces($cells[3]->plaintext);
        $gate = removeSpaces($cells[4]->plaintext);
        $remarks = removeSpaces($cells[5]->plaintext);
        // The header row labels its first cell "Airline"; skip it.
        if ($airline == "Airline") {
            continue;
        }
        $flight_data = array(
            "date" => date("m.d.y"),
            "airline" => $airline,
            "flight_type" => $flight_type,
            "flight_num" => $flight_num,
            "destination" => $destination,
            "time" => $time,
            "gate" => $gate,
            "remarks" => $remarks,
        );
        saveData(array("date", "airline", "flight_type", "flight_num"), $flight_data);
    }
    $dom->clear();
}
// Parses an Amazon search-result HTML page into an array of items with
// image/title/price (and the result-page count), stored in $this->aResult
// and returned.
private function parsing($scrappedData)
{
    $result = [];
    //Create a DOM parser object
    $html = new simple_html_dom();
    //Parse the HTML from Amazon.
    $html->load($scrappedData);
    // Look up the total page count once instead of re-scanning the whole
    // document for every result item (it is a page-level value).
    $totalPage = null;
    foreach ($html->find('span[class=pagnDisabled]') as $maxPage) {
        $totalPage = $maxPage->innertext();
    }
    # Iterate over all the result items.
    foreach ($html->find('li[class=s-result-item]') as $key => $innerData) {
        // Bug fix: reset per item. Previously $atmp kept fields from earlier
        // items, so an item missing an image/title/price silently inherited
        // the previous item's values.
        $atmp = [];
        //image
        foreach ($innerData->find('img[class=s-access-image]') as $img) {
            $atmp['image'] = $img->getAttribute('src');
        }
        //title
        foreach ($innerData->find('h2[class=s-access-title]') as $title) {
            $atmp['title'] = $title->innertext();
        }
        //price
        foreach ($innerData->find('span[class=s-price]') as $price) {
            $price = $price->innertext();
            $atmp['price'] = $price;
            // Numeric price: drop the currency symbol and thousands commas.
            $atmp['numPrice'] = str_replace(",", '', substr($price, 1));
        }
        if ($totalPage !== null) {
            $atmp['totalPage'] = $totalPage;
        }
        // Only record items that yielded at least one field.
        if (!empty($atmp)) {
            $result[$key] = $atmp;
        }
    }
    return $this->aResult = $result;
}
/**
 * Gets Raw Html
 *
 * Requests $fetchURL via the instance cURL handle and, on success, loads
 * the response body into $this->_html.
 *
 * @param string $fetchURL URL to request.
 * @param bool   $usePost  When true, send the request as a POST carrying
 *                         $this->_postParams.
 *
 * @return bool True when a response was received and loaded, false otherwise.
 */
private function getUrl($fetchURL, $usePost = false)
{
    if (isset($fetchURL)) {
        $this->_ch = curl_init($fetchURL);
    }
    if ($usePost === true) {
        curl_setopt($this->_ch, CURLOPT_POST, 1);
        curl_setopt($this->_ch, CURLOPT_POSTFIELDS, $this->_postParams);
    }
    // Return body as a string and fail on HTTP error statuses.
    curl_setopt($this->_ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($this->_ch, CURLOPT_HEADER, 0);
    curl_setopt($this->_ch, CURLOPT_VERBOSE, 0);
    curl_setopt($this->_ch, CURLOPT_USERAGENT, "Firefox/2.0.0.1");
    curl_setopt($this->_ch, CURLOPT_FAILONERROR, 1);
    // Reuse the cookie jar across requests when one is configured.
    if (isset($this->cookie)) {
        curl_setopt($this->_ch, CURLOPT_COOKIEJAR, $this->cookie);
        curl_setopt($this->_ch, CURLOPT_COOKIEFILE, $this->cookie);
    }
    curl_setopt_array($this->_ch, Misc::curlSslContextOptions());
    $this->_response = curl_exec($this->_ch);
    if (!$this->_response) {
        curl_close($this->_ch);
        return false;
    }
    curl_close($this->_ch);
    $this->_html->load($this->_response);
    return true;
}
// Converts an HTML fragment to a Word (.docx) document under
// $dir/wordtemp/ using PHPWord + the htmltodocx helpers, and returns the
// path of the generated file.
public function save($html, $dir)
{
    import("@.ORG.htmltodocx.documentation.support_functions");
    $phpword_object = new PHPWord();
    $section = $phpword_object->createSection();
    // HTML Dom object: nest the fragment in dummy <html><body> elements so
    // the parser has a single root to work from.
    $html_dom = new simple_html_dom();
    $html_dom->load('<html><body>' . $html . '</body></html>');
    // Create the dom array of elements which we are going to work on:
    $html_dom_array = $html_dom->find('html', 0)->children();
    // Derive base_root/base_path for the initial_state array below. The
    // helper (from the support_functions include) builds them automatically.
    $paths = htmltodocx_paths();
    // Provide some initial settings:
    $initial_state = array('phpword_object' => &$phpword_object, 'base_root' => $paths['base_root'], 'base_path' => $paths['base_path'], 'current_style' => array('size' => '11'), 'parents' => array(0 => 'body'), 'list_depth' => 0, 'context' => 'section', 'pseudo_list' => TRUE, 'pseudo_list_indicator_font_name' => 'Wingdings', 'pseudo_list_indicator_font_size' => '7', 'pseudo_list_indicator_character' => 'l ', 'table_allowed' => TRUE, 'treat_div_as_paragraph' => TRUE, 'style_sheet' => htmltodocx_styles_example());
    // Convert the HTML and put it into the PHPWord object
    htmltodocx_insert_html($section, $html_dom_array[0]->nodes, $initial_state);
    // Clear the HTML dom object:
    $html_dom->clear();
    unset($html_dom);
    // Bug fix: removed dead statement `$str = explode(".", $h2d_file_uri);`
    // which read $h2d_file_uri before it was assigned (undefined-variable
    // notice) and never used the result.
    $h2d_file_uri = $dir . "wordtemp/" . time() . ".docx";
    if (!file_exists($dir . "wordtemp/")) {
        // Create the target folder when it does not exist yet.
        $this->createFolders($dir . "wordtemp/");
    }
    $objWriter = PHPWord_IOFactory::createWriter($phpword_object, 'Word2007');
    $objWriter->save($h2d_file_uri);
    return $h2d_file_uri;
}