/**
 * Scrape one product-listing page and recursively follow its "next" link.
 *
 * Each `li.grid-item` that contains a `p.product-name > a` link is written as
 * a CSV row (name, type flag, category, URL) to the global handle $o, and the
 * product name is echoed. The next page is only followed when the current
 * page contained at least one item.
 *
 * @param string $u   URL of the listing page.
 * @param mixed  $cat Category value copied into every CSV row.
 * @return void
 */
function getProducts($u, $cat)
{
    global $o;

    $d = new simple_html_dom();
    $d->load(scraperwiki::scrape($u));

    $hasNext = false;
    $nextUrl = null;

    $items = $d->find('li.grid-item');
    if (count($items) > 0) {
        foreach ($items as $p) {
            $prod = $p->find('p.product-name > a', 0);
            if (is_null($prod)) {
                // Tile without a product link: nothing to record.
                continue;
            }
            $prodname = trim($prod->innertext);
            $prodURL = $prod->href;
            // 1 when the tile carries a "minimal price" paragraph, otherwise 0.
            $prodtype = is_null($p->find('p.minimal-price', 0)) ? 0 : 1;
            fputcsv($o, array($prodname, $prodtype, $cat, $prodURL));
            echo $prodname . "\n";
        }

        $next = $d->find('p.next', 0);
        if (!is_null($next)) {
            $hasNext = true;
            $nextUrl = $next->href;
        }
    }

    // Release the parsed document before recursing so that a long chain of
    // pages does not keep every DOM tree alive at once.
    $d->clear();
    unset($d);

    if ($hasNext) {
        getProducts($nextUrl, $cat);
    }
}
/**
 * Parse a scraped result page into a list of item records.
 *
 * For every `li.s-result-item` the image source, title, price and a numeric
 * price (first character dropped, commas removed) are collected. When the page
 * contains a `span.pagnDisabled`, its inner text is added to every record as
 * `totalPage`. Items that yield no field at all are left out.
 *
 * The result is stored in $this->aResult and returned.
 *
 * @param string $scrappedData Raw HTML of the result page.
 * @return array Records keyed by the item's position in the result list.
 */
private function parsing($scrappedData)
{
    $html = new simple_html_dom();
    $html->load($scrappedData);

    // The page count belongs to the page, not to an item: look it up once.
    // As in the per-item loops below, the last matching element wins.
    $totalPage = null;
    foreach ($html->find('span[class=pagnDisabled]') as $maxPage) {
        $totalPage = $maxPage->innertext();
    }

    $result = [];
    foreach ($html->find('li[class=s-result-item]') as $key => $innerData) {
        // Start from a clean record so a field missing on this item cannot
        // inherit the value of the previous item.
        $atmp = [];

        foreach ($innerData->find('img[class=s-access-image]') as $img) {
            $atmp['image'] = $img->getAttribute('src');
        }
        foreach ($innerData->find('h2[class=s-access-title]') as $title) {
            $atmp['title'] = $title->innertext();
        }
        foreach ($innerData->find('span[class=s-price]') as $price) {
            $priceText = $price->innertext();
            $atmp['price'] = $priceText;
            // Drop the leading character and the thousands separators.
            $atmp['numPrice'] = str_replace(",", '', substr($priceText, 1));
        }
        if ($totalPage !== null) {
            $atmp['totalPage'] = $totalPage;
        }

        if (!empty($atmp)) {
            $result[$key] = $atmp;
        }
    }

    return $this->aResult = $result;
}
/**
 * Scrape one club page and save a record per season row.
 *
 * The club name is read from the third row of the first table, appended to
 * the global `clubs` list and echoed. Every row of the third table whose
 * first cell is numeric is saved through scraperwiki::save() with the unique
 * key (club, year).
 *
 * @param string $url URL of the club page.
 * @return void
 */
function clubURL($url)
{
    $html = scraperwiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);

    // NOTE(review): the search string below is reproduced as found in the
    // file; confirm whether it was meant to be a non-breaking space.
    $clubName = trim(str_replace(' ', '', $dom->find('table', 0)->find('tr', 2)->plaintext));
    $formatClubName = trim(preg_replace('/\\s+/', ' ', $clubName));

    // `$_GLOBAL` is not a superglobal; writing to it only filled a local
    // array that was discarded on return. $GLOBALS is the real global scope.
    $GLOBALS['clubs'][] = $formatClubName;
    echo 'running ' . $formatClubName . "\n";

    $statsTable = $dom->find('table', 2);
    $rows = is_null($statsTable) ? array() : $statsTable->find('tr');

    foreach ($rows as $row) {
        $cells = $row->find('td');
        if (!isset($cells[0]) || !is_numeric($cells[0]->plaintext)) {
            // Header and spacer rows carry no season number.
            continue;
        }

        // Text of the cell at $i, or '' when the row is shorter than expected.
        $cellText = function ($i) use ($cells) {
            return isset($cells[$i]) ? $cells[$i]->plaintext : '';
        };

        $year = trim($cellText(0));
        $position = trim(str_replace(' ', '', $cellText(1)));
        if (trim($position) == 'Champion') {
            $position = 1;
        }
        $leagueLevel = trim($cellText(2));
        $overallPosition = trim($cellText(3));
        // Dots are stripped so that "12.345" is stored as "12345".
        $avgAttendance = trim(str_replace('.', '', $cellText(4)));
        $totalAttendance = trim(str_replace('.', '', $cellText(12)));

        $dataset = array(
            'club' => $formatClubName,
            'year' => $year,
            'finishedPosition' => $position,
            'league' => $leagueLevel,
            'overallPosition' => $overallPosition,
            'avgAttendance' => $avgAttendance,
            'totalAttendance' => $totalAttendance,
        );
        scraperwiki::save(array('club', 'year'), $dataset);
    }

    /*
     * The next two lines stop a memory leak in Simple HTML DOM as per
     * http://simplehtmldom.sourceforge.net/manual_faq.htm#memory_leak
     */
    $dom->clear();
    unset($dom);
}
/**
 * Scrape one result page of musiklegal.com and recurse into the next page.
 *
 * Every table row with at least four cells is stored through
 * scraperwiki::save_sqlite() keyed by "No". The page offset is then read from
 * the link whose text is "Next"; when it is a non-zero number the function
 * calls itself with it, otherwise the script is terminated with exit.
 *
 * @param int|string $q_num Offset appended to the search-result URL.
 * @return void
 */
function run_ml($q_num = 0)
{
    $html = scraperWiki::scrape("http://musiklegal.com/search/result/a/" . $q_num);
    $dom = new simple_html_dom();
    $dom->load($html);

    foreach ($dom->find("tr") as $data) {
        $tds = $data->find("td");
        if (count($tds) < 4) {
            // Rows without the four data cells cannot be mapped to a record.
            continue;
        }

        // NOTE(review): the two search strings look like anchor markup that
        // was mangled by a formatter; they are kept exactly as found.
        $temp_data = explode('">', str_replace('</<strong>a</strong>>', '', str_replace('<<strong>a</strong> href="http://musiklegal.com/song/detail/', '', $tds[1]->plaintext)));

        $record = array(
            'No' => str_replace('.', '', $tds[0]->plaintext),
            'Code' => $temp_data[0],
            'Song Title' => isset($temp_data[1]) ? $temp_data[1] : null,
            'Artist' => $tds[2]->plaintext,
            'Album' => $tds[3]->plaintext,
        );

        /*
         * Stores results
         */
        scraperwiki::save_sqlite(array("No"), $record);
    }

    // Defaults to 0 so a page without a "Next" link ends the crawl cleanly
    // instead of reading an undefined variable.
    $tmp_a = 0;
    foreach ($dom->find("a") as $a) {
        if ($a->plaintext == 'Next') {
            // As before, the last "Next" link on the page decides the offset.
            $tmp_a = str_replace('http://musiklegal.com/search/result/a/', '', $a->href);
        }
    }

    $dom->clear();
    unset($dom);

    if ((int) $tmp_a != 0) {
        run_ml($tmp_a);
    } else {
        exit;
    }
}
/**
 * Recursively walk the category tree starting at the given page.
 *
 * When the page has a category facet container, the breadcrumb is turned into
 * a path string and every child of the refinements container is recorded
 * (name, path, URL) before the function recurses into that child's URL.
 * Records go to the CSV handle $f when the global $local is truthy, otherwise
 * to scraperwiki::save_sqlite() keyed by "URL".
 *
 * @param string $u URL of the category page.
 * @return void
 */
function getCategories($u)
{
    // $local has to be imported as well; without it the CSV branch below
    // could never run because the name referred to an undefined local.
    global $baseurl, $f, $local;

    $path = "";
    $d = new simple_html_dom();
    $d->load(scraperwiki::scrape($u));
    echo "Loaded URL: " . $u . "\n";

    $facets = $d->find('div[id=ctl00_cphContent_gsaCatFacetContainer]', 0);
    if (is_null($facets)) {
        // No facet container: nothing to descend into from this page.
        return;
    }

    $breadcrumb = $d->find('div[id=breadcrumb]', 0);
    if (!is_null($breadcrumb)) {
        foreach ($breadcrumb->children() as $crumb) {
            $path .= trim($crumb->innertext) . "/";
        }
        // The current category is the text after the last ">" separator.
        $path .= trim(strrchr($breadcrumb->innertext, ">"), "> ");
    }

    $container = $facets->find('div[class=S2refinementsContainer]', 0);
    if (is_null($container)) {
        return;
    }

    foreach ($container->children() as $div) {
        $link = $div->children(0);
        if (is_null($link)) {
            continue;
        }

        // The link text before the first "(" is used as the name.
        $name = trim(strstr($link->innertext, "(", true));
        $url = $baseurl . $link->href;
        $data = array("Name" => $name, "Path" => $path, "URL" => $url);
        echo $path . "/" . $name . "\n";

        if (!empty($local)) {
            fputcsv($f, array($name, $path, $url));
        } else {
            scraperwiki::save_sqlite(array("URL"), $data);
        }

        getCategories($url);
    }
}
/**
 * Compile a template file: read it, convert its DOM with {@see convert()},
 * then apply macros with {@see transform()}.
 *
 * When the template contains an element with a `j:main` attribute only that
 * element is converted; otherwise every node whose parent is the document
 * root is converted in document order.
 *
 * @param string $template Template file path.
 * @return string PHP template content.
 * @throws InvalidTemplateException If template is inaccessible or invalid.
 */
public function compile($template)
{
    $this->currentTemplate = $template;

    $source = file_get_contents($template);
    if ($source === false) {
        throw new InvalidTemplateException(tr('Could not read template: %1', $template));
    }

    $document = new \simple_html_dom();
    if (!$document->load($source, true, false)) {
        throw new InvalidTemplateException(tr('Could not parse template: %1', $template));
    }

    $root = new InternalNode();
    $main = $document->find('[j:main]', 0);

    if (!isset($main)) {
        // No explicit main element: take every top-level node instead.
        foreach ($document->find('*, text') as $node) {
            if ($node->parent->tag == 'root') {
                $root->append($this->convert($node));
            }
        }
    } else {
        $root->append($this->convert($main));
    }

    $this->transform($root);

    return $root->__toString();
}
/**
 * Scrape one Gatherer card page and store it with scraperwiki::save_sqlite().
 *
 * The card name is stored as plain text and the mana cost as the first letter
 * of each mana symbol's alt text. All other detail rows are stored as the
 * string form of their `div.value` node (its markup), exactly as before.
 * Rows that are missing from the page are stored as an empty string. When
 * the page has no name row at all nothing is saved.
 *
 * NOTE(review): storing markup rather than `plaintext` for CMC, type, text
 * etc. looks unintended, but it is kept to avoid changing stored data.
 *
 * @param string $url URL of the card detail page.
 * @return void
 */
function getCardInfo($url)
{
    $baseURL = 'http://gatherer.wizards.com/Pages/Card/';
    $idPrefix = 'ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_';

    $html = scraperWiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);

    // Converts a value to ISO-8859-1, transliterating what cannot be mapped.
    $toLatin1 = function ($value) {
        return iconv("UTF-8", "ISO-8859-1//TRANSLIT", (string) $value);
    };

    // Returns the value node of the detail row "<name>Row", or null.
    $rowNode = function ($row) use ($dom, $idPrefix) {
        return $dom->find('div[id=' . $idPrefix . $row . 'Row] div[class=value]', 0);
    };

    // Returns the converted string form of a detail row, '' when absent,
    // so that null is never handed to iconv().
    $rowValue = function ($row) use ($rowNode, $toLatin1) {
        $node = $rowNode($row);
        return is_null($node) ? '' : $toLatin1($node);
    };

    $nameNode = $rowNode('name');
    if (is_null($nameNode)) {
        // Without a name there is no usable key for the record.
        $dom->clear();
        return;
    }
    $name = $toLatin1($nameNode->plaintext);

    $imageNode = $dom->find('img[id=' . $idPrefix . 'cardImage]', 0);
    // Turns "&amp;" in the image path back into "&".
    $cardImage = is_null($imageNode) ? '' : str_replace("amp;", "", $imageNode->src);
    $imgURL = $baseURL . $cardImage;

    $mana = "";
    foreach ($dom->find('div[id=' . $idPrefix . 'manaRow] div[class=value] img') as $manaItem) {
        $mana .= substr($manaItem->alt, 0, 1);
    }
    $mana = $toLatin1($mana);

    $record = array(
        "Card" => trim($name),
        "Image" => $imgURL,
        "Mana" => trim($mana),
        "CMC" => trim($rowValue('cmc')),
        "Type" => trim($rowValue('type')),
        "Card Text" => trim($rowValue('text')),
        "Flavor Text" => trim($rowValue('flavor')),
        "Artist" => trim($rowValue('artist')),
        "Card Number" => trim($rowValue('number')),
        "Rarity" => trim($rowValue('rarity')),
        "Expansion" => trim($rowValue('set')),
    );

    $dom->clear();

    scraperwiki::save_sqlite(array("card"), $record);
}
/**
 * Rewrite the URLs inside an HTML fragment so they are absolute.
 *
 * Handles `href` on <a> and <link>, `src` on <img> and <script>, and
 * `url(...)` references inside <style> blocks. Each URL is passed through
 * self::absolutizeUrl() together with the base URL.
 *
 * Elements that do not carry the attribute are left untouched. Writing a
 * `src` onto an inline <script> would make browsers ignore its inline code,
 * and writing an `href` onto a named anchor would turn it into a link.
 *
 * @param string $sBaseUrl Base URL the relative URLs are resolved against.
 * @param string $sHtml    HTML to process.
 * @return string The HTML with rewritten URLs.
 */
public static function absolutizeHtml($sBaseUrl, $sHtml)
{
    $oHtml = new simple_html_dom();
    $oHtml->load($sHtml);

    // Tag name => attribute holding the URL, in the original processing order.
    $aUrlAttributes = array(
        'a' => 'href',
        'img' => 'src',
        'script' => 'src',
        'link' => 'href',
    );

    foreach ($aUrlAttributes as $sTagName => $sAttribute) {
        foreach ($oHtml->find($sTagName) as $oTag) {
            $mValue = $oTag->$sAttribute;
            if (!is_string($mValue)) {
                // simple_html_dom yields a boolean when the attribute is
                // absent or has no value; there is no URL to rewrite then.
                continue;
            }
            $oTag->$sAttribute = self::absolutizeUrl($sBaseUrl, $mValue);
        }
    }

    // Parse url() in inline css
    foreach ($oHtml->find('style') as $oTag) {
        $oTag->innertext = preg_replace_callback('|url\\s*\\(\\s*[\'"]?([^\'"\\)]+)[\'"]?\\s*\\)|', function ($aMatches) use ($sBaseUrl) {
            return 'url("' . trim(self::absolutizeUrl($sBaseUrl, $aMatches[1])) . '")';
        }, $oTag->innertext);
    }

    $sResult = $oHtml . '';

    // Break the parser's internal references so the tree can be freed.
    $oHtml->clear();

    return $sResult;
}
/**
 * Fetch the latest draw data and store it in $this->data.
 *
 * The page at self::URL is read through the compress.zlib:// wrapper,
 * converted to UTF-8 and parsed. The draw number (qihao) and the drawn code
 * are read from the "aside" block. The script terminates with a message when
 * the download fails or when the page shows that the draw is still pending.
 *
 * @return void
 */
private function get_data()
{
    include_once 'simplehtmldom_1_5/simple_html_dom.php';
    $simple_html_dom = new \simple_html_dom();

    // Download and decompress in one step via the zlib stream wrapper.
    $data = @file_get_contents("compress.zlib://" . self::URL);
    if (!$data) {
        $this->setLog(false, '重庆时时彩-开奖数据抓取失败');
        exit('重庆时时彩-数据抓取失败,请尽快联系网站管理员' . "\r\n");
    }

    // Convert to UTF-8.
    $encode = mb_detect_encoding($data, array('ASCII', 'UTF-8', 'GB2312', "GBK", 'BIG5'));
    $content = iconv($encode, 'UTF-8', $data);
    $simple_html_dom->load($content);

    $aside = $simple_html_dom->find('div[class=aside]', 0);

    // Draw number.
    $qihao = $aside->find('h3', 0)->find('em', 0)->plaintext;

    // Second row of the result table; its cells hold code and draw state.
    $resultRow = $aside->find('div[class=mod-aside mod-aside-xssckj]', 0)->find('div[class=bd]', 0)->find('div[class=kpkjcode]', 0)->find('table', 0)->find('tr', 1);

    // Drawn code.
    $code = $resultRow->find('td', 1)->plaintext;
    if ($code == '--') {
        exit('重庆时时彩-等待开奖...' . "\r\n");
    }

    // Either marker means the draw has not finished. The former `&&` could
    // never be true because one value cannot equal both strings.
    $isKaiJiang = $resultRow->find('td', 2)->plaintext;
    if ($isKaiJiang == '--' || $isKaiJiang == '开奖中') {
        exit('重庆时时彩-等待开奖...' . "\r\n");
    }

    $simple_html_dom->clear();

    // Remove the spaces between the digits of the drawn code.
    $code = str_replace(" ", '', $code);

    // Time at which the result was collected.
    $kjsj = date('Y-m-d H:i:s');

    $this->data = ['qihao' => $qihao, 'kjsj' => $kjsj, 'code' => $code];
}
/**
 * Scrape one EURES search-result page and insert unseen jobs into `job`.
 *
 * For every `table.JResult` the job URL is taken from the title link's
 * onclick handler, and the description and source from the cells following
 * the "Description:" and "Source:" headers. A job is inserted only when no
 * row with the same URL exists yet. Progress is echoed as HTML.
 *
 * All values are escaped with mysql_real_escape_string() before being put
 * into SQL, and the per-result variables are reset for each result so a
 * result with a missing field cannot reuse the previous result's value.
 *
 * @param string $url_search URL of the search-result page.
 * @param mixed  $country_id Country id stored with every inserted job.
 * @return void
 */
function scraper($url_search, $country_id)
{
    $has_next = false;
    $base_url = "http://ec.europa.eu/eures/eures-searchengine/servlet";

    $html = scraperwiki::scrape($url_search);
    $dom = new simple_html_dom();
    $dom->load($html);

    foreach ($dom->find('table[class=JResult]') as $result) {
        $url_job = null;
        $url_id = '';
        $description = '';
        $source_id = '';

        foreach ($result->find('td[class=JRTitle] a') as $job_page) {
            // The relative job URL is the first quoted argument of onclick.
            $chars = explode("'", $job_page->onclick);
            if (!isset($chars[1])) {
                continue;
            }
            $url_job = $base_url . substr($chars[1], 1);
            $url_id = str_replace('uniqueJvId=', "", strstr($url_job, 'uniqueJvId='));
            echo "JOB: " . $url_job . "<br />";
        }

        foreach ($result->find('th') as $data) {
            $text = trim($data->plaintext);

            if ($text == 'Description:') {
                $valueCell = $data->next_sibling();
                $description = is_null($valueCell) ? '' : trim($valueCell->plaintext);
                echo "DESCRIPTION: " . $description . "<br />";
            }

            if ($text == 'Source:') {
                $valueCell = $data->next_sibling();
                $source = is_null($valueCell) ? '' : trim($valueCell->plaintext);
                // insert_name() receives the value quoted as it always has.
                $source = str_replace("'", "\\'", $source);
                if ($source != '' && $source != ' ') {
                    $source_id = insert_name('source', $source);
                    echo "SOURCE: " . $source . "<br /><br />";
                }
            }
        }

        if (is_null($url_job)) {
            // No job link in this result: there is nothing to key the row on.
            continue;
        }

        $description = str_replace("</BR>", "", $description);

        $sqlUrlJob = mysql_real_escape_string($url_job);
        $sqlUrlId = mysql_real_escape_string($url_id);
        $sqlDescription = mysql_real_escape_string($description);
        $sqlSourceId = mysql_real_escape_string((string) $source_id);
        $sqlUrlSearch = mysql_real_escape_string($url_search);
        $sqlCountryId = mysql_real_escape_string((string) $country_id);

        $sql = mysql_query("SELECT * FROM job WHERE url = '{$sqlUrlJob}'");
        $cont = ($sql !== false) ? mysql_num_rows($sql) : 0;

        if ($cont == 0) {
            mysql_query(
                "INSERT INTO job SET
                    url = '{$sqlUrlJob}',
                    url_id = '{$sqlUrlId}',
                    description = '{$sqlDescription}',
                    source_id = '{$sqlSourceId}',
                    url_search = '{$sqlUrlSearch}',
                    country_id = '{$sqlCountryId}',
                    url_scraper_date = SYSDATE(),
                    url_scraper_hour = SYSDATE()"
            );
        } else {
            echo "Job URL already extracted: " . $url_job . "<br /><br />";
        }
    }

    foreach ($dom->find('div[class=prevNext] a') as $next_page) {
        if ($next_page->plaintext == "Next page") {
            $url_next = $base_url . substr($next_page->href, 1);
            $has_next = true;
            print "<br /><br />NEXT: " . $url_next . "<br /><br />";
        }
    }

    $dom->clear();
    unset($dom);

    // Following the next page is intentionally disabled (test mode).
    // Uncomment this to get all data:
    // if ($has_next == true){
    //     sleep(1);
    //     scraper($url_next, $country_id);
    // }
}
/**
 * Create the table structures described by the <table> nodes of self::$XML.
 *
 * Each table is dropped if it exists and then created from its <pk> node, the
 * children of its <fields> node and the indexes requested by the fields'
 * `add-index` / `index-type` attributes. The outcome is reported through
 * tprintOk() / tprintError().
 *
 * The column and key definitions are collected in a list and joined with
 * commas, so a table without a <pk> node no longer ends its column list with
 * a dangling comma.
 *
 * @param array $configs Uses the keys "table-prefix" and "charset".
 * @return void
 */
private static function createTables($configs)
{
    $tables = self::$XML->find("table");
    foreach ($tables as $value) {
        $tableName = $configs["table-prefix"] . $value->name;
        self::query("DROP TABLE IF EXISTS `{$tableName}`");

        $definitions = array();

        $pk = $value->find("pk", 0);
        if ($pk) {
            $pkSql = "`{$pk->name}` {$pk->type} NOT NULL ";
            if ($pk->ai) {
                $pkSql .= "AUTO_INCREMENT ";
            }
            $definitions[] = $pkSql . "COMMENT '主键'";
        }

        // Add the fields.
        $fields = $value->find("fields", 0);
        if ($fields) {
            foreach ($fields->children() as $fd) {
                $column = "`{$fd->name}` {$fd->type} NOT NULL ";
                if ($fd->default || $fd->default === "0") {
                    // Keyword defaults are emitted bare, all others quoted.
                    if (in_array($fd->default, self::$DEFAULT_VALUE_KEYWORD)) {
                        $column .= "DEFAULT {$fd->default} ";
                    } else {
                        $column .= "DEFAULT '{$fd->default}' ";
                    }
                }
                $definitions[] = $column . "COMMENT '{$fd->comment}'";

                // Create the index.
                if ($fd->getAttribute("add-index") == "true") {
                    $indexType = $fd->getAttribute("index-type");
                    if ($indexType == "normal") {
                        $definitions[] = "KEY `{$fd->name}` (`{$fd->name}`)";
                    } elseif ($indexType == "unique") {
                        $definitions[] = "UNIQUE KEY `{$fd->name}` (`{$fd->name}`)";
                    }
                }
            }
        }

        if ($pk) {
            $definitions[] = "PRIMARY KEY (`{$pk->name}`)";
        }

        $sql = "CREATE TABLE `{$tableName}`(" . implode(",", $definitions)
            . ") ENGINE={$value->engine} DEFAULT CHARSET={$configs['charset']} COMMENT='{$value->comment}' AUTO_INCREMENT=1 ;";

        if (self::query($sql) !== false) {
            tprintOk("create table '{$tableName}' successfully.");
        } else {
            tprintError("create table '{$tableName}' faild.");
            tprintError(self::$DB_CONN->error);
        }
    }
}
/**
 * Discover the feeds of a URL and echo them as a JSON list.
 *
 * When the document itself is a feed (it has a <channel> or <feed> element)
 * the URL is reported as "RSS" or "Atom". Otherwise every <link> element
 * whose type matches application/...rss or application/...atom is reported
 * once, with its href and title.
 *
 * @param string $url Site or feed URL; the scheme may be omitted.
 * @return void The JSON-encoded list is written to the output.
 */
public function find($url)
{
    # sanitize url
    $url = urldecode(strip_tags($url));

    # Remove one trailing slash (safe on an empty string)
    if (substr($url, -1) === '/') {
        $url = substr($url, 0, -1);
    }

    # Add a scheme only when none is present; https URLs are kept as they are
    if (!preg_match('#^https?://#i', $url)) {
        $url = 'http://' . $url;
    }

    # List of feed urls
    $results = array();

    # Try to load the content of url
    $content = @file_get_contents($url);
    if (!empty($content)) {
        $html = new simple_html_dom();
        $html->load($content, true);

        # Check if it is an url to feeds
        if (count($html->find('channel')) > 0) {
            $results[] = array('href' => $url, 'title' => 'RSS');
        } elseif (count($html->find('feed')) > 0) {
            $results[] = array('href' => $url, 'title' => 'Atom');
        } else {
            # hrefs already reported, used to drop duplicates
            $seen = array();

            foreach ($html->find('link') as $link) {
                $href = $link->href;
                if (!is_string($href) || $href === '' || isset($seen[$href])) {
                    continue;
                }

                # Save only feeds; ereg() no longer exists in PHP 7+
                if (preg_match('/application.*(rss|atom)/', (string) $link->type)) {
                    $seen[$href] = true;
                    $results[] = array('href' => $href, 'title' => $link->title);
                }
            }
        }

        $html->clear();
    }

    # transform results into json
    echo json_encode($results);
}
/**
 * Build the list of page images of a chapter.
 *
 * The page numbers come from the options of the "pagejump" select box and the
 * image directory from the first image inside `#page`. Each entry maps the
 * target file name "<prefix>-<infix>-<page>.jpg" to the image URL
 * "<directory>/<page>.jpg".
 *
 * @param string $chapter_url URL of the chapter page.
 * @param string $prefix      First part of the target file names.
 * @param string $infix       Second part of the target file names.
 * @return array Target file name => image URL; empty when the page lacks the
 *               select box or the image.
 */
public function get_images($chapter_url, $prefix, $infix)
{
    // NOTE(review): $ifx is computed but never used; the file names below use
    // the raw $infix. Confirm whether the padded value was meant to be used.
    $ifx = Text::create($infix)->pad(3)->to_s();

    $p = new Page($chapter_url);
    $h = new simple_html_dom();
    $h->load($p->content());

    $pages = array();

    $select = $h->find('select[name="pagejump"]', 0);
    $container = $h->find('#page', 0);
    $img = $container ? $container->find('img', 0) : null;

    // Both elements are needed; without them there is nothing to list.
    if ($select && $img) {
        $srcdir = dirname($img->src);
        foreach ($select->find('option') as $opt) {
            $pages["{$prefix}-{$infix}-{$opt->value}.jpg"] = $srcdir . '/' . $opt->value . '.jpg';
        }
    }

    $h->clear();

    return $pages;
}
/**
 * Extract a few fields of a job offer page and print them as JSON.
 *
 * The fields are read from the `span.texteANPEDetail` elements inside
 * `div.tx-sqliwebServiceanpe-pi5`: span 1 is the update text, span 5 the
 * contract type and span 6 the experience. Missing spans yield null and a
 * too-short update text yields 0 for the day instead of raising warnings.
 *
 * NOTE(review): the same record is printed once per span, as before; this
 * looks like leftover exploration code. Confirm before relying on it.
 *
 * @param string $html      HTML of the offer page.
 * @param mixed  $reference Not used by this function.
 * @return void
 */
function scrappe_offre($html, $reference)
{
    $dom = new simple_html_dom();
    $dom->load($html);

    $span = $dom->find("div.tx-sqliwebServiceanpe-pi5 span.texteANPEDetail");

    // Plain text of span $i, or null when the page has fewer spans.
    $textAt = function ($i) use ($span) {
        return isset($span[$i]) ? $span[$i]->plaintext : null;
    };

    $actualisation = $textAt(1);

    $record = array(
        // Characters 29 and 30 of the update text are parsed as the day.
        'actualiseJJ' => intval(substr((string) $actualisation, 29, 2)),
        'actualise le' => $actualisation,
        'type_contrat' => $textAt(5),
        'analyse_type_contrat' => "",
        'experiance' => $textAt(6),
    );
    $line = json_encode($record) . "\n";

    foreach ($span as $data) {
        print $line;
        #scraperwiki::save(array('contenu_offre'), $record);
    }
}
/**
 * Scrape the flight table for one request type and save every flight row.
 *
 * Rows are the <tr> elements with HEIGHT='25'. Their first six cells are
 * cleaned with removeSpaces() and stored through saveData() together with
 * today's date and the given flight type. The header row is skipped.
 *
 * @param string $param Value of the `type` query parameter.
 * @param mixed  $type  Flight type stored with every record.
 * @return void
 */
function scrapeHTML($param, $type)
{
    $dom = new simple_html_dom();
    $dom->load(scraperWiki::scrape(BASE_URL . "?type={$param}"));

    foreach ($dom->find("TR[@HEIGHT='25']") as $row) {
        $cells = $row->find("td");

        // Airline, flight number, destination, time, gate, remarks.
        $values = array();
        for ($i = 0; $i <= 5; $i++) {
            $values[] = removeSpaces($cells[$i]->plaintext);
        }
        list($airline, $flight_num, $destination, $time, $gate, $remarks) = $values;

        // The header row repeats the column title in the airline cell.
        if ($airline == "Airline") {
            continue;
        }

        saveData(
            array("date", "airline", "flight_type", "flight_num"),
            array(
                "date" => date("m.d.y"),
                "airline" => $airline,
                "flight_type" => $type,
                "flight_num" => $flight_num,
                "destination" => $destination,
                "time" => $time,
                "gate" => $gate,
                "remarks" => $remarks,
            )
        );
    }

    $dom->clear();
}
/**
 * Looks the game up directly by URL: the search term is turned into a slug
 * and the game page is requested. The search counts as found when the page
 * loads without the site's error text and exposes a game id.
 *
 * @return bool
 */
public function search()
{
    if (empty($this->searchTerm)) {
        return false;
    }

    $this->_title = $this->searchTerm;

    // Periods, underscores, dashes and anything in parentheses become spaces.
    $slug = preg_replace('#\\(.*?\\)|[-._]#i', ' ', $this->searchTerm);
    // Collapse runs of whitespace and trim the ends.
    $slug = trim(preg_replace('#\\s{2,}#', ' ', $slug));
    // Game URLs use lower case with "-" in place of whitespace.
    $this->searchTerm = preg_replace('#\\s#', '-', strtolower($slug));

    $gameUrl = self::DESURAURL . '/games/' . $this->searchTerm;
    if ($this->getUrl($gameUrl) === false) {
        return false;
    }
    if (preg_match('#(Games system error)#i', $this->_response)) {
        return false;
    }

    $this->_ret = $this->_html->find("a#watchtoggle", 0);
    if (!$this->_ret) {
        return false;
    }
    if (!preg_match('#siteareaid=(?<gameid>\\d+)#', $this->_ret->href, $matches)) {
        return false;
    }

    $this->_desuraGameID = $matches['gameid'];
    $this->_directURL = $gameUrl;

    return true;
}
/**
 * Runs the site search for the search term, opens the selected hit and
 * compares its title with the search term.
 *
 * @return bool True when the titles are at least 90% similar.
 */
public function search()
{
    if (!isset($this->searchTerm)) {
        return false;
    }

    $this->_trailUrl = self::TRAILINGSEARCH . urlencode($this->searchTerm);
    if ($this->getUrl() === false) {
        return false;
    }

    $hit = $this->_html->find('div.product-info, div.title', 1);
    if (!$hit) {
        return false;
    }

    $this->_title = trim($hit->plaintext);

    // Comparison title: no "XXX" marker, no parenthesised parts, and
    // dashes, dots and underscores replaced by spaces.
    $title = preg_replace('/XXX/', '', $hit->plaintext);
    $title = trim(preg_replace('/\\(.*?\\)|[-._]/i', ' ', $title));

    $link = $hit->find('a', 0);
    if (!$link) {
        return false;
    }

    $this->_trailUrl = trim($link->href);
    if ($this->getUrl() === false) {
        return false;
    }

    $permalink = $this->_html->find('#link-to-this', 0);
    if ($permalink) {
        $this->_directUrl = trim($permalink->href);
    }

    similar_text(strtolower($this->searchTerm), strtolower($title), $p);

    return $p >= 90;
}
/**
 * Runs the site search and looks for a cover image whose alt text matches
 * the search term exactly once all non-word characters are removed.
 *
 * For the first such image whose detail page loads, the title, trailing URL
 * and direct URL are set and the search stops. Stopping there keeps these
 * properties consistent with the page that was actually loaded; later
 * candidates can no longer overwrite them.
 *
 * @return bool - true if search = 100%
 */
public function search()
{
    $result = false;

    if (!isset($this->searchTerm)) {
        return $result;
    }

    $this->_trailUrl = self::TRAILINGSEARCH . urlencode($this->searchTerm);
    if ($this->getUrl() === false) {
        return $result;
    }

    // Identical for every candidate, so it is computed once.
    $comparesearch = preg_replace('/[^\\w]/', '', $this->searchTerm);

    foreach ($this->_html->find('img[rel=license]') as $ret) {
        if (!isset($ret->alt)) {
            continue;
        }

        $title = trim($ret->alt, '"');
        $title = preg_replace('/XXX/', '', $title);
        $comparetitle = preg_replace('/[^\\w]/', '', $title);

        similar_text($comparetitle, $comparesearch, $p);
        if ($p != 100) {
            continue;
        }

        // The SKU is the numeric file name of the cover image.
        if (!preg_match('/\\/(?<sku>\\d+)\\.jpg/i', $ret->src, $matches)) {
            continue;
        }

        $this->_title = trim($title);
        $this->_trailUrl = "/dvd_view_" . (string) $matches['sku'] . ".html";
        $this->_directUrl = self::ADMURL . $this->_trailUrl;

        if ($this->getUrl() !== false) {
            $result = true;
            break;
        }
    }

    return $result;
}
/**
 * Parse the print model of the form, cache the resulting elements and save
 * the processed model.
 *
 * In update mode the `data-id` attributes are removed from the model and the
 * item counter starts at 0; otherwise the model is used as is and the counter
 * starts at the form's current item maximum.
 *
 * @param bool $isUpdate Whether to renumber the items from scratch.
 * @return void
 */
public function parse($isUpdate = false)
{
    Ibos::import("application.extensions.simple_html_dom", true);

    $model = $this->printmodel;
    $max = intval($this->itemmax);
    if ($isUpdate) {
        $model = preg_replace("/\\s+data-id\\s?=\\s?\"?\\d+\"?/i", "", $model);
        $max = 0;
    }

    $elements = array();
    $doc = new simple_html_dom();
    $doc->load($model, true, true, CHARSET);

    $items = $doc->find("ic");
    $config = $this->getItemConfig();
    if (!(empty($items) || empty($config))) {
        $this->refactor($items, $config, $max, $elements);
    }

    $html = $doc->save();

    $this->_cache = $elements;
    CacheUtil::set("form_" . $this->ID, $elements);

    $form = array("printmodelshort" => $html);
    // Only write the counter back when it differs from the stored value.
    if ($max != $this->itemmax) {
        $form["itemmax"] = $max;
    }

    $doc->clear();
    FlowFormType::model()->modify($this->ID, $form);
}
/**
 * Parse a scraped result page into a list of item records.
 *
 * For every `li.s-result-item` the image source, title, price and a numeric
 * price (first character dropped, commas removed) are collected. Items that
 * yield no field at all are left out.
 *
 * @param string $input Raw HTML of the result page.
 * @return array|null The records keyed by item position, also stored in
 *                    $this->aResult; null when no record was found.
 */
protected function parsing($input)
{
    include_once "inc/simple_html_dom.php";

    $html = new simple_html_dom();
    $html->load($input);

    $result = [];
    foreach ($html->find('li[class=s-result-item]') as $key => $innerData) {
        // Start from a clean record so a field missing on this item cannot
        // inherit the value of the previous item.
        $atmp = [];

        foreach ($innerData->find('img[class=s-access-image]') as $img) {
            $atmp['image'] = $img->getAttribute('src');
        }
        foreach ($innerData->find('h2[class=s-access-title]') as $title) {
            $atmp['title'] = $title->innertext();
        }
        foreach ($innerData->find('span[class=s-price]') as $price) {
            $priceText = $price->innertext();
            $atmp['price'] = $priceText;
            // Drop the leading character and the thousands separators.
            $atmp['numPrice'] = str_replace(",", '', substr($priceText, 1));
        }

        if (!empty($atmp)) {
            $result[$key] = $atmp;
        }
    }

    if (!empty($result)) {
        return $this->aResult = $result;
    }
}
/**
 * Wraps the given markup in <html><body>…</body></html>, parses it and
 * returns the children of the resulting <html> element.
 *
 * @param string $html Markup to parse.
 * @return array Child nodes of the <html> element (returned by reference).
 */
private function &_getHtmlDomArray($html)
{
    $document = new simple_html_dom();
    $document->load('<html><body>' . $html . '</body></html>');

    $rootElement = $document->find('html', 0);
    $children = $rootElement->children();

    return $children;
}
/**
 * Return the number of results of a listing query and the median price.
 *
 * The first page gives the total result count. The median is the price of the
 * middle row, or the rounded mean of the two middle rows for an even count.
 * The rows are addressed by their zero-based position in the whole listing,
 * so each is fetched from the page that really contains it. This also works
 * when the median falls on the last row of a page or the two middle rows sit
 * on different pages.
 *
 * Assumes that the request built from $params returns the first page and
 * that the `ak` parameter is the zero-based offset of a page's first row,
 * as the previous implementation did.
 *
 * @param array  $params Query parameters of the listing.
 * @param string $url    Listing endpoint.
 * @return array 'n' => total result text as found on the page (0 when it is
 *               missing), 'val' => median price (0 when it cannot be read).
 */
function read_listing($params, $url = 'http://www.auto24.ee/kasutatud/nimekiri.php')
{
    // Downloads and parses the listing page for the given parameters.
    $fetchPage = function ($queryParams) use ($url) {
        $page = new simple_html_dom();
        $page->load(scraperWiki::scrape(build_query($url, $queryParams)));
        return $page;
    };

    $firstPage = $fetchPage($params);
    $totalResultsEl = $firstPage->find('.paginator .current-range strong');
    $totalResults = isset($totalResultsEl[0]) ? $totalResultsEl[0]->plaintext : 0;
    $total = (int) $totalResults;

    // Parsed pages keyed by the offset of their first row.
    $pages = array(0 => $firstPage);

    // Price of the row at the zero-based listing position $index.
    $priceAt = function ($index) use (&$pages, $fetchPage, $params) {
        $rowOnPage = $index % RESULTS_PER_PAGE;
        $offset = $index - $rowOnPage;
        if (!isset($pages[$offset])) {
            $pageParams = $params;
            $pageParams['ak'] = $offset;
            $pages[$offset] = $fetchPage($pageParams);
        }
        $rows = $pages[$offset]->find("[@id=usedVehiclesSearchResult] .result-row");
        if (!isset($rows[$rowOnPage])) {
            return 0;
        }
        $rowData = get_row_data($rows[$rowOnPage]);
        return $rowData['price'];
    };

    $a24ksi = 0;
    if ($total > 0) {
        // Both indexes are equal for an odd count and adjacent for an even one.
        $lPoint = (int) floor(($total - 1) / 2);
        $hPoint = (int) floor($total / 2);
        if ($lPoint == $hPoint) {
            $a24ksi = $priceAt($lPoint);
        } else {
            $a24ksi = round(($priceAt($lPoint) + $priceAt($hPoint)) / 2);
        }
    }

    foreach ($pages as $page) {
        $page->clear();
    }

    return array('n' => $totalResults, 'val' => $a24ksi);
}
/**
 * Collect the movie URLs of an entry.
 *
 * Looks at the images inside the links of `div.entryBody div.topmore` whose
 * alt text starts with "動画" and resolves the href of the enclosing <a>
 * through UriManager. When the third sibling after that link is a <span>,
 * the whole result is discarded and an empty list is returned.
 *
 * @param simple_html_dom $html
 * @return array
 **/
public function getMoviesUrl($html)
{
    $query = 'div.entryBody div.topmore a img';
    $movies_els = $html->find($query);
    $movie_data = array();
    $manager = new UriManager();

    // Pick up the links marked with the "movie is here" image.
    foreach ($movies_els as $movies_el) {
        if (!preg_match('/^動画.+/', $movies_el->getAttribute('alt'))) {
            continue;
        }

        // The link is taken from the parent <a> tag.
        $parent_el = $next_el = $movies_el->parentNode();

        // Walk up to three siblings forward; stops early (leaving null) when
        // there are fewer than three.
        for ($i = 0; $i < 3 && !is_null($next_el); $i++) {
            $next_el = $next_el->nextSibling();
        }

        // A missing sibling is not a <span>. Without the null check this
        // call failed whenever the walk above ran out of siblings.
        if (!is_null($next_el) && $next_el->nodeName() == 'span') {
            $movie_data = [];
            break;
        }

        if (!is_null($parent_el) && $parent_el->nodeName() == 'a') {
            $movie_data[] = $manager->resolve($parent_el->getAttribute('href'));
        }
    }

    return $movie_data;
}
/**
 * Scrape the detail page of an organisation and add what is found to it.
 *
 * Every paragraph of the page is checked: a paragraph mentioning "Website"
 * or "Email" contributes the href or text of its first link, and each
 * wanted info label is extracted with extractInfo() until it has been
 * found once. The enriched record is printed with print_r() and returned.
 *
 * @param array $ngo Record with at least the key "url".
 * @return array The record with the scraped details added.
 */
function scrapeDetails($ngo)
{
    $html_content = scraperwiki::scrape($ngo["url"]);
    $dom = new simple_html_dom();
    $dom->load($html_content);

    $infosWeWant = array('Telefon', 'Rechtsform', 'Steuerstatus', 'Weltanschauliche Ausrichtung', 'Anzahl Mitarbeiter', 'Gesamteinnahmen:', 'Davon Sammlungseinnahmen', 'Bezugsjahr:');

    // Scrape details from all paragraphs
    foreach ($dom->find('p') as $p) {
        $mentionsWebsite = strstr($p->plaintext, "Website");
        $mentionsEmail = strstr($p->plaintext, "Email");

        if ($mentionsWebsite || $mentionsEmail) {
            // A paragraph can mention the word without containing a link;
            // in that case any value found earlier is kept.
            $link = $p->find('a', 0);
            if (!is_null($link)) {
                if ($mentionsWebsite) {
                    $ngo["website"] = $link->href;
                }
                if ($mentionsEmail) {
                    $ngo["email"] = $link->plaintext;
                }
            }
        }

        foreach ($infosWeWant as $key => $info) {
            $res = extractInfo($p, $info);
            if ($res) {
                $ngo[$info] = $res;
                // Do not search for this info again
                unset($infosWeWant[$key]);
            }
        }
    }

    print_r($ngo);

    return $ngo;
}
/**
 * Collect the movie URLs of an entry.
 *
 * Two sources are used: the `src` of the iframes inside the video container
 * and the `href` of the entry links whose text matches the link marker. Each
 * URL is resolved through UriManager before it is added.
 *
 * @param simple_html_dom $html
 * @return array
 **/
public function getMoviesUrl($html)
{
    $frames = $html->find('div.ently_body div.ently_text div.video-container iframe');
    $movie_data = array();
    $manager = new UriManager();

    // Embedded players.
    foreach ($frames as $frame) {
        if (!$frame->hasAttribute('src')) {
            continue;
        }
        $resolved = $manager->resolve($frame->getAttribute('src'));
        if ($resolved !== false) {
            $movie_data[] = $resolved;
        }
    }

    // Text links whose label carries the link marker.
    $anchors = $html->find('div.ently_outline div.ently_body a');
    foreach ($anchors as $anchor) {
        $label = $anchor->plaintext;
        if (!preg_match('/リンク(/', $label) || !$anchor->hasAttribute('href')) {
            continue;
        }
        $resolved = $manager->resolve($anchor->getAttribute('href'));
        if ($resolved) {
            $movie_data[] = $resolved;
        }
    }

    return $movie_data;
}
/**
 * Load one "browse by letter" page of the directory and echo its lists.
 *
 * When the letter or page is not given, the value saved under
 * `last_alphabet_loaded` / `last_page_loaded` is used, falling back to
 * letter "a" and page 1.
 *
 * @param string     $last_alphabet Letter to load, '' to resume.
 * @param int|string $last_page     Page to load, '' to resume.
 * @return void
 */
function scrap_yp($last_alphabet = '', $last_page = '')
{
    if (is_null($last_alphabet) || $last_alphabet == '') {
        $savedLetter = scraperwiki::get_var('last_alphabet_loaded');
        $last_alphabet = is_null($savedLetter) ? 'a' : $savedLetter;
    }

    if (is_null($last_page) || $last_page == '') {
        $savedPage = scraperwiki::get_var('last_page_loaded');
        $last_page = is_null($savedPage) ? 1 : $savedPage;
    }

    $dom = new simple_html_dom();
    $dom->load(scraperWiki::scrape('http://www.yellowpages.co.id/browse/letter/' . $last_alphabet . '?page=' . $last_page));

    foreach ($dom->find("ul.directory-list") as $list) {
        echo $list;
    }
}
/**
 * Get top10 type news.
 *
 * Takes the `div.newslist` block at the given position and reports every
 * linked paragraph with a non-empty title to the workflow, together with the
 * block's icon. Paragraphs without a link are skipped.
 *
 * @param $index int Zero-based position of the news block.
 * @return boolean False when no document is loaded, true otherwise (also
 *                 when the requested block does not exist).
 */
private function getTopTenNews($index)
{
    if (!$this->htmlDom) {
        return false;
    }

    $blocks = $this->htmlDom->find('div.newslist');
    if (!isset($blocks[$index])) {
        return true;
    }
    $element = $blocks[$index];

    // A block without an icon image yields the bare base URL, as before.
    $iconNode = $element->find('dd.desc > img', 0);
    $iconSrc = $iconNode ? (string) $iconNode->src : '';
    $iconUrl = static::URL_BASE . ltrim($iconSrc, '/');
    $iconPath = $this->getNewsIconFilePath($iconUrl, static::NEWS_ICON_DIR_TOP10);

    foreach ($element->find('p') as $e) {
        $urlNode = $e->find('a', 0);
        if (!$urlNode) {
            continue;
        }

        $title = iconv('GB18030', 'UTF-8', trim($urlNode->plaintext));
        if (!$title) {
            continue;
        }

        $url = static::URL_BASE . ltrim($urlNode->href, '/');

        // The article id is the number in ".../articles/<id>.htm".
        $id = 0;
        if (preg_match("/articles\\/([0-9]+).htm/", $url, $matches)) {
            $id = $matches[1];
        }

        $this->workflow->result($id, $url, $title, '', $iconPath);
    }

    return true;
}
/**
 * Scrape one news article and save a teaser for it.
 *
 * The plain text of all article paragraphs is concatenated, each followed by
 * a " #<n># " marker with its running number. A 120-unit teaser is built
 * from it with word_teaser() and saved together with the URL, keyed by URL.
 *
 * @param string $art_url URL of the article.
 * @return array The saved record ("TeaserM", "URL").
 */
function scrape_NG_news_article($art_url)
{
    $html = scraperWiki::scrape($art_url);
    require_once 'scraperwiki/simple_html_dom.php';

    $dom = new simple_html_dom();
    $dom->load($html);

    $art_text_full = "";
    $paragraphNo = 0;
    foreach ($dom->find("div#content div.article_text p") as $paragraph) {
        $paragraphNo++;
        // Each paragraph is re-parsed on its own to get its plain text.
        $art_text_full .= str_get_html($paragraph)->plaintext . " #" . $paragraphNo . "# ";
    }

    // The short teaser is computed but is not part of the saved record.
    $art_teaserS = word_teaser($art_text_full, 60);
    $art_teaserM = word_teaser($art_text_full, 120);

    $record = array("TeaserM" => $art_teaserM, "URL" => $art_url);
    scraperwiki::save(array('URL'), $record);

    return $record;
}
/**
 * Scrape a fixed low-fare page and save every matching row.
 *
 * Rows are the <tr> elements with HEIGHT='25'. The text of their first six
 * <div> elements is cleaned with removeSpaces() and stored through saveData()
 * together with today's date and the given type. A row whose first value is
 * "Airline" is skipped.
 *
 * @param mixed $param Not used; the request URL is fixed.
 * @param mixed $type  Flight type stored with every record.
 * @return void
 */
function scrapeHTML($param, $type)
{
    $dom = new simple_html_dom();
    $dom->load(scraperWiki::scrape("http://www.norwegian.no/fly/lavpris/?D_City=CPH&A_City=DUB&TripType=2&D_Day=1&D_Month=201104&R_Day=1&R_Month=201104&AdultCount=1&ChildCount=0&InfantCount=0"));

    foreach ($dom->find("TR[@HEIGHT='25']") as $row) {
        $cells = $row->find("div");

        // Airline, flight number, destination, time, gate, remarks.
        $values = array();
        for ($i = 0; $i <= 5; $i++) {
            $values[] = removeSpaces($cells[$i]->plaintext);
        }
        list($airline, $flight_num, $destination, $time, $gate, $remarks) = $values;

        // The header row repeats the column title in the first value.
        if ($airline == "Airline") {
            continue;
        }

        saveData(
            array("date", "airline", "flight_type", "flight_num"),
            array(
                "date" => date("m.d.y"),
                "airline" => $airline,
                "flight_type" => $type,
                "flight_num" => $flight_num,
                "destination" => $destination,
                "time" => $time,
                "gate" => $gate,
                "remarks" => $remarks,
            )
        );
    }

    $dom->clear();
}
/**
 * Searches for a 100% match.
 *
 * Runs the Greenlight search for the trimmed search term and walks the item
 * titles. For the first title that cleanTitles() accepts and whose parent
 * markup contains a game id, the direct game URL is requested; the search
 * succeeds when that request does.
 *
 * A matching title without an extractable id is skipped, so an id left over
 * from an earlier candidate is never used to build the URL.
 *
 * @return bool
 */
public function search()
{
    $result = false;

    if (empty($this->searchTerm)) {
        return $result;
    }

    $this->searchTerm = trim($this->searchTerm);
    $searchUrl = self::GREENLIGHTURL . '?searchtext=' . urlencode($this->searchTerm) . self::GREENLIGHTVARS;
    if ($this->getUrl($searchUrl) === false) {
        return $result;
    }

    foreach ($this->_html->find("div.workshopItemTitle") as $ret) {
        $this->_title = trim($ret->plaintext);

        // Sanitize both searchTerm and title for a positive 100% match
        if ($this->cleanTitles(strtolower($this->_title), strtolower($this->searchTerm)) !== true) {
            continue;
        }

        $parent = $ret->parent();
        if (!$parent || !preg_match('#id?=(?<gameid>\\d+)#', $parent->outertext, $matches)) {
            continue;
        }

        $this->_greenlightGameID = $matches['gameid'];
        $this->_directURL = self::DIRECTGAMEURL . $this->_greenlightGameID;

        if ($this->getUrl($this->_directURL) !== false) {
            $result = true;
            break;
        }
    }

    return $result;
}