/**
 * Builds a ParserDom from the given markup and runs a fixed set of selector
 * lookups against it. The lookup results are discarded and nothing is returned.
 *
 * @param mixed $sHtml value handed unchanged to the \HtmlParser\ParserDom constructor
 */
function test($sHtml)
{
    $parser = new \HtmlParser\ParserDom($sHtml);

    // Indexed lookup first, then the three plain tag lookups in the original order.
    $parser->find('ul.uni-blk-list02', 0);
    foreach (array('a', 'ul', 'p') as $tagSelector) {
        $parser->find($tagSelector);
    }
}
/**
 * Downloads a page and queues every image found on it as a type-1 entry.
 *
 * Only entries with type 2 are accepted. The page is always downloaded, but it
 * is only parsed when its host is www.18weixin.com; for any other host nothing
 * is queued and true is returned.
 *
 * @param array $data must provide the keys 'url' and 'type'
 * @return mixed true on success, otherwise whatever self::_err() returns
 */
public static function parseImages($data)
{
    if (!$data || !isset($data['url']) || !isset($data['type'])) {
        return self::_err('param not enough ~~');
    }
    if ($data['type'] != 2) {
        return self::_err('just handle type => 2( website need to parse to get images) ~~');
    }

    $url = $data['url'];
    $host = parse_url($url, PHP_URL_HOST);
    if (!$host) {
        return self::_err('no host');
    }

    $content = HttpClient::get($url);
    switch ($host) {
        case "www.18weixin.com":
            // NOTE(review): mb_convert_encoding() takes (string, to, from), so this
            // converts UTF-8 -> GB2312. Confirm that is the intended direction.
            $content = mb_convert_encoding($content, 'gb2312', 'utf-8');
            $html_dom = new \HtmlParser\ParserDom($content);

            // An indexed find() is assumed to yield a falsy value when nothing
            // matches; calling ->find() on that would be a fatal error.
            $tmp_images_wrap = $html_dom->find('div.picadd', 0);
            if (!$tmp_images_wrap) {
                return self::_err('no div.picadd found ~~');
            }

            $tmp_images = $tmp_images_wrap->find("img");
            if (!$tmp_images) {
                // Container present but without images: nothing to queue.
                break;
            }

            foreach ($tmp_images as $tmp_image_target) {
                $tmp_target_url = $tmp_image_target->getAttr("src");
                if (!$tmp_target_url) {
                    continue;
                }
                // Anything not starting with "http" is treated as a path on the same host.
                if (substr($tmp_target_url, 0, 4) != "http") {
                    $tmp_target_url = "http://" . $host . $tmp_target_url;
                }
                self::addQueue(['url' => $tmp_target_url, 'type' => 1]);
            }
            break;
    }

    return true;
}
/**
 * Running this once a day is enough.
 * php yii emoticon/scrapy/weixin18
 *
 * Downloads one listing page from www.18weixin.com and queues the first link
 * inside every div.imgborder_bqimg_width element as a type-2 entry.
 *
 * @param int|string $page value substituted into the listing URL
 * @return mixed whatever $this->echoLog() returns
 */
public function actionWeixin18($page = 1)
{
    $host = 'http://www.18weixin.com';
    $url = sprintf($host . '/weixinbiaoqing_%s.shtml', $page);

    $content = HttpClient::get($url);
    // NOTE(review): mb_convert_encoding() takes (string, to, from), so this
    // converts UTF-8 -> GB2312. Confirm that is the intended direction.
    $content = mb_convert_encoding($content, 'gb2312', 'utf-8');

    $html_dom = new \HtmlParser\ParserDom($content);
    $img_wrap_array = $html_dom->find('div.imgborder_bqimg_width');
    if (!$img_wrap_array) {
        return $this->echoLog("error:no img tag ~~");
    }

    foreach ($img_wrap_array as $_item) {
        // A wrapper without an anchor would otherwise cause a fatal error on
        // ->getAttr(); skip it instead of aborting the whole run.
        $tmp_link = $_item->find("a", 0);
        if (!$tmp_link) {
            continue;
        }

        // Without an href the queued URL would be just the bare host.
        $tmp_href = $tmp_link->getAttr("href");
        if (!$tmp_href) {
            continue;
        }

        EmoticonService::addQueue(['url' => $host . $tmp_href, 'type' => 2]);
    }

    return $this->echoLog("it's over ~~");
}
/**
 * Runs a series of ParserDom::find() lookups against the markup returned by
 * self::getHtml() and checks the text / attributes of the matched nodes.
 */
public function testDom()
{
    $dom = new \HtmlParser\ParserDom(self::getHtml());

    // Single-node lookups (selector + index).
    $this->assertEquals('p4', $dom->find('p', -1)->getPlainText());
    $this->assertEquals('p_id', $dom->find('p[id]', 0)->getPlainText());
    $this->assertEquals('p_id_2', $dom->find('p[id=p_id_2]', 0)->getPlainText());
    $this->assertEquals('p2', $dom->find('p[!id]', 1)->getPlainText());
    $this->assertEquals('测试1', $dom->find('#test1', 0)->getPlainText());

    // Class selector: text and the raw class attribute.
    $firstClassNode = $dom->find('p.test_class1', 0);
    $this->assertEquals('p1', $firstClassNode->getPlainText());
    $this->assertEquals('test_class test_class1', $firstClassNode->getAttr('class'));

    // Multi-node lookup: count first, then the texts in document order.
    $expectedTexts = array('p1', 'p2', 'p3', 'p_id', 'p_id_2');
    $classNodes = $dom->find('p.test_class');
    $this->assertEquals(5, count($classNodes));

    $actualTexts = array_values(array_map(
        function ($node) {
            return $node->getPlainText();
        },
        $classNodes
    ));
    $this->assertEquals($expectedTexts, $actualTexts);

    $this->assertEquals($dom->node instanceof \DOMNode, true);
}
// Demo part 1: parse an inline document and print text, attributes and markup
// of a few selected nodes.
$html = '<html> <head> <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> <title>test</title> </head> <body> <p class="test_class test_class1">p1</p> <p class="test_class test_class2">p2</p> <p class="test_class test_class3">p3</p> <div id="test1"><span style="display: none">测试1<br/></span><input date=\'"sdfsf"\' name="test" value="123"/>123123</div> </body> </html>';
$html_dom = new \HtmlParser\ParserDom($html);

$p_array = $html_dom->find('p.test_class');
$p1 = $html_dom->find('p.test_class1', 0);
$div = $html_dom->find('div#test1', 0);

foreach ($p_array as $p) {
    echo $p->getPlainText() . "\n";
}
echo $div->getPlainText() . "\n";
echo $p1->getPlainText() . "\n";
echo $p1->getAttr('class') . "\n";

echo "show html:\n";
echo $div->innerHtml() . "\n";
echo $div->outerHtml() . "\n\n";

// Demo part 2: fetch a live page and print one list from it. Both the download
// and the lookup can fail (network error, changed markup), so each result is
// checked before it is used.
$url = 'http://www.sina.com.cn/';
$sHtml = file_get_contents($url);
if ($sHtml === false || $sHtml === '') {
    echo "failed to fetch " . $url . "\n";
} else {
    $oDom = new \HtmlParser\ParserDom($sHtml);
    $oFound = $oDom->find('ul.uni-blk-list02', 0);
    if (!$oFound) {
        echo "no element matches ul.uni-blk-list02\n";
    } else {
        echo "inner:\n\n" . $oFound->innerHtml() . "\n\n";
        echo "outer:\n\n" . $oFound->outerHtml() . "\n";
    }
}
<?php
// Looks up $query on dict.youdao.com, extracts the translations from the
// returned page and emits them as XML through the Workflows helper.
require_once 'workflows.php';
require_once 'ParserDom.php';

$wf = new Workflows();

// NOTE(review): "{query}" looks like a placeholder substituted before the script runs — confirm.
$query = "{query}";
$query = str_replace('\\ ', ' ', trim($query));

$request = file_get_contents('http://dict.youdao.com/search?q=' . urlencode($query) . '&keyfrom=dict.top');

// Only parse when the download produced content; on failure the script still
// falls through to toxml() so the caller gets a well-formed (empty) answer.
if ($request !== false && $request !== '') {
    $html_dom = new \HtmlParser\ParserDom($request);

    if (strstr($query, ' ')) {
        // Query contains a space: use the second paragraph of the translation container.
        $result = $html_dom->find('#fanyiToggle .trans-container p');
        if ($result && count($result) >= 3) {
            $translation = $result[1];
            $wf->result($query, $query, $query, trim($translation->getPlainText()), 'icon.png');
        }
    } else {
        // Query without a space: one result per list entry, split at the dots into
        // a leading type label and the remaining text.
        $result = $html_dom->find('#phrsListTab .trans-container ul li');
        if ($result) {
            foreach ($result as $entry) {
                $parts = explode('.', $entry->getPlainText());
                $type = trim(array_shift($parts), " ");
                $wf->result($query, $query, $query . ' - ' . $type . '.', trim(implode(' ', $parts), " "), 'icon.png');
            }
        }
    }
}

echo $wf->toxml();
/**
 * Scrapes the province / city / district code list and hands every entry to
 * $this->setMapCityItem().
 *
 * Reference pages (only the stats.gov.cn one is downloaded by this method):
 * http://www.mca.gov.cn/article/sj/tjbz/a/2016/20161031/201610311102.html
 * http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/201608/t20160809_1386477.html
 *
 * Structure built while scraping:
 * [
 *     'province_id' => [
 *         'id' => '',
 *         'name' => '',
 *         'city_list' => [
 *             'city_id' => [
 *                 'id' => '',
 *                 'name' => ''
 *             ]
 *         ],
 *         'district_list' => [
 *             'city_id' => [
 *                 'district_id' => [
 *                     'id' => '',
 *                     'name' => ''
 *                 ]
 *             ]
 *         ]
 *     ]
 * ]
 *
 * php yii report/grab/mca
 */
public function actionMca()
{
    $page = HttpClient::get("http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/201608/t20160809_1386477.html");
    $dom = new \HtmlParser\ParserDom($page);

    $rows = $dom->find('p.MsoNormal');
    if (!$rows) {
        return $this->echoLog("error:no table tag ~~");
    }

    $tree = [];
    foreach ($rows as $row) {
        // A usable row has at least three spans: the code in the first, the name in the third.
        $cells = $row->find("span");
        if (!$cells || count($cells) < 3) {
            continue;
        }

        $code = $cells[0]->getPlainText();
        $name = trim(strip_tags(trim($cells[2]->getPlainText())), " ");

        // Code of the owning province: first two characters followed by "0000".
        $provinceId = substr($code, 0, 2) . "0000";

        // Province: characters 3-6 are "0000".
        if (substr($code, 2, 4) == "0000") {
            $tree[$code] = ['id' => $code, 'name' => $name, 'city_list' => [], 'district_list' => []];
            continue;
        }

        // Cities and districts both need their province to have been seen already.
        if (!isset($tree[$provinceId])) {
            $this->echoLog("error");
            exit;
        }

        // City: characters 5-6 are "00".
        if (substr($code, 4, 2) == "00") {
            $tree[$provinceId]['city_list'][$code] = ['id' => $code, 'name' => $name];
            continue;
        }

        // District: everything else.
        $cityId = substr($code, 0, 4) . "00";

        // Municipality directly under the central government: there is no city
        // row, so create one that carries the province's name.
        if (!isset($tree[$provinceId]['city_list'][$cityId])) {
            $tree[$provinceId]['city_list'][$cityId] = ['id' => $cityId, 'name' => $tree[$provinceId]['name']];
        }

        $tree[$provinceId]['district_list'][$cityId][$code] = ['id' => $code, 'name' => $name];
    }

    // Flatten the tree: one setMapCityItem() call per province, city and district.
    foreach ($tree as $province) {
        $provinceId = $province['id'];
        $provinceName = $province['name'];

        $this->setMapCityItem([
            'id' => $provinceId,
            'province_id' => $provinceId,
            'province_name' => $provinceName,
        ]);

        if (empty($province['city_list'])) {
            continue;
        }

        foreach ($province['city_list'] as $cityId => $city) {
            $this->setMapCityItem([
                'id' => $cityId,
                'province_id' => $provinceId,
                'province_name' => $provinceName,
                'city_id' => $cityId,
                'city_name' => $city['name'],
            ]);

            // An empty district_list can never contain the key, so one isset() covers both checks.
            if (!isset($province['district_list'][$cityId])) {
                continue;
            }

            foreach ($province['district_list'][$cityId] as $district) {
                $this->setMapCityItem([
                    'id' => $district['id'],
                    'province_id' => $provinceId,
                    'province_name' => $provinceName,
                    'city_id' => $cityId,
                    'city_name' => $city['name'],
                    'district_id' => $district['id'],
                    'district_name' => $district['name'],
                ]);
            }
        }
    }
}