Esempio n. 1
0
 /**
  * Parse a queued web page and enqueue every image found on it.
  *
  * Only queue items of type 2 (a web page that must be parsed to discover its
  * images) are accepted. The page is always fetched, but images are only
  * extracted for hosts that have a dedicated case in the switch below.
  *
  * @param array $data Queue item; must contain the keys 'url' and 'type'.
  * @return mixed Whatever self::_err() returns on failure, true otherwise.
  */
 public static function parseImages($data)
 {
     if (!$data || !isset($data['url']) || !isset($data['type'])) {
         return self::_err('param not enough ~~');
     }
     if ($data['type'] != 2) {
         return self::_err('just handle type => 2( website need to parse to get images) ~~');
     }
     $url = $data['url'];
     $host = parse_url($url, PHP_URL_HOST);
     if (!$host) {
         return self::_err('no host');
     }
     $content = HttpClient::get($url);
     switch ($host) {
         case "www.18weixin.com":
             // NOTE(review): mb_convert_encoding() takes (string, to, from), so this
             // converts UTF-8 -> GB2312. Confirm that direction is intended.
             $content = mb_convert_encoding($content, 'gb2312', 'utf-8');
             $html_dom = new \HtmlParser\ParserDom($content);
             $tmp_images_wrap = $html_dom->find('div.picadd', 0);
             if (!$tmp_images_wrap) {
                 // Without this guard a page lacking the wrapper would be a fatal
                 // "call to a member function on a non-object" error.
                 return self::_err('no div.picadd found ~~');
             }
             $tmp_image_list = $tmp_images_wrap->find("img");
             if (!$tmp_image_list) {
                 // Nothing to enqueue; not treated as an error.
                 break;
             }
             foreach ($tmp_image_list as $tmp_image_target) {
                 $tmp_target_url = $tmp_image_target->getAttr("src");
                 if (!$tmp_target_url) {
                     continue;
                 }
                 if (substr($tmp_target_url, 0, 4) !== "http") {
                     if (substr($tmp_target_url, 0, 2) === "//") {
                         // Protocol-relative URL: it already carries its own host.
                         $tmp_target_url = "http:" . $tmp_target_url;
                     } else {
                         // Host-relative path; make sure exactly one slash separates
                         // the host from the path. Paths without a leading slash are
                         // resolved against the site root (best effort).
                         $tmp_target_url = "http://" . $host . "/" . ltrim($tmp_target_url, "/");
                     }
                 }
                 self::addQueue(['url' => $tmp_target_url, 'type' => 1]);
             }
             break;
     }
     return true;
 }
Esempio n. 2
0
/**
 * Build a parser for the given markup and run a fixed series of find() calls
 * against it. The results of the lookups are discarded.
 *
 * @param string $sHtml HTML markup handed to the parser.
 */
function test($sHtml)
{
    $dom = new \HtmlParser\ParserDom($sHtml);

    // Each entry is the argument list of one find() call, issued in this order.
    $lookups = array(
        array('ul.uni-blk-list02', 0),
        array('a'),
        array('ul'),
        array('p'),
    );

    foreach ($lookups as $arguments) {
        call_user_func_array(array($dom, 'find'), $arguments);
    }
}
Esempio n. 3
0
 /**
  * Scrape one listing page of www.18weixin.com and enqueue every detail page
  * linked from it as a type-2 queue item.
  *
  * Running this once a day is enough.
  * php yii emoticon/scrapy/weixin18
  *
  * @param int|string $page Listing page number substituted into the URL.
  * @return mixed Whatever $this->echoLog() returns.
  */
 public function actionWeixin18($page = 1)
 {
     $host = 'http://www.18weixin.com';
     $url = $host . '/weixinbiaoqing_%s.shtml';
     $url = sprintf($url, $page);
     $content = HttpClient::get($url);
     // NOTE(review): mb_convert_encoding() takes (string, to, from), so this
     // converts UTF-8 -> GB2312. Confirm that direction is intended.
     $content = mb_convert_encoding($content, 'gb2312', 'utf-8');
     $html_dom = new \HtmlParser\ParserDom($content);
     $img_wrap_array = $html_dom->find('div.imgborder_bqimg_width');
     if (!$img_wrap_array) {
         return $this->echoLog("error:no img tag ~~");
     }
     foreach ($img_wrap_array as $_item) {
         $tmp_link = $_item->find("a", 0);
         if (!$tmp_link) {
             // A wrapper without an anchor would otherwise be a fatal error on
             // the getAttr() call below.
             continue;
         }
         $tmp_href = $tmp_link->getAttr("href");
         if (!$tmp_href) {
             // Skip anchors without a target instead of enqueueing the bare host.
             continue;
         }
         $tmp_target_url = $host . $tmp_href;
         EmoticonService::addQueue(['url' => $tmp_target_url, 'type' => 2]);
     }
     return $this->echoLog("it's over ~~");
 }
Esempio n. 4
0
 /**
  * Exercise ParserDom::find() with tag, attribute, negated-attribute, id and
  * class selectors against the fixture returned by self::getHtml().
  */
 public function testDom()
 {
     $markup = self::getHtml();
     $dom = new \HtmlParser\ParserDom($markup);

     // Expected plain text, selector and match index for each single lookup.
     $singleLookups = array(
         array('p4', 'p', -1),
         array('p_id', 'p[id]', 0),
         array('p_id_2', 'p[id=p_id_2]', 0),
         array('p2', 'p[!id]', 1),
         array('测试1', '#test1', 0),
     );
     foreach ($singleLookups as $lookup) {
         list($expectedText, $selector, $index) = $lookup;
         $this->assertEquals($expectedText, $dom->find($selector, $index)->getPlainText());
     }

     // A node carrying two classes is found by either one and reports both.
     $firstClassNode = $dom->find('p.test_class1', 0);
     $this->assertEquals('p1', $firstClassNode->getPlainText());
     $this->assertEquals('test_class test_class1', $firstClassNode->getAttr('class'));

     // Without an index, find() yields every match.
     $expectedTexts = array('p1', 'p2', 'p3', 'p_id', 'p_id_2');
     $classNodes = $dom->find('p.test_class');
     $this->assertEquals(5, count($classNodes));

     $actualTexts = array();
     foreach ($classNodes as $classNode) {
         $actualTexts[] = $classNode->getPlainText();
     }
     $this->assertEquals($expectedTexts, $actualTexts);

     $wrapsDomNode = $dom->node instanceof \DOMNode;
     $this->assertEquals($wrapsDomNode, true);
 }
Esempio n. 5
0
// Demo 1: parse an inline HTML fixture and print text, attributes and markup
// of a few selected nodes.
$html = '<html>
  <head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    <title>test</title>
  </head>
  <body>
    <p class="test_class test_class1">p1</p>
    <p class="test_class test_class2">p2</p>
    <p class="test_class test_class3">p3</p>
    <div id="test1"><span style="display: none">测试1<br/></span><input date=\'"sdfsf"\' name="test" value="123"/>123123</div>
  </body>
</html>';
$html_dom = new \HtmlParser\ParserDom($html);
$p_array = $html_dom->find('p.test_class');
$p1 = $html_dom->find('p.test_class1', 0);
$div = $html_dom->find('div#test1', 0);
foreach ($p_array as $p) {
    echo $p->getPlainText() . "\n";
}
echo $div->getPlainText() . "\n";
echo $p1->getPlainText() . "\n";
echo $p1->getAttr('class') . "\n";
echo "show html:\n";
echo $div->innerHtml() . "\n";
echo $div->outerHtml() . "\n\n";

// Demo 2: fetch a live page and print one list from it. Both the download and
// the lookup can fail (network error, changed page layout), so each step is
// checked before its result is dereferenced.
$url = 'http://www.sina.com.cn/';
$sHtml = file_get_contents($url);
if ($sHtml === false) {
    echo "failed to fetch " . $url . "\n";
} else {
    $oDom = new \HtmlParser\ParserDom($sHtml);
    $oFound = $oDom->find('ul.uni-blk-list02', 0);
    if (!$oFound) {
        echo "no ul.uni-blk-list02 found in " . $url . "\n";
    } else {
        echo "inner:\n\n" . $oFound->innerHtml() . "\n\n";
        echo "outer:\n\n" . $oFound->outerHtml() . "\n";
    }
}
Esempio n. 6
0
<?php

require_once 'workflows.php';
require_once 'ParserDom.php';
// Look the query up on dict.youdao.com and emit the translations through the
// Workflows helper as XML.
$wf = new Workflows();
// NOTE(review): "{query}" is presumably a placeholder substituted by the
// workflow runner before this script executes; it is kept verbatim.
$query = "{query}";
$query = str_replace('\\ ', ' ', trim($query));
$request = file_get_contents('http://dict.youdao.com/search?q=' . urlencode($query) . '&keyfrom=dict.top');
// On a failed download no results are added and the (empty) XML is still emitted.
if ($request !== false) {
    $html_dom = new \HtmlParser\ParserDom($request);
    if (strpos($query, ' ') !== false) {
        // Queries containing a space are looked up in the translation panel.
        $result = $html_dom->find('#fanyiToggle .trans-container p');
        // Guard against a falsy "no match" result before counting it.
        if ($result && count($result) >= 3) {
            $result = $result[1];
            $wf->result($query, $query, $query, trim($result->getPlainText()), 'icon.png');
        }
    } else {
        // Queries without a space are looked up in the phrase list.
        $result = $html_dom->find('#phrsListTab .trans-container ul li');
        if ($result) {
            foreach ($result as $value) {
                // The text before the first '.' is used as the entry type; the
                // remaining pieces are re-joined with spaces.
                $value = explode('.', $value->getPlainText());
                $type = trim(array_shift($value), " ");
                $wf->result($query, $query, $query . ' - ' . $type . '.', trim(implode(' ', $value), " "), 'icon.png');
            }
        }
    }
}
echo $wf->toxml();
Esempio n. 7
0
 /**
  * http://www.mca.gov.cn/article/sj/tjbz/a/2016/20161031/201610311102.html
  * http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/201608/t20160809_1386477.html
  *
  * Scrape the province / city / district code table and hand every entry to
  * $this->setMapCityItem(). After parsing, the intermediate structure is:
  * [
  *     'province_id' => [
  *         'id' => '',
  *         'name' => '',
  *         'city_list' => [
  *             'city_id' => [
  *                 'id' => '',
  *                 'name' => ''
  *             ]
  *         ],
  *         'district_list' => [
  *             'city_id' => [
  *                 'district_id' => [
  *                     'id' => '',
  *                     'name' => ''
  *                 ]
  *             ]
  *         ]
  *     ]
  * ]
  *
  * Codes are classified by their suffix: a code ending in "0000" is a province,
  * any other code ending in "00" is a city, everything else is a district.
  *
  * php yii report/grab/mca
  * (NOTE(review): the original comment said "mac"; the route is presumably
  * derived from the action name - confirm.)
  */
 public function actionMca()
 {
     $url = "http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/201608/t20160809_1386477.html";
     $content = HttpClient::get($url);
     $html_dom = new \HtmlParser\ParserDom($content);
     $target = $html_dom->find('p.MsoNormal');
     if (!$target) {
         return $this->echoLog("error:no table tag ~~");
     }
     $ret = [];
     foreach ($target as $_item_target) {
         $tmp_td_array = $_item_target->find("span");
         if (!$tmp_td_array || count($tmp_td_array) < 3) {
             continue;
         }
         // The id is used as an array key and sliced by position below, so strip
         // surrounding whitespace first.
         $tmp_id = trim($tmp_td_array[0]->getPlainText());
         $tmp_title = trim($tmp_td_array[2]->getPlainText());
         $tmp_title = strip_tags($tmp_title);
         // NOTE(review): the character list below is kept byte-for-byte; it may
         // originally have contained non-ASCII spaces - confirm.
         $tmp_title = trim($tmp_title, "    ");
         // Strict comparisons: with "==" two numeric strings are compared as
         // numbers, so a truncated suffix such as "0" would equal "0000".
         if (substr($tmp_id, 2, 4) === "0000") {
             // Province
             $ret[$tmp_id] = ['id' => $tmp_id, 'name' => $tmp_title, 'city_list' => [], 'district_list' => []];
             continue;
         }
         // Cities and districts both hang off a province that must already exist.
         $tmp_province_id = substr($tmp_id, 0, 2) . "0000";
         if (!isset($ret[$tmp_province_id])) {
             $this->echoLog("error");
             exit;
         }
         if (substr($tmp_id, 4, 2) === "00") {
             // City
             $ret[$tmp_province_id]['city_list'][$tmp_id] = ['id' => $tmp_id, 'name' => $tmp_title];
             continue;
         }
         // District
         $tmp_city_id = substr($tmp_id, 0, 4) . "00";
         // This is the case of a municipality directly under the central
         // government: there is no city row, so synthesize one named after the
         // province.
         if (!isset($ret[$tmp_province_id]['city_list'][$tmp_city_id])) {
             $ret[$tmp_province_id]['city_list'][$tmp_city_id] = ['id' => $tmp_city_id, 'name' => $ret[$tmp_province_id]['name']];
         }
         $ret[$tmp_province_id]['district_list'][$tmp_city_id][$tmp_id] = ['id' => $tmp_id, 'name' => $tmp_title];
     }
     // Flatten the tree: one setMapCityItem() call per province, city and district.
     foreach ($ret as $_item) {
         $tmp_province_id = $_item['id'];
         $tmp_province_name = $_item['name'];
         $tmp_params = ['id' => $tmp_province_id, 'province_id' => $tmp_province_id, 'province_name' => $tmp_province_name];
         $this->setMapCityItem($tmp_params);
         if (!$_item['city_list']) {
             continue;
         }
         foreach ($_item['city_list'] as $_city_id => $_city_info) {
             $tmp_params = ['id' => $_city_id, 'province_id' => $tmp_province_id, 'province_name' => $tmp_province_name, 'city_id' => $_city_id, 'city_name' => $_city_info['name']];
             $this->setMapCityItem($tmp_params);
             if (!$_item['district_list'] || !isset($_item['district_list'][$_city_id])) {
                 continue;
             }
             $tmp_district_list = $_item['district_list'][$_city_id];
             foreach ($tmp_district_list as $_district_info) {
                 $tmp_params = ['id' => $_district_info['id'], 'province_id' => $tmp_province_id, 'province_name' => $tmp_province_name, 'city_id' => $_city_id, 'city_name' => $_city_info['name'], 'district_id' => $_district_info['id'], 'district_name' => $_district_info['name']];
                 $this->setMapCityItem($tmp_params);
             }
         }
     }
 }