/**
 * Stores the cleaned inner HTML of the `div#js_content` element in `$this->content`.
 *
 * The element's inner HTML is trimmed, then the following are removed:
 *  - `<noscript>...</noscript>` blocks,
 *  - `data-original="..."` attributes,
 *  - `data-actualsrc="..."` attributes.
 *
 * If no `div#js_content` element is found, `$this->content` is set to an empty string.
 *
 * @return void
 */
private function parseHTML_content()
 {
     $content = $this->htmlDom->find('div#js_content', 0);
     if (!is_object($content)) {
         // Nothing matched: do not call innerHtml() on a non-object.
         $this->content = '';
         return;
     }

     $html = trim($content->innerHtml());

     // Patterns are applied in order. The "s" modifier lets "." match newlines,
     // so blocks/attribute values that span several lines are removed as well.
     $cleaned = preg_replace(
         array(
             '/<noscript>.*?<\/noscript>/s',
             '/data-original=".*?"/s',
             '/data-actualsrc=".*?"/s',
         ),
         '',
         $html
     );

     // preg_replace() returns null on a PCRE error; keep the unfiltered HTML in that case.
     $this->content = ($cleaned === null) ? $html : $cleaned;
 }
示例#2
0
 /**
  * Extract the `src` attribute value of every `<script>` element in the content.
  *
  * One entry is produced per script node returned by the parser, in the same
  * order; each entry is whatever getAttr('src') yields for that node.
  *
  * @param mixed $content Markup handed to the DOM parser's load() (presumably an HTML string).
  * @return array The collected `src` values; empty when the parser returns no node list.
  */
 function extractScriptUrls($content)
 {
     $this->domParser->load($content);
     $scriptNodes = $this->domParser->find('script');

     if (!is_array($scriptNodes)) {
         // find() did not hand back a list of nodes; array_map() would reject it.
         return array();
     }

     return array_map(function ($scriptNode) {
         return $scriptNode->getAttr('src');
     }, $scriptNodes);
 }
示例#3
0
 /**
  * Parse the response body into a map of catalog title => chapter links.
  *
  * Every `div.bookcont` element is treated as one catalog. Its title is the
  * plain text of the first `div.bookMl strong` node inside it, and its chapters
  * are the `div span a` nodes, stored as link text => `href` attribute value.
  * Catalogs without a title node are skipped.
  *
  * @param mixed $closure Not used by this method.
  *                       NOTE(review): the name suggests a callback that should
  *                       receive the parsed data, but its expected signature is
  *                       not visible here, so it is deliberately not invoked.
  * @return array Map of catalog title => (chapter text => href).
  */
 public function reduce($closure)
 {
     $query = new ParserDom($this->response->getBody());
     $catalogs = array();

     $catalogNodes = $query->find('div.bookcont');
     if (!is_array($catalogNodes) && !($catalogNodes instanceof \Traversable)) {
         // No iterable node list came back; there is nothing to parse.
         return $catalogs;
     }

     foreach ($catalogNodes as $catalog) {
         $titleNode = $catalog->find('div.bookMl strong', 0);
         if (!is_object($titleNode)) {
             // Without a title there is no key to store this catalog under.
             continue;
         }

         $chapter = array();
         $linkNodes = $catalog->find('div span a');
         if (is_array($linkNodes) || $linkNodes instanceof \Traversable) {
             foreach ($linkNodes as $node) {
                 $chapter[$node->getPlainText()] = $node->getAttr('href');
             }
         }

         $catalogs[$titleNode->getPlainText()] = $chapter;
     }

     // Previously the parsed map was built and then discarded.
     return $catalogs;
 }
示例#4
0
// Sample document queried by the selector demonstrations below.
$html = <<<'HTML'
<html>
  <head>
    <meta charset="utf-8">
    <title>test</title>
  </head>
  <body>
    <p class="test_class test_class1">p1</p>
    <p class="test_class test_class2">p2</p>
    <p class="test_class test_class3">p3</p>
    <p id="p_id" class="test_class test_id">p_id</p>
    <p id="p_id_2" class="test_class test_id">p_id_2</p>
    <p>p4</p>
    <div id="test1">测试1</div>
  </body>
</html>
HTML;

$dom = new ParserDom($html);

// Print the raw document first, followed by three line breaks.
echo "{$html}\n\n\n";

// Tag selector with index -1.
$lastParagraph = $dom->findBreadthFirst('p', -1);
echo sprintf("p last one: %s\n", $lastParagraph->getPlainText());

// Attribute-presence selector, index 0.
$firstWithId = $dom->findBreadthFirst('p[id]', 0);
echo sprintf("have id first: %s\n", $firstWithId->getPlainText());

// Attribute-value selector, index 0.
$byIdValue = $dom->findBreadthFirst('p[id=p_id_2]', 0);
echo sprintf("hava id p_id_2 first: %s\n", $byIdValue->getPlainText());

// Negated attribute selector, index 1.
$secondWithoutId = $dom->findBreadthFirst('p[!id]', 1);
echo sprintf("do not have id second: %s\n", $secondWithoutId->getPlainText());

// Id selector, index 0.
$byId = $dom->findBreadthFirst('#test1', 0);
echo sprintf("get by id test1: %s\n", $byId->getPlainText());

// Class selector; the node is kept so its class attribute can be printed too.
$p1 = $dom->findBreadthFirst('p.test_class1', 0);
$p1Text = $p1->getPlainText();
echo "get by class test_class1: {$p1Text}\n";
$p1Class = $p1->getAttr('class');
echo "class: {$p1Class}\n";

// Without an index argument the result is iterated as a list of nodes.
$p_array = $dom->findBreadthFirst('p.test_class');
echo "\np list: \n";
foreach ($p_array as $p) {
    $line = $p->getPlainText();
    echo "{$line}\n";
}
echo "\n";
示例#5
0
文件: HtmlDom.php 项目: LL233/crawler
 /**
  * Constructor.
  *
  * Does no work of its own; it only invokes the parent class constructor.
  * NOTE(review): the parent class is not visible here, so what that call
  * initialises cannot be stated from this snippet.
  */
 public function __construct()
 {
     parent::__construct();
 }