/**
 * Extract the main text and the <head> metadata (description, keywords,
 * author, title) from a fetched page and hand the resulting row to the
 * spider model.
 *
 * Fields that cannot be found in the page are stored as NULL.
 *
 * @param string $page        Raw HTML of the fetched page.
 * @param string $loction_url URL of the page; stored under the 'url' key.
 *
 * @return array Single-element array whose 'add_spider_url' key holds the
 *               value returned by spider_model->add_spider_url().
 */
private function analysis_html($page, $loction_url)
{
    // Row for the spider_url table. Every extracted field starts as NULL
    // and is only overwritten when the page actually provides a value.
    $spider_url = array(
        'content'     => NULL,
        'description' => NULL,
        'keywords'    => NULL,
        'author'      => NULL,
        'title'       => NULL,
    );

    // Main-text extraction; only the first 1000 characters are kept.
    // NOTE(review): the meaning of the second constructor argument (6) is
    // defined by textExtract, which is not visible here.
    $iTextExtractor = new textExtract($page, 6);
    $spider_url['content'] = mb_substr($iTextExtractor->getPlainText(), 0, 1000);

    // Isolate the <head> section first so the meta/title patterns below do
    // not accidentally match markup in the body.
    $regs = array();
    if (preg_match('@<head[^>]*>(.*?)<\/head>@si', $page, $regs) === 1) {
        $headdata = $regs[1];
        $res = array();

        // <meta name="..." content="..."> tags share one pattern; only the
        // name differs. Check the preg_match() result instead of isset($res):
        // preg_match always assigns an array to $res, even when nothing
        // matched, so isset() could never detect a missing tag.
        foreach (array('description', 'keywords', 'author') as $meta_name) {
            $pattern = '/<meta +name *=["\']?' . $meta_name
                . '["\']? *content=["\']?([^<>\'"]+)["\']?/i';
            if (preg_match($pattern, $headdata, $res) === 1) {
                $spider_url[$meta_name] = $res[1];
            }
        }

        // Title: the 's' flag lets the title span several lines, and
        // attributes on the <title> tag are tolerated.
        if (preg_match('/<title(?:\s[^>]*)?>(.*)<\/title>/Uis', $headdata, $res) === 1) {
            $spider_url['title'] = trim($res[1]);
        }
    }

    $spider_url['url'] = $loction_url;
    // Fingerprint of the raw page content.
    $spider_url['content_md5'] = md5($page);
    // 'i' is minutes; the previous format used 'm', which is the month.
    $spider_url['addtime'] = date('Y-m-d H:i:s');
    $spider_url['site_id'] = $this->site_id;

    // Insert or update the row in the spider_url table.
    $result = array();
    $result['add_spider_url'] = $this->CI->spider_model->add_spider_url($spider_url);

    return $result;
}
clear: both; font-size: 12px; } </style> </head> <body> <center> <div id="allcontent"> <div id="title"><p>网页正文提取演示</p></div> <?php if (isset($_POST['url'])) { set_time_limit(60 * 10); require_once 'class.textExtract.php'; $iTextExtractor = new textExtract($_POST['url']); $text = $iTextExtractor->getPlainText(); if ($iTextExtractor->isGB) { $text = iconv('GBK', 'UTF-8//IGNORE', $text); } echo '<form method="post" action="demo.php"> <span class="des">网址:</span><input type="text" name="url" size="60" /> <input type="submit" name="submit" value="提取" /> </form>'; echo '<p id="gap"><?p>'; echo '<p id="lf" class="des">正文:</p>'; echo '<p id="text">' . $text . '</p>'; } else { echo '<form method="post" action="demo.php"> <span class="des">网址:</span><input type="text" name="url" size="60" /> <input type="submit" name="submit" value="提取" /> </form>';