Esempio n. 1
0
 /**
  * Extract the main text and <head> metadata of a fetched page and hand the
  * resulting row to the spider model.
  *
  * The row contains: content, description, keywords, author, title, url,
  * content_md5, addtime and site_id. Metadata fields that cannot be found
  * in the page stay null.
  *
  * @param string $page        HTML source of the fetched page.
  * @param string $loction_url URL stored in the row's 'url' field.
  * @return array Array with the single key 'add_spider_url', holding the
  *               value returned by spider_model->add_spider_url().
  */
 private function analysis_html($page, $loction_url)
 {
     // Row for the spider_url table; every extracted field defaults to null.
     $spider_url = array(
         'content' => null,
         'description' => null,
         'keywords' => null,
         'author' => null,
         'title' => null,
     );

     // Extract the main body text and keep only its first 1000 characters.
     // NOTE(review): the meaning of the second constructor argument (6) is
     // defined by textExtract, which is not visible here — confirm.
     $iTextExtractor = new textExtract($page, 6);
     $spider_url['content'] = mb_substr($iTextExtractor->getPlainText(), 0, 1000);

     // Isolate the <head> section first; the metadata is only searched there.
     $regs = array();
     if (preg_match("@<head[^>]*>(.*?)<\\/head>@si", $page, $regs) === 1) {
         $headdata = $regs[1];
         $res = array();

         // description / keywords / author share one <meta name=... content=...>
         // pattern, so they are extracted with the same regex per name.
         foreach (array('description', 'keywords', 'author') as $meta_name) {
             $pattern = "/<meta +name *=[\"']?" . $meta_name . "[\"']? *content=[\"']?([^<>'\"]+)[\"']?/i";
             // Check the match count: preg_match always populates $res, so
             // isset($res) cannot tell a hit from a miss.
             if (preg_match($pattern, $headdata, $res) === 1) {
                 $spider_url[$meta_name] = $res[1];
             }
         }

         // Page title (ungreedy, case-insensitive).
         if (preg_match("/<title>(.*)<\\/title>/Ui", $headdata, $res) === 1) {
             $spider_url['title'] = trim($res[1]);
         }
     }

     $spider_url['url'] = $loction_url;
     // Hash of the raw page source, stored alongside the extracted fields.
     $spider_url['content_md5'] = md5($page);
     // 'i' is minutes; the previous 'm' wrote the month into the minutes slot.
     $spider_url['addtime'] = date("Y-m-d H:i:s");
     $spider_url['site_id'] = $this->site_id;

     // Insert or update the row in the spider_url table.
     $result = array();
     $result['add_spider_url'] = $this->CI->spider_model->add_spider_url($spider_url);
     return $result;
 }
Esempio n. 2
0
		margin-top: 60px;
		clear: both;
		font-size: 12px;
	}
	</style>
</head>

<body>
	<center>
		<div id="allcontent">
		<div id="title"><p>网页正文提取演示</p></div>
		<?php 
if (isset($_POST['url'])) {
    set_time_limit(60 * 10);
    require_once 'class.textExtract.php';
    $iTextExtractor = new textExtract($_POST['url']);
    $text = $iTextExtractor->getPlainText();
    if ($iTextExtractor->isGB) {
        $text = iconv('GBK', 'UTF-8//IGNORE', $text);
    }
    echo '<form method="post" action="demo.php">
						<span class="des">网址:</span><input type="text" name="url" size="60" />
						<input type="submit" name="submit" value="提取" />
					  </form>';
    echo '<p id="gap"><?p>';
    echo '<p id="lf" class="des">正文:</p>';
    echo '<p id="text">' . $text . '</p>';
} else {
    echo '<form method="post" action="demo.php">
						<span class="des">网址:</span><input type="text" name="url" size="60" />
						<input type="submit" name="submit" value="提取" />