// Tail of the UPDATE statement for a collection ("source") rule.
// The beginning of $SqlStr is built above this point; this first append is
// presumably the value of a column assignment opened there -- TODO confirm.
//
// NOTE(review): every value below is spliced into the SQL text between single
// quotes with no escaping visible here. If these variables come from request
// data and query() does not escape them, this is an SQL injection vector;
// confirm against the code that populates them and against query().
$SqlStr .= '\'' . $cateid . '\',';

// Start marker
$SqlStr .= '`flagstart`=\'' . $flagstart . '\',';
// End marker
$SqlStr .= '`flagend`=\'' . $flagend . '\',';
// Content start marker
$SqlStr .= '`flagcontentstart`=\'' . $flagcontentstart . '\',';
// Content end marker
$SqlStr .= '`flagcontentend`=\'' . $flagcontentend . '\',';
// Advertisement start markers (array, passed through EnCodeStr())
$SqlStr .= '`flagadstart`=\'' . EnCodeStr($flagadstartarray) . '\',';
// Advertisement end markers (array, passed through EnCodeStr())
$SqlStr .= '`flagadend`=\'' . EnCodeStr($flagadendarray) . '\',';
// Single-item replacements (array, passed through EnCodeStr())
$SqlStr .= '`flagsingle`=\'' . EnCodeStr($flagsinglearray) . '\',';
// Update date (today, Y-m-d)
$SqlStr .= '`posttime`=\'' . date('Y-m-d') . '\'';

// The id is placed unquoted in the WHERE clause, so force it to an integer
// to keep arbitrary text out of the statement.
$SqlStr .= ' WHERE `id`=' . (int) $id;
query($SqlStr);

// NOTE(review): $title is emitted into HTML unescaped -- confirm it is
// sanitised before it reaches this point.
$refresh_msg = '采集规则[<font color="#FF0000">' . $title . '</font>],修改添加成功,返回修改页面。';
$refresh_url = 'source_edit.php?id=' . (int) $id;

require $page_name . '.php';
require '../include/debug.inc.php';
/**
 * Fetch the images referenced by a piece of HTML, store them locally and
 * rewrite the references to point at the local copies.
 *
 * Each value found in a src="..." / src='...' attribute is handed to
 * saveFile(). When saveFile() returns a non-empty name, every occurrence of
 * the original URL in the content is replaced by
 * IMAGEURL . <yymm/dd> . <returned name>. URLs whose save returned an empty
 * result are left untouched.
 *
 * Finally two preg_replace() calls remove whatever matches the patterns for
 * "<a ...>" tags and "<!-- ... -->" comments; both patterns are passed
 * through EnCodeStr() before use.
 *
 * @param string $Content Content to convert (stripslashes() is applied first).
 * @return string The content after image replacement and stripping.
 */
function getContent($Content)
{
    $Content = stripslashes($Content);

    // Collect every src attribute value; group 2 holds the URL itself.
    preg_match_all("/src=(\"|')(.*?)(\"|')/i", $Content, $temp);
    $imageList = $temp[2];

    // Dated sub-folder "yymm/dd". A single date() call keeps both parts
    // consistent even if the clock rolls over midnight in between.
    $ImagePath = date('ym/d');
    createFolder(IMAGEPATH, $ImagePath);

    // Public URL prefix that corresponds to the folder above.
    $ImageUrl = IMAGEURL . $ImagePath;

    foreach ($imageList as $remoteUrl) {
        $fName = saveFile($remoteUrl, $ImagePath, $ImageUrl);

        // Only rewrite references for images that were actually saved.
        // Previously a failed save read an unset array slot and replaced the
        // original URL with the bare directory URL.
        if (!empty($fName)) {
            $Content = str_replace($remoteUrl, $ImageUrl . $fName, $Content);
        }
    }

    // Strip hyperlinks.
    // NOTE(review): the pattern has no word boundary after "a" and does not
    // target "</a>"; the effective pattern depends on EnCodeStr() -- confirm.
    $Content = preg_replace(EnCodeStr("@\\<a(.*?)\\>@is"), "", $Content);

    // Strip <!-- --> comments.
    $Content = preg_replace(EnCodeStr("@\\<!--(.*?)\\--\\>@is"), "", $Content);

    return $Content;
}
/**
 * Fetch a remote page, cut out the article body, save its images locally and
 * store the result in the article table.
 *
 * Steps, in order: download $Url, pass it through EnCodeStr(), optionally
 * convert UTF-8 to GB2312, cut the content between the start/end flags with
 * CutStr(), apply CutStr(..., 1) for each advertisement marker pair, remove
 * each "single" string, echo the content, save every image found in a src
 * attribute via saveFile() and rewrite its URL, strip matches of the
 * "<a ...>" and "<!-- ... -->" patterns, then UPDATE the article row
 * (flag = 1, content = processed text).
 *
 * @param string $Url              Address to fetch.
 * @param int    $id               Id of the article row to update.
 * @param string $ContentStartFlag Marker for the start of the article content.
 * @param string $ContentEndFlag   Marker for the end of the article content.
 * @param string $FlagAdStart      Comma-separated advertisement start markers.
 * @param string $FlagAdEnd        Comma-separated advertisement end markers,
 *                                 paired with $FlagAdStart by position.
 * @param string $FlagSingle       Comma-separated strings to remove.
 * @param string $ImagePath        Base directory for saved images; "/mm/dd/"
 *                                 is appended.
 * @param string $ImageUrl         Base URL for saved images; "mm/dd/" is
 *                                 appended.
 * @param mixed  $utf8             Truthy when the remote page is UTF-8 and
 *                                 must be converted to GB2312.
 * @return bool True once the UPDATE has been issued; false when the page
 *              could not be fetched (the row is then left untouched).
 */
function getContent($Url, $id, $ContentStartFlag, $ContentEndFlag, $FlagAdStart, $FlagAdEnd, $FlagSingle, $ImagePath, $ImageUrl, $utf8)
{
    // Fetch the source data. Stop on failure so an unreachable page is not
    // stored as an empty, already-processed article.
    $Content = file_get_contents($Url);
    if ($Content === false) {
        return false;
    }

    $Content = EnCodeStr($Content);

    // Added by Zerolone 07-04-28: convert when the source is UTF-8 encoded.
    if ($utf8) {
        $Content = mb_convert_encoding($Content, "GB2312", "UTF-8");
    }

    // Cut out the article body.
    $Content = CutStr($Content, $ContentStartFlag, $ContentEndFlag);

    // Cut out advertisements, one start/end pair at a time.
    $FlagAdStartArray = explode(",", $FlagAdStart);
    $FlagAdEndArray = explode(",", $FlagAdEnd);
    foreach ($FlagAdStartArray as $i => $adStart) {
        // The two lists may differ in length; use an empty end marker rather
        // than reading past the end of the shorter list.
        $adEnd = isset($FlagAdEndArray[$i]) ? $FlagAdEndArray[$i] : '';
        $Content = CutStr($Content, $adStart, $adEnd, 1);
    }

    // Single-item removal; str_replace() applies the search strings in order.
    $Content = str_replace(explode(",", $FlagSingle), '', $Content);

    // NOTE(review): this prints the whole article to the output on every
    // call and looks like leftover debugging -- confirm no caller relies on
    // it before removing.
    echo $Content;

    // Collect image URLs from the decoded content; group 2 holds the URL.
    preg_match_all("/src=(\"|')(.*?)(\"|')/i", DeCodeStr($Content), $temp);
    $imageList = $temp[2];

    // Dated sub-folder "<base>/mm/dd/", created in one recursive call.
    list($month, $day) = explode('/', date('m/d'));
    $ImagePath .= '/' . $month . '/' . $day . '/';
    if (!is_dir($ImagePath)) {
        mkdir($ImagePath, 0777, true);
    }

    // Matching public URL prefix.
    $ImageUrl .= $month . '/' . $day . '/';

    foreach ($imageList as $remoteUrl) {
        $fName = saveFile($remoteUrl, $ImagePath, $ImageUrl);

        // Only rewrite references for images that were actually saved.
        // Previously a failed save read an unset array slot and replaced the
        // original URL with the bare directory URL.
        if (!empty($fName)) {
            $Content = str_replace($remoteUrl, $ImageUrl . $fName, $Content);
        }
    }

    // Strip hyperlinks.
    $Content = preg_replace(EnCodeStr("@\\<a(.*?)\\>@is"), "", $Content);

    // Strip <!-- --> comments.
    $Content = preg_replace(EnCodeStr("@\\<!--(.*?)\\--\\>@is"), "", $Content);

    // Store the page content and mark the article as processed.
    // NOTE(review): $Content originates from a remote page and is placed in
    // the SQL text without escaping visible here. Whether EnCodeStr() or
    // query() makes it safe cannot be seen from this function -- confirm, or
    // escape with the database layer's own routine.
    $SqlStr = 'UPDATE `' . table_pre . 'article` SET ';
    $SqlStr .= '`flag`=1,';
    $SqlStr .= '`content`=\'' . $Content . '\'';
    // The id is placed unquoted, so force it to an integer.
    $SqlStr .= ' WHERE `id`=' . (int) $id;
    query($SqlStr);

    return true;
}