Exemple #1
0
 /**
  * Collects one remote page (plus its pagination pages) and extracts the configured fields.
  *
  * Steps: fetch $url, rewrite its links against the page's base URL, locate and append the
  * remaining pages, remove the configured filter patterns, then apply every field rule.
  * Task-level settings take precedence over the template-level ones.
  *
  * If the first page cannot be read, a JavaScript alert is printed and the script exits.
  *
  * @param int    $id   collect_task_id (not read by this method)
  * @param string $url  address of the page to read
  * @param array  $task collect task record
  *
  * @return array list of array('identify' => rule key, 'name' => placeholder name,
  *               'value' => first match of the rule)
  */
 public function ContentReg($id, $url, $task)
 {
     $re = CollectTask::read_html($url);
     if (!$re['success']) {
         // json_encode() produces a properly quoted and escaped JS string literal, so quotes
         // or "</script>" inside the error text cannot break out of the alert() call.
         $message = json_encode(
             isset($re['error']) ? (string) $re['error'] : '',
             JSON_HEX_TAG | JSON_HEX_AMP | JSON_HEX_APOS | JSON_HEX_QUOT
         );
         if ($message === false) {
             // e.g. the message is not valid UTF-8
             $message = '""';
         }
         echo '<script>alert(' . $message . ');window.close();</script>';
         exit;
     }
     $content = $re['content'];

     // ---- Base URL of the page: scheme://host/ + directory part of the path ----
     $url_arr = parse_url($url);
     $pa = "";
     if (!empty($url_arr['path'])) {
         // 'strlen' drops only empty segments; a bare array_filter() would also drop "0".
         $segments = array_filter(explode("/", $url_arr['path']), 'strlen');
         if (substr($url_arr['path'], -1) !== "/") {
             // The last segment is a file name, not a directory.
             array_pop($segments);
         }
         if (count($segments)) {
             // Only append the separator when there is a directory, otherwise the base
             // would end in "//".
             $pa = implode("/", $segments) . "/";
         }
     }
     $site_root = $url_arr['scheme'] . "://" . $url_arr['host'] . "/";
     $content = CollectTask::re2ab($content, $site_root . $pa);

     // ---- Resolve pagination settings (task overrides template) ----
     $template = CollectTemplate::get_template_by_id($task['collect_template_id'], false);
     if ($task['collect_task_totalpagereg']) {
         $totalpagereg = $task['collect_task_totalpagereg'];
     } elseif ($template['collect_template_totalpagereg']) {
         $totalpagereg = $template['collect_template_totalpagereg'];
     } else {
         $totalpagereg = "";
     }
     $pagerule = $task['collect_task_pagerule'] ? $task['collect_task_pagerule'] : $template['collect_template_pagerule'];
     // The page rule is also used verbatim as replacement text below, so it is treated as a
     // literal URL fragment and quoted; "\d+" covers page numbers with more than one digit.
     $page_no_reg = "/" . preg_quote((string) $pagerule, "/") . "\\d+/";

     $totalpage = 0;
     if ($totalpagereg) {
         // The [placeholder] in the rule marks where the total page count appears.
         $totalpagereg = preg_replace("/\\[.*\\]/U", "(\\d+)", $totalpagereg);
         if (preg_match("/" . $totalpagereg . "/", $content, $p) && !empty($p[1])) {
             $totalpage = intval($p[1]);
         }
     }

     // ---- Build the list of additional page URLs ----
     $pageurl_arr = array();
     if ($totalpage) {
         // Without a page rule there is nothing meaningful to substitute in the URL.
         if ($pagerule) {
             for ($i = 2; $i <= $totalpage; $i++) {
                 $pageurl_arr[] = preg_replace($page_no_reg, $pagerule . $i, $url);
             }
         }
     } else {
         // No total page count available: look for pagination links in the document instead.
         $pagestart = $task['collect_task_pagestart'] ? $task['collect_task_pagestart'] : $template['collect_template_pagestart'];
         $pageend = $task['collect_task_pageend'] ? $task['collect_task_pageend'] : $template['collect_template_pageend'];
         if ($pagestart && $pageend) {
             $page_area_reg = str_replace("/", "\\/", $pagestart) . "([\\s\\S]*)" . str_replace("/", "\\/", $pageend);
             preg_match("/{$page_area_reg}/Ui", $content, $mm);
             if (!empty($mm[1])) {
                 // NOTE(review): the pagination area ($mm[1]) only gates this search; links are
                 // collected from the whole document. Possibly $mm[1] was intended - confirm.
                 preg_match_all('/\\<a.*href=["|\'](.*)?["|\'].*\\>(.*)\\<\\/a\\>/iU', $content, $arr);
                 if (!empty($arr[1]) && $pagerule) {
                     $links = array_unique($arr[1]);
                     sort($links);
                     // A link is another page of this article when it equals the current URL
                     // once the page-number part is removed from both.
                     $source_url = preg_replace($page_no_reg, "", $url);
                     foreach ($links as $v) {
                         if ($v != $url && preg_replace($page_no_reg, "", $v) == $source_url) {
                             $pageurl_arr[] = $v;
                         }
                     }
                 }
             }
         }
     }

     // ---- Append the content of every additional page ----
     foreach ($pageurl_arr as $v) {
         $c = CollectTask::read_html($v);
         if (!empty($c['content'])) {
             // NOTE(review): follow-up pages are rewritten against the site root only, unlike
             // the first page which also includes its directory - confirm this is intended.
             $content .= CollectTask::re2ab($c['content'], $site_root);
         }
     }

     // Some paginated sections are hidden by default, so hidden blocks are made visible.
     // NOTE(review): this replaces every occurrence of "none", not only CSS display values.
     $content = str_replace("none", "block", $content);

     // ---- Collect filter patterns from task and template ----
     $filter = array();
     foreach (array($task['collect_task_filter'], $template['collect_template_filter']) as $filter_json) {
         if (!$filter_json) {
             continue;
         }
         // Decode to arrays and verify the result: array_merge() cannot take objects or null.
         $decoded = json_decode($filter_json, true);
         if (is_array($decoded)) {
             $filter = array_merge($filter, $decoded);
         }
     }

     // Return value is not used; the call is kept in case it has side effects - TODO confirm.
     CollectTask::get_charset();
     $content = mb_convert_encoding($content, "UTF-8", "gb2312,gbk,utf-8");

     foreach ($filter as $f) {
         // An empty pattern would match at every position and flood the content with spaces.
         if (!is_string($f) || $f === '') {
             continue;
         }
         $f = preg_replace("/\\[.*\\]/U", "([\\s\\S]*)", $f);
         $f = str_replace("/", "\\/", $f);
         $content = preg_replace("/" . $f . "/Ui", " ", $content);
     }

     // ---- Load field rules (task rules override template rules with the same key) ----
     // SECURITY NOTE(review): the stored rule strings are executed with eval(). Anyone able to
     // edit a task or template record can run arbitrary PHP; consider storing the rules as
     // JSON instead. Left unchanged because the storage format is defined outside this method.
     $task_rule_arr = array();
     if ($task['collect_task_rulearr']) {
         eval('$task_rule_arr = ' . $task['collect_task_rulearr'] . ';');
         $task_rule_arr = is_array($task_rule_arr) ? array_filter($task_rule_arr) : array();
     }
     // Initialised up front so a template without field rules does not leave it undefined.
     $template_rule_arr = array();
     if ($template['collect_template_fieldsreg']) {
         eval('$template_rule_arr = ' . $template['collect_template_fieldsreg'] . ';');
         if (!is_array($template_rule_arr)) {
             $template_rule_arr = array();
         }
     }
     foreach ($task_rule_arr as $k => $v) {
         $template_rule_arr[$k] = $v;
     }
     $template_rule_arr = array_filter($template_rule_arr);

     // ---- Apply every field rule to the collected content ----
     $fields = array();
     foreach ($template_rule_arr as $k => $v) {
         // The [name] placeholder of the rule becomes the capture group.
         $reg = preg_replace("/\\[.*\\]/U", "([\\s\\S]*)", $v);
         $reg = str_replace("/", "\\/", $reg);
         preg_match_all("/" . $reg . "/U", $content, $ma);
         if (empty($ma[1])) {
             // No match, or the rule has no placeholder and therefore no capture group.
             continue;
         }
         if ($task['collect_task_saveimg']) {
             foreach ($ma[1] as $k2 => $v2) {
                 if (CollectTask::is_image($v2)) {
                     // The value itself is an image address.
                     $ma[1][$k2] = CollectTask::save_image($v2);
                     continue;
                 }
                 // Otherwise store every image referenced inside the value.
                 preg_match_all('/<img.*src=["|\']+(.*)?["|\']+>/Ui', $v2, $arr);
                 if (empty($arr[1])) {
                     continue;
                 }
                 // Replacements accumulate in $replaced so that all images are rewritten,
                 // not just the last one.
                 $replaced = $v2;
                 foreach (array_unique($arr[1]) as $img) {
                     if ($img === '') {
                         continue;
                     }
                     $replaced = str_replace($img, CollectTask::save_image($img), $replaced);
                 }
                 $ma[1][$k2] = $replaced;
             }
         }
         preg_match("/\\[(.*)\\]/U", $v, $match);
         $fields[] = array(
             'identify' => $k,
             'name' => isset($match[1]) ? $match[1] : '',
             'value' => $ma[1][0],
         );
     }
     return $fields;
 }
Exemple #2
0
    // Print the value(s) of the current field: image addresses as <img> tags, anything
    // else as raw text followed by a line break.
    // A scalar value is wrapped in a one-element list so that both shapes share one code
    // path and the scalar case prints $f['value'] itself rather than a leftover $v.
    // NOTE(review): values are written to the page unescaped; they appear to be collected
    // HTML - confirm that rendering them raw is intended.
    $values = is_array($f['value']) ? $f['value'] : array($f['value']);
    foreach ($values as $v) {
        if (CollectTask::is_image($v)) {
            echo "\t\t\t<img src='" . $v . "' /><br />\n\t\t\t";
        } else {
            echo $v . '<br />';
        }
    }
    ?>
                        </td>
		</tr>
                <?php 
}