Exemple #1
0
 /**
  * Runs one batch (at most 20 queue items) of a collect task.
  *
  * The mode is selected by $_GET['type']:
  *  - "list": pops list-page URLs from the "jvod.collect.list.urls" queue (seeding it
  *    from the task's collect_task_urls when empty), extracts content URLs from each
  *    list page and inserts the ones not yet present into {{collect_content}}.
  *  - anything else (including a missing parameter): pops [id, url] pairs from the
  *    "jvod.collect.content.urls" queue (seeding it from uncollected {{collect_content}}
  *    rows of this task when empty), downloads each page, extracts every configured
  *    field with its begin/end rule, inserts the result into the model's addon table
  *    and marks the content row as collected.
  *
  * @param mixed $id Identifier of the collect task to run.
  *
  * @return void
  */
 public function collectAction($id)
 {
     $task = CollectTaskModel::inst()->getTaskById($id);
     if (empty($task)) {
         $this->redirect[] = array('text' => '', 'href' => '/collect/task/index');
         // Message text: "The collect rule does not exist or has been deleted".
         $this->message('采集规则不存在或已被删除', self::MSG_ERROR, true);
         // message() may already end the request (not visible from here); either way,
         // never fall through and run the collection without a task.
         return;
     }

     // A missing "type" parameter selects the content branch without raising a notice.
     $type = isset($_GET['type']) ? $_GET['type'] : '';

     if ($type === 'list') {
         // NOTE(review): this branch uses $this->redis while the content branch uses
         // $this->queue — confirm whether both are meant to be the same client.
         $cacheKey = "jvod.collect.list.urls";
         if ($this->redis->llen($cacheKey) == 0) {
             // Queue is empty: seed it with the list-page URLs of this task.
             $listUrls = CollectTaskModel::getListUrls($task['collect_task_urls']);
             foreach ($listUrls as $listUrl) {
                 $this->redis->lpush($cacheKey, $listUrl);
             }
         }

         // Loop-invariant duplicate-check query, built once.
         $sql = "SELECT COUNT(collect_content_id) FROM {{collect_content}} WHERE collect_content_url=:collect_content_url";

         for ($i = 0; $i < 20 && $this->redis->llen($cacheKey) > 0; $i++) {
             $listUrl = $this->redis->rpop($cacheKey);
             if (empty($listUrl)) {
                 // Nothing usable was popped (e.g. the queue was drained in the meantime).
                 continue;
             }

             $contentUrls = CollectTaskModel::getContentUrls($listUrl, $task['collect_list_rules']['begin'], $task['collect_list_rules']['end']);
             $repeatNumber = 0;
             foreach ($contentUrls as $contentUrl) {
                 if ($this->db->queryScalar($sql, array(':collect_content_url' => $contentUrl)) == 0) {
                     $this->db->insert('{{collect_content}}', array('collect_content_id' => 0, 'collect_task_id' => $id, 'collect_content_url' => $contentUrl, 'is_published' => 0, 'is_collected' => 0, 'lasttime' => $_SERVER['REQUEST_TIME'], 'dateline' => $_SERVER['REQUEST_TIME']));
                 } else {
                     $repeatNumber++;
                 }
                 if ($repeatNumber > 20) {
                     // More than 20 already-known URLs on this list page: stop processing
                     // this page (only the inner loop is left; the next list URL still runs).
                     break;
                 }
             }
         }

         return;
     }

     $cacheKey = "jvod.collect.content.urls";
     if ($this->queue->llen($cacheKey) == 0) {
         // Queue is empty: seed it with every not-yet-collected content row of this task.
         $sql = "SELECT collect_content_id, collect_task_id, collect_content_url, is_collected, is_published, lasttime, dateline FROM {{collect_content}} WHERE collect_task_id=:collect_task_id AND is_collected=:is_collected";
         $rows = $this->db->queryAll($sql, array(':collect_task_id' => $id, ':is_collected' => 0));
         foreach ($rows as $row) {
             $this->queue->lpush($cacheKey, json_encode(array($row['collect_content_id'], $row['collect_content_url'])));
         }
     }

     for ($i = 0; $i < 20 && $this->queue->llen($cacheKey) > 0; $i++) {
         $item = json_decode($this->queue->rpop($cacheKey), true);
         if (!is_array($item) || count($item) < 2) {
             // Empty pop or malformed payload: skip instead of fetching a null URL.
             continue;
         }
         list($contentId, $contentUrl) = $item;

         $charset = 'utf-8';
         // NOTE(review): every other helper here lives on CollectTaskModel; confirm that
         // CollectTask is the intended class. The charset argument is presumably updated
         // by reference with the detected page encoding — verify against the helper.
         $body = CollectTask::get_url_contents($contentUrl, $charset);
         if (strtolower($charset) != 'utf-8') {
             $body = mb_convert_encoding($body, 'UTF-8', $charset);
         }

         $data = array('collect_content_id' => $contentId);
         foreach ($task['collect_content_rules'] as $rule) {
             $value = '';
             if (!empty($rule['begin']) && !empty($rule['end'])) {
                 // NOTE(review): begin/end are interpolated into the pattern unescaped, so
                 // they act as regex fragments and an unescaped "/" in a rule breaks the
                 // delimiter — confirm how rules are stored before changing this.
                 $matched = preg_match("/{$rule['begin']}(.+?){$rule['end']}/s", $body, $matches);
                 if ($matched === 1 && isset($matches[1])) {
                     // Keep the capture as-is, including values such as "0".
                     $value = $matches[1];
                 }
             }
             $data[$rule['collect_fields_identify']] = $value;
         }

         $flag = $this->db->insert("{{collect_model_addons{$task['collect_model_identify']}}}", $data);
         if ($flag) {
             // Only mark the row as collected once the extracted data has been stored.
             $this->db->update("{{collect_content}}", array('is_collected' => 1), 'collect_content_url=:collect_content_url', array(':collect_content_url' => $contentUrl));
         }
     }
 }