/**
 * Runs one batch of a collect task.
 *
 * Two modes are selected by the `type` query parameter:
 *  - `type=list`: pops up to 20 list-page URLs from a queue, extracts the
 *    content URLs found on each and stores the ones not yet known in
 *    {{collect_content}}.
 *  - anything else: pops up to 20 pending content URLs from a queue, fetches
 *    each page, extracts the configured fields and stores them in the
 *    task's addon table, then flags the URL as collected.
 *
 * @param mixed $id Identifier of the collect task, as accepted by
 *                  CollectTaskModel::getTaskById().
 * @return void
 */
public function collectAction($id)
{
    $task = CollectTaskModel::inst()->getTaskById($id);
    if (empty($task)) {
        $this->redirect[] = array('text' => '', 'href' => '/collect/task/index');
        $this->message('采集规则不存在或已被删除', self::MSG_ERROR, true);
        // Never continue with a missing task, even if message() hands control back.
        return;
    }

    // isset() avoids an undefined-index notice when `type` is not supplied.
    if (isset($_GET['type']) && $_GET['type'] === 'list') {
        $this->collectListBatch($id, $task);
    } else {
        $this->collectContentBatch($id, $task);
    }
}

/**
 * Processes up to 20 queued list-page URLs and records newly discovered content URLs.
 *
 * @param mixed $id   Collect task identifier stored with every new content row.
 * @param mixed $task Task record; reads `collect_task_urls` and `collect_list_rules`.
 * @return void
 */
private function collectListBatch($id, $task)
{
    $batchSize = 20;
    $maxRepeats = 20;
    // NOTE(review): this key is not scoped by task id, so URLs queued by one
    // task could be consumed while running another — confirm this is intended.
    $cacheKey = "jvod.collect.list.urls";

    // Refill the queue from the task definition once it has been drained.
    if ($this->redis->llen($cacheKey) == 0) {
        $listUrls = CollectTaskModel::getListUrls($task['collect_task_urls']);
        if (!empty($listUrls)) {
            foreach ($listUrls as $listUrl) {
                $this->redis->lpush($cacheKey, $listUrl);
            }
        }
    }

    // Loop invariants hoisted out of the per-URL work.
    $existsSql = "SELECT COUNT(collect_content_id) FROM {{collect_content}} WHERE collect_content_url=:collect_content_url";
    $now = $_SERVER['REQUEST_TIME'];

    for ($i = 0; $i < $batchSize && $this->redis->llen($cacheKey) > 0; $i++) {
        $listUrl = $this->redis->rpop($cacheKey);
        if ($listUrl === false || $listUrl === null || $listUrl === '') {
            // The queue handed back nothing usable; skip this slot.
            continue;
        }

        $contentUrls = CollectTaskModel::getContentUrls(
            $listUrl,
            $task['collect_list_rules']['begin'],
            $task['collect_list_rules']['end']
        );
        if (empty($contentUrls)) {
            continue;
        }

        $repeatCount = 0;
        foreach ($contentUrls as $contentUrl) {
            $known = $this->db->queryScalar($existsSql, array(':collect_content_url' => $contentUrl));
            if ($known == 0) {
                $this->db->insert('{{collect_content}}', array(
                    'collect_content_id' => 0,
                    'collect_task_id' => $id,
                    'collect_content_url' => $contentUrl,
                    'is_published' => 0,
                    'is_collected' => 0,
                    'lasttime' => $now,
                    'dateline' => $now,
                ));
            } else {
                $repeatCount++;
            }

            if ($repeatCount > $maxRepeats) {
                // More than 20 already-known URLs on this list page: stop
                // collecting content URLs from it.
                break;
            }
        }
    }
}

/**
 * Processes up to 20 queued content URLs: fetches each page, extracts the
 * configured fields and stores them in the task's addon table.
 *
 * @param mixed $id   Collect task identifier used to select pending rows.
 * @param mixed $task Task record; reads `collect_content_rules` and
 *                    `collect_model_identify`.
 * @return void
 */
private function collectContentBatch($id, $task)
{
    $batchSize = 20;
    // NOTE(review): list mode talks to $this->redis while this mode uses
    // $this->queue; kept as in the original — confirm both are intended.
    // NOTE(review): this key is not scoped by task id either.
    $cacheKey = "jvod.collect.content.urls";

    // Refill the queue with every not-yet-collected row of this task.
    if ($this->queue->llen($cacheKey) == 0) {
        $sql = "SELECT collect_content_id, collect_task_id, collect_content_url, is_collected, is_published, lasttime, dateline FROM {{collect_content}} WHERE collect_task_id=:collect_task_id AND is_collected=:is_collected";
        $rows = $this->db->queryAll($sql, array(':collect_task_id' => $id, ':is_collected' => 0));
        if (!empty($rows)) {
            foreach ($rows as $row) {
                $this->queue->lpush(
                    $cacheKey,
                    json_encode(array($row['collect_content_id'], $row['collect_content_url']))
                );
            }
        }
    }

    $addonsTable = "{{collect_model_addons{$task['collect_model_identify']}}}";

    for ($i = 0; $i < $batchSize && $this->queue->llen($cacheKey) > 0; $i++) {
        $entry = json_decode($this->queue->rpop($cacheKey));
        if (!is_array($entry) || count($entry) < 2) {
            // Empty pop or malformed payload: nothing to fetch for this slot.
            continue;
        }
        list($contentId, $contentUrl) = $entry;

        $charset = 'utf-8';
        // NOTE(review): presumably get_url_contents() takes $charset by
        // reference and overwrites it with the detected page charset — verify.
        $body = CollectTask::get_url_contents($contentUrl, $charset);
        if (strtolower($charset) != 'utf-8') {
            $body = mb_convert_encoding($body, 'UTF-8', $charset);
        }

        // Not read in this method; kept because $task may be observed elsewhere.
        $task['collect_content_data'] = array();

        $data = array('collect_content_id' => $contentId);
        foreach ($task['collect_content_rules'] as $rule) {
            $value = '';
            if (!empty($rule['begin']) && !empty($rule['end'])) {
                // NOTE(review): begin/end are interpolated unquoted, so they act
                // as regex fragments and a literal "/" in them breaks the
                // pattern — confirm whether preg_quote() is wanted here.
                $matches = array();
                preg_match("/{$rule['begin']}(.+?){$rule['end']}/s", $body, $matches);
                // isset() covers "no match" and "invalid pattern" without an
                // undefined-offset notice, and keeps a captured "0" intact.
                if (isset($matches[1])) {
                    $value = $matches[1];
                }
            }
            $data[$rule['collect_fields_identify']] = $value;
        }

        $inserted = $this->db->insert($addonsTable, $data);
        if ($inserted) {
            // Flag the source row only after the extracted data has been stored.
            $this->db->update(
                "{{collect_content}}",
                array('is_collected' => 1),
                'collect_content_url=:collect_content_url',
                array(':collect_content_url' => $contentUrl)
            );
        }
    }
}