/**
 * Update task: backfills the auto-related columns of group-buying rows.
 *
 * One row is handled per request. The zero-based index of the row comes from
 * $_GET['i'] (defaults to 0). For that row the method looks up the linked auto
 * classification and writes brandid / serieid / g3autoid / autointro /
 * coverlogo / contentlogo / autoids back to the group-buying table, then calls
 * showMessage() with the URL of the next index. Once the index reaches the
 * number of loaded rows, _finishTask() is called instead.
 *
 * @return void
 */
public function handleTuangou20121204()
{
    $taskName = 'handleTuangou20121204';
    $thisTask = $this->update_log_db->get_one(array('file' => $taskName));
    $this->_executedCheck($taskName, $thisTask);

    $groupbuying_db = bpBase::loadModel('groupbuying_model');
    $tuangous = $groupbuying_db->select('', '*', '0,10000', 'id DESC');
    $count = $tuangous ? count($tuangous) : 0;

    // A missing or negative index starts from the first row instead of
    // raising a notice / reading outside the list.
    $i = isset($_GET['i']) ? max(0, intval($_GET['i'])) : 0;

    if ($i >= $count) {
        $this->_finishTask($taskName);
        return;
    }

    $thisTuangou = $tuangous[$i];
    // Cast once so the id can be concatenated into the SQL conditions below.
    $autoid = intval($thisTuangou['autoid']);

    $auto_db = bpBase::loadModel('autoclassification_model');
    $autoObj = bpBase::loadAppClass('autoObj', 'auto');
    $thisAuto = $auto_db->getCfByID($thisTuangou['autoid']);

    // Rows whose auto record cannot be loaded are skipped rather than
    // overwritten with empty values; the task still advances to the next row.
    if ($thisAuto) {
        $thisAuto->mlogo = $autoObj->getLogo($thisAuto->id, $thisAuto->logo, 'm', $thisAuto->grade);

        // Columns filled in: g3autoid, autoids, coverlogo, contentlogo, brandid, serieid, autointro
        $row = array();
        if ($thisAuto->grade == 1) {
            $row['brandid'] = $thisAuto->id;
            $row['autointro'] = remove_html_tag($thisAuto->intro);
            $childAutos = $auto_db->select('`grade`=3 AND `status`<3 AND `g1id`=' . $autoid);
        } else {
            $row['brandid'] = $thisAuto->g1id;
            $row['serieid'] = $thisAuto->id;
            $row['g3autoid'] = $thisAuto->id;
            $row['autointro'] = $thisAuto->advantage;
            $childAutos = $auto_db->select('`status`<3 AND `parentid`=' . $autoid);
        }
        $row['coverlogo'] = MAIN_URL_ROOT . $thisAuto->mlogo;
        $row['contentlogo'] = MAIN_URL_ROOT . $thisAuto->mlogo;

        if ($childAutos) {
            // Build a comma-separated id list. The previous code never set its
            // separator, so the ids were glued together without any delimiter.
            $childIds = array();
            foreach ($childAutos as $ca) {
                $childIds[] = $ca['id'];
            }
            $row['autoids'] = implode(',', $childIds);
        }

        $groupbuying_db->update($row, array('id' => $thisTuangou['id']));
    }

    $i++;
    showMessage($thisTask['des'] . ':' . $i . '/' . $count, '?m=update&c=updateTask&a=' . $taskName . '&i=' . $i, 0);
}
/**
 * Imports the spidered contents of one collection rule into the article
 * table, one item per request.
 *
 * Request parameters, all read from $_GET:
 *  - id:   id of the spider rule (defaults to 0).
 *  - step: the method only does work when this equals 2.
 *  - i:    zero-based index into the queued content list (defaults to 0).
 *
 * Flow when step is 2:
 *  - If the queue file upload/collectContents{id}.txt does not exist, the ids
 *    of unhandled contents with a non-empty title are serialized into it and
 *    showMessage() is called with the step-2 URL.
 *  - Otherwise item i of the queue is loaded. Unless an article with the same
 *    channel and title already exists, it is stored through _add(). The
 *    spider content row is then marked as handled and showMessage() is called
 *    with the URL of item i + 1.
 *  - When i is past the end of the queue, the queue file is deleted and the
 *    number of imported articles (kept in $_SESSION) is reported.
 *
 * @return void
 */
function collect()
{
    $id = isset($_GET['id']) ? intval($_GET['id']) : 0;
    $ruleid = $id;
    $spider_rule_db = bpBase::loadModel('spider_rule_model');
    $spider_content_db = bpBase::loadModel('spider_content_model');
    $thisRule = $spider_rule_db->get_one(array('id' => $id));

    $step = isset($_GET['step']) ? intval($_GET['step']) : 0;
    if ($step != 2) {
        return;
    }

    // One path for every access to the queue file (it was previously built in
    // two different ways for reading and for deleting).
    $queueFile = ABS_PATH . 'upload' . DIRECTORY_SEPARATOR . 'collectContents' . $ruleid . '.txt';
    $stepUrl = '?m=article&c=m_article&a=collect&id=' . $ruleid . '&step=2';

    if (!file_exists($queueFile)) {
        // No queue yet: collect the ids of pending contents and start importing.
        $contents = $spider_content_db->get_results('id', '', 'ruleid=' . $ruleid . ' AND title!=\'\' AND handle=0', 'id ASC');
        file_put_contents($queueFile, serialize($contents));
        showMessage('正在入库', $stepUrl, 1);
        return;
    }

    if (!isset($_SESSION['collectArticleCount'])) {
        $_SESSION['collectArticleCount'] = 0;
    }

    // The queue file is written by this method itself (see above).
    $contents = unserialize(file_get_contents($queueFile));
    if (!$contents) {
        // Drop the empty queue so the next run rebuilds it; leaving it in place
        // made every later run report "nothing to import".
        unlink($queueFile);
        showMessage('没有要入库的内容');
        return;
    }

    // $i walks over the queued content entries.
    $i = isset($_GET['i']) ? max(0, intval($_GET['i'])) : 0;
    $contentsCount = count($contents);

    if ($i >= $contentsCount) {
        unlink($queueFile);
        $collectCount = $_SESSION['collectArticleCount'];
        unset($_SESSION['collectArticleCount']);
        showMessage('采集结束,共采集入库' . $collectCount . '篇文章');
        return;
    }

    $contentid = $contents[$i]->id;

    // Load the stored content.
    // NOTE(review): the row holds serialized data derived from remote pages —
    // confirm it is only ever written by the spider itself.
    $thisContent = $spider_content_db->get_row(array('id' => $contentid));
    $content = unserialize($thisContent->content);

    // The target channel comes from the rule.
    $channelid = intval($thisRule['channelid']);
    $content['channel_id'] = $channelid;
    $content['time'] = !empty($content['addtime']) ? $content['addtime'] : SYS_TIME;

    $channelObj = bpBase::loadAppClass('channelObj', 'channel');
    $thisChannel = $channelObj->getChannelByID($channelid);
    $sameContent = $this->article_db->get_one(array('channel_id' => $channelid, 'title' => $content['title']));

    // Initialised up front: it used to be undefined when the article was
    // skipped as a duplicate, yet it is always appended to the message below.
    $tip = '';

    if (!$sameContent) {
        // Contents whose title already exists in the channel are ignored.

        // Find the related car model and keyword from the title.
        $autoidAndkeyword = $this->articleObj->getRalateAuto($content['title']);
        $content['autoid'] = $autoidAndkeyword['autoid'];
        $content['keywords'] = $autoidAndkeyword['keyword'];

        // Strip links when the rule asks for it.
        if (!empty($thisRule['clearhref'])) {
            $content['content'] = clearHtmlTagA($content['content']);
        }

        // Fall back to the first 200 characters of the plain text as intro.
        $intro = !empty($content['intro']) ? $content['intro'] : '';
        if (!$intro) {
            $txtContent = remove_html_tag($content['content']);
            $intro = mb_substr($txtContent, 0, 200, 'gbk');
        }

        if (strlen($content['title'])) {
            if (empty($content['thumb'])) {
                $thumb = $this->_setFirstImageAsThumb($thisChannel, $content['content']);
            } else {
                $thumb = $this->_setFirstImageAsThumb($thisChannel, '<img src="' . $content['thumb'] . '" />');
            }

            $cmsConfig = ABS_PATH . 'config' . DIRECTORY_SEPARATOR . 'cms.php';
            if (file_exists($cmsConfig)) {
                require $cmsConfig;
                if (defined('CONTENT_IMAGE_SEP_HANDLE') && CONTENT_IMAGE_SEP_HANDLE) {
                    $content['content'] = $this->articleObj->autoSaveRemoteImage($content['content']);
                }
            }

            $row = array(
                'channel_id' => $channelid,
                'title' => $content['title'],
                'subtitle' => '',
                'link' => '',
                'externallink' => 0,
                'thumb' => $thumb,
                'content' => $content['content'],
                'intro' => $intro,
                'author' => $content['author'],
                'source' => $content['source'],
                'uid' => 0,
                'time' => $content['time'],
                'last_update' => $content['time'],
                'autoid' => $content['autoid'],
                // Was filled from $content['contentPageCount'], which left the
                // keyword derived from the title above unused.
                'keywords' => $content['keywords'],
                'cancomment' => 1,
                'titles' => $content['pagetitle'],
                'site' => 1,
            );

            if ($channelid != 1) {
                $row['content'] = $this->_addAutoLink($row['content']);
            }
            $row['pagecount'] = $this->_calContentPageCount($row['content']);
            $row['content'] = $this->articleObj->autoSaveRemoteImage($row['content']);

            $siteObj = bpBase::loadAppClass('siteObj', 'site');
            $thisSite = $siteObj->getSiteByID(1);
            if (intval($thisSite->abspath)) {
                // Turn site-relative upload paths into absolute URLs.
                $row['content'] = str_replace('src="/upload', 'src="' . MAIN_URL_ROOT . '/upload', $row['content']);
            }

            // Stored keywords always start with a comma.
            if (substr($row['keywords'], 0, 1) != ',') {
                $row['keywords'] = ',' . $row['keywords'];
            }

            $this->_add($row, $thisChannel, 1);
            $_SESSION['collectArticleCount']++;
            $spider_content_db->update(array('handle' => 1), array('id' => $contentid));
        } else {
            $tip = ',该文章没有采集到标题,不能入库';
        }
    } else {
        $spider_content_db->update(array('handle' => 1), array('id' => $contentid));
    }

    // Move on to the next queued content.
    $nextI = $i + 1;
    showMessage(
        '正在入库:' . $nextI . '/' . $contentsCount . $tip
        . '...<a href="' . $stepUrl . '&i=' . $i . '">刷新</a>'
        . ' <a href="' . $stepUrl . '&i=' . $nextI . '">跳到下一个</a>',
        $stepUrl . '&i=' . $nextI,
        1
    );
}