Exemplo n.º 1
0
 /**
  * One-off update task that backfills car-related columns on group-buying rows.
  *
  * Each request handles a single group-buying record, selected by the `i`
  * query parameter as an index into the newest-first list of up to 10000 rows.
  * For that record it writes brandid / serieid / g3autoid / autointro /
  * coverlogo / contentlogo / autoids, then calls showMessage() with the URL
  * of the next index. Once `i` reaches the row count the task is passed to
  * _finishTask().
  *
  * @return void
  */
 public function handleTuangou20121204()
 {
     $taskName = 'handleTuangou20121204';
     $thisTask = $this->update_log_db->get_one(array('file' => $taskName));
     $this->_executedCheck($taskName, $thisTask);

     $groupbuying_db = bpBase::loadModel('groupbuying_model');
     $tuangous = $groupbuying_db->select('', '*', '0,10000', 'id DESC');
     $count = is_array($tuangous) ? count($tuangous) : 0;
     // A missing or negative cursor restarts at the first row instead of
     // indexing outside $tuangous.
     $i = isset($_GET['i']) ? max(0, intval($_GET['i'])) : 0;
     if ($i >= $count) {
         $this->_finishTask($taskName);
         return;
     }

     $thisTuangou = $tuangous[$i];
     // Cast before the id is concatenated into the WHERE clauses below.
     $autoid = intval($thisTuangou['autoid']);
     $auto_db = bpBase::loadModel('autoclassification_model');
     $autoObj = bpBase::loadAppClass('autoObj', 'auto');
     $thisAuto = $auto_db->getCfByID($thisTuangou['autoid']);
     // Leave the row untouched when its car record cannot be loaded, rather
     // than dereferencing a non-object and writing empty values.
     if ($thisAuto) {
         $thisAuto->mlogo = $autoObj->getLogo($thisAuto->id, $thisAuto->logo, 'm', $thisAuto->grade);
         // Columns filled in: g3autoid, autoids, coverlogo, contentlogo, brandid, serieid, autointro
         $row = array();
         if ($thisAuto->grade == 1) {
             // A grade-1 record is itself used as the brand.
             $row['brandid'] = $thisAuto->id;
             $row['autointro'] = remove_html_tag($thisAuto->intro);
             $childAutos = $auto_db->select('`grade`=3 AND `status`<3 AND `g1id`=' . $autoid);
         } else {
             $row['brandid'] = $thisAuto->g1id;
             $row['serieid'] = $thisAuto->id;
             $row['g3autoid'] = $thisAuto->id;
             $row['autointro'] = $thisAuto->advantage;
             $childAutos = $auto_db->select('`status`<3 AND `parentid`=' . $autoid);
         }
         $row['coverlogo'] = MAIN_URL_ROOT . $thisAuto->mlogo;
         $row['contentlogo'] = MAIN_URL_ROOT . $thisAuto->mlogo;

         if ($childAutos) {
             // Comma-separated child ids. The previous loop never set its
             // separator, so the ids were glued together with nothing between.
             $childIds = array();
             foreach ($childAutos as $ca) {
                 $childIds[] = $ca['id'];
             }
             $row['autoids'] = implode(',', $childIds);
         }
         $groupbuying_db->update($row, array('id' => $thisTuangou['id']));
     }

     $i++;
     showMessage($thisTask['des'] . ':' . $i . '/' . $count, '?m=update&c=updateTask&a=' . $taskName . '&i=' . $i, 0);
 }
Exemplo n.º 2
0
 /**
  * Imports spidered content of one rule into the article table, one item per
  * request.
  *
  * Query parameters read: `id` (spider rule id), `step` (nothing happens
  * unless it is 2) and `i` (index of the item to import).
  *
  * The first step-2 request writes the ids of the rule's unhandled, titled
  * spider rows to upload/collectContents{ruleid}.txt. Each later request
  * imports item `i` from that file and calls showMessage() with the URL for
  * `i + 1`. When `i` passes the end, the file is removed and the number of
  * imported articles (kept in $_SESSION['collectArticleCount']) is reported.
  *
  * @return void
  */
 function collect()
 {
     $id = isset($_GET['id']) ? intval($_GET['id']) : 0;
     $ruleid = $id;
     $spider_rule_db = bpBase::loadModel('spider_rule_model');
     $spider_content_db = bpBase::loadModel('spider_content_model');
     $thisRule = $spider_rule_db->get_one(array('id' => $id));

     $step = isset($_GET['step']) ? intval($_GET['step']) : 0;
     if ($step != 2) {
         return;
     }

     // Single definition of the queue file path, shared by every branch below.
     $queueFile = ABS_PATH . 'upload' . DIRECTORY_SEPARATOR . 'collectContents' . $ruleid . '.txt';
     $stepUrl = '?m=article&c=m_article&a=collect&id=' . $ruleid . '&step=2';

     if (!file_exists($queueFile)) {
         // No queue yet: snapshot the ids still to be imported, then reload.
         $contents = $spider_content_db->get_results('id', '', 'ruleid=' . $ruleid . ' AND title!=\'\' AND handle=0', 'id ASC');
         file_put_contents($queueFile, serialize($contents));
         showMessage('正在入库', $stepUrl, 1);
         return;
     }

     if (!isset($_SESSION['collectArticleCount'])) {
         $_SESSION['collectArticleCount'] = 0;
     }
     // NOTE(review): unserialize() on a file that lives under upload/ — confirm
     // nothing but this method can write that path, or switch the queue to JSON.
     $contents = unserialize(file_get_contents($queueFile));
     if (!$contents) {
         // Drop the empty queue; otherwise it would survive forever and every
         // later run for this rule would stop here without re-querying.
         unlink($queueFile);
         showMessage('没有要入库的内容');
         return;
     }

     // $i walks over the queued content ids; negative values are clamped to 0.
     $i = isset($_GET['i']) ? max(0, intval($_GET['i'])) : 0;
     $contentsCount = count($contents);
     if ($i >= $contentsCount) {
         unlink($queueFile);
         $collectCount = $_SESSION['collectArticleCount'];
         unset($_SESSION['collectArticleCount']);
         showMessage('采集结束,共采集入库' . $collectCount . '篇文章');
         return;
     }

     // Defined up front: the duplicate-title path below never assigned it,
     // yet the progress message always reads it.
     $tip = '';
     $contentid = $contents[$i]->id;
     $thisContent = $spider_content_db->get_row(array('id' => $contentid));
     $content = unserialize($thisContent->content);
     // The target channel comes from the rule.
     $channelid = intval($thisRule['channelid']);
     $content['channel_id'] = $channelid;
     $content['time'] = $content['addtime'];
     if (!$content['time']) {
         $content['time'] = SYS_TIME;
     }
     $channelObj = bpBase::loadAppClass('channelObj', 'channel');
     $thisChannel = $channelObj->getChannelByID($channelid);
     $sameContent = $this->article_db->get_one(array('channel_id' => $channelid, 'title' => $content['title']));

     if ($sameContent) {
         // An article with the same title already exists in this channel:
         // skip the import and just mark the spider row as handled.
         $spider_content_db->update(array('handle' => 1), array('id' => $contentid));
     } elseif (!strlen($content['title'])) {
         $tip = ',该文章没有采集到标题,不能入库';
     } else {
         // Look up the related car model and keyword from the title.
         $autoidAndkeyword = $this->articleObj->getRalateAuto($content['title']);
         $content['autoid'] = $autoidAndkeyword['autoid'];
         $content['keywords'] = $autoidAndkeyword['keyword'];
         // Strip <a> tags when the rule asks for it.
         if ($thisRule['clearhref']) {
             $content['content'] = clearHtmlTagA($content['content']);
         }
         // Fall back to the first 200 characters of the plain text as intro.
         $intro = $content['intro'];
         if (!$intro) {
             $txtContent = remove_html_tag($content['content']);
             $intro = mb_substr($txtContent, 0, 200, 'gbk');
         }
         // Thumbnail: the spidered thumb if present, else the first image in the body.
         if (!$content['thumb']) {
             $thumb = $this->_setFirstImageAsThumb($thisChannel, $content['content']);
         } else {
             $thumb = $this->_setFirstImageAsThumb($thisChannel, '<img src="' . $content['thumb'] . '" />');
         }
         if (file_exists(ABS_PATH . 'config' . DIRECTORY_SEPARATOR . 'cms.php')) {
             require ABS_PATH . 'config' . DIRECTORY_SEPARATOR . 'cms.php';
             if (defined('CONTENT_IMAGE_SEP_HANDLE') && CONTENT_IMAGE_SEP_HANDLE) {
                 $content['content'] = $this->articleObj->autoSaveRemoteImage($content['content']);
             }
         }
         // 'keywords' uses the keyword derived from the title above. The
         // original passed $content['contentPageCount'] here, which left the
         // computed keyword unused.
         $row = array(
             'channel_id' => $channelid,
             'title' => $content['title'],
             'subtitle' => '',
             'link' => '',
             'externallink' => 0,
             'thumb' => $thumb,
             'content' => $content['content'],
             'intro' => $intro,
             'author' => $content['author'],
             'source' => $content['source'],
             'uid' => 0,
             'time' => $content['time'],
             'last_update' => $content['time'],
             'autoid' => $content['autoid'],
             'keywords' => $content['keywords'],
             'cancomment' => 1,
             'titles' => $content['pagetitle'],
             'site' => 1,
         );
         if ($channelid != 1) {
             $row['content'] = $this->_addAutoLink($row['content']);
         }
         $row['pagecount'] = $this->_calContentPageCount($row['content']);
         $row['content'] = $this->articleObj->autoSaveRemoteImage($row['content']);
         $siteObj = bpBase::loadAppClass('siteObj', 'site');
         $thisSite = $siteObj->getSiteByID(1);
         if (intval($thisSite->abspath)) {
             // Rewrite root-relative upload paths to absolute URLs.
             $row['content'] = str_replace('src="/upload', 'src="' . MAIN_URL_ROOT . '/upload', $row['content']);
         }
         // The stored keywords value always starts with a comma.
         if (substr($row['keywords'], 0, 1) != ',') {
             $row['keywords'] = ',' . $row['keywords'];
         }
         $this->_add($row, $thisChannel, 1);
         $_SESSION['collectArticleCount']++;
         $spider_content_db->update(array('handle' => 1), array('id' => $contentid));
     }

     // Move on to the next queued item.
     $nextI = $i + 1;
     showMessage('正在入库:' . $nextI . '/' . $contentsCount . $tip . '...<a href="' . $stepUrl . '&i=' . $i . '">刷新</a>&nbsp;&nbsp;<a href="' . $stepUrl . '&i=' . $nextI . '">跳到下一个</a>', $stepUrl . '&i=' . $nextI, 1);
 }