/**
 * Returns the message body, building it from the message parts on first use.
 *
 * Every part is fetched over IMAP, decoded with BrIMAP::decode(), converted
 * to UTF-8 when the part declares a charset, trimmed, and stripped of
 * <head>, <meta>, <base> and <style> markup before being appended to the
 * cached body. When the message is flagged as HTML, only the inner HTML of
 * the <body> element (or of the whole document if there is no <body>) is kept.
 *
 * @return string|null The cached body; null when nothing has been loaded.
 */
function getBody()
{
    if ($this->parts && $this->body === null) {
        $stripPatterns = array(
            '~<head[^>]*?>.*?</head>~ism',
            '~<meta[^>]*?>~ism',
            '~<base[^>]*?>~ism',
            '~<style[^>]*?>.*?</style>~ism',
        );

        foreach ($this->parts as $part) {
            $partNo = $part['partNo'];
            $encoding = $part['encoding'];
            $charset = $part['charset'];

            $body = imap_fetchbody($this->message->getMailbox(), $this->message->getUID(), $partNo, FT_UID);
            $body = BrIMAP::decode($body, $encoding);

            if ($charset) {
                // iconv() returns false on failure (unknown charset, illegal
                // sequence); keep the decoded bytes instead of losing the part.
                $converted = @iconv($charset, 'UTF-8', $body);
                if ($converted !== false) {
                    $body = $converted;
                }
            }

            $body = trim($body);

            // preg_replace() returns null on a PCRE error (e.g. backtrack
            // limit); in that case keep the unstripped text.
            $stripped = preg_replace($stripPatterns, '', $body);
            if ($stripped !== null) {
                $body = $stripped;
            }

            if ($this->isHTML && $body) {
                try {
                    $doc = phpQuery::newDocument($body);
                    $bodyTag = $doc->find('body');
                    if ($bodyTag->length() > 0) {
                        $body = trim(pq($bodyTag)->html());
                    } else {
                        $body = trim($doc->html());
                    }
                } catch (Exception $e) {
                    // Best effort: fall back to the regex-cleaned text.
                }
                // Release the document whether or not parsing succeeded.
                phpQuery::unloadDocuments();
            }

            $this->body .= $body;
        }
    }

    return $this->body;
}
/**
 * Repairs HTML with Tidy and loads the result as the only phpQuery document.
 *
 * All previously loaded phpQuery documents are unloaded first.
 *
 * @param string $html Raw markup.
 * @return mixed Whatever phpQuery::newDocumentHTML() returns for the markup.
 */
function load_html($html)
{
    $tidy = tidy_parse_string($html);

    // tidy_parse_string() returns false on failure; in that case hand the
    // untouched markup to phpQuery instead of passing false to Tidy.
    if ($tidy !== false) {
        tidy_clean_repair($tidy);
        $html = tidy_get_html($tidy);
    }

    phpQuery::unloadDocuments();

    return phpQuery::newDocumentHTML($html);
}
/**
 * Parses markup and evaluates a configuration node against it.
 *
 * The document is created with a utf-8 default charset, selected as the
 * current document, passed (wrapped by pq()) to self::queryValue() together
 * with $conf, and unloaded again before the result is returned.
 *
 * @param $conf ConfNode
 * @param $html
 * @return array|float|int|String
 */
public static function parseHtml($conf, $html)
{
    query::$defaultCharset = "utf-8";

    $document = query::newDocument($html);
    query::selectDocument($document);

    $result = self::queryValue(pq($document), $conf);

    // free the memory held by the parsed document
    query::unloadDocuments($document);

    return $result;
}
/**
 * Downloads http://dmtoys.com.ua and collects the src attribute of every <img>.
 *
 * @return array List of array('href' => <img src value>) entries; empty when
 *               the page could not be fetched or contains no images.
 */
public function parseTest()
{
    $html = file_get_contents('http://dmtoys.com.ua');
    if ($html === false) {
        // Download failed: there is nothing to parse.
        return [];
    }

    $results = phpQuery::newDocument($html);

    $dd = [];
    foreach (pq($results)['img'] as $value) {
        $dd[] = ['href' => pq($value)->attr('src')];
    }

    phpQuery::unloadDocuments();

    return $dd;
}
/**
 * Echoes the parsed song titles and queues the artist image for download.
 *
 * Does nothing when the response carries an HTTP error. The image task is
 * only queued when the page actually contains an image URL.
 *
 * @param array $r     Response; 'info' and 'content' are read.
 * @param array $param Task arguments; 'artistName' is used for the file name
 *                     and the whole array is forwarded to the cb3 callback.
 */
function cb2($r, $param)
{
    if ($this->hasHttpError($r['info'])) {
        return;
    }

    $html = phpQuery::newDocumentHTML($r['content']);

    foreach ($html['#song-list td.songTitle a'] as $v) {
        echo "cb2:\t" . pq($v)->text() . "\n";
    }

    $imgUrl = $html['div.sidebar dl.singerInfo img']->attr('src');
    if ($imgUrl) {
        // Take the extension from the URL path only, so a query string
        // ("a.jpg?v=2") does not end up in the local file name.
        $imgPath = parse_url($imgUrl, PHP_URL_PATH);
        $extension = pathinfo($imgPath ? $imgPath : $imgUrl, PATHINFO_EXTENSION);
        $imgFile = $this->imgDir . '/' . $param['artistName'] . '.' . $extension;

        $this->getCurl()->add(
            array(
                'url' => $imgUrl,
                'file' => $imgFile,
                'ctl' => array('type' => 'img'),
                'args' => array_merge($param, array('imgFile' => $imgFile)),
            ),
            array($this, 'cb3')
        );
    }

    phpQuery::unloadDocuments();
}
/**
 * Unloads the whole document from memory.
 * CAUTION! No further operations will be possible on this document.
 * All objects referring to it will be useless.
 *
 * @return void
 */
public function unloadDocument()
{
    $documentId = $this->getDocumentID();
    phpQuery::unloadDocuments($documentId);
}
/**
 * Extracts content from a page according to one spider data rule.
 *
 * Two strategies are supported, chosen by $data['dom']:
 *  - DOM mode: $data['rule'] is either "selector@attribute" or up to three
 *    newline-separated values (selector, phpQuery method, method argument);
 *    the method defaults to "html".
 *  - Regex mode: the literal rule "<%content%>" returns $html unchanged;
 *    otherwise the rule is converted with spiderTools::pregTag() and, if it
 *    looks like a pattern, matched against $html, reading the named group
 *    "content". A rule that does not look like a pattern is returned as is.
 *
 * With $data['multi'] set, all matches are joined with
 * '#--iCMS.PageBreak--#'; collecting stops at the first repeated match.
 *
 * @param string $html Page markup.
 * @param array  $data Rule definition; keys 'dom', 'rule' and 'multi' are read.
 * @param mixed  $rule Not used by this method.
 * @return mixed Extracted content; null when a single regex match finds no
 *               "content" group.
 */
public static function match($html, $data, $rule)
{
    $match_hash = array();

    if ($data['dom']) {
        iPHP::import(iPHP_LIB . '/phpQuery.php');
        spider::$dataTest && !empty($_GET['pq_debug']) && (phpQuery::$debug = 1);
        $doc = phpQuery::newDocumentHTML($html, 'UTF-8');

        // array_pad() keeps list() from reading missing offsets when the
        // rule supplies fewer parts than expected.
        if (strpos($data['rule'], '@') !== false) {
            list($content_dom, $content_attr) = array_pad(explode("@", $data['rule']), 2, '');
            $content_fun = 'attr';
        } else {
            list($content_dom, $content_fun, $content_attr) = array_pad(explode("\n", $data['rule']), 3, '');
        }
        $content_dom = trim($content_dom);
        $content_fun = trim($content_fun);
        $content_attr = trim($content_attr);
        $content_fun or $content_fun = 'html';

        if ($data['multi']) {
            $conArray = array();
            foreach ($doc[$content_dom] as $doc_key => $doc_value) {
                $node = phpQuery::pq($doc_value);
                if ($content_attr) {
                    $_content = $node->{$content_fun}($content_attr);
                } else {
                    $_content = $node->{$content_fun}();
                }
                $cmd5 = md5((string) $_content);
                // Stop at the first repeated value.
                if (isset($match_hash[$cmd5])) {
                    break;
                }
                $conArray[$doc_key] = $_content;
                $match_hash[$cmd5] = true;
            }
            if (spider::$dataTest) {
                echo "<b>多条匹配结果:</b><pre>";
                print_r($match_hash);
                echo "</pre><hr />";
            }
            $content = implode('#--iCMS.PageBreak--#', $conArray);
        } else {
            if ($content_attr) {
                $content = $doc[$content_dom]->{$content_fun}($content_attr);
            } else {
                $content = $doc[$content_dom]->{$content_fun}();
            }
        }

        phpQuery::unloadDocuments($doc->getDocumentID());
        unset($doc);

        return $content;
    }

    if (trim($data['rule']) == '<%content%>') {
        return $html;
    }

    $data_rule = spiderTools::pregTag($data['rule']);
    if (!preg_match('/(<\\w+>|\\.\\*|\\.\\+|\\\\d|\\\\w)/i', $data_rule)) {
        // Not a pattern: the rule itself is the content.
        return $data_rule;
    }

    if ($data['multi']) {
        preg_match_all('|' . $data_rule . '|is', $html, $matches, PREG_SET_ORDER);
        $conArray = array();
        foreach ((array) $matches as $mkey => $mat) {
            $cmd5 = md5($mat['content']);
            if (isset($match_hash[$cmd5])) {
                break;
            }
            $conArray[$mkey] = $mat['content'];
            $match_hash[$cmd5] = true;
        }
        if (spider::$dataTest) {
            echo "<b>多条匹配结果:</b><pre>";
            print_r($match_hash);
            echo "</pre><hr />";
        }

        return implode('#--iCMS.PageBreak--#', $conArray);
    }

    // The original passed an undefined variable ($PREG_SET_ORDER) as the
    // flags argument; preg_match() needs no flags here.
    preg_match('|' . $data_rule . '|is', $html, $matches);

    return isset($matches['content']) ? $matches['content'] : null;
}
/**
 * Download and HTML callback.
 *
 * For a successful response with content, rewrites the links of stylesheets,
 * external scripts, images and anchors in the page, writes the rewritten
 * page to $args['file'] when given, and queues every URL that has not been
 * added yet with this method as the callback.
 *
 * @param array $r    Response; 'info' (with 'url') and 'content' are read.
 * @param array $args Task arguments; 'file' is the optional output path.
 */
function cbProcess($r, $args)
{
    if ($this->hasHttpError($r['info']) || !isset($r['content'])) {
        return;
    }

    $urlCurrent = $r['info']['url'];
    $pq = phpQuery::newDocumentHTML($r['content']);
    $urlDownload = array();

    // css
    foreach ($pq['link[type$=css]'] as $v) {
        $v = pq($v);
        $url = $this->uri2url($v->attr('href'), $urlCurrent);
        $v->attr('href', $this->url2uri($url, $urlCurrent));
        $urlDownload[] = $url;
    }

    // script
    foreach ($pq['script[type$=script]'] as $v) {
        $v = pq($v);
        // Inline scripts have no src; giving them one would break them.
        if (null == $v->attr('src')) {
            continue;
        }
        $url = $this->uri2url($v->attr('src'), $urlCurrent);
        $v->attr('src', $this->url2uri($url, $urlCurrent));
        $urlDownload[] = $url;
    }

    // pic
    $pic = $pq['img'];
    if ($this->downloadPic) {
        foreach ($pic as $v) {
            $v = pq($v);
            $url = $this->uri2url($v->attr('src'), $urlCurrent);
            $v->attr('src', $this->url2uri($url, $urlCurrent));
            $urlDownload[] = $url;
        }
    } else {
        // Not downloading images: only rewrite src via uri2url().
        foreach ($pic as $v) {
            $v = pq($v);
            $v->attr('src', $this->uri2url($v->attr('src'), $urlCurrent));
        }
    }

    // html
    $urlHtml = array();
    foreach ($pq['a'] as $v) {
        $v = pq($v);
        $url = $this->uri2url($v->attr('href'), $urlCurrent);
        // Only follow links located under the current page's directory.
        if (0 === strpos($url, $this->urlDir($urlCurrent))) {
            $v->attr('href', $this->url2uri($url, $urlCurrent));
            $urlHtml[] = $url;
        }
    }

    $r['content'] = $pq->html();

    // add
    $queues = array(
        'urlDownload' => array_unique($urlDownload),
        'urlHtml' => array_unique($urlHtml),
    );
    foreach ($queues as $kind => $urls) {
        foreach ($urls as $v1) {
            if (in_array($v1, $this->urlAdded)) {
                continue;
            }
            $file = $this->getFile($v1);
            if (null == $file && $kind == 'urlDownload') {
                continue;
            }
            $item = array('url' => $v1, 'file' => $file, 'args' => array('url' => $v1, 'file' => $file));
            if ($kind == 'urlDownload') {
                unset($item['args']['file']);
            } else {
                unset($item['file']);
            }
            $this->getCurl()->add($item, array($this, 'cbProcess'));
            $this->urlAdded[] = $v1;
        }
    }

    // file_put_contents() returns the byte count, so 0 is a valid result for
    // an empty page; only a strict false means the write failed.
    if (isset($args['file']) && false === file_put_contents($args['file'], $r['content'], LOCK_EX)) {
        user_error('write file failed, file=' . $args['file'], E_USER_WARNING);
    }

    phpQuery::unloadDocuments();
}
/**
 * Clean up some messes: release this object's phpQuery document, if any.
 */
function __destruct()
{
    if ($this->browser->parserType == 'phpquery') {
        $id = phpQuery::getDocumentID($this->parser);
        // Calling unloadDocuments() with a null ID unloads every loaded
        // document, so only unload when this parser still maps to one.
        if ($id !== null) {
            phpQuery::unloadDocuments($id);
        }
    }
}
/**
 * Download and HTML callback.
 *
 * Text responses are parsed: links of stylesheets, external scripts, images,
 * XML links and anchors are rewritten, the page is written to $args['file']
 * when given, and the referenced URLs are collected. For a downloaded CSS
 * file, the URLs referenced by @import and url() are collected instead.
 * Every collected URL that has not been added yet is queued with this method
 * as the callback.
 *
 * @param array $r    Response; 'info' (with 'content_type' and 'url') and
 *                    'content' are read.
 * @param mixed $args Task arguments; 'url', 'file', 'type' and 'isDownload'
 *                    are read.
 */
function cbProcess($r, $args)
{
    if ($this->hasHttpError($r['info'])) {
        return;
    }

    $urlDownload = array();
    $urlParse = array();

    if (isset($r['content']) && 0 === strpos($r['info']['content_type'], 'text')) {
        $urlCurrent = $args['url'];
        $pq = phpQuery::newDocumentHTML($r['content']);

        // css
        foreach ($pq['link[type$=css]'] as $v) {
            $v = pq($v);
            $url = $this->uri2url($v->attr('href'), $urlCurrent);
            $v->attr('href', $this->cloneUrl2uri($url, $urlCurrent));
            $urlDownload[$url] = array('type' => 'css');
        }

        // script
        foreach ($pq['script[type$=script]'] as $v) {
            $v = pq($v);
            if (null != $v->attr('src')) {
                $url = $this->uri2url($v->attr('src'), $urlCurrent);
                $v->attr('src', $this->cloneUrl2uri($url, $urlCurrent));
                $urlDownload[$url] = array();
            }
        }

        // pic
        $pic = $pq['img'];
        if ($this->download['pic']['enable']) {
            foreach ($pic as $v) {
                $v = pq($v);
                $url = $this->uri2url($v->attr('src'), $urlCurrent);
                $v->attr('src', $this->cloneUrl2uri($url, $urlCurrent));
                $urlDownload[$url] = array();
            }
        } else {
            foreach ($pic as $v) {
                $v = pq($v);
                $v->attr('src', $this->uri2url($v->attr('src'), $urlCurrent));
            }
        }

        // link xml
        foreach ($pq['link[type$=xml]'] as $v) {
            $v = pq($v);
            $url = $this->uri2url($v->attr('href'), $urlCurrent);
            if ($this->isProcess($url)) {
                $v->attr('href', $this->cloneUrl2uri($url, $urlCurrent));
                $urlDownload[$url] = array();
            }
        }

        // href
        foreach ($pq['a'] as $v) {
            $v = pq($v);
            $href = $v->attr('href');
            $url = $this->uri2url($href, $urlCurrent);
            if ($this->download['zip']['enable'] && '.zip' == substr($href, -4)) {
                if ($this->download['zip']['withPrefix']) {
                    $isProcess = $this->isProcess($url);
                } else {
                    $isProcess = true;
                }
                if ($isProcess) {
                    $urlDownload[$url] = array();
                }
            } else {
                $isProcess = $this->isProcess($url);
                if ($isProcess) {
                    $urlParse[$url] = array();
                }
            }
            if ($isProcess) {
                $v->attr('href', $this->cloneUrl2uri($url, $urlCurrent));
            } else {
                $v->attr('href', $url);
            }
        }

        $r['content'] = $pq->html();
        if (isset($args['file']) && false === file_put_contents($args['file'], $r['content'], LOCK_EX)) {
            user_error('write file failed, file=' . $args['file'], E_USER_WARNING);
        }
        phpQuery::unloadDocuments();
    } elseif (!empty($args['isDownload'])) {
        if (isset($args['type']) && 'css' == $args['type']) {
            $uri = array();
            $content = file_get_contents($args['file']);
            // An unreadable file simply contributes no URLs.
            if (false !== $content) {
                // import
                preg_match_all('/@import\\s+url\\s*\\((.+)\\);/iU', $content, $matches);
                if (!empty($matches[1])) {
                    $uri = array_merge($uri, $matches[1]);
                }
                // url in css
                preg_match_all('/:\\s*url\\((\'|")?(.+?)\\1?\\)/i', $content, $matches);
                if (!empty($matches[2])) {
                    $uri = array_merge($uri, $matches[2]);
                }
            }
            foreach ($uri as $v) {
                // The @import pattern captures surrounding quotes and spaces
                // (url("a.css")); strip them before building the URL.
                $v = trim($v, " \t\n\r\0\x0B'\"");
                if ('' === $v) {
                    continue;
                }
                $urlDownload[$this->urlDir($r['info']['url']) . $v] = array('type' => 'css');
            }
        }
    }

    // add
    $queues = array('urlDownload' => $urlDownload, 'urlParse' => $urlParse);
    foreach ($queues as $kind => $urls) {
        foreach ($urls as $k1 => $v1) {
            if (in_array($k1, $this->urlAdded)) {
                continue;
            }
            $file = $this->url2file($k1);
            if (null == $file) {
                continue;
            }
            $type = null;
            if (isset($v1['type'])) {
                $type = $v1['type'];
            }
            $item = array(
                'url' => $k1,
                'file' => $file,
                'args' => array('url' => $k1, 'file' => $file, 'type' => $type, 'isDownload' => $kind == 'urlDownload'),
            );
            // Pages to parse are not streamed to a file by the downloader;
            // the path is still passed on through 'args'.
            if ($kind == 'urlParse') {
                unset($item['file']);
            }
            $this->getCurl()->add($item, array($this, 'cbProcess'));
            $this->urlAdded[] = $k1;
        }
    }
}
if (!$hook&&stripos($data->text(), $target)) { $local_result['rd'] = 'rd'; $hook = true; } echo'<pre>'; var_dump($data->text()); echo'<pre>'; if (count($data->find('noindex'))){ foreach (pq('noindex') as $noindex) { if (stripos(pq($noindex)->text(), $target))$local_result['nix'] = 'ni'; } } ($local_result['rd'] != 'rd'&&$local_result['nix'] != 'ni'&&$local_result['nfl'] != 'nf')?$local_result['clear'] = true:''; $hook?$local_result['live'] = true:$local_result['live'] = false; unset($data); phpQuery::unloadDocuments(); } else { $local_result['live'] = 'handed'; } return $local_result; }; $multi = new dHttp\Client(); $used_links = array(); foreach ($params as $url) { $url = trim($url); if(!in_array($url,$used_links)) { $resp_once[] = new dHttp\Client($url, array( CURLOPT_SSL_VERIFYPEER => FALSE, CURLOPT_HEADER => TRUE,
/**
 * Validates fetched content against the configured marker codes.
 *
 * spider::$content_right_code must be found in the content and
 * spider::$content_error_code must not be found. A code containing "DOM::"
 * is treated as a phpQuery selector (matched when the selection renders to a
 * non-empty string); any other code is searched as a plain substring.
 *
 * @param string $content Fetched page content.
 * @return bool True when the content passes every configured check.
 */
public static function check_content_code($content)
{
    // Each entry: the configured code and whether it is required to match.
    // $content is deliberately kept alive between the two checks: the
    // original unset it after the first one, so the error-code check ran
    // on an undefined variable whenever both codes were configured.
    $checks = array(
        array(spider::$content_right_code, true),
        array(spider::$content_error_code, false),
    );

    foreach ($checks as $check) {
        list($code, $mustMatch) = $check;
        if (!$code) {
            continue;
        }

        if (strpos($code, 'DOM::') !== false) {
            iPHP::import(iPHP_LIB . '/phpQuery.php');
            $doc = phpQuery::newDocumentHTML($content, 'UTF-8');
            $pq_dom = str_replace('DOM::', '', $code);
            $found = (bool) (string) phpQuery::pq($pq_dom);
            phpQuery::unloadDocuments($doc->getDocumentID());
            unset($doc);
        } else {
            $found = strpos($content, $code) !== false;
        }

        if ($found !== $mustMatch) {
            return false;
        }
    }

    return true;
}
/**
 * Applies newline-separated cleaning rules to a piece of content.
 *
 * Each rule has the form "pattern==replacement". Supported forms:
 *  - a rule containing "<%SELF%>": the content becomes the rule text with
 *    "<%SELF%>" replaced by the current content;
 *  - "NEED::text" / "NOT::text": return false unless the text is present /
 *    absent;
 *  - "LEN::n": return false when the content, with tags and whitespace
 *    removed, is shorter than n (measured by cstrlen());
 *  - "IMG::n": return false when fewer than n distinct image sources exist;
 *  - "DOM::selector[::method[::argument]]": every selected node's rendering
 *    (or method result) is replaced by the replacement, immediately;
 *  - anything else: converted with pregTag() and applied as a regex
 *    replacement after all rules have been read. In the replacement,
 *    "~SELF~" stands for the current content, "~S~" for a space and a
 *    literal "\n" for a newline; the pattern "~SELF~" stands for the content.
 *
 * NOTE(review): NEED::/NOT::/LEN::/IMG:: rules that pass their check are also
 * queued as regex replacements, exactly as before — confirm this is intended.
 *
 * @param string $rules   Rules, one per line.
 * @param string $content Content to clean.
 * @return string|false Cleaned content, or false when a check rule fails.
 */
function dataClean($rules, $content)
{
    iPHP::import(iPHP_LIB . '/phpQuery.php');

    $pattern = array();
    $replacement = array();

    foreach (explode("\n", $rules) as $key => $rule) {
        $rule = trim($rule);
        if (strpos($rule, '<%SELF%>') !== false) {
            $content = str_replace('<%SELF%>', $content, $rule);
            continue;
        }

        // array_pad() covers rules written without "==".
        list($_pattern, $_replacement) = array_pad(explode("==", $rule), 2, '');
        $_pattern = trim($_pattern);
        $_replacement = trim($_replacement);
        $_replacement = str_replace('\\n', "\n", $_replacement);

        if (strpos($_pattern, 'NEED::') !== false) {
            $need = str_replace('NEED::', '', $_pattern);
            if (strpos($content, $need) === false) {
                return false;
            }
        }
        if (strpos($_pattern, 'NOT::') !== false) {
            $not = str_replace('NOT::', '', $_pattern);
            if (strpos($content, $not) !== false) {
                return false;
            }
        }
        if (strpos($_pattern, 'LEN::') !== false) {
            $len = str_replace('LEN::', '', $_pattern);
            $len_content = preg_replace(array('/<[\\/\\!]*?[^<>]*?>/is', '/\\s*/is'), '', $content);
            if (cstrlen($len_content) < $len) {
                return false;
            }
        }
        if (strpos($_pattern, 'IMG::') !== false) {
            $img_count = str_replace('IMG::', '', $_pattern);
            preg_match_all("/<img.*?src\\s*=[\"|'](.*?)[\"|']/is", $content, $match);
            $img_array = array_unique($match[1]);
            if (count($img_array) < $img_count) {
                return false;
            }
        }

        if (strpos($_pattern, 'DOM::') !== false) {
            $doc = phpQuery::newDocumentHTML($content, 'UTF-8');
            $_pattern = str_replace('DOM::', '', $_pattern);
            list($pq_dom, $pq_fun, $pq_attr) = array_pad(explode("::", $_pattern), 3, '');

            // Start from empty lists for every rule: otherwise matches of an
            // earlier DOM rule are replaced again, and a rule matching
            // nothing would pass undefined variables to str_replace().
            $pq_pattern = array();
            $pq_replacement = array();
            foreach (phpQuery::pq($pq_dom) as $pq_key => $pq_val) {
                if ($pq_fun) {
                    if ($pq_attr) {
                        $pq_content = phpQuery::pq($pq_val)->{$pq_fun}($pq_attr);
                    } else {
                        $pq_content = phpQuery::pq($pq_val)->{$pq_fun}();
                    }
                } else {
                    $pq_content = (string) phpQuery::pq($pq_val);
                }
                $pq_pattern[$pq_key] = $pq_content;
                $pq_replacement[$pq_key] = $_replacement;
            }
            phpQuery::unloadDocuments($doc->getDocumentID());
            $content = str_replace($pq_pattern, $pq_replacement, $content);
        } else {
            if ($_pattern == '~SELF~') {
                $_pattern = $content;
            }
            if (strpos($_replacement, '~SELF~') !== false) {
                $_replacement = str_replace('~SELF~', $content, $_replacement);
            }
            if (strpos($_replacement, '~S~') !== false) {
                $_replacement = str_replace('~S~', ' ', $_replacement);
            }
            $replacement[$key] = $_replacement;
            $pattern[$key] = '|' . $this->pregTag($_pattern) . '|is';
        }
    }

    if ($pattern) {
        return preg_replace($pattern, $replacement, $content);
    }

    return $content;
}
/**
 * Crawls the list pages of a spider project/rule and processes the entries.
 *
 * The list URLs come from (later sources win) the rule's 'list_urls', the
 * project's 'urls', spiderUrls::$urls and $_urls. A line "RULE@rid@url" is
 * expanded by crawling that URL with another rule; a line containing
 * "<format,begin,num,step,zeroize,reverse>" is expanded by
 * spiderTools::mkurls(). Each list page is fetched and its entries are
 * extracted with phpQuery (rule mode "2") or with regular expressions.
 *
 * What happens with the entries depends on $work / $callback:
 *  - $callback 'CALLBACK@URL': returns the entry URLs of the first list page
 *    that could be fetched;
 *  - 'shell': publishes every entry that passes spider::checker(), echoes
 *    progress and updates the project's 'lastupdate';
 *  - 'WEB@MANUAL': collects the raw lists per URL;
 *  - 'WEB@AUTO': collects the rows to publish;
 *  - 'DATA@RULE': crawls each entry's data and records it in 'spider_url'.
 *
 * @param string|null $work     Working mode, see above.
 * @param mixed       $pid      Project ID; defaults to spider::$pid.
 * @param mixed       $_rid     Rule ID used when none is otherwise known.
 * @param string|null $_urls    Newline-separated list URLs overriding all others.
 * @param string|null $callback 'CALLBACK@URL' to only return entry URLs.
 * @return mixed Array for 'WEB@AUTO', 'DATA@RULE', 'WEB@MANUAL' and
 *               'CALLBACK@URL'; false in shell mode when the URL list is
 *               empty; null otherwise.
 */
public static function crawl($work = NULL, $pid = NULL, $_rid = NULL, $_urls = null, $callback = null)
{
    $pid === NULL && ($pid = spider::$pid);

    // Defaults for values that the original only defined on some paths.
    $project = array();
    $prule_list_url = null;
    $lastupdate = null;
    $sid = null;
    $suid = null;
    $listsArray = array();
    $contentArray = array();

    if ($pid) {
        $project = spider::project($pid);
        $cid = $project['cid'];
        $rid = $project['rid'];
        $prule_list_url = $project['list_url'];
        $lastupdate = $project['lastupdate'];
    } else {
        $cid = spider::$cid;
        $rid = spider::$rid;
    }
    if (empty($rid) && $_rid !== NULL) {
        $rid = $_rid;
    }

    if ($work == 'shell') {
        if (!empty($project['psleep'])) {
            if (time() - $lastupdate < $project['psleep']) {
                echo '采集方案[' . $pid . "]:" . format_date($lastupdate) . "刚采集过了,请" . $project['psleep'] / 3600 . "小时后在继续采集\n";
                return;
            }
        }
        // The ANSI colour codes are written with an explicit \033 escape so
        // they do not depend on a raw control byte in the source file.
        echo "\033[32m开始采集方案[" . $pid . "] 采集规则[" . $rid . "]\033[0m\n";
    }

    $ruleA = spider::rule($rid);
    $rule = $ruleA['rule'];
    $urls = $rule['list_urls'];
    !empty($project['urls']) && ($urls = $project['urls']);
    spiderUrls::$urls && ($urls = spiderUrls::$urls);
    $_urls && ($urls = $_urls);

    $urlsArray = explode("\n", $urls);
    $urlsArray = array_filter($urlsArray);
    $_urlsArray = $urlsArray;
    $urlsList = array();
    if ($work == 'shell') {
        print_r($urlsArray);
    }

    foreach ($_urlsArray as $_key => $_url) {
        $_url = htmlspecialchars_decode($_url);
        $_urlsList = array();
        /**
         * RULE@rid@url
         * The url is crawled with rule [rid] and the resulting list is used.
         */
        if (strpos($_url, 'RULE@') !== false) {
            // Limit of 3 keeps any "@" inside the URL itself intact.
            list($___s, $_rid, $_urls) = array_pad(explode('@', $_url, 3), 3, null);
            if (spider::$ruleTest) {
                print_r('<b>使用[rid:' . $_rid . ']规则抓取列表</b>:' . $_urls);
                echo "<hr />";
            }
            $_urlsList = spiderUrls::crawl($work, false, $_rid, $_urls, 'CALLBACK@URL');
            $urlsList = array_merge($urlsList, $_urlsList);
            unset($urlsArray[$_key]);
        } else {
            preg_match('|.*<(.*)>.*|is', $_url, $_matches);
            if ($_matches) {
                list($format, $begin, $num, $step, $zeroize, $reverse) = array_pad(explode(',', $_matches[1]), 6, null);
                $url = str_replace($_matches[1], '*', trim($_matches[0]));
                $_urlsList = spiderTools::mkurls($url, $format, $begin, $num, $step, $zeroize, $reverse);
                unset($urlsArray[$_key]);
                $urlsList = array_merge($urlsList, $_urlsList);
            }
        }
    }
    $urlsList && ($urlsArray = array_merge($urlsArray, $urlsList));
    unset($_urlsArray, $_key, $_url, $_matches, $_urlsList, $urlsList);
    $urlsArray = array_unique($urlsArray);

    if (empty($urlsArray)) {
        if ($work == 'shell') {
            echo "采集列表为空!请填写!\n";
            return false;
        }
        iPHP::alert('采集列表为空!请填写!', 'js:parent.window.iCMS_MODAL.destroy();');
    }

    if ($rule['mode'] == "2") {
        iPHP::import(iPHP_LIB . '/phpQuery.php');
        spider::$ruleTest && !empty($_GET['pq_debug']) && (phpQuery::$debug = 1);
    }

    $pubArray = array();
    $pubCount = array();
    $pubAllCount = array();
    spider::$curl_proxy = $rule['proxy'];
    spider::$urlslast = null;

    foreach ($urlsArray as $key => $url) {
        $url = trim($url);
        spider::$urlslast = $url;
        if ($work == 'shell') {
            echo '开始采集列表:' . $url . "\n";
        }
        if (spider::$ruleTest) {
            echo '<b>抓取列表:</b>' . $url . "<br />";
        }
        $html = spiderTools::remote($url);
        if (empty($html)) {
            continue;
        }

        if ($rule['mode'] == "2") {
            // phpQuery mode: the list area and entries are selections.
            $doc = phpQuery::newDocumentHTML($html, 'UTF-8');
            $list_area = $doc[trim($rule['list_area_rule'])];
            if ($rule['list_area_format']) {
                $list_area_format = trim($rule['list_area_format']);
                if (strpos($list_area_format, 'ARRAY::') !== false) {
                    $list_area_format = str_replace('ARRAY::', '', $list_area_format);
                    $lists = array();
                    foreach ($list_area as $la_key => $la) {
                        $lists[] = phpQuery::pq($list_area_format, $la);
                    }
                } else {
                    $lists = phpQuery::pq($list_area_format, $list_area);
                }
            } else {
                $lists = $list_area;
            }
            $list_area_shown = $list_area;
        } else {
            // Regex mode.
            $list_area_rule = spiderTools::pregTag($rule['list_area_rule']);
            if ($list_area_rule) {
                // The original passed an undefined variable ($PREG_SET_ORDER)
                // as flags; preg_match() needs none here.
                preg_match('|' . $list_area_rule . '|is', $html, $matches);
                $list_area = isset($matches['content']) ? $matches['content'] : null;
            } else {
                $list_area = $html;
            }
            $html = null;
            unset($html);
            if (spider::$ruleTest) {
                echo iS::escapeStr($rule['list_area_rule']);
                echo "<hr />";
            }
            if ($rule['list_area_format']) {
                $list_area = spiderTools::dataClean($rule['list_area_format'], $list_area);
            }
            preg_match_all('|' . spiderTools::pregTag($rule['list_url_rule']) . '|is', $list_area, $lists, PREG_SET_ORDER);
            // Keep the area only for the rule-test output below; otherwise
            // drop it right away to save memory.
            $list_area_shown = spider::$ruleTest ? $list_area : null;
            $list_area = null;
            unset($list_area);
            if ($rule['sort'] == "1") {
                //arsort($lists);
            } elseif ($rule['sort'] == "2") {
                asort($lists);
            } elseif ($rule['sort'] == "3") {
                shuffle($lists);
            }
        }

        if (spider::$ruleTest) {
            echo '<b>列表区域规则:</b>' . iS::escapeStr($rule['list_area_rule']);
            echo "<hr />";
            echo '<b>列表区域抓取结果:</b>' . iS::escapeStr($list_area_shown);
            echo "<hr />";
            echo '<b>列表链接规则:</b>' . iS::escapeStr($rule['list_url_rule']);
            echo "<hr />";
            echo '<b>网址合成规则:</b>' . iS::escapeStr($rule['list_url']);
            echo "<hr />";
        }
        unset($list_area_shown);

        if ($prule_list_url) {
            $rule['list_url'] = $prule_list_url;
        }

        // PID@xx: return the list of URLs
        if ($callback == 'CALLBACK@URL') {
            $cbListUrl = array();
            foreach ($lists as $lkey => $row) {
                list(spider::$title, spider::$url) = spiderTools::title_url($row, $rule, $url);
                if (spider::$url === false) {
                    continue;
                }
                $cbListUrl[] = spider::$url;
            }
            return $cbListUrl;
        }

        if ($work == "shell") {
            $pubCount[$url]['count'] = count($lists);
            $pubAllCount['count'] += $pubCount[$url]['count'];
            echo "开始采集:" . $url . " 列表 " . $pubCount[$url]['count'] . "条记录\n";
            foreach ($lists as $lkey => $row) {
                list(spider::$title, spider::$url) = spiderTools::title_url($row, $rule, $url);
                if (spider::$url === false) {
                    continue;
                }
                echo "title:" . spider::$title . "\n";
                echo "url:" . spider::$url . "\n";
                spider::$rid = $rid;
                $checker = spider::checker($work);
                if ($checker === true) {
                    echo "开始采集....";
                    // Kept in its own variable: the original stored this in
                    // $callback and so overwrote the method parameter.
                    $published = spider::publish("shell");
                    if ($published['code'] == "1001") {
                        $pubCount[$url]['success']++;
                        $pubAllCount['success']++;
                        echo "....√\n";
                        if (!empty($project['sleep'])) {
                            echo "sleep:" . $project['sleep'] . "s\n";
                            if ($rule['mode'] != "2") {
                                unset($lists[$lkey]);
                            }
                            gc_collect_cycles();
                            sleep($project['sleep']);
                        }
                    } else {
                        $pubCount[$url]['error']++;
                        $pubAllCount['error']++;
                        echo "error\n\n";
                        continue;
                    }
                }
                $pubCount[$url]['published']++;
                $pubAllCount['published']++;
            }
            if ($rule['mode'] == "2") {
                phpQuery::unloadDocuments($doc->getDocumentID());
            } else {
                unset($lists);
            }
        }

        if ($work == "WEB@MANUAL") {
            $listsArray[$url] = $lists;
        }

        if ($work == "WEB@AUTO" || $work == 'DATA@RULE') {
            foreach ($lists as $lkey => $row) {
                list(spider::$title, spider::$url) = spiderTools::title_url($row, $rule, $url);
                if (spider::$url === false) {
                    continue;
                }
                $hash = md5(spider::$url);
                if (spider::$ruleTest) {
                    echo '<b>列表抓取结果:</b>' . $lkey . '<br />';
                    echo spider::$title . ' (<a href="' . APP_URI . '&do=testdata' . '&url=' . urlencode(spider::$url) . '&rid=' . $rid . '&pid=' . $pid . '&title=' . urlencode(spider::$title) . '" target="_blank">测试内容规则</a>) <br />';
                    echo spider::$url . "<br />";
                    echo $hash . "<br /><hr />";
                } else {
                    if (spider::checker($work) === true || spider::$dataTest) {
                        $suData = array('sid' => 0, 'url' => spider::$url, 'title' => spider::$title, 'cid' => $cid, 'rid' => $rid, 'pid' => $pid, 'hash' => $hash);
                        switch ($work) {
                            case 'DATA@RULE':
                                $contentArray[$lkey] = spiderData::crawl();
                                unset($suData['sid']);
                                $suData['title'] = addslashes($suData['title']);
                                $suData += array('addtime' => time(), 'status' => '2', 'publish' => '2', 'indexid' => '0', 'pubdate' => '0');
                                // In data-test mode nothing is written to the database.
                                if (!spider::$dataTest) {
                                    $suid = iDB::insert('spider_url', $suData);
                                }
                                $contentArray[$lkey]['spider_url'] = $suid;
                                break;
                            case 'WEB@AUTO':
                                $pubArray[] = $suData;
                                break;
                        }
                    }
                }
            }
        }
    }

    $lists = null;
    unset($lists);
    gc_collect_cycles();

    switch ($work) {
        case 'WEB@AUTO':
            return $pubArray;
        case 'DATA@RULE':
            return $contentArray;
        case 'WEB@MANUAL':
            // NOTE(review): $sid is never assigned in this method, so 'sid'
            // is always null here, as it effectively was before — confirm.
            return array('cid' => $cid, 'rid' => $rid, 'pid' => $pid, 'sid' => $sid, 'work' => $work, 'rule' => $rule, 'listsArray' => $listsArray);
        case "shell":
            echo "采集数据统结果:\n";
            print_r($pubCount);
            print_r($pubAllCount);
            echo "全部采集完成....\n";
            iDB::update('spider_project', array('lastupdate' => time()), array('id' => $pid));
            break;
    }
}
/**
 * Shows the thread list scraped from the configured Baidu Tieba forum.
 *
 * The result is served from the 'ADDONS_SnatchTieba' cache when present.
 * Otherwise the forum page is fetched and parsed: when
 * $config['tieba_jinghua'] is 1, pinned threads (".thread_top") are listed
 * separately and excluded from the regular list (".j_thread_list"). The
 * combined list is limited to between 1 and 8 entries
 * ($config['tieba_nums']) and cached for $config['tieba_cache'] when that
 * value is positive.
 */
public function index()
{
    // load this add-on's configuration
    $config = Amango_Addons_Config();
    // reuse the cached result when there is one
    $article = S('ADDONS_SnatchTieba');

    if (empty($article)) {
        Amango_Addons_Import('phpQuery/phpQuery.php');
        \phpQuery::$defaultCharset = 'GBK';
        \phpQuery::newDocumentFile('http://tieba.baidu.com/f?kw=' . urlencode($config['tieba_name']) . '&fr=ala0');

        $withTop = $config['tieba_jinghua'] == 1;
        $articlecontent = array('top' => array(), 'other' => array());
        $toptitle = array();

        // Pinned threads are read first so that their titles are known when
        // the regular list is filtered; the original filled $toptitle only
        // after it had already been consulted.
        if ($withTop) {
            foreach (\pq(".thread_top") as $li) {
                // reply count
                $tz_commont = iconv('GBK', 'UTF-8', \pq($li)->find('.threadlist_rep_num')->html());
                // title
                $tz_title = iconv('GBK', 'UTF-8', \pq($li)->find('a.j_th_tit')->html());
                // link
                $tz_link = 'http://tieba.baidu.com' . iconv('GBK', 'UTF-8', \pq($li)->find('a.j_th_tit')->attr('href'));
                // picture of this thread (the original reused whatever the
                // previous loop had left in $tz_pic)
                $tz_pic = iconv('GBK', 'UTF-8', \pq($li)->find('img')->attr('original'));

                $toptitle[] = $tz_title;
                $articlecontent['top'][] = array(
                    'Title' => "[" . $tz_commont . "]" . $tz_title,
                    'Description' => '',
                    'PicUrl' => empty($tz_pic) ? '' : $tz_pic,
                    'Url' => $tz_link,
                );
            }
        }

        foreach (\pq(".j_thread_list") as $li) {
            // reply count
            $tz_commont = iconv('GBK', 'UTF-8', \pq($li)->find('.threadlist_rep_num')->html());
            // title
            $tz_title = iconv('GBK', 'UTF-8', \pq($li)->find('a.j_th_tit')->html());
            // content; whitespace is removed ('/\s/' — the original pattern
            // '/s/' deleted the letter "s" instead)
            $tz_content = iconv('GBK', 'UTF-8', \pq($li)->find('.threadlist_abs_onlyline')->html());
            $tz_content = preg_replace('/\s/', '', $tz_content);
            $tz_content = str_replace('<!---->', '', $tz_content);
            // link
            $tz_link = 'http://tieba.baidu.com' . iconv('GBK', 'UTF-8', \pq($li)->find('a.j_th_tit')->attr('href'));
            // author
            $tz_author = strip_tags(iconv('GBK', 'UTF-8', \pq($li)->find('span.tb_icon_author a')->html()));
            $tz_author = preg_replace('/\s/', '', $tz_author);
            // last replier
            $tz_reply = iconv('GBK', 'UTF-8', \pq($li)->find('span.tb_icon_author_rely a')->html());
            // last reply time
            $tz_replytime = \pq($li)->find('span.j_reply_data')->text();
            $tz_replytime = preg_replace('/\s/', '', $tz_replytime);
            // picture
            $tz_pic = iconv('GBK', 'UTF-8', \pq($li)->find('img')->attr('original'));

            if (!in_array($tz_title, $toptitle)) {
                $articlecontent['other'][] = array(
                    'Title' => 1 == $config['tieba_extra'] ? "[" . $tz_commont . "]" . $tz_title . "\n" . $tz_content . "\n作者:" . $tz_author . "|回复:" . $tz_reply . "-" . $tz_replytime : $tz_title . "\n" . $tz_content,
                    'Description' => '',
                    'PicUrl' => empty($tz_pic) ? '' : $tz_pic,
                    'Url' => $tz_link,
                );
            }
        }

        // clamp the number of entries to 1..8
        $allownums = $config['tieba_nums'] > 8 ? 8 : $config['tieba_nums'];
        $allownums = $allownums >= 1 ? $allownums : 1;

        if ($withTop) {
            $article = self::havejinghua($articlecontent['top'], $articlecontent['other'], $allownums);
        } else {
            $article = self::deljinghua($articlecontent['other'], $allownums);
        }

        \phpQuery::unloadDocuments();

        if ($config['tieba_cache'] > 0 && !empty($article)) {
            S('ADDONS_SnatchTieba', $article, $config['tieba_cache']);
        }
    }

    $this->assign('Duotw', $article);
    $this->display();
}