unloadDocuments() public static method

Unloads all documents, or only the specified document, from memory.
public static unloadDocuments ( $id = null )
Esempio n. 1
0
 /**
  * Returns the message body, building it from the message parts on first use.
  *
  * Every part is fetched over IMAP, decoded with BrIMAP::decode(), converted
  * to UTF-8 when the part declares a charset, and stripped of
  * <head>, <meta>, <base> and <style> markup. For HTML messages the part is
  * additionally reduced to the inner HTML of its <body> element (or of the
  * whole document when there is no <body>). The concatenated result is
  * cached in $this->body.
  *
  * @return string|null The value of $this->body after any lazy loading.
  */
 function getBody()
 {
     if ($this->parts && $this->body === null) {
         foreach ($this->parts as $part) {
             $partNo = $part['partNo'];
             $encoding = $part['encoding'];
             $charset = $part['charset'];
             $body = imap_fetchbody($this->message->getMailbox(), $this->message->getUID(), $partNo, FT_UID);
             $body = BrIMAP::decode($body, $encoding);
             if ($charset) {
                 // iconv() returns false for an unknown charset or an illegal
                 // byte sequence; keep the decoded text rather than dropping
                 // the whole part.
                 $converted = @iconv($charset, 'UTF-8', $body);
                 if ($converted !== false) {
                     $body = $converted;
                 }
             }
             $body = trim($body);
             $body = preg_replace('~<head[^>]*?>.*?</head>~ism', '', $body);
             $body = preg_replace('~<meta[^>]*?>~ism', '', $body);
             $body = preg_replace('~<base[^>]*?>~ism', '', $body);
             $body = preg_replace('~<style[^>]*?>.*?</style>~ism', '', $body);
             if ($this->isHTML && $body) {
                 try {
                     $doc = phpQuery::newDocument($body);
                     $bodyTag = $doc->find('body');
                     if ($bodyTag->length() > 0) {
                         $body = trim(pq($bodyTag)->html());
                     } else {
                         $body = trim($doc->html());
                     }
                 } catch (Exception $e) {
                     // Best effort: when phpQuery cannot parse the markup the
                     // regex-cleaned text above is used as is.
                 }
                 // Release the parsed documents whether or not parsing
                 // succeeded, so a failing part does not leave them loaded.
                 phpQuery::unloadDocuments();
             }
             $this->body .= $body;
         }
     }
     return $this->body;
 }
Esempio n. 2
0
 /**
  * Repairs the given markup with Tidy and loads it as a new phpQuery document.
  *
  * Every document phpQuery currently holds is unloaded before the new one is
  * created.
  *
  * @param string $html Raw markup to repair and parse.
  * @return mixed The value returned by phpQuery::newDocumentHTML() for the
  *               repaired markup.
  */
 function load_html($html)
 {
     $tidyDoc = tidy_parse_string($html);
     tidy_clean_repair($tidyDoc);
     $repaired = tidy_get_html($tidyDoc);

     // Start from a clean slate: drop all previously loaded documents.
     phpQuery::unloadDocuments();

     return phpQuery::newDocumentHTML($repaired);
 }
Esempio n. 3
0
 /**
  * Parses an HTML string and extracts a value from it as described by $conf.
  *
  * The markup is loaded as a document, selected as the current document,
  * handed to self::queryValue() and unloaded again before returning.
  *
  * @param $conf ConfNode
  * @param $html
  * @return array|float|int|String
  */
 public static function parseHtml($conf, $html)
 {
     query::$defaultCharset = "utf-8";

     $document = query::newDocument($html);
     query::selectDocument($document);
     $extracted = self::queryValue(pq($document), $conf);

     // Free the memory held by the parsed document.
     query::unloadDocuments($document);

     return $extracted;
 }
Esempio n. 4
0
 /**
  * Downloads http://dmtoys.com.ua and collects the src attribute of every
  * <img> element on the page.
  *
  * @return array List of arrays of the form ['href' => <src attribute>];
  *               empty when the page could not be downloaded.
  */
 public function parseTest()
 {
     $html = file_get_contents('http://dmtoys.com.ua');
     if ($html === false) {
         // The download failed, so there is no markup to parse.
         return [];
     }
     $results = phpQuery::newDocument($html);
     $images = pq($results)['img'];
     $dd = [];
     foreach ($images as $image) {
         $dd[] = ['href' => pq($image)->attr('src')];
     }
     phpQuery::unloadDocuments();
     return $dd;
 }
Esempio n. 5
0
 /**
  * Callback for a fetched page: echoes the parsed song titles and queues the
  * singer image for download.
  *
  * @param array $r     Fetch result; 'info' is passed to hasHttpError() and
  *                     'content' is parsed as HTML.
  * @param array $param Task arguments; 'artistName' is used as the image file
  *                     name and the whole array is forwarded to cb3 with
  *                     'imgFile' added.
  */
 function cb2($r, $param)
 {
     if (!$this->hasHttpError($r['info'])) {
         $html = phpQuery::newDocumentHTML($r['content']);
         $list = $html['#song-list td.songTitle a'];
         foreach ($list as $v) {
             $v = pq($v);
             echo "cb2:\t" . $v->text() . "\n";
         }
         $imgUrl = $html['div.sidebar dl.singerInfo img']->attr('src');
         // Only queue the image when the page actually has one; otherwise the
         // task would carry an empty URL and a file name ending in ".".
         if (!empty($imgUrl)) {
             $imgFile = $this->imgDir . '/' . $param['artistName'] . '.' . pathinfo($imgUrl, PATHINFO_EXTENSION);
             $this->getCurl()->add(array('url' => $imgUrl, 'file' => $imgFile, 'ctl' => array('type' => 'img'), 'args' => array_merge($param, array('imgFile' => $imgFile))), array($this, 'cb3'));
         }
         phpQuery::unloadDocuments();
     }
 }
Esempio n. 6
0
 /**
  * Unloads the whole document from memory.
  * CAUTION! No further operations will be possible on this document.
  * All objects referring to it will be useless.
  *
  * @return void
  */
 public function unloadDocument()
 {
     $documentId = $this->getDocumentID();
     phpQuery::unloadDocuments($documentId);
 }
Esempio n. 7
0
 /**
  * Extracts content from $html using either a phpQuery (DOM) rule or a
  * regular-expression rule.
  *
  * DOM rules are either "selector@attribute" or up to three lines
  * "selector\nmethod\nargument"; the method defaults to html(). Regex rules
  * are passed through spiderTools::pregTag() and the value is read from the
  * 'content' key of the match (presumably a named group produced by
  * pregTag() -- confirm against that helper).
  *
  * @param string $html HTML to search.
  * @param array  $data Rule settings: 'rule' (rule text), 'dom' (truthy to
  *                     use phpQuery) and 'multi' (truthy to collect every
  *                     match).
  * @param mixed  $rule Not read by this method.
  * @return mixed In multi mode, the matches joined with
  *               '#--iCMS.PageBreak--#'; otherwise the single extracted
  *               value, or null when a regex rule does not match.
  */
 public static function match($html, $data, $rule)
 {
     $match_hash = array();
     if ($data['dom']) {
         iPHP::import(iPHP_LIB . '/phpQuery.php');
         spider::$dataTest && !empty($_GET['pq_debug']) && (phpQuery::$debug = 1);
         $doc = phpQuery::newDocumentHTML($html, 'UTF-8');
         if (strpos($data['rule'], '@') !== false) {
             list($content_dom, $content_attr) = explode("@", $data['rule']);
             $content_fun = 'attr';
         } else {
             // The method and argument lines are optional; pad so the
             // destructuring never reads a missing offset.
             list($content_dom, $content_fun, $content_attr) = array_pad(explode("\n", $data['rule']), 3, '');
         }
         $content_dom = trim($content_dom);
         $content_fun = trim($content_fun);
         $content_attr = trim($content_attr);
         $content_fun or $content_fun = 'html';
         if ($data['multi']) {
             $conArray = array();
             $_content = null;
             foreach ($doc[$content_dom] as $doc_key => $doc_value) {
                 if ($content_attr) {
                     $_content = phpQuery::pq($doc_value)->{$content_fun}($content_attr);
                 } else {
                     $_content = phpQuery::pq($doc_value)->{$content_fun}();
                 }
                 $cmd5 = md5($_content);
                 // Stop at the first value that has already been collected.
                 if (isset($match_hash[$cmd5])) {
                     break;
                 }
                 $conArray[$doc_key] = $_content;
                 $match_hash[$cmd5] = true;
             }
             if (spider::$dataTest) {
                 echo "<b>多条匹配结果:</b><pre>";
                 print_r($match_hash);
                 echo "</pre><hr />";
             }
             $content = implode('#--iCMS.PageBreak--#', $conArray);
             unset($conArray, $_content, $match_hash);
         } else {
             if ($content_attr) {
                 $content = $doc[$content_dom]->{$content_fun}($content_attr);
             } else {
                 $content = $doc[$content_dom]->{$content_fun}();
             }
         }
         phpQuery::unloadDocuments($doc->getDocumentID());
         unset($doc);
     } else {
         if (trim($data['rule']) == '<%content%>') {
             $content = $html;
         } else {
             $data_rule = spiderTools::pregTag($data['rule']);
             if (preg_match('/(<\\w+>|\\.\\*|\\.\\+|\\\\d|\\\\w)/i', $data_rule)) {
                 if ($data['multi']) {
                     preg_match_all('|' . $data_rule . '|is', $html, $matches, PREG_SET_ORDER);
                     $conArray = array();
                     foreach ((array) $matches as $mkey => $mat) {
                         $cmd5 = md5($mat['content']);
                         // Stop at the first value that has already been collected.
                         if (isset($match_hash[$cmd5])) {
                             break;
                         }
                         $conArray[$mkey] = $mat['content'];
                         $match_hash[$cmd5] = true;
                     }
                     if (spider::$dataTest) {
                         echo "<b>多条匹配结果:</b><pre>";
                         print_r($match_hash);
                         echo "</pre><hr />";
                     }
                     $content = implode('#--iCMS.PageBreak--#', $conArray);
                     unset($conArray, $match_hash);
                 } else {
                     // preg_match() takes no PREG_SET_ORDER flag; the original
                     // passed an undefined variable here.
                     preg_match('|' . $data_rule . '|is', $html, $matches);
                     $content = isset($matches['content']) ? $matches['content'] : null;
                 }
             } else {
                 // The rule contains no pattern syntax: use it as the content.
                 $content = $data_rule;
             }
         }
     }
     return $content;
 }
Esempio n. 8
0
 /**
  * Download/HTML callback: rewrites the stylesheet, script, image and anchor
  * URLs of a fetched page, queues the referenced URLs for fetching and, when
  * a target file is given, writes the rewritten page to it.
  *
  * @param array $r    Fetch result; 'info' (including 'url') and 'content'
  *                    are read.
  * @param array $args Task arguments; when 'file' is set the rewritten HTML
  *                    is written to that path.
  * @return void
  */
 function cbProcess($r, $args)
 {
     if (!$this->hasHttpError($r['info'])) {
         if (isset($r['content'])) {
             $urlCurrent = $r['info']['url'];
             $pq = phpQuery::newDocumentHTML($r['content']);
             $urlDownload = array();
             // css
             $list = $pq['link[type$=css]'];
             foreach ($list as $v) {
                 $v = pq($v);
                 $url = $this->uri2url($v->attr('href'), $urlCurrent);
                 $v->attr('href', $this->url2uri($url, $urlCurrent));
                 $urlDownload[] = $url;
             }
             // script
             $script = $pq['script[type$=script]'];
             foreach ($script as $v) {
                 $v = pq($v);
                 // Inline scripts have no src: there is nothing to download,
                 // and giving them a src attribute would alter the page.
                 if (null == $v->attr('src')) {
                     continue;
                 }
                 $url = $this->uri2url($v->attr('src'), $urlCurrent);
                 $v->attr('src', $this->url2uri($url, $urlCurrent));
                 $urlDownload[] = $url;
             }
             // pic
             $pic = $pq['img'];
             if ($this->downloadPic) {
                 foreach ($pic as $v) {
                     $v = pq($v);
                     $url = $this->uri2url($v->attr('src'), $urlCurrent);
                     $v->attr('src', $this->url2uri($url, $urlCurrent));
                     $urlDownload[] = $url;
                 }
             } else {
                 // Images are not downloaded: point them at their resolved URL.
                 foreach ($pic as $v) {
                     $v = pq($v);
                     $v->attr('src', $this->uri2url($v->attr('src'), $urlCurrent));
                 }
             }
             // html
             $a = $pq['a'];
             $urlHtml = array();
             foreach ($a as $v) {
                 $v = pq($v);
                 $url = $this->uri2url($v->attr('href'), $urlCurrent);
                 // Only links below the current page's directory are followed.
                 if (0 === strpos($url, $this->urlDir($urlCurrent))) {
                     $v->attr('href', $this->url2uri($url, $urlCurrent));
                     $urlHtml[] = $url;
                 }
             }
             $r['content'] = $pq->html();
             // add
             $queues = array(
                 'urlDownload' => array_unique($urlDownload),
                 'urlHtml' => array_unique($urlHtml),
             );
             foreach ($queues as $kind => $urls) {
                 foreach ($urls as $v1) {
                     if (!in_array($v1, $this->urlAdded)) {
                         $file = $this->getFile($v1);
                         if (null == $file && $kind == 'urlDownload') {
                             continue;
                         }
                         $item = array('url' => $v1, 'file' => $file, 'args' => array('url' => $v1, 'file' => $file));
                         if ($kind == 'urlDownload') {
                             unset($item['args']['file']);
                         } else {
                             unset($item['file']);
                         }
                         $this->getCurl()->add($item, array($this, 'cbProcess'));
                         $this->urlAdded[] = $v1;
                     }
                 }
             }
             // file_put_contents() returns the byte count, so only a strict
             // false is a failure (an empty page legitimately writes 0 bytes).
             if (isset($args['file']) && false === file_put_contents($args['file'], $r['content'], LOCK_EX)) {
                 user_error('write file failed, file=' . $args['file'], E_USER_WARNING);
             }
             phpQuery::unloadDocuments();
         }
     }
 }
Esempio n. 9
0
 /**
  * Releases the phpQuery document behind $this->parser when the browser's
  * parser type is 'phpquery'.
  */
 function __destruct()
 {
     if ($this->browser->parserType != 'phpquery') {
         return;
     }

     phpQuery::unloadDocuments(phpQuery::getDocumentID($this->parser));
 }
Esempio n. 10
0
 /**
  * Download/HTML callback.
  *
  * For textual responses the page is parsed, its stylesheet, script, image,
  * XML-link and anchor URLs are rewritten, and the rewritten page is written
  * to $args['file'] when that is set. For downloaded CSS files the
  * referenced @import and url(...) resources are collected. All collected
  * URLs that have not been queued before are then added to the fetch queue
  * with this method as their callback.
  *
  * @param array $r    Fetch result; 'info' ('content_type', 'url') and
  *                    'content' are read.
  * @param mixed $args Task arguments; 'url', 'file', 'type' and 'isDownload'
  *                    are read.
  *
  */
 function cbProcess($r, $args)
 {
     if (!$this->hasHttpError($r['info'])) {
         $urlDownload = array();
         $urlParse = array();
         if (isset($r['content']) && 0 === strpos($r['info']['content_type'], 'text')) {
             $urlCurrent = $args['url'];
             $pq = phpQuery::newDocumentHTML($r['content']);
             // css
             $list = $pq['link[type$=css]'];
             foreach ($list as $v) {
                 $v = pq($v);
                 $url = $this->uri2url($v->attr('href'), $urlCurrent);
                 $v->attr('href', $this->cloneUrl2uri($url, $urlCurrent));
                 $urlDownload[$url] = array('type' => 'css');
             }
             // script
             $script = $pq['script[type$=script]'];
             foreach ($script as $v) {
                 $v = pq($v);
                 if (null != $v->attr('src')) {
                     $url = $this->uri2url($v->attr('src'), $urlCurrent);
                     $v->attr('src', $this->cloneUrl2uri($url, $urlCurrent));
                     $urlDownload[$url] = array();
                 }
             }
             // pic
             $pic = $pq['img'];
             if ($this->download['pic']['enable']) {
                 foreach ($pic as $v) {
                     $v = pq($v);
                     $url = $this->uri2url($v->attr('src'), $urlCurrent);
                     $v->attr('src', $this->cloneUrl2uri($url, $urlCurrent));
                     $urlDownload[$url] = array();
                 }
             } else {
                 // Images are not downloaded: point them at their resolved URL.
                 foreach ($pic as $v) {
                     $v = pq($v);
                     $v->attr('src', $this->uri2url($v->attr('src'), $urlCurrent));
                 }
             }
             // link xml
             $list = $pq['link[type$=xml]'];
             foreach ($list as $v) {
                 $v = pq($v);
                 $url = $this->uri2url($v->attr('href'), $urlCurrent);
                 if ($this->isProcess($url)) {
                     $v->attr('href', $this->cloneUrl2uri($url, $urlCurrent));
                     $urlDownload[$url] = array();
                 }
             }
             // href
             $a = $pq['a'];
             foreach ($a as $v) {
                 $v = pq($v);
                 $href = $v->attr('href');
                 $url = $this->uri2url($href, $urlCurrent);
                 if ($this->download['zip']['enable'] && '.zip' == substr($href, -4)) {
                     // Zip archives are downloaded rather than parsed.
                     if ($this->download['zip']['withPrefix']) {
                         $isProcess = $this->isProcess($url);
                     } else {
                         $isProcess = true;
                     }
                     if ($isProcess) {
                         $urlDownload[$url] = array();
                     }
                 } else {
                     $isProcess = $this->isProcess($url);
                     if ($isProcess) {
                         $urlParse[$url] = array();
                     }
                 }
                 if ($isProcess) {
                     $v->attr('href', $this->cloneUrl2uri($url, $urlCurrent));
                 } else {
                     $v->attr('href', $url);
                 }
             }
             $r['content'] = $pq->html();
             if (isset($args['file']) && false === file_put_contents($args['file'], $r['content'], LOCK_EX)) {
                 user_error('write file failed, file=' . $args['file'], E_USER_WARNING);
             }
             phpQuery::unloadDocuments();
         } elseif (!empty($args['isDownload'])) {
             if (isset($args['type']) && 'css' == $args['type']) {
                 $content = file_get_contents($args['file']);
                 $uri = array();
                 // import
                 preg_match_all('/@import\\s+url\\s*\\((.+)\\);/iU', $content, $matches);
                 if (!empty($matches[1])) {
                     $uri = array_merge($uri, $matches[1]);
                 }
                 // url in css
                 preg_match_all('/:\\s*url\\((\'|")?(.+?)\\1?\\)/i', $content, $matches);
                 if (!empty($matches[2])) {
                     $uri = array_merge($uri, $matches[2]);
                 }
                 foreach ($uri as $v) {
                     // The @import pattern captures everything between the
                     // parentheses, including any quotes around the URI;
                     // strip them so they do not end up inside the URL.
                     $v = trim($v, " \t\n\r'\"");
                     if ('' === $v) {
                         continue;
                     }
                     $urlDownload[$this->urlDir($r['info']['url']) . $v] = array('type' => 'css');
                 }
             }
         }
         // add
         $queues = array('urlDownload' => $urlDownload, 'urlParse' => $urlParse);
         foreach ($queues as $kind => $urls) {
             foreach ($urls as $k1 => $v1) {
                 if (!in_array($k1, $this->urlAdded)) {
                     $file = $this->url2file($k1);
                     if (null == $file) {
                         continue;
                     }
                     $type = null;
                     if (isset($v1['type'])) {
                         $type = $v1['type'];
                     }
                     $item = array('url' => $k1, 'file' => $file, 'args' => array('url' => $k1, 'file' => $file, 'type' => $type, 'isDownload' => $kind == 'urlDownload'));
                     if ($kind == 'urlParse') {
                         // Pages to parse carry no 'file' on the task itself;
                         // the target path is kept in 'args' only.
                         unset($item['file']);
                     }
                     $this->getCurl()->add($item, array($this, 'cbProcess'));
                     $this->urlAdded[] = $k1;
                 }
             }
         }
     }
 }
Esempio n. 11
0
        if (!$hook&&stripos($data->text(), $target)) {
            $local_result['rd'] = 'rd';
            $hook = true;
        }
        echo'<pre>';
        var_dump($data->text());
        echo'<pre>';
        if (count($data->find('noindex'))){
            foreach (pq('noindex') as $noindex) {
                if (stripos(pq($noindex)->text(), $target))$local_result['nix'] = 'ni';
            }
        }
        ($local_result['rd'] != 'rd'&&$local_result['nix'] != 'ni'&&$local_result['nfl'] != 'nf')?$local_result['clear'] = true:'';
        $hook?$local_result['live'] = true:$local_result['live'] = false;
        unset($data);
        phpQuery::unloadDocuments();
    }
    else {
        $local_result['live'] = 'handed';
    }
    return $local_result;
};

$multi = new dHttp\Client();
$used_links = array();
foreach ($params as $url) {
    $url = trim($url);
    if(!in_array($url,$used_links)) {
        $resp_once[] = new dHttp\Client($url, array(
            CURLOPT_SSL_VERIFYPEER => FALSE,
            CURLOPT_HEADER => TRUE,
Esempio n. 12
0
 /**
  * Validates content against the configured "right" and "error" markers.
  *
  * spider::$content_right_code must be found in the content and
  * spider::$content_error_code must not. A marker containing 'DOM::' is
  * treated as a phpQuery selector (matched when the selection renders to a
  * non-empty string); any other marker is searched for as a plain substring.
  *
  * @param string $content Content to check.
  * @return bool False when the required marker is missing or the error
  *              marker is present; true otherwise.
  */
 public static function check_content_code($content)
 {
     if (spider::$content_right_code) {
         if (strpos(spider::$content_right_code, 'DOM::') !== false) {
             iPHP::import(iPHP_LIB . '/phpQuery.php');
             $doc = phpQuery::newDocumentHTML($content, 'UTF-8');
             $pq_dom = str_replace('DOM::', '', spider::$content_right_code);
             $matches = (bool) (string) phpQuery::pq($pq_dom);
             phpQuery::unloadDocuments($doc->getDocumentID());
             unset($doc);
         } else {
             $matches = strpos($content, spider::$content_right_code);
         }
         if ($matches === false) {
             return false;
         }
     }
     // $content must still be available here: the original unset it after
     // the first check, so the error-marker check below ran on an undefined
     // variable whenever both markers were configured.
     if (spider::$content_error_code) {
         if (strpos(spider::$content_error_code, 'DOM::') !== false) {
             iPHP::import(iPHP_LIB . '/phpQuery.php');
             $doc = phpQuery::newDocumentHTML($content, 'UTF-8');
             $pq_dom = str_replace('DOM::', '', spider::$content_error_code);
             $_matches = (bool) (string) phpQuery::pq($pq_dom);
             phpQuery::unloadDocuments($doc->getDocumentID());
             unset($doc);
         } else {
             $_matches = strpos($content, spider::$content_error_code);
         }
         if ($_matches !== false) {
             return false;
         }
     }
     return true;
 }
Esempio n. 13
0
 /**
  * Applies a newline-separated list of cleaning rules to $content.
  *
  * Each rule has the form "pattern==replacement". Special forms handled:
  *  - a rule containing <%SELF%>: the content becomes the rule text with
  *    <%SELF%> replaced by the current content;
  *  - NEED::text / NOT::text: return false when text is missing / present;
  *  - LEN::n: return false when the content, with tags and whitespace
  *    removed, is shorter than n (measured with cstrlen());
  *  - IMG::n: return false when fewer than n distinct <img> sources exist;
  *  - DOM::selector[::method[::argument]]: the text obtained from every
  *    matched element is replaced by the replacement immediately;
  *  - any other pattern is converted with pregTag() and applied as a regular
  *    expression after all rules have been read. In the pattern, ~SELF~
  *    stands for the whole content; in the replacement, ~SELF~ is the
  *    content and ~S~ a space.
  *
  * @param string $rules   Rule list, one rule per line.
  * @param string $content Content to clean.
  * @return mixed The cleaned content, or false when a NEED/NOT/LEN/IMG
  *               check fails.
  */
 function dataClean($rules, $content)
 {
     iPHP::import(iPHP_LIB . '/phpQuery.php');
     // Regex rules are collected here and applied in one pass at the end.
     $pattern = array();
     $replacement = array();
     $ruleArray = explode("\n", $rules);
     foreach ($ruleArray as $key => $rule) {
         $rule = trim($rule);
         if (strpos($rule, '<%SELF%>') !== false) {
             $content = str_replace('<%SELF%>', $content, $rule);
             continue;
         }
         // A rule may omit "==replacement"; pad so the destructuring never
         // reads a missing offset.
         list($_pattern, $_replacement) = array_pad(explode("==", $rule), 2, '');
         $_pattern = trim($_pattern);
         $_replacement = trim($_replacement);
         $_replacement = str_replace('\\n', "\n", $_replacement);
         if (strpos($_pattern, 'NEED::') !== false) {
             $need = str_replace('NEED::', '', $_pattern);
             if (strpos($content, $need) === false) {
                 return false;
             }
         }
         if (strpos($_pattern, 'NOT::') !== false) {
             $not = str_replace('NOT::', '', $_pattern);
             if (strpos($content, $not) !== false) {
                 return false;
             }
         }
         if (strpos($_pattern, 'LEN::') !== false) {
             $len = str_replace('LEN::', '', $_pattern);
             $len_content = preg_replace(array('/<[\\/\\!]*?[^<>]*?>/is', '/\\s*/is'), '', $content);
             if (cstrlen($len_content) < $len) {
                 return false;
             }
         }
         if (strpos($_pattern, 'IMG::') !== false) {
             $img_count = str_replace('IMG::', '', $_pattern);
             preg_match_all("/<img.*?src\\s*=[\"|'](.*?)[\"|']/is", $content, $match);
             $img_array = array_unique($match[1]);
             if (count($img_array) < $img_count) {
                 return false;
             }
         }
         if (strpos($_pattern, 'DOM::') !== false) {
             $doc = phpQuery::newDocumentHTML($content, 'UTF-8');
             $_pattern = str_replace('DOM::', '', $_pattern);
             list($pq_dom, $pq_fun, $pq_attr) = array_pad(explode("::", $_pattern), 3, '');
             // Start every DOM rule with empty lists so a selector that
             // matches nothing is a no-op and earlier rules do not leak in.
             $pq_pattern = array();
             $pq_replacement = array();
             $pq_array = phpQuery::pq($pq_dom);
             foreach ($pq_array as $pq_key => $pq_val) {
                 if ($pq_fun) {
                     if ($pq_attr) {
                         $pq_content = phpQuery::pq($pq_val)->{$pq_fun}($pq_attr);
                     } else {
                         $pq_content = phpQuery::pq($pq_val)->{$pq_fun}();
                     }
                 } else {
                     $pq_content = (string) phpQuery::pq($pq_val);
                 }
                 $pq_pattern[$pq_key] = $pq_content;
                 $pq_replacement[$pq_key] = $_replacement;
             }
             phpQuery::unloadDocuments($doc->getDocumentID());
             $content = str_replace($pq_pattern, $pq_replacement, $content);
         } else {
             if ($_pattern == '~SELF~') {
                 $_pattern = $content;
             }
             if (strpos($_replacement, '~SELF~') !== false) {
                 $_replacement = str_replace('~SELF~', $content, $_replacement);
             }
             if (strpos($_replacement, '~S~') !== false) {
                 $_replacement = str_replace('~S~', ' ', $_replacement);
             }
             $replacement[$key] = $_replacement;
             $pattern[$key] = '|' . $this->pregTag($_pattern) . '|is';
         }
     }
     if ($pattern) {
         return preg_replace($pattern, $replacement, $content);
     } else {
         return $content;
     }
 }
Esempio n. 14
0
 public static function crawl($work = NULL, $pid = NULL, $_rid = NULL, $_urls = null, $callback = null)
 {
     $pid === NULL && ($pid = spider::$pid);
     if ($pid) {
         $project = spider::project($pid);
         $cid = $project['cid'];
         $rid = $project['rid'];
         $prule_list_url = $project['list_url'];
         $lastupdate = $project['lastupdate'];
     } else {
         $cid = spider::$cid;
         $rid = spider::$rid;
     }
     if (empty($rid) && $_rid !== NULL) {
         $rid = $_rid;
     }
     if ($work == 'shell') {
         $lastupdate = $project['lastupdate'];
         if ($project['psleep']) {
             if (time() - $lastupdate < $project['psleep']) {
                 echo '采集方案[' . $pid . "]:" . format_date($lastupdate) . "刚采集过了,请" . $project['psleep'] / 3600 . "小时后在继续采集\n";
                 return;
             }
         }
         echo "开始采集方案[" . $pid . "] 采集规则[" . $rid . "]\n";
     }
     $ruleA = spider::rule($rid);
     $rule = $ruleA['rule'];
     $urls = $rule['list_urls'];
     $project['urls'] && ($urls = $project['urls']);
     spiderUrls::$urls && ($urls = spiderUrls::$urls);
     $_urls && ($urls = $_urls);
     $urlsArray = explode("\n", $urls);
     $urlsArray = array_filter($urlsArray);
     $_urlsArray = $urlsArray;
     $urlsList = array();
     if ($work == 'shell') {
         // echo "$urls\n";
         print_r($urlsArray);
     }
     foreach ($_urlsArray as $_key => $_url) {
         $_url = htmlspecialchars_decode($_url);
         $_urlsList = array();
         /**
          * RULE@rid@url
          * url使用[rid]规则采集并返回列表结果
          */
         if (strpos($_url, 'RULE@') !== false) {
             list($___s, $_rid, $_urls) = explode('@', $_url);
             if (spider::$ruleTest) {
                 print_r('<b>使用[rid:' . $_rid . ']规则抓取列表</b>:' . $_urls);
                 echo "<hr />";
             }
             $_urlsList = spiderUrls::crawl($work, false, $_rid, $_urls, 'CALLBACK@URL');
             $urlsList = array_merge($urlsList, $_urlsList);
             unset($urlsArray[$_key]);
         } else {
             preg_match('|.*<(.*)>.*|is', $_url, $_matches);
             if ($_matches) {
                 list($format, $begin, $num, $step, $zeroize, $reverse) = explode(',', $_matches[1]);
                 $url = str_replace($_matches[1], '*', trim($_matches[0]));
                 $_urlsList = spiderTools::mkurls($url, $format, $begin, $num, $step, $zeroize, $reverse);
                 unset($urlsArray[$_key]);
                 $urlsList = array_merge($urlsList, $_urlsList);
             }
         }
     }
     $urlsList && ($urlsArray = array_merge($urlsArray, $urlsList));
     unset($_urlsArray, $_key, $_url, $_matches, $_urlsList, $urlsList);
     $urlsArray = array_unique($urlsArray);
     // spider::$useragent = $rule['user_agent'];
     // spider::$encoding  = $rule['curl']['encoding'];
     // spider::$referer   = $rule['curl']['referer'];
     // spider::$charset   = $rule['charset'];
     if (empty($urlsArray)) {
         if ($work == 'shell') {
             echo "采集列表为空!请填写!\n";
             return false;
         }
         iPHP::alert('采集列表为空!请填写!', 'js:parent.window.iCMS_MODAL.destroy();');
     }
     //      if(spider::$ruleTest){
     //          echo "<pre>";
     //          print_r(iS::escapeStr($project));
     //          print_r(iS::escapeStr($rule));
     //          echo "</pre>";
     //          echo "<hr />";
     //      }
     if ($rule['mode'] == "2") {
         iPHP::import(iPHP_LIB . '/phpQuery.php');
         spider::$ruleTest && $_GET['pq_debug'] && (phpQuery::$debug = 1);
     }
     $pubArray = array();
     $pubCount = array();
     $pubAllCount = array();
     spider::$curl_proxy = $rule['proxy'];
     spider::$urlslast = null;
     foreach ($urlsArray as $key => $url) {
         $url = trim($url);
         spider::$urlslast = $url;
         if ($work == 'shell') {
             echo '开始采集列表:' . $url . "\n";
         }
         if (spider::$ruleTest) {
             echo '<b>抓取列表:</b>' . $url . "<br />";
         }
         $html = spiderTools::remote($url);
         if (empty($html)) {
             continue;
         }
         if ($rule['mode'] == "2") {
             $doc = phpQuery::newDocumentHTML($html, 'UTF-8');
             $list_area = $doc[trim($rule['list_area_rule'])];
             // if(strpos($rule['list_area_format'], 'DOM::')!==false){
             //     $list_area = spiderTools::dataClean($rule['list_area_format'], $list_area);
             // }
             if ($rule['list_area_format']) {
                 $list_area_format = trim($rule['list_area_format']);
                 if (strpos($list_area_format, 'ARRAY::') !== false) {
                     $list_area_format = str_replace('ARRAY::', '', $list_area_format);
                     $lists = array();
                     foreach ($list_area as $la_key => $la) {
                         $lists[] = phpQuery::pq($list_area_format, $la);
                     }
                 } else {
                     $lists = phpQuery::pq($list_area_format, $list_area);
                 }
             } else {
                 $lists = $list_area;
             }
             // $lists = $list_area;
             //echo 'list:getDocumentID:'.$lists->getDocumentID()."\n";
         } else {
             $list_area_rule = spiderTools::pregTag($rule['list_area_rule']);
             if ($list_area_rule) {
                 preg_match('|' . $list_area_rule . '|is', $html, $matches, $PREG_SET_ORDER);
                 $list_area = $matches['content'];
             } else {
                 $list_area = $html;
             }
             $html = null;
             unset($html);
             if (spider::$ruleTest) {
                 echo iS::escapeStr($rule['list_area_rule']);
                 //              echo iS::escapeStr($list_area);
                 echo "<hr />";
             }
             if ($rule['list_area_format']) {
                 $list_area = spiderTools::dataClean($rule['list_area_format'], $list_area);
             }
             preg_match_all('|' . spiderTools::pregTag($rule['list_url_rule']) . '|is', $list_area, $lists, PREG_SET_ORDER);
             $list_area = null;
             unset($list_area);
             if ($rule['sort'] == "1") {
                 //arsort($lists);
             } elseif ($rule['sort'] == "2") {
                 asort($lists);
             } elseif ($rule['sort'] == "3") {
                 shuffle($lists);
             }
         }
         if (spider::$ruleTest) {
             echo '<b>列表区域规则:</b>' . iS::escapeStr($rule['list_area_rule']);
             echo "<hr />";
             echo '<b>列表区域抓取结果:</b>' . iS::escapeStr($list_area);
             echo "<hr />";
             echo '<b>列表链接规则:</b>' . iS::escapeStr($rule['list_url_rule']);
             echo "<hr />";
             echo '<b>网址合成规则:</b>' . iS::escapeStr($rule['list_url']);
             echo "<hr />";
         }
         if ($prule_list_url) {
             $rule['list_url'] = $prule_list_url;
         }
         //PID@xx 返回URL列表
         if ($callback == 'CALLBACK@URL') {
             $cbListUrl = array();
             foreach ($lists as $lkey => $row) {
                 list(spider::$title, spider::$url) = spiderTools::title_url($row, $rule, $url);
                 if (spider::$url === false) {
                     continue;
                 }
                 // if(spider::checker($work)===true){
                 $cbListUrl[] = spider::$url;
                 // }
             }
             return $cbListUrl;
         }
         if ($work == "shell") {
             $pubCount[$url]['count'] = count($lists);
             $pubAllCount['count'] += $pubCount[$url]['count'];
             echo "开始采集:" . $url . " 列表 " . $pubCount[$url]['count'] . "条记录\n";
             foreach ($lists as $lkey => $row) {
                 list(spider::$title, spider::$url) = spiderTools::title_url($row, $rule, $url);
                 if (spider::$url === false) {
                     continue;
                 }
                 $hash = md5(spider::$url);
                 echo "title:" . spider::$title . "\n";
                 echo "url:" . spider::$url . "\n";
                 spider::$rid = $rid;
                 $checker = spider::checker($work);
                 if ($checker === true) {
                     echo "开始采集....";
                     $callback = spider::publish("shell");
                     if ($callback['code'] == "1001") {
                         $pubCount[$url]['success']++;
                         $pubAllCount['success']++;
                         echo "....√\n";
                         if ($project['sleep']) {
                             echo "sleep:" . $project['sleep'] . "s\n";
                             if ($rule['mode'] != "2") {
                                 unset($lists[$lkey]);
                             }
                             gc_collect_cycles();
                             sleep($project['sleep']);
                         } else {
                             //sleep(1);
                         }
                     } else {
                         $pubCount[$url]['error']++;
                         $pubAllCount['error']++;
                         echo "error\n\n";
                         continue;
                     }
                 }
                 $pubCount[$url]['published']++;
                 $pubAllCount['published']++;
             }
             if ($rule['mode'] == "2") {
                 phpQuery::unloadDocuments($doc->getDocumentID());
             } else {
                 unset($lists);
             }
         }
         if ($work == "WEB@MANUAL") {
             $listsArray[$url] = $lists;
         }
         if ($work == "WEB@AUTO" || $work == 'DATA@RULE') {
             foreach ($lists as $lkey => $row) {
                 list(spider::$title, spider::$url) = spiderTools::title_url($row, $rule, $url);
                 if (spider::$url === false) {
                     continue;
                 }
                 $hash = md5(spider::$url);
                 if (spider::$ruleTest) {
                     echo '<b>列表抓取结果:</b>' . $lkey . '<br />';
                     echo spider::$title . ' (<a href="' . APP_URI . '&do=testdata' . '&url=' . urlencode(spider::$url) . '&rid=' . $rid . '&pid=' . $pid . '&title=' . urlencode(spider::$title) . '" target="_blank">测试内容规则</a>) <br />';
                     echo spider::$url . "<br />";
                     echo $hash . "<br /><hr />";
                 } else {
                     if (spider::checker($work) === true || spider::$dataTest) {
                         $suData = array('sid' => 0, 'url' => spider::$url, 'title' => spider::$title, 'cid' => $cid, 'rid' => $rid, 'pid' => $pid, 'hash' => $hash);
                         switch ($work) {
                             case 'DATA@RULE':
                                 $contentArray[$lkey] = spiderData::crawl();
                                 // $contentArray[$lkey] = spiderUrls::crawl($work,$_pid);
                                 unset($suData['sid']);
                                 $suData['title'] = addslashes($suData['title']);
                                 $suData += array('addtime' => time(), 'status' => '2', 'publish' => '2', 'indexid' => '0', 'pubdate' => '0');
                                 spider::$dataTest or $suid = iDB::insert('spider_url', $suData);
                                 $contentArray[$lkey]['spider_url'] = $suid;
                                 break;
                             case 'WEB@AUTO':
                                 $pubArray[] = $suData;
                                 break;
                         }
                     }
                 }
             }
         }
     }
     $lists = null;
     unset($lists);
     gc_collect_cycles();
     switch ($work) {
         case 'WEB@AUTO':
             return $pubArray;
             break;
         case 'DATA@RULE':
             return $contentArray;
             break;
         case 'WEB@MANUAL':
             return array('cid' => $cid, 'rid' => $rid, 'pid' => $pid, 'sid' => $sid, 'work' => $work, 'rule' => $rule, 'listsArray' => $listsArray);
             break;
         case "shell":
             echo "采集数据统结果:\n";
             print_r($pubCount);
             print_r($pubAllCount);
             echo "全部采集完成....\n";
             iDB::update('spider_project', array('lastupdate' => time()), array('id' => $pid));
             break;
     }
 }
Esempio n. 15
0
 /**
  * Build (or load from cache) a list of threads scraped from a Baidu Tieba
  * forum page and hand it to the view as `Duotw`.
  *
  * The forum name, item limit, cache lifetime and display options are read
  * from the add-on configuration. Pinned threads (`.thread_top`) are only
  * collected when `tieba_jinghua` is 1; in that case their titles are also
  * excluded from the regular list so the same thread is not listed twice.
  *
  * NOTE(review): the exact merging/truncation done by havejinghua() and
  * deljinghua() is defined elsewhere in the class — confirm they accept
  * empty arrays for lists that matched nothing.
  *
  * @return void
  */
 public function index()
 {
     // Load this add-on's configuration.
     $config = Amango_Addons_Config();
     // Reuse the cached result when one exists.
     $article = S('ADDONS_SnatchTieba');
     if (empty($article)) {
         Amango_Addons_Import('phpQuery/phpQuery.php');
         \phpQuery::$defaultCharset = 'GBK';
         \phpQuery::newDocumentFile('http://tieba.baidu.com/f?kw=' . urlencode($config['tieba_name']) . '&fr=ala0');
         // Pre-seed both buckets so the helpers below never receive an undefined index.
         $articlecontent = array('top' => array(), 'other' => array());
         // Titles of pinned threads; must be filled BEFORE the regular list is
         // filtered, otherwise the in_array() check below has nothing to match.
         $toptitle = array();
         $withTop = $config['tieba_jinghua'] == 1;
         if ($withTop) {
             foreach (\pq(".thread_top") as $li) {
                 // Reply count.
                 $tz_commont = iconv('GBK', 'UTF-8', \pq($li)->find('.threadlist_rep_num')->html());
                 // Title.
                 $tz_title = iconv('GBK', 'UTF-8', \pq($li)->find('a.j_th_tit')->html());
                 // Link.
                 $tz_link = 'http://tieba.baidu.com' . iconv('GBK', 'UTF-8', \pq($li)->find('a.j_th_tit')->attr('href'));
                 // Picture of this very thread (previously a stale value left over
                 // from an unrelated loop iteration was used here).
                 $tz_pic = iconv('GBK', 'UTF-8', \pq($li)->find('img')->attr('original'));
                 $toptitle[] = $tz_title;
                 $articlecontent['top'][] = array('Title' => "[" . $tz_commont . "]" . $tz_title, 'Description' => '', 'PicUrl' => empty($tz_pic) ? '' : $tz_pic, 'Url' => $tz_link);
             }
         }
         foreach (\pq(".j_thread_list") as $li) {
             // Title; checked first so pinned threads can be skipped early.
             $tz_title = iconv('GBK', 'UTF-8', \pq($li)->find('a.j_th_tit')->html());
             if (in_array($tz_title, $toptitle, true)) {
                 continue;
             }
             // Reply count.
             $tz_commont = iconv('GBK', 'UTF-8', \pq($li)->find('.threadlist_rep_num')->html());
             // Abstract; strip whitespace and empty HTML comment markers.
             $tz_content = iconv('GBK', 'UTF-8', \pq($li)->find('.threadlist_abs_onlyline')->html());
             $tz_content = preg_replace('/\s/', '', $tz_content);
             $tz_content = str_replace('<!---->', '', $tz_content);
             // Link.
             $tz_link = 'http://tieba.baidu.com' . iconv('GBK', 'UTF-8', \pq($li)->find('a.j_th_tit')->attr('href'));
             // Author.
             $tz_author = strip_tags(iconv('GBK', 'UTF-8', \pq($li)->find('span.tb_icon_author a')->html()));
             $tz_author = preg_replace('/\s/', '', $tz_author);
             // Last replier.
             $tz_reply = iconv('GBK', 'UTF-8', \pq($li)->find('span.tb_icon_author_rely a')->html());
             // Last reply time.
             $tz_replytime = \pq($li)->find('span.j_reply_data')->text();
             $tz_replytime = preg_replace('/\s/', '', $tz_replytime);
             // Picture.
             $tz_pic = iconv('GBK', 'UTF-8', \pq($li)->find('img')->attr('original'));
             // The extended title adds reply count, author and last-reply info.
             if (1 == $config['tieba_extra']) {
                 $title = "[" . $tz_commont . "]" . $tz_title . "\n" . $tz_content . "\n作者:" . $tz_author . "|回复:" . $tz_reply . "-" . $tz_replytime;
             } else {
                 $title = $tz_title . "\n" . $tz_content;
             }
             $articlecontent['other'][] = array('Title' => $title, 'Description' => '', 'PicUrl' => empty($tz_pic) ? '' : $tz_pic, 'Url' => $tz_link);
         }
         // Clamp the number of items to the range 1..8.
         $allownums = $config['tieba_nums'] > 8 ? 8 : $config['tieba_nums'];
         $allownums = $allownums >= 1 ? $allownums : 1;
         if ($withTop) {
             $article = self::havejinghua($articlecontent['top'], $articlecontent['other'], $allownums);
         } else {
             $article = self::deljinghua($articlecontent['other'], $allownums);
         }
         // Release the parsed DOM held by phpQuery.
         \phpQuery::unloadDocuments();
         // Only cache non-empty results, and only when a cache lifetime is configured.
         if ($config['tieba_cache'] > 0 && !empty($article)) {
             S('ADDONS_SnatchTieba', $article, $config['tieba_cache']);
         }
     }
     $this->assign('Duotw', $article);
     $this->display();
 }