/**
 * Runs a named script against a phpQuery object.
 *
 * Resolution order:
 *  1. a callback registered under $arg1 in phpQueryPlugin_Scripts::$scriptMethods;
 *  2. otherwise a file "Scripts/{$arg1}.php" located next to this file
 *     (the name "__config" is never loaded this way);
 *  3. otherwise nothing is executed and a debug message is emitted.
 *
 * Every argument passed after $arg1 is collected into $params.
 *
 * A script file is loaded with require from inside this method, so it runs in
 * this method's local scope: it can read $self, $arg1, $params and $config,
 * and it can assign $return. The local variable names below are therefore
 * part of the contract with those script files and must not be renamed.
 *
 * @param phpQueryObject $self Object the script operates on.
 * @param string         $arg1 Script name.
 * @return mixed The value placed in $return by the script when it is truthy,
 *               otherwise $self.
 */
public static function script($self, $arg1)
{
    $params = array_slice(func_get_args(), 2);
    $return = null;
    $config = self::$config;

    // NOTE(review): $arg1 is interpolated into a require path without any
    // validation. If it can ever come from untrusted input, restrict it
    // (for example to [A-Za-z0-9_]+) before building the path.
    $scriptFile = dirname(__FILE__) . "/Scripts/{$arg1}.php";

    // empty() instead of a bare index read: an unregistered script name must
    // fall through to the file lookup without an "undefined index" warning.
    if (!empty(phpQueryPlugin_Scripts::$scriptMethods[$arg1])) {
        phpQuery::callbackRun(
            phpQueryPlugin_Scripts::$scriptMethods[$arg1],
            array($self, $params, &$return, $config)
        );
    } elseif ($arg1 != '__config' && file_exists($scriptFile)) {
        phpQuery::debug("Loading script '{$arg1}'");
        require $scriptFile;
    } else {
        phpQuery::debug("Requested script '{$arg1}' doesn't exist");
    }

    return $return ? $return : $self;
}
<?php
// Manual test script for phpQuery's replaceWith() and replaceAll().
// Each section prints a PASSED/FAILED line followed by a dump of the result.
require_once '../phpQuery/phpQuery.php';
phpQuery::$debug = true;

// --- replaceWith(): the second paragraph is swapped for new markup --------
$testName = 'ReplaceWith';
phpQuery::newDocumentFile('test.html')
    ->find('p:eq(1)')
    ->replaceWith("<p class='newTitle'>\n this is example title\n </p>");
$result = pq('p:eq(1)');
print $result->hasClass('newTitle')
    ? "Test '{$testName}' PASSED :)"
    : "Test '{$testName}' <strong>FAILED</strong> !!! ";
$result->dump();
print "\n";

// --- replaceAll(): every match of the selector becomes a .replacer div ----
$testName = 'ReplaceAll';
$testResult = 3; // expected number of .replacer elements afterwards
phpQuery::newDocumentFile('test.html');
pq('<div class="replacer">')->replaceAll('li:first p');
$result = pq('.replacer');
print $result->size() == $testResult
    ? "Test '{$testName}' PASSED :)"
    : "Test '{$testName}' <strong>FAILED</strong> !!! ";
$result->dump();
print "\n";
/**
 * Registers a handler for one event type (such as click) on a node.
 * Custom event names can be used as well.
 *
 * The handler is appended to the node's list for $type, so several handlers
 * may be bound to the same event. No argument shuffling is done here: $data
 * and $callback are stored exactly as given.
 *
 * @param DOMNode|phpQueryObject|string $document Used to resolve the document ID.
 * @param mixed $node     Node the handler is attached to.
 * @param string $type    Event type.
 * @param mixed $data     Data stored alongside the callback.
 * @param mixed $callback Handler to store; defaults to null.
 *
 * @TODO support '!' (exclusive) events
 * @TODO support more than event in $type (space-separated)
 * @TODO support binding to global events
 */
public static function add($document, $node, $type, $data, $callback = null)
{
    phpQuery::debug("Binding '{$type}' event");

    $docId = phpQuery::getDocumentID($document);

    // Reuse the node's existing event record, creating it on first use.
    $target = self::getNode($docId, $node);
    if (!$target) {
        $target = self::setNode($docId, $node);
    }

    if (!isset($target->eventHandlers[$type])) {
        $target->eventHandlers[$type] = array();
    }

    $handler = array(
        'callback' => $callback,
        'data' => $data,
    );
    $target->eventHandlers[$type][] = $handler;
}
/**
 * Forwards a debug message to phpQuery::debug().
 *
 * @param mixed $text Message to pass on unchanged.
 */
public static function debug($text)
{
    phpQuery::debug($text);
}
/**
 * Prints a tree dump of every node in the current stack.
 *
 * phpQuery::$debug is switched off while the tree is rendered and restored
 * afterwards (presumably so debug output does not end up inside the dump).
 *
 * @param bool $html  When true the dump is prepared for a browser: spaces
 *                    become &nbsp; and newlines become <br />, so the
 *                    indentation survives HTML whitespace collapsing.
 *                    When false the raw text is printed.
 * @param bool $title When true the dump is prefixed with "DUMP #<n>", where
 *                    <n> is phpQuery::$dumpCount (incremented on use).
 * @return $this
 */
public function dumpTree($html = true, $title = true)
{
    $output = $title ? 'DUMP #' . phpQuery::$dumpCount++ . " \n" : '';

    $debug = phpQuery::$debug;
    phpQuery::$debug = false;
    foreach ($this->stack() as $node) {
        $output .= $this->__dumpTree($node);
    }
    phpQuery::$debug = $debug;

    // The replacement must be the &nbsp; entity: swapping a space for a space
    // is a no-op and the browser would collapse the tree's indentation.
    print $html ? nl2br(str_replace(' ', '&nbsp;', $output)) : $output;

    return $this;
}
*/
/**
 * Minimal phpQuery class declared locally, before DOMDocumentWrapper.php is
 * required. It only carries the static settings below and a debug printer.
 * NOTE(review): presumably a stand-in so DOMDocumentWrapper can be exercised
 * without loading the full library - confirm.
 */
class phpQuery {
    static $defaultDocumentID;
    // Any truthy value enables debug() output.
    static $debug = 0;
    static $documents = array();
    static $defaultCharset = 'utf-8';
    /**
     * Dumps $text when debugging is enabled.
     * var_dump() writes its own output and returns nothing, so the
     * surrounding print adds no extra text.
     */
    static function debug($text) {
        if (self::$debug) {
            print var_dump($text);
        }
    }
}
require_once '../src/phpQuery/DOMDocumentWrapper.php';
phpQuery::$debug = 2;
/* ENCODINGS */
//print '<meta http-equiv="Content-Type" content="text/html;charset=iso-8859-2">';
print '<meta http-equiv="Content-Type" content="text/html;charset=utf-8">';
/* HTML */
//$htmlIso = new DOMDocumentWrapper(
// file_get_contents('document-types/document-iso88592.html')
//);
//$htmlIsoNoCharset = new DOMDocumentWrapper(
// file_get_contents('document-types/document-iso88592-nocharset.html'),
// 'text/html;charset=iso-8859-2'
//);
// Only the UTF-8 document with a declared charset is loaded; its markup is dumped.
$htmlUtf = new phpQuery\DOMDocumentWrapper(file_get_contents('document-types/document-utf8.html'));
var_dump($htmlUtf->markup());
//$htmlUtfNoCharset = new DOMDocumentWrapper(
// file_get_contents('document-types/document-utf8-nocharset.html'),
/**
 * Turns the last response of an HTTP client into a phpQuery document and
 * follows <meta http-equiv="refresh"> redirects.
 *
 * The client and its URI are stored on the resulting document as
 * $pq->document->xhr and $pq->document->location.
 *
 * When the page contains a meta refresh that names a target URL, that URL is
 * allowed via phpQuery::ajaxAllowURL(), fetched with a GET request that
 * reuses $xhr, and this method is applied again to the new response.
 * A meta refresh without a target URL (for example content="5") is not
 * followed; the received document is returned as is.
 *
 * @param Zend_Http_Client $xhr
 * @return phpQueryObject|null The final document, or null when a redirect
 *                             request did not succeed.
 */
public static function browserReceive($xhr)
{
    phpQuery::debug("[WebBrowser] Received from " . $xhr->getUri(true));
    $body = $xhr->getLastResponse()->getBody();

    // XXX error ???
    if (strpos($body, '<!doctype html>') !== false) {
        $body = '<html>' . str_replace('<!doctype html>', '', $body) . '</html>';
    }

    $pq = phpQuery::newDocument($body);
    $pq->document->xhr = $xhr;
    $pq->document->location = $xhr->getUri(true);

    $refresh = $pq->find('meta[http-equiv=refresh]')->add('meta[http-equiv=Refresh]');
    if (!$refresh->size()) {
        return $pq;
    }

    $content = $refresh->attr('content');
    phpQuery::debug("Meta redirect... '{$content}'\n");

    // The target follows the first "=" ("0; url=http://..."). Without one,
    // strpos() returns false and "false + 1" would silently cut the first
    // character off the delay value and treat the rest as a URL.
    $separator = strpos($content, '=');
    if ($separator === false) {
        return $pq;
    }
    // Strip surrounding whitespace as well as quotes ("url = 'http://...'").
    $urlRefresh = trim(substr($content, $separator + 1), " \t\n\r\0\x0B'\"");
    if ($urlRefresh === '') {
        return $pq;
    }

    // XXX not secure ?! - the URL comes from the remote page and is
    // whitelisted without any check of its host.
    phpQuery::ajaxAllowURL($urlRefresh);

    // Make the ajax call, passing the last $xhr object to preserve important stuff.
    $xhr = phpQuery::ajax(
        array('type' => 'GET', 'url' => $urlRefresh, 'dataType' => 'html'),
        $xhr
    );
    if ($xhr->getLastResponse()->isSuccessful()) {
        // All is ok: repeat this method on the redirected response.
        return phpQueryPlugin_WebBrowser::browserReceive($xhr);
    }

    return null;
}
/**
 * Extracts one field's content from a page according to a data rule.
 *
 * Two extraction modes, selected by $data['dom']:
 *  - DOM mode: $data['rule'] is either "selector@attribute" (reads the
 *    attribute) or up to three newline-separated parts
 *    "selector\nmethod\nargument"; the method defaults to "html" and is
 *    called on the matched phpQuery object.
 *  - Regex mode: the rule "<%content%>" returns $html unchanged. Otherwise
 *    the rule is converted with spiderTools::pregTag(); if the result looks
 *    like a pattern it is matched against $html and the named group
 *    "content" is returned, else the converted rule itself is returned.
 *
 * With $data['multi'] every match is collected and joined with
 * "#--iCMS.PageBreak--#". Collection stops at the first piece whose content
 * was already seen.
 *
 * @param string $html Page source.
 * @param array  $data Data rule; the keys read here are 'dom', 'rule' and 'multi'.
 * @param array  $rule Not used by this method.
 * @return string|null Extracted content; null when a single-match pattern
 *                     does not match.
 */
public static function match($html, $data, $rule)
{
    $match_hash = array();

    if ($data['dom']) {
        iPHP::import(iPHP_LIB . '/phpQuery.php');
        spider::$dataTest && !empty($_GET['pq_debug']) && (phpQuery::$debug = 1);
        $doc = phpQuery::newDocumentHTML($html, 'UTF-8');

        // array_pad() keeps list() from reading offsets the rule does not have.
        if (strpos($data['rule'], '@') !== false) {
            list($content_dom, $content_attr) = array_pad(explode("@", $data['rule']), 2, '');
            $content_fun = 'attr';
        } else {
            list($content_dom, $content_fun, $content_attr) = array_pad(explode("\n", $data['rule']), 3, '');
        }
        $content_dom = trim($content_dom);
        $content_fun = trim($content_fun);
        $content_attr = trim($content_attr);
        $content_fun or $content_fun = 'html';

        if ($data['multi']) {
            $conArray = array();
            $_content = null;
            foreach ($doc[$content_dom] as $doc_key => $doc_value) {
                if ($content_attr) {
                    $_content = phpQuery::pq($doc_value)->{$content_fun}($content_attr);
                } else {
                    $_content = phpQuery::pq($doc_value)->{$content_fun}();
                }
                $cmd5 = md5((string) $_content);
                // Stop at the first repeated piece of content.
                if (isset($match_hash[$cmd5])) {
                    break;
                }
                $conArray[$doc_key] = $_content;
                $match_hash[$cmd5] = true;
            }
            if (spider::$dataTest) {
                echo "<b>多条匹配结果:</b><pre>";
                print_r($match_hash);
                echo "</pre><hr />";
            }
            $content = implode('#--iCMS.PageBreak--#', $conArray);
            unset($conArray, $_content, $match_hash);
        } else {
            if ($content_attr) {
                $content = $doc[$content_dom]->{$content_fun}($content_attr);
            } else {
                $content = $doc[$content_dom]->{$content_fun}();
            }
        }
        phpQuery::unloadDocuments($doc->getDocumentID());
        unset($doc);

        return $content;
    }

    if (trim($data['rule']) == '<%content%>') {
        return $html;
    }

    $data_rule = spiderTools::pregTag($data['rule']);
    // A rule without any pattern syntax is returned as a literal value.
    if (!preg_match('/(<\\w+>|\\.\\*|\\.\\+|\\\\d|\\\\w)/i', $data_rule)) {
        return $data_rule;
    }

    if ($data['multi']) {
        preg_match_all('|' . $data_rule . '|is', $html, $matches, PREG_SET_ORDER);
        $conArray = array();
        foreach ((array) $matches as $mkey => $mat) {
            $piece = isset($mat['content']) ? $mat['content'] : '';
            $cmd5 = md5($piece);
            // Stop at the first repeated piece of content.
            if (isset($match_hash[$cmd5])) {
                break;
            }
            $conArray[$mkey] = $piece;
            $match_hash[$cmd5] = true;
        }
        if (spider::$dataTest) {
            echo "<b>多条匹配结果:</b><pre>";
            print_r($match_hash);
            echo "</pre><hr />";
        }
        return implode('#--iCMS.PageBreak--#', $conArray);
    }

    // PREG_SET_ORDER is a preg_match_all() flag; preg_match() takes no such
    // argument (the original passed an undefined variable here).
    preg_match('|' . $data_rule . '|is', $html, $matches);

    return isset($matches['content']) ? $matches['content'] : null;
}
/**
 * Returns the raw body of the client's last response.
 *
 * @param Zend_Http_Client $xhr
 * @return mixed Whatever getBody() returns for the last response.
 *
 * @TODO handle meta redirects
 */
public static function browserDownload($xhr)
{
    $uri = $xhr->getUri(true);
    phpQuery::debug("[WebBrowser] Received from " . $uri);

    $response = $xhr->getLastResponse();

    return $response->getBody();
}
/**
 * Delegates to the WebBrowser plugin's browser() method.
 *
 * All arguments received by this method are passed on to the plugin in the
 * same order. When the plugin cannot be loaded, a debug message is emitted
 * and nothing is returned.
 *
 * @param $ajaxSettings
 * @param $callback
 * @param $param1
 * @param $param2
 * @param $param3
 * @return phpQueryObject
 */
public static function browser($ajaxSettings, $callback, $param1 = null, $param2 = null, $param3 = null)
{
    if (!self::plugin('WebBrowser')) {
        phpQuery::debug('WebBrowser plugin not available...');
        return;
    }

    $arguments = func_get_args();
    $target = array(self::$plugins, 'browser');

    return self::callbackRun($target, $arguments);
}
/**
 * Extracts and post-processes one field's content from a fetched page.
 *
 * Steps, in order:
 *  1. Pagination ($data['page']): on the first call the URLs of the other
 *     pages are discovered (DOM selector, regex, or a "<%step%>" template),
 *     fetched, and appended to $html; the merged HTML is cached in
 *     $this->allHtml and reused by later calls.
 *  2. Extraction: DOM mode ($data['dom'], rule = "selector\nmethod\nargument")
 *     or regex mode (named group "content"); the rule "<%content%>" takes the
 *     whole HTML. With $data['multi'] all matches are joined with
 *     "#--iCMS.PageBreak--#".
 *  3. Post-processing driven by flags in $data: cleanbefor, cleanhtml,
 *     format, img_absolute, trim, capture, cleanafter, mergepage, empty,
 *     json_decode, array.
 *
 * @param string $html Page source.
 * @param array  $data Field rule (flags listed above plus 'name' and 'rule').
 * @param array  $rule Crawl rule; '__url__' is read as the page's own URL.
 * @return mixed Null when the field rule is blank, false when the content is
 *               empty in work mode and $data['empty'] is set, otherwise the
 *               processed content (an array when $data['array'] is set).
 */
function content($html, $data, $rule)
{
    if (trim($data['rule']) === '') {
        return;
    }
    $name = $data['name'];

    if ($data['page']) {
        if (empty($rule['page_url'])) {
            $rule['page_url'] = $rule['list_url'];
        }
        if (empty($this->allHtml)) {
            $page_url_array = array();
            $page_url_rule = null; // echoed in test mode even when no rule was built
            $pageurl = array();
            $page_area_rule = trim($rule['page_area_rule']);

            if ($page_area_rule) {
                if (strpos($page_area_rule, 'DOM::') !== false) {
                    // Page links are selected with a phpQuery selector.
                    iPHP::import(iPHP_LIB . '/phpQuery.php');
                    $doc = phpQuery::newDocumentHTML($html, 'UTF-8');
                    $pq_dom = str_replace('DOM::', '', $page_area_rule);
                    $pq_array = phpQuery::pq($pq_dom);
                    foreach ($pq_array as $pn => $pq_val) {
                        $href = phpQuery::pq($pq_val)->attr('href');
                        if ($href) {
                            if ($rule['page_url_rule']) {
                                $page_url_rule = $this->pregTag($rule['page_url_rule']);
                                if (!preg_match('|' . $page_url_rule . '|is', $href)) {
                                    continue;
                                }
                            }
                            $href = str_replace('<%url%>', $href, $rule['page_url']);
                            $page_url_array[$pn] = $this->_url_complement($rule['__url__'], $href);
                        }
                    }
                } else {
                    // Page links are matched with regular expressions.
                    $page_area_rule = $this->pregTag($page_area_rule);
                    if ($page_area_rule) {
                        // preg_match() has no PREG_SET_ORDER flag; the original
                        // passed an undefined variable as fourth argument.
                        preg_match('|' . $page_area_rule . '|is', $html, $matches);
                        $page_area = isset($matches['content']) ? $matches['content'] : '';
                    } else {
                        $page_area = $html;
                    }
                    if ($rule['page_url_rule']) {
                        $page_url_rule = $this->pregTag($rule['page_url_rule']);
                        preg_match_all('|' . $page_url_rule . '|is', $page_area, $page_url_matches, PREG_SET_ORDER);
                        foreach ($page_url_matches as $pn => $row) {
                            $href = str_replace('<%url%>', $row['url'], $rule['page_url']);
                            $page_url_array[$pn] = $this->_url_complement($rule['__url__'], $href);
                            gc_collect_cycles();
                        }
                    }
                    unset($page_area);
                }

                if ($page_url_array) {
                    $page_url_array = array_filter($page_url_array);
                    $page_url_array = array_unique($page_url_array);
                    // Drop the current page itself. array_search() returns
                    // false when it is absent; using false as a key would
                    // delete element 0, i.e. a real page URL.
                    $puk = array_search($rule['__url__'], $page_url_array);
                    if ($puk !== false) {
                        unset($page_url_array[$puk]);
                    }
                }
            } else {
                // Logic mode: page URLs are generated from a template.
                if ($rule['page_url_parse'] == '<%url%>') {
                    $page_url = str_replace('<%url%>', $rule['__url__'], $rule['page_url']);
                } else {
                    $page_url_rule = $this->pregTag($rule['page_url_parse']);
                    preg_match('|' . $page_url_rule . '|is', $rule['__url__'], $matches);
                    $matched_url = isset($matches['url']) ? $matches['url'] : '';
                    $page_url = str_replace('<%url%>', $matched_url, $rule['page_url']);
                }
                if (stripos($page_url, '<%step%>') !== false) {
                    // A zero, negative or empty step would never reach
                    // page_no_end, so fall back to 1.
                    $page_no_step = (int) $rule['page_no_step'];
                    if ($page_no_step < 1) {
                        $page_no_step = 1;
                    }
                    for ($pn = $rule['page_no_start']; $pn <= $rule['page_no_end']; $pn = $pn + $page_no_step) {
                        $page_url_array[$pn] = str_replace('<%step%>', $pn, $page_url);
                        gc_collect_cycles();
                    }
                }
            }

            if ($this->contTest) {
                echo $rule['__url__'] . "<br />";
                echo $rule['page_url'] . "<br />";
                echo iS::escapeStr($page_url_rule);
                echo "<hr />";
            }
            if ($this->contTest) {
                echo "<pre>";
                print_r($page_url_array);
                echo "</pre><hr />";
            }

            $this->content_right_code = trim($rule['page_url_right']);
            $this->content_error_code = trim($rule['page_url_error']);
            $this->curl_proxy = $rule['proxy'];

            // Fetch the pages in order; stop at the first page that is empty
            // or fails the content check.
            $pcon = '';
            foreach ($page_url_array as $pukey => $purl) {
                $phtml = $this->remote($purl);
                if (empty($phtml)) {
                    break;
                }
                $phttp = $this->check_content_code($phtml);
                if ($phttp['match'] == false) {
                    break;
                }
                $pageurl[] = $purl;
                $pcon .= $phttp['content'];
            }
            gc_collect_cycles();
            $html .= $pcon;
            unset($pcon);
            $this->allHtml = $html;

            if ($this->contTest) {
                echo "<pre>";
                print_r($pageurl);
                echo "</pre><hr />";
            }
        } else {
            $html = $this->allHtml;
        }
    }

    if ($data['dom']) {
        iPHP::import(iPHP_LIB . '/phpQuery.php');
        $this->contTest && !empty($_GET['pq_debug']) && (phpQuery::$debug = 1);
        $doc = phpQuery::newDocumentHTML($html, 'UTF-8');

        // Rule lines: selector, method (default "html"), optional argument.
        list($content_dom, $content_fun, $content_attr) = array_pad(explode("\n", $data['rule']), 3, '');
        $content_dom = trim($content_dom);
        $content_fun = trim($content_fun);
        $content_attr = trim($content_attr);
        $content_fun or $content_fun = 'html';

        if ($data['multi']) {
            $conArray = array();
            foreach ($doc[$content_dom] as $doc_key => $doc_value) {
                if ($content_attr) {
                    $conArray[] = phpQuery::pq($doc_value)->{$content_fun}($content_attr);
                } else {
                    $conArray[] = phpQuery::pq($doc_value)->{$content_fun}();
                }
            }
            $content = implode('#--iCMS.PageBreak--#', $conArray);
            unset($conArray);
        } else {
            if ($content_attr) {
                $content = $doc[$content_dom]->{$content_fun}($content_attr);
            } else {
                $content = $doc[$content_dom]->{$content_fun}();
            }
        }
        if ($this->contTest) {
            print_r(htmlspecialchars($content));
            echo "<hr />";
        }
        phpQuery::unloadDocuments($doc->getDocumentID());
        unset($doc);
    } else {
        if (trim($data['rule']) == '<%content%>') {
            $content = $html;
        } else {
            $data_rule = $this->pregTag($data['rule']);
            if ($this->contTest) {
                print_r(iS::escapeStr($data_rule));
                echo "<hr />";
            }
            if (preg_match('/(<\\w+>|\\.\\*|\\.\\+|\\\\d|\\\\w)/i', $data_rule)) {
                if ($data['multi']) {
                    preg_match_all('|' . $data_rule . '|is', $html, $matches, PREG_SET_ORDER);
                    $conArray = array();
                    foreach ((array) $matches as $mkey => $mat) {
                        $conArray[] = $mat['content'];
                    }
                    $content = implode('#--iCMS.PageBreak--#', $conArray);
                    if ($this->contTest) {
                        print_r(htmlspecialchars($content));
                        echo "<hr />";
                    }
                } else {
                    preg_match('|' . $data_rule . '|is', $html, $matches);
                    $content = isset($matches['content']) ? $matches['content'] : null;
                }
            } else {
                // A rule without pattern syntax is used as a literal value.
                $content = $data_rule;
            }
        }
    }
    $html = null;
    unset($html);

    if ($data['cleanbefor']) {
        $content = $this->dataClean($data['cleanbefor'], $content);
    }
    if ($data['cleanhtml']) {
        $content = preg_replace('/<[\\/\\!]*?[^<>]*?>/is', '', $content);
    }
    if ($data['format'] && $content) {
        $content = autoformat($content);
        $content = stripslashes($content);
    }
    if ($data['img_absolute'] && $content) {
        // Rewrite every <img src> to an absolute URL based on the page URL.
        preg_match_all("/<img.*?src\\s*=[\"|'](.*?)[\"|']/is", $content, $img_match);
        if ($img_match[1]) {
            $_img_array = array_unique($img_match[1]);
            $_img_urls = array();
            foreach ((array) $_img_array as $_img_key => $_img_src) {
                $_img_urls[$_img_key] = $this->_url_complement($rule['__url__'], $_img_src);
            }
            $content = str_replace($_img_array, $_img_urls, $content);
        }
    }
    $data['trim'] && ($content = trim($content));
    if ($data['capture']) {
        // The extracted content is itself a URL: fetch it.
        $capture = str_replace('\\', '', $content);
        $content = $this->remote($capture);
    }
    if ($data['cleanafter']) {
        $content = $this->dataClean($data['cleanafter'], $content);
    }
    if ($data['mergepage']) {
        $_content = $content;
        preg_match_all("/<img.*?src\\s*=[\"|'|\\s]*(http:\\/\\/.*?\\.(gif|jpg|jpeg|bmp|png)).*?>/is", $_content, $picArray);
        $pA = array_unique($picArray[1]);
        $pA = array_filter($pA);
        $_pcount = count($pA);
        if ($_pcount < 4) {
            // Fewer than four distinct images: merge everything into one page.
            $content = str_replace('#--iCMS.PageBreak--#', "", $content);
        } else {
            $contentA = explode("#--iCMS.PageBreak--#", $_content);
            $newcontent = array();
            $this->checkpage($newcontent, $contentA, 2);
            if (is_array($newcontent)) {
                $content = array_filter($newcontent);
                $content = implode('#--iCMS.PageBreak--#', $content);
            } else {
                $content = $newcontent;
            }
            unset($newcontent, $contentA);
        }
        unset($_content);
    }
    if ($data['empty'] && empty($content)) {
        if ($this->work) {
            echo "\n[" . $name . "内容为空!请检查,规则是否正确!]\n";
            return false;
        } else {
            $this->contTest && (iPHP::$dialog['alert'] = 'window');
            iPHP::alert($name . '内容为空!请检查,规则是否正确!!');
        }
    }
    if ($data['json_decode']) {
        $content = json_decode($content, true);
    }
    if ($data['array']) {
        return (array) $content;
    }
    return $content;
}
/**
 * Crawls the list pages of a spider project/rule and processes the links
 * found on them.
 *
 * The list URLs come from, in increasing priority: the rule's 'list_urls',
 * the project's 'urls', spiderUrls::$urls, and $_urls. Each line may be
 *  - a plain URL,
 *  - "RULE@rid@url": the URL is crawled with rule [rid] and the links it
 *    yields become list URLs,
 *  - a URL containing "<format,begin,num,step,zeroize,reverse>", which is
 *    expanded with spiderTools::mkurls().
 *
 * For rule mode "2" list items are selected with phpQuery, otherwise with
 * regular expressions.
 *
 * What happens with the list items depends on $work / $callback:
 *  - $callback 'CALLBACK@URL': the item URLs of the first list page that
 *    could be fetched are returned.
 *  - 'shell': every item is checked and published, progress is echoed and
 *    the project's lastupdate is stored; nothing is returned.
 *  - 'WEB@MANUAL': returns an array with cid, rid, pid, sid, work, rule and
 *    the raw lists per list URL.
 *  - 'WEB@AUTO': returns the list of URL records that passed the checker.
 *  - 'DATA@RULE': crawls each item with spiderData::crawl(), records it in
 *    spider_url (unless spider::$dataTest) and returns the crawled data.
 *
 * @param string|null $work     Work mode, see above.
 * @param mixed       $pid      Project ID; null falls back to spider::$pid.
 * @param mixed       $_rid     Rule ID used when none is otherwise known.
 * @param string|null $_urls    Newline-separated list URLs overriding all others.
 * @param string|null $callback 'CALLBACK@URL' to get the item URLs back.
 * @return mixed See the work modes above; false when the shell run has no list URLs.
 */
public static function crawl($work = NULL, $pid = NULL, $_rid = NULL, $_urls = null, $callback = null)
{
    $pid === NULL && ($pid = spider::$pid);

    // Defined up front: without a project these were read while undefined.
    $project = array();
    $prule_list_url = null;
    if ($pid) {
        $project = spider::project($pid);
        $cid = $project['cid'];
        $rid = $project['rid'];
        $prule_list_url = $project['list_url'];
    } else {
        $cid = spider::$cid;
        $rid = spider::$rid;
    }
    if (empty($rid) && $_rid !== NULL) {
        $rid = $_rid;
    }

    if ($work == 'shell') {
        // Respect the project's minimum interval between two runs.
        if (!empty($project['psleep'])) {
            $lastupdate = $project['lastupdate'];
            if (time() - $lastupdate < $project['psleep']) {
                echo '采集方案[' . $pid . "]:" . format_date($lastupdate) . "刚采集过了,请" . ($project['psleep'] / 3600) . "小时后在继续采集\n";
                return;
            }
        }
        // ANSI green; the escape byte is written explicitly as \033.
        echo "\033[32m开始采集方案[" . $pid . "] 采集规则[" . $rid . "]\033[0m\n";
    }

    $ruleA = spider::rule($rid);
    $rule = $ruleA['rule'];

    $urls = $rule['list_urls'];
    !empty($project['urls']) && ($urls = $project['urls']);
    spiderUrls::$urls && ($urls = spiderUrls::$urls);
    $_urls && ($urls = $_urls);

    $urlsArray = explode("\n", $urls);
    $urlsArray = array_filter($urlsArray);
    $_urlsArray = $urlsArray;
    $urlsList = array();
    if ($work == 'shell') {
        print_r($urlsArray);
    }

    // Expand "RULE@rid@url" entries and "<...>" range templates.
    foreach ($_urlsArray as $_key => $_url) {
        $_url = htmlspecialchars_decode($_url);
        $_urlsList = array();
        if (strpos($_url, 'RULE@') !== false) {
            // RULE@rid@url: crawl url with rule [rid] and use the resulting list.
            list($___s, $_rid, $_urls) = array_pad(explode('@', $_url), 3, null);
            if (spider::$ruleTest) {
                print_r('<b>使用[rid:' . $_rid . ']规则抓取列表</b>:' . $_urls);
                echo "<hr />";
            }
            $_urlsList = spiderUrls::crawl($work, false, $_rid, $_urls, 'CALLBACK@URL');
            // The nested call returns null/false when it found nothing.
            if (is_array($_urlsList)) {
                $urlsList = array_merge($urlsList, $_urlsList);
            }
            unset($urlsArray[$_key]);
        } else {
            preg_match('|.*<(.*)>.*|is', $_url, $_matches);
            if ($_matches) {
                list($format, $begin, $num, $step, $zeroize, $reverse) = array_pad(explode(',', $_matches[1]), 6, null);
                $url = str_replace($_matches[1], '*', trim($_matches[0]));
                $_urlsList = spiderTools::mkurls($url, $format, $begin, $num, $step, $zeroize, $reverse);
                unset($urlsArray[$_key]);
                $urlsList = array_merge($urlsList, $_urlsList);
            }
        }
    }
    $urlsList && ($urlsArray = array_merge($urlsArray, $urlsList));
    unset($_urlsArray, $_key, $_url, $_matches, $_urlsList, $urlsList);
    $urlsArray = array_unique($urlsArray);

    if (empty($urlsArray)) {
        if ($work == 'shell') {
            echo "采集列表为空!请填写!\n";
            return false;
        }
        iPHP::alert('采集列表为空!请填写!', 'js:parent.window.iCMS_MODAL.destroy();');
    }

    if ($rule['mode'] == "2") {
        iPHP::import(iPHP_LIB . '/phpQuery.php');
        spider::$ruleTest && !empty($_GET['pq_debug']) && (phpQuery::$debug = 1);
    }

    $pubArray = array();
    $pubCount = array();
    $pubAllCount = array();
    // Kept as null (not array()) so the return values stay what they were
    // when nothing was collected.
    $listsArray = null;
    $contentArray = null;
    $sid = null;

    spider::$curl_proxy = $rule['proxy'];
    spider::$urlslast = null;

    foreach ($urlsArray as $key => $url) {
        $url = trim($url);
        spider::$urlslast = $url;
        if ($work == 'shell') {
            echo '开始采集列表:' . $url . "\n";
        }
        if (spider::$ruleTest) {
            echo '<b>抓取列表:</b>' . $url . "<br />";
        }
        $html = spiderTools::remote($url);
        if (empty($html)) {
            continue;
        }

        if ($rule['mode'] == "2") {
            // DOM mode: select the list area and its items with phpQuery.
            $doc = phpQuery::newDocumentHTML($html, 'UTF-8');
            $list_area = $doc[trim($rule['list_area_rule'])];
            if ($rule['list_area_format']) {
                $list_area_format = trim($rule['list_area_format']);
                if (strpos($list_area_format, 'ARRAY::') !== false) {
                    $list_area_format = str_replace('ARRAY::', '', $list_area_format);
                    $lists = array();
                    foreach ($list_area as $la_key => $la) {
                        $lists[] = phpQuery::pq($list_area_format, $la);
                    }
                } else {
                    $lists = phpQuery::pq($list_area_format, $list_area);
                }
            } else {
                $lists = $list_area;
            }
        } else {
            // Regex mode: cut out the list area, then match every link in it.
            $list_area_rule = spiderTools::pregTag($rule['list_area_rule']);
            if ($list_area_rule) {
                // preg_match() has no PREG_SET_ORDER flag; the original
                // passed an undefined variable as fourth argument.
                preg_match('|' . $list_area_rule . '|is', $html, $matches);
                $list_area = isset($matches['content']) ? $matches['content'] : '';
            } else {
                $list_area = $html;
            }
            $html = null;
            unset($html);
            if (spider::$ruleTest) {
                echo iS::escapeStr($rule['list_area_rule']);
                echo "<hr />";
            }
            if ($rule['list_area_format']) {
                $list_area = spiderTools::dataClean($rule['list_area_format'], $list_area);
            }
            preg_match_all('|' . spiderTools::pregTag($rule['list_url_rule']) . '|is', $list_area, $lists, PREG_SET_ORDER);
            // Release the text but keep the variable defined: it is still
            // echoed by the rule-test output below.
            $list_area = null;
            if ($rule['sort'] == "2") {
                asort($lists);
            } elseif ($rule['sort'] == "3") {
                shuffle($lists);
            }
        }

        if (spider::$ruleTest) {
            echo '<b>列表区域规则:</b>' . iS::escapeStr($rule['list_area_rule']);
            echo "<hr />";
            echo '<b>列表区域抓取结果:</b>' . iS::escapeStr($list_area);
            echo "<hr />";
            echo '<b>列表链接规则:</b>' . iS::escapeStr($rule['list_url_rule']);
            echo "<hr />";
            echo '<b>网址合成规则:</b>' . iS::escapeStr($rule['list_url']);
            echo "<hr />";
        }
        if ($prule_list_url) {
            $rule['list_url'] = $prule_list_url;
        }

        // Nested "RULE@" call: hand back the item URLs of this list page.
        if ($callback == 'CALLBACK@URL') {
            $cbListUrl = array();
            foreach ($lists as $lkey => $row) {
                list(spider::$title, spider::$url) = spiderTools::title_url($row, $rule, $url);
                if (spider::$url === false) {
                    continue;
                }
                $cbListUrl[] = spider::$url;
            }
            return $cbListUrl;
        }

        if ($work == "shell") {
            $pubCount[$url]['count'] = count($lists);
            $pubAllCount['count'] += $pubCount[$url]['count'];
            echo "开始采集:" . $url . " 列表 " . $pubCount[$url]['count'] . "条记录\n";
            foreach ($lists as $lkey => $row) {
                list(spider::$title, spider::$url) = spiderTools::title_url($row, $rule, $url);
                if (spider::$url === false) {
                    continue;
                }
                echo "title:" . spider::$title . "\n";
                echo "url:" . spider::$url . "\n";
                spider::$rid = $rid;
                $checker = spider::checker($work);
                if ($checker === true) {
                    echo "开始采集....";
                    // Own variable: the original stored this in $callback and
                    // thereby overwrote the method's parameter.
                    $published = spider::publish("shell");
                    if ($published['code'] == "1001") {
                        $pubCount[$url]['success']++;
                        $pubAllCount['success']++;
                        echo "....√\n";
                        if (!empty($project['sleep'])) {
                            echo "sleep:" . $project['sleep'] . "s\n";
                            if ($rule['mode'] != "2") {
                                unset($lists[$lkey]);
                            }
                            gc_collect_cycles();
                            sleep($project['sleep']);
                        }
                    } else {
                        $pubCount[$url]['error']++;
                        $pubAllCount['error']++;
                        echo "error\n\n";
                        continue;
                    }
                }
                $pubCount[$url]['published']++;
                $pubAllCount['published']++;
            }
            if ($rule['mode'] == "2") {
                phpQuery::unloadDocuments($doc->getDocumentID());
            } else {
                unset($lists);
            }
        }

        if ($work == "WEB@MANUAL") {
            $listsArray[$url] = $lists;
        }

        if ($work == "WEB@AUTO" || $work == 'DATA@RULE') {
            foreach ($lists as $lkey => $row) {
                list(spider::$title, spider::$url) = spiderTools::title_url($row, $rule, $url);
                if (spider::$url === false) {
                    continue;
                }
                $hash = md5(spider::$url);
                if (spider::$ruleTest) {
                    echo '<b>列表抓取结果:</b>' . $lkey . '<br />';
                    echo spider::$title . ' (<a href="' . APP_URI . '&do=testdata' . '&url=' . urlencode(spider::$url) . '&rid=' . $rid . '&pid=' . $pid . '&title=' . urlencode(spider::$title) . '" target="_blank">测试内容规则</a>) <br />';
                    echo spider::$url . "<br />";
                    echo $hash . "<br /><hr />";
                } else {
                    if (spider::checker($work) === true || spider::$dataTest) {
                        $suData = array('sid' => 0, 'url' => spider::$url, 'title' => spider::$title, 'cid' => $cid, 'rid' => $rid, 'pid' => $pid, 'hash' => $hash);
                        switch ($work) {
                            case 'DATA@RULE':
                                $contentArray[$lkey] = spiderData::crawl();
                                unset($suData['sid']);
                                $suData['title'] = addslashes($suData['title']);
                                $suData += array('addtime' => time(), 'status' => '2', 'publish' => '2', 'indexid' => '0', 'pubdate' => '0');
                                // No row is inserted in data-test mode, so
                                // there is no ID to report.
                                $suid = null;
                                spider::$dataTest or $suid = iDB::insert('spider_url', $suData);
                                $contentArray[$lkey]['spider_url'] = $suid;
                                break;
                            case 'WEB@AUTO':
                                $pubArray[] = $suData;
                                break;
                        }
                    }
                }
            }
        }
    }
    $lists = null;
    unset($lists);
    gc_collect_cycles();

    switch ($work) {
        case 'WEB@AUTO':
            return $pubArray;
        case 'DATA@RULE':
            return $contentArray;
        case 'WEB@MANUAL':
            return array('cid' => $cid, 'rid' => $rid, 'pid' => $pid, 'sid' => $sid, 'work' => $work, 'rule' => $rule, 'listsArray' => $listsArray);
        case "shell":
            echo "采集数据统结果:\n";
            print_r($pubCount);
            print_r($pubAllCount);
            echo "全部采集完成....\n";
            iDB::update('spider_project', array('lastupdate' => time()), array('id' => $pid));
            break;
    }
}
/**
 * Prints a "DUMP #<n>" header followed by the tree dump of every node in
 * the current stack.
 *
 * <n> is phpQuery::$dumpCount, which is incremented on use.
 * phpQuery::$debug is switched off while the nodes are printed and restored
 * to its previous value afterwards.
 *
 * @return $this
 */
public function dumpTree()
{
    echo 'DUMP #', phpQuery::$dumpCount++, ' ';

    $previousDebug = phpQuery::$debug;
    phpQuery::$debug = false;

    foreach ($this->stack() as $element) {
        echo $this->__dumpTree($element);
    }

    phpQuery::$debug = $previousDebug;

    return $this;
}