/**
 * Wrap each server-side component of the owner's template in a numbered
 * template spot, then reload the owner's template from the rewritten markup.
 *
 * Elements matching `.xepan-component` that also carry the class
 * `xepan-serverside-component` get their inner HTML surrounded by
 * `{<template name>_<n>}` ... `{/}`, where <n> is taken from (and advances)
 * `$this->spots`. Afterwards the two ATK header marker strings are removed
 * and the `<page>_active` tag is set to 'active' when the template has it.
 */
function createSpots()
{
    // TODO :: Some caching ??
    $query = new phpQuery();
    $this->pq = $query;
    $document = $query->newDocument($this->owner->template->template_source);
    $this->dom = $document;

    // Any owner that is not the Frontend gets its root flagged as page content.
    if (!($this->owner instanceof \Frontend)) {
        $query->pq($document)->attr('xepan-page-content', 'true');
        $query->pq($document)->addClass('xepan-page-content');
    }

    foreach ($document['.xepan-component'] as $element) {
        $component = $query->pq($element);
        if ($component->hasClass('xepan-serverside-component')) {
            $spot_number = $this->spots++;
            $original_markup = $component->html();
            $component->html(
                sprintf('{%s_%s}%s{/}', $this->owner->template->name, $spot_number, $original_markup)
            );
        }
    }

    // Strip the comment markers that hide the ATK header block.
    $content = str_replace(
        array('<!--xEpan-ATK-Header-Start', 'xEpan-ATK-Header-End-->'),
        '',
        $this->updateBaseHrefForTemplates()
    );

    $this->owner->template->loadTemplateFromString($content);
    $this->owner->template->trySet($this->app->page . '_active', 'active');
}
/**
 * Instantiate the view behind every server-side component found in a page's
 * content and, unless a cut request is being served, replace the component's
 * inner HTML with the HTML rendered by that view.
 *
 * A component is any element with `data-is-serverside-component=true`; its
 * `data-responsible-namespace` and `data-responsible-view` attributes select
 * the view class, and all of its attributes are handed to the view.
 *
 * @param mixed $obj  Not used by this method.
 * @param mixed $page Array-like page record; its 'content' entry is read and
 *                    then overwritten with the processed markup.
 *                    NOTE(review): $page is received by value, so the final
 *                    write only reaches the caller if $page is an object —
 *                    confirm against the plugin dispatcher.
 */
function Plugins_RunServerSideComponent($obj, $page)
{
    include_once getcwd() . '/lib/phpQuery.php';

    $pq = new \phpQuery();
    $doc = $pq->newDocument($page['content']);

    // empty() gives the same truth value as the former "!$_GET[...]" tests
    // without raising an undefined-index warning when the key is absent.
    $render_components = empty($_GET['cut_object']) && empty($_GET['cut_page']);

    foreach ($doc['[data-is-serverside-component=true]'] as $ssc) {
        $node = $pq->pq($ssc);

        // Copy every attribute of the element so the view can reproduce them.
        $options = array();
        foreach ($ssc->attributes as $attrName => $attrNode) {
            $options[$attrName] = $node->attr($attrName);
        }

        $namespace = $node->attr('data-responsible-namespace');
        $view = $node->attr('data-responsible-view');

        // NOTE(review): $namespace and $view come straight from page content
        // and are used to build a file path — confirm they are trusted.
        $component_file = getcwd() . DS . 'epan-components' . DS . $namespace . DS . 'lib' . DS . 'View' . DS . 'Tools' . DS . str_replace("View_Tools_", "", $view) . '.php';

        if (!file_exists($component_file)) {
            $temp_view = $this->owner->add('View_Error')->set("Server Side Component Not Found :: {$namespace}/{$view}");
        } else {
            $temp_view = $this->owner->add("{$namespace}/{$view}", array('html_attributes' => $options, 'data_options' => $node->attr('data-options')));
        }

        if ($render_components) {
            $html = $temp_view->getHTML();
            $node->html("")->append($html);
        }
    }

    $page['content'] = $doc->htmlOuter();
}
/**
 * Walk every `table.fileinfo` in the loaded DOM and add one infoblock
 * element per table, named after the table's second link and carrying that
 * link's href in the LINK property.
 *
 * @return bool Always true.
 */
public function go()
{
    try {
        $this->checker = \Ns\Bitrix\Helper::Create('iblock')->useVariant('checker');
    } catch (\Exception $exception) {
        // Report the failure and carry on without a checker.
        prentExpection($exception->getMessage());
    }

    foreach ($this->dom->find("table.fileinfo") as $node) {
        // The second anchor (index 1) of the table is the document link.
        $documentLink = \phpQuery::pq($node)->find('a:eq(1)');

        $this->arFields = array(
            "IBLOCK_ID" => self::ALFADOCUMENTS_IBLOCK_ID,
            "NAME" => $documentLink->text(),
            "PROPERTY_VALUES" => array(
                "LINK" => $documentLink->attr("href"),
            ),
        );

        prent($this->arFields);
        $this->Add();
    }

    return true;
}
/**
 * Render a menu item through the parent walker, then inject the
 * "background image" and "background orientation" fields in front of the
 * item's `a.item-delete` link.
 *
 * @param string $output            Accumulated menu markup; modified in place.
 * @param object $object            Menu item being rendered; its ID is compared
 *                                  with the id of the last `li.menu-item`.
 * @param int    $depth             Passed through to the parent walker.
 * @param array  $args              Passed through to the parent walker.
 * @param int    $current_object_id Passed through to the parent walker.
 */
function start_el(&$output, $object, $depth = 0, $args = array(), $current_object_id = 0)
{
    // Let the parent append the next menu element to $output first.
    parent::start_el($output, $object, $depth, $args, $current_object_id);

    // phpQuery is loaded at the last moment to minimise the chance of conflicts.
    if (!class_exists('phpQuery')) {
        require_once 'phpQuery-onefile.php';
    }

    $document = phpQuery::newDocumentHTML($output);
    // ":last" matters: $output holds every menu element rendered so far.
    $item = phpQuery::pq('li.menu-item:last');

    // Safety net: if the last <li> is not the item being rendered, do nothing.
    $menu_item_id = str_replace('menu-item-', '', $item->attr('id'));
    if ($menu_item_id != $object->ID) {
        return;
    }

    // Previously saved meta for the post (a menu item is just a post type).
    $curr_bg = esc_attr(get_post_meta($menu_item_id, 'snpshpwp_menu_item_bg', TRUE));
    $curr_bg_pos = esc_attr(get_post_meta($menu_item_id, 'snpshpwp_menu_item_bg_pos', TRUE));
    $upload_button = '<span class="button media_upload_button" id="snpshpwp_upload_' . $menu_item_id . '">' . __('Upload', 'snpshpwp') . '</span>';

    // Option value => translated label, in display order.
    $orientations = array(
        'left-landscape' => __('Left Landscape', 'snpshpwp'),
        'left-portraid' => __('Left Portraid', 'snpshpwp'),
        'right-landscape' => __('Right Landscape', 'snpshpwp'),
        'right-portraid' => __('Right Portraid', 'snpshpwp'),
        'pattern-repeat' => __('Pattern', 'snpshpwp'),
        'framed-full' => __('Framed', 'snpshpwp'),
    );

    $options_html = '';
    foreach ($orientations as $value => $label) {
        $selected = ($curr_bg_pos == $value) ? ' selected' : '';
        $options_html .= "\n\t\t\t\t\t\t<option value='{$value}'{$selected}>{$label}</option>";
    }

    $fields_html = "\n\t\t\t\t\t<p class='snpshpwp_menu_item_bg description description-thin'>"
        . "\n\t\t\t\t\t<label for='snpshpwp_menu_item_bg_{$menu_item_id}'>" . __('Background image', 'snpshpwp') . "<br/>"
        . "\n\t\t\t\t\t<input type='text' value='{$curr_bg}' name='snpshpwp_menu_item_bg_{$menu_item_id}' /><br/>"
        . "\n\t\t\t\t\t</label>"
        . "\n\t\t\t\t\t{$upload_button}"
        . "\n\t\t\t\t\t</p>"
        . "\n\t\t\t\t\t<p class='snpshpwp_menu_item_bg_pos description description-thin'>"
        . "\n\t\t\t\t\t<label for='snpshpwp_menu_item_bg_{$menu_item_id}'>" . __('Background orientation', 'snpshpwp') . "<br/>"
        . "\n\t\t\t\t\t<select name='snpshpwp_menu_item_bg_pos_{$menu_item_id}'>"
        . $options_html
        . "\n\t\t\t\t\t</select>"
        . "\n\t\t\t\t\t</label>"
        . "\n\t\t\t\t\t</p>"
        . "\n\t\t\t\t\t";

    // Inject the new fields and hand the rewritten markup back.
    $item->find('a.item-delete')->before($fields_html);
    $output = $document->html();
}
/**
 * Save the posted page body into the current page and optionally take a
 * snapshot of it.
 *
 * Reads `body_html`, `length`, `crc32`, `body_attributes` and
 * `take_snapshot` from $_POST. Server-side components in the posted markup
 * are emptied before saving. The method prints "saved" and ends the request.
 *
 * Changes from the previous revision:
 *  - emptied components no longer get an undefined `$html` appended;
 *  - removed the unreachable block after `throw $e` (it called the misspelt
 *    `excute()`); other exceptions still propagate exactly as before;
 *  - dropped the reference assignment from a function result and an unused
 *    duplicate DOM query.
 */
function init()
{
    parent::init();

    if (!$this->api->auth->isLoggedIn()) {
        $this->js()->univ()->errorMessage('You Are Not Logged In')->execute();
    }

    // Integrity checks on the transferred body.
    // NOTE(review): mismatches are reported through successMessage(); that
    // looks unintended but is kept as-is — confirm.
    $calculated_length = strlen($_POST['body_html']);
    if ($_POST['length'] != $calculated_length) {
        $this->js()->univ()->successMessage('Length send ' . $_POST['length'] . " AND Length calculated again is " . $calculated_length)->execute();
    }

    $calculated_crc = sprintf("%u", crc32($_POST['body_html']));
    if ($_POST['crc32'] != $calculated_crc) {
        $this->js()->univ()->successMessage('CRC send ' . $_POST['crc32'] . " AND CRC calculated again is " . $calculated_crc)->execute();
    }

    try {
        include_once getcwd() . '/lib/phpQuery.php';
        $pq = new \phpQuery();
        $doc = $pq->newDocument(trim($_POST['body_html']));

        // Rendered output of server-side components must not be persisted.
        foreach ($doc['[data-is-serverside-component=true]'] as $ssc) {
            $pq->pq($ssc)->html("");
        }

        $content = $doc->htmlOuter();
        $this->api->current_page['content'] = urldecode(trim($content));
        $this->api->current_page['body_attributes'] = urldecode($_POST['body_attributes']);

        $this->api->exec_plugins('epan-page-before-save', $this->api->current_page);
        $this->api->current_page->save();
        $this->api->exec_plugins('epan-page-after-save', $this->api->current_page);

        if ($_POST['take_snapshot'] == 'Y') {
            $new_version = $this->api->current_page->ref('EpanPageSnapshots');
            foreach (array('title', 'keywords', 'description', 'body_attributes', 'content') as $field) {
                $new_version[$field] = $this->api->current_page[$field];
            }
            $new_version->save();
        }
    } catch (Exception_StopInit $e) {
        // Deliberately ignored; any other exception propagates to the caller.
    }

    echo "saved";
    exit;
}
/**
 * Walk every `span.cat_block` in the loaded DOM, resolve its section and
 * print the fields of each `table.fileinfo` it contains.
 *
 * A span that contains a `<strong>` heading starts a new main section; a
 * span without one reuses the main section of the previous span. Before this
 * fix the test `if ($span->find("strong"))` inspected the query object
 * itself, which is always truthy, so every span looked up a main section
 * from its (possibly empty) heading text.
 *
 * @return bool Always true.
 */
public function go()
{
    try {
        $this->checker = \Ns\Bitrix\Helper::Create('iblock')->useVariant('checker');
    } catch (\Exception $e) {
        prentExpection($e->getMessage());
    }

    // Main section of the most recent heading.
    // NOTE(review): stays null until the first heading is seen — confirm
    // findOrCreateSection() accepts null as its second argument.
    $mainSection = null;

    foreach ($this->dom->find("span.cat_block") as $span) {
        $this->arFields = array();
        $this->arFields["IBLOCK_ID"] = self::ALFATAXES_IBLOCK_ID;

        $span = \phpQuery::pq($span);

        // Only a heading that actually exists opens a new main section.
        $heading = $span->find("strong");
        if ($heading->size() > 0) {
            $mainSection = $this->findOrCreateSection($heading->text());
        }

        $this->arFields["IBLOCK_SECTION_ID"] = $this->findOrCreateSection($span->find("a:eq(0)")->text(), $mainSection);

        foreach ($span->find('table.fileinfo') as $table) {
            // The second anchor (index 1) of the table is the document link.
            $link = \phpQuery::pq($table)->find("a:eq(1)");
            $this->arFields["NAME"] = $link->text();
            $this->arFields["PROPERTY_VALUES"]["LINK"] = $link->attr("href");
            prent($this->arFields);
            // $this->Add();
        }
    }

    return true;
}
/**
 * Save the posted HTML body to the file named by `$_POST['file_path']`.
 *
 * Reads `body_html`, `length`, `crc32` and `file_path` from $_POST. Absolute
 * URLs pointing into the current website are made relative, and when the
 * target lies in the website's layout directory the inner HTML of every
 * `.xepan-page-wrapper` is replaced by the `{$Content}` tag.
 *
 * Fixes over the previous revision:
 *  - the location check had a misplaced parenthesis and effectively compared
 *    against `true`; it now resolves the target directory and requires it
 *    to lie inside the website directory, which also rejects `..` tricks;
 *  - `errorsMessage` typo corrected to `errorMessage`;
 *  - the layout test no longer treats a match at position 0 as "no match";
 *  - a failed write is detected (`file_put_contents` returns false, it does
 *    not throw);
 *  - each failed guard returns, in case execute() does not end the request.
 */
function init()
{
    parent::init();

    if (!$this->api->auth->isLoggedIn()) {
        $this->js()->univ()->errorMessage('You Are Not Logged In')->execute();
        return;
    }

    // Integrity checks on the transferred body.
    $calculated_length = strlen($_POST['body_html']);
    if ($_POST['length'] != $calculated_length) {
        $this->js()->univ()->errorMessage('Length send ' . $_POST['length'] . " AND Length calculated again is " . $calculated_length)->execute();
        return;
    }

    $calculated_crc = sprintf("%u", crc32($_POST['body_html']));
    if ($_POST['crc32'] != $calculated_crc) {
        $this->js()->univ()->errorMessage('CRC send ' . $_POST['crc32'] . " AND CRC calculated again is " . $calculated_crc)->execute();
        return;
    }

    // The target directory must exist (file_put_contents does not create
    // it) and must resolve to a location inside the current website.
    $website_root = realpath('websites/' . $this->app->current_website_name);
    $target_dir = realpath(dirname($_POST['file_path']));
    if (
        $website_root === false
        || $target_dir === false
        || strpos($target_dir . DIRECTORY_SEPARATOR, $website_root . DIRECTORY_SEPARATOR) !== 0
    ) {
        $this->js()->univ()->errorMessage('You cannot save in this location')->execute();
        return;
    }

    $html_content = urldecode(trim($_POST['body_html']));

    // convert all absolute url to relative
    $domain = $this->app->pm->base_url . $this->app->pm->base_path . 'websites/' . $this->app->current_website_name . '/www/';
    $html_content = str_replace($domain, '', $html_content);

    // add {$Content} tag if its template being saved
    $layout_dir = $this->app->pm->base_path . 'websites/' . $this->app->current_website_name . '/www/layout/';
    if (strpos($_POST['file_path'], $layout_dir) !== false) {
        $this->pq = $pq = new phpQuery();
        $this->dom = $dom = $pq->newDocument($html_content);
        foreach ($dom['.xepan-page-wrapper'] as $wrapper) {
            $pq->pq($wrapper)->html('{$Content}');
        }
        $html_content = $dom->html();
    }

    try {
        if (file_put_contents($_POST['file_path'], $html_content) === false) {
            throw new \Exception('Could not write the file');
        }
        $this->js()->_selectorDocument()->univ()->successMessage("Content Saved")->execute();
    } catch (\Exception $e) {
        $this->js()->_selectorDocument()->univ()->errorMessage($e->getMessage())->execute();
    }
}
/**
 * Walk every row of `table.mainfnt` in the loaded DOM and add one infoblock
 * element per data row.
 *
 * The first six cells of a row are copied, in order, into the CITY, ADDRESS,
 * LOCATION, CURRENCY, OPERATION_TIME and STATUS properties. The header row
 * is recognised by its LOCATION text and is not added.
 *
 * @return bool Always true.
 */
public function go()
{
    try {
        $this->checker = \Ns\Bitrix\Helper::Create('iblock')->useVariant('checker');
    } catch (\Exception $exception) {
        prentExpection($exception->getMessage());
    }

    // Cell position => property code.
    $columns = array("CITY", "ADDRESS", "LOCATION", "CURRENCY", "OPERATION_TIME", "STATUS");

    $rows = \phpQuery::pq($this->dom->find("table.mainfnt"))->find("tr");
    foreach ($rows as $row) {
        $cells = \phpQuery::pq($row);

        foreach ($columns as $position => $property) {
            $this->arFields["PROPERTY_VALUES"][$property] = $cells->find('td:eq(' . $position . ')')->text();
        }

        // The header row carries the column caption instead of a location.
        if ($this->arFields["PROPERTY_VALUES"]["LOCATION"] != "РАСПОЛОЖЕНИЕ") {
            // Compose the name for the infoblock element, then store it.
            $this->arFields["NAME"] = $this->composeName();
            $this->Add();
        }
    }

    return true;
}
/**
 * Add the elements matched by a selector (looked up in this object's
 * document) to the current set.
 *
 * The current elements are saved in `elementsBackup`, the newly found
 * elements are merged in, and the result of `newInstance()` is returned.
 * With an empty selector the object itself is returned unchanged.
 *
 * @param string|phpQueryObject $selector
 * @return phpQueryObject|QueryTemplatesSource|QueryTemplatesParse|QueryTemplatesSourceQuery
 */
public function add($selector = null)
{
    if ($selector) {
        $this->elementsBackup = $this->elements;
        $matched = phpQuery::pq($selector, $this->getDocumentID());
        $this->merge($matched->elements);

        return $this->newInstance();
    }

    return $this;
}
/**
 * Handle a submit event on a form: serialize the form, send it through
 * phpQuery::ajax() and, when the response is successful and a callback is
 * available, run the callback with the received document.
 *
 * Events whose target is not a `<form>` with an `action` attribute are
 * ignored.
 *
 * @param object $e        Event; `target`, `relatedTarget` and `data` are read.
 *                         `data[0]` is the base for resolving the action URL
 *                         and `data[1]` is a fallback callback.
 * @param mixed  $callback Optional callback; replaced by `$e->data[1]` when it
 *                         is empty or not a Callback instance and `data[1]`
 *                         is truthy.
 * @TODO trigger submit for form after form's submit button has a click event
 */
public static function handleSubmit($e, $callback = null)
{
    $form = phpQuery::pq($e->target);
    $isSubmittableForm = $form->is('form') && $form->is('[action]');
    if (!$isSubmittableForm) {
        return;
    }

    // TODO document.location
    $xhr = null;
    if (isset($form->document->xhr)) {
        $xhr = $form->document->xhr;
    }

    // Prefer the element that triggered the event when it is a submit
    // control, otherwise fall back to the form's first submit control.
    if (pq($e->relatedTarget)->is(':submit')) {
        $submit = $e->relatedTarget;
    } else {
        $submit = $form->find('*:submit:first')->get(0);
    }

    $data = array();
    foreach ($form->serializeArray($submit) as $field) {
        // XXXt.c maybe $node->not(':submit')->add($sumit) would be better ?
        $data[$field['name']] = $field['value'];
    }

    $method = $form->attr('method');
    $options = array(
        'type' => $method ? $method : 'GET',
        'url' => resolve_url($e->data[0], $form->attr('action')),
        'data' => $data,
        'referer' => $form->document->location,
    );

    $enctype = $form->attr('enctype');
    if ($enctype) {
        $options['contentType'] = $enctype;
    }

    $xhr = phpQuery::ajax($options, $xhr);

    $hasUsableCallback = $callback && $callback instanceof Callback;
    if (!$hasUsableCallback && $e->data[1]) {
        $callback = $e->data[1];
    }

    if ($xhr->getLastResponse()->isSuccessful() && $callback) {
        phpQuery::callbackRun($callback, array(self::browserReceive($xhr)));
    }
}
/**
 * Extract content from an HTML string according to a data rule.
 *
 * Two modes, selected by `$data['dom']`:
 *  - DOM mode: `$data['rule']` is either `selector@attribute` or up to three
 *    newline-separated parts (selector, phpQuery method, argument); the
 *    method defaults to `html`.
 *  - Regex mode: the rule `<%content%>` returns `$html` unchanged; otherwise
 *    the rule is converted by spiderTools::pregTag() and, if it looks like a
 *    pattern, matched against `$html` and the `content` group returned. A
 *    rule that does not look like a pattern is returned as-is.
 *
 * With `$data['multi']` all matches are collected (stopping at the first
 * duplicate) and joined with `#--iCMS.PageBreak--#`.
 *
 * Fixes: `preg_match()` no longer receives the undefined variable
 * `$PREG_SET_ORDER` as its flags argument, and missing rule parts, hash
 * entries, `$_GET['pq_debug']` and the `content` group are no longer read
 * as undefined offsets. Returned values are unchanged.
 *
 * @param string $html Markup to search.
 * @param array  $data Rule description; keys read: 'dom', 'rule', 'multi'.
 * @param mixed  $rule Not used by this method.
 * @return mixed Extracted content (null when a single regex match fails).
 */
public static function match($html, $data, $rule)
{
    $match_hash = array();

    if ($data['dom']) {
        iPHP::import(iPHP_LIB . '/phpQuery.php');
        if (spider::$dataTest && !empty($_GET['pq_debug'])) {
            phpQuery::$debug = 1;
        }

        $doc = phpQuery::newDocumentHTML($html, 'UTF-8');

        if (strpos($data['rule'], '@') !== false) {
            list($content_dom, $content_attr) = explode("@", $data['rule']);
            $content_fun = 'attr';
        } else {
            // Pad so that a rule with fewer than three lines yields '' parts.
            list($content_dom, $content_fun, $content_attr) = array_pad(explode("\n", $data['rule']), 3, '');
        }
        $content_dom = trim($content_dom);
        $content_fun = trim($content_fun);
        $content_attr = trim($content_attr);
        $content_fun or $content_fun = 'html';

        if ($data['multi']) {
            $conArray = array();
            $_content = null;
            foreach ($doc[$content_dom] as $doc_key => $doc_value) {
                if ($content_attr) {
                    $_content = phpQuery::pq($doc_value)->{$content_fun}($content_attr);
                } else {
                    $_content = phpQuery::pq($doc_value)->{$content_fun}();
                }
                // Stop at the first repeated value.
                $cmd5 = md5($_content);
                if (isset($match_hash[$cmd5])) {
                    break;
                }
                $conArray[$doc_key] = $_content;
                $match_hash[$cmd5] = true;
            }
            if (spider::$dataTest) {
                echo "<b>多条匹配结果:</b><pre>";
                print_r($match_hash);
                echo "</pre><hr />";
            }
            $content = implode('#--iCMS.PageBreak--#', $conArray);
            unset($conArray, $_content, $match_hash);
        } else {
            if ($content_attr) {
                $content = $doc[$content_dom]->{$content_fun}($content_attr);
            } else {
                $content = $doc[$content_dom]->{$content_fun}();
            }
        }

        phpQuery::unloadDocuments($doc->getDocumentID());
        unset($doc);
    } else {
        if (trim($data['rule']) == '<%content%>') {
            $content = $html;
        } else {
            $data_rule = spiderTools::pregTag($data['rule']);
            // Only treat the rule as a pattern when it contains pattern syntax.
            if (preg_match('/(<\\w+>|\\.\\*|\\.\\+|\\\\d|\\\\w)/i', $data_rule)) {
                if ($data['multi']) {
                    preg_match_all('|' . $data_rule . '|is', $html, $matches, PREG_SET_ORDER);
                    $conArray = array();
                    foreach ((array) $matches as $mkey => $mat) {
                        $cmd5 = md5($mat['content']);
                        if (isset($match_hash[$cmd5])) {
                            break;
                        }
                        $conArray[$mkey] = $mat['content'];
                        $match_hash[$cmd5] = true;
                    }
                    if (spider::$dataTest) {
                        echo "<b>多条匹配结果:</b><pre>";
                        print_r($match_hash);
                        echo "</pre><hr />";
                    }
                    $content = implode('#--iCMS.PageBreak--#', $conArray);
                    unset($conArray, $match_hash);
                } else {
                    preg_match('|' . $data_rule . '|is', $html, $matches);
                    $content = isset($matches['content']) ? $matches['content'] : null;
                }
            } else {
                $content = $data_rule;
            }
        }
    }

    return $content;
}
/**
 * Harvest e-mail addresses from a page, from the pages it links to, and from
 * the "contact" pages those link to.
 *
 * Steps:
 *  1. store the addresses found in `$content` under the host/path of `$url`;
 *  2. collect the links selected by `$regex_selector`, fetch them with
 *     multi_request() and store the addresses of each fetched page;
 *  3. collect the links containing "contact" on those pages, fetch them and
 *     store their addresses as well.
 *
 * Results go to `$this->grabbed_data[<host>][<path . query>]`. Any exception
 * ends the harvest silently (best effort), as before.
 *
 * Fixes: follow-up URLs were built as `host//path` for absolute paths and
 * had the query glued on without `?`; hrefs that parse_url() rejects are
 * now skipped; missing URL parts and results are no longer read as
 * undefined offsets; parsed documents are unloaded once their links are
 * extracted; the dead first pattern and unused timing variables are gone.
 *
 * @param string $url            Address `$content` was loaded from.
 * @param string $content        Markup of the start page.
 * @param string $regex_selector phpQuery selector for the links to follow.
 */
function grab($url, $content, $regex_selector)
{
    try {
        // get Emails and Mobile Number and ...
        $pattern = '/[a-z0-9_\\-\\+\\.]{1,80}+@[a-z0-9\\-]{1,80}+\\.([a-z]{2,3})(?:\\.[a-z]{2})?/i';

        $parsed_url = parse_url($url);
        $this->grabStoreEmails($parsed_url, $pattern, $content);

        $pq = new phpQuery();
        // "@" kept from the original call.
        $doc = @$pq->newDocumentHTML($content);
        $unique_filtered_links = $this->grabCollectLinks($pq, $doc[$regex_selector], $parsed_url);
        phpQuery::unloadDocuments($doc->getDocumentID());

        $results = $this->multi_request($unique_filtered_links);

        $contact_us_pages = array();
        foreach ($unique_filtered_links as $id => $site_url) {
            // somehow if no result was found just carry on
            if (empty($results[$id])) {
                continue;
            }

            $parsed_url = parse_url($site_url);
            $this->grabStoreEmails($parsed_url, $pattern, $results[$id]);

            $doc = @$pq->newDocumentHTML($results[$id]);
            $contact_us_pages = $this->grabCollectLinks($pq, $doc['a:contains("contact")'], $parsed_url, $contact_us_pages);
            phpQuery::unloadDocuments($doc->getDocumentID());
        }

        $results = $this->multi_request($contact_us_pages);
        foreach ($results as $id => $contact_page_content) {
            if (!$contact_page_content) {
                continue;
            }
            $contact_url = isset($contact_us_pages[$id]) ? $contact_us_pages[$id] : '';
            $this->grabStoreEmails(parse_url($contact_url), $pattern, $contact_page_content);
        }
    } catch (Exception $e) {
        return;
    }
}

/**
 * Store every address matching `$pattern` in `$html` under the host and
 * path+query of an already parsed URL (empty strings for missing parts).
 */
private function grabStoreEmails($parsed_url, $pattern, $html)
{
    $parts = is_array($parsed_url) ? $parsed_url : array();
    $host = isset($parts['host']) ? $parts['host'] : '';
    $page = (isset($parts['path']) ? $parts['path'] : '') . (isset($parts['query']) ? $parts['query'] : '');

    preg_match_all($pattern, (string) $html, $email_found);
    $this->grabbed_data[$host][$page] = $email_found[0];
}

/**
 * Turn anchors into absolute URLs (scheme and host default to those of
 * `$base_url`), skip download-type links, and append the URLs not yet in
 * `$links`. Returns the extended list.
 */
private function grabCollectLinks($pq, $anchors, $base_url, array $links = array())
{
    $base = is_array($base_url) ? $base_url : array();

    foreach ($anchors as $a) {
        $href = (string) $pq->pq($a)->attr('href');

        // Binary downloads are not worth fetching.
        if (preg_match('/(\\.pdf|\\.exe|\\.msi|\\.zip|\\.rar|\\.gz|\\.tar|\\.flv|\\.mov|\\.mpg|\\.mpeg)/i', $href)) {
            continue;
        }

        $target = parse_url($href);
        if (!is_array($target)) {
            continue;
        }

        $scheme = !empty($target['scheme']) ? $target['scheme'] : (isset($base['scheme']) ? $base['scheme'] : '');
        $host = !empty($target['host']) ? $target['host'] : (isset($base['host']) ? $base['host'] : '');
        $path = isset($target['path']) ? ltrim($target['path'], '/') : '';

        $new_url = $scheme . '://' . $host . '/' . $path;
        if (isset($target['query']) && $target['query'] !== '') {
            $new_url .= '?' . $target['query'];
        }

        if (!in_array($new_url, $links, true)) {
            $links[] = $new_url;
        }
    }

    return $links;
}
/**
 * Decide whether fetched content is acceptable.
 *
 * The content is rejected when `spider::$content_right_code` is set and not
 * found, or when `spider::$content_error_code` is set and found. A code
 * containing `DOM::` is treated as a phpQuery selector (with the prefix
 * removed); any other code is searched for as a plain substring.
 *
 * Fix: the right-code check used to `unset($content)`, so when both codes
 * were configured the error-code check ran against an undefined variable
 * and could never detect the error marker.
 *
 * @param string $content Fetched markup.
 * @return bool True when the content passes both checks.
 */
public static function check_content_code($content)
{
    if (spider::$content_right_code && !self::content_code_found($content, spider::$content_right_code)) {
        return false;
    }

    if (spider::$content_error_code && self::content_code_found($content, spider::$content_error_code)) {
        return false;
    }

    return true;
}

/**
 * Tell whether `$code` occurs in `$content`, either as a `DOM::` selector
 * producing non-empty markup or as a plain substring.
 */
private static function content_code_found($content, $code)
{
    if (strpos($code, 'DOM::') !== false) {
        iPHP::import(iPHP_LIB . '/phpQuery.php');
        $doc = phpQuery::newDocumentHTML($content, 'UTF-8');
        $selector = str_replace('DOM::', '', $code);
        // The selection counts as found when its string form is truthy.
        $found = (bool) (string) phpQuery::pq($selector);
        phpQuery::unloadDocuments($doc->getDocumentID());

        return $found;
    }

    return strpos($content, $code) !== false;
}
/**
 * Import terminals from the `table#tbl` inside `div#catalog`: one infoblock
 * element per data row, enriched with the currencies and map coordinates
 * read from the terminal's detail page on www.alfabank.by.
 *
 * Fixes: the currency string is initialised before it is appended to; a
 * failed download of the detail page leaves CURRENCY and POINT empty instead
 * of parsing `false`; each detail document is unloaded after use; the
 * leftover `var_dump($index)` and a `find()` chain whose result was
 * discarded are removed.
 *
 * @return bool Always true.
 */
public function go()
{
    try {
        $this->checker = \Ns\Bitrix\Helper::Create('iblock')->useVariant('checker');
    } catch (\Exception $e) {
        prentExpection($e->getMessage());
    }

    $table = \phpQuery::pq($this->dom->find("div#catalog")->find("table#tbl"));
    $index = 0;

    foreach ($table->find("tr") as $tr) {
        // The first row is skipped.
        if (++$index == 1) {
            continue;
        }

        $this->arFields = array();
        $this->arFields["IBLOCK_ID"] = self::ALFATERMINALS_IBLOCK_ID;
        $this->arFields["IBLOCK_SECTION_ID"] = ($this->type == "alfa") ? self::ALFATERMINALS_SECTION_ID : self::SBSTERMINALS_SECTION_ID;

        // Read the terminal information from the row.
        $tr = \phpQuery::pq($tr);

        // 1. Name (and the link to the detail page)
        $this->arFields["NAME"] = $tr->find('td:eq(0)')->find("a")->text();
        $infoLink = $tr->find('td:eq(0)')->find("a")->attr("href");

        // 2. State
        $this->arFields["PROPERTY_VALUES"]["STATE"] = $tr->find('td:eq(1)')->text();

        // 3. City as link
        $this->arFields["PROPERTY_VALUES"]["BIND_CITY"] = $this->findCity($tr->find('td:eq(2)')->text());

        // 4. City text
        $this->arFields["PROPERTY_VALUES"]["CITY"] = $tr->find('td:eq(2)')->text();

        // 5. Address
        $this->arFields["PROPERTY_VALUES"]["ADDRESS"] = $tr->find('td:eq(3)')->text();

        // 6. Operating mode
        $this->arFields["PROPERTY_VALUES"]["OPERATION_TIME"] = $tr->find('td:eq(4)')->text();

        // Work types derived from the operating mode text.
        try {
            $this->arFields["PROPERTY_VALUES"]["WORK_TYPES"] = $this->findWorkType($this->arFields["PROPERTY_VALUES"]["OPERATION_TIME"]);
        } catch (\Exception $e) {
            prentExpection($e->getMessage());
        }

        // Currencies and coordinates come from the detail page.
        $strCurrency = '';
        $coordinates = '';

        $html = file_get_contents("http://www.alfabank.by" . $infoLink);
        if ($html !== false) {
            $fullInfo = \phpQuery::newDocumentHTML($html);
            $info = \phpQuery::pq($fullInfo->find("div.content"));

            // Compose currency string
            foreach ($info->find("div.section.s4")->find('ul')->find('li') as $li) {
                $strCurrency .= \phpQuery::pq($li)->text() . " ";
            }

            // Coordinates, stored as the raw text of the map data block.
            $coordinates = $info->find("div.section:eq(5)")->find("div.ya_map_data")->text();

            // Release the detail document; only plain strings are kept.
            \phpQuery::unloadDocuments($fullInfo->getDocumentID());
        }

        $this->arFields["PROPERTY_VALUES"]["CURRENCY"] = trim($strCurrency);
        $this->arFields["PROPERTY_VALUES"]["POINT"] = $coordinates;

        prent($this->arFields);
        $this->Add();
    }

    return true;
}
/**
 * Decide whether fetched content is acceptable.
 *
 * The content is rejected when `$this->content_right_code` is set and not
 * found, or when `$this->content_error_code` is set and found. A code
 * containing `DOM::` is treated as a phpQuery selector (with the prefix
 * removed); any other code is searched for as a plain substring.
 *
 * Fix: the error-code branch tested and queried `content_right_code`
 * (copy-paste slip), so a `DOM::` error code was never evaluated as a
 * selector. Parsed documents are now unloaded after use.
 *
 * @param string $content Fetched markup.
 * @return array|false False when rejected, otherwise
 *                     array('content' => $content, 'match' => true).
 */
function check_content_code($content)
{
    if ($this->content_right_code) {
        if (strpos($this->content_right_code, 'DOM::') !== false) {
            iPHP::import(iPHP_LIB . '/phpQuery.php');
            $doc = phpQuery::newDocumentHTML($content, 'UTF-8');
            $pq_dom = str_replace('DOM::', '', $this->content_right_code);
            $matches = (bool) (string) phpQuery::pq($pq_dom);
            phpQuery::unloadDocuments($doc->getDocumentID());
        } else {
            $matches = strpos($content, $this->content_right_code);
        }
        if ($matches === false) {
            return false;
        }
    }

    if ($this->content_error_code) {
        if (strpos($this->content_error_code, 'DOM::') !== false) {
            iPHP::import(iPHP_LIB . '/phpQuery.php');
            $doc = phpQuery::newDocumentHTML($content, 'UTF-8');
            $pq_dom = str_replace('DOM::', '', $this->content_error_code);
            $_matches = (bool) (string) phpQuery::pq($pq_dom);
            phpQuery::unloadDocuments($doc->getDocumentID());
        } else {
            $_matches = strpos($content, $this->content_error_code);
        }
        if ($_matches !== false) {
            return false;
        }
    }

    $match = true;
    return compact('content', 'match');
}
/**
 * Crawl the list pages of a spider project/rule and process the links found
 * on them according to `$work`.
 *
 * The list URLs come from (later sources win) the rule, the project,
 * `spiderUrls::$urls` and `$_urls`. A line containing `RULE@rid@url` is
 * expanded by crawling `url` with rule `rid`; a line containing `<...>` is
 * expanded by spiderTools::mkurls().
 *
 * Behaviour per `$work`:
 *  - 'shell'      : publish every new link, print progress and counters,
 *                   update the project's `lastupdate`; returns nothing.
 *  - 'WEB@MANUAL' : returns an array with cid/rid/pid/sid/work/rule and the
 *                   per-URL lists in 'listsArray'.
 *  - 'WEB@AUTO'   : returns the list of link records to publish.
 *  - 'DATA@RULE'  : returns the crawled content per link.
 * With `$callback === 'CALLBACK@URL'` the URLs of the FIRST list page are
 * returned immediately.
 *
 * Fixes: `preg_match()` no longer receives the undefined variable
 * `$PREG_SET_ORDER` as flags; `$sid`, `$suid`, `$prule_list_url`,
 * `$listsArray` and `$contentArray` are initialised (to the null they
 * evaluated to before); `$list_area` is not unset before the test output
 * that prints it; the ANSI colour codes of the shell banner carry their
 * escape character.
 *
 * @param string|null $work     Work mode, see above.
 * @param mixed       $pid      Project id; NULL falls back to spider::$pid.
 * @param mixed       $_rid     Rule id used when the project supplies none.
 * @param string|null $_urls    Newline-separated list URLs overriding all others.
 * @param string|null $callback 'CALLBACK@URL' to get the URL list only.
 * @return mixed
 */
public static function crawl($work = NULL, $pid = NULL, $_rid = NULL, $_urls = null, $callback = null)
{
    // Explicit defaults for names that were previously read while undefined.
    $project = array();
    $prule_list_url = null;
    $sid = null;
    $suid = null;
    $listsArray = null;
    $contentArray = null;

    $pid === NULL && ($pid = spider::$pid);
    if ($pid) {
        $project = spider::project($pid);
        $cid = $project['cid'];
        $rid = $project['rid'];
        $prule_list_url = $project['list_url'];
        $lastupdate = $project['lastupdate'];
    } else {
        $cid = spider::$cid;
        $rid = spider::$rid;
    }
    if (empty($rid) && $_rid !== NULL) {
        $rid = $_rid;
    }

    if ($work == 'shell') {
        $lastupdate = $project['lastupdate'];
        // Respect the project's minimum interval between runs.
        if ($project['psleep']) {
            if (time() - $lastupdate < $project['psleep']) {
                echo '采集方案[' . $pid . "]:" . format_date($lastupdate) . "刚采集过了,请" . $project['psleep'] / 3600 . "小时后在继续采集\n";
                return;
            }
        }
        echo "\033[32m开始采集方案[" . $pid . "] 采集规则[" . $rid . "]\033[0m\n";
    }

    $ruleA = spider::rule($rid);
    $rule = $ruleA['rule'];

    // Later sources override earlier ones.
    $urls = $rule['list_urls'];
    $project['urls'] && ($urls = $project['urls']);
    spiderUrls::$urls && ($urls = spiderUrls::$urls);
    $_urls && ($urls = $_urls);

    $urlsArray = explode("\n", $urls);
    $urlsArray = array_filter($urlsArray);
    $_urlsArray = $urlsArray;
    $urlsList = array();
    if ($work == 'shell') {
        print_r($urlsArray);
    }

    foreach ($_urlsArray as $_key => $_url) {
        $_url = htmlspecialchars_decode($_url);
        $_urlsList = array();
        // RULE@rid@url: crawl "url" with rule [rid] and use the resulting list.
        if (strpos($_url, 'RULE@') !== false) {
            list($___s, $_rid, $_urls) = explode('@', $_url);
            if (spider::$ruleTest) {
                print_r('<b>使用[rid:' . $_rid . ']规则抓取列表</b>:' . $_urls);
                echo "<hr />";
            }
            $_urlsList = spiderUrls::crawl($work, false, $_rid, $_urls, 'CALLBACK@URL');
            $urlsList = array_merge($urlsList, $_urlsList);
            unset($urlsArray[$_key]);
        } else {
            // "<format,begin,num,step,zeroize,reverse>" expands to a URL series.
            preg_match('|.*<(.*)>.*|is', $_url, $_matches);
            if ($_matches) {
                list($format, $begin, $num, $step, $zeroize, $reverse) = explode(',', $_matches[1]);
                $url = str_replace($_matches[1], '*', trim($_matches[0]));
                $_urlsList = spiderTools::mkurls($url, $format, $begin, $num, $step, $zeroize, $reverse);
                unset($urlsArray[$_key]);
                $urlsList = array_merge($urlsList, $_urlsList);
            }
        }
    }
    $urlsList && ($urlsArray = array_merge($urlsArray, $urlsList));
    unset($_urlsArray, $_key, $_url, $_matches, $_urlsList, $urlsList);
    $urlsArray = array_unique($urlsArray);

    if (empty($urlsArray)) {
        if ($work == 'shell') {
            echo "采集列表为空!请填写!\n";
            return false;
        }
        iPHP::alert('采集列表为空!请填写!', 'js:parent.window.iCMS_MODAL.destroy();');
    }

    // Rule mode "2" means the list is extracted with phpQuery selectors.
    if ($rule['mode'] == "2") {
        iPHP::import(iPHP_LIB . '/phpQuery.php');
        if (spider::$ruleTest && !empty($_GET['pq_debug'])) {
            phpQuery::$debug = 1;
        }
    }

    $pubArray = array();
    $pubCount = array();
    $pubAllCount = array();
    spider::$curl_proxy = $rule['proxy'];
    spider::$urlslast = null;

    foreach ($urlsArray as $key => $url) {
        $url = trim($url);
        spider::$urlslast = $url;
        if ($work == 'shell') {
            echo '开始采集列表:' . $url . "\n";
        }
        if (spider::$ruleTest) {
            echo '<b>抓取列表:</b>' . $url . "<br />";
        }

        $html = spiderTools::remote($url);
        if (empty($html)) {
            continue;
        }

        if ($rule['mode'] == "2") {
            $doc = phpQuery::newDocumentHTML($html, 'UTF-8');
            $list_area = $doc[trim($rule['list_area_rule'])];
            if ($rule['list_area_format']) {
                $list_area_format = trim($rule['list_area_format']);
                if (strpos($list_area_format, 'ARRAY::') !== false) {
                    // ARRAY:: applies the selector to each list area separately.
                    $list_area_format = str_replace('ARRAY::', '', $list_area_format);
                    $lists = array();
                    foreach ($list_area as $la_key => $la) {
                        $lists[] = phpQuery::pq($list_area_format, $la);
                    }
                } else {
                    $lists = phpQuery::pq($list_area_format, $list_area);
                }
            } else {
                $lists = $list_area;
            }
        } else {
            $list_area_rule = spiderTools::pregTag($rule['list_area_rule']);
            if ($list_area_rule) {
                preg_match('|' . $list_area_rule . '|is', $html, $matches);
                $list_area = isset($matches['content']) ? $matches['content'] : null;
            } else {
                $list_area = $html;
            }
            $html = null;
            unset($html);
            if (spider::$ruleTest) {
                echo iS::escapeStr($rule['list_area_rule']);
                echo "<hr />";
            }
            if ($rule['list_area_format']) {
                $list_area = spiderTools::dataClean($rule['list_area_format'], $list_area);
            }
            preg_match_all('|' . spiderTools::pregTag($rule['list_url_rule']) . '|is', $list_area, $lists, PREG_SET_ORDER);
            // Release the area text; the variable stays defined (as null)
            // because the test output below still prints it.
            $list_area = null;
            if ($rule['sort'] == "1") {
                //arsort($lists);
            } elseif ($rule['sort'] == "2") {
                asort($lists);
            } elseif ($rule['sort'] == "3") {
                shuffle($lists);
            }
        }

        if (spider::$ruleTest) {
            echo '<b>列表区域规则:</b>' . iS::escapeStr($rule['list_area_rule']);
            echo "<hr />";
            echo '<b>列表区域抓取结果:</b>' . iS::escapeStr($list_area);
            echo "<hr />";
            echo '<b>列表链接规则:</b>' . iS::escapeStr($rule['list_url_rule']);
            echo "<hr />";
            echo '<b>网址合成规则:</b>' . iS::escapeStr($rule['list_url']);
            echo "<hr />";
        }

        // A project-level list URL pattern overrides the rule's.
        if ($prule_list_url) {
            $rule['list_url'] = $prule_list_url;
        }

        // Callback mode: return the URL list of this (first) page only.
        if ($callback == 'CALLBACK@URL') {
            $cbListUrl = array();
            foreach ($lists as $lkey => $row) {
                list(spider::$title, spider::$url) = spiderTools::title_url($row, $rule, $url);
                if (spider::$url === false) {
                    continue;
                }
                $cbListUrl[] = spider::$url;
            }
            return $cbListUrl;
        }

        if ($work == "shell") {
            $pubCount[$url]['count'] = count($lists);
            $pubAllCount['count'] += $pubCount[$url]['count'];
            echo "开始采集:" . $url . " 列表 " . $pubCount[$url]['count'] . "条记录\n";
            foreach ($lists as $lkey => $row) {
                list(spider::$title, spider::$url) = spiderTools::title_url($row, $rule, $url);
                if (spider::$url === false) {
                    continue;
                }
                $hash = md5(spider::$url);
                echo "title:" . spider::$title . "\n";
                echo "url:" . spider::$url . "\n";
                spider::$rid = $rid;
                $checker = spider::checker($work);
                if ($checker === true) {
                    echo "开始采集....";
                    // Note: from here on $callback holds the publish result.
                    $callback = spider::publish("shell");
                    if ($callback['code'] == "1001") {
                        $pubCount[$url]['success']++;
                        $pubAllCount['success']++;
                        echo "....√\n";
                        if ($project['sleep']) {
                            echo "sleep:" . $project['sleep'] . "s\n";
                            if ($rule['mode'] != "2") {
                                unset($lists[$lkey]);
                            }
                            gc_collect_cycles();
                            sleep($project['sleep']);
                        } else {
                            //sleep(1);
                        }
                    } else {
                        $pubCount[$url]['error']++;
                        $pubAllCount['error']++;
                        echo "error\n\n";
                        continue;
                    }
                }
                $pubCount[$url]['published']++;
                $pubAllCount['published']++;
            }
            if ($rule['mode'] == "2") {
                phpQuery::unloadDocuments($doc->getDocumentID());
            } else {
                unset($lists);
            }
        }

        if ($work == "WEB@MANUAL") {
            $listsArray[$url] = $lists;
        }

        if ($work == "WEB@AUTO" || $work == 'DATA@RULE') {
            foreach ($lists as $lkey => $row) {
                list(spider::$title, spider::$url) = spiderTools::title_url($row, $rule, $url);
                if (spider::$url === false) {
                    continue;
                }
                $hash = md5(spider::$url);
                if (spider::$ruleTest) {
                    echo '<b>列表抓取结果:</b>' . $lkey . '<br />';
                    echo spider::$title . ' (<a href="' . APP_URI . '&do=testdata' . '&url=' . urlencode(spider::$url) . '&rid=' . $rid . '&pid=' . $pid . '&title=' . urlencode(spider::$title) . '" target="_blank">测试内容规则</a>) <br />';
                    echo spider::$url . "<br />";
                    echo $hash . "<br /><hr />";
                } else {
                    if (spider::checker($work) === true || spider::$dataTest) {
                        $suData = array('sid' => 0, 'url' => spider::$url, 'title' => spider::$title, 'cid' => $cid, 'rid' => $rid, 'pid' => $pid, 'hash' => $hash);
                        switch ($work) {
                            case 'DATA@RULE':
                                $contentArray[$lkey] = spiderData::crawl();
                                unset($suData['sid']);
                                $suData['title'] = addslashes($suData['title']);
                                $suData += array('addtime' => time(), 'status' => '2', 'publish' => '2', 'indexid' => '0', 'pubdate' => '0');
                                // In data-test mode nothing is written to the database.
                                spider::$dataTest or $suid = iDB::insert('spider_url', $suData);
                                $contentArray[$lkey]['spider_url'] = $suid;
                                break;
                            case 'WEB@AUTO':
                                $pubArray[] = $suData;
                                break;
                        }
                    }
                }
            }
        }
    }

    $lists = null;
    unset($lists);
    gc_collect_cycles();

    switch ($work) {
        case 'WEB@AUTO':
            return $pubArray;
            break;
        case 'DATA@RULE':
            return $contentArray;
            break;
        case 'WEB@MANUAL':
            return array('cid' => $cid, 'rid' => $rid, 'pid' => $pid, 'sid' => $sid, 'work' => $work, 'rule' => $rule, 'listsArray' => $listsArray);
            break;
        case "shell":
            echo "采集数据统结果:\n";
            print_r($pubCount);
            print_r($pubAllCount);
            echo "全部采集完成....\n";
            iDB::update('spider_project', array('lastupdate' => time()), array('id' => $pid));
            break;
    }
}