/**
 * Crawls every category URL listed in ../file/cats_list.ccd and records the
 * first link found inside each element carrying the "product-container" class.
 *
 * Each harvested href is echoed (followed by "<br/>") and appended, one per
 * line, to ../file/url_list.ccd.
 *
 * @return void
 */
function getProducts()
{
    include "../classes/httpFile.php";
    include "../functions/crawler_functions.php";
    error_reporting(E_ALL ^ E_WARNING);

    $http = new HttpConnection();
    $http->setCookiePath("cookies/");
    $http->init();

    $DOM = new DOMDocument();

    // split() was removed in PHP 7; explode() is the literal-delimiter equivalent.
    $contents = file_get_contents('../file/cats_list.ccd');
    $url_array = ($contents === false) ? array() : explode(PHP_EOL, $contents);

    // Whole-token class match: pad the normalized class list with spaces so the
    // class is found whether it is first, last, in the middle, or alone.
    $classname = "product-container";
    $query = "//*[contains(concat(' ', normalize-space(@class), ' '), ' {$classname} ')]";

    foreach ($url_array as $url) {
        // Skip blank lines (e.g. the trailing newline of the list file).
        $url = trim($url);
        if ($url === '') {
            continue;
        }

        $http->get($url, true);
        get_dom($url, $DOM, $http);

        $finder = new DomXPath($DOM);
        foreach ($finder->query($query) as $p) {
            // A product container without any anchor has nothing to record.
            $anchor = $p->getElementsByTagName('a')->item(0);
            if ($anchor === null) {
                continue;
            }

            $enlace = $anchor->getAttribute('href');
            echo $enlace . '<br/>';
            file_put_contents("../file/url_list.ccd", $enlace . PHP_EOL, FILE_APPEND);
        }
    }

    $http->close();
}
/**
 * Crawls every category URL listed in ../file/cats_list.ccd and records the
 * first link of each element whose id contains "listado_producto_referencia".
 *
 * Each link is appended to ../file/url_list.ccd as an absolute URL (prefixed
 * with $base_url, spaces encoded as %20) and the raw href is echoed followed
 * by "<br />".
 *
 * @return void
 */
function getProducts()
{
    $base_url = 'http://www.marcapl.com/marca/';

    include "../classes/httpFile.php";
    include "../functions/crawler_functions.php";

    $http = new HttpConnection();
    $http->init('galleta');

    $DOM = new DOMDocument();

    $contents = file_get_contents('../file/cats_list.ccd');
    $url_array = ($contents === false) ? array() : explode(PHP_EOL, $contents);

    foreach ($url_array as $url) {
        // Skip empty / whitespace-only lines; trimming also drops a stray "\r"
        // left over when the list file uses CRLF line endings.
        $url = trim($url);
        if ($url === '') {
            echo "ENtrando";
            continue;
        }

        get_dom($url, $DOM, $http);

        $finder = new DomXPath($DOM);
        $listado = $finder->query("//*[contains(@id, 'listado_producto_referencia')]");

        foreach ($listado as $item) {
            // Check the anchor itself BEFORE dereferencing it: the node list is
            // never null, but item(0) is when the element holds no <a>.
            $anchor = $item->getElementsByTagName('a')->item(0);
            if ($anchor === null) {
                continue;
            }

            $enlace = $anchor->getAttribute('href');
            file_put_contents(
                "../file/url_list.ccd",
                $base_url . str_replace(" ", "%20", $enlace) . PHP_EOL,
                FILE_APPEND
            );
            echo $enlace . '<br />';
        }
    }

    $http->close();
}
<?php while (count($url_array) > 0) { $my_url = array_shift($url_array); get_dom($my_url, $DOM, $http); //obtenemos el precio $precio = $DOM->getElementById('our_price_display'); //if($precio==null){ //pequeño fix /*$my_url = try_fix_url($DOM); if($my_url !=null) get_dom($my_url,$DOM,$http); $precio = $DOM->getElementById('our_price_display');*/ if ($precio == null) { echo "producto fallido" . PHP_EOL; $count_failed++; insert_failed($my_url); //ChromePhp::error("producto fallido"); //continue; $precio = 0; $precio_s = 0; } else { //} $precio_s = $precio->nodeValue; $precio_s = preg_replace('/[^0-9,]/', '', $precio_s); } //El lio para buscar las categorias $nodes = null; $finder = new DomXPath($DOM); //$classname="navigation_page"; // $nodes = $finder->query("//*[contains(@class, '$classname')]"); if ($nodes == null) {
/**
 * Get the goods list, filtered and paginated according to request parameters.
 *
 * When get_filter() has no stored result for this parameter string, the filter
 * is rebuilt from $_REQUEST / $_COOKIE, the display-option cookies are
 * refreshed, the record count is queried, and the listing SQL is assembled and
 * stored through set_filter(). Otherwise the stored SQL and filter are reused.
 * Every returned row is augmented with its goods_url rows.
 *
 * NOTE(review): quoted request values interpolated into the SQL below
 * (extension_code, is_on_sale, suppliers_id, supplier_status, start/end time)
 * presumably rely on request data being slash-escaped upstream (the keyword is
 * stripslashes()'d before being stored) — confirm. ORDER BY cannot be protected
 * that way, so sort_by / sort_order are validated here.
 *
 * @access public
 * @param  integer $is_delete   value matched against the is_delete column
 * @param  integer $real_goods  value matched against is_real; no is_real
 *                              condition is added when it is -1 or lower
 * @param  string  $conditions  extra SQL appended verbatim to the WHERE clause
 * @return array   keys: goods, filter, page_count, record_count
 */
function goods_list($is_delete, $real_goods = 1, $conditions = '')
{
    /* Filter conditions */
    $param_str = '-' . $is_delete . '-' . $real_goods;
    $result = get_filter($param_str);

    if ($result === false) {
        $day = getdate();
        $today = local_mktime(23, 59, 59, $day['mon'], $day['mday'], $day['year']);

        $filter = array();
        $filter['cat_id'] = empty($_REQUEST['cat_id']) ? 0 : intval($_REQUEST['cat_id']);
        $filter['intro_type'] = empty($_REQUEST['intro_type']) ? '' : trim($_REQUEST['intro_type']);
        $filter['is_promote'] = empty($_REQUEST['is_promote']) ? 0 : intval($_REQUEST['is_promote']);
        $filter['stock_warning'] = empty($_REQUEST['stock_warning']) ? 0 : intval($_REQUEST['stock_warning']);
        $filter['brand_id'] = empty($_REQUEST['brand_id']) ? 0 : intval($_REQUEST['brand_id']);
        $filter['keyword'] = empty($_REQUEST['keyword']) ? '' : trim($_REQUEST['keyword']);
        $filter['suppliers_id'] = empty($_REQUEST['suppliers_id']) ? '' : trim($_REQUEST['suppliers_id']);
        $filter['is_on_sale'] = isset($_REQUEST['is_on_sale'])
            ? (empty($_REQUEST['is_on_sale']) && $_REQUEST['is_on_sale'] === 0 ? '' : trim($_REQUEST['is_on_sale']))
            : '';

        if (isset($_REQUEST['is_ajax']) && $_REQUEST['is_ajax'] == 1) {
            $filter['keyword'] = json_str_iconv($filter['keyword']);
        }

        // sort_by / sort_order end up unquoted in ORDER BY, so escaping cannot
        // protect them: accept only a plain column identifier and ASC/DESC.
        $sort_by = empty($_REQUEST['sort_by']) ? 'goods_id' : trim($_REQUEST['sort_by']);
        $filter['sort_by'] = preg_match('/^[A-Za-z0-9_.]+$/', $sort_by) ? $sort_by : 'goods_id';
        $sort_order = empty($_REQUEST['sort_order']) ? 'ASC' : strtoupper(trim($_REQUEST['sort_order']));
        $filter['sort_order'] = ($sort_order === 'DESC') ? 'DESC' : 'ASC';

        $filter['extension_code'] = empty($_REQUEST['extension_code']) ? '' : trim($_REQUEST['extension_code']);
        $filter['collect_link'] = empty($_REQUEST['collect_link']) ? '' : trim($_REQUEST['collect_link']);
        $filter['favorite_num'] = empty($_REQUEST['favorite_num']) ? 0 : intval($_REQUEST['favorite_num']);
        $filter['review_num'] = empty($_REQUEST['review_num']) ? 0 : intval($_REQUEST['review_num']);

        // A value with '-' after its first character is treated as a date string
        // and converted; anything else is used as given.
        $filter['start_time'] = empty($_REQUEST['start_time'])
            ? ''
            : (strpos($_REQUEST['start_time'], '-') > 0 ? local_strtotime($_REQUEST['start_time']) : $_REQUEST['start_time']);
        $filter['end_time'] = empty($_REQUEST['end_time'])
            ? ''
            : (strpos($_REQUEST['end_time'], '-') > 0 ? local_strtotime($_REQUEST['end_time']) : $_REQUEST['end_time']);

        // Display options: request value first, then the ECS cookie, else 0.
        $filter['is_show_card'] = isset($_REQUEST['is_show_card'])
            ? trim($_REQUEST['is_show_card'])
            : (isset($_COOKIE['ECS']['is_show_card']) ? $_COOKIE['ECS']['is_show_card'] : 0);
        $filter['is_show_keywords'] = isset($_REQUEST['is_show_keywords'])
            ? trim($_REQUEST['is_show_keywords'])
            : (isset($_COOKIE['ECS']['is_show_keywords']) ? $_COOKIE['ECS']['is_show_keywords'] : 0);
        $filter['is_show_brief'] = isset($_REQUEST['is_show_brief'])
            ? trim($_REQUEST['is_show_brief'])
            : (isset($_COOKIE['ECS']['is_show_brief']) ? $_COOKIE['ECS']['is_show_brief'] : 0);
        $filter['is_show_title_cn'] = isset($_REQUEST['is_show_title_cn'])
            ? trim($_REQUEST['is_show_title_cn'])
            : (isset($_COOKIE['ECS']['is_show_title_cn']) ? $_COOKIE['ECS']['is_show_title_cn'] : 0);

        // Persist the display options for seven days.
        setcookie('ECS[is_show_card]', $filter['is_show_card'], gmtime() + 86400 * 7);
        setcookie('ECS[is_show_keywords]', $filter['is_show_keywords'], gmtime() + 86400 * 7);
        setcookie('ECS[is_show_brief]', $filter['is_show_brief'], gmtime() + 86400 * 7);
        setcookie('ECS[is_show_title_cn]', $filter['is_show_title_cn'], gmtime() + 86400 * 7);

        $filter['is_delete'] = $is_delete;
        $filter['real_goods'] = $real_goods;
        $filter['supp'] = isset($_REQUEST['supp']) && !empty($_REQUEST['supp']) && intval($_REQUEST['supp']) > 0
            ? intval($_REQUEST['supp'])
            : 0;

        // Same truth value as intval($_REQUEST['supp']) > 0, but without reading
        // a possibly-undefined request key; captured before $filter is reassigned.
        $supplier_mode = $filter['supp'] > 0;

        $where = $filter['cat_id'] > 0 ? " AND " . get_children($filter['cat_id']) : '';

        /* Recommendation type */
        switch ($filter['intro_type']) {
            case 'is_best':
                $where .= " AND is_best=1";
                break;
            case 'is_hot':
                $where .= ' AND is_hot=1';
                break;
            case 'is_new':
                $where .= ' AND is_new=1';
                break;
            case 'is_wish':
                $where .= ' AND is_wish=1';
                break;
            case 'not_is_wish':
                $where .= ' AND is_wish=0';
                break;
            case 'is_promote':
                $where .= " AND is_promote = 1 AND promote_price > 0 AND promote_start_date <= '{$today}' AND promote_end_date >= '{$today}'";
                break;
            case 'all_type':
                $where .= " AND (is_best=1 OR is_hot=1 OR is_new=1 OR (is_promote = 1 AND promote_price > 0 AND promote_start_date <= '" . $today . "' AND promote_end_date >= '" . $today . "'))";
                break;
        }

        /* Stock warning */
        if ($filter['stock_warning']) {
            $where .= ' AND goods_number <= warn_number ';
        }

        /* Brand */
        if ($filter['brand_id']) {
            $where .= " AND brand_id='{$filter['brand_id']}'";
        }
        if ($filter['favorite_num']) {
            $where .= " AND favorite_num>='{$filter['favorite_num']}'";
        }
        if ($filter['review_num']) {
            $where .= " AND review_num>='{$filter['review_num']}'";
        }
        if ($filter['start_time']) {
            $where .= " AND g.add_time >= '{$filter['start_time']}'";
        }
        if ($filter['end_time']) {
            $where .= " AND g.add_time <= '{$filter['end_time']}'";
        }

        /* Extension */
        if ($filter['extension_code']) {
            $where .= " AND extension_code='{$filter['extension_code']}'";
        }

        /* Keyword */
        if (!empty($filter['keyword'])) {
            $where .= " AND (goods_id = '" . mysql_like_quote($filter['keyword']) . "' OR goods_name LIKE '%" . mysql_like_quote($filter['keyword']) . "%' OR goods_name_zh LIKE '%" . mysql_like_quote($filter['keyword']) . "%')";
        }
        if (!empty($filter['collect_link'])) {
            $where .= " AND collect_link LIKE '%" . mysql_like_quote($filter['collect_link']) . "%'";
        }
        if ($real_goods > -1) {
            $where .= " AND is_real='{$real_goods}'";
        }

        /* On sale */
        if ($filter['is_on_sale'] !== '') {
            $where .= " AND (is_on_sale = '" . $filter['is_on_sale'] . "')";
        }

        // Leading space added so the clause never fuses with the preceding one.
        $where_supp = $supplier_mode ? ' AND g.supplier_id > 0' : ' AND g.supplier_id = 0';

        /* Supplier */
        if ($supplier_mode) {
            /* Code modification start — by www.68ecshop.com */
            if (!empty($filter['suppliers_id'])) {
                $where_supp = " AND (g.supplier_id = '" . $filter['suppliers_id'] . "')";
            }
            $filter['supplier_status'] = isset($_REQUEST['supplier_status']) && $_REQUEST['supplier_status'] != ''
                ? trim($_REQUEST['supplier_status'])
                : '';
            if ($filter['supplier_status'] != '') {
                $where_supp .= " AND (supplier_status = '" . $filter['supplier_status'] . "')";
            }
            /* Code modification end — by www.68ecshop.com */
        }
        $where .= $where_supp;
        $where .= $conditions;

        /* Total record count */
        $sql = "SELECT COUNT(*) FROM " . $GLOBALS['ecs']->table('goods') . " AS g WHERE is_delete='{$is_delete}' {$where}";
        $filter['record_count'] = $GLOBALS['db']->getOne($sql);

        /* Page size */
        $filter = page_and_size($filter);

        if ($supplier_mode) {
            // Supplier listing: join the supplier table to expose supplier_name.
            $sql = "SELECT goods_id, goods_name, keywords, add_time, goods_thumb, product_url, goods_name_zh, goods_type, goods_sn, shop_price, is_on_sale, is_best, is_new, is_hot, sort_order, goods_number, integral, "
                . " (promote_price > 0 AND promote_start_date <= '{$today}' AND promote_end_date >= '{$today}') AS is_promote "
                . ", supplier_status, g.supplier_id,supplier_name,favorite_num,review_num,collect_link,goods_brief,is_wish "
                . " FROM " . $GLOBALS['ecs']->table('goods') . " AS g "
                . " LEFT JOIN " . $GLOBALS['ecs']->table('supplier') . " AS s ON s.supplier_id = g.supplier_id "
                . " WHERE is_delete='{$is_delete}' {$where}"
                . " ORDER BY {$filter['sort_by']} {$filter['sort_order']} "
                . " LIMIT " . $filter['start'] . ",{$filter['page_size']}";
        } else {
            $sql = "SELECT goods_id, add_time, goods_name, keywords, goods_thumb, product_url, goods_name_zh, goods_type, goods_sn, shop_price, is_on_sale, is_best, is_new, is_hot, sort_order, goods_number, integral, "
                . " (promote_price > 0 AND promote_start_date <= '{$today}' AND promote_end_date >= '{$today}') AS is_promote "
                . ", supplier_status, supplier_id,favorite_num,review_num,collect_link,goods_brief,is_wish "
                . " FROM " . $GLOBALS['ecs']->table('goods') . " AS g WHERE is_delete='{$is_delete}' {$where}"
                . " ORDER BY {$filter['sort_by']} {$filter['sort_order']} "
                . " LIMIT " . $filter['start'] . ",{$filter['page_size']}";
        }

        $filter['keyword'] = stripslashes($filter['keyword']);
        set_filter($filter, $sql, $param_str);
    } else {
        // Reuse the SQL and filter stored by a previous call.
        $sql = $result['sql'];
        $filter = $result['filter'];
    }

    $row = $GLOBALS['db']->getAll($sql);

    foreach ($row as $key => $val) {
        $row[$key]['collect_link_formated'] = get_dom($val['collect_link']);
        $row[$key]['add_time'] = local_date('Y-m-d H:i:s', $val['add_time']);

        // Attach this product's goods_url rows, keyed by url_id.
        $goods_url = array();
        $all = $GLOBALS['db']->getAll("select * from " . $GLOBALS['ecs']->table('goods_url') . " as g where goods_id='" . $val['goods_id'] . "'");
        foreach ($all as $v) {
            $goods_url[$v['url_id']] = array(
                'product_url_formated' => get_dom($v['product_url']),
                'product_url'          => $v['product_url'],
                'is_best'              => $v['is_best'],
                'url_id'               => $v['url_id'],
                'price'                => $v['price'],
                'goods_id'             => $v['goods_id'],
            );
        }
        $row[$key]['goods_url'] = $goods_url;
    }

    return array(
        'goods'        => $row,
        'filter'       => $filter,
        'page_count'   => $filter['page_count'],
        'record_count' => $filter['record_count'],
    );
}
// Decode the JSON definitions for the current source and dump them for debugging.
$defs = json_decode($json);
debug($defs);

// json_decode() did not yield an object: abort with the numeric JSON error code.
if (!is_object($defs)) {
    die("Error parsing JSON definitions:" . json_last_error());
}

// Disabled definitions are skipped. NOTE: this `continue` targets an enclosing
// loop whose header lies outside this excerpt.
if (!$defs->enabled) {
    continue;
}

$next = array($defs->url); // candidate URLs for the next iteration; only $next[0] is followed
$visited = array();        // URLs already fetched, used as a set (url => 1)
$items = array();          // truthy results returned by parse_item()
$count = 0;                // number of completed iterations, checked in the loop condition

do {
    $url = $next[0];
    $visited[$url] = 1;

    // Load the page and select the item nodes with the configured root query.
    $dom = get_dom($url);
    $nodes = $dom->query($defs->root);
    debug(count($nodes) . ' nodes');

    // Keep every node for which parse_item() returns a truthy result.
    foreach ($nodes as $node) {
        if ($result = parse_item($node, $defs, $fix)) {
            $items[] = $result;
        }
    }

    // Rebuild the candidate list from the href of each node matched by the
    // "next" query, when one is configured.
    $next = array();
    if ($defs->next) {
        foreach ($dom->query($defs->next) as $node) {
            $next[] = base_url($node->getAttribute('href'));
        }
    }
    // Stop when there is no candidate, when the first candidate was already
    // visited, or once the iteration counter reaches 5.
} while (!empty($next) && !isset($visited[$next[0]]) && ++$count != 5); // max 5 pages
} } elseif ($_REQUEST['act'] == 'edit_url') { $product_url = empty($_REQUEST['product_url']) ? 0 : trim($_REQUEST['product_url']); $price = empty($_REQUEST['price']) ? 0 : trim($_REQUEST['price']); $goods_id = empty($_REQUEST['goods_id']) ? 0 : intval($_REQUEST['goods_id']); $product_url = json_str_iconv($product_url); $exists = $GLOBALS['db']->getAll("select * from " . $GLOBALS['ecs']->table('goods_url') . " as g where goods_id='" . $goods_id . "' and product_url='{$product_url}'"); if (empty($exists)) { $sql = 'insert INTO ' . $ecs->table('goods_url') . " (goods_id,product_url,price) values({$goods_id},'" . $product_url . "','" . $price . "')"; } else { $sql = 'update ' . $ecs->table('goods_url') . " set product_url='" . $product_url . "',price='" . $price . "' where goods_id='" . $goods_id . "'"; } if ($db->query($sql)) { $all = $GLOBALS['db']->getAll("select * from " . $GLOBALS['ecs']->table('goods_url') . " as g where goods_id='" . $goods_id . "'"); foreach ($all as $k => $v) { $goods_url[$v['url_id']]['product_url_formated'] = get_dom($v['product_url']); $goods_url[$v['url_id']]['product_url'] = $v['product_url']; $goods_url[$v['url_id']]['is_best'] = $v['is_best']; $goods_url[$v['url_id']]['url_id'] = $v['url_id']; $goods_url[$v['url_id']]['price'] = $v['price']; $goods_url[$v['url_id']]['goods_id'] = $v['goods_id']; } $smarty->assign('goods_url', $goods_url); $smarty->assign('goods_id', $goods_id); $str = $smarty->fetch('url_list.htm'); $arr = array("goods_id" => $goods_id, "url_list" => $str); clear_cache_files(); // 清除缓存 make_json_result($arr); } else { make_json_error("修改失败");
/**
 * Crawls every category URL listed in ../file/cats_list.ccd and hands each
 * page's DOM to getProductsLinks(), following "?p=N" pagination when the page
 * contains an element with id "pagination_next_bottom".
 *
 * The page count is read from the trailing digits of the first link inside the
 * paginator element.
 * NOTE(review): that element looks like the "next page" control, so its link
 * may point at page 2 rather than the last page — confirm against the site.
 *
 * @return void
 */
function getProducts()
{
    include "../classes/httpFile.php";
    include "../functions/crawler_functions.php";
    error_reporting(E_ALL ^ E_WARNING);

    $http = new HttpConnection();
    $http->setCookiePath("cookies/");
    $http->init();

    $DOM = new DOMDocument();

    $contents = file_get_contents('../file/cats_list.ccd');
    $url_array = ($contents === false) ? array() : explode(PHP_EOL, $contents);

    foreach ($url_array as $url) {
        // Skip blank lines (e.g. the trailing newline of the list file).
        $url = trim($url);
        if ($url === '') {
            continue;
        }

        // The first page is always processed, paginated or not.
        get_dom($url, $DOM, $http);
        getProductsLinks($DOM);

        $paginator = $DOM->getElementById('pagination_next_bottom');
        if ($paginator === null) {
            continue;
        }

        $anchor = $paginator->getElementsByTagName('a')->item(0);
        if ($anchor === null) {
            continue;
        }

        // Read the whole trailing number as an integer. Taking only the last
        // character breaks at 10+ pages, and appending '<br/>' turned the loop
        // bound into a non-numeric string.
        $pages = preg_match('/(\d+)$/', $anchor->getAttribute('href'), $match) ? (int) $match[1] : 1;

        for ($i = 2; $i <= $pages; $i++) {
            // Load the paged URL itself; passing the bare category URL to
            // get_dom() would parse page 1 again on every iteration.
            $page_url = $url . '?p=' . $i;
            $http->get($page_url, true);
            get_dom($page_url, $DOM, $http);
            getProductsLinks($DOM);
        }
    }

    $http->close();
}
/**
 * Fetches the ordnet.dk search page for "ti" and prints every result link
 * found under "div.searchResultBox div a" (searched from the parent of the
 * first "div.rulOp" element), followed by the part of the link after
 * "aselect=".
 *
 * Output only; nothing is returned.
 *
 * @return void
 */
function make_site_list()
{
    $first_url = "http://ordnet.dk/ddo/ordbog?aselect=ti&query=ti";
    $dom = get_dom($first_url);
    if (!$dom) {
        return;
    }

    // Without the marker element there is no result container to inspect.
    $marker = $dom->find("div.rulOp", 0);
    if (!$marker) {
        return;
    }

    $parent = $marker->parent();
    $div_a_list = $parent->find("div.searchResultBox div a");

    $i = 0;
    foreach ($div_a_list as $child1) {
        $i++; // counts every anchor, including those without an href
        if (!isset($child1->href)) {
            continue;
        }

        // Decode HTML-escaped ampersands so the query string is usable
        // (replacing "&" with "&" was a no-op).
        $hrefStr = str_replace("&amp;", "&", $child1->href);
        print "-------------({$i}) " . $child1->tag . ": " . $hrefStr . "\n";

        // strpos() returns false when the marker is missing; do not let that
        // be treated as offset 0.
        $pos = strpos($hrefStr, "aselect=");
        if ($pos === false) {
            continue;
        }

        $substr = substr($hrefStr, $pos + strlen("aselect="));
        print "sub[" . $substr . "]\n";
    }
}
/**
 * Processes category URLs from ../file/cats_list.ccd in batches: URLs are
 * consumed until the list is empty or the running total returned by
 * getProductsLinks() reaches 400. Unprocessed URLs are written back to the
 * list file, and a summary is echoed.
 *
 * For paginated categories (element id "pagination_next_bottom" present), the
 * page count is read from the trailing digits of the first link in the nearest
 * preceding element sibling of the paginator.
 *
 * NOTE(review): when no URL remains, the list file is left untouched (same as
 * before), so a later run starts over from the full list — confirm intent.
 *
 * @return void
 */
function getProducts()
{
    include "../classes/httpFile.php";
    include "../functions/crawler_functions.php";
    error_reporting(E_ALL ^ E_WARNING);

    $http = new HttpConnection();
    $http->setCookiePath("cookies/");
    $http->init();

    $contador = 0;
    $DOM = new DOMDocument();

    $contents = file_get_contents('../file/cats_list.ccd');
    $url_array = ($contents === false) ? array() : explode(PHP_EOL, $contents);

    while (count($url_array) > 0 && $contador < 400) {
        // The old guard used `$url = " "` (assignment), which is always truthy
        // and skipped EVERY url. Compare instead, and ignore blank lines.
        $url = trim(array_shift($url_array));
        if ($url === '') {
            continue;
        }

        // First page: always processed and counted, paginated or not.
        get_dom($url, $DOM, $http);
        $contador = $contador + getProductsLinks($DOM);

        $paginator = $DOM->getElementById('pagination_next_bottom');
        if ($paginator === null) {
            continue;
        }

        // Walk back to the closest preceding ELEMENT sibling, stopping safely
        // if the siblings run out.
        $link_paginator = $paginator->previousSibling;
        while ($link_paginator !== null && $link_paginator->nodeType !== XML_ELEMENT_NODE) {
            $link_paginator = $link_paginator->previousSibling;
        }
        if ($link_paginator === null) {
            continue;
        }

        $anchor = $link_paginator->getElementsByTagName('a')->item(0);
        if ($anchor === null) {
            continue;
        }

        // Whole trailing number as an integer (the last character alone breaks
        // at 10+ pages, and a '<br/>' suffix made the bound non-numeric).
        $pages = preg_match('/(\d+)$/', $anchor->getAttribute('href'), $match) ? (int) $match[1] : 1;

        for ($i = 2; $i <= $pages; $i++) {
            echo $i . '<br />';
            $page_url = $url . '?id_category=58&n=9&p=' . $i;
            $http->get($page_url, true);
            get_dom($page_url, $DOM, $http);
            $contador = $contador + getProductsLinks($DOM);
        }
    }

    $http->close();

    // Capture the backlog size before reporting: draining the array while
    // rewriting the file made the summary always say 0.
    $remaining = count($url_array);
    if ($remaining > 0) {
        // Rewrite the list with only the unprocessed URLs, one per line.
        file_put_contents("../file/cats_list.ccd", implode(PHP_EOL, $url_array) . PHP_EOL);
    }

    echo "</br>Se han procesado: " . $contador . " urls</br>";
    echo "Quedan " . $remaining . " por procesar<br />";
}
/**
 * Returns the HTML describing one attribution: four table rows holding the
 * title line (name, pseudo, date), the participants, the random roll and the
 * winner.
 *
 * @param  int    $id  zero-based index of the <attrib> element in the document
 *                     returned by get_dom()
 * @return string the HTML rows, or an empty string when no <attrib> element
 *                exists at that index
 */
function get_info_attribution($id)
{
    $start = "<tr class='mh_tdtitre' align='center'><td class='mh_tdpage'>";
    $end = "</td></tr>";

    $dom = get_dom();
    $attribution = $dom->getElementsByTagName("attrib")->item($id);

    // item() yields null for an out-of-range index; return nothing rather than
    // failing on a method call on null.
    if ($attribution === null) {
        return "";
    }

    $title = "<h3>" . utf8_decode(stripslashes($attribution->getAttribute("name")))
        . " par " . utf8_decode(stripslashes($attribution->getAttribute("pseudo")))
        . " le " . $attribution->getAttribute("date") . "</h3>";

    // One entry per table row, in display order.
    $rows = array(
        $title,
        get_participants($attribution),
        "<h3>Résutalt du jet : " . $attribution->getAttribute("random") . "</h3>",
        "<h3>Vainqueur : " . utf8_decode(get_winner($attribution)) . "</h3>",
    );

    $retour = "";
    foreach ($rows as $row) {
        $retour .= $start . $row . $end;
    }

    return $retour;
}
// Entry script: loads the posted URL, then runs the category parser script
// selected by the "script" POST field (which must define getCats()).
include "../classes/httpFile.php";
include "../functions/crawler_functions.php";

$url = isset($_POST['url']) ? $_POST['url'] : '';
$script = isset($_POST['script']) ? $_POST['script'] : '';

// SECURITY: $script is concatenated into an include path. Accept only a bare
// file name (no slashes, so no directory traversal) that really exists in the
// parser directory; anything else could include an arbitrary file.
$script_path = "../mainscripts/cat/" . $script;
if (!preg_match('/^[A-Za-z0-9_.\-]+$/', $script) || !is_file($script_path)) {
    die("<p>Script no valido</p>");
}

// Escape request data before echoing it into the HTML response.
echo htmlspecialchars($url, ENT_QUOTES) . ' ' . htmlspecialchars($script, ENT_QUOTES);

$http = new HttpConnection();
$http->init();
$http->get($url, true);
echo "<p>registrandose en la aplicacion</p>";

$DOM = new DOMDocument();
get_dom($url, $DOM, $http);

// NOTE(review): presumably updated by the included parser — confirm.
$count_success = 0;

include $script_path;
getCats($DOM);

// Final steps
$http->close();
echo "<p>Se han procesado {$count_success} urls correctamente</p>";
echo "<a href='getProducts.php?parser=" . rawurlencode($script) . "' class='btn-type'>Obtener Productos</a>";
/* $configuracion['variables']['ref_count'] = $reference_count;
$achivo_contenido = ";<?php die(); ?>".PHP_EOL.put_ini_file(null,$configuracion);
file_put_contents('crawler.conf.php', $achivo_contenido);
echo "<p>Se han procesado $count_success urls correctamente</p>";
echo "<p>".$count_failed." urls han fallado.</p>";
echo "<a class='btn-type' href='./actualproccesview.php'>Ver Resultados</a>";