/**
 * Crawls every category URL listed in ../file/cats_list.ccd (one per line)
 * and passes each fetched result page to getProductsLinks().
 *
 * For a category whose page contains an element with id
 * "pagination_next_bottom", the trailing number of the first link inside that
 * element is used as the page count, and pages 2..N are requested by
 * appending "?p=<n>" to the category URL.
 *
 * NOTE(review): the page count comes from the first <a> inside
 * #pagination_next_bottom; confirm that this link carries the LAST page
 * number and not merely the next one.
 *
 * Side effects: changes error_reporting for the rest of the request and
 * performs HTTP requests through HttpConnection.
 *
 * @return void
 */
function getProducts()
{
    // *_once: these files presumably declare HttpConnection and the crawler
    // helpers, so a second call to getProducts() must not include them again.
    include_once "../classes/httpFile.php";
    include_once "../functions/crawler_functions.php";
    error_reporting(E_ALL ^ E_WARNING);

    $http = new HttpConnection();
    $http->setCookiePath("cookies/");
    $http->init();

    $DOM = new DOMDocument();

    // An unreadable list file yields an empty work list instead of one bogus empty URL.
    $contents = file_get_contents('../file/cats_list.ccd');
    $url_array = ($contents === false) ? array() : explode(PHP_EOL, $contents);

    foreach ($url_array as $url) {
        // Skip blank lines (e.g. the one produced by a trailing newline) and
        // strip stray whitespace / carriage returns around the URL.
        $url = trim($url);
        if ($url === '') {
            continue;
        }

        get_dom($url, $DOM, $http);
        $paginator = $DOM->getElementById('pagination_next_bottom');

        // The first page is processed whether or not the category is paginated.
        getProductsLinks($DOM);

        if ($paginator === null) {
            continue;
        }

        // Read the page count as a real integer from the trailing digits of
        // the href, so multi-digit page numbers work and the loop bound is
        // compared numerically.
        $pages = 0;
        $first_link = $paginator->getElementsByTagName('a')->item(0);
        if ($first_link !== null
            && preg_match('/(\d+)$/', $first_link->getAttribute('href'), $match)
        ) {
            $pages = (int) $match[1];
        }

        for ($i = 2; $i <= $pages; $i++) {
            $page_url = $url . '?p=' . $i;
            $http->get($page_url, true);
            // Parse the page that was just requested, not the first page again.
            get_dom($page_url, $DOM, $http);
            getProductsLinks($DOM);
        }
    }

    $http->close();
}
/**
 * Fetches every category URL listed in ../file/cats_list.ccd (one per line),
 * finds all elements carrying the CSS class "product-container" and appends
 * the href of the first link inside each of them to ../file/url_list.ccd.
 * Every collected href is also echoed, followed by "<br/>".
 *
 * Side effects: changes error_reporting for the rest of the request, performs
 * HTTP requests through HttpConnection, writes to ../file/url_list.ccd and
 * produces output.
 *
 * @return void
 */
function getProducts()
{
    // *_once: these files presumably declare HttpConnection and get_dom(), so
    // a second call to getProducts() must not include them again.
    include_once "../classes/httpFile.php";
    include_once "../functions/crawler_functions.php";
    error_reporting(E_ALL ^ E_WARNING);

    $http = new HttpConnection();
    $http->setCookiePath("cookies/");
    $http->init();

    $DOM = new DOMDocument();

    // explode() replaces split(), which was removed in PHP 7.0.
    $contents = file_get_contents('../file/cats_list.ccd');
    $url_array = ($contents === false) ? array() : explode(PHP_EOL, $contents);

    $classname = "product-container";
    // Standard XPath 1.0 test for "class attribute contains this token":
    // pad the normalized class list with spaces and look for " token ".
    $class_query = "//*[contains(concat(' ', normalize-space(@class), ' '), ' {$classname} ')]";

    foreach ($url_array as $url) {
        // Skip blank lines (e.g. the one produced by a trailing newline).
        $url = trim($url);
        if ($url === '') {
            continue;
        }

        $http->get($url, true);
        get_dom($url, $DOM, $http);

        // The XPath object is bound to the document, so build it after the
        // document has been (re)loaded for this URL.
        $finder = new DOMXPath($DOM);
        $product = $finder->query($class_query);
        if ($product === false) {
            continue;
        }

        foreach ($product as $p) {
            // A product container without any link has nothing to record.
            $anchor = $p->getElementsByTagName('a')->item(0);
            if ($anchor === null) {
                continue;
            }

            $enlace = $anchor->getAttribute('href');
            echo $enlace . '<br/>';
            file_put_contents("../file/url_list.ccd", $enlace . PHP_EOL, FILE_APPEND);
        }
    }

    $http->close();
}
/**
 * Processes one batch of the category queue stored in ../file/cats_list.ccd.
 *
 * URLs are taken from the front of the list (one per line) and crawled until
 * the list is empty or at least 400 links have been counted; the count is the
 * sum of the values returned by getProductsLinks(). For a paginated category
 * (the page contains #pagination_next_bottom) the page count is read from the
 * trailing number of the first link inside the nearest preceding element
 * sibling, and pages 2..N are requested with "?id_category=58&n=9&p=<n>".
 *
 * Afterwards the URLs that were not reached are written back to the queue
 * file (the file is emptied when none remain) and a summary is echoed.
 *
 * NOTE(review): the query string hard-codes id_category=58 for every
 * category URL — confirm this is intended.
 *
 * Side effects: changes error_reporting for the rest of the request, performs
 * HTTP requests through HttpConnection, rewrites ../file/cats_list.ccd and
 * produces output.
 *
 * @return void
 */
function getProducts()
{
    // *_once: these files presumably declare HttpConnection and the crawler
    // helpers, so a second call to getProducts() must not include them again.
    include_once "../classes/httpFile.php";
    include_once "../functions/crawler_functions.php";
    error_reporting(E_ALL ^ E_WARNING);

    $http = new HttpConnection();
    $http->setCookiePath("cookies/");
    $http->init();

    $contador = 0;
    $DOM = new DOMDocument();

    $contents = file_get_contents('../file/cats_list.ccd');
    $url_array = ($contents === false) ? array() : explode(PHP_EOL, $contents);

    while (count($url_array) > 0 && $contador < 400) {
        // Compare, do not assign: blank lines are skipped, real URLs are crawled.
        $url = trim(array_shift($url_array));
        if ($url === '') {
            continue;
        }

        get_dom($url, $DOM, $http);
        $paginator = $DOM->getElementById('pagination_next_bottom');

        // Links of the first page count towards the batch limit in both the
        // paginated and the non-paginated case.
        $contador = $contador + getProductsLinks($DOM);

        if ($paginator === null) {
            continue;
        }

        // Walk back to the closest preceding ELEMENT sibling, skipping text
        // and comment nodes; stop when the siblings run out.
        $link_paginator = $paginator->previousSibling;
        while ($link_paginator !== null && $link_paginator->nodeType !== XML_ELEMENT_NODE) {
            $link_paginator = $link_paginator->previousSibling;
        }

        // Read the page count as a real integer from the trailing digits of
        // the href, so multi-digit page numbers work and the loop bound is
        // compared numerically.
        $pages = 0;
        if ($link_paginator !== null) {
            $link_last = $link_paginator->getElementsByTagName('a')->item(0);
            if ($link_last !== null
                && preg_match('/(\d+)$/', $link_last->getAttribute('href'), $match)
            ) {
                $pages = (int) $match[1];
            }
        }

        for ($i = 2; $i <= $pages; $i++) {
            echo $i . '<br />';
            $page_url = $url . '?id_category=58&n=9&p=' . $i;
            $http->get($page_url, true);
            get_dom($page_url, $DOM, $http);
            $contador = $contador + getProductsLinks($DOM);
        }
    }

    $http->close();

    // Collect the URLs that were not reached, dropping blank lines so the
    // queue file does not accumulate empty entries between runs.
    $remaining = array();
    foreach ($url_array as $my_url) {
        $my_url = trim($my_url);
        if ($my_url !== '') {
            $remaining[] = $my_url;
        }
    }

    // Persist the queue in a single write; skipped when the file could not be
    // read in the first place, so an unreadable queue is never truncated.
    if ($contents !== false) {
        $queue = (count($remaining) > 0) ? implode(PHP_EOL, $remaining) . PHP_EOL : "";
        file_put_contents("../file/cats_list.ccd", $queue);
    }

    echo "</br>Se han procesado: " . $contador . " urls</br>";
    echo "Quedan " . count($remaining) . " por procesar<br />";
}