/**
 * Adds URLs of data pages to the list of pages.
 *
 * Every entry of $urls is resolved against the base path of $this->url and
 * appended to $this->pagesUrls only when $this->isNewUrl() accepts it.
 *
 * @param array $urls Links as found on the page.
 */
function addPageUrls($urls)
{
    if (count($urls) < 1) {
        return;
    }

    foreach ($urls as $link) {
        $base = UrlUtils::basePath($this->url);
        $resolved = UrlUtils::getRealUrl($base, $link);

        if (!$this->isNewUrl($resolved)) {
            continue;
        }

        $this->pagesUrls[] = $resolved;
    }
}
/**
 * Extracts listing entries from $this->pageContent.
 *
 * For every matched row the link, name and address are taken from the listing
 * markup, merged with whatever $this->getItemInfo() returns for the item URL,
 * tagged with the "bo"/"ru" numbers found in that URL (0 when absent), and
 * passed to $this->setStatus(). The number of matched rows is reported to
 * $this->rubricsData->addNextNums().
 *
 * @return array|PEAR_Error List of item arrays, or a PEAR_Error when the
 *                          listing markup is not found in the page.
 */
function parseItems()
{
    // NOTE(review): the "null" handler presumably discards messages - confirm.
    $logger =& Log::singleton("null", "results.log", "ident");

    // The spaces in the template are only for readability: every whitespace
    // run is relaxed to \s* before the pattern is used.
    $listPattern = preg_replace(
        "{\\s+}",
        "\\s*",
        '{ <td width="80%"> <a href="([^"]+)"> <b class="c-t-[^>]+">(.+?)</b> </a> <div[^>]*>(.+?)</div> }si'
    );

    $found = preg_match_all($listPattern, $this->pageContent, $matches, PREG_PATTERN_ORDER);
    if (!$found) {
        return new PEAR_Error("Items not found.");
    }

    // Captures the "bo" and "ru" numbers embedded in an item URL.
    $numPattern = "{.+?bo(\\d+)/ru(\\d+)}si";

    $items = array();
    foreach ($matches[1] as $idx => $href) {
        $url = UrlUtils::getRealUrl(UrlUtils::basePath($this->url), $href);

        $item = array(
            "url"     => $url,
            "name"    => StrUtils::cleanString($matches[2][$idx]),
            "address" => StrUtils::cleanString($matches[3][$idx]),
        );

        // The duplicate check ($this->checkUpdate() on name and address) is
        // disabled, so every matched row is treated as a new item.
        $logger->log("New item: " . $item["name"] . ", " . $item["address"]);

        $item = array_merge($item, $this->getItemInfo($url));

        if (preg_match($numPattern, $url, $numMatches)) {
            $item["ru"] = $numMatches[2];
            $item["bo"] = $numMatches[1];
        } else {
            $item["ru"] = 0;
            $item["bo"] = 0;
        }

        $this->setStatus($item);
        $items[] = $item;
    }

    $logger->log("Size of items: " . count($items));
    $logger->log(print_r($items, true));

    $this->rubricsData->addNextNums(count($matches[1]));

    return $items;
}
/**
 * Collects the rubrics linked from a catalog page and registers them.
 *
 * Loads $url through $this->loadPage(), takes the first link and its text
 * from every <div class="catalog"> block, and stores each one with
 * $this->rubricsData->addRubric(). Rubrics for which addRubric() returns a
 * PEAR error are left out of the result.
 *
 * @param string $url      Page to load; also the base every rubric link is
 *                         resolved against.
 * @param int    $parentId Forwarded to addRubric() (logged together with $url).
 * @param int    $parserId Forwarded to addRubric().
 *
 * @return array|PEAR_Error List of array("id" => ..., "url" => ...) entries,
 *                          or a PEAR_Error when no catalog block is found.
 */
function getSiteRubrics2($url, $parentId = 0, $parserId = 0)
{
    $logger =& Log::singleton("file", "results.log", __FUNCTION__);
    $logger->log($parentId . ": " . $url);

    $contents = $this->loadPage($url);

    // The spaces in the template are only for readability: every whitespace
    // run is relaxed to \s* before the pattern is used.
    $pattern = '{ <div class="catalog"> <a href="([^"]+)">(.+?)</a> .+? </div> }si';
    $pattern = preg_replace("{\\s+}", "\\s*", $pattern);

    if (!preg_match_all($pattern, $contents, $matches, PREG_SET_ORDER)) {
        $logger->log("Rubrics2 for {$url} not found.");
        return new PEAR_Error("Rubrics2 for {$url} not found.");
    }

    $res = array();
    foreach ($matches as $m) {
        // Resolve against the page URL itself. The resolved link gets its own
        // variable so that $url stays the base for every iteration; reusing
        // $url here made each later link resolve against the previous rubric.
        $rubricUrl = UrlUtils::getRealUrl($url, StrUtils::cleanString($m[1]));
        $rubricUrl = UrlUtils::removeQuery($rubricUrl);

        $id = $this->rubricsData->addRubric(
            $rubricUrl,
            StrUtils::cleanString($m[2]),
            -1,
            $parserId,
            true,
            $parentId,
            2
        );

        if (!PEAR::isError($id)) {
            $res[] = array("id" => $id, "url" => $rubricUrl);
        }
    }

    $logger->log(print_r($res, true));

    return $res;
}
/**
 * Builds the item list for the current page.
 *
 * Finds every print.php link inside a table_model cell of
 * $this->pageContent, resolves it against the base path of $this->url and
 * collects the result of $this->getItemInfo() for each resolved URL.
 *
 * @return array|PEAR_Error Results of getItemInfo() in page order, or a
 *                          PEAR_Error when no link matches.
 */
function parseItems()
{
    $linkPattern = '{<td\\s+class=table_model[^>]*><a\\s+href=(print\\.php[^\\s]+)\\s*><img\\s+src=}si';

    $found = preg_match_all($linkPattern, $this->pageContent, $matches, PREG_PATTERN_ORDER);
    if (!$found) {
        return new PEAR_Error("Items not found.");
    }

    $items = array();
    foreach ($matches[1] as $href) {
        $itemUrl = UrlUtils::getRealUrl(UrlUtils::basePath($this->url), $href);
        $items[] = $this->getItemInfo($itemUrl);
    }

    return $items;
}
/**
 * Builds the item list for the current page, including thumbnails.
 *
 * Finds every /shop.asp link that wraps an <img> in $this->pageContent. Each
 * link is resolved against the base path of $this->url and handed to
 * $this->getItemInfo(); the image source, resolved the same way, is stored
 * under the "image_small" key of the returned item.
 *
 * @return array|PEAR_Error Items in page order, or a PEAR_Error when no link
 *                          matches.
 */
function parseItems()
{
    $linkPattern = '{<a\\s+href="(\\/shop\\.asp[^"]+)">\\s*<img src="([^"]+)"[^>]*>}si';

    $found = preg_match_all($linkPattern, $this->pageContent, $matches, PREG_PATTERN_ORDER);
    if (!$found) {
        return new PEAR_Error("Items not found.");
    }

    $items = array();
    foreach ($matches[1] as $idx => $href) {
        $itemUrl = UrlUtils::getRealUrl(UrlUtils::basePath($this->url), $href);

        // NOTE(review): assumes getItemInfo() returns an array - confirm.
        $item = $this->getItemInfo($itemUrl);

        // The base path is recomputed here rather than cached, in case
        // getItemInfo() touches $this->url.
        $item["image_small"] = UrlUtils::getRealUrl(
            UrlUtils::basePath($this->url),
            $matches[2][$idx]
        );

        $items[] = $item;
    }

    return $items;
}
/**
 * Builds the item list for the current page.
 *
 * Finds every /search/ link opened in a new window (target='_blank') inside
 * a paragraph of $this->pageContent, resolves it against the base path of
 * $this->url and collects the result of $this->getItemInfo() for each one.
 *
 * @return array|PEAR_Error Results of getItemInfo() in page order, or a
 *                          PEAR_Error when no link matches.
 */
function parseItems()
{
    $linkPattern = "{<p><a href='(/search/[^']+)'\\s*target='_blank'>}si";

    $found = preg_match_all($linkPattern, $this->pageContent, $matches, PREG_PATTERN_ORDER);
    if (!$found) {
        return new PEAR_Error("Items not found.");
    }

    $items = array();
    foreach ($matches[1] as $href) {
        $itemUrl = UrlUtils::getRealUrl(UrlUtils::basePath($this->url), $href);
        $items[] = $this->getItemInfo($itemUrl);
    }

    return $items;
}