Ejemplo n.º 1
0
 /**
  * Adds URLs of data pages to the list kept in $this->pagesUrls.
  *
  * Each entry is resolved with UrlUtils::getRealUrl() against the base path
  * of $this->url and is appended only when $this->isNewUrl() accepts the
  * resolved URL.
  *
  * @param array $urls URLs to add; an empty list is a no-op.
  * @return void
  */
 function addPageUrls($urls)
 {
     if (count($urls) < 1) {
         return;
     }
     foreach ($urls as $rawUrl) {
         $resolved = UrlUtils::getRealUrl(UrlUtils::basePath($this->url), $rawUrl);
         if (!$this->isNewUrl($resolved)) {
             // Rejected by isNewUrl(): do not queue it.
             continue;
         }
         $this->pagesUrls[] = $resolved;
     }
 }
    /**
     * Extracts the catalogue entries listed in $this->pageContent.
     *
     * Every match of the listing pattern yields an item holding the resolved
     * "url", the cleaned "name" and "address", whatever getItemInfo() returns
     * for that URL (merged on top), and the "bo"/"ru" numbers taken from the
     * URL (0 for both when the URL carries none). Each item is passed to
     * setStatus() before being collected, and the number of matches is
     * reported to $this->rubricsData->addNextNums().
     *
     * @return array|PEAR_Error list of items, or a PEAR_Error when the page
     *                          contains no matching entries
     */
    function parseItems()
    {
        $logger =& Log::singleton("null", "results.log", "ident");

        $listPattern = '{
			<td width="80%">
			<a href="([^"]+)"> <b class="c-t-[^>]+">(.+?)</b> </a>
			<div[^>]*>(.+?)</div>
		}si';
        // Turn every whitespace run of the template into \s* so the markup
        // matches regardless of how the page is actually spaced.
        $listPattern = preg_replace("{\\s+}", "\\s*", $listPattern);

        $found = preg_match_all($listPattern, $this->pageContent, $matches, PREG_PATTERN_ORDER);
        if (!$found) {
            return new PEAR_Error("Items not found.");
        }

        // Pulls the two numeric ids out of an item URL (".../bo<N>/ru<M>").
        $idPattern = "{.+?bo(\\d+)/ru(\\d+)}si";

        $items = array();
        foreach ($matches[1] as $idx => $href) {
            $url = UrlUtils::getRealUrl(UrlUtils::basePath($this->url), $href);
            $item = array(
                "url" => $url,
                "name" => StrUtils::cleanString($matches[2][$idx]),
                "address" => StrUtils::cleanString($matches[3][$idx]),
            );

            // NOTE: filtering of already known items through checkUpdate()
            // is disabled here; every matched entry is treated as new.
            $logger->log("New item: " . $item["name"] . ", " . $item["address"]);

            $item = array_merge($item, $this->getItemInfo($url));

            $hasIds = preg_match($idPattern, $url, $numMatches);
            $item["ru"] = $hasIds ? $numMatches[2] : 0;
            $item["bo"] = $hasIds ? $numMatches[1] : 0;

            $this->setStatus($item);
            $items[] = $item;
        }

        $logger->log("Size of items: " . count($items));
        $logger->log(print_r($items, true));
        $this->rubricsData->addNextNums(count($matches[1]));

        return $items;
    }
    /**
     * Loads a page and registers the second-level rubrics linked from it.
     *
     * Every `<div class="catalog">` block found on the page contributes one
     * link. The link is resolved against $url, stripped of its query string
     * and passed to $this->rubricsData->addRubric() together with the cleaned
     * link text, $parserId and $parentId. Rubrics for which addRubric()
     * returns a PEAR error are left out of the result.
     *
     * Fix: the page URL is no longer overwritten while iterating. Previously
     * each link after the first one was resolved against the URL of the
     * preceding rubric instead of the page it was found on.
     *
     * @param string $url      URL of the page listing the rubrics; also the
     *                         base for resolving the links found on it
     * @param mixed  $parentId forwarded to addRubric() (presumably the id of
     *                         the parent rubric - confirm against addRubric)
     * @param mixed  $parserId forwarded to addRubric()
     * @return array|PEAR_Error list of array("id" => ..., "url" => ...), or a
     *                          PEAR_Error when no rubric block is found
     */
    function getSiteRubrics2($url, $parentId = 0, $parserId = 0)
    {
        $logger =& Log::singleton("file", "results.log", __FUNCTION__);
        $logger->log($parentId . ": " . $url);
        $contents = $this->loadPage($url);
        $pattern = '{
			<div class="catalog"> 
				<a href="([^"]+)">(.+?)</a> 
				.+? 
			</div>
		}si';
        // Whitespace in the template is only for readability: make every run
        // of it optional so the pattern matches however the page is spaced.
        $pattern = preg_replace("{\\s+}", "\\s*", $pattern);

        if (!preg_match_all($pattern, $contents, $matches, PREG_SET_ORDER)) {
            $logger->log("Rubrics2 for {$url} not found.");
            return new PEAR_Error("Rubrics2 for {$url} not found.");
        }

        $res = array();
        foreach ($matches as $m) {
            // Keep $url intact: it is the base for every link on this page.
            $rubricUrl = UrlUtils::getRealUrl($url, StrUtils::cleanString($m[1]));
            $rubricUrl = UrlUtils::removeQuery($rubricUrl);
            $id = $this->rubricsData->addRubric($rubricUrl, StrUtils::cleanString($m[2]), -1, $parserId, true, $parentId, 2);
            if (!PEAR::isError($id)) {
                $res[] = array("id" => $id, "url" => $rubricUrl);
            }
        }
        $logger->log(print_r($res, true));
        return $res;
    }
 /**
  * Collects the items linked from the "table_model" cells of the page.
  *
  * Each print.php link found in $this->pageContent is resolved against the
  * base path of $this->url and handed to getItemInfo(); the values returned
  * by getItemInfo() are gathered in page order.
  *
  * @return array|PEAR_Error list of getItemInfo() results, or a PEAR_Error
  *                          when the page contains no matching link
  */
 function parseItems()
 {
     $linkPattern = '{<td\\s+class=table_model[^>]*><a\\s+href=(print\\.php[^\\s]+)\\s*><img\\s+src=}si';
     $found = preg_match_all($linkPattern, $this->pageContent, $matches, PREG_PATTERN_ORDER);
     if (!$found) {
         return new PEAR_Error("Items not found.");
     }

     $items = array();
     foreach ($matches[1] as $href) {
         $itemUrl = UrlUtils::getRealUrl(UrlUtils::basePath($this->url), $href);
         $items[] = $this->getItemInfo($itemUrl);
     }
     return $items;
 }
 /**
  * Collects the items linked through thumbnail images on the page.
  *
  * Looks in $this->pageContent for /shop.asp links that wrap an <img>. For
  * every hit the link is resolved against the base path of $this->url and
  * passed to getItemInfo(); the resolved thumbnail address is then stored
  * under "image_small" in the returned item.
  *
  * @return array|PEAR_Error list of items, or a PEAR_Error when the page
  *                          contains no matching link
  */
 function parseItems()
 {
     $linkPattern = '{<a\\s+href="(\\/shop\\.asp[^"]+)">\\s*<img src="([^"]+)"[^>]*>}si';
     $found = preg_match_all($linkPattern, $this->pageContent, $matches, PREG_PATTERN_ORDER);
     if (!$found) {
         return new PEAR_Error("Items not found.");
     }

     $items = array();
     foreach ($matches[1] as $idx => $href) {
         $itemUrl = UrlUtils::getRealUrl(UrlUtils::basePath($this->url), $href);
         $item = $this->getItemInfo($itemUrl);
         // The thumbnail source is the second capture of the same match.
         $item["image_small"] = UrlUtils::getRealUrl(UrlUtils::basePath($this->url), $matches[2][$idx]);
         $items[] = $item;
     }
     return $items;
 }
 /**
  * Collects the items linked from the search-result paragraphs of the page.
  *
  * Each /search/ link (opened with target='_blank') found in
  * $this->pageContent is resolved against the base path of $this->url and
  * handed to getItemInfo(); the returned values are gathered in page order.
  *
  * @return array|PEAR_Error list of getItemInfo() results, or a PEAR_Error
  *                          when the page contains no matching link
  */
 function parseItems()
 {
     $linkPattern = "{<p><a href='(/search/[^']+)'\\s*target='_blank'>}si";
     $found = preg_match_all($linkPattern, $this->pageContent, $matches, PREG_PATTERN_ORDER);
     if (!$found) {
         return new PEAR_Error("Items not found.");
     }

     $items = array();
     foreach ($matches[1] as $href) {
         $itemUrl = UrlUtils::getRealUrl(UrlUtils::basePath($this->url), $href);
         $items[] = $this->getItemInfo($itemUrl);
     }
     return $items;
 }