/**
 * Extracts the page and site counters from the "refblock" section of the
 * already loaded page ($this->pageContent).
 *
 * The first two <b>...</b> values inside the block are taken, in order, as
 * the pages counter and the sites counter.
 *
 * @return array|PEAR_Error A one-element list whose entry has the keys
 *                          "pages", "sites" and "url" ($this->url), or a
 *                          PEAR_Error when no refblock <div> is present.
 */
function parseItems()
{
    // Every whitespace run in the template is relaxed to \s* before matching.
    $blockRegex = preg_replace("{\\s+}", "\\s*", '{<div class="refblock">(.+?)</div>}s');
    if (!preg_match($blockRegex, $this->pageContent, $block)) {
        return new PEAR_Error("Items not found");
    }

    // Both counters stay at zero when the block does not hold two bold values.
    $pagesNum = 0;
    $sitesNum = 0;
    if (preg_match("{<b>(.+?)</b>.+?<b>(.+?)</b>}si", $block[1], $counters)) {
        $pagesNum = (int) StrUtils::cleanString($counters[1], true);
        $sitesNum = (int) StrUtils::cleanString($counters[2], true);
    }

    return array(
        array("pages" => $pagesNum, "sites" => $sitesNum, "url" => $this->url),
    );
}
/**
 * Loads a page, locates its "blocklist" section and registers every
 * navigator.yp.ru/rusn link found there through $this->rubricsData->addRubric().
 *
 * Progress and failures are written to the "results.log" file logger.
 *
 * @param string $url      Address of the page to load.
 * @param int    $parentId Forwarded to addRubric() as the parent identifier.
 * @param int    $parserId Forwarded to addRubric() as the parser identifier.
 *
 * @return array|PEAR_Error List of array("id" => ..., "url" => ...) entries,
 *                          one per link for which addRubric() did not return
 *                          a PEAR error (the list may be empty), or a
 *                          PEAR_Error when the block or its links are missing.
 */
function getSiteRubrics2($url, $parentId = 0, $parserId = 0)
{
    $logger =& Log::singleton("file", "results.log", __FUNCTION__);
    $notFoundMessage = "Rubrics2 for {$url} not found.";

    $page = $this->loadPage($url);

    // Whitespace runs in both templates are relaxed to \s* before matching.
    $blockRegex = preg_replace(
        "{\\s+}",
        "\\s*",
        '{ <div class="blocklist"> <a href="http://navigator.yp.ru/rusn/.+? </div> }si'
    );
    $linkRegex = preg_replace(
        "{\\s+}",
        "\\s*",
        '{ <a href="(http://navigator.yp.ru/rusn/[^"]+)"> (.+?) </a> }si'
    );

    // A missing block and a block without links are reported identically.
    $hasLinks = preg_match($blockRegex, $page, $block)
        && preg_match_all($linkRegex, $block[0], $links, PREG_SET_ORDER);
    if (!$hasLinks) {
        $logger->log($notFoundMessage);
        return new PEAR_Error($notFoundMessage);
    }

    $rubrics = array();
    foreach ($links as $link) {
        $rubricUrl = StrUtils::cleanString($link[1]);
        $rubricName = StrUtils::cleanString($link[2]);

        // The literal arguments (-1, true, 2) are passed through as-is;
        // their meaning is defined by addRubric().
        $rubricId = $this->rubricsData->addRubric($rubricUrl, $rubricName, -1, $parserId, true, $parentId, 2);
        if (PEAR::isError($rubricId)) {
            // Rubrics that could not be stored are left out of the result.
            continue;
        }

        $rubrics[] = array("id" => $rubricId, "url" => $rubricUrl);
    }

    $logger->log(print_r($rubrics, true));

    return $rubrics;
}
/**
 * Finds the text of an anchor on a page whose href points at a given host.
 *
 * The page at $url is loaded and every <a ... href=...> element is scanned.
 * The host part of each href (lower-cased and passed through
 * UrlUtils::removeWWW) is compared with the normalised $link:
 *  - an exact host match wins immediately;
 *  - otherwise the text of the last anchor whose host contains $link is used.
 *
 * Hrefs without a host component (relative paths, mailto:, javascript:,
 * malformed URLs) are skipped.
 *
 * @param string $url  Address of the page to scan.
 * @param string $link Target to look for. Any "http://" occurrence is removed,
 *                     then UrlUtils::removeWWW and lower-casing are applied.
 *                     Other scheme prefixes are not stripped.
 *
 * @return string The cleaned anchor text, or "" when nothing matches.
 */
function findUrl($url, $link)
{
    $filtered = "";
    $link = strtolower(UrlUtils::removeWWW(str_replace("http://", "", $link)));
    $contents = $this->loadPage($url);

    // An empty target cannot identify a link. As a strpos() needle it would
    // warn on PHP < 8 and match every host on PHP >= 8.
    if ($link === "") {
        return $filtered;
    }

    $pattern = "{<a[^>]+?(href=\".+?\"|href='.+?'|href=.+?\\s)[^>]*>(.+?)</a>}si";
    if (!preg_match_all($pattern, $contents, $matches, PREG_SET_ORDER)) {
        return $filtered;
    }

    foreach ($matches as $match) {
        // Drop the leading "href=" (5 characters). The unquoted alternative of
        // the pattern also captures the whitespace that ends the value, so
        // trim it; otherwise it would become part of the parsed host.
        $href = strtolower(trim(StrUtils::removeQuotes(substr($match[1], 5))));

        // parse_url() returns false for seriously malformed URLs and omits
        // "host" for relative or scheme-only links.
        $parts = parse_url($href);
        if (!is_array($parts) || !isset($parts["host"]) || $parts["host"] === "") {
            continue;
        }

        $host = UrlUtils::removeWWW($parts["host"]);

        if (strcmp($host, $link) === 0) {
            // Exact host match: stop searching.
            $filtered = StrUtils::cleanString($match[2]);
            break;
        }

        if (strpos($host, $link) !== false) {
            // Partial match: remember it, but keep looking for an exact one.
            $filtered = StrUtils::cleanString($match[2]);
        }
    }

    return $filtered;
}