<?php
// Crawler page: loads the "crawler" component, then — when a URL is supplied
// via ?url=... — recursively crawls the site and caches each page's title,
// body text and URL into SEARCHRESULTFILE as an includable PHP array.
// FIX(review): the "//Crawler Page" comment previously sat BEFORE the
// opening <?php tag and was emitted as literal page output.

echo Utils::loadComponent("crawler");

if (class_exists("recursiveCrawler") && !empty($_GET['url'])) {
    // SECURITY(review): $_GET['url'] is concatenated unvalidated into the
    // crawl target; restrict to an allow-list of hosts before exposing this
    // page publicly.
    $siteURL = "http://www." . $_GET["url"];

    $c = new recursiveCrawler();
    $c->parse($siteURL);
    $returnData = $c->getLinks();

    // Build one index entry (title, body text, url) per crawled link,
    // keyed the same way the crawler keyed its link list.
    $a = array();
    foreach ($returnData as $k => $v) {
        $a[$k]['title']   = trim(pageContent::getTitle($v));
        $a[$k]['content'] = trim(pageContent::getBody($v));
        $a[$k]['url']     = trim($v);
    }

    // Serialise the index as a PHP file that defines $siteContent on include.
    // var_export()'s second argument is a bool ("return the string"), not 1.
    $str = "<?php\n\$siteContent = " . var_export($a, true) . ";\n?>";

    // Report a failed write instead of silently claiming success.
    if (file_put_contents(SEARCHRESULTFILE, $str) === false) {
        echo "Failed to write search result file!";
    } else {
        echo count($a) . " pages crawled!";
    }
}
/**
 * Normalise $rawUrl and, if it has not been visited yet, crawl it and
 * recurse into every link the page parser reports under 'domain'.
 *
 * Only http/https URLs with a resolvable host part are followed; anything
 * else is silently skipped. Visited URLs are tracked in self::$links.
 *
 * @param string $rawUrl URL to crawl; scheme is optional (http assumed).
 * @return void
 */
public function parse($rawUrl = "")
{
    if ('' == $rawUrl) {
        return;
    }

    $urlDetails = parse_url($rawUrl);
    // parse_url() returns false for seriously malformed URLs.
    if ($urlDetails === false) {
        return;
    }
    if (empty($urlDetails['scheme'])) {
        $urlDetails['scheme'] = "http";
    }
    if ("http" != $urlDetails['scheme'] && "https" != $urlDetails['scheme']) {
        // Skip mailto:, ftp:, javascript:, etc.
        return;
    }
    // FIX(review): without a host there is nothing to fetch — parse_url()
    // puts scheme-less input such as "example.com/x" entirely into 'path',
    // which previously produced a notice and a bogus "http:///x/" URL.
    if (empty($urlDetails['host'])) {
        return;
    }

    // Rebuild a canonical URL. The original layout is preserved on purpose
    // (trailing "/" inserted before the query string) so URLs already stored
    // in self::$links keep matching.
    $url = $urlDetails['scheme'] . "://" . $urlDetails['host']
        . ($urlDetails['path'] ?? "") . "/"
        . (!empty($urlDetails['query']) ? "?" . $urlDetails['query'] : "");

    if (in_array($url, self::$links, true)) {
        // Already crawled — avoid infinite recursion on link cycles.
        return;
    }

    self::$i++;
    self::$links[] = $url;

    // Delegate the actual fetch/parse to the shared parser object.
    self::$obj->siteURL = $url;
    $returnData = self::$obj->parser();

    // Merge newly discovered links, de-duplicated. The (array) cast mirrors
    // the original behaviour when 'domain' is a scalar or missing.
    $rawDomain = $returnData['domain'] ?? null;
    self::$links = (array) array_unique(
        array_merge((array) self::$links, (array) $rawDomain)
    );

    // Recurse only when the parser returned a real link list (original
    // behaviour: a scalar 'domain' is merged above but never recursed into).
    if (is_array($rawDomain)) {
        foreach ($rawDomain as $v) {
            // Trailing slash stripped so the normalisation above stays stable.
            self::parse(rtrim($v, "/"));
        }
    }
}