Esempio n. 1
0
<?php
// Crawler page.
//
// When the recursiveCrawler class is loaded and a "url" query parameter is
// supplied, crawls "http://www.<url>", collects the title, body and URL of
// every link the crawler reports, and writes them to SEARCHRESULTFILE as an
// includable PHP file that defines $siteContent.

// is_string() rejects array input such as ?url[]=x, which would otherwise be
// concatenated as the literal text "Array".
if (class_exists("recursiveCrawler") && !empty($_GET['url']) && is_string($_GET['url'])) {
    $siteURL = "http://www." . $_GET["url"];
    $c = new recursiveCrawler();
    $c->parse($siteURL);
    $returnData = $c->getLinks();

    // Keys of the crawler's link list are preserved in the exported array.
    $a = array();
    foreach ($returnData as $k => $v) {
        $a[$k] = array(
            'title' => trim(pageContent::getTitle($v)),
            'content' => trim(pageContent::getBody($v)),
            'url' => trim($v),
        );
    }

    // var_export() emits escaped PHP literals, so crawled text cannot break
    // out of the generated source.
    $str = "<?php\n\$siteContent = " . var_export($a, true) . ";\n?>";

    // Only report success when the result file was actually written.
    if (false === file_put_contents(SEARCHRESULTFILE, $str)) {
        echo "Unable to write crawl results!";
    } else {
        echo count($a) . " pages crawled!";
    }
}
Esempio n. 2
0
 /**
  * Crawls $rawUrl and, recursively, every link the page parser reports for it.
  *
  * The URL is rebuilt as "scheme://host[:port][path]/[?query]": the scheme
  * defaults to "http" and a "/" is always appended after the path. A rebuilt
  * URL that is not yet in self::$links is recorded there, handed to
  * self::$obj->parser() via its siteURL property, and every entry of the
  * returned 'domain' list is merged into self::$links and crawled in turn
  * (with trailing slashes stripped first).
  *
  * Empty URLs, URLs that parse_url() cannot split or that carry no host,
  * and schemes other than http/https are ignored.
  *
  * NOTE(review): discovered links are merged into self::$links before the
  * recursion, so a link whose raw form already equals its rebuilt form
  * (e.g. "http://host/dir/") is treated as visited and never handed to the
  * parser -- confirm whether that is intended.
  *
  * @param string $rawUrl URL to crawl.
  * @return void
  */
 public function parse($rawUrl = "")
 {
     if ('' == $rawUrl) {
         return;
     }

     $urlDetails = parse_url($rawUrl);
     // parse_url() returns false for seriously malformed URLs and omits
     // 'host' for relative or scheme-less input; without a host the rebuilt
     // URL would be the unusable "http:///...".
     if (!is_array($urlDetails) || !isset($urlDetails['host']) || '' === $urlDetails['host']) {
         return;
     }

     $scheme = empty($urlDetails['scheme']) ? "http" : $urlDetails['scheme'];
     if ("http" != $scheme && "https" != $scheme) {
         return;
     }

     // Keep an explicit port so that host:8080 is not crawled as plain host.
     $port = isset($urlDetails['port']) ? ":" . $urlDetails['port'] : "";
     $path = isset($urlDetails['path']) ? $urlDetails['path'] : "";
     $query = !empty($urlDetails['query']) ? "?" . $urlDetails['query'] : "";
     $url = $scheme . "://" . $urlDetails['host'] . $port . $path . "/" . $query;

     // Already recorded: nothing to do (this also terminates the recursion).
     if (in_array($url, (array) self::$links, true)) {
         return;
     }

     self::$i++;
     self::$links[] = $url;
     self::$obj->siteURL = $url;
     $returnData = self::$obj->parser();

     // The parser may return no 'domain' entry at all; treat that as "no links".
     $discovered = isset($returnData['domain']) ? $returnData['domain'] : array();
     self::$links = array_unique(array_merge((array) self::$links, (array) $discovered));

     // A non-array 'domain' value is recorded above but not followed.
     if (is_array($discovered)) {
         foreach ($discovered as $v) {
             self::parse(rtrim($v, "/"));
         }
     }
 }