コード例 #1
0
ファイル: Crawler.php プロジェクト: nicksagona/PopSpider
 /**
  * Static method to crawl the URLs
  *
  * Fetches $url through a Spider, echoes its response code, and then
  * recursively crawls every anchor on that page whose href contains the
  * page's domain. Pages fetched without error are stored in self::$urls
  * (keyed by the encoded URL); failed ones are appended to self::$errors
  * together with the $parent they were reached from. self::$depth is raised
  * to the largest slash-count seen so far.
  *
  * @param  string $url      URL to fetch
  * @param  array  $elements Handed unchanged to every Spider that is created
  * @param  string $parent   URL of the page on which $url was found; recorded with errors
  * @param  string $start    Unix timestamp (as compared against time()) at which the crawl began
  * @param  string $time     Time budget in seconds; only enforced when $start is given as well
  * @return void
  */
 public static function crawl($url, $elements = null, $parent = null, $start = null, $time = null)
 {
     // Percent-encode the URL, then restore the characters that delimit URL
     // parts. '&' has to be restored too, otherwise a multi-parameter query
     // string collapses into a single value. '%25' is restored last so that
     // escapes already present in the input (e.g. '%2B') stay encoded instead
     // of being unescaped a second time.
     $url = str_replace(
         array('%3A', '%2F', '%23', '%3F', '%3D', '%26', '%2B', '%25'),
         array(':', '/', '#', '?', '=', '&', '+', '%'),
         rawurlencode($url)
     );

     // Record the deepest URL seen so far; 2 is subtracted from the slash
     // count (presumably for the "//" that follows the scheme).
     $slashes = substr_count($url, '/') - 2;
     if ($slashes > self::$depth) {
         self::$depth = $slashes;
     }

     // Nothing to do if this URL (or its lower-cased form) is already stored.
     if (array_key_exists($url, self::$urls) || array_key_exists(strtolower($url), self::$urls)) {
         return;
     }

     $spider = new Spider($url, $elements);
     echo '-> (' . $spider->getCode() . ') ' . $url . PHP_EOL;

     if ($spider->isError()) {
         // Failed URLs are not stored in self::$urls, so the same broken link
         // is reported once for every page that references it.
         self::$errors[] = array('code' => $spider->getCode(), 'url' => $url, 'parent' => $parent);
         return;
     }

     self::$urls[$url] = $spider;

     // Remove the schema from the base and cut at the first slash; what is
     // left is used as the domain that links must contain to be followed.
     $domain = (string) str_replace($spider->getSchema(), '', $spider->getBase());
     $slashPos = strpos($domain, '/');
     if ($slashPos !== false) {
         $domain = substr($domain, 0, $slashPos);
     }

     $links = $spider->getElements('a');

     // An empty domain cannot scope the crawl: stripos() with an empty needle
     // warns on PHP < 8 and matches every string on PHP >= 8.
     if (null === $links || $domain === '') {
         return;
     }

     foreach ($links as $link) {
         // Once the time budget is spent no further link can qualify.
         if (null !== $start && null !== $time && (time() - $start > $time)) {
             break;
         }

         // Anchors without an href (missing key or null) are ignored.
         if (!isset($link['href'])) {
             continue;
         }

         $href = $link['href'];
         if ((string) $href === '') {
             continue;
         }

         // Skip in-page fragments and query-only links.
         $first = substr($href, 0, 1);
         if ($first === '#' || $first === '?') {
             continue;
         }

         // NOTE(review): this matches the domain anywhere in the href, not only
         // in its host part - confirm that is the intended scoping.
         if (stripos($href, $domain) !== false) {
             self::crawl($href, $elements, $url, $start, $time);
         }
     }
 }