예제 #1
0
 /**
  * Create an empty temporary file and register it as the cookie jar.
  *
  * The path of the file is stored in self::$cookiejar. The script is
  * terminated with die() when the file cannot be created, or when it is
  * not readable or writable afterwards.
  *
  * @return void
  */
 private static function setCookieJar()
 {
     // Create the file; tempnam() returns false when it cannot do so
     $cj = tempnam("/tmp", "cj");
     if ($cj === false) {
         die("Could not create cookiejar file.");
     }
     // Truncate the file. Only close the handle if it was actually opened;
     // a failed open is reported by the validation checks below.
     $fh = fopen($cj, "w+");
     if ($fh !== false) {
         fclose($fh);
     }
     // Save the cookiejar file
     self::$cookiejar = $cj;
     // Validate the cookiejar
     if (!file_exists($cj)) {
         die("Could not create cookiejar file.");
     }
     if (!is_readable($cj)) {
         die("Cannot read cookiejar file.");
     }
     if (!is_writable($cj)) {
         die("Cannot write to cookiejar file.");
     }
     return;
 }
예제 #2
0
파일: crawler.php 프로젝트: Pamblam/Crawler
 /**
  * Crawl every page currently in $this->queue, then refresh the queue and
  * recurse until the depth limit or time limit is reached or no URLs remain.
  *
  * For each queued page: pages whose content type does not contain "text/"
  * are recorded as skipped; otherwise the page is fetched and parsed, its
  * images are stored (and optionally downloaded), its links are inserted or
  * updated, and the page row is marked as crawled.
  *
  * @param int $max_depth     Maximum number of queue passes; 0 means no limit.
  * @param int $current_depth Number of passes completed so far (used by the
  *                           recursive call; callers normally leave it at 0).
  * @return void
  */
 public function crawl($max_depth = 0, $current_depth = 0)
 {
     // Begin the loop through each URL row
     foreach ($this->queue as $k => $page) {
         // Make sure it's a crawlable format
         $ctype = CrawlerRequest::getContentType($page['url']);
         if (strpos($ctype, "text/") === false) {
             // array_pop() takes its argument by reference, so the exploded
             // URL has to live in a real variable first
             $segments = explode("/", $page['url']);
             $bn = array_pop($segments);
             $this->addOutput("Skipping {$bn} - ({$ctype}).");
             // Update the record for the page we just skipped
             CrawlerPDO::updateRow(array("title" => $page['title'], "url" => $page['url'], "body" => "skipped", "depth" => CrawlerPDO::getDepthOfUrl($page['url']), "crawled" => 1));
             // A skipped page is finished too, so take it off the queue
             unset($this->queue[$k]);
             continue;
         }
         // Get the depth of the current item
         $depth = CrawlerPDO::getDepthOfUrl($page['url']);
         // Get the page body
         $body = CrawlerRequest::request($page['url']);
         // Get a new instance of our HTML parser
         $parser = new CrawlerParser($body, $page['url']);
         // Add images to database
         $images = $parser->getImages();
         CrawlerPDO::addImages($images, $page['url']);
         // Download images if configured
         if ($this->config['SAVE_IMAGES'] === true) {
             foreach ($images as $image) {
                 // Check download size
                 if (!empty($this->config['MIN_IMAGE_SIZE'])) {
                     $size = CrawlerRequest::getFileSize($image);
                     if ($size < $this->config['MIN_IMAGE_SIZE']) {
                         continue;
                     }
                 }
                 $ctype = CrawlerRequest::getContentType($image);
                 // Skip files that don't have an explicit image content type
                 if (strpos($ctype, "image/") === false) {
                     continue;
                 }
                 // Get the extension from the content type's subtype
                 $ext = explode("/", $ctype);
                 $ext = $ext[1];
                 // Build the local file name from the alphanumeric part of the URL
                 $fn = preg_replace("/[^A-Za-z0-9 ]/", '', $image);
                 $filename = realpath(dirname(__FILE__)) . "/media/cj_{$fn}.{$ext}";
                 // Get the image if we don't already have it
                 if (!file_exists($filename)) {
                     CrawlerRequest::request($image, array(), $filename);
                 }
             }
         }
         /* Crawl result contains two things we need...
          *   - 1) Info needed to update the current $page in the $queue, and
          *   - 2) A new list of links
          *  Each of the new links will be checked to see if they exist in 
          *  the table yet, if they do they will be updated with referrer 
          *  information, etc. If the new link doesn't exist it will be added
          *  to the table to be crawled next time the queue is updated.
          */
         $crawlResult = array("body" => $parser->getPlaintext(), "links" => $parser->getLinks(), "depth" => $depth + 1);
         // Loop through and check and update or insert each new link
         foreach ($crawlResult['links'] as $link) {
             // If the URL was already discovered
             if (CrawlerPDO::URLDiscovered($link['url'])) {
                 CrawlerPDO::updateRow(array("title" => $link['title'], "url" => $link['url'], "linked_from" => CrawlerPDO::getURLID($page['url']), "depth" => $crawlResult['depth']));
             } else {
                 CrawlerPDO::insertRow(array("url" => $link['url'], "title" => $link['title'], "linked_from" => CrawlerPDO::getURLID($page['url']), "depth" => $crawlResult['depth']));
             }
         }
         // Update the record for the page we just crawled
         CrawlerPDO::updateRow(array("title" => $page['title'], "url" => $page['url'], "body" => $crawlResult['body'], "depth" => $depth, "crawled" => 1));
         // Add some output
         $this->addOutput("Found " . count($crawlResult['links']) . " links on {$page['url']}.");
         // pop this item off the queue
         unset($this->queue[$k]);
     }
     // Queue is empty!
     // Increment the depth counter
     $current_depth++;
     // The time limit is only checked between queue passes, and only when
     // a positive limit is configured
     if (time() > $this->started + $this->timelimit && $this->timelimit > 0) {
         $this->addOutput("Ran for " . (time() - $this->started) . " seconds, timeout set to " . $this->timelimit . ".");
         return;
     }
     // Refresh the queue and keep going?
     if ($max_depth == 0 || $max_depth > $current_depth) {
         $this->queue = CrawlerPDO::getNextURLs();
         if (!empty($this->queue)) {
             $this->crawl($max_depth, $current_depth);
         }
     }
 }