/**
 * Create a temporary cookie jar file and remember its path.
 *
 * The path of the created file is stored in self::$cookiejar. If the file
 * cannot be created, read or written, the script is terminated with die().
 *
 * @return void
 */
private static function setCookieJar() {
    // Create the file. tempnam() returns false when no file could be created,
    // so stop here instead of passing false on to fopen().
    $cj = tempnam("/tmp", "cj");
    if ($cj === false) {
        die("Could not create cookiejar file.");
    }

    // Open and close the file once in read/write mode. fopen() returns false
    // on failure, and fclose() must only be given a valid stream resource.
    $fh = fopen($cj, "w+");
    if ($fh === false) {
        die("Cannot write to cookiejar file.");
    }
    fclose($fh);

    // Save the cookiejar file
    self::$cookiejar = $cj;

    // Validate the cookiejar
    if (!file_exists($cj)) {
        die("Could not create cookiejar file.");
    }
    if (!is_readable($cj)) {
        die("Cannot read cookiejar file.");
    }
    if (!is_writable($cj)) {
        die("Cannot write to cookiejar file.");
    }

    return;
}
/**
 * Crawl every page in the current queue, then refresh the queue and repeat.
 *
 * For each queued page this method:
 *  - skips it (recording body "skipped") when its content type does not
 *    contain "text/";
 *  - otherwise requests the body, parses it, stores the images it references,
 *    optionally downloads them when the SAVE_IMAGES config value is true, and
 *    inserts or updates a row for every link found on the page;
 *  - marks the page as crawled and removes it from the queue.
 *
 * After one pass over the queue the depth counter is incremented, the time
 * limit is checked, and, if the depth limit allows, the queue is reloaded
 * from CrawlerPDO::getNextURLs() and the method calls itself again.
 *
 * @param int $max_depth     Maximum number of queue passes; 0 means no limit.
 * @param int $current_depth Number of queue passes already completed.
 *
 * @return void
 */
public function crawl($max_depth = 0, $current_depth = 0) {
    // Begin the loop through each URL row
    foreach ($this->queue as $k => $page) {
        // Make sure it's a crawlable format. The cast keeps strpos() and the
        // message below working with a string even if no content type came back.
        $ctype = (string) CrawlerRequest::getContentType($page['url']);
        if (strpos($ctype, "text/") === false) {
            // Last path segment of the URL, used only for the log message.
            // end() needs a real variable, so the exploded parts are stored first.
            $segments = explode("/", $page['url']);
            $bn = end($segments);
            $this->addOutput("Skipping {$bn} - ({$ctype}).");

            // Update the record for the page we just skipped
            CrawlerPDO::updateRow(array(
                "title" => $page['title'],
                "url" => $page['url'],
                "body" => "skipped",
                "depth" => CrawlerPDO::getDepthOfUrl($page['url']),
                "crawled" => 1
            ));

            // Skipped pages are done too; drop them so a later call to crawl()
            // does not process them again if this call returns early below.
            unset($this->queue[$k]);
            continue;
        }

        // Get the depth of the current item
        $depth = CrawlerPDO::getDepthOfUrl($page['url']);

        // Get the page body
        $body = CrawlerRequest::request($page['url']);

        // Get a new instance of our HTML parser
        $parser = new CrawlerParser($body, $page['url']);

        // Add images to database
        $images = $parser->getImages();
        CrawlerPDO::addImages($images, $page['url']);

        // Download images if configured
        if ($this->config['SAVE_IMAGES'] === true) {
            foreach ($images as $image) {
                // Check download size
                if (!empty($this->config['MIN_IMAGE_SIZE'])) {
                    $size = CrawlerRequest::getFileSize($image);
                    if ($size < $this->config['MIN_IMAGE_SIZE']) {
                        continue;
                    }
                }

                // Skip files that don't have an explicit image content type
                $imageType = (string) CrawlerRequest::getContentType($image);
                if (strpos($imageType, "image/") === false) {
                    continue;
                }

                // Get the extension: the part of the content type after the first "/"
                $typeParts = explode("/", $imageType);
                $ext = $typeParts[1];

                // Build the target file name from the image URL, keeping only
                // letters, digits and spaces
                $fn = preg_replace("/[^A-Za-z0-9 ]/", '', $image);
                $filename = realpath(dirname(__FILE__)) . "/media/cj_{$fn}.{$ext}";

                // Get the image if we don't already have it
                if (!file_exists($filename)) {
                    $params = array();
                    CrawlerRequest::request($image, $params, $filename);
                }
            }
        }

        /* Crawl result contains two things we need...
         *  - 1) Info needed to update the current $page in the $queue, and
         *  - 2) A new list of links
         * Each of the new links will be checked to see if they exist in
         * the table yet, if they do they will be updated with referrer
         * information, etc. If the new link doesn't exist it will be added
         * to the table to be crawled next time the queue is updated.
         */
        $crawlResult = array(
            "body" => $parser->getPlaintext(),
            "links" => $parser->getLinks(),
            "depth" => $depth + 1
        );

        // Loop through and update or insert each new link
        foreach ($crawlResult['links'] as $link) {
            if (CrawlerPDO::URLDiscovered($link['url'])) {
                // The URL was already discovered: refresh its row
                CrawlerPDO::updateRow(array(
                    "title" => $link['title'],
                    "url" => $link['url'],
                    "linked_from" => CrawlerPDO::getURLID($page['url']),
                    "depth" => $crawlResult['depth']
                ));
            } else {
                // First time we see this URL: add it to the table
                CrawlerPDO::insertRow(array(
                    "url" => $link['url'],
                    "title" => $link['title'],
                    "linked_from" => CrawlerPDO::getURLID($page['url']),
                    "depth" => $crawlResult['depth']
                ));
            }
        }

        // Update the record for the page we just crawled
        CrawlerPDO::updateRow(array(
            "title" => $page['title'],
            "url" => $page['url'],
            "body" => $crawlResult['body'],
            "depth" => $depth,
            "crawled" => 1
        ));

        // Add some output
        $this->addOutput("Found " . count($crawlResult['links']) . " links on {$page['url']}.");

        // Pop this item off the queue
        unset($this->queue[$k]);
    }

    // Queue is empty!
    // Increment the depth counter
    $current_depth++;

    // Stop once the time limit has passed; a time limit of 0 or less disables the check
    if (time() > $this->started + $this->timelimit && $this->timelimit > 0) {
        $this->addOutput("Ran for " . (time() - $this->started) . " seconds, timeout set to " . $this->timelimit . ".");
        return;
    }

    // Refresh the queue and keep going?
    if ($max_depth == 0 || $max_depth > $current_depth) {
        $this->queue = CrawlerPDO::getNextURLs();
        if (!empty($this->queue)) {
            $this->crawl($max_depth, $current_depth);
        }
    }
}