private static function pdo() {
    // Get the global config
    global $CrawlerConfig;

    // If the $pdo_instance isn't set...
    if (empty(self::$pdo_instance)) {
        // ...create the $pdo_instance
        if ($CrawlerConfig['DB_TYPE'] === "MySQL") {
            self::$pdo_instance = new PDO(
                'mysql:host=' . $CrawlerConfig['PDO_CONFIG']['HOST'] . ';' .
                'dbname=' . $CrawlerConfig['PDO_CONFIG']['DB'] . ';' .
                'charset=utf8',
                $CrawlerConfig['PDO_CONFIG']['USER'],
                $CrawlerConfig['PDO_CONFIG']['PASS']
            );
        } else {
            self::$pdo_instance = new PDO(
                'oci:dbname=//' . $CrawlerConfig['PDO_CONFIG']['HOST'] . '/' .
                $CrawlerConfig['PDO_CONFIG']['DB'],
                $CrawlerConfig['PDO_CONFIG']['USER'],
                $CrawlerConfig['PDO_CONFIG']['PASS']
            );
            self::$pdo_instance->setAttribute(PDO::ATTR_CASE, PDO::CASE_LOWER);
        }
    }

    self::$pdo_instance->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
    return self::$pdo_instance;
}
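For reference, the keys that pdo() reads imply a config shape like the one below. The file that actually defines $CrawlerConfig isn't shown in this section, so treat this as a sketch; the values are placeholders:

```php
<?php
// Hypothetical config sketch: only the keys pdo() above actually reads
// are shown. The real config file may define more.
$CrawlerConfig = array(
    'DB_TYPE'    => 'MySQL', // any other value falls through to the Oracle (oci) branch
    'PDO_CONFIG' => array(
        'HOST' => 'localhost',
        'DB'   => 'crawler',
        'USER' => 'crawler_user',
        'PASS' => 'secret'
    )
);
```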
 * -Crawls a website,
 * -Saves data to database
 * -Saves page title, URL, body text, and depth level
 * @author Robert Parham <adelphia at protonmail dot ch>
 * @license Apache 2.0 Lic.
 */

// ini_set('memory_limit','300M');

/*******************************************************************************
 * This is an extremely basic sample search page used to search the crawler
 * results
 ******************************************************************************/

################################################################################
############################## AJAX STUFF ######################################
################################################################################

if (isset($_POST['action']) && $_POST['action'] == "search") {
    require realpath(dirname(__FILE__)) . "/crawler/autoload.php";
    $return = CrawlerPDO::doSearch($_POST['term']);
    echo json_encode($return);
    exit;
}

################################################################################
############################## END AJAX STUFF ##################################
################################################################################
?>
<!DOCTYPE html>
<html lang="en">
<head>
    <!-- metas -->
    <meta charset="utf-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1">
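The AJAX handler above expects a POST with action=search and a term field, and echoes whatever CrawlerPDO::doSearch() returns as JSON. It can be exercised without the page's own JavaScript; a minimal sketch using PHP's cURL extension, where the URL and filename search.php are placeholders for wherever this sample page is deployed:

```php
<?php
// Hypothetical client for the AJAX handler above.
$ch = curl_init('http://localhost/search.php'); // placeholder URL
curl_setopt($ch, CURLOPT_POST, true);
curl_setopt($ch, CURLOPT_POSTFIELDS, http_build_query(array(
    'action' => 'search',  // must be "search" to reach the handler
    'term'   => 'example'  // passed straight to CrawlerPDO::doSearch()
)));
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
$json = curl_exec($ch);
curl_close($ch);
$results = json_decode($json, true); // shape depends on doSearch()
```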
public function crawl($max_depth = 0, $current_depth = 0) {
    // Begin the loop through each URL row
    foreach ($this->queue as $k => $page) {

        // Make sure it's a crawlable format
        $ctype = CrawlerRequest::getContentType($page['url']);
        if (strpos($ctype, "text/") === false) {
            // array_pop() needs a real variable, so split first
            $parts = explode("/", $page['url']);
            $bn = array_pop($parts);
            $this->addOutput("Skipping {$bn} - ({$ctype}).");

            // Update the record for the page we just crawled
            CrawlerPDO::updateRow(array(
                "title"   => $page['title'],
                "url"     => $page['url'],
                "body"    => "skipped",
                "depth"   => CrawlerPDO::getDepthOfUrl($page['url']),
                "crawled" => 1
            ));
            continue;
        }

        // Get the depth of the current item
        $depth = CrawlerPDO::getDepthOfUrl($page['url']);

        // Get the page body
        $body = CrawlerRequest::request($page['url']);

        // Get a new instance of our HTML parser
        $parser = new CrawlerParser($body, $page['url']);

        // Add images to database
        $images = $parser->getImages();
        CrawlerPDO::addImages($images, $page['url']);

        // Download images if configured
        if ($this->config['SAVE_IMAGES'] === true) {
            foreach ($images as $image) {

                // Check download size
                if (!empty($this->config['MIN_IMAGE_SIZE'])) {
                    $size = CrawlerRequest::getFileSize($image);
                    if ($size < $this->config['MIN_IMAGE_SIZE']) continue;
                }

                // Skip files that don't have an explicit image content type
                $ctype = CrawlerRequest::getContentType($image);
                if (strpos($ctype, "image/") === false) continue;

                // Get the extension from the content type
                $ext = explode("/", $ctype);
                $ext = $ext[1];

                // Build a safe local filename
                $fn = preg_replace("/[^A-Za-z0-9 ]/", '', $image);
                $filename = realpath(dirname(__FILE__)) . "/media/cj_{$fn}.{$ext}";

                // Get the image if we don't already have it
                if (!file_exists($filename)) {
                    CrawlerRequest::request($image, array(), $filename);
                }
            }
        }

        /* The crawl result contains two things we need:
         *  1) Info needed to update the current $page in the $queue, and
         *  2) A new list of links.
         * Each new link is checked to see if it exists in the table yet;
         * if it does, it is updated with referrer information, etc. If it
         * doesn't exist, it is added to the table to be crawled the next
         * time the queue is updated.
         */
        $crawlResult = array(
            "body"  => $parser->getPlaintext(),
            "links" => $parser->getLinks(),
            "depth" => $depth + 1
        );

        // Loop through and update or insert each new link
        foreach ($crawlResult['links'] as $link) {
            // If the URL was already discovered
            if (CrawlerPDO::URLDiscovered($link['url'])) {
                CrawlerPDO::updateRow(array(
                    "title"       => $link['title'],
                    "url"         => $link['url'],
                    "linked_from" => CrawlerPDO::getURLID($page['url']),
                    "depth"       => $crawlResult['depth']
                ));
            } else {
                CrawlerPDO::insertRow(array(
                    "url"         => $link['url'],
                    "title"       => $link['title'],
                    "linked_from" => CrawlerPDO::getURLID($page['url']),
                    "depth"       => $crawlResult['depth']
                ));
            }
        }

        // Update the record for the page we just crawled
        CrawlerPDO::updateRow(array(
            "title"   => $page['title'],
            "url"     => $page['url'],
            "body"    => $crawlResult['body'],
            "depth"   => $depth,
            "crawled" => 1
        ));

        // Add some output
        $this->addOutput("Found " . count($crawlResult['links']) . " links on {$page['url']}.");

        // Pop this item off the queue
        unset($this->queue[$k]);
    }
    // Queue is empty!

    // Increment the depth counter
    $current_depth++;

    // Stop if we've exceeded the configured time limit
    if ($this->timelimit > 0 && time() > $this->started + $this->timelimit) {
        $this->addOutput("Ran for " . (time() - $this->started) . " seconds, timeout set to " . $this->timelimit . ".");
        return;
    }

    // Refresh the queue and keep going?
    if ($max_depth == 0 || $max_depth > $current_depth) {
        $this->queue = CrawlerPDO::getNextURLs();
        if (!empty($this->queue)) {
            $this->crawl($max_depth, $current_depth);
        }
    }
}
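Tying it together: crawl() drains the current queue, then recurses on a fresh queue from CrawlerPDO::getNextURLs() until the depth cap or the time limit is hit. A hedged usage sketch; the class name Crawler is an assumption, and the constructor (which must seed $this->queue, $this->config, $this->started, and $this->timelimit) isn't shown in this section:

```php
<?php
require realpath(dirname(__FILE__)) . "/crawler/autoload.php";

// Assumed bootstrap: the real constructor signature isn't shown here,
// so this instantiation is illustrative only.
$crawler = new Crawler();

// Crawl up to 3 levels deep. Passing 0 means "no depth limit": the
// recursion above then stops only when the queue stays empty or the
// time limit trips.
$crawler->crawl(3);
```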