Example #1
0
 /**
  * Return the shared PDO connection, creating it on the first call.
  *
  * The connection settings are read from the global $CrawlerConfig array:
  * a DB_TYPE of "MySQL" opens a MySQL connection, any other value opens an
  * OCI connection with column names forced to lower case. Exception error
  * mode is (re)applied on every call before the instance is returned.
  *
  * @return PDO The connection stored in self::$pdo_instance.
  */
 private static function pdo()
 {
     global $CrawlerConfig;

     // Only connect once; later calls reuse the stored instance
     if (empty(self::$pdo_instance)) {
         $useMySql = $CrawlerConfig['DB_TYPE'] === "MySQL";
         $settings = $CrawlerConfig['PDO_CONFIG'];

         if ($useMySql) {
             $dsn = 'mysql:host=' . $settings['HOST'] . ';dbname=' . $settings['DB'] . ';charset=utf8';
             self::$pdo_instance = new PDO($dsn, $settings['USER'], $settings['PASS']);
         } else {
             $dsn = 'oci:dbname=//' . $settings['HOST'] . '/' . $settings['DB'];
             self::$pdo_instance = new PDO($dsn, $settings['USER'], $settings['PASS']);
             // Return column names in lower case on the OCI connection
             self::$pdo_instance->setAttribute(PDO::ATTR_CASE, PDO::CASE_LOWER);
         }
     }

     // Make PDO report errors by throwing exceptions
     self::$pdo_instance->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);

     return self::$pdo_instance;
 }
Example #2
0
 * -Crawls a website,
 * -Saves data to database
 * -Saves Page title, URL, Body text, and depth level
 * @author Robert Parham <adelphia at protonmail dot ch>
 * @license Apache 2.0 Lic.
 */
// ini_set('memory_limit','300M');
/*******************************************************************************
 * This is an extremely basic sample search page used to search the crawler results
 ******************************************************************************/
################################################################################
############################## AJAX STUFF ######################################
################################################################################
// Handle the AJAX search request: run the search, print the result as JSON
// and stop before any of the HTML below is sent.
if (isset($_POST['action']) && $_POST['action'] === "search") {
    require realpath(dirname(__FILE__)) . "/crawler/autoload.php";
    // A request without "term" would otherwise raise an undefined-index
    // notice which, when errors are displayed, is printed ahead of the JSON
    // and corrupts the response. A missing term is passed on as null.
    $term = isset($_POST['term']) ? $_POST['term'] : null;
    $return = CrawlerPDO::doSearch($term);
    echo json_encode($return);
    exit;
}
################################################################################
############################## END AJAX STUFF ##################################
################################################################################
?>
<!DOCTYPE html>
<html lang="en">
    <head>
        
        <!-- metas -->
        <meta charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=edge">
        <meta name="viewport" content="width=device-width, initial-scale=1">
Example #3
0
 /**
  * Crawl every page currently in the queue, then refill the queue and repeat.
  *
  * For each queued page this method:
  *   - skips it (recording the body as "skipped") when its content type does
  *     not contain "text/",
  *   - otherwise fetches and parses it, records its images (downloading them
  *     when SAVE_IMAGES is enabled), inserts or updates a row for every link
  *     found, and marks the page itself as crawled.
  *
  * After the whole queue has been processed the pass counter is incremented.
  * The method returns if the time limit has been exceeded; otherwise, while
  * the depth limit allows it, the queue is reloaded from the database and the
  * method calls itself for the next pass.
  *
  * NOTE: skipped pages are not removed from $this->queue; only fully crawled
  * pages are unset.
  *
  * @param int $max_depth     Maximum number of passes; 0 means no limit.
  * @param int $current_depth Number of passes already completed.
  * @return void
  */
 public function crawl($max_depth = 0, $current_depth = 0)
 {
     // Begin the loop through each URL row
     foreach ($this->queue as $k => $page) {
         // Make sure it's a crawlable format
         $ctype = CrawlerRequest::getContentType($page['url']);
         if (strpos($ctype, "text/") === false) {
             // array_pop() takes its argument by reference, so the exploded
             // segments must be stored in a variable before popping.
             $segments = explode("/", $page['url']);
             $bn = array_pop($segments);
             $this->addOutput("Skipping {$bn} - ({$ctype}).");
             // Update the record for the page we just skipped
             CrawlerPDO::updateRow(array("title" => $page['title'], "url" => $page['url'], "body" => "skipped", "depth" => CrawlerPDO::getDepthOfUrl($page['url']), "crawled" => 1));
             continue;
         }
         // Get the depth of the current item
         $depth = CrawlerPDO::getDepthOfUrl($page['url']);
         // Get the page body
         $body = CrawlerRequest::request($page['url']);
         // Get a new instance of our HTML parser
         $parser = new CrawlerParser($body, $page['url']);
         // Add images to database
         $images = $parser->getImages();
         CrawlerPDO::addImages($images, $page['url']);
         // Download images if configured
         if ($this->config['SAVE_IMAGES'] === true) {
             foreach ($images as $image) {
                 // Skip images below the configured minimum download size
                 if (!empty($this->config['MIN_IMAGE_SIZE'])
                     && CrawlerRequest::getFileSize($image) < $this->config['MIN_IMAGE_SIZE']
                 ) {
                     continue;
                 }
                 // Skip files that don't have an explicit image content type
                 $imageType = CrawlerRequest::getContentType($image);
                 if (strpos($imageType, "image/") === false) {
                     continue;
                 }
                 // The extension is the part of the content type after the slash
                 $typeParts = explode("/", $imageType);
                 $ext = $typeParts[1];
                 // Build the file name from the image URL, keeping only letters, digits and spaces
                 $fn = preg_replace("/[^A-Za-z0-9 ]/", '', $image);
                 $filename = realpath(dirname(__FILE__)) . "/media/cj_{$fn}.{$ext}";
                 // Get the image if we don't already have it
                 if (!file_exists($filename)) {
                     CrawlerRequest::request($image, $params = array(), $filename);
                 }
             }
         }
         /* Crawl result contains two things we need...
          *   - 1) Info needed to update the current $page in the $queue, and
          *   - 2) A new list of links
          *  Each of the new links will be checked to see if it exists in
          *  the table yet; if it does, it will be updated with referrer
          *  information, etc. If the new link doesn't exist it will be added
          *  to the table to be crawled next time the queue is updated.
          */
         $crawlResult = array("body" => $parser->getPlaintext(), "links" => $parser->getLinks(), "depth" => $depth + 1);
         // Loop through and update or insert each new link
         foreach ($crawlResult['links'] as $link) {
             // If the URL was already discovered
             if (CrawlerPDO::URLDiscovered($link['url'])) {
                 CrawlerPDO::updateRow(array("title" => $link['title'], "url" => $link['url'], "linked_from" => CrawlerPDO::getURLID($page['url']), "depth" => $crawlResult['depth']));
             } else {
                 CrawlerPDO::insertRow(array("url" => $link['url'], "title" => $link['title'], "linked_from" => CrawlerPDO::getURLID($page['url']), "depth" => $crawlResult['depth']));
             }
         }
         // Update the record for the page we just crawled
         CrawlerPDO::updateRow(array("title" => $page['title'], "url" => $page['url'], "body" => $crawlResult['body'], "depth" => $depth, "crawled" => 1));
         // Add some output
         $this->addOutput("Found " . count($crawlResult['links']) . " links on {$page['url']}.");
         // Remove this item from the queue
         unset($this->queue[$k]);
     }
     // The queue has been processed
     // Increment the depth counter
     $current_depth++;
     // Stop when a time limit is set (> 0) and has been exceeded
     if (time() > $this->started + $this->timelimit && $this->timelimit > 0) {
         $this->addOutput("Ran for " . (time() - $this->started) . " seconds, timeout set to " . $this->timelimit . ".");
         return;
     }
     // Refresh the queue and keep going while the depth limit allows it
     if ($max_depth == 0 || $max_depth > $current_depth) {
         $this->queue = CrawlerPDO::getNextURLs();
         if (!empty($this->queue)) {
             $this->crawl($max_depth, $current_depth);
         }
     }
 }