PHP CrawlerPDO Examples

Programming Language: PHP

Class/Type: CrawlerPDO

Examples at hotexamples.com: 3

PHP CrawlerPDO - 3 examples found. These are the top-rated real-world PHP examples of CrawlerPDO, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

URLDiscovered(1)

addImages(1)

checkTable(1)

doSearch(1)

getDepthOfUrl(1)

getNextURLs(1)

getURLID(1)

insertRow(1)

pdo_instance(1)

updateRow(1)

Example #1

Show file

File: crawlerpdo.php Project: Pamblam/Crawler

 /**
  * Return the shared PDO connection, creating it on first use.
  *
  * Connection settings are read from the global $CrawlerConfig array:
  * DB_TYPE selects the driver ("MySQL" uses the mysql driver, anything else
  * uses the oci driver) and PDO_CONFIG supplies HOST, DB, USER and PASS.
  * For the oci driver, column names are forced to lower case.
  * Exception error mode is (re)applied on every call.
  *
  * @return PDO the shared connection stored in self::$pdo_instance
  */
 private static function pdo()
 {
     global $CrawlerConfig;

     // Lazily build the connection the first time it is requested
     if (empty(self::$pdo_instance)) {
         $useMySql = $CrawlerConfig['DB_TYPE'] === "MySQL";
         $settings = $CrawlerConfig['PDO_CONFIG'];

         if ($useMySql) {
             $dsn = "mysql:host={$settings['HOST']};dbname={$settings['DB']};charset=utf8";
         } else {
             $dsn = "oci:dbname=//{$settings['HOST']}/{$settings['DB']}";
         }

         self::$pdo_instance = new PDO($dsn, $settings['USER'], $settings['PASS']);

         // Non-MySQL (oci) connections get lower-cased column names
         if (!$useMySql) {
             self::$pdo_instance->setAttribute(PDO::ATTR_CASE, PDO::CASE_LOWER);
         }
     }

     // Make the connection throw exceptions on errors
     self::$pdo_instance->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);

     return self::$pdo_instance;
 }

Example #2

Show file

File: search.php Project: Pamblam/Crawler

 * -Crawls a website,
 * -Saves data to database
 * -Saves Page title, URL, Body text, and depth level
 * @author Robert Parham <adelphia at protonmail dot ch>
 * @license Apache 2.0 Lic.
 */
// ini_set('memory_limit','300M');
/*******************************************************************************
 * This is an extremely basic sample search page used to search the crawler results
 ******************************************************************************/
################################################################################
############################## AJAX STUFF ######################################
################################################################################
// Handle the AJAX search request: when the form posts action=search, run the
// search through CrawlerPDO::doSearch(), emit the result as JSON and stop
// before any of the HTML below is rendered.
if (isset($_POST['action']) && $_POST['action'] === "search") {
    require realpath(dirname(__FILE__)) . "/crawler/autoload.php";
    // A request without 'term' would otherwise raise an undefined-index
    // notice which, when errors are displayed, is printed ahead of the JSON
    // and breaks the response. Fall back to an empty search term instead.
    $term = isset($_POST['term']) ? $_POST['term'] : '';
    $return = CrawlerPDO::doSearch($term);
    echo json_encode($return);
    exit;
}
################################################################################
############################## END AJAX STUFF ##################################
################################################################################
?>
<!DOCTYPE html>
<html lang="en">
    <head>
        
        <!-- metas -->
        <meta charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=edge">
        <meta name="viewport" content="width=device-width, initial-scale=1">

Example #3

Show file

File: crawler.php Project: Pamblam/Crawler

 /**
  * Crawl every page currently in $this->queue, then refresh the queue and
  * keep going until the depth limit, the time limit, or an empty queue stops it.
  *
  * For each queued page this method:
  *   - skips it (recording body "skipped") when its content type is not text/*,
  *   - records the images found on the page and, when SAVE_IMAGES is enabled,
  *     downloads the ones that pass the size and content-type checks,
  *   - inserts or updates a row for every link found on the page,
  *   - marks the page itself as crawled and removes it from the queue.
  *
  * The time limit is only checked after the whole queue has been processed,
  * not between pages.
  *
  * @param int $max_depth     Maximum number of queue passes to run; 0 means no limit.
  * @param int $current_depth Number of queue passes already completed
  *                           (incremented once per call before recursing).
  * @return void
  */
 public function crawl($max_depth = 0, $current_depth = 0)
 {
     // Begin the loop through each URL row
     foreach ($this->queue as $k => $page) {
         // Make sure it's a crawlable format
         $ctype = CrawlerRequest::getContentType($page['url']);
         if (strpos($ctype, "text/") === false) {
             // array_pop() takes its argument by reference, so the exploded
             // URL must live in a real variable first (passing explode()'s
             // result directly raises "Only variables should be passed by
             // reference").
             $urlSegments = explode("/", $page['url']);
             $bn = array_pop($urlSegments);
             $this->addOutput("Skipping {$bn} - ({$ctype}).");
             // Update the record for the page we just skipped
             CrawlerPDO::updateRow(array("title" => $page['title'], "url" => $page['url'], "body" => "skipped", "depth" => CrawlerPDO::getDepthOfUrl($page['url']), "crawled" => 1));
             continue;
         }
         // Get the depth of the current item
         $depth = CrawlerPDO::getDepthOfUrl($page['url']);
         // Get the page body
         $body = CrawlerRequest::request($page['url']);
         // Get a new instance of our HTML parser
         $parser = new CrawlerParser($body, $page['url']);
         // Add images to database
         $images = $parser->getImages();
         CrawlerPDO::addImages($images, $page['url']);
         // Download images if configured
         if ($this->config['SAVE_IMAGES'] === true) {
             foreach ($images as $image) {
                 // Check download size
                 if (!empty($this->config['MIN_IMAGE_SIZE'])) {
                     $size = CrawlerRequest::getFileSize($image);
                     if ($size < $this->config['MIN_IMAGE_SIZE']) {
                         continue;
                     }
                 }
                 $ctype = CrawlerRequest::getContentType($image);
                 // Skip files that don't have an explicit image content type
                 if (strpos($ctype, "image/") === false) {
                     continue;
                 }
                 // Get the extension from the content type's subtype
                 $ext = explode("/", $ctype);
                 $ext = $ext[1];
                 // Build the target file name from the URL's alphanumeric characters
                 $fn = preg_replace("/[^A-Za-z0-9 ]/", '', $image);
                 $filename = realpath(dirname(__FILE__)) . "/media/cj_{$fn}.{$ext}";
                 // Get the image if we don't already have it
                 if (!file_exists($filename)) {
                     // No extra request parameters; kept in a variable rather
                     // than assigned inside the argument list.
                     $requestParams = array();
                     CrawlerRequest::request($image, $requestParams, $filename);
                 }
             }
         }
         /* Crawl result contains two things we need...
          *   - 1) Info needed to update the current $page in the $queue, and
          *   - 2) A new list of links
          *  Each of the new links will be checked to see if they exist in 
          *  the table yet, if they do they will be updated with referrer 
          *  information, etc. If the new link doesn't exist it will be added
          *  to the table to be crawled next time the queue is updated.
          */
         $crawlResult = array("body" => $parser->getPlaintext(), "links" => $parser->getLinks(), "depth" => $depth + 1);
         // Loop through and check and update or insert each new link
         foreach ($crawlResult['links'] as $link) {
             // If the URL was already discovered
             if (CrawlerPDO::URLDiscovered($link['url'])) {
                 CrawlerPDO::updateRow(array("title" => $link['title'], "url" => $link['url'], "linked_from" => CrawlerPDO::getURLID($page['url']), "depth" => $crawlResult['depth']));
             } else {
                 CrawlerPDO::insertRow(array("url" => $link['url'], "title" => $link['title'], "linked_from" => CrawlerPDO::getURLID($page['url']), "depth" => $crawlResult['depth']));
             }
         }
         // Update the record for the page we just crawled
         CrawlerPDO::updateRow(array("title" => $page['title'], "url" => $page['url'], "body" => $crawlResult['body'], "depth" => $depth, "crawled" => 1));
         // Add some output
         $this->addOutput("Found " . count($crawlResult['links']) . " links on {$page['url']}.");
         // Pop this item off the queue
         unset($this->queue[$k]);
     }
     // Queue is empty!
     // Increment the depth counter
     $current_depth++;
     // Stop if a time limit is configured (> 0) and has been exceeded
     if (time() > $this->started + $this->timelimit && $this->timelimit > 0) {
         $this->addOutput("Ran for " . (time() - $this->started) . " seconds, timeout set to " . $this->timelimit . ".");
         return;
     }
     // Refresh the queue and keep going?
     if ($max_depth == 0 || $max_depth > $current_depth) {
         $this->queue = CrawlerPDO::getNextURLs();
         if (!empty($this->queue)) {
             $this->crawl($max_depth, $current_depth);
         }
     }
 }