cbzget.sh.php

#!/usr/bin/env php
<?php
/**
 * @file Make a zip of images from a web page
 *
 * Given a URL of a page containing a gallery,
 * download the images,
 * zip them into an album
 */
/**
 * Requirements
 *
 * Install pear
 * sudo pear install Console_Commandline Net_URL2 Log
 */

// User-tunable defaults

// Size threshold in bytes: a saved download smaller than this is treated
// as a thumbnail and thrown away.
$min_filesize = 1024 * 30;

// When an image leads to something that is not a jpeg, visit that link in
// case it is a gallery page wrapping the real image.
$follow_gallery_pages = true;

// Upper bound checked before queueing another 'next' page, so a runaway
// pager cannot keep the script going indefinitely.
$maximum_follow_distance = 4;

// Turn on $rename_files when the gallery's own filenames are meaningless;
// files are then named after the title of their gallery page where one is
// available.
$rename_files = false;

// A classic CBZ keeps every file in the archive root, which makes a mess
// when unpacked. When set, files are kept in a folder inside the package.
$use_subfolder = true;


#######################################
require_once 'Console/CommandLine.php';
require_once 'Log.php';
require_once 'Net/URL2.php';

// Hosts whose links are skipped while scanning a page for images.
$blacklist = get_domain_blacklist();

// One console logger shared by the whole script, created with
// PEAR_LOG_INFO as its level.
global $logger;
$logger = Log::singleton('console', '', 'ident', null, PEAR_LOG_INFO);

/**
 * Hand a message to the shared console logger.
 *
 * @param string $message
 *   Text to log.
 * @param int $priority
 *   A PEAR_LOG_* priority; PEAR_LOG_INFO when omitted.
 */
function print_log($message, $priority = PEAR_LOG_INFO)
{
  global $logger;

  $logger->log($message, $priority);
}


// Describe and parse the command line.
//
// Positional arguments: one or more gallery page URLs.
// Flags: -q/--quiet, -v/--verbose, -r/--rename.

$parser = new Console_CommandLine();
$parser->description = 'Downloads the images found on one or more gallery web pages and bundles them into a CBZ (zip) archive.';
$parser->version = '1.1';
$parser->addArgument('url', array('multiple' => TRUE, 'required' => TRUE));
$parser->addOption('quiet', array(
    'short_name'  => '-q',
    'long_name'   => '--quiet',
    'description' => "Don't print status messages to stdout",
    'action'      => 'StoreTrue'
));
$parser->addOption('verbose', array(
    'short_name'  => '-v',
    'long_name'   => '--verbose',
    'description' => "Display extra log messages",
    'action'      => 'StoreTrue'
));
$parser->addOption('rename', array(
    'short_name'  => '-r',
    'long_name'   => '--rename',
    'description' => "Rename downloaded files to an autogenerated style.",
    'action'      => 'StoreTrue'
));


global $commandline_input;
try {
    $commandline_input = $parser->parse();
} catch (Exception $exc) {
    $parser->displayError($exc->getMessage());
    // Safety net: never carry on with an unparsed command line, even if
    // displayError() returns instead of terminating the script.
    exit(1);
}
$options = $commandline_input->options;

// --quiet is applied after --verbose, so it wins when both are given.
if (! empty($options['verbose'])) { $logger->setMask(PEAR_LOG_ALL); }
if (! empty($options['quiet'])) { $logger->setMask(PEAR_LOG_NONE); }
// The flag can only switch renaming on; the configured default stays otherwise.
if (! empty($options['rename'])) { $rename_files = TRUE; }
$urls = $commandline_input->args['url'];

#print_r($commandline_input);
////////////////////////////////////////////////////////////////////////////
// START
$pages_processed = 0;
// Every file kept for the archive, keyed by its full path in the temp
// folder. Each entry holds the in-archive 'filename', the source 'url' and
// the 'title' suggested by its gallery page (may be empty).
$contents = array();
// Ensure uniqueness of filenames by counting if the title starts to repeat.
$title_count = array();

// If there is more than one URL, they get bundled into the same archive.
// This allows scraping paged galleries.

// Use an ArrayIterator so 'next' pages can be appended to the list while it
// is being walked.
$todo = new ArrayIterator($urls);
foreach ($todo as $url) {
  try {
    $base_url = new Net_URL2($url);

    print_log("Fetching $url");
    $page_source = file_get_contents($url);
    if ($page_source === FALSE || $page_source === '') {
      // Nothing to parse. Handing an empty document to loadHTML() is an
      // error, so give up on this page here.
      throw new RuntimeException('the page could not be fetched or was empty');
    }
    $doc = new DOMDocument();
    // Real-world HTML is rarely valid; silence the parser's complaints.
    @$doc->loadHTML($page_source);

    // Calculate the filename based off the URL, or scraped title.
    // Only calc this the first time.
    if (empty($safe_name)) {
      $original_url = $base_url;
      $safe_name = safename($base_url->host);
      // Drop a leading 'www.' (and only that) from the host.
      if (strpos($safe_name, 'www.') === 0) {
        $safe_name = substr($safe_name, 4);
      }
      if ($base_url->path) {
        $safe_name .= '-' .  safename($base_url->path);
      }

      // A scraped <title> (the last one found) takes precedence over the
      // URL-derived name.
      $tags = $doc->getElementsByTagName('title');
      foreach ($tags as $tag) {
        $page_title = trim($tag->textContent);
      }
      if (! empty($page_title)) {
        $safe_name = safename($page_title);
      }

      // Working folder for the downloads. It may be left over from an
      // interrupted run, so only create it when it is missing.
      $dirname = "/tmp/$safe_name";
      if (! is_dir($dirname)) {
        mkdir($dirname);
      }

      // The title written into the archive metadata is rebuilt from the URL.
      $page_title = preg_replace('/[^a-zA-Z0-9_\-]+/', ' ', $base_url->path) . ' - ' . $base_url->host;
    }
    // Identification setup complete, start scraping.

    // Discard parts of the page that may be distracting.
    // Wordpress etc.
    if ($unwanted = $doc->getElementById('sidebar')) {
      $unwanted->parentNode->removeChild($unwanted);
      print_log("Discarded sidebar for cleaner parsing.");
    }


    $tags = $doc->getElementsByTagName('img');

    foreach ($tags as $tag) {
      $suggested_image_title = "";
      // If the image is surrounded by an a tag
      // And that links directly to a jpeg, get that instead
      $tag_parent = $tag->parentNode;
      while ($tag_parent && $tag_parent->nodeName != 'a') {
        $tag_parent = $tag_parent->parentNode;
      }
      // resolve() hands back a URL object; work with its string form from
      // here on.
      if ($tag_parent && $tag_parent->nodeName == 'a') {
        $tag_src = (string) $base_url->resolve($tag_parent->getAttribute('href'));
      }
      else {
        $tag_src = (string) $base_url->resolve($tag->getAttribute('src'));
      }

      // Check it's an image
      // Not every URL has a host or a path (and parse_url() returns FALSE
      // for badly malformed ones), so read the parts defensively.
      $url_parts = parse_url($tag_src);
      $host = isset($url_parts['host']) ? $url_parts['host'] : '';
      if (in_array($host, $blacklist, TRUE)) {
        continue;
      }

      $filename = basename(isset($url_parts['path']) ? $url_parts['path'] : '');
      $suffix = pathinfo($filename, PATHINFO_EXTENSION);
      if (! in_array(strtolower($suffix), array('jpg', 'jpeg'), TRUE)) {
        // Maybe it's a gallery page that holds an image.
        $maybe_gallery_img = NULL;
        if ($follow_gallery_pages) {
          print_log("Following a link to $tag_src to see if it's a dedicated image page.");
          $maybe_gallery_img = find_image_in_gallery_page($tag_src, $suggested_image_title);
        }
        if (! $maybe_gallery_img) {
          print_log("$tag_src is a $suffix not a jpeg, skipping it");
          continue;
        }
        // Else we may have found an image after all.
        $tag_src = (string) $maybe_gallery_img;
      }

      // Need to recalculate this if the tag_src has changed from a gallery
      // page to the actual gallery file now.
      $url_parts = parse_url($tag_src);
      $filename = basename(isset($url_parts['path']) ? $url_parts['path'] : '');
      $suffix = pathinfo($filename, PATHINFO_EXTENSION);

      // Some image servers are /image.php?...&n=07.jpg so we need to grab the n value as the filename.
      if ($filename == 'image.php' && isset($url_parts['query'])) {
        parse_str($url_parts['query'], $args);
        if (isset($args['n']) && is_string($args['n'])) {
          $filename = $args['n'];
          // The extension must follow the real filename, not 'image.php'.
          $suffix = pathinfo($filename, PATHINFO_EXTENSION);
        }
      }

      print_log("Fetching $tag_src");
      // Take care to keep the filename but drop args, anchors and such.
      if ($rename_files && !empty($suggested_image_title)) {

        // Ensure uniqueness - gallery pages may not return useful titles each time.
        if (isset($title_count[$suggested_image_title])) {

          if ($title_count[$suggested_image_title] == 1) {
            // Second sighting of this title: retroactively give the first
            // file a -01 suffix. $contents is keyed by full path, and the
            // first file may have been discarded, so check before renaming.
            $previous_filename = safename($suggested_image_title . '.' . $suffix);
            $replacement_filename = safename($suggested_image_title . '-01.' . $suffix);
            $previous_path = $dirname . '/' . $previous_filename;
            $replacement_path = $dirname . '/' . $replacement_filename;
            if (isset($contents[$previous_path]) && rename($previous_path, $replacement_path)) {
              // Carry the earlier file's own url and title over to its new name.
              $previous = $contents[$previous_path];
              unset($contents[$previous_path]);
              $previous['filename'] = $replacement_filename;
              $contents[$replacement_path] = $previous;
              print_log("Renamed the first occurance to $replacement_filename");
            }
          }

          $title_count[$suggested_image_title] ++;
          $suggested_image_title .= '-' . sprintf("%02d", $title_count[$suggested_image_title]);
        }
        else {
          $title_count[$suggested_image_title] = 1;
        }

        $filename = $suggested_image_title . '.' . $suffix;
      }

      $safe_filename = safename($filename);
      if ($safe_filename === '') {
        print_log("$tag_src has no usable filename, skipping it");
        continue;
      }
      $save_as = $dirname . '/' . $safe_filename ;
      if (! is_file($save_as)) {
        $file_source = file_get_contents($tag_src);
        if ($file_source === FALSE) {
          print_log("Could not fetch $tag_src, skipping it");
          continue;
        }
        file_put_contents($save_as, $file_source);
        print_log("Saved to $save_as");
        // Discard thumbnails if any were picked up. (e-hentai)

        if (filesize($save_as) < $min_filesize) {
          print_log("File size of $save_as is too small, throwing it away.");
          unlink($save_as);
        }
      }
      else {
        print_log("Did not save fetched $tag_src as '$filename'");
      }
      if (is_file($save_as)) {
        $contents[$save_as] = array(
          'filename' => $safe_filename,
          'url' => $tag_src,
          'title' => $suggested_image_title,
        );
      }
    } // Finished fetching each image into the temp dir.

    // Support paging.
    // If there is a link looking like a 'next', then add that to the list of
    // page URLs that need processing (may loop forever if you get it wrong)
    $tags = $doc->getElementsByTagName('a');
    // Only allow one 'next' to be detected per page (the last match wins).
    $nextpage_link = NULL;
    foreach ($tags as $tag) {
      $link_text = trim($tag->textContent);
      if ($link_text == '>' || strtolower($link_text) == 'next') {
        $nextpage_link = $tag;
      }
    }
    if ($nextpage_link && $pages_processed <= $maximum_follow_distance) {
      print_log("Looks like I found a next-pager. Adding the 'next' URL to the TO-DO list.");
      $todo->append((string) $base_url->resolve($nextpage_link->getAttribute('href')));
    }

    $pages_processed ++;

  } catch (Exception $exc) {
    // One bad page must not stop the remaining URLs from being processed.
    print_log("Processing $url failed: " . $exc->getMessage(), PEAR_LOG_ERR);
  }
} // Each URL

// Downloaded what we need,

if (! empty ($contents)) {
  // make the zip (current directory)
  $zip = new ZipArchive();
  $zipfilename = $safe_name .".cbz";

  if ($zip->open($zipfilename, ZipArchive::CREATE)!==TRUE) {
    // Do not leave the downloads behind when bailing out.
    rrmdir($dirname);
    exit("cannot open <$zipfilename>\n");
  }
  // One "name, 'url'" line per archived file.
  $manifest = '';
  foreach ($contents as $filepath => $file) {
    // Traditional CBZ have all files in the root,
    // but that explodes if unpacked. Retain the folder inside the package.
    $innerfilename = $file['filename'];
    if ($use_subfolder) {
      $innerfilename = $safe_name . '/' . $file['filename'];
    }
    $zip->addFile($filepath, $innerfilename);
    $manifest .= "{$innerfilename}, '{$file['url']}'\n";
    // Remember where each file came from in its per-entry comment.
    $zip->setCommentName($innerfilename, $file['url']);
  }
  // ComicBookInfo-style metadata. The variable values go through
  // json_encode() so a quote or backslash in them cannot break the JSON.
  // The timestamp uses the 24-hour clock so it is unambiguous.
  $zipinfo = '{
  "appID":"cbzget/001",
  "lastModified":"'. date("Y-m-d H:i:s O") . '",
  "ComicBookInfo/1.0":{
    "title":'. json_encode($page_title) . ',
    "series":'. json_encode((string) $original_url->host) .',
    "publisher":'. json_encode((string) $original_url) .',
    "url":'. json_encode((string) $original_url) .',
    "publicationYear":'. date('Y') .',
    "publicationMonth":'. date('n') .',
    "tags":["Downloads"]
  }}';

  $zip->addFromString("ZipInfo.txt", $zipinfo);
  $zip->addFromString("manifest.txt", $manifest);
  $zip->setArchiveComment($zipinfo);
  // Read the entry count while the archive is still open; close() releases it.
  $numfiles = $zip->numFiles;
  if ($zip->close()) {
    print_log("Created zip file '$zipfilename'");
    print_log("numfiles: {$numfiles}, size: " . format_bytes(filesize($zipfilename)));
  }
  else {
    print_log("Could not write zip file '$zipfilename'", PEAR_LOG_ERR);
  }
}
else {
  print_log("No valid files retrieved. With no contents to bundle, I'm going to skip creating that zip file for you.");
}
// Clean up
// The temp folder only exists once the first page has been set up.
if (isset($dirname)) {
  rrmdir($dirname);
}

///////////////////////////////////////////////////////////////////////

/**
 * Hosts whose links should never be followed.
 *
 * These sites commonly show up as a small icon wrapped in a link, which
 * looks exactly like a thumbnail pointing at a gallery page until the link
 * is actually visited.
 *
 * @return array
 *   List of host names.
 */
function get_domain_blacklist() {
  $ignored_hosts = array();

  $ignored_hosts[] = 'www.facebook.com';
  $ignored_hosts[] = 'feeds.feedburner.com';
  $ignored_hosts[] = 'twitter.com';
  $ignored_hosts[] = 'www.addthis.com';
  $ignored_hosts[] = 'ehgt.org';

  return $ignored_hosts;
}

/**
 * Guess which image on a page is the most significant one.
 *
 * Fetches $url, parses it as HTML and runs a handful of site-specific and
 * generic heuristics over every <img> that has a src. When several images
 * match, the last one in document order wins.
 *
 * @param string $url
 *   Address of the candidate gallery page.
 * @param string $title
 *   Set (by reference) to the text of the page's <title> when it has one;
 *   left untouched otherwise.
 *
 * @return mixed
 *   The chosen image's src resolved against $url (as returned by
 *   Net_URL2::resolve()), or NULL when the page could not be fetched, was
 *   empty, or no image matched.
 */
function find_image_in_gallery_page($url, &$title = "") {
  $page_source = file_get_contents($url);
  // A failed or empty fetch leaves nothing to parse, and an empty document
  // must not be handed to loadHTML().
  if ($page_source === FALSE || $page_source === '') {
    return NULL;
  }
  $base_url = new Net_URL2($url);

  $doc = new DOMDocument();
  // Real-world HTML is rarely valid; silence the parser's complaints.
  @$doc->loadHTML($page_source);

  $tags = $doc->getElementsByTagName('title');
  foreach ($tags as $tag) {
    $title = trim($tag->textContent);
  }

  // Element ids used for the main image by known hosts:
  //   'img'         http://g.e-hentai.org/
  //   'image'       http://danbooru.donmai.us/
  //   'cursor_lupa' http://imgdino.com/
  $known_ids = array('img', 'image', 'cursor_lupa');

  $favorite_image = NULL;
  foreach ($doc->getElementsByTagName('img') as $img) {
    $img_url = $img->getAttribute('src');
    if ($img_url === '') {
      // Without a src there is nothing that could be downloaded.
      continue;
    }
    // Relative srcs have no host; parse_url() then yields NULL (or FALSE
    // for a malformed URL).
    $img_host = parse_url($img_url, PHP_URL_HOST);
    $parent = $img->parentNode;

    $has_known_id = in_array($img->getAttribute('id'), $known_ids, TRUE);

    // http://imagetwist.com/
    $is_imagetwist = $img->getAttribute('class') == 'pic'
      && is_string($img_host)
      && preg_match('/imagetwist\.com/', $img_host);

    // Is it big?
    $is_big = intval($img->getAttribute('height')) >= 350
      && intval($img->getAttribute('width')) >= 350;

    // An embedded image that also just links to itself is often a main image.
    // http://imgcash.net
    // http://picstwist.com
    $links_to_itself = $parent
      && $parent->nodeName == 'a'
      && $parent->getAttribute('href') == $img_url;

    // Other patterns for scraping?

    if ($has_known_id || $is_imagetwist || $is_big || $links_to_itself) {
      $favorite_image = $img;
    }
  }

  if ($favorite_image) {
    return $base_url->resolve($favorite_image->getAttribute('src'));
  }
  return NULL;
}


///////////////////////////////////////////////////////////////////////

/**
 * Reduce a string to characters that are safe to use in a filename.
 *
 * Every run of characters other than ASCII letters, digits and
 * _ - . , ( ) [ ] is collapsed into a single underscore.
 */
function safename($string) {
  $unsafe_run = '/[^a-zA-Z0-9_\-\.\,\(\)\[\]]+/';
  $cleaned = preg_replace($unsafe_run, '_', $string);

  return $cleaned;
}

/**
 * Render a byte count with a unit, e.g. 1536 becomes "1.5 KB".
 *
 * The value is divided by 1024 until it drops below 1024 or the largest
 * unit (TB) is reached, then rounded to two decimals.
 */
function format_bytes($size) {
    $units = array(' B', ' KB', ' MB', ' GB', ' TB');

    $step = 0;
    while ($size >= 1024 && $step < 4) {
        $size /= 1024;
        $step++;
    }

    $rounded = round($size, 2);
    return $rounded . $units[$step];
}

/**
 * Recursively delete a file or a directory tree.
 *
 * The directory is listed with scandir() rather than glob() so that hidden
 * files are included and characters such as [ and ] in $path (which
 * safename() allows) are taken literally instead of as wildcards.
 *
 * @param string $path
 *   File or directory to remove.
 *
 * @return bool
 *   TRUE when $path and everything below it was removed, FALSE when $path
 *   does not exist or something could not be deleted.
 */
function rrmdir($path){
  // Files and symlinks are unlinked directly; links are never followed.
  if (is_file($path) || is_link($path)) {
    return @unlink($path);
  }
  if (! is_dir($path)) {
    return FALSE;
  }

  $entries = scandir($path);
  if ($entries === FALSE) {
    return FALSE;
  }

  $all_removed = TRUE;
  foreach ($entries as $entry) {
    if ($entry === '.' || $entry === '..') {
      continue;
    }
    // Keep going after a failure so as much as possible is cleaned up.
    if (! rrmdir($path . '/' . $entry)) {
      $all_removed = FALSE;
    }
  }

  return @rmdir($path) && $all_removed;
}