#!/usr/bin/php
<?php
require_once 'Classes/Autoloader.php';

# Breadth-first crawl queue, seeded with the URL given on the command line.
$queue = array();
array_push($queue, $argv[1]);

# Binary/archive extensions that are never worth parsing for links.
$no_parse_extensions = array("zip", "jpg", "png", "gif", "jpeg", "jfif", "dmg",
    "exe", "rar", "7z", "tar", "gz", "bzip", "swf", "cab", "mp3", "wav",
    "midi", "tgz", "reg", "msi", "pkg", "ogg", "ogm", "mpg", "avi", "divx",
    "m4a", "aiff", "lzw", "psd", "rm", "wmv", "wma");

# Only crawl pages on the same host as the starting URL.
$base_url_host = parse_url($queue[0], PHP_URL_HOST);

$i = 0;
while ($url = $queue[$i]) {
    echo "Downloading: {$url}\n";
    $document = new simple_html_dom();
    $html = \SpiderMonkey\Downloader::getInstance()->execute($url);
    if (!$html) {
        echo "Error downloading URL: " . \SpiderMonkey\Downloader::getInstance()->getStatus() . "\n";
        $i++; # skip this URL; without the increment the loop would retry it forever
        continue;
    }
    $document->load($html);

    # Honour a <base href="..."> element if the page declares one.
    $base = $document->find('head > base', 0);
    $base_url = $base ? $base->href : $url;
    echo "Base URL: {$base_url}\n";

    $links = $document->find('a');
    $unique_links = 0;
    echo "Found Links: " . count($links) . "\n";
    foreach ($links as $link) {
        $href = $link->href;
        if (strpos($href, '#') === 0) {
            # Link is something like '#item': skip it entirely, we already
            # have the guts of this page.
            continue;
        }
        if (strpos($href, '#') !== FALSE) {
            # Link is something like '/page#item' (want to keep part of it
            # though): keep everything before the fragment.
            $href = substr($href, 0, strpos($href, '#'));
        }
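        # --- Sketch (assumption): the original listing is cut off above. ---
        # Given the variables the script already sets up ($base_url,
        # $base_url_host, $no_parse_extensions, $queue, $unique_links), the
        # remaining per-link steps would plausibly look like this: skip
        # unparseable file types, resolve relative links, stay on the starting
        # host, and queue each new URL once.
        $path = (string) parse_url($href, PHP_URL_PATH);
        $extension = strtolower(pathinfo($path, PATHINFO_EXTENSION));
        if ($extension && in_array($extension, $no_parse_extensions)) {
            continue; # binary/archive content: nothing to parse for links
        }
        if (strpos($href, 'http') !== 0) {
            # Relative link: resolve it against the page's base URL
            # (naive string resolution; good enough for a sketch).
            $href = rtrim(dirname($base_url), '/') . '/' . ltrim($href, '/');
        }
        if (parse_url($href, PHP_URL_HOST) !== $base_url_host) {
            continue; # off-site link: stay on the starting host
        }
        if (!in_array($href, $queue)) {
            $queue[] = $href;
            $unique_links++;
        }
    }
    echo "Unique Links: {$unique_links}\n";
    $i++; # move on to the next queued URL
}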
function download($url) {
    echo "DOWNLOAD: {$url}\n";
    $html = \SpiderMonkey\Downloader::getInstance()->execute($url, NULL, PROXY_TYPE, PROXY_ADDRESS);
    if (!$html) {
        return NULL; # download failed: don't feed FALSE to the parser
    }
    $document = new simple_html_dom();
    $document->load($html);
    return $document;
}
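# Usage sketch (assumption): PROXY_TYPE and PROXY_ADDRESS are expected to be
# constants defined elsewhere in the project (e.g. by the autoloaded
# configuration). The values below are hypothetical placeholders used only to
# make the example self-contained.
define('PROXY_TYPE', CURLPROXY_HTTP); # standard PHP cURL proxy-type constant
define('PROXY_ADDRESS', '127.0.0.1:8118');

$document = download('http://example.com/');
if ($document) {
    echo $document->find('title', 0)->plaintext . "\n";
}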
#!/usr/bin/php <?php require_once "Classes/Autoloader.php"; $subreddit = 'pics'; $current_url = "http://www.reddit.com/r/{$subreddit}/over18?dest=%2Fr%2F{$subreddit}%2F"; $image_link_selector = "#siteTable a.thumbnail"; $next_link_selector = "p.nextprev a"; $acceptable_extensions = array('jpg', 'jpeg', 'png', 'gif', 'jfif'); $post = array('uh' => '', 'over18' => 'yes'); $pages_to_download = 100; $download_directory = "downloads"; for ($i = 1; $i <= $pages_to_download; $i++) { echo "Page: {$i}/{$pages_to_download}\n"; echo "URL: {$current_url}\n"; $document = new simple_html_dom(); $html = \SpiderMonkey\Downloader::getInstance()->execute($current_url, $post); $document->load($html); $images = $document->find($image_link_selector); if (!$images) { echo $document->find('title', 0) . "\n"; die("Couldn't find any images.\n"); } foreach ($images as $image) { $href = $image->href; $data = pathinfo($href); if (in_array($data['extension'], $acceptable_extensions)) { echo "Grab: {$data['basename']}\n"; $picture = file_get_contents($href); $fp = fopen($download_directory . '/' . $data['basename'], 'w'); fwrite($fp, $picture); fclose($fp);