Code example #1
0
File: index.php  Project: amlun/spidermonkey
#!/usr/bin/php
<?php 
// Breadth-first web crawler: seeds a queue with the URL passed on the command
// line (argv[1]) and walks every <a> link found on each downloaded page.
// NOTE: this excerpt is truncated — the foreach/while bodies continue beyond
// the visible lines.
require_once 'Classes/Autoloader.php';
$queue = array();
// Seed the crawl queue with the command-line URL (no argv[1] presence check).
array_push($queue, $argv[1]);
// File extensions treated as binary/media content that must not be parsed as
// HTML. NOTE(review): "zip" appears twice in this list.
$no_parse_extensions = array("zip", "jpg", "png", "gif", "jpeg", "jfif", "dmg", "exe", "rar", "7z", "tar", "gz", "bzip", "swf", "cab", "mp3", "wav", "midi", "tgz", "reg", "msi", "zip", "pkg", "ogg", "ogm", "mpg", "avi", "divx", "m4a", "aiff", "lzw", "psd", "rm", "wmv", "wma");
// Host of the seed URL — presumably used later to keep the crawl on the same
// site; the consuming code is outside this excerpt. TODO confirm.
$base_url_host = parse_url($queue[0], PHP_URL_HOST);
$i = 0;
// Walk the queue by index; the loop ends when $queue[$i] is unset or falsy.
// The increment of $i is not visible in this excerpt.
while ($url = $queue[$i]) {
    echo "Downloading: {$url}\n";
    $document = new simple_html_dom();
    // Downloader is a project singleton; returns falsy on failure (see check below).
    $html = \SpiderMonkey\Downloader::getInstance()->execute($url);
    if (!$html) {
        echo "Error downloading URL: " . \SpiderMonkey\Downloader::getInstance()->getStatus();
        // NOTE(review): continuing without advancing $i re-fetches the same
        // URL — a persistently failing download makes this loop spin forever.
        continue;
    }
    $document->load($html);
    // Honour an explicit <base href> for resolving relative links; otherwise
    // the page's own URL is the base.
    $base = $document->find('head > base', 0);
    $base_url = $base ? $base->href : $url;
    echo "Base URL: {$base_url}\n";
    $links = $document->find('a');
    $unique_links = 0;
    echo "Found Links: " . count($links) . "\n";
    foreach ($links as $link) {
        $href = $link->href;
        if (strpos($href, '#') === 0) {
            # Link is something like '#item', skip it entirely, we already have the guts of this page
            continue;
        }
        if (strpos($href, '#') !== FALSE) {
            # Link is something like '/page#item' (want to keep part of it though)
Code example #2
0
File: zillow.php  Project: amlun/spidermonkey
/**
 * Fetch a URL through the configured proxy and parse the response
 * into a simple_html_dom document.
 *
 * @param string $url Address to download.
 * @return simple_html_dom Parsed DOM for the fetched page.
 */
function download($url)
{
    echo "DOWNLOAD: {$url}\n";
    // Project-level PROXY_TYPE / PROXY_ADDRESS constants route the request.
    $page = \SpiderMonkey\Downloader::getInstance()->execute($url, NULL, PROXY_TYPE, PROXY_ADDRESS);
    $dom = new simple_html_dom();
    // NOTE(review): no failure check here — if execute() returns a falsy
    // value it is still handed to load(); confirm callers tolerate that.
    $dom->load($page);
    return $dom;
}
Code example #3
0
#!/usr/bin/php
<?php 
// Reddit image scraper: pages through /r/{$subreddit}, follows thumbnail
// links, and saves images with an accepted extension into ./downloads.
// NOTE: this excerpt is truncated — the foreach/for bodies continue beyond
// the visible lines (including, presumably, the next-page advance using
// $next_link_selector).
require_once "Classes/Autoloader.php";
$subreddit = 'pics';
// The over18?dest=... entry URL, combined with the $post payload below,
// acknowledges the NSFW interstitial before listing pages are served.
$current_url = "http://www.reddit.com/r/{$subreddit}/over18?dest=%2Fr%2F{$subreddit}%2F";
// CSS selectors for old-reddit markup: thumbnail anchors and the next-page link.
$image_link_selector = "#siteTable a.thumbnail";
$next_link_selector = "p.nextprev a";
$acceptable_extensions = array('jpg', 'jpeg', 'png', 'gif', 'jfif');
// POST body for the over-18 confirmation form ('uh' is reddit's form token,
// sent empty here).
$post = array('uh' => '', 'over18' => 'yes');
$pages_to_download = 100;
$download_directory = "downloads";
for ($i = 1; $i <= $pages_to_download; $i++) {
    echo "Page: {$i}/{$pages_to_download}\n";
    echo "URL: {$current_url}\n";
    $document = new simple_html_dom();
    $html = \SpiderMonkey\Downloader::getInstance()->execute($current_url, $post);
    $document->load($html);
    $images = $document->find($image_link_selector);
    if (!$images) {
        // Print the page <title> as a diagnostic (e.g. an error/captcha page)
        // before bailing out.
        echo $document->find('title', 0) . "\n";
        die("Couldn't find any images.\n");
    }
    foreach ($images as $image) {
        $href = $image->href;
        // NOTE(review): pathinfo() omits the 'extension' key for hrefs with
        // no extension — $data['extension'] below would raise a notice then.
        $data = pathinfo($href);
        // NOTE(review): in_array without strict:true; also no check that
        // file_get_contents/fopen succeeded, and mode 'w' (not 'wb') is used
        // for binary image data.
        if (in_array($data['extension'], $acceptable_extensions)) {
            echo "Grab: {$data['basename']}\n";
            $picture = file_get_contents($href);
            $fp = fopen($download_directory . '/' . $data['basename'], 'w');
            fwrite($fp, $picture);
            fclose($fp);