Exemple #1
0
 /**
  * Crawl one comic page and print a download link for its image.
  *
  * The link text is "<year>_<month>_<day>_<title><ext>": the title comes from
  * the <title> line, the date from the line reached after the
  * 'date_left.gif' marker, and the extension from the image file name.
  * Nothing is printed when the date or the image tag cannot be found.
  *
  * @param string $url Address of the page to crawl.
  * @return void
  */
 public function crawl_page($url)
 {
     $c = new Crawler($url);
     // Title: strip the site prefix, then replace every non-word character
     // so the result can be used inside a file name.
     $c->go_to('<title>');
     $title = Crawler::extract($c->curline, 'PHD Comics: ', '</title>');
     $title = preg_replace('/\\W/', '_', $title);
     // Date, written as month/day/year on the current line after readline(2).
     $c->go_to('date_left.gif');
     $c->readline(2);
     if (!preg_match('/([0-9]+)\\/([0-9]+)\\/([0-9]+)/', $c->curline, $matches)) {
         // Without a date there is no usable file prefix; avoid list() on an empty array.
         $c->close();
         return;
     }
     list(, $month, $date, $year) = $matches;
     $month = str_pad($month, 2, '0', STR_PAD_LEFT);
     $date = str_pad($date, 2, '0', STR_PAD_LEFT);
     $fileprefix = "{$year}_{$month}_{$date}_{$title}";
     // Image URL. The character class stops at a quote or '>' so that a
     // quoted src attribute does not leak its closing quote into the URL.
     $c->go2linewhere('<td bgcolor=#FFFFFF');
     $found = preg_match('/<img src=["\']?([^ "\'>]+)/i', $c->curline, $matches);
     $c->close();
     if (!$found) {
         return;
     }
     $img = $matches[1];
     $filename = basename($img);
     $ext = substr($filename, strrpos($filename, '.'));
     echo "<a href='{$img}'>" . $fileprefix . $ext . "</a><br/>";
     flush();
 }
/**
 * Print one download link per page of a chapter.
 *
 * Page identifiers are the value="" attributes of the <option> lines that
 * follow the 'name="pagejump"' marker (up to </select>). The image directory
 * is taken from the src attribute read after the 'id="nextpage"' marker, and
 * every page is assumed to be a ".jpg" in that directory.
 *
 * Reads the global $prefix for the link text.
 *
 * @param string     $url     Address of the chapter page.
 * @param int|string $chapter Chapter number; padded through Crawler::pad(..., 3).
 * @return void
 */
function crawl_1_chapter($url, $chapter)
{
    global $sitename, $prefix;

    $crawler = new Crawler($url);

    // Collect the page identifiers from the page-jump drop-down.
    $crawler->go_to('name="pagejump"');
    $pageIds = array();
    while ($row = $crawler->readline()) {
        if (Crawler::is_there($row, '<option')) {
            $pageIds[] = Crawler::extract($row, 'value="', '"');
            continue;
        }
        if (Crawler::is_there($row, '</select>')) {
            break;
        }
    }

    // One sample image is enough to learn the directory all pages live in.
    $crawler->go_to('id="nextpage"');
    $crawler->readline();
    $sampleImage = $crawler->getbetween('src="', '"');
    $crawler->close();

    $imageDir = dirname($sampleImage);
    $suffix = '.jpg';
    $chapterLabel = Crawler::pad($chapter, 3);
    foreach ($pageIds as $pageId) {
        echo "<a href='{$imageDir}/{$pageId}{$suffix}'>{$prefix}-{$chapterLabel}-{$pageId}{$suffix}</a><br/>\n";
        flush();
    }
}
/**
 * Print a link for every href found inside the page's entry <div>.
 *
 * Scanning starts at '<div class="entry">' and stops at the first line that
 * contains '</div>' but no href. Both single- and double-quoted href
 * attributes are recognised; single quotes are checked first. Every link is
 * labelled with a name derived from the URL (last character dropped, then
 * passed through Crawler::cutfromlast1 with '/').
 *
 * @param string $url Address of the page to crawl.
 * @return void
 */
function crawl_1_page($url)
{
    echo "URL2 {$url} <br/>\n";
    flush();

    $withoutLastChar = substr($url, 0, strlen($url) - 1);
    $label = html_entity_decode(Crawler::cutfromlast1($withoutLastChar, '/'));

    $crawler = new Crawler($url);
    $crawler->go_to('<div class="entry">');
    while ($row = $crawler->readline()) {
        // Work out which quote character, if any, delimits the href on this line.
        $quote = null;
        if (Crawler::is_there($row, "href='")) {
            $quote = "'";
        } elseif (Crawler::is_there($row, 'href="')) {
            $quote = '"';
        }

        if ($quote !== null) {
            $target = Crawler::extract($row, 'href=' . $quote, $quote);
            echo "<a href='{$target}'>{$label}</a><br/>\n";
            flush();
            continue;
        }

        if (Crawler::is_there($row, '</div>')) {
            break;
        }
    }
    $crawler->close();
}
Exemple #4
0
/**
 * Print a download link for every image of an album page.
 *
 * The markup between the second '<noscript>' marker and '</noscript>' is
 * joined into one string, every src="" value is extracted from it, and the
 * '/s128/' path segment of each URL is replaced with '/d/'.
 *
 * @param string       $url   Address of the album page.
 * @param string|false $alias Link text to use for every image; when falsy the
 *                            URL-decoded base name of each image is used.
 * @return void
 */
function crawl_album($url, $alias = false)
{
    $crawler = new Crawler($url);
    // The marker is searched for twice on purpose: the second hit is the one used.
    $crawler->go_to('<noscript>');
    $crawler->go_to('<noscript>');
    $crawler->readline();

    // Concatenate the trimmed lines up to the closing tag.
    $markup = '';
    while ($row = $crawler->readline()) {
        if (Crawler::is_there($row, '</noscript>')) {
            break;
        }
        $markup .= trim($row);
    }
    $thumbs = Crawler::extract_to_array($markup, 'src="', '"');
    $crawler->close();

    /* Alternative kept from the original: use the description as the file name
    	preg_match_all('/<img src="([^"]+)"><\\/a><p><a [^>]+>([^<]+)<\\/a>/', $target, $match);
    	//file_put_contents('picasaweb.out', print_r($match, true));exit;
    	foreach ($match[1] as $i => $uri) {
    		$info = pathinfo(basename($uri));
    		$ext = $info['extension'];
    		$name = $match[2][$i];
    		$img = str_replace('/s128/', '/', $uri);
    		echo "<a href='$img'>$name.$ext</a><br />\n";
    	}
    	exit;
    	*/

    foreach ($thumbs as $thumb) {
        $full = str_replace('/s128/', '/d/', $thumb);
        $label = $alias ? $alias : urldecode(basename($full));
        echo "<a href='{$full}'>{$label}</a><br/>\n";
        flush();
    }
}
 /**
  * Walk a paginated gallery starting at $this->url and print a link for
  * every image found on each page.
  *
  * Every link is labelled with the path segment that precedes "/page/1" in
  * the start URL (empty when the URL does not have that shape). Thumbnail
  * URLs are rewritten: the 'thumb_100_' part is removed when the request
  * carries a truthy "big" query parameter, otherwise it is replaced with
  * 'normal__'. Paging continues while the line after 'class="pager"'
  * contains 'Pager_next'.
  *
  * @return void
  */
 public function go()
 {
     $host = 'http://lu.scio.us';
     $url = $this->url;

     // Guard the capture: reading $m[1] after a failed match raises a warning.
     $text = preg_match('/\\/([^\\/]+)\\/page\\/1/', $url, $m) ? $m[1] : '';

     // Decide once how thumbnails are rewritten; empty() avoids the need for '@'.
     $replacement = empty($_GET['big']) ? 'normal__' : '';

     $finish = false;
     while (!$finish) {
         echo $url . "<br/>\n";
         flush();
         $c = new Crawler($url);
         $c->go_to('id="pid_');
         while ($line = $c->readline()) {
             if (Crawler::is_there($line, 'src="')) {
                 $img = Crawler::extract($line, 'src="', '"');
                 $img = str_replace('thumb_100_', $replacement, $img);
                 echo "<a href='{$img}'>{$text}</a><br/>\n";
                 flush();
             } elseif (Crawler::is_there($line, '</ul>')) {
                 break;
             }
         }
         // Follow the "next" link if the pager offers one, otherwise stop.
         $c->go_to('class="pager"');
         $c->readline();
         if (Crawler::is_there($c->curline, 'Pager_next')) {
             $url = $host . Crawler::extract($c->curline, '<a rel="next" href="', '"');
         } else {
             $finish = true;
         }
         $c->close();
     }
 }
 /**
  * Print one download link per page of an online reader.
  *
  * "/read" is appended to $this->url when missing (the property is modified).
  * The page is fetched with curl and a browser user agent. The JSON assigned
  * to "var data" supplies the list of thumbnails, and the string returned by
  * the page's imgpath() JavaScript function supplies the URL template, in
  * which "' + x + '" is replaced by the padded page number. Links are
  * labelled with the name of the directory that contains "/read".
  *
  * Emits an E_USER_WARNING and prints nothing when the page data cannot be
  * decoded.
  *
  * @return void
  */
 public function go()
 {
     // Example address: http://www.fakku.net/viewonline.php?id=2589
     if (!preg_match('/\\/read$/', $this->url)) {
         $this->url .= '/read';
     }
     // Uses curl.
     $craw = new Crawler($this->url, array('use_curl' => true, 'agent' => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13'));
     $craw->go_to('var data = {');
     $json = Crawler::extract($craw->curline, ' = ', ';');
     $obj = json_decode($json);
     // 2012-05-06: the site changed; the image path now comes from imgpath().
     $craw->go_to('function imgpath(');
     $craw->go_to('return \'');
     $imgpath = $craw->getbetween("return '", "';");
     $craw->close();

     // json_decode() returns null on malformed input; do not dereference it blindly.
     if (!is_object($obj) || !isset($obj->thumbs) || !(is_array($obj->thumbs) || is_object($obj->thumbs))) {
         trigger_error('Unable to decode page data for ' . $this->url, E_USER_WARNING);
         return;
     }

     $text = basename(dirname($this->url));
     foreach ($obj->thumbs as $key => $val) {
         // Thumbnail keys are zero-based, file names are one-based.
         $filename = Crawler::pad($key + 1, 3);
         $img = str_replace("' + x + '", $filename, $imgpath);
         echo "<a href='{$img}'>{$text}</a><br/>\n";
         flush();
     }
 }
 /**
  * Extract the image of one reader page and build its target file name.
  *
  * The image URL is the src attribute read after the 'width="800"' marker.
  * The file name is "<prefix>-<chapter>-<n.ext>", where <n.ext> is the
  * trailing "digits.extension" part of the image's base name. When the base
  * name has no such part, the whole base name is used instead.
  *
  * @param string     $fil     Source handed to the Crawler constructor.
  * @param string     $url     Not used by this method.
  * @param string     $prefix  First part of the generated file name.
  * @param int|string $chapter Chapter number; padded through Crawler::pad(..., 3).
  * @return array Single-element map of generated file name => image URL.
  */
 public function mangareader_1_page($fil, $url, $prefix, $chapter)
 {
     $chapter = Crawler::pad($chapter, 3);
     $c = new Crawler($fil);
     $c->go_to('width="800"');
     $img = $c->getbetween('src="', '"');
     $c->close();

     // Guard the capture: reading $m[1] after a failed match raises a warning
     // and would leave the name without its file part.
     $base = basename($img);
     $iname = preg_match('/(\\d+\\.\\w+)$/', $base, $m) ? $m[1] : $base;

     $name = $prefix . '-' . $chapter . '-' . $iname;
     return array($name => $img);
 }
 /**
  * Print a list item linking to the image of one reader page.
  *
  * The image URL is the src attribute read after the 'width="800"' marker.
  * The link text is "<prefix>-<chapter>-<n.ext>", where the prefix comes from
  * $this->prefix and <n.ext> is the trailing "digits.extension" part of the
  * image's base name. When the base name has no such part, the whole base
  * name is used instead.
  *
  * @param string     $fil     Source handed to the Crawler constructor.
  * @param string     $url     Not used by this method.
  * @param int|string $chapter Chapter number; padded through Crawler::pad(..., 3).
  * @return void
  */
 public function mangareader_1_page($fil, $url, $chapter)
 {
     $prefix = $this->prefix;
     $chapter = Crawler::pad($chapter, 3);
     $c = new Crawler($fil);
     $c->go_to('width="800"');
     $img = $c->getbetween('src="', '"');
     $c->close();

     // Guard the capture: reading $m[1] after a failed match raises a warning
     // and would leave the link text without its file part.
     $base = basename($img);
     $iname = preg_match('/(\\d+\\.\\w+)$/', $base, $m) ? $m[1] : $base;

     echo '<li><a href="' . $img . '">' . $prefix . '-' . $chapter . '-' . $iname . '</a>' . "</li>\n";
 }
 /**
  * Download every image of a paginated gallery into $destination.
  *
  * Each page is scanned from '<table style=' until a line containing
  * '</form>'. Lines containing 'border=0' yield a thumbnail URL, which is
  * turned into the full-size URL ('/thumb/' -> '/full/', "/x<digit>." ->
  * "/") and saved as "<destination>/<counter><ext>". The counter runs across
  * pages and is formatted by Crawler::n(..., 4). When a ':: next ::' link is
  * met first, its href (appended to the initial $base) becomes the next page.
  *
  * @param string $base        Address of the first gallery page.
  * @param string $destination Directory the images are saved to.
  * @return void
  */
 private function download_all($base, $destination)
 {
     $root = $base;
     $pageUrl = $base;
     $counter = 1;
     do {
         $crawler = new Crawler($pageUrl);
         echo $pageUrl . "\n";

         // Stop at whichever comes first: the image table or the "next" link.
         $crawler->go_to(array('<table style=', ':: next ::'));
         $hasNext = Crawler::is_there($crawler->curline, ':: next ::');
         if ($hasNext) {
             $relative = Crawler::extract($crawler->curline, 'href="', '"');
             $pageUrl = $root . html_entity_decode($relative);
             // The image table still lies ahead of the "next" link.
             $crawler->go_to('<table style=');
         }

         while ($row = $crawler->readline()) {
             if (Crawler::is_there($row, 'border=0')) {
                 $thumb = Crawler::extract($row, 'src="', '"');
                 $full = preg_replace('/\\/x\\d\\./', '/', str_replace('/thumb/', '/full/', $thumb));
                 $ext = Crawler::cutfromlast(basename($full), '.');
                 $seq = Crawler::n($counter++, 4);
                 $this->save_to($full, "{$destination}/{$seq}{$ext}");
                 continue;
             }
             if (Crawler::is_there($row, '</form>')) {
                 break;
             }
         }
         $crawler->close();
     } while ($hasNext);
 }
/**
 * Crawl every page of one chapter through Crawler::multiProcess.
 *
 * The page list is read from the value="" attributes on the line following
 * the 'id="myselectbox3"' marker; each value is appended to $url to form the
 * page address. The pages are then handed to 'crawl_1_page' together with
 * the padded chapter number.
 *
 * Example address: http://ani-haven.net/hr-alpha/Psyren/145/
 *
 * @todo Marked as unfinished by the original author.
 *
 * @param string     $url     Address of the chapter; page values are appended to it.
 * @param int|string $chapter Chapter number; padded through Crawler::pad(..., 3).
 * @return void
 */
function crawl_1_chapter($url, $chapter)
{
    global $sitename, $prefix;

    $paddedChapter = Crawler::pad($chapter, 3);

    $crawler = new Crawler($url);
    $crawler->go_to('id="myselectbox3"');
    $crawler->readline();
    $pages = Crawler::extract_to_array($crawler->curline, 'value="', '"');
    $crawler->close();

    // Turn each page value into a full address by prepending the chapter URL.
    foreach ($pages as $index => $relative) {
        $pages[$index] = $url . $relative;
    }

    Crawler::multiProcess(4, $pages, 'crawl_1_page', array($paddedChapter));
}
Exemple #11
0
/**
 * Print one download link per image listed in a chapter page's JavaScript
 * imageArray.
 *
 * Scanning starts at 'imageArray = new Array' and stops at the first line
 * containing 'function loadImage'. The first single-quoted value on each
 * 'imageArray[' line is the image address; addresses that are not absolute
 * (http:// or https://) are prefixed with the global $sitename. Links are
 * labelled "<prefix>-<chapter>-<file name>" using the global $prefix.
 *
 * @param string     $url     Address of the chapter page.
 * @param int|string $chapter Chapter number; padded through Crawler::pad(..., 3).
 * @return void
 */
function foolreader_1_chapter($url, $chapter)
{
    global $sitename;
    global $prefix;
    $chapter = Crawler::pad($chapter, 3);
    $c = new Crawler($url);
    $c->go_to('imageArray = new Array');
    while ($line = $c->readline()) {
        if (Crawler::is_there($line, 'imageArray[')) {
            $img = Crawler::extract($line, "'", "'");
            // Only site-relative paths need the host; an https:// address is
            // already absolute and must not be prefixed.
            if (!preg_match('#^https?://#i', $img)) {
                $img = $sitename . $img;
            }
            $fname = basename($img);
            echo "<a href='{$img}'>{$prefix}-{$chapter}-{$fname}</a><br/>\n";
        } elseif (Crawler::is_there($line, 'function loadImage')) {
            break;
        }
    }
    $c->close();
    // TODO (from the original author): alternative approach that collects the
    // value='' attribute of every <option> up to </select> (prefixed with
    // $sitename) and processes the pages with
    // Crawler::multiProcess(4, $pages, 'foolreader_1_page', array($chapter)).
}
Exemple #12
0
 // Fragment: the enclosing function starts before this excerpt and the final
 // else-branch is cut off after it.
 // Collect the href of every line containing "ehga", from the 'class="ehggt"'
 // marker up to the closing </table>.
 $c->go_to('class="ehggt"');
 $pages = array();
 while ($line = $c->readline()) {
     if (Crawler::is_there($line, '"ehga"')) {
         $pages[] = Crawler::extract($line, 'href="', '"');
     } else {
         if (Crawler::is_there($line, '</table>')) {
             break;
         }
     }
 }
 $c->close();
 // Visit every collected page (fetched with curl) and pick its image.
 foreach ($pages as $url) {
     echo "URL:{$url}<br/>\n";
     $c = new Crawler($url, array('use_curl' => true));
     $c->go_to('</span>');
     // Get the image sources found on the current line.
     $raws = Crawler::extract_to_array($c->curline, 'src="', '"');
     // Debug output: dump every extracted src value.
     echo '<pre>';
     print_r($raws);
     echo '</pre>';
     // The image is usually at $raws[4] or $raws[5]; drop a leading
     // "next" button so the indexes line up.
     if (Crawler::is_there($raws[0], '/n/next.png')) {
         array_shift($raws);
     }
     // Of the two candidates, the real image is the one with the longer file name.
     $base1 = basename($raws[4]);
     $base2 = basename($raws[5]);
     if (strlen($base1) > strlen($base2)) {
         $img = $raws[4];
     } else {
require_once 'crawler.php';
// Root address that every index page below is relative to.
$base = 'http://disneycomics.free.fr/';

// Index page per artist / strip collection, keyed by display name.
$tree = array(
    'Carl Barks'       => 'index_barks_date.php',
    'Don Rosa'         => 'index_rosa_date.php',
    'Marco Rota'       => 'index_rota_date.php',
    'Romano Scarpa'    => 'index_scarpa_date.php',
    'Tony Strobl'      => 'index_strobl_date.php',
    'Al Taliaferro'    => 'index_taliaferro.php',
    'Vicar'            => 'index_vicar_date.php',
    'William Van Horn' => 'index_vanhorn_date.php',
    'Paul Murry'       => 'index_murry_date.php',
    'Daily Strips'     => 'index_dailies.php',
    'Sunday Strips'    => 'index_sunday.php',
);
/**
 * Download one file with wget, retrying indefinitely (-t 0) and also on
 * refused connections.
 *
 * The target directory is not created here; it must already exist.
 *
 * @param string $img_url     Address of the file to download.
 * @param string $output_file Path the download is written to.
 * @return void
 */
function download_it($img_url, $output_file)
{
    // Both values end up on a shell command line and the URL comes from
    // scraped pages, so each is passed as a single escaped argument. Unquoted,
    // a '&' in the URL would split the command.
    // NOTE(review): on Windows escapeshellarg() replaces '%', '!' and '"' with
    // spaces, which alters percent-encoded URLs — confirm the target platform.
    $command = 'wget -t 0 --retry-connrefused -O ' . escapeshellarg($output_file) . ' ' . escapeshellarg($img_url);
    exec($command);
}
// Selects which phase of the script runs; the excerpt is cut off inside the
// first case, so the remaining cases are not visible here.
$mode = 'phase3b';
switch ($mode) {
    case 'beginning':
        // Build $result[<artist>][<title>] = <link> from every index page in $tree.
        $result = array();
        foreach ($tree as $name => $link) {
            $c = new Crawler($base . $link);
            $c->go_to('<tbody>');
            while ($line = $c->readline()) {
                if (Crawler::is_there($line, '<tr>')) {
                    // Skip ahead, at most three lines, to the one that holds the link.
                    $line = $c->readline();
                    // Sequence number
                    if (!Crawler::is_there($line, '<a href')) {
                        $line = $c->readline();
                        // Hero
                        if (!Crawler::is_there($line, '<a href')) {
                            $line = $c->readline();
                            // Title and link
                        }
                    }
                    // Some entries come as an original/reprint pair, ...
                    if (preg_match('/class="red">(.*)<\\/h4>.*class="blue" href="([^"]*)">original<.*href="([^"]*)">reprint</', $line, $matches)) {
                        $result[$name][strip_tags($matches[1]) . '-original'] = html_entity_decode($matches[2]);
Exemple #14
0
/**
 * Print a download link for every page of each given chapter.
 *
 * For each chapter the reader page "<base>&c=<chapter>" is opened, the page
 * list is read from the <option> lines after the "name='page'" marker, and
 * the image directory is taken from the src attribute on the
 * "class='manga-img'" line. Pages are assumed to be ".png" files named after
 * the option text. Links are labelled "<prefix>-<infix>-<page>.png".
 *
 * Reads the globals $base and $prefix.
 *
 * @param array $chapters Chapter identifiers to append as the "c" query value.
 * @param array $infixs   Per-chapter label numbers, matched to $chapters by key
 *                        and padded through Crawler::pad(..., 3).
 * @return void
 */
function omfggscans_chapters($chapters, $infixs)
{
    global $base, $sitename, $prefix;

    foreach ($chapters as $idx => $chapterId) {
        $chapterUrl = $base . "&c={$chapterId}";
        $infix = Crawler::pad($infixs[$idx], 3);
        echo "{$chapterUrl}<br/>\n";

        $crawler = new Crawler($chapterUrl);

        // Page list: option value => option text.
        $crawler->go_to("name='page'");
        $pageLabels = array();
        while ($row = $crawler->readline()) {
            if (Crawler::is_there($row, '<option')) {
                $value = Crawler::extract($row, "value='", "'");
                $pageLabels[$value] = Crawler::extract($row, "'>", "</");
                continue;
            }
            if (Crawler::is_there($row, '</select>')) {
                break;
            }
        }

        // One sample image gives the directory shared by all pages.
        $crawler->go_to("class='manga-img'");
        $sample = Crawler::extract($crawler->curline, 'src="', '"');
        $imageDir = dirname($sample) . '/';
        $suffix = '.png';
        $crawler->close();

        foreach ($pageLabels as $label) {
            echo "<a href='{$imageDir}{$label}{$suffix}'>{$prefix}-{$infix}-{$label}{$suffix}</a><br />\n";
        }
    }
}
Exemple #15
0
/**
 * Walk a paginated image list and print a link for every "Image Only" entry.
 *
 * All links are labelled with the URL-decoded name of the directory part of
 * the start URL. On each page the "Next" link is read from the
 * "id='Navigationleft'" line; when it is missing the current page is the
 * last one processed. Image lines are scanned from "id='image-list'" until
 * a line containing '<footer>'.
 *
 * @param string $url Address of the first list page.
 * @return void
 */
function rule34($url)
{
    $text = rawurldecode(basename(dirname($url)));
    $site = 'http://rule34.paheal.net';
    $continue = true;
    while ($continue) {
        echo "{$url}<br/>";
        $c = new Crawler($url);
        $c->go_to("id='Navigationleft'");
        // Remember the next page now; the image list comes later in the page.
        if (preg_match('/<a href="([^\'"]+)">Next/', $c->curline, $m)) {
            $url = $site . $m[1];
        } else {
            $continue = false;
        }
        $c->go_to("id='image-list'");
        while ($line = $c->readline()) {
            if (Crawler::is_there($line, '>Image Only<')) {
                $href = Crawler::extract($line, '<br><a href="', '"');
                echo "<a href='{$href}'>{$text}</a><br/>\n";
            } elseif (Crawler::is_there($line, '<footer>')) {
                break;
            }
        }
        // Release the crawler before opening the next page; a new one is
        // created on every iteration.
        $c->close();
    }
}
Exemple #16
0
<?php

require 'crawler.php';
//http://www.viraindo.com/
// Front page of the site; also the prefix for every relative link and image.
$site = 'http://www.viraindo.com/';

// Walk the link list that follows the 'WIDTH=273' marker. For each linked
// page, open it and print a link to its first <img>, labelled with the
// anchor text. Stop at the line that closes the list.
$index = new Crawler($site);
$index->go_to('WIDTH=273');
while ($row = $index->readline()) {
    if (!Crawler::is_there($row, 'href="')) {
        if (Crawler::is_there($row, '<p></TD></TR>')) {
            break;
        }
        continue;
    }

    $page = Crawler::extract($row, 'href="', '"');
    $caption = Crawler::extract($row, '">', '</a');

    $detail = new Crawler($site . $page);
    $detail->go_to('<img src="');
    $img = $detail->getbetween('<img src="', '"');
    echo "<a href='{$site}{$img}'>{$caption}</a><br/>\n";
    flush();
    $detail->close();
}
$index->close();
    }
    exit;
}
// Thumbnail address as found in the gallery:
// http://www.comicgirls.net/thumb.php?photo=categorie01/Madrox_Rog0.JPG&max_size=110
// Same image requested at full size:
// http://www.comicgirls.net/thumb.php?photo=categorie01/Madrox_Rog0.JPG&max_size=6000&thumb=NO
// Cookie
$base = 'http://www.comicgirls.net';
// Collected full-size image addresses, grouped by the key of $targets.
// ($targets is defined outside this excerpt, and the loops below are cut off.)
$imgs = array();
foreach ($targets as $k => $url) {
    $imgs[$k] = array();
    do {
        echo "{$url}<br />\n";
        $c = new Crawler($url);
        // Is there a next page?
        $next = false;
        $c->go_to('>Navigation');
        // Scan the navigation area, which ends at the line containing '<i>('.
        while (!Crawler::is_there($line = $c->readline(), '<i>(')) {
            if (Crawler::is_there($line, '>Next<')) {
                $next = true;
                $url = $base . Crawler::extract($line, "href='", "'");
                break;
            }
        }
        // Grab the gallery
        $c->go_to("'catThumb'");
        while (!Crawler::is_there($line = $c->readline(), '</table>')) {
            if (Crawler::is_there($line, 'src=')) {
                $raw = $base . html_entity_decode(Crawler::extract($line, "src='", "'"));
                // Swap the thumbnail size parameters for the full-size request.
                $new = preg_replace('/&max_size=.*$/', '&max_size=6000&thumb=NO', $raw);
                $imgs[$k][] = $new;
            }
Exemple #18
0
 // Fragment: these statements appear to be the body of a paging loop whose
 // header is outside this excerpt ($start_url and $masih are defined there).
 flush();
 $craw = new Crawler($start_url);
 //first get the pictures
 $craw->go2linewhere('class="photogallery-celeb"');
 $craw->readline();
 $line = $craw->curline;
 //echo $line;
 //echo '<br />HOI<br />';
 // Split the line on every image tag; element 0 is the text before the first
 // tag, so the loop starts at 1.
 $ledakan = explode('<img src="', $line);
 for ($i = 1; $i < count($ledakan); $i++) {
     // Dropping the '/t/' path segment turns the thumbnail address into the image address.
     $imgurl = str_replace('/t/', '/', Crawler::cutuntil($ledakan[$i], '"'));
     $file = basename($imgurl);
     echo "<a href='{$imgurl}'>{$file}</a><br />\n";
 }
 //then check the next link
 $craw->go_to('class="arrow">&#187;</a>');
 $url = $craw->getbetweenlast('<a href="', '"');
 if ($url == '#') {
     // A '#' target means there is no further page.
     $masih = false;
 } else {
     // Appending 'a' keeps dirname() from dropping the last directory when
     // $start_url ends with a slash.
     $start_url = dirname($start_url . 'a') . '/' . $url;
 }
 // Earlier variant of the next-page check, kept for reference:
 /*
 $craw->go2lineor(array('<span class="global_pref_next_no_link">&#187;', 'class="global_pref_next">&#187;'));
 if (strpos($craw->curline, '<span class="global_pref_next_no_link">&#187;') !== false) {
 	$masih = false;
 } else {
 	$start_url = dirname($start_url.'a').'/'.$craw->getbetweenlast('<a href="', '"');
 }
 */
 $craw->close();
Exemple #19
0
<?php

require 'crawler.php';
$url = 'http://mangastream.com/read/billy_bat/18964888/1';
// The page url
$dir = 'd:/temp/';
// Where to store
$filename = 'test';
// Rename to this
// 1. Open one manga reader page
$c = new Crawler($url);
// Image pieces found on the page, keyed by their element id.
$imgs = array();
// 2. Go to the line that contains the CSS definition of one of the pieces
$c->go_to('/#.+position.+width.+height.+top.+left/', '', true);
// 3. Iterate until the closing line (containing '-->') is found
// (the excerpt ends before this loop is closed)
while ($line = $c->readline()) {
    if (preg_match('/#(\\w+) .+width:(\\d+).*height:(\\d+).*top:(\\d+).*left:(\\d+)/', $line, $match)) {
        // 3a. Get the id, z-index, height, width, left and top of each piece
        list($all, $id, $width, $height, $top, $left) = $match;
        // z-index is optional in the rule; default to 0 when absent.
        if (preg_match('/z-index:(\\d+)/', $line, $match)) {
            $zindex = $match[1];
        } else {
            $zindex = 0;
        }
        // 3b. Store it in the array ($imgs)
        $imgs[$id] = array('id' => $id, 'zindex' => $zindex, 'width' => $width, 'height' => $height, 'top' => $top, 'left' => $left);
    } else {
        if (Crawler::is_there($line, '-->')) {
            break;
        }
    }