/**
 * Crawl one PHD Comics page and print a download link for its comic image.
 *
 * The link target is the image URL found on the page; the link text is
 * "<year>_<month>_<day>_<title><ext>", built from the page title and the
 * date printed on the page.
 *
 * When the date or the image tag cannot be found, a short notice is printed
 * instead of a half-built link.
 *
 * @param string $url Address of the comic page to crawl.
 * @return void
 */
public function crawl_page($url)
{
    $c = new Crawler($url);

    // Title: the text between the site prefix and </title>, with every
    // non-word character replaced by an underscore.
    $c->go_to('<title>');
    $title = Crawler::extract($c->curline, 'PHD Comics: ', '</title>');
    $title = preg_replace('/\\W/', '_', $title);

    // Date: looked up on the line reached by readline(2) after the
    // 'date_left.gif' marker, in month/day/year order.
    $c->go_to('date_left.gif');
    $c->readline(2);
    $dateFound = preg_match('/([0-9]+)\\/([0-9]+)\\/([0-9]+)/mi', $c->curline, $dateParts);

    // Image URL: the character class stops at a quote or '>' so that a
    // closing quote or tag end is never captured as part of the URL.
    $c->go2linewhere('<td bgcolor=#FFFFFF');
    $imgFound = preg_match('/<img src=["\']?([^ "\'>]+)/i', $c->curline, $imgParts);

    $c->close();
    unset($c);

    if (!$dateFound || !$imgFound) {
        echo 'Could not parse comic page: ' . htmlspecialchars($url) . '<br/>';
        flush();
        return;
    }

    list(, $month, $date, $year) = $dateParts;
    $month = str_pad($month, 2, '0', STR_PAD_LEFT);
    $date = str_pad($date, 2, '0', STR_PAD_LEFT);
    $fileprefix = "{$year}_{$month}_{$date}_{$title}";

    $img = $imgParts[1];
    $filename = basename($img);

    // Only treat the tail as an extension when the file name has a dot.
    $dot = strrpos($filename, '.');
    $ext = ($dot === false) ? '' : substr($filename, $dot);

    echo "<a href='{$img}'>" . $fileprefix . $ext . "</a><br/>";
    flush();
}
/**
 * Print one download link per page of a chapter.
 *
 * The page names are read from the <option> values of the "pagejump"
 * select box. The image directory is taken from the first image that
 * follows the "nextpage" element, and every page is assumed to be a
 * ".jpg" inside that directory.
 *
 * @param string     $url     Address of the chapter's reader page.
 * @param int|string $chapter Chapter number; padded via Crawler::pad(…, 3).
 * @return void
 */
function crawl_1_chapter($url, $chapter)
{
    global $sitename;
    global $prefix;

    $crawler = new Crawler($url);

    // Collect the option values until the select box is closed.
    $crawler->go_to('name="pagejump"');
    $pageNames = array();
    while ($row = $crawler->readline()) {
        if (Crawler::is_there($row, '<option')) {
            $pageNames[] = Crawler::extract($row, 'value="', '"');
            continue;
        }
        if (Crawler::is_there($row, '</select>')) {
            break;
        }
    }

    // The line after the "nextpage" marker holds a sample image URL.
    $crawler->go_to('id="nextpage"');
    $crawler->readline();
    $sampleImage = $crawler->getbetween('src="', '"');
    $crawler->close();

    $imageDir = dirname($sampleImage);
    $suffix = '.jpg';
    $chapter = Crawler::pad($chapter, 3);

    foreach ($pageNames as $pageName) {
        $href = $imageDir . '/' . $pageName . $suffix;
        $label = $prefix . '-' . $chapter . '-' . $pageName . $suffix;
        echo "<a href='" . $href . "'>" . $label . "</a><br/>\n";
        flush();
    }
}
/**
 * Print a link for every href found inside the page's "entry" div.
 *
 * Both single-quoted and double-quoted href attributes are recognised;
 * the single-quoted form is checked first. Scanning stops at the first
 * line containing "</div>" that has no href on it.
 *
 * @param string $url Address of the page to crawl.
 * @return void
 */
function crawl_1_page($url)
{
    echo "URL2 {$url} <br/>\n";
    flush();

    // Link text: derived from the URL with its last character removed
    // (presumably a trailing slash, leaving the last path segment —
    // depends on Crawler::cutfromlast1).
    $trimmedUrl = substr($url, 0, strlen($url) - 1);
    $dirname = html_entity_decode(Crawler::cutfromlast1($trimmedUrl, '/'));

    $crawler = new Crawler($url);
    $crawler->go_to('<div class="entry">');

    while ($row = $crawler->readline()) {
        // Try each quoting style in turn; the first one present wins.
        foreach (array("'", '"') as $quote) {
            $marker = 'href=' . $quote;
            if (Crawler::is_there($row, $marker)) {
                $img = Crawler::extract($row, $marker, $quote);
                echo "<a href='{$img}'>{$dirname}</a><br/>\n";
                flush();
                continue 2;
            }
        }

        if (Crawler::is_there($row, '</div>')) {
            break;
        }
    }

    $crawler->close();
}
/**
 * Print a download link for every image listed in an album page.
 *
 * The image URLs are taken from the src attributes inside the second
 * <noscript> section of the page. Each URL has its "/s128/" segment
 * replaced by "/d/" before being printed.
 *
 * @param string       $url   Address of the album page.
 * @param string|false $alias Link text to use for every image; when falsy
 *                            the URL-decoded base name of each image is used.
 * @return void
 */
function crawl_album($url, $alias = false)
{
    $crawler = new Crawler($url);

    // Skip to the second <noscript> marker, then past one more line.
    $crawler->go_to('<noscript>');
    $crawler->go_to('<noscript>');
    $crawler->readline();

    // Join the trimmed lines up to the closing tag into one string.
    $markup = '';
    while ($row = $crawler->readline()) {
        if (Crawler::is_there($row, '</noscript>')) {
            break;
        }
        $markup .= trim($row);
    }

    $thumbs = Crawler::extract_to_array($markup, 'src="', '"');
    $crawler->close();

    // Note: an earlier, disabled variant used each image's description
    // as the file name instead of the alias / base name.
    foreach ($thumbs as $thumb) {
        $img = str_replace('/s128/', '/d/', $thumb);
        $label = $alias ? $alias : urldecode(basename($img));
        echo "<a href='{$img}'>{$label}</a><br/>\n";
        flush();
    }
}
/**
 * Walk a paginated gallery starting at $this->url and print one download
 * link per thumbnail found.
 *
 * Each visited page URL is echoed first. Thumbnail URLs have "thumb_100_"
 * replaced by "normal__", or removed entirely when the "big" query
 * parameter is set to a truthy value. Paging continues for as long as the
 * pager line contains "Pager_next".
 *
 * The link text is the path segment that precedes "/page/1" in the start
 * URL; it is empty when the start URL does not have that shape.
 *
 * @return void
 */
public function go()
{
    $host = 'http://lu.scio.us';
    $url = $this->url;

    // Guard the match so a URL of another shape does not read a missing index.
    $text = preg_match('/\\/([^\\/]+)\\/page\\/1/', $url, $m) ? $m[1] : '';

    // Loop-invariant: which prefix replaces the thumbnail prefix.
    $sizePrefix = empty($_GET['big']) ? 'normal__' : '';

    $finish = false;
    while (!$finish) {
        echo $url . "<br/>\n";
        flush();

        $c = new Crawler($url);
        $c->go_to('id="pid_');
        while ($line = $c->readline()) {
            if (Crawler::is_there($line, 'src="')) {
                $img = Crawler::extract($line, 'src="', '"');
                $img = str_replace('thumb_100_', $sizePrefix, $img);
                echo "<a href='{$img}'>{$text}</a><br/>\n";
                flush();
            } elseif (Crawler::is_there($line, '</ul>')) {
                break;
            }
        }

        // The line after the pager marker tells whether a next page exists.
        $c->go_to('class="pager"');
        $c->readline();
        if (Crawler::is_there($c->curline, 'Pager_next')) {
            $url = $host . Crawler::extract($c->curline, '<a rel="next" href="', '"');
        } else {
            $finish = true;
        }
        $c->close();
    }
}
/**
 * Print one download link per page of the gallery at $this->url.
 *
 * The reader page is fetched through curl with a browser user agent. The
 * page list comes from the JSON object assigned to "var data" in the page
 * script, and the image URL template from the "return '...'" statement of
 * the page's imgpath() function. The "' + x + '" placeholder in that
 * template is replaced by the padded page number.
 *
 * Side effect: "/read" is appended to $this->url when it is missing.
 *
 * @return void
 */
public function go()
{
    // Example input: http://www.fakku.net/viewonline.php?id=2589
    if (!preg_match('/\\/read$/', $this->url)) {
        $this->url .= '/read';
    }

    $craw = new Crawler($this->url, array('use_curl' => true, 'agent' => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13'));

    $craw->go_to('var data = {');
    $json = Crawler::extract($craw->curline, ' = ', ';');
    $obj = json_decode($json);

    // Since the 2012-05-06 site change the image path is built by a
    // script function rather than taken from a "mirror" variable.
    $craw->go_to('function imgpath(');
    $craw->go_to('return \'');
    $imgpath = $craw->getbetween("return '", "';");
    $craw->close();

    // json_decode() yields null on malformed input; stop before
    // dereferencing it.
    $hasThumbs = is_object($obj)
        && isset($obj->thumbs)
        && (is_array($obj->thumbs) || is_object($obj->thumbs));
    if (!$hasThumbs) {
        echo 'Could not read page data from ' . htmlspecialchars($this->url) . "<br/>\n";
        flush();
        return;
    }

    $dir = basename(dirname($this->url));
    foreach ($obj->thumbs as $key => $val) {
        // Pages are numbered from 1, thumbs are indexed from 0.
        $filename = Crawler::pad($key + 1, 3);
        $img = str_replace("' + x + '", $filename, $imgpath);
        echo "<a href='{$img}'>{$dir}</a><br/>\n";
        flush();
    }
}
/**
 * Extract the image of one reader page and build its target file name.
 *
 * The image is the src of the element found via the 'width="800"' marker.
 * The file name is "<prefix>-<chapter>-<image name>", where the image name
 * is the trailing "<digits>.<extension>" part of the image's base name.
 * When the base name does not end that way, the whole base name is used,
 * so the result never has an empty name part.
 *
 * @param string     $fil     Source handed to Crawler (page file or address).
 * @param string     $url     Unused here; kept for caller compatibility.
 * @param string     $prefix  Leading part of the generated file name.
 * @param int|string $chapter Chapter number; padded via Crawler::pad(…, 3).
 * @return array Single-entry map of file name => image URL.
 */
public function mangareader_1_page($fil, $url, $prefix, $chapter)
{
    $chapter = Crawler::pad($chapter, 3);

    $c = new Crawler($fil);
    $c->go_to('width="800"');
    $img = $c->getbetween('src="', '"');
    $c->close();

    $base = basename($img);
    $iname = preg_match('/(\\d+\\.\\w+)$/', $base, $m) ? $m[1] : $base;

    $name = $prefix . '-' . $chapter . '-' . $iname;
    return array($name => $img);
}
/**
 * Print a list item linking to the image of one reader page.
 *
 * The image is the src of the element found via the 'width="800"' marker.
 * The link text is "<prefix>-<chapter>-<image name>", where the prefix
 * comes from $this->prefix and the image name is the trailing
 * "<digits>.<extension>" part of the image's base name. When the base name
 * does not end that way, the whole base name is used.
 *
 * @param string     $fil     Source handed to Crawler (page file or address).
 * @param string     $url     Unused here; kept for caller compatibility.
 * @param int|string $chapter Chapter number; padded via Crawler::pad(…, 3).
 * @return void
 */
public function mangareader_1_page($fil, $url, $chapter)
{
    $prefix = $this->prefix;
    $chapter = Crawler::pad($chapter, 3);

    $c = new Crawler($fil);
    $c->go_to('width="800"');
    $img = $c->getbetween('src="', '"');

    $base = basename($img);
    $iname = preg_match('/(\\d+\\.\\w+)$/', $base, $m) ? $m[1] : $base;

    echo '<li><a href="' . $img . '">' . $prefix . '-' . $chapter . '-' . $iname . '</a>' . "</li>\n";
    $c->close();
}
/**
 * Save every image of a paginated gallery into a directory.
 *
 * Each gallery page is echoed, then scanned for thumbnail lines
 * (those containing "border=0"). A thumbnail URL is turned into the
 * full-size URL by replacing "/thumb/" with "/full/" and collapsing a
 * "/x<digit>." path part to "/". Files are numbered consecutively across
 * all pages and handed to $this->save_to().
 *
 * Paging follows the ":: next ::" link for as long as one is found; its
 * href is appended to the very first page address.
 *
 * @param string $base        Address of the first gallery page.
 * @param string $destination Directory the images are saved into.
 * @return void
 */
private function download_all($base, $destination)
{
    $startAddress = $base;
    $counter = 1;

    do {
        $page = new Crawler($base);
        echo $base . "\n";

        // Stops at whichever marker comes first: the image table or the
        // "next" link.
        $page->go_to(array('<table style=', ':: next ::'));
        $finish = !Crawler::is_there($page->curline, ':: next ::');
        if (!$finish) {
            $nextHref = Crawler::extract($page->curline, 'href="', '"');
            $base = $startAddress . html_entity_decode($nextHref);
            $page->go_to('<table style=');
        }

        while ($row = $page->readline()) {
            if (Crawler::is_there($row, 'border=0')) {
                $img = Crawler::extract($row, 'src="', '"');
                $img = str_replace('/thumb/', '/full/', $img);
                $img = preg_replace('/\\/x\\d\\./', '/', $img);

                $ext = Crawler::cutfromlast(basename($img), '.');
                $text = Crawler::n($counter++, 4);
                $this->save_to($img, "{$destination}/{$text}{$ext}");
            } elseif (Crawler::is_there($row, '</form>')) {
                break;
            }
        }

        $page->close();
    } while (!$finish);
}
/**
 * Crawl every page of one chapter through Crawler::multiProcess().
 *
 * The page names are the value attributes found on the line after the
 * "myselectbox3" marker. Each is appended to the chapter URL and the
 * resulting list is handed to 'crawl_1_page' together with the padded
 * chapter number.
 *
 * Example chapter URL: http://ani-haven.net/hr-alpha/Psyren/145/
 *
 * @param string     $url     Address of the chapter; page names are appended to it.
 * @param int|string $chapter Chapter number; padded via Crawler::pad(…, 3).
 * @return void
 */
function crawl_1_chapter($url, $chapter)
{
    global $sitename;
    global $prefix;

    // @todo
    $chapter = Crawler::pad($chapter, 3);

    $crawler = new Crawler($url);
    $crawler->go_to('id="myselectbox3"');
    $crawler->readline();
    $pageNames = Crawler::extract_to_array($crawler->curline, 'value="', '"');
    $crawler->close();

    // Turn each page name into a full page address, keeping the keys.
    $pages = array();
    foreach ($pageNames as $index => $pageName) {
        $pages[$index] = $url . $pageName;
    }

    Crawler::multiProcess(4, $pages, 'crawl_1_page', array($chapter));
}
/**
 * Print one download link per image of a chapter.
 *
 * The image URLs are read from the "imageArray[...]" assignments in the
 * page script, up to the line containing "function loadImage". A URL that
 * is not absolute gets the global $sitename prepended. Both http:// and
 * https:// count as absolute, so a secure image URL is left untouched.
 *
 * The link text is "<prefix>-<chapter>-<image base name>".
 *
 * @param string     $url     Address of the chapter's reader page.
 * @param int|string $chapter Chapter number; padded via Crawler::pad(…, 3).
 * @return void
 */
function foolreader_1_chapter($url, $chapter)
{
    global $sitename;
    global $prefix;

    $chapter = Crawler::pad($chapter, 3);

    $c = new Crawler($url);
    $c->go_to('imageArray = new Array');
    while ($line = $c->readline()) {
        if (Crawler::is_there($line, 'imageArray[')) {
            $img = Crawler::extract($line, "'", "'");
            if (!preg_match('/^https?:\\/\\//i', $img)) {
                $img = $sitename . $img;
            }
            $fname = basename($img);
            echo "<a href='{$img}'>{$prefix}-{$chapter}-{$fname}</a><br/>\n";
        } elseif (Crawler::is_there($line, 'function loadImage')) {
            break;
        }
    }
    $c->close();

    // @TODO: alternative approach, not implemented — collect the page
    // addresses from the <option> values and crawl them in parallel with
    // Crawler::multiProcess(4, $pages, 'foolreader_1_page', array($chapter)).
}
$c->go_to('class="ehggt"'); $pages = array(); while ($line = $c->readline()) { if (Crawler::is_there($line, '"ehga"')) { $pages[] = Crawler::extract($line, 'href="', '"'); } else { if (Crawler::is_there($line, '</table>')) { break; } } } $c->close(); foreach ($pages as $url) { echo "URL:{$url}<br/>\n"; $c = new Crawler($url, array('use_curl' => true)); $c->go_to('</span>'); // ambil image source $raws = Crawler::extract_to_array($c->curline, 'src="', '"'); echo '<pre>'; print_r($raws); echo '</pre>'; // gambar image biasanya berada di $raws[4] atau $raws[5] if (Crawler::is_there($raws[0], '/n/next.png')) { array_shift($raws); } // gambar image namanya lebih panjang $base1 = basename($raws[4]); $base2 = basename($raws[5]); if (strlen($base1) > strlen($base2)) { $img = $raws[4]; } else {
require_once 'crawler.php'; $base = 'http://disneycomics.free.fr/'; $tree = array('Carl Barks' => 'index_barks_date.php', 'Don Rosa' => 'index_rosa_date.php', 'Marco Rota' => 'index_rota_date.php', 'Romano Scarpa' => 'index_scarpa_date.php', 'Tony Strobl' => 'index_strobl_date.php', 'Al Taliaferro' => 'index_taliaferro.php', 'Vicar' => 'index_vicar_date.php', 'William Van Horn' => 'index_vanhorn_date.php', 'Paul Murry' => 'index_murry_date.php', 'Daily Strips' => 'index_dailies.php', 'Sunday Strips' => 'index_sunday.php'); function download_it($img_url, $output_file) { $dir = dirname($output_file) . '\\'; //exec("mkdir \"$dir\""); exec("wget -t 0 --retry-connrefused -O \"{$output_file}\" {$img_url}"); } $mode = 'phase3b'; switch ($mode) { case 'beginning': $result = array(); foreach ($tree as $name => $link) { $c = new Crawler($base . $link); $c->go_to('<tbody>'); while ($line = $c->readline()) { if (Crawler::is_there($line, '<tr>')) { $line = $c->readline(); // nomor urut if (!Crawler::is_there($line, '<a href')) { $line = $c->readline(); // Hero if (!Crawler::is_there($line, '<a href')) { $line = $c->readline(); // Title dan link } } // ada yg berupa original/reprint, ... if (preg_match('/class="red">(.*)<\\/h4>.*class="blue" href="([^"]*)">original<.*href="([^"]*)">reprint</', $line, $matches)) { $result[$name][strip_tags($matches[1]) . '-original'] = html_entity_decode($matches[2]);
/**
 * Print download links for every page of each given chapter.
 *
 * For each chapter the reader URL is the global $base plus "&c=<chapter>".
 * The page labels come from the <option> elements of the 'page' select
 * box. The image directory is taken from the src of the 'manga-img'
 * element, and every page is assumed to be "<label>.png" inside it.
 *
 * @param array $chapters Chapter identifiers, as used in the "c" parameter.
 * @param array $infixs   Chapter numbers for the file names, matched to
 *                        $chapters by key; padded via Crawler::pad(…, 3).
 * @return void
 */
function omfggscans_chapters($chapters, $infixs)
{
    global $base;
    global $sitename;
    global $prefix;

    foreach ($chapters as $index => $chapterId) {
        $url = $base . '&c=' . $chapterId;
        $infix = Crawler::pad($infixs[$index], 3);
        echo $url . "<br/>\n";

        $crawler = new Crawler($url);

        // Map each option value to its visible label.
        $crawler->go_to("name='page'");
        $pages = array();
        while ($row = $crawler->readline()) {
            if (Crawler::is_there($row, '<option')) {
                $value = Crawler::extract($row, "value='", "'");
                $pages[$value] = Crawler::extract($row, "'>", "</");
                continue;
            }
            if (Crawler::is_there($row, '</select>')) {
                break;
            }
        }

        // A sample image URL gives the directory shared by all pages.
        $crawler->go_to("class='manga-img'");
        $sample = Crawler::extract($crawler->curline, 'src="', '"');
        $crawler->close();

        $directory = dirname($sample) . '/';
        $suffix = '.png';

        foreach ($pages as $label) {
            $href = $directory . $label . $suffix;
            $text = $prefix . '-' . $infix . '-' . $label . $suffix;
            echo "<a href='" . $href . "'>" . $text . "</a><br />\n";
        }
    }
}
/**
 * Walk a paginated image listing and print one download link per image.
 *
 * Each visited page URL is echoed first. The "Next" link is looked up on
 * the line holding the 'Navigationleft' marker; paging stops when that
 * line has no such link. Images are the hrefs on lines containing
 * ">Image Only<", read from the 'image-list' marker up to "<footer>".
 *
 * The link text is the URL-decoded name of the start URL's parent path
 * segment.
 *
 * @param string $url Address of the first listing page.
 * @return void
 */
function rule34($url)
{
    $text = rawurldecode(basename(dirname($url)));
    $site = 'http://rule34.paheal.net';

    $continue = true;
    while ($continue) {
        echo "{$url}<br/>";
        $c = new Crawler($url);

        // Work out the next page before scanning this one.
        $c->go_to("id='Navigationleft'");
        if (preg_match('/<a href="([^\'"]+)">Next/', $c->curline, $m)) {
            $url = $site . $m[1];
        } else {
            $continue = false;
        }

        $c->go_to("id='image-list'");
        while ($line = $c->readline()) {
            if (Crawler::is_there($line, '>Image Only<')) {
                $href = Crawler::extract($line, '<br><a href="', '"');
                echo "<a href='{$href}'>{$text}</a><br/>\n";
            } elseif (Crawler::is_there($line, '<footer>')) {
                break;
            }
        }

        // Release this page's crawler before opening the next one.
        $c->close();
    }
}
<?php
require 'crawler.php';

// Source site: http://www.viraindo.com/
//
// Walks the link list that follows the 'WIDTH=273' marker on the home page.
// For every linked page, the first '<img src="' found there is printed as a
// link whose text is the home-page link's label. Stops at the line that
// closes the list ('<p></TD></TR>').
$site = 'http://www.viraindo.com/';

$c = new Crawler($site);
$c->go_to('WIDTH=273');

while ($line = $c->readline()) {
    if (!Crawler::is_there($line, 'href="')) {
        if (Crawler::is_there($line, '<p></TD></TR>')) {
            break;
        }
        continue;
    }

    $page = Crawler::extract($line, 'href="', '"');
    $ket = Crawler::extract($line, '">', '</a');

    // Open the linked page just long enough to read its image address.
    $d = new Crawler($site . $page);
    $d->go_to('<img src="');
    $img = $d->getbetween('<img src="', '"');

    echo "<a href='" . $site . $img . "'>" . $ket . "</a><br/>\n";
    flush();
    $d->close();
}

$c->close();
} exit; } // http://www.comicgirls.net/thumb.php?photo=categorie01/Madrox_Rog0.JPG&max_size=110 // http://www.comicgirls.net/thumb.php?photo=categorie01/Madrox_Rog0.JPG&max_size=6000&thumb=NO // Cookie $base = 'http://www.comicgirls.net'; $imgs = array(); foreach ($targets as $k => $url) { $imgs[$k] = array(); do { echo "{$url}<br />\n"; $c = new Crawler($url); // Apakah ada next? $next = false; $c->go_to('>Navigation'); while (!Crawler::is_there($line = $c->readline(), '<i>(')) { if (Crawler::is_there($line, '>Next<')) { $next = true; $url = $base . Crawler::extract($line, "href='", "'"); break; } } // Grab the gallery $c->go_to("'catThumb'"); while (!Crawler::is_there($line = $c->readline(), '</table>')) { if (Crawler::is_there($line, 'src=')) { $raw = $base . html_entity_decode(Crawler::extract($line, "src='", "'")); $new = preg_replace('/&max_size=.*$/', '&max_size=6000&thumb=NO', $raw); $imgs[$k][] = $new; }
flush(); $craw = new Crawler($start_url); //first get the pictures $craw->go2linewhere('class="photogallery-celeb"'); $craw->readline(); $line = $craw->curline; //echo $line; //echo '<br />HOI<br />'; $ledakan = explode('<img src="', $line); for ($i = 1; $i < count($ledakan); $i++) { $imgurl = str_replace('/t/', '/', Crawler::cutuntil($ledakan[$i], '"')); $file = basename($imgurl); echo "<a href='{$imgurl}'>{$file}</a><br />\n"; } //then check the next link $craw->go_to('class="arrow">»</a>'); $url = $craw->getbetweenlast('<a href="', '"'); if ($url == '#') { $masih = false; } else { $start_url = dirname($start_url . 'a') . '/' . $url; } /* $craw->go2lineor(array('<span class="global_pref_next_no_link">»', 'class="global_pref_next">»')); if (strpos($craw->curline, '<span class="global_pref_next_no_link">»') !== false) { $masih = false; } else { $start_url = dirname($start_url.'a').'/'.$craw->getbetweenlast('<a href="', '"'); } */ $craw->close();
<?php require 'crawler.php'; $url = 'http://mangastream.com/read/billy_bat/18964888/1'; // The page url $dir = 'd:/temp/'; // Where to store $filename = 'test'; // Rename to this // 1 Buka satu halaman manga reader $c = new Crawler($url); $imgs = array(); // 2 Pergi ke baris yang berisi definisi CSS salah satu potongan $c->go_to('/#.+position.+width.+height.+top.+left/', '', true); // 3 Iterasi hingga ketemu baris penutup (berisi '-->') while ($line = $c->readline()) { if (preg_match('/#(\\w+) .+width:(\\d+).*height:(\\d+).*top:(\\d+).*left:(\\d+)/', $line, $match)) { // 3a Ambil informasi id, z-index, height, width, left, top tiap potongan list($all, $id, $width, $height, $top, $left) = $match; if (preg_match('/z-index:(\\d+)/', $line, $match)) { $zindex = $match[1]; } else { $zindex = 0; } // 3b Masukkan ke array (var $imgs) $imgs[$id] = array('id' => $id, 'zindex' => $zindex, 'width' => $width, 'height' => $height, 'top' => $top, 'left' => $left); } else { if (Crawler::is_there($line, '-->')) { break; } }