/**
 * Print one HTML download link per page of a single chapter.
 *
 * Page identifiers are taken from the <option> entries that follow the
 * `name="pagejump"` marker. The directory of the image found after the
 * `id="nextpage"` marker is used as the common image folder. For every page
 * a link to "<folder>/<page>.jpg" labelled "<prefix>-<chapter>-<page>.jpg"
 * is echoed and flushed.
 *
 * The link label uses the global $prefix.
 *
 * @param string     $url     Address of the chapter page to crawl.
 * @param int|string $chapter Chapter number; passed through Crawler::pad(..., 3)
 *                            (presumably zero-padding to three digits - confirm).
 * @return void
 */
function crawl_1_chapter($url, $chapter)
{
    global $sitename;
    global $prefix;

    $crawler = new Crawler($url);

    // Collect the value of every <option> until the select box is closed.
    $crawler->go_to('name="pagejump"');
    $pageIds = array();
    while ($row = $crawler->readline()) {
        if (Crawler::is_there($row, '<option')) {
            $pageIds[] = Crawler::extract($row, 'value="', '"');
            continue;
        }
        if (Crawler::is_there($row, '</select>')) {
            break;
        }
    }

    // One sample image is enough: only its directory part is reused.
    $crawler->go_to('id="nextpage"');
    $crawler->readline();
    $sampleImage = $crawler->getbetween('src="', '"');
    $crawler->close();

    $folder = dirname($sampleImage);
    $suffix = '.jpg';
    $chapterLabel = Crawler::pad($chapter, 3);

    foreach ($pageIds as $pageId) {
        echo "<a href='{$folder}/{$pageId}{$suffix}'>{$prefix}-{$chapterLabel}-{$pageId}{$suffix}</a><br/>\n";
        flush();
    }
}
/**
 * Print a link for every href found inside the `<div class="entry">` section.
 *
 * Every emitted link uses the same text: the result of
 * Crawler::cutfromlast1() applied to the URL without its final character,
 * HTML-entity-decoded (presumably the last path segment of a URL that ends
 * in "/" - confirm against Crawler::cutfromlast1).
 *
 * @param string $url Address of the page to crawl.
 * @return void
 */
function crawl_1_page($url)
{
    echo "URL2 {$url} <br/>\n";
    flush();

    $withoutLastChar = substr($url, 0, -1);
    $label = html_entity_decode(Crawler::cutfromlast1($withoutLastChar, '/'));

    $crawler = new Crawler($url);
    $crawler->go_to('<div class="entry">');

    while ($row = $crawler->readline()) {
        // Single-quoted hrefs are checked first, then double-quoted ones.
        $quote = null;
        if (Crawler::is_there($row, "href='")) {
            $quote = "'";
        } elseif (Crawler::is_there($row, 'href="')) {
            $quote = '"';
        }

        if ($quote !== null) {
            $target = Crawler::extract($row, 'href=' . $quote, $quote);
            echo "<a href='{$target}'>{$label}</a><br/>\n";
            flush();
            continue;
        }

        // A line without any href that closes a div ends the section.
        if (Crawler::is_there($row, '</div>')) {
            break;
        }
    }

    $crawler->close();
}
/**
 * Walk a paginated gallery starting at $this->url and print one link per image.
 *
 * For each page, every `src="..."` found between the `id="pid_` marker and
 * the closing `</ul>` is echoed as a link. The `thumb_100_` part of the
 * image address is replaced by `normal__`, or removed entirely when the
 * request carries a truthy `big` query parameter. The walk follows the
 * `rel="next"` link of the pager until a page has no `Pager_next` entry.
 *
 * All links share one label: the path segment in front of "/page/1" in the
 * start URL. If the start URL does not have that shape the label is empty.
 *
 * @return void
 */
public function go()
{
    $host = 'http://lu.scio.us';
    $url = $this->url;

    // Only read the capture group when the pattern actually matched;
    // otherwise $m has no index 1.
    $text = preg_match('/\\/([^\\/]+)\\/page\\/1/', $url, $m) ? $m[1] : '';

    // Decided once up front: same truthiness test as before, without the
    // error-suppression operator.
    $sizePrefix = empty($_GET['big']) ? 'normal__' : '';

    $finish = false;
    while (!$finish) {
        echo $url . "<br/>\n";
        flush();

        $c = new Crawler($url);
        $c->go_to('id="pid_');
        while ($line = $c->readline()) {
            if (Crawler::is_there($line, 'src="')) {
                $img = Crawler::extract($line, 'src="', '"');
                $img = str_replace('thumb_100_', $sizePrefix, $img);
                echo "<a href='{$img}'>{$text}</a><br/>\n";
                flush();
            } elseif (Crawler::is_there($line, '</ul>')) {
                break;
            }
        }

        // The line after the pager marker holds the "next" link, if any.
        $c->go_to('class="pager"');
        $c->readline();
        if (Crawler::is_there($c->curline, 'Pager_next')) {
            $url = $host . Crawler::extract($c->curline, '<a rel="next" href="', '"');
        } else {
            $finish = true;
        }
        $c->close();
    }
}
/**
 * Print a link for every image listed in the second <noscript> section of an album page.
 *
 * The text between the second `<noscript>` marker (skipping one further
 * line) and `</noscript>` is concatenated with each line trimmed, and all
 * `src="..."` values are pulled out of it. In each address "/s128/" is
 * replaced by "/d/" before the link is echoed.
 *
 * @param string       $url   Address of the album page.
 * @param string|false $alias Link text to use for every image; when falsy,
 *                            the URL-decoded basename of each image is used.
 * @return void
 */
function crawl_album($url, $alias = false)
{
    $crawler = new Crawler($url);

    // The marker is searched twice on purpose: the second match is the one used.
    $crawler->go_to('<noscript>');
    $crawler->go_to('<noscript>');
    $crawler->readline();

    $markup = '';
    while ($row = $crawler->readline()) {
        if (Crawler::is_there($row, '</noscript>')) {
            break;
        }
        $markup .= trim($row);
    }

    $sources = Crawler::extract_to_array($markup, 'src="', '"');
    $crawler->close();

    // A disabled variant by the original author used each image's
    // description as the file name instead of the basename.

    foreach ($sources as $source) {
        $full = str_replace('/s128/', '/d/', $source);
        $label = $alias ? $alias : urldecode(basename($full));
        echo "<a href='{$full}'>{$label}</a><br/>\n";
        flush();
    }
}
/**
 * Collect the images of every page of a chapter.
 *
 * Page addresses are built from the <option> values that follow the
 * `id="pageMenu"` marker, each prefixed with $this->sitename($chapter_url).
 * Every page is then handed to $this->mangareader_1_page() and the returned
 * arrays are combined with the array union operator, so a key that is
 * already present keeps its first value.
 *
 * @param string $chapter_url Address of the chapter's first page.
 * @param mixed  $prefix      Forwarded unchanged to mangareader_1_page().
 * @param mixed  $infix       Forwarded unchanged to mangareader_1_page().
 * @return array Union of the per-page results.
 */
public function get_images($chapter_url, $prefix, $infix)
{
    $site = $this->sitename($chapter_url);

    $crawler = new Crawler($chapter_url);
    $crawler->go_to('id="pageMenu"');

    $pageUrls = array();
    while ($row = $crawler->readline()) {
        if (Crawler::is_there($row, '<option')) {
            $pageUrls[] = $site . Crawler::extract($row, 'value="', '"');
            continue;
        }
        if (Crawler::is_there($row, '</select>')) {
            break;
        }
    }
    $crawler->close();

    $images = array();
    foreach ($pageUrls as $pageUrl) {
        $images += $this->mangareader_1_page($pageUrl, $pageUrl, $prefix, $infix);
    }

    return $images;
}
/**
 * Print one download link per image listed in a chapter's `imageArray` script.
 *
 * Scans the lines after the `imageArray = new Array` marker, takes the first
 * single-quoted value of every `imageArray[` line as the image address and
 * stops at `function loadImage`. Addresses that are not absolute get the
 * global $sitename prepended. Links are labelled
 * "<prefix>-<chapter>-<file name>" using the global $prefix.
 *
 * @param string     $url     Address of the chapter page.
 * @param int|string $chapter Chapter number; passed through Crawler::pad(..., 3)
 *                            (presumably zero-padding to three digits - confirm).
 * @return void
 */
function foolreader_1_chapter($url, $chapter)
{
    global $sitename;
    global $prefix;

    $chapter = Crawler::pad($chapter, 3);

    $c = new Crawler($url);
    $c->go_to('imageArray = new Array');
    while ($line = $c->readline()) {
        if (Crawler::is_there($line, 'imageArray[')) {
            $img = Crawler::extract($line, "'", "'");
            // Treat both http:// and https:// (in any letter case) as
            // absolute; everything else is relative to the site root.
            if (!preg_match('#^https?://#i', $img)) {
                $img = $sitename . $img;
            }
            $fname = basename($img);
            echo "<a href='{$img}'>{$prefix}-{$chapter}-{$fname}</a><br/>\n";
        } elseif (Crawler::is_there($line, 'function loadImage')) {
            break;
        }
    }
    $c->close();

    // @TODO (from the original author): a disabled variant collected page
    // addresses from <option value='...'> entries and was meant to pass
    // them to Crawler::multiProcess() with 'foolreader_1_page'.
}
/**
 * Crawl every page of one chapter and print the results inside a <ul> element.
 *
 * Page addresses are built from the <option> values that follow the
 * `id="pageMenu"` marker, each prefixed with $this->sitename. Each page is
 * then processed by $this->mangareader_1_page(), called sequentially.
 *
 * @param array $v Chapter descriptor; the keys 'url' and 'infix' are read.
 * @return void
 */
public function crawl_chapter($v)
{
    $crawler = new Crawler($v['url']);
    $crawler->go_to('id="pageMenu"');

    $pageUrls = array();
    while ($row = $crawler->readline()) {
        if (Crawler::is_there($row, '<option')) {
            $pageUrls[] = $this->sitename . Crawler::extract($row, 'value="', '"');
            continue;
        }
        if (Crawler::is_there($row, '</select>')) {
            break;
        }
    }
    $crawler->close();

    echo '<ul>';
    // A parallel variant via Crawler::multiProcess() was disabled by the
    // original author; pages are handled one after another.
    foreach ($pageUrls as $pageUrl) {
        $this->mangareader_1_page($pageUrl, $pageUrl, $v['infix']);
    }
    echo '</ul>';
}
/**
 * Download every image of a paginated gallery into a directory.
 *
 * Each gallery page is scanned for lines containing `border=0`; the `src`
 * of such a line is rewritten ("/thumb/" becomes "/full/", a "/x<digit>."
 * part becomes "/") and stored through $this->save_to() under a running
 * number produced by Crawler::n(..., 4) plus the result of
 * Crawler::cutfromlast() on the file name (presumably its extension -
 * confirm). Scanning of a page stops at `</form>`.
 *
 * When a `:: next ::` line is found before the first `<table style=` line,
 * its href (HTML-entity-decoded) is appended to the original $base to form
 * the next page address; otherwise the current page is the last one.
 *
 * @param string $base        Address of the first gallery page.
 * @param string $destination Directory path the files are saved under.
 * @return void
 */
private function download_all($base, $destination)
{
    $root = $base;
    $pageUrl = $base;
    $counter = 1;

    do {
        $crawler = new Crawler($pageUrl);
        echo $pageUrl . "\n";

        $crawler->go_to(array('<table style=', ':: next ::'));
        $hasNext = Crawler::is_there($crawler->curline, ':: next ::');
        if ($hasNext) {
            $relative = Crawler::extract($crawler->curline, 'href="', '"');
            $pageUrl = $root . html_entity_decode($relative);
            $crawler->go_to('<table style=');
        }

        while ($row = $crawler->readline()) {
            if (Crawler::is_there($row, 'border=0')) {
                $image = Crawler::extract($row, 'src="', '"');
                $image = str_replace('/thumb/', '/full/', $image);
                $image = preg_replace('/\\/x\\d\\./', '/', $image);

                $suffix = Crawler::cutfromlast(basename($image), '.');
                $number = Crawler::n($counter++, 4);
                $this->save_to($image, "{$destination}/{$number}{$suffix}");
                continue;
            }
            if (Crawler::is_there($row, '</form>')) {
                break;
            }
        }

        $crawler->close();
    } while ($hasNext);
}
if (!Crawler::is_there($line, '<a href')) { $line = $c->readline(); // Title dan link } } // ada yg berupa original/reprint, ... if (preg_match('/class="red">(.*)<\\/h4>.*class="blue" href="([^"]*)">original<.*href="([^"]*)">reprint</', $line, $matches)) { $result[$name][strip_tags($matches[1]) . '-original'] = html_entity_decode($matches[2]); $result[$name][strip_tags($matches[1]) . '-reprint'] = html_entity_decode($matches[3]); } else { if (preg_match('/href="([^"]*)">(.*)<\\/a>/', $line, $matches)) { $result[$name][strip_tags($matches[2])] = html_entity_decode($matches[1]); } } } else { if (Crawler::is_there($line, '</tbody>')) { break; } } } $c->close(); } ob_start(); echo "<?php\n"; echo '$a = '; var_export($result); echo ';'; file_put_contents('disneycomics.phase1', ob_get_clean()); break; case 'phase2': require 'disneycomics.phase1';
/**
 * Print download links for all pages of the given chapters.
 *
 * For each chapter the page "<global $base>&c=<chapter id>" is crawled. The
 * visible text of every <option> after the `name='page'` marker becomes a
 * page name, keyed by the option's value. The directory of the image on the
 * `class='manga-img'` line is used as the image folder. A link to
 * "<folder>/<page name>.png" labelled "<prefix>-<infix>-<page name>.png" is
 * echoed per page, using the global $prefix.
 *
 * @param array $chapters Chapter ids, appended to the URL as the `c` parameter.
 * @param array $infixs   Label infix per chapter, looked up by the same key
 *                        and passed through Crawler::pad(..., 3).
 * @return void
 */
function omfggscans_chapters($chapters, $infixs)
{
    global $base;
    global $sitename;
    global $prefix;

    foreach ($chapters as $index => $chapterId) {
        $chapterUrl = $base . "&c={$chapterId}";
        $infix = Crawler::pad($infixs[$index], 3);
        echo "{$chapterUrl}<br/>\n";

        $crawler = new Crawler($chapterUrl);

        // Page list: option value => option text.
        $crawler->go_to("name='page'");
        $pageNames = array();
        while ($row = $crawler->readline()) {
            if (Crawler::is_there($row, '<option')) {
                $value = Crawler::extract($row, "value='", "'");
                $pageNames[$value] = Crawler::extract($row, "'>", "</");
                continue;
            }
            if (Crawler::is_there($row, '</select>')) {
                break;
            }
        }

        // The displayed image only serves as a sample for the folder part.
        $crawler->go_to("class='manga-img'");
        $sample = Crawler::extract($crawler->curline, 'src="', '"');
        $folder = dirname($sample) . '/';
        $suffix = '.png';
        $crawler->close();

        foreach ($pageNames as $pageName) {
            $target = $folder . $pageName . $suffix;
            $label = "{$prefix}-{$infix}-{$pageName}{$suffix}";
            echo "<a href='{$target}'>{$label}</a><br />\n";
        }
    }
}
$base = $_POST['base']; $prefix = $_POST['prefix']; $sitename = "http://eatmanga.com"; $pref = $_POST['base']; if (!Crawler::is_there($pref, '/index.php/')) { $pref = str_replace($sitename . '/Manga', $sitename . '/index.php/Manga', $pref); } if ($base) { $finish = false; $page = 1; while (!$finish) { echo "{$base}<br/>\n"; flush(); $c = new Crawler($base); $c->go2linewhere('mangaviewer_toppest_navig'); if (Crawler::is_there($c->curline, ' ›')) { $finish = false; $base = $pref . '/?page=' . ++$page; } else { $finish = true; } $ledak = explode('<img src="', $c->curline); $c->close(); for ($i = 1; $i < count($ledak); ++$i) { $segm = $ledak[$i]; $parturl = Crawler::cutuntil($segm, '"'); $parturl = str_replace('index.php', 'mangas', $parturl); $parturl = str_replace('?action=thumb', '', $parturl); echo '<a href="' . $sitename . $parturl . '">' . $prefix . '-' . Crawler::n($chapter, 3) . '-' . basename($parturl) . '</a><br/>' . "\n"; flush(); }
<?php

require 'crawler.php';

// Source site: http://www.viraindo.com/
//
// Walks the links that follow the 'WIDTH=273' marker on the front page,
// opens each linked page, and prints a link to the first '<img src="'
// found there, labelled with the text of the front-page link. Stops at the
// '<p></TD></TR>' marker.
$site = 'http://www.viraindo.com/';

$index = new Crawler($site);
$index->go_to('WIDTH=273');

while ($row = $index->readline()) {
    if (Crawler::is_there($row, 'href="')) {
        $relativePage = Crawler::extract($row, 'href="', '"');
        $caption = Crawler::extract($row, '">', '</a');

        // Visit the linked page just long enough to read its first image.
        $detail = new Crawler($site . $relativePage);
        $detail->go_to('<img src="');
        $image = $detail->getbetween('<img src="', '"');

        echo "<a href='{$site}{$image}'>{$caption}</a><br/>\n";
        flush();
        $detail->close();
        continue;
    }

    if (Crawler::is_there($row, '<p></TD></TR>')) {
        break;
    }
}

$index->close();
// 2 Pergi ke baris yang berisi definisi CSS salah satu potongan $c->go_to('/#.+position.+width.+height.+top.+left/', '', true); // 3 Iterasi hingga ketemu baris penutup (berisi '-->') while ($line = $c->readline()) { if (preg_match('/#(\\w+) .+width:(\\d+).*height:(\\d+).*top:(\\d+).*left:(\\d+)/', $line, $match)) { // 3a Ambil informasi id, z-index, height, width, left, top tiap potongan list($all, $id, $width, $height, $top, $left) = $match; if (preg_match('/z-index:(\\d+)/', $line, $match)) { $zindex = $match[1]; } else { $zindex = 0; } // 3b Masukkan ke array (var $imgs) $imgs[$id] = array('id' => $id, 'zindex' => $zindex, 'width' => $width, 'height' => $height, 'top' => $top, 'left' => $left); } else { if (Crawler::is_there($line, '-->')) { break; } } } // 4 Pergi ke baris yang berisi ukuran total gambar $reg = '/<div .+position:relative.+width:(\\d+).+height:(\\d+)/'; $c->go_to($reg, '', true); preg_match($reg, $c->curline, $match); // 5 Ambil $tot_width dan $tot_height dari baris ini list($all, $tot_width, $tot_height) = $match; // 6 Iterasi hingga ketemu baris penutup (regex '/^\\s+<\\/div>/') while ($line = $c->readline()) { if (preg_match('/<div id="([^"]+)".+src="([^"]+)"/', $line, $match)) { // 6a Ambil informasi id, src tiap potongan list($all, $id, $src) = $match;
// Cookie

// For every entry of $targets, follow the gallery's "Next" links page by
// page and collect the full-size image addresses into $imgs[<target key>].
// The collected structure is dumped with var_export() at the end.
$base = 'http://www.comicgirls.net';
$imgs = array();
foreach ($targets as $k => $url) {
    $imgs[$k] = array();
    do {
        echo "{$url}<br />\n";
        $c = new Crawler($url);

        // Is there a next page? Scan the navigation area up to the '<i>(' line.
        $next = false;
        $c->go_to('>Navigation');
        while (true) {
            $line = $c->readline();
            // Stop at end of input so a page without the end marker cannot
            // loop forever (assumes readline() yields false or null there -
            // confirm against Crawler::readline).
            if ($line === false || $line === null) {
                break;
            }
            if (Crawler::is_there($line, '<i>(')) {
                break;
            }
            if (Crawler::is_there($line, '>Next<')) {
                $next = true;
                $url = $base . Crawler::extract($line, "href='", "'");
                break;
            }
        }

        // Grab the gallery: every src up to the closing </table>.
        $c->go_to("'catThumb'");
        while (true) {
            $line = $c->readline();
            // Same end-of-input guard as above.
            if ($line === false || $line === null) {
                break;
            }
            if (Crawler::is_there($line, '</table>')) {
                break;
            }
            if (Crawler::is_there($line, 'src=')) {
                $raw = $base . html_entity_decode(Crawler::extract($line, "src='", "'"));
                // Replace everything from "&max_size=" onwards with fixed
                // parameters that request a large, non-thumbnail image.
                $new = preg_replace('/&max_size=.*$/', '&max_size=6000&thumb=NO', $raw);
                $imgs[$k][] = $new;
            }
        }
        $c->close();
    } while ($next);
}
var_export($imgs);
/**
 * Print a link for every "Image Only" entry of a paginated image list.
 *
 * Starting at $url, each page's `id='Navigationleft'` line is checked for a
 * "Next" link, which is prefixed with the site root and becomes the
 * following page. All lines after `id='image-list'` up to `<footer>` are
 * scanned for `>Image Only<` entries. Every link is labelled with the
 * URL-decoded name of the directory part of the start URL.
 *
 * @param string $url Address of the first list page.
 * @return void
 */
function rule34($url)
{
    $text = rawurldecode(basename(dirname($url)));
    $site = 'http://rule34.paheal.net';

    $continue = true;
    while ($continue) {
        echo "{$url}<br/>";
        $c = new Crawler($url);

        // The navigation line itself carries the "Next" link, if there is one.
        $c->go_to("id='Navigationleft'");
        $line = $c->curline;
        if (preg_match('/<a href="([^\'"]+)">Next/', $line, $m)) {
            $url = $site . $m[1];
        } else {
            $continue = false;
        }

        $c->go_to("id='image-list'");
        while ($line = $c->readline()) {
            if (Crawler::is_there($line, '>Image Only<')) {
                $href = Crawler::extract($line, '<br><a href="', '"');
                echo "<a href='{$href}'>{$text}</a><br/>\n";
            } elseif (Crawler::is_there($line, '<footer>')) {
                break;
            }
        }

        // Release the crawler before the next page is opened; previously a
        // new Crawler was created per page and never closed.
        $c->close();
    }
}
break; } } } $c->close(); foreach ($pages as $url) { echo "URL:{$url}<br/>\n"; $c = new Crawler($url, array('use_curl' => true)); $c->go_to('</span>'); // ambil image source $raws = Crawler::extract_to_array($c->curline, 'src="', '"'); echo '<pre>'; print_r($raws); echo '</pre>'; // gambar image biasanya berada di $raws[4] atau $raws[5] if (Crawler::is_there($raws[0], '/n/next.png')) { array_shift($raws); } // gambar image namanya lebih panjang $base1 = basename($raws[4]); $base2 = basename($raws[5]); if (strlen($base1) > strlen($base2)) { $img = $raws[4]; } else { $img = $raws[5]; } $fname = basename($img); echo "<a href='{$img}'>{$fname}</a><br/>\n"; // Download original if (preg_match('/href="([^"]+)">Download original/', $c->curline, $matches)) { $img = $matches[1];