/**
 * Prints one HTML download link per page of a single chapter.
 *
 * Page ids come from the <option> lines that follow the "pagejump" marker;
 * the image folder is taken from the sample image found after the
 * "nextpage" marker. Every page is assumed to be a ".jpg" in that folder.
 *
 * @param string $url     Address of any page of the chapter.
 * @param mixed  $chapter Chapter number; padded through Crawler::pad(…, 3)
 *                        (presumably zero-padding to three digits — confirm).
 */
function crawl_1_chapter($url, $chapter)
{
    global $sitename;
    global $prefix;

    $crawler = new Crawler($url);

    // Gather the page ids listed in the page selector.
    $crawler->go_to('name="pagejump"');
    $pageIds = array();
    while (true) {
        $row = $crawler->readline();
        if (!$row) {
            break;
        }
        if (Crawler::is_there($row, '<option')) {
            $pageIds[] = Crawler::extract($row, 'value="', '"');
            continue;
        }
        if (Crawler::is_there($row, '</select>')) {
            break;
        }
    }

    // The line after the "nextpage" marker carries a sample image URL.
    $crawler->go_to('id="nextpage"');
    $crawler->readline();
    $sampleImage = $crawler->getbetween('src="', '"');
    $crawler->close();

    $imageFolder = dirname($sampleImage);
    $suffix = '.jpg';
    $paddedChapter = Crawler::pad($chapter, 3);

    foreach ($pageIds as $pageId) {
        echo "<a href='{$imageFolder}/{$pageId}{$suffix}'>{$prefix}-{$paddedChapter}-{$pageId}{$suffix}</a><br/>\n";
        flush();
    }
}
/**
 * Collects the first space-delimited token of each line in the section that
 * starts after the second dashed separator line and ends at the next one.
 *
 * @param string $url Page to read.
 *
 * @return array Tokens in document order.
 */
function crawl_one_page($url)
{
    $separator = '------------------------------------------';
    $tokens = array();

    $reader = new Crawler($url);

    // Skip to the second separator, then step onto the first data line.
    $reader->go2linewhere($separator);
    $reader->go2linewhere($separator);
    $reader->readline();

    for (; $reader->strpos($separator) === false; $reader->readline()) {
        $tokens[] = $reader->getbetween(' ', ' ');
    }

    $reader->close();

    return $tokens;
}
/**
 * Prints an HTML download link for the comic image found on one page.
 *
 * The link text is "<year>_<month>_<day>_<title><ext>", built from the page
 * <title>, the m/d/y date found two lines after the "date_left.gif" marker,
 * and the extension of the image file.
 *
 * If the date or the image cannot be found, a short notice is printed
 * instead of a malformed link.
 *
 * @param string $url Page to crawl.
 */
public function crawl_page($url)
{
    $c = new Crawler($url);

    // Title: everything after the site prefix, with non-word characters
    // replaced so the result is safe as part of a file name.
    $c->go_to('<title>');
    $title = Crawler::extract($c->curline, 'PHD Comics: ', '</title>');
    $title = preg_replace('/\\W/', '_', $title);

    // Date: expected as month/day/year two lines below the marker.
    $c->go_to('date_left.gif');
    $c->readline(2);
    if (!preg_match('/([0-9]+)\\/([0-9]+)\\/([0-9]+)/', $c->curline, $matches)) {
        $c->close();
        echo 'No date found on ' . htmlspecialchars($url, ENT_QUOTES, 'UTF-8') . '<br/>';
        flush();
        return;
    }
    list($full, $month, $date, $year) = $matches;
    if (strlen($date) < 2) {
        $date = '0' . $date;
    }
    if (strlen($month) < 2) {
        $month = '0' . $month;
    }
    $fileprefix = "{$year}_{$month}_{$date}_{$title}";

    // Image URL. The capture stops at whitespace, a quote or '>' so that the
    // closing quote of the attribute is never swallowed into the URL (the
    // former "[^ ]+" capture was greedy and kept it).
    $c->go2linewhere('<td bgcolor=#FFFFFF');
    $found = preg_match('/<img src=["\']?([^ "\'>]+)/i', $c->curline, $matches);
    $c->close();
    unset($c);

    if (!$found) {
        echo 'No image found on ' . htmlspecialchars($url, ENT_QUOTES, 'UTF-8') . '<br/>';
        flush();
        return;
    }

    $img = $matches[1];
    $filename = basename($img);
    $dot = strrpos($filename, '.');
    // A file name without a dot has no extension (previously the whole name
    // was used as the "extension").
    $ext = $dot === false ? '' : substr($filename, $dot);

    echo "<a href='{$img}'>" . $fileprefix . $ext . "</a><br/>";
    flush();
}
/**
 * Prints a link for every href found inside the page's "entry" div.
 *
 * All links share the same text: the last path segment of $url (with the
 * final character of $url, presumably a trailing slash, dropped first),
 * HTML-entity decoded.
 *
 * @param string $url Page to crawl.
 */
function crawl_1_page($url)
{
    echo "URL2 {$url} <br/>\n";
    flush();

    $trimmedUrl = substr($url, 0, -1);
    $label = html_entity_decode(Crawler::cutfromlast1($trimmedUrl, '/'));

    $crawler = new Crawler($url);
    $crawler->go_to('<div class="entry">');

    for (;;) {
        $row = $crawler->readline();
        if (!$row) {
            break;
        }

        // Single-quoted hrefs take precedence over double-quoted ones.
        $quote = null;
        if (Crawler::is_there($row, "href='")) {
            $quote = "'";
        } elseif (Crawler::is_there($row, 'href="')) {
            $quote = '"';
        }

        if ($quote !== null) {
            $img = Crawler::extract($row, 'href=' . $quote, $quote);
            echo "<a href='{$img}'>{$label}</a><br/>\n";
            flush();
            continue;
        }

        if (Crawler::is_there($row, '</div>')) {
            break;
        }
    }

    $crawler->close();
}
/**
 * Prints a download link for every image in the second <noscript> section
 * of an album page.
 *
 * Thumbnail URLs ("/s128/") are rewritten to the "/d/" variant. The link
 * text is $alias when given, otherwise the URL-decoded file name.
 *
 * @param string       $url   Album page.
 * @param string|false $alias Optional fixed link text for every image.
 */
function crawl_album($url, $alias = false)
{
    $crawler = new Crawler($url);

    // The image list lives in the second <noscript> block; skip its first line.
    $crawler->go_to('<noscript>');
    $crawler->go_to('<noscript>');
    $crawler->readline();

    // Flatten the block into one string so all src attributes can be pulled at once.
    $markup = '';
    while (true) {
        $row = $crawler->readline();
        if (!$row || Crawler::is_there($row, '</noscript>')) {
            break;
        }
        $markup .= trim($row);
    }

    $thumbnails = Crawler::extract_to_array($markup, 'src="', '"');
    $crawler->close();

    foreach ($thumbnails as $thumbnail) {
        $img = str_replace('/s128/', '/d/', $thumbnail);
        $label = $alias ? $alias : urldecode(basename($img));
        echo "<a href='{$img}'>{$label}</a><br/>\n";
        flush();
    }
}
/**
 * Walks a paginated gallery starting at $this->url and prints a link for
 * every thumbnail found, following the pager until no next page is offered.
 *
 * Thumbnails ("thumb_100_") are rewritten to the "normal__" variant, or to
 * the unprefixed full-size file when the request carries a truthy "big"
 * query parameter. Every link uses the gallery name (the path segment before
 * "/page/1" in the start URL) as its text.
 */
public function go()
{
    $host = 'http://lu.scio.us';
    $url = $this->url;

    // Gallery name; empty when the start URL does not have the expected shape.
    $text = preg_match('/\\/([^\\/]+)\\/page\\/1/', $url, $m) ? $m[1] : '';

    // Explicit check instead of suppressing the undefined-index notice with "@".
    $size_prefix = !empty($_GET['big']) ? '' : 'normal__';

    $finish = false;
    while (!$finish) {
        echo $url . "<br/>\n";
        flush();

        $c = new Crawler($url);
        $c->go_to('id="pid_');
        while ($line = $c->readline()) {
            if (Crawler::is_there($line, 'src="')) {
                $img = Crawler::extract($line, 'src="', '"');
                $img = str_replace('thumb_100_', $size_prefix, $img);
                echo "<a href='{$img}'>{$text}</a><br/>\n";
                flush();
            } elseif (Crawler::is_there($line, '</ul>')) {
                break;
            }
        }

        // The line after the pager marker holds the next-page link, if any.
        $c->go_to('class="pager"');
        $c->readline();
        if (Crawler::is_there($c->curline, 'Pager_next')) {
            $url = $host . Crawler::extract($c->curline, '<a rel="next" href="', '"');
        } else {
            $finish = true;
        }
        $c->close();
    }
}
/**
 * Prints links to every file listed in the directory that holds a gallery's
 * images.
 *
 * The directory is derived from the first thumbnail link on the gallery page;
 * its last segment is raw-URL-encoded, and the resulting address is then read
 * as a listing whose entries sit between <ul> and </ul>.
 *
 * @param string $url Gallery page.
 */
function crawl1page($url)
{
    echo 'Entering ' . $url . '<br/>';
    flush();

    // Find one sample image link to learn where the files live.
    $c = new Crawler($url);
    $c->go2linewhere('<div class="ngg-gallery-thumbnail"');
    $c->readline();
    $sample = $c->getbetween('href="', '"');
    $c->close();

    $dir = dirname($sample);
    if (!$dir) {
        return;
    }
    $folder = substr($dir, strrpos($dir, '/') + 1);
    $dir = dirname($dir) . '/' . rawurlencode($folder) . '/';
    echo 'Dir:' . $dir . '<br/>' . "\n";
    flush();

    $c = new Crawler($dir);
    $c->go2linewhere('<ul>');
    $c->readline();
    while ($line = $c->readline()) {
        // Stop at the end of the list or at the thumbs/ sub-directory entry.
        // Both tests compare against false explicitly: strpos() returns 0 for
        // a match at the start of the line, which a bare truthiness test
        // would treat as "not found".
        if (strpos($line, '</ul>') !== false || strpos($line, '"thumbs/"') !== false) {
            break;
        }
        $filename = Crawler::extract($line, 'href="', '"');
        echo '<a href="' . $dir . $filename . '">' . rawurldecode($filename) . '</a><br/>' . "\n";
        flush();
    }
    $c->close();

    echo '<br/>' . "\n";
    flush();
}
/**
 * Walks a paginated thumbnail gallery starting at $this->url and prints a
 * link to the full-size version of every thumbnail.
 *
 * Thumbnail URLs are rewritten from "/thumbs/" to "/images/" and the
 * "/small/<digits>-" part is collapsed to "/". The link text is the name of
 * the directory that contains the image.
 */
public function go()
{
    $start_url = $this->url;
    if (preg_match('/gallery1\\.hentaifromhell\\.net/', $start_url)) {
        $base_url = 'http://gallery1.hentaifromhell.net';
    } else {
        $base_url = 'http://gallery.hentaifromhell.net';
    }

    $finish = false;
    while (!$finish) {
        // Assume this is the last page until a next-page link is seen.
        $finish = true;
        echo $start_url, "<br />\n";
        flush();

        $c = new Crawler($start_url);
        $c->go2linewhere('<li class="thumbnail">');
        while ($line = $c->readline()) {
            if (strpos($line, 'src="') !== false) {
                // Image line: turn the thumbnail URL into the full-size URL.
                $uri = Crawler::extract($line, 'src="', '"');
                $uri = str_replace('/thumbs/', '/images/', $uri);
                // Only rewrite when the size marker is actually present;
                // otherwise $matches[1] would be undefined.
                if (preg_match('/(\\/small\\/\\d+-)/', $uri, $matches)) {
                    $uri = str_replace($matches[1], '/', $uri);
                }
                $file = basename(dirname($uri));
                echo "<a href='{$uri}'>{$file}</a><br/>\n";
                flush();
            } elseif (strpos($line, 'class="pagNext"') !== false) {
                // Next-page link found: continue with it.
                $finish = false;
                $start_url = html_entity_decode(Crawler::extract($line, 'class="pagNext" href="', '"'));
                // A root-relative link ("/path", but not "//host/path") needs
                // the site host in front; absolute links are left untouched.
                // NOTE(review): assumes Crawler does not resolve relative
                // URLs itself — confirm.
                if (strpos($start_url, '/') === 0 && strpos($start_url, '//') !== 0) {
                    $start_url = $base_url . $start_url;
                }
                break;
            } elseif (strpos($line, '</table>') !== false) {
                // End of the gallery table: done with this page.
                break;
            }
        }
        $c->close();
    }
}
/**
 * Finds every page of one chapter and hands the page URLs to
 * Crawler::multiProcess() for processing by 'crawl_1_page'.
 *
 * Example chapter URL: http://ani-haven.net/hr-alpha/Psyren/145/
 *
 * @param string $url     Chapter URL; page values are appended to it verbatim.
 * @param mixed  $chapter Chapter number; passed on after Crawler::pad(…, 3).
 */
function crawl_1_chapter($url, $chapter)
{
    global $sitename;
    global $prefix;

    $chapter = Crawler::pad($chapter, 3);

    // The page selector's options all sit on the line after the marker.
    $crawler = new Crawler($url);
    $crawler->go_to('id="myselectbox3"');
    $crawler->readline();
    $pageValues = Crawler::extract_to_array($crawler->curline, 'value="', '"');
    $crawler->close();

    // Turn each option value into a full page URL, keeping the original keys.
    $pageUrls = array();
    foreach ($pageValues as $index => $value) {
        $pageUrls[$index] = $url . $value;
    }

    Crawler::multiProcess(4, $pageUrls, 'crawl_1_page', array($chapter));
}
/**
 * Collects the images of every page of a chapter.
 *
 * Page addresses are read from the <option> lines following the "pageMenu"
 * marker and prefixed with the site name; each page is then passed to
 * mangareader_1_page() and the per-page results are combined with the array
 * union operator (so on duplicate keys the earlier entry wins).
 *
 * @param string $chapter_url Address of the chapter.
 * @param mixed  $prefix      Forwarded to mangareader_1_page().
 * @param mixed  $infix       Forwarded to mangareader_1_page().
 *
 * @return array Combined results of all pages.
 */
public function get_images($chapter_url, $prefix, $infix)
{
    $site = $this->sitename($chapter_url);

    $crawler = new Crawler($chapter_url);
    $crawler->go_to('id="pageMenu"');

    $pageUrls = array();
    while (true) {
        $row = $crawler->readline();
        if (!$row) {
            break;
        }
        if (Crawler::is_there($row, '<option')) {
            $pageUrls[] = $site . Crawler::extract($row, 'value="', '"');
            continue;
        }
        if (Crawler::is_there($row, '</select>')) {
            break;
        }
    }
    $crawler->close();

    $collected = array();
    foreach ($pageUrls as $pageUrl) {
        $collected += $this->mangareader_1_page($pageUrl, $pageUrl, $prefix, $infix);
    }

    return $collected;
}
/**
 * Prints a download link for every image listed in the page's JavaScript
 * "imageArray" definition.
 *
 * Image paths that do not start with "http://" are prefixed with the global
 * $sitename. Link text is "<prefix>-<chapter>-<file name>".
 *
 * TODO (carried over from the previous revision): an alternative that reads
 * the pages from the <option> list and processes them in parallel was
 * sketched but never enabled.
 *
 * @param string $url     Chapter page.
 * @param mixed  $chapter Chapter number; padded through Crawler::pad(…, 3).
 */
function foolreader_1_chapter($url, $chapter)
{
    global $sitename;
    global $prefix;

    $paddedChapter = Crawler::pad($chapter, 3);

    $crawler = new Crawler($url);
    $crawler->go_to('imageArray = new Array');

    for (;;) {
        $row = $crawler->readline();
        if (!$row) {
            break;
        }

        if (!Crawler::is_there($row, 'imageArray[')) {
            // The array entries end where the loader function begins.
            if (Crawler::is_there($row, 'function loadImage')) {
                break;
            }
            continue;
        }

        $img = Crawler::extract($row, "'", "'");
        if (strpos($img, 'http://') !== 0) {
            $img = $sitename . $img;
        }
        $fname = basename($img);
        echo "<a href='{$img}'>{$prefix}-{$paddedChapter}-{$fname}</a><br/>\n";
    }

    $crawler->close();
}
/**
 * Processes every page of one chapter, wrapping the output in a <ul>.
 *
 * Page addresses are read from the <option> lines following the "pageMenu"
 * marker and prefixed with $this->sitename; each is passed to
 * mangareader_1_page() together with the chapter's infix.
 *
 * @param array $v Chapter descriptor; the keys 'url' and 'infix' are read.
 */
public function crawl_chapter($v)
{
    $crawler = new Crawler($v['url']);
    $crawler->go_to('id="pageMenu"');

    $pageUrls = array();
    while (true) {
        $row = $crawler->readline();
        if (!$row) {
            break;
        }
        if (Crawler::is_there($row, '<option')) {
            $pageUrls[] = $this->sitename . Crawler::extract($row, 'value="', '"');
            continue;
        }
        if (Crawler::is_there($row, '</select>')) {
            break;
        }
    }
    $crawler->close();

    echo '<ul>';
    foreach ($pageUrls as $pageUrl) {
        $this->mangareader_1_page($pageUrl, $pageUrl, $v['infix']);
    }
    echo '</ul>';
}
/**
 * Walks a paginated gallery starting at $this->url and prints a link for
 * every "showimg.php?c=" target, following the "Next»" link until there is
 * none.
 *
 * The "showimg.php?c=" part is stripped from each href to obtain the direct
 * image address; the link text is the name of the directory containing it.
 */
public function go()
{
    $start_url = $this->url;
    if (preg_match('/gallery1\\.hentaifromhell\\.net/', $start_url)) {
        $base = 'http://gallery1.hentaifromhell.net';
    } else {
        $base = 'http://gallery.hentaifromhell.net';
    }

    $selesai = false;
    while (!$selesai) {
        // Treat the page as the last one unless a next link is found below.
        // Without this, a page lacking a "Next»" line left the flag unset and
        // the same URL was fetched again forever.
        $selesai = true;

        echo "{$start_url}<br/>\n";
        $craw = new Crawler($start_url);
        $craw->go2linewhere('showimg.php?c=');
        while ($line = $craw->readline()) {
            if (strpos($line, 'showimg.php?c=') !== false) {
                // A line may hold several image links.
                $raw = Crawler::extract_to_array($line, '<a href="', '"');
                foreach ($raw as $r) {
                    $href = str_replace('showimg.php?c=', '', $r);
                    $text = basename(dirname($href));
                    echo '<a href="' . $href . '">' . $text . '</a>' . "<br />\n";
                }
            } elseif (strpos($line, 'Next»') !== false) {
                // The pager line carries a link only when another page exists.
                if (strpos($line, '<a href') !== false) {
                    $start_url = $base . Crawler::extract($line, '<a href="', '"');
                    $selesai = false;
                }
                break;
            }
        }
        $craw->close();
    }
}
/**
 * Downloads every image of a paginated gallery into $destination.
 *
 * Each page is scanned for thumbnail lines (marked by "border=0"); the
 * thumbnail URL is rewritten to its full-size form and saved under a running
 * number formatted by Crawler::n(…, 4) plus the original extension. Paging
 * continues while a ":: next ::" link is found before the image table.
 *
 * @param string $base        Address of the first gallery page; next-page
 *                            links are appended to this same address.
 * @param string $destination Target directory (no trailing slash).
 */
private function download_all($base, $destination)
{
    $firstPage = $base;
    $counter = 1;

    do {
        $crawler = new Crawler($base);
        echo $base . "\n";

        // Stop at whichever comes first: the image table or the next link.
        $crawler->go_to(array('<table style=', ':: next ::'));
        $hasNext = Crawler::is_there($crawler->curline, ':: next ::');
        if ($hasNext) {
            $nextHref = Crawler::extract($crawler->curline, 'href="', '"');
            $base = $firstPage . html_entity_decode($nextHref);
            $crawler->go_to('<table style=');
        }

        for (;;) {
            $row = $crawler->readline();
            if (!$row) {
                break;
            }
            if (!Crawler::is_there($row, 'border=0')) {
                if (Crawler::is_there($row, '</form>')) {
                    break;
                }
                continue;
            }

            $img = Crawler::extract($row, 'src="', '"');
            $img = str_replace('/thumb/', '/full/', $img);
            $img = preg_replace('/\\/x\\d\\./', '/', $img);

            $ext = Crawler::cutfromlast(basename($img), '.');
            $number = Crawler::n($counter++, 4);
            $this->save_to($img, "{$destination}/{$number}{$ext}");
        }

        $crawler->close();
    } while ($hasNext);
}
require_once "crawler.php";

// Defaults; each may be overridden by a query-string parameter of the same name.
$start_date = '2009-03-10';
$base_url = 'http://www.dilbert.com';
$middle_url = '/strips/comic/';

// Only these three settings are taken from the request. extract($_GET) would
// let a request define or overwrite arbitrary variables in this scope.
if (isset($_GET['start_date']) && is_string($_GET['start_date'])) {
    $start_date = $_GET['start_date'];
}
if (isset($_GET['base_url']) && is_string($_GET['base_url'])) {
    $base_url = $_GET['base_url'];
}
if (isset($_GET['middle_url']) && is_string($_GET['middle_url'])) {
    $middle_url = $_GET['middle_url'];
}

// Walk from the starting strip along the "STR_Prev" links, printing one image
// link per strip, until a page without such a link is reached.
$selesai = false;
$url = $base_url . $middle_url . $start_date;
while (!$selesai) {
    $c = new Crawler($url);
    // Values below may originate from the request, so they are HTML-escaped.
    echo 'URL is ' . htmlspecialchars($url, ENT_QUOTES, 'UTF-8') . "<br />\n";
    flush();

    // Stop at whichever marker comes first.
    $c->go2lineor(array('STR_Content', 'STR_Prev'));
    if ($c->strpos('STR_Prev') !== false) {
        // Another strip follows: remember its address, then move on to the content.
        $url = $base_url . $c->getbetween('<a href="', '"');
        $c->go2linewhere('STR_Content');
    } else {
        $selesai = true;
    }

    // The image tag is on the line after the content marker.
    $c->readline();
    $img = $c->getbetween('<img src="', '"');
    echo "<a href='" . htmlspecialchars($base_url . $img, ENT_QUOTES, 'UTF-8') . "'>"
        . htmlspecialchars($start_date, ENT_QUOTES, 'UTF-8') . "</a><br />\n";

    // The date used as the next link text comes from the next URL.
    $start_date = Crawler::extract($url, 'comic/', '/');
    $c->close();
    echo "Closed\n";
    flush();
}
$base = 'http://disneycomics.free.fr/'; $tree = array('Carl Barks' => 'index_barks_date.php', 'Don Rosa' => 'index_rosa_date.php', 'Marco Rota' => 'index_rota_date.php', 'Romano Scarpa' => 'index_scarpa_date.php', 'Tony Strobl' => 'index_strobl_date.php', 'Al Taliaferro' => 'index_taliaferro.php', 'Vicar' => 'index_vicar_date.php', 'William Van Horn' => 'index_vanhorn_date.php', 'Paul Murry' => 'index_murry_date.php', 'Daily Strips' => 'index_dailies.php', 'Sunday Strips' => 'index_sunday.php'); function download_it($img_url, $output_file) { $dir = dirname($output_file) . '\\'; //exec("mkdir \"$dir\""); exec("wget -t 0 --retry-connrefused -O \"{$output_file}\" {$img_url}"); } $mode = 'phase3b'; switch ($mode) { case 'beginning': $result = array(); foreach ($tree as $name => $link) { $c = new Crawler($base . $link); $c->go_to('<tbody>'); while ($line = $c->readline()) { if (Crawler::is_there($line, '<tr>')) { $line = $c->readline(); // nomor urut if (!Crawler::is_there($line, '<a href')) { $line = $c->readline(); // Hero if (!Crawler::is_there($line, '<a href')) { $line = $c->readline(); // Title dan link } } // ada yg berupa original/reprint, ... if (preg_match('/class="red">(.*)<\\/h4>.*class="blue" href="([^"]*)">original<.*href="([^"]*)">reprint</', $line, $matches)) { $result[$name][strip_tags($matches[1]) . '-original'] = html_entity_decode($matches[2]); $result[$name][strip_tags($matches[1]) . '-reprint'] = html_entity_decode($matches[3]);
/**
 * Prints download links for every page of each requested chapter.
 *
 * For each chapter the page list is read from the <option> lines following
 * the "page" selector, and the image folder is taken from the sample image on
 * the "manga-img" line. Every page is assumed to be "<page text>.png" in that
 * folder.
 *
 * @param array $chapters Chapter identifiers, appended to the global $base as "&c=<id>".
 * @param array $infixs   Chapter numbers for the link text, keyed like $chapters.
 */
function omfggscans_chapters($chapters, $infixs)
{
    global $base;
    global $sitename;
    global $prefix;

    foreach ($chapters as $index => $chapterId) {
        $chapterUrl = $base . "&c={$chapterId}";
        $infix = Crawler::pad($infixs[$index], 3);
        echo "{$chapterUrl}<br/>\n";

        $crawler = new Crawler($chapterUrl);

        // Page list: option value => option text.
        $crawler->go_to("name='page'");
        $pageLabels = array();
        while (true) {
            $row = $crawler->readline();
            if (!$row) {
                break;
            }
            if (Crawler::is_there($row, '<option')) {
                $value = Crawler::extract($row, "value='", "'");
                $pageLabels[$value] = Crawler::extract($row, "'>", "</");
                continue;
            }
            if (Crawler::is_there($row, '</select>')) {
                break;
            }
        }

        // Sample image URL, used only for its directory.
        $crawler->go_to("class='manga-img'");
        $sample = Crawler::extract($crawler->curline, 'src="', '"');
        $crawler->close();

        $folder = dirname($sample) . '/';
        $suffix = '.png';

        foreach ($pageLabels as $label) {
            $href = $folder . $label . $suffix;
            $text = "{$prefix}-{$infix}-{$label}{$suffix}";
            echo "<a href='{$href}'>{$text}</a><br />\n";
        }
    }
}
/**
 * Walks a paginated image list and prints a link for every "Image Only"
 * entry, following the "Next" link in the left navigation until none is
 * found.
 *
 * All links share the same text: the URL-decoded name of the directory part
 * of the starting URL.
 *
 * @param string $url First list page.
 */
function rule34($url)
{
    $text = rawurldecode(basename(dirname($url)));
    $site = 'http://rule34.paheal.net';

    $continue = true;
    while ($continue) {
        echo "{$url}<br/>";
        $c = new Crawler($url);

        // The navigation line tells whether another page follows.
        $c->go_to("id='Navigationleft'");
        if (preg_match('/<a href="([^\'"]+)">Next/', $c->curline, $m)) {
            $url = $site . $m[1];
        } else {
            $continue = false;
        }

        $c->go_to("id='image-list'");
        while ($line = $c->readline()) {
            if (Crawler::is_there($line, '>Image Only<')) {
                $href = Crawler::extract($line, '<br><a href="', '"');
                echo "<a href='{$href}'>{$text}</a><br/>\n";
            } elseif (Crawler::is_there($line, '<footer>')) {
                break;
            }
        }

        // Release the crawler before the next page is opened; previously one
        // was left open per page.
        $c->close();
    }
}
/**
 * Prints a link for every picture found on a page.
 *
 * A picture is expected on the line following each "pic dashedOn" marker.
 * Scanning stops at the first "commentButton" line that appears after at
 * least one picture marker has been seen.
 *
 * @param string $url  Page to crawl.
 * @param string $text Link text used for every picture.
 */
public function crawl_page($url, $text)
{
    echo "Entering '{$url}'<br/>\n";
    flush();

    $c = new Crawler($url, true);
    $dah_gambar = false; // set once a picture marker has been seen
    while ($line = $c->readline()) {
        if (preg_match('/pic dashedOn/i', $line)) {
            $dah_gambar = true;
            $line = $c->readline();
            // Emit a link only when the following line really holds an
            // image; previously a miss produced a link with an empty href.
            if (preg_match('/<img src="([^"]+)"/i', $line, $matches)) {
                $img = $matches[1];
                echo "<a href='{$img}'>{$text}</a><br/>\n";
                flush();
            }
        } elseif ($dah_gambar && preg_match('/commentButton/i', $line)) {
            break;
        }
    }
    $c->close();
    unset($c);
}
exit; } // http://www.comicgirls.net/thumb.php?photo=categorie01/Madrox_Rog0.JPG&max_size=110 // http://www.comicgirls.net/thumb.php?photo=categorie01/Madrox_Rog0.JPG&max_size=6000&thumb=NO // Cookie $base = 'http://www.comicgirls.net'; $imgs = array(); foreach ($targets as $k => $url) { $imgs[$k] = array(); do { echo "{$url}<br />\n"; $c = new Crawler($url); // Apakah ada next? $next = false; $c->go_to('>Navigation'); while (!Crawler::is_there($line = $c->readline(), '<i>(')) { if (Crawler::is_there($line, '>Next<')) { $next = true; $url = $base . Crawler::extract($line, "href='", "'"); break; } } // Grab the gallery $c->go_to("'catThumb'"); while (!Crawler::is_there($line = $c->readline(), '</table>')) { if (Crawler::is_there($line, 'src=')) { $raw = $base . html_entity_decode(Crawler::extract($line, "src='", "'")); $new = preg_replace('/&max_size=.*$/', '&max_size=6000&thumb=NO', $raw); $imgs[$k][] = $new; } }
<?php require 'crawler.php'; $url = 'http://mangastream.com/read/billy_bat/18964888/1'; // The page url $dir = 'd:/temp/'; // Where to store $filename = 'test'; // Rename to this // 1 Buka satu halaman manga reader $c = new Crawler($url); $imgs = array(); // 2 Pergi ke baris yang berisi definisi CSS salah satu potongan $c->go_to('/#.+position.+width.+height.+top.+left/', '', true); // 3 Iterasi hingga ketemu baris penutup (berisi '-->') while ($line = $c->readline()) { if (preg_match('/#(\\w+) .+width:(\\d+).*height:(\\d+).*top:(\\d+).*left:(\\d+)/', $line, $match)) { // 3a Ambil informasi id, z-index, height, width, left, top tiap potongan list($all, $id, $width, $height, $top, $left) = $match; if (preg_match('/z-index:(\\d+)/', $line, $match)) { $zindex = $match[1]; } else { $zindex = 0; } // 3b Masukkan ke array (var $imgs) $imgs[$id] = array('id' => $id, 'zindex' => $zindex, 'width' => $width, 'height' => $height, 'top' => $top, 'left' => $left); } else { if (Crawler::is_there($line, '-->')) { break; } }
<form action="" method="post"> Starting URL: <input type="text" name="start_url" value="<?php echo isset($start_url) ? $start_url : ''; ?> " /> <input type="submit" value="Submit" /> </form> <?php $masih = true; while ($masih) { echo "{$start_url}<br/>\n"; flush(); $craw = new Crawler($start_url); //first get the pictures $craw->go2linewhere('class="photogallery-celeb"'); $craw->readline(); $line = $craw->curline; //echo $line; //echo '<br />HOI<br />'; $ledakan = explode('<img src="', $line); for ($i = 1; $i < count($ledakan); $i++) { $imgurl = str_replace('/t/', '/', Crawler::cutuntil($ledakan[$i], '"')); $file = basename($imgurl); echo "<a href='{$imgurl}'>{$file}</a><br />\n"; } //then check the next link $craw->go_to('class="arrow">»</a>'); $url = $craw->getbetweenlast('<a href="', '"'); if ($url == '#') { $masih = false; } else {
/**
 * Prints a download link for every page of the chapter that $start_url
 * belongs to.
 *
 * The page list is read from the line after the "headerSelect" marker. Each
 * page is then fetched to find its image on the "readerPage" line (or the
 * line after it). Link text is "<prefix>-<chapter>-<image file name>", where
 * the chapter is the last path segment of $start_url with its integer part
 * formatted by Crawler::n(…, 3).
 *
 * A page whose image cannot be found is retried a limited number of times
 * and then reported and skipped; previously it was retried without limit.
 *
 * @param string $start_url Any page of the chapter.
 */
function crawl_1_page($start_url)
{
    global $prefix;
    global $bas;

    $max_attempts = 5;

    // Read the line that holds the page selector's options.
    $cr = new Crawler($start_url);
    $cr->go2linewhere('headerSelect');
    $cr->readline();
    $line = $cr->curline;
    $cr->close();

    // Chapter label: format only the part before a decimal point, if any.
    $chap = Crawler::cutfromlast1($start_url, '/');
    if (strpos($chap, '.') === false) {
        $chap = Crawler::n($chap, 3);
    } else {
        $a = explode('.', $chap);
        $a[0] = Crawler::n($a[0], 3);
        $chap = implode('.', $a);
    }

    // Page URLs keyed by their last path segment, so repeated options collapse.
    $ledak = explode('value="', $line);
    $pages = array();
    for ($i = 1; $i < count($ledak); $i++) {
        $uurl = Crawler::cutuntil($ledak[$i], '"');
        $key = Crawler::cutfromlast1($uurl, '/');
        $pages[$key] = strpos($uurl, 'http://') === 0 ? $uurl : $bas . $uurl;
    }

    foreach ($pages as $new_url) {
        $img_url = '';
        for ($attempt = 1; $attempt <= $max_attempts; $attempt++) {
            $cr = new Crawler($new_url);
            $cr->go2linewhere('id="readerPage"');
            // The image tag is either on the marker line or on the next one.
            if ($cr->strpos('<img src="') === false) {
                $cr->readline();
            }
            $line = $cr->curline;
            $cr->close();

            $img_url = Crawler::extract($line, '<img src="', '"');
            if (!empty($img_url)) {
                break;
            }
        }

        if (empty($img_url)) {
            echo 'Failed to find image at ' . htmlspecialchars($new_url, ENT_QUOTES, 'UTF-8') . "<br />\n";
            flush();
            continue;
        }

        $filename = $prefix . '-' . $chap . '-' . urldecode(basename($img_url));
        echo '<a href="' . $img_url . '">' . $filename . "</a><br />\n";
        flush();
    }
}