/**
 * Crawl one comic page and print a download link for its image.
 *
 * The link text is "<year>_<month>_<day>_<title><ext>", where the title comes
 * from the page's <title> line and the date from the line read after the
 * 'date_left.gif' marker. When the date or the image URL cannot be parsed a
 * warning is raised and no link is printed (previously a broken link was
 * emitted together with undefined-offset notices).
 *
 * @param string $url Address of the comic page.
 * @return void
 */
public function crawl_page($url)
{
    $c = new Crawler($url);

    // Title: non-word characters are replaced so it can be used in a file name.
    $c->go_to('<title>');
    $title = Crawler::extract($c->curline, 'PHD Comics: ', '</title>');
    $title = preg_replace('/\\W/', '_', $title);

    // Line expected to carry the date in month/day/year form.
    $c->go_to('date_left.gif');
    $c->readline(2);
    $dateLine = $c->curline;

    // Line expected to carry the comic's <img> tag.
    $c->go2linewhere('<td bgcolor=#FFFFFF');
    $imgLine = $c->curline;

    // All reads are done; release the crawler before any early return.
    $c->close();
    unset($c);

    if (!preg_match('/([0-9]+)\\/([0-9]+)\\/([0-9]+)/mi', $dateLine, $dateParts)) {
        trigger_error("crawl_page: no date found on {$url}", E_USER_WARNING);
        return;
    }
    if (!preg_match('/<img src=["\']?([^ ]+)["\']?/i', $imgLine, $imgParts)) {
        trigger_error("crawl_page: no image found on {$url}", E_USER_WARNING);
        return;
    }

    list(, $month, $date, $year) = $dateParts;
    if (strlen($date) < 2) {
        $date = '0' . $date;
    }
    if (strlen($month) < 2) {
        $month = '0' . $month;
    }
    $fileprefix = "{$year}_{$month}_{$date}_{$title}";

    $img = $imgParts[1];
    $filename = basename($img);
    // strrpos() yields false when there is no dot; without this check the
    // whole file name would be appended as the "extension".
    $dot = strrpos($filename, '.');
    $ext = $dot === false ? '' : substr($filename, $dot);

    echo "<a href='{$img}'>" . $fileprefix . $ext . "</a><br/>";
    flush();
}
/**
 * Print one link per href found after the page's <div class="entry"> marker.
 *
 * Reading stops at the first line containing '</div>' (or when readline()
 * returns a falsy value). Every link uses the same text, derived from $url.
 *
 * @param string $url Page to scan.
 * @return void
 */
function crawl_1_page($url)
{
    echo "URL2 {$url} <br/>\n";
    flush();

    // Link text: the URL without its final character, passed through
    // Crawler::cutfromlast1() with '/' and then entity-decoded.
    $label = html_entity_decode(Crawler::cutfromlast1(substr($url, 0, -1), '/'));

    $page = new Crawler($url);
    $page->go_to('<div class="entry">');

    while ($line = $page->readline()) {
        // Single-quoted hrefs are checked first, then double-quoted ones.
        $quote = null;
        if (Crawler::is_there($line, "href='")) {
            $quote = "'";
        } elseif (Crawler::is_there($line, 'href="')) {
            $quote = '"';
        }

        if ($quote !== null) {
            $img = Crawler::extract($line, 'href=' . $quote, $quote);
            echo "<a href='{$img}'>{$label}</a><br/>\n";
            flush();
        } elseif (Crawler::is_there($line, '</div>')) {
            break;
        }
    }

    $page->close();
}
/**
 * Print a link for every page of one chapter.
 *
 * Page identifiers are the option values listed after the 'name="pagejump"'
 * marker; the image directory is taken from the src found one line after the
 * 'id="nextpage"' marker. Every page is assumed to be a '.jpg' in that
 * directory.
 *
 * @param string     $url     Chapter page to read.
 * @param int|string $chapter Chapter number, padded via Crawler::pad($chapter, 3).
 * @return void
 */
function crawl_1_chapter($url, $chapter)
{
    global $sitename, $prefix;

    $reader = new Crawler($url);

    // Collect the page identifiers until the select box closes.
    $reader->go_to('name="pagejump"');
    $pages = array();
    while ($line = $reader->readline()) {
        if (Crawler::is_there($line, '<option')) {
            $pages[] = Crawler::extract($line, 'value="', '"');
            continue;
        }
        if (Crawler::is_there($line, '</select>')) {
            break;
        }
    }

    // A sample image URL gives the directory shared by all pages.
    $reader->go_to('id="nextpage"');
    $reader->readline();
    $sample = $reader->getbetween('src="', '"');
    $reader->close();

    $imgDir = dirname($sample);
    $suffix = '.jpg';
    $chapterLabel = Crawler::pad($chapter, 3);

    foreach ($pages as $page) {
        echo "<a href='{$imgDir}/{$page}{$suffix}'>{$prefix}-{$chapterLabel}-{$page}{$suffix}</a><br/>\n";
        flush();
    }
}
/**
 * Print every link found on the first '<p><a href="' line of $this->url.
 *
 * The redirect prefix 'http://hentaifromhell.net/redirect.html?' is stripped
 * from each target; the target is used both as href and as link text.
 *
 * @return void
 */
public function go()
{
    $listing = new Crawler($this->url);
    $listing->go2linewhere('<p><a href="');
    $listing->close();

    // curline is read after close(), exactly as before.
    $chunks = explode('<a href="', $listing->curline);
    // The first chunk is whatever precedes the first link; discard it.
    array_shift($chunks);

    foreach ($chunks as $chunk) {
        $aurl = Crawler::cutuntil($chunk, '"');
        $aurl = str_replace('http://hentaifromhell.net/redirect.html?', '', $aurl);
        echo "<a href='{$aurl}'>{$aurl}</a><br />\n";
        flush();
    }
}
/**
 * Build an HTML link to the file behind an indowebster page.
 *
 * The redirect target is read from a "location.href='...'" assignment on the
 * line selected by Crawler::go2lineregexor(); its 'path' and 'file_orig'
 * parts form the href and the link text respectively.
 *
 * @param string $url Page to read.
 * @return string Anchor tag: href is dirname(target) . '/' . path, text is
 *                the URL-decoded original file name.
 */
function crawl_indowebster($url)
{
    $page = new Crawler($url);
    $page->go2lineregexor('/(<\\/div><\\/a><\\/div><\\/div>)/', 1, 'href="#idws7"');
    $target = $page->getbetween('location.href=\'', '\'');

    $path = Crawler::extract($target, 'path=', '&');
    $originalName = Crawler::cutafter($target, 'file_orig=');
    $page->close();

    $href = dirname($target) . '/' . $path;
    $label = rawurldecode($originalName);

    return '<a href="' . $href . '">' . $label . '</a>';
}
/**
 * Resolve the image of one reader page.
 *
 * @param string     $fil     Location handed to Crawler (the page to read).
 * @param string     $url     Unused here; kept for interface compatibility.
 * @param string     $prefix  Prefix for the generated file name.
 * @param int|string $chapter Chapter number, padded via Crawler::pad($chapter, 3).
 * @return array Single-entry map: "<prefix>-<chapter>-<image name>" => image URL.
 */
public function mangareader_1_page($fil, $url, $prefix, $chapter)
{
    $chapter = Crawler::pad($chapter, 3);

    $c = new Crawler($fil);
    $c->go_to('width="800"');
    $img = $c->getbetween('src="', '"');
    $c->close();

    // Prefer the trailing "<digits>.<ext>" part of the file name. If the name
    // does not end that way, fall back to the whole base name instead of
    // reading an undefined match offset.
    $base = basename($img);
    $iname = preg_match('/(\\d+\\.\\w+)$/', $base, $m) ? $m[1] : $base;

    $name = $prefix . '-' . $chapter . '-' . $iname;
    return array($name => $img);
}
/**
 * Print a list item linking to the image of one reader page.
 *
 * The link text is "<prefix>-<chapter>-<image name>", with the prefix taken
 * from $this->prefix.
 *
 * @param string     $fil     Location handed to Crawler (the page to read).
 * @param string     $url     Unused here; kept for interface compatibility.
 * @param int|string $chapter Chapter number, padded via Crawler::pad($chapter, 3).
 * @return void
 */
public function mangareader_1_page($fil, $url, $chapter)
{
    $prefix = $this->prefix;
    $chapter = Crawler::pad($chapter, 3);

    $c = new Crawler($fil);
    $c->go_to('width="800"');
    $img = $c->getbetween('src="', '"');
    $c->close();

    // Prefer the trailing "<digits>.<ext>" part of the file name. If the name
    // does not end that way, fall back to the whole base name instead of
    // reading an undefined match offset.
    $base = basename($img);
    $iname = preg_match('/(\\d+\\.\\w+)$/', $base, $m) ? $m[1] : $base;

    echo '<li><a href="' . $img . '">' . $prefix . '-' . $chapter . '-' . $iname . '</a>' . "</li>\n";
}
/**
 * Collect one value per line from the section between the second and third
 * dashed separator lines of a page.
 *
 * For each line in that section the value is whatever
 * Crawler::getbetween(' ', ' ') returns for the current line.
 *
 * @param string $url Page to read.
 * @return array Extracted values in page order.
 */
function crawl_one_page($url)
{
    $separator = '------------------------------------------';
    $nims = array();

    $kraw = new Crawler($url);
    // Skip ahead to the second separator; the data starts on the next line.
    $kraw->go2linewhere($separator);
    $kraw->go2linewhere($separator);
    $more = $kraw->readline();

    // Stop at the closing separator, and also when readline() reports false.
    // NOTE(review): assumes readline() returns false at end of input, as the
    // fgets()-style loops elsewhere suggest; without this guard a page lacking
    // the closing separator would loop forever.
    while ($more !== false && $kraw->strpos($separator) === false) {
        $nims[] = $kraw->getbetween(' ', ' ');
        $more = $kraw->readline();
    }

    $kraw->close();
    return $nims;
}
/**
 * Walk a paginated thumbnail gallery starting at $this->url and print a link
 * to the full-size version of every thumbnail.
 *
 * Thumbnail URLs are rewritten from '/thumbs/' to '/images/' and a
 * '/small/<digits>-' segment, when present, is collapsed to '/'. Crawling
 * continues while a line with class="pagNext" supplies a next-page URL.
 *
 * @return void
 */
public function go()
{
    $start_url = $this->url;

    $finish = false;
    while (!$finish) {
        // Assume this is the last page unless a "next" link is found below.
        $finish = true;
        echo $start_url, "<br />\n";
        flush();

        $c = new Crawler($start_url);
        $c->go2linewhere('<li class="thumbnail">');
        while ($line = $c->readline()) {
            if (strpos($line, 'src="') !== false) {
                // Image line: turn the thumbnail URL into the full image URL.
                $uri = Crawler::extract($line, 'src="', '"');
                $uri = str_replace('/thumbs/', '/images/', $uri);
                // Only strip the size segment when it is actually present;
                // previously $matches[1] was read even when nothing matched.
                if (preg_match('/(\\/small\\/\\d+-)/', $uri, $matches)) {
                    $uri = str_replace($matches[1], '/', $uri);
                }
                $file = basename(dirname($uri));
                echo "<a href='{$uri}'>{$file}</a><br/>\n";
                flush();
            } elseif (strpos($line, 'class="pagNext"') !== false) {
                // Next page available: continue the outer loop from there.
                $finish = false;
                $start_url = html_entity_decode(Crawler::extract($line, 'class="pagNext" href="', '"'));
                break;
            } elseif (strpos($line, '</table>') !== false) {
                // End of the gallery table: done with this page.
                break;
            }
        }
        $c->close();
    }
}
/**
 * Dispatch the crawling of every page of one chapter.
 *
 * Page suffixes are the value="..." attributes on the line after the
 * 'id="myselectbox3"' marker; each is appended to $url and handed to
 * 'crawl_1_page' through Crawler::multiProcess() together with the padded
 * chapter number.
 *
 * Example chapter URL: http://ani-haven.net/hr-alpha/Psyren/145/
 *
 * @param string     $url     Chapter base URL.
 * @param int|string $chapter Chapter number, padded via Crawler::pad($chapter, 3).
 * @return void
 */
function crawl_1_chapter($url, $chapter)
{
    global $sitename, $prefix;

    $chapter = Crawler::pad($chapter, 3);

    $reader = new Crawler($url);
    $reader->go_to('id="myselectbox3"');
    $reader->readline();
    $pages = Crawler::extract_to_array($reader->curline, 'value="', '"');
    $reader->close();

    // Turn each page suffix into a full URL, keeping the original keys.
    foreach ($pages as $idx => $suffix) {
        $pages[$idx] = $url . $suffix;
    }

    Crawler::multiProcess(4, $pages, 'crawl_1_page', array($chapter));
}
/**
 * Walk a paginated album starting at $this->url and print a link for every
 * image.
 *
 * Thumbnail names ('thumb_100_') are rewritten to the 'normal__' variant, or
 * to the bare name when the request carries a truthy 'big' parameter. The
 * link text is the album segment preceding '/page/1' in the start URL; when
 * the URL has no such segment, the image's base name is used instead
 * (previously the text was an undefined value).
 *
 * @return void
 */
public function go()
{
    $host = 'http://lu.scio.us';
    $url = $this->url;

    // Album name for the link text, if the start URL has the expected shape.
    $text = preg_match('/\\/([^\\/]+)\\/page\\/1/', $url, $m) ? $m[1] : null;

    // empty() covers both "not set" and falsy values without error suppression.
    $variant = empty($_GET['big']) ? 'normal__' : '';

    $finish = false;
    while (!$finish) {
        echo $url . "<br/>\n";
        flush();

        $c = new Crawler($url);
        $c->go_to('id="pid_');
        while ($line = $c->readline()) {
            if (Crawler::is_there($line, 'src="')) {
                $img = Crawler::extract($line, 'src="', '"');
                $img = str_replace('thumb_100_', $variant, $img);
                $label = $text !== null ? $text : basename($img);
                echo "<a href='{$img}'>{$label}</a><br/>\n";
                flush();
            } elseif (Crawler::is_there($line, '</ul>')) {
                break;
            }
        }

        // The line after the pager marker tells whether another page exists.
        $c->go_to('class="pager"');
        $c->readline();
        if (Crawler::is_there($c->curline, 'Pager_next')) {
            $url = $host . Crawler::extract($c->curline, '<a rel="next" href="', '"');
        } else {
            $finish = true;
        }
        $c->close();
    }
}
/**
 * Print a download link for every image of an album page.
 *
 * The image list is taken from the src="..." attributes found between the
 * line after the second '<noscript>' marker and the closing '</noscript>'.
 * Each '/s128/' thumbnail path is rewritten to '/d/'.
 *
 * @param string       $url   Album page.
 * @param string|false $alias Link text to use for every image; when falsy,
 *                            each image's URL-decoded base name is used.
 * @return void
 */
function crawl_album($url, $alias = false)
{
    $c = new Crawler($url);
    $c->go_to('<noscript>');
    $c->go_to('<noscript>');
    $c->readline();

    // Concatenate the trimmed lines of the noscript section.
    $markup = '';
    while ($line = $c->readline()) {
        if (Crawler::is_there($line, '</noscript>')) {
            break;
        }
        $markup .= trim($line);
    }

    $thumbs = Crawler::extract_to_array($markup, 'src="', '"');
    $c->close();

    foreach ($thumbs as $thumb) {
        $img = str_replace('/s128/', '/d/', $thumb);
        $label = $alias ? $alias : urldecode(basename($img));
        echo "<a href='{$img}'>{$label}</a><br/>\n";
        flush();
    }
}
/**
 * Print a link for every page image of the gallery at $this->url.
 *
 * '/read' is appended to $this->url when missing. The page list comes from
 * the JSON object assigned to 'var data' in the page; the image URL template
 * comes from the return statement of the page's imgpath() function, in which
 * "' + x + '" is replaced by the zero-padded page number. Example input:
 * http://www.fakku.net/viewonline.php?id=2589
 *
 * If the JSON cannot be decoded or carries no 'thumbs' list, a warning is
 * raised and nothing is printed (previously a null was dereferenced).
 *
 * @return void
 */
public function go()
{
    if (!preg_match('/\\/read$/', $this->url)) {
        $this->url .= '/read';
    }

    // The page is fetched through curl with a browser user agent.
    $craw = new Crawler($this->url, array('use_curl' => true, 'agent' => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13'));
    $craw->go_to('var data = {');
    $json = Crawler::extract($craw->curline, ' = ', ';');

    // Site layout as of 2012-05-06: the image path is built by imgpath().
    $craw->go_to('function imgpath(');
    $craw->go_to('return \'');
    $imgpath = $craw->getbetween("return '", "';");
    $craw->close();

    $obj = json_decode($json);
    $hasThumbs = is_object($obj)
        && isset($obj->thumbs)
        && (is_array($obj->thumbs) || is_object($obj->thumbs));
    if (!$hasThumbs) {
        trigger_error("go: could not read the page list from {$this->url}", E_USER_WARNING);
        return;
    }

    // Every link carries the same text: the directory part of the URL.
    $text = basename(dirname($this->url));

    foreach ($obj->thumbs as $key => $val) {
        $filename = Crawler::pad($key + 1, 3);
        $img = str_replace("' + x + '", $filename, $imgpath);
        echo "<a href='{$img}'>{$text}</a><br/>\n";
        flush();
    }
}
/**
 * Print a link for every image listed in a chapter page's imageArray.
 *
 * Lines containing 'imageArray[' after the 'imageArray = new Array' marker
 * each supply one single-quoted image path; scanning stops at the line
 * containing 'function loadImage'. Relative paths are prefixed with the
 * global $sitename.
 *
 * @param string     $url     Chapter page.
 * @param int|string $chapter Chapter number, padded via Crawler::pad($chapter, 3).
 * @return void
 */
function foolreader_1_chapter($url, $chapter)
{
    global $sitename;
    global $prefix;

    $chapter = Crawler::pad($chapter, 3);

    $c = new Crawler($url);
    $c->go_to('imageArray = new Array');
    while ($line = $c->readline()) {
        if (Crawler::is_there($line, 'imageArray[')) {
            $img = Crawler::extract($line, "'", "'");
            // Treat both http:// and https:// as absolute; only http:// was
            // recognised before, so https URLs got $sitename prepended.
            if (!preg_match('#^https?://#i', $img)) {
                $img = $sitename . $img;
            }
            $fname = basename($img);
            echo "<a href='{$img}'>{$prefix}-{$chapter}-{$fname}</a><br/>\n";
        } elseif (Crawler::is_there($line, 'function loadImage')) {
            break;
        }
    }
    $c->close();
}
/**
 * Print a link for every file in the image directory behind a gallery page.
 *
 * A sample image link is read from the line after the first
 * 'ngg-gallery-thumbnail' div; its directory (last segment URL-encoded) is
 * then crawled as a listing and every href between '<ul>' and '</ul>' (or
 * a line containing '"thumbs/"') is printed.
 *
 * @param string $url Gallery page.
 * @return void
 */
function crawl1page($url)
{
    echo 'Entering ' . $url . '<br/>';
    flush();

    $c = new Crawler($url);
    $c->go2linewhere('<div class="ngg-gallery-thumbnail"');
    $c->readline();
    $sample = $c->getbetween('href="', '"');
    $c->close();

    $dir = dirname($sample);
    if (!$dir) {
        return;
    }

    // Re-encode the last path segment so the directory URL is requestable.
    $folder = substr($dir, strrpos($dir, '/') + 1);
    $dir = dirname($dir) . '/' . rawurlencode($folder) . '/';
    echo 'Dir:' . $dir . '<br/>' . "\n";
    flush();

    $c = new Crawler($dir);
    $c->go2linewhere('<ul>');
    $c->readline();
    while ($line = $c->readline()) {
        // Both markers are compared against false: strpos() returns 0 for a
        // match at the start of the line, which the old truthiness test on
        // '"thumbs/"' treated as "not found".
        if (strpos($line, '</ul>') !== false || strpos($line, '"thumbs/"') !== false) {
            break;
        }
        $filename = Crawler::extract($line, 'href="', '"');
        echo '<a href="' . $dir . $filename . '">' . rawurldecode($filename) . '</a><br/>' . "\n";
        flush();
    }
    $c->close();

    echo '<br/>' . "\n";
    flush();
}
/**
 * Walk a paginated gallery starting at $this->url and print a link for every
 * 'showimg.php?c=' target found.
 *
 * Each href has its 'showimg.php?c=' part removed; the link text is the name
 * of the target's parent directory. Crawling continues while the line
 * containing 'Next»' also carries a link.
 *
 * @return void
 */
public function go()
{
    $start_url = $this->url;
    if (preg_match('/gallery1\\.hentaifromhell\\.net/', $start_url)) {
        $base = 'http://gallery1.hentaifromhell.net';
    } else {
        $base = 'http://gallery.hentaifromhell.net';
    }

    $selesai = false;
    while (!$selesai) {
        // Assume this is the last page unless a next link is found. Before,
        // a page without any 'Next»' line left the flag unset and the same
        // URL was crawled again forever.
        $selesai = true;

        echo "{$start_url}<br/>\n";
        $craw = new Crawler($start_url);
        $craw->go2linewhere('showimg.php?c=');
        while ($line = $craw->readline()) {
            if (strpos($line, 'showimg.php?c=') !== false) {
                $raw = Crawler::extract_to_array($line, '<a href="', '"');
                foreach ($raw as $r) {
                    $href = str_replace('showimg.php?c=', '', $r);
                    $text = basename(dirname($href));
                    echo '<a href="' . $href . '">' . $text . '</a>' . "<br />\n";
                }
            } elseif (strpos($line, 'Next»') !== false) {
                // The pager line is a link only when another page exists.
                if (strpos($line, '<a href') !== false) {
                    $start_url = $base . Crawler::extract($line, '<a href="', '"');
                    $selesai = false;
                }
                break;
            }
        }
        $craw->close();
    }
}
/**
 * Download every image of a paginated gallery into $destination.
 *
 * Each page is scanned from its '<table style=' marker up to '</form>'; on
 * lines containing 'border=0' the thumbnail src is rewritten ('/thumb/' to
 * '/full/', '/x<digit>.' to '/') and saved as a sequentially numbered file
 * that keeps the original extension. A ':: next ::' line seen before the
 * table supplies the next page, resolved against the initial $base.
 *
 * @param string $base        URL of the first gallery page.
 * @param string $destination Directory prefix for the saved files.
 * @return void
 */
private function download_all($base, $destination)
{
    $root = $base;
    $counter = 1;

    do {
        $c = new Crawler($base);
        echo $base . "\n";

        // Stops at whichever marker comes first.
        $c->go_to(array('<table style=', ':: next ::'));
        $hasNext = Crawler::is_there($c->curline, ':: next ::');
        if ($hasNext) {
            $urld = Crawler::extract($c->curline, 'href="', '"');
            $base = $root . html_entity_decode($urld);
            $c->go_to('<table style=');
        }

        while ($line = $c->readline()) {
            if (Crawler::is_there($line, 'border=0')) {
                $img = Crawler::extract($line, 'src="', '"');
                $img = str_replace('/thumb/', '/full/', $img);
                $img = preg_replace('/\\/x\\d\\./', '/', $img);
                $ext = Crawler::cutfromlast(basename($img), '.');
                $text = Crawler::n($counter++, 4);
                $this->save_to($img, "{$destination}/{$text}{$ext}");
                continue;
            }
            if (Crawler::is_there($line, '</form>')) {
                break;
            }
        }

        $c->close();
    } while ($hasNext);
}
// Scan the listing page line by line; the first line is read and discarded.
// Variable names are kept as they were because code outside this fragment
// may read them ($url, $berhenti, ...).
$c = new Crawler($url);
$c->readline();
while ($line = $c->readline()) {
    if ($c->strpos('nodeTitle') !== false) {
        // Node entry: open the node page and list all of its image links
        // under the node's title.
        $href = $c->getbetween('<a href="', '"');
        $c2 = new Crawler($base . $href);
        $c2->go2linewhere('pageTitle');
        $title = $c2->getbetween('>', '<');
        $c2->go2linewhere('node_images');
        $ledak = explode('<a href="', $c2->curline);
        // Element 0 is the text before the first link, so start at 1.
        for ($i = 1; $i < count($ledak); $i++) {
            $ahref = substr($ledak[$i], 0, strpos($ledak[$i], '"'));
            echo "<a href='{$ahref}'>{$title}</a><br />\n";
        }
        $c2->close();
    } elseif ($c->strpos('Go to next page') !== false) {
        // A next page exists: remember its URL and stop reading this page.
        echo "\nADA NEXT\n";
        $url = $base . $c->getbetweenlast('</span><a href="', '"');
        break;
    } elseif ($c->strpos('Go to previous page') !== false) {
        // Previous link without a next link: flag the stop and leave.
        echo "\nADA PREVIOUS TANPA NEXT\n";
        $berhenti = true;
        break;
    }
}
$c->close();
// ada yg berupa original/reprint, ... if (preg_match('/class="red">(.*)<\\/h4>.*class="blue" href="([^"]*)">original<.*href="([^"]*)">reprint</', $line, $matches)) { $result[$name][strip_tags($matches[1]) . '-original'] = html_entity_decode($matches[2]); $result[$name][strip_tags($matches[1]) . '-reprint'] = html_entity_decode($matches[3]); } else { if (preg_match('/href="([^"]*)">(.*)<\\/a>/', $line, $matches)) { $result[$name][strip_tags($matches[2])] = html_entity_decode($matches[1]); } } } else { if (Crawler::is_there($line, '</tbody>')) { break; } } } $c->close(); } ob_start(); echo "<?php\n"; echo '$a = '; var_export($result); echo ';'; file_put_contents('disneycomics.phase1', ob_get_clean()); break; case 'phase2': require 'disneycomics.phase1'; //print_r(array_keys($a));exit; $hasil = array(); foreach ($a as $name => $comics) { foreach ($comics as $title => $url) { $url = preg_replace('/show.*\\.php.*loc=/', '', $url);
/**
 * Print links for every page of each requested chapter.
 *
 * For each chapter the page labels are read from the options following the
 * "name='page'" marker, and a sample image (line with class='manga-img')
 * supplies the image directory. Every page is assumed to be
 * "<directory>/<label>.png".
 *
 * @param array $chapters Chapter identifiers, appended to the global $base as "&c=<id>".
 * @param array $infixs   Chapter numbers for the file names, indexed like $chapters.
 * @return void
 */
function omfggscans_chapters($chapters, $infixs)
{
    global $base, $sitename, $prefix;

    $suffix = '.png';

    foreach ($chapters as $idx => $chapterId) {
        $url = $base . "&c={$chapterId}";
        $ifx = Crawler::pad($infixs[$idx], 3);
        echo "{$url}<br/>\n";

        $c = new Crawler($url);

        // Page list: option value => option text.
        $c->go_to("name='page'");
        $pages = array();
        while ($line = $c->readline()) {
            if (Crawler::is_there($line, '<option')) {
                $value = Crawler::extract($line, "value='", "'");
                $pages[$value] = Crawler::extract($line, "'>", "</");
                continue;
            }
            if (Crawler::is_there($line, '</select>')) {
                break;
            }
        }

        // Sample image URL; only its directory is used.
        $c->go_to("class='manga-img'");
        $sample = Crawler::extract($c->curline, 'src="', '"');
        $imgDir = dirname($sample) . '/';
        $c->close();

        foreach ($pages as $label) {
            $href = $imgDir . $label . $suffix;
            $text = "{$prefix}-{$ifx}-{$label}{$suffix}";
            echo "<a href='{$href}'>{$text}</a><br />\n";
        }
    }
}
//echo $line; //echo '<br />HOI<br />'; $ledakan = explode('<img src="', $line); for ($i = 1; $i < count($ledakan); $i++) { $imgurl = str_replace('/t/', '/', Crawler::cutuntil($ledakan[$i], '"')); $file = basename($imgurl); echo "<a href='{$imgurl}'>{$file}</a><br />\n"; } //then check the next link $craw->go_to('class="arrow">»</a>'); $url = $craw->getbetweenlast('<a href="', '"'); if ($url == '#') { $masih = false; } else { $start_url = dirname($start_url . 'a') . '/' . $url; } /* $craw->go2lineor(array('<span class="global_pref_next_no_link">»', 'class="global_pref_next">»')); if (strpos($craw->curline, '<span class="global_pref_next_no_link">»') !== false) { $masih = false; } else { $start_url = dirname($start_url.'a').'/'.$craw->getbetweenlast('<a href="', '"'); } */ $craw->close(); unset($craw); flush(); } ?> </body> </html>
/**
 * Print a link for every page image of the chapter starting at $start_url.
 *
 * The page list comes from the value="..." attributes on the line after the
 * 'headerSelect' marker; relative page URLs are prefixed with the global
 * $bas. For each page the image is read from the 'id="readerPage"' line (or
 * the following one). The link text is "<prefix>-<chapter>-<image file name>",
 * where the chapter is the last URL segment with its integer part padded to
 * three digits.
 *
 * A page whose image cannot be found is retried a limited number of times and
 * then skipped with a warning; previously it was retried without limit.
 *
 * @param string $start_url URL of the chapter's first page.
 * @return void
 */
function crawl_1_page($start_url)
{
    global $prefix;
    global $bas;

    // Upper bound on fetch attempts per page.
    $max_attempts = 5;

    $cr = new Crawler($start_url);
    $cr->go2linewhere('headerSelect');
    $cr->readline();
    $line = $cr->curline;
    $cr->close();

    // Chapter label: pad only the part before a decimal point, if any.
    $chap = Crawler::cutfromlast1($start_url, '/');
    if (strpos($chap, '.') === false) {
        $chap = Crawler::n($chap, 3);
    } else {
        $a = explode('.', $chap);
        $a[0] = Crawler::n($a[0], 3);
        $chap = implode('.', $a);
    }

    // Page list, keyed by the last segment of each page URL.
    $ledak = explode('value="', $line);
    $pages = array();
    for ($i = 1; $i < count($ledak); $i++) {
        $uurl = Crawler::cutuntil($ledak[$i], '"');
        $key = Crawler::cutfromlast1($uurl, '/');
        // https:// counts as absolute too; only http:// was recognised before.
        $pages[$key] = preg_match('#^https?://#i', $uurl) ? $uurl : $bas . $uurl;
    }

    foreach ($pages as $new_url) {
        $img_url = '';
        for ($attempt = 1; $attempt <= $max_attempts && empty($img_url); $attempt++) {
            $cr = new Crawler($new_url);
            $cr->go2linewhere('id="readerPage"');
            // The <img> tag may sit on the line after the marker.
            if ($cr->strpos('<img src="') === false) {
                $cr->readline();
            }
            $line = $cr->curline;
            $cr->close();
            $img_url = Crawler::extract($line, '<img src="', '"');
        }

        if (empty($img_url)) {
            trigger_error("crawl_1_page: no image found on {$new_url}", E_USER_WARNING);
            continue;
        }

        $filename = $prefix . '-' . $chap . '-' . urldecode(basename($img_url));
        // Emitted without the stray space that used to end up inside href.
        echo '<a href="' . $img_url . '">' . $filename . "</a><br />\n";
        flush();
    }
}
<?php

require 'crawler.php';

// Script: list the first image of every page linked from the viraindo.com
// front page (http://www.viraindo.com/). Link scanning starts at the
// 'WIDTH=273' marker and ends at the '<p></TD></TR>' line.
$site = 'http://www.viraindo.com/';

$c = new Crawler($site);
$c->go_to('WIDTH=273');

while ($line = $c->readline()) {
    if (!Crawler::is_there($line, 'href="')) {
        // Not a link line: either the end marker or something to ignore.
        if (Crawler::is_there($line, '<p></TD></TR>')) {
            break;
        }
        continue;
    }

    // Link line: target page and its caption.
    $page = Crawler::extract($line, 'href="', '"');
    $ket = Crawler::extract($line, '">', '</a');

    // Open the linked page and take the first image on it.
    $d = new Crawler($site . $page);
    $d->go_to('<img src="');
    $img = $d->getbetween('<img src="', '"');
    echo "<a href='{$site}{$img}'>{$ket}</a><br/>\n";
    flush();
    $d->close();
}

$c->close();
/**
 * Print a link for every picture found on a page.
 *
 * A picture is the <img src="..."> on the line following a line that matches
 * 'pic dashedOn'. Scanning stops at the first 'commentButton' line seen after
 * at least one picture marker. Marker lines that are not followed by an image
 * tag are skipped (previously a link with an undefined href was printed).
 *
 * @param string $url  Page to scan.
 * @param string $text Link text used for every picture.
 * @return void
 */
public function crawl_page($url, $text)
{
    echo "Entering '{$url}'<br/>\n";
    flush();

    $c = new Crawler($url, true);
    $dah_gambar = false; // becomes true once a picture marker has been seen

    while ($line = $c->readline()) {
        if (preg_match('/pic dashedOn/i', $line)) {
            $dah_gambar = true;
            // The image tag is expected on the next line.
            $line = $c->readline();
            if (preg_match('/<img src="([^"]+)"/i', $line, $matches)) {
                $img = $matches[1];
                echo "<a href='{$img}'>{$text}</a><br/>\n";
                flush();
            }
        } elseif ($dah_gambar && preg_match('/commentButton/i', $line)) {
            break;
        }
    }

    $c->close();
    unset($c);
}
/**
 * Print a link to the full-size image behind every thumbnail of a page.
 *
 * Thumbnails are read from lines containing 'src=' after the
 * 'class="content"' marker, up to the line containing 'id="paginator"'.
 * Thumbnail URLs are rewritten to the image host and path; the link text is
 * the resulting file name.
 *
 * @param string $url Page to scan.
 * @return void
 */
function thedoujin_realm($url)
{
    // Applied in order, left to right, to each thumbnail URL.
    $search = array('/www.', '/thumbnails/', '/thumbnail_');
    $replace = array('/img1.', '/images/', '/');

    $page = new Crawler($url);
    $page->go_to('class="content"');

    while ($line = $page->readline()) {
        if (Crawler::is_there($line, 'src=')) {
            $thumb = Crawler::extract($line, 'src="', '"');
            $img = str_replace($search, $replace, $thumb);
            $name = basename($img);
            echo "<a href='{$img}'>{$name}</a><br />\n";
            continue;
        }
        if (Crawler::is_there($line, 'id="paginator"')) {
            break;
        }
    }

    $page->close();
}