/**
 * Collects one space-delimited token per line from a dash-delimited listing.
 *
 * The crawler is moved with two go2linewhere() calls on the dashed separator
 * followed by one readline(); from there every line is sampled with
 * getbetween(' ', ' ') until a line containing the separator is reached.
 *
 * @param string $url Address handed to the Crawler constructor.
 * @return array One getbetween() result per listing line, in page order.
 */
function crawl_one_page($url)
{
    $separator = '------------------------------------------';
    $crawler = new Crawler($url);

    // Two separator lookups precede the listing.
    for ($lookup = 0; $lookup < 2; $lookup++) {
        $crawler->go2linewhere($separator);
    }
    $crawler->readline();

    $tokens = array();
    for (; $crawler->strpos($separator) === false; $crawler->readline()) {
        $tokens[] = $crawler->getbetween(' ', ' ');
    }

    $crawler->close();

    return $tokens;
}
/**
 * Prints one HTML anchor per link found on the first line of $this->url
 * that contains '<p><a href="'.
 *
 * Each href has the site's redirect prefix removed and is echoed both as the
 * anchor target and as its text. Output is flushed after every link.
 *
 * A previously disabled variant of this method followed each link, looked up
 * the 'id="thepic"' image and honoured $this->blacklist; it is not part of
 * the active behaviour.
 */
public function go()
{
    $redirectPrefix = 'http://hentaifromhell.net/redirect.html?';

    $crawler = new Crawler($this->url);
    $crawler->go2linewhere('<p><a href="');
    $crawler->close();

    // Element 0 is whatever precedes the first anchor, so it is dropped.
    $anchorChunks = array_slice(explode('<a href="', $crawler->curline), 1);

    foreach ($anchorChunks as $chunk) {
        $target = str_replace($redirectPrefix, '', Crawler::cutuntil($chunk, '"'));
        echo "<a href='{$target}'>{$target}</a><br />\n";
        flush();
    }
}
/**
 * Prints a link to the comic image on one PHD Comics page.
 *
 * The link text is "<year>_<month>_<day>_<title><ext>", where the title comes
 * from the <title> tag (non-word characters replaced by "_") and the date from
 * the line reached after the 'date_left.gif' marker.
 *
 * If the date or the image tag cannot be found, the page is skipped and
 * nothing is printed, rather than emitting a link built from missing values.
 *
 * @param string $url Page address handed to the Crawler constructor.
 * @return void
 */
public function crawl_page($url)
{
    $c = new Crawler($url);

    // Title: text between "PHD Comics: " and </title>, made filename-safe.
    $c->go_to('<title>');
    $title = Crawler::extract($c->curline, 'PHD Comics: ', '</title>');
    $title = preg_replace('/\\W/', '_', $title);

    // Date: first a/b/c numeric triple on the line reached by readline(2)
    // after the marker; read as month/day/year.
    $c->go_to('date_left.gif');
    $c->readline(2);
    if (!preg_match('/([0-9]+)\\/([0-9]+)\\/([0-9]+)/', $c->curline, $matches)) {
        $c->close();
        return;
    }
    list(, $month, $date, $year) = $matches;
    $date = str_pad($date, 2, '0', STR_PAD_LEFT);
    $month = str_pad($month, 2, '0', STR_PAD_LEFT);
    $fileprefix = "{$year}_{$month}_{$date}_{$title}";

    // Image URL: stop at whitespace, a quote or ">" so that a quoted or
    // tag-terminating src value does not drag its delimiter into the URL.
    $c->go2linewhere('<td bgcolor=#FFFFFF');
    $found = preg_match('/<img src=["\']?([^ "\'>]+)/i', $c->curline, $matches);
    $c->close();
    if (!$found) {
        return;
    }

    $img = $matches[1];
    $filename = basename($img);
    // Keep the leading dot; a name without a dot has no extension.
    $dot = strrpos($filename, '.');
    $ext = $dot === false ? '' : substr($filename, $dot);

    echo "<a href='{$img}'>" . $fileprefix . $ext . "</a><br/>";
    flush();
}
/**
 * Walks a paginated thumbnail gallery starting at $this->url and prints a
 * link to the full-size image behind every thumbnail.
 *
 * On each page, scanning starts after the first '<li class="thumbnail">'
 * line. A thumbnail src is rewritten from ".../thumbs/..." to ".../images/..."
 * and, when present, its "/small/<digits>-" segment is collapsed to "/".
 * A 'class="pagNext"' link schedules the next page; a '</table>' line or the
 * end of input ends the page.
 *
 * @return void
 */
public function go()
{
    $start_url = $this->url;

    $finish = false;
    while (!$finish) {
        // Treat this as the last page unless a "next" link is found below.
        $finish = true;
        echo $start_url, "<br />\n";
        flush();

        $c = new Crawler($start_url);
        $c->go2linewhere('<li class="thumbnail">');
        while ($line = $c->readline()) {
            if (strpos($line, 'src="') !== false) {
                // Thumbnail line: derive the full-size image URL.
                $uri = Crawler::extract($line, 'src="', '"');
                $uri = str_replace('/thumbs/', '/images/', $uri);
                // Only rewrite when the segment is actually there; reading
                // $matches[1] after a failed match is an undefined offset.
                if (preg_match('/(\\/small\\/\\d+-)/', $uri, $matches)) {
                    $uri = str_replace($matches[1], '/', $uri);
                }
                $file = basename(dirname($uri));
                echo "<a href='{$uri}'>{$file}</a><br/>\n";
                flush();
            } elseif (strpos($line, 'class="pagNext"') !== false) {
                // Next page: its href is entity-encoded in the markup.
                $finish = false;
                $start_url = html_entity_decode(Crawler::extract($line, 'class="pagNext" href="', '"'));
                break;
            } elseif (strpos($line, '</table>') !== false) {
                // End of the thumbnail table: done with this page.
                break;
            }
        }
        $c->close();
    }
}
/**
 * Prints links to every file listed in the directory that holds a gallery's
 * images.
 *
 * The directory is derived from the first href found on the line after the
 * first '<div class="ngg-gallery-thumbnail"' line of $url; its last path
 * segment is raw-URL-encoded. That directory URL is then fetched and every
 * href between '<ul>' and '</ul>' is printed, stopping early at an entry
 * that points to "thumbs/".
 *
 * @param string $url Gallery page address handed to the Crawler constructor.
 * @return void
 */
function crawl1page($url)
{
    echo 'Entering ' . $url . '<br/>';
    flush();

    $c = new Crawler($url);
    $c->go2linewhere('<div class="ngg-gallery-thumbnail"');
    $c->readline();
    $sample = $c->getbetween('href="', '"');
    $c->close();

    $dir = dirname($sample);
    if (!$dir) {
        return;
    }
    $folder = substr($dir, strrpos($dir, '/') + 1);
    $dir = dirname($dir) . '/' . rawurlencode($folder) . '/';
    echo 'Dir:' . $dir . '<br/>' . "\n";
    flush();

    $c = new Crawler($dir);
    $c->go2linewhere('<ul>');
    $c->readline();
    while ($line = $c->readline()) {
        // Compare against false explicitly: strpos() returns 0 for a match
        // at the start of the line, which a bare truthiness test would miss.
        if (strpos($line, '</ul>') !== false || strpos($line, '"thumbs/"') !== false) {
            break;
        }
        $filename = Crawler::extract($line, 'href="', '"');
        echo '<a href="' . $dir . $filename . '">' . rawurldecode($filename) . '</a><br/>' . "\n";
        flush();
    }
    $c->close();

    echo '<br/>' . "\n";
    flush();
}
/**
 * Walks a paginated "showimg.php" gallery starting at $this->url and prints
 * a direct link for every image reference found.
 *
 * For each line containing 'showimg.php?c=', every href on the line is
 * printed with that marker removed; the link text is the name of the href's
 * parent directory. A line containing 'Next»' decides pagination: if it
 * carries a link, that link (prefixed with the gallery host) becomes the next
 * page; otherwise crawling ends.
 *
 * Crawling also ends when a page has no 'Next»' line at all, so a changed or
 * truncated page cannot make the method refetch the same URL forever.
 *
 * @return void
 */
public function go()
{
    $start_url = $this->url;
    if (preg_match('/gallery1\\.hentaifromhell\\.net/', $start_url)) {
        $base = 'http://gallery1.hentaifromhell.net';
    } else {
        $base = 'http://gallery.hentaifromhell.net';
    }

    $selesai = false;
    while (!$selesai) {
        // Assume this is the last page until a "Next»" link proves otherwise.
        $selesai = true;
        echo "{$start_url}<br/>\n";

        $craw = new Crawler($start_url);
        $craw->go2linewhere('showimg.php?c=');
        while ($line = $craw->readline()) {
            if (strpos($line, 'showimg.php?c=') !== false) {
                foreach (Crawler::extract_to_array($line, '<a href="', '"') as $r) {
                    $href = str_replace('showimg.php?c=', '', $r);
                    $text = basename(dirname($href));
                    echo '<a href="' . $href . '">' . $text . '</a>' . "<br />\n";
                }
            } elseif (strpos($line, 'Next»') !== false) {
                if (strpos($line, '<a href') !== false) {
                    $start_url = $base . Crawler::extract($line, '<a href="', '"');
                    $selesai = false;
                }
                break;
            }
        }
        $craw->close();
    }
}
//http://eatmanga.com/Manga-Scan/Berserk/Berserk-315 $base = $_POST['base']; $prefix = $_POST['prefix']; $sitename = "http://eatmanga.com"; $pref = $_POST['base']; if (!Crawler::is_there($pref, '/index.php/')) { $pref = str_replace($sitename . '/Manga', $sitename . '/index.php/Manga', $pref); } if ($base) { $finish = false; $page = 1; while (!$finish) { echo "{$base}<br/>\n"; flush(); $c = new Crawler($base); $c->go2linewhere('mangaviewer_toppest_navig'); if (Crawler::is_there($c->curline, ' ›')) { $finish = false; $base = $pref . '/?page=' . ++$page; } else { $finish = true; } $ledak = explode('<img src="', $c->curline); $c->close(); for ($i = 1; $i < count($ledak); ++$i) { $segm = $ledak[$i]; $parturl = Crawler::cutuntil($segm, '"'); $parturl = str_replace('index.php', 'mangas', $parturl); $parturl = str_replace('?action=thumb', '', $parturl); echo '<a href="' . $sitename . $parturl . '">' . $prefix . '-' . Crawler::n($chapter, 3) . '-' . basename($parturl) . '</a><br/>' . "\n"; flush();
/**
 * Prints a download link for every page image of one manga chapter.
 *
 * The chapter page at $start_url is read for the line following the
 * 'headerSelect' marker; each value="..." on that line is taken as a page
 * URL (prefixed with the global $bas unless it already starts with
 * "http://"). Every page is then fetched and the first '<img src="' on or
 * right after the 'id="readerPage"' line is printed as a link named
 * "<prefix>-<chapter>-<original file name>".
 *
 * The chapter label is the last path segment of $start_url, passed through
 * Crawler::n(..., 3) (integer part only when it contains a dot).
 *
 * A page whose image cannot be extracted is retried, but only up to
 * $max_attempts times; after that a failure line is printed and crawling
 * moves on, instead of refetching the same page without end.
 *
 * Reads the globals $prefix (link name prefix) and $bas (site base URL).
 *
 * @param string $start_url    Chapter address handed to the Crawler constructor.
 * @param int    $max_attempts Fetch attempts per page before giving up.
 * @return void
 */
function crawl_1_page($start_url, $max_attempts = 10)
{
    global $prefix;
    global $bas;

    // Line holding the page selector options.
    $cr = new Crawler($start_url);
    $cr->go2linewhere('headerSelect');
    $cr->readline();
    $line = $cr->curline;
    $cr->close();

    // Chapter label, e.g. "12" or "12.5", with the integer part run through Crawler::n.
    $chap = Crawler::cutfromlast1($start_url, '/');
    if (strpos($chap, '.') === false) {
        $chap = Crawler::n($chap, 3);
    } else {
        $a = explode('.', $chap);
        $a[0] = Crawler::n($a[0], 3);
        $chap = implode('.', $a);
    }

    // Page URLs keyed by their last path segment, so repeated options collapse.
    $ledak = explode('value="', $line);
    $pages = array();
    for ($i = 1; $i < count($ledak); $i++) {
        $uurl = Crawler::cutuntil($ledak[$i], '"');
        $key = Crawler::cutfromlast1($uurl, '/');
        $pages[$key] = strpos($uurl, 'http://') === 0 ? $uurl : $bas . $uurl;
    }

    foreach ($pages as $new_url) {
        $img_url = '';
        for ($attempt = 0; $attempt < $max_attempts && empty($img_url); $attempt++) {
            $cr = new Crawler($new_url);
            $cr->go2linewhere('id="readerPage"');
            // The image tag may sit on the marker line or on the next one.
            if ($cr->strpos('<img src="') === false) {
                $cr->readline();
            }
            $line = $cr->curline;
            $cr->close();
            $img_url = Crawler::extract($line, '<img src="', '"');
        }

        if (empty($img_url)) {
            echo "Giving up on {$new_url}<br />\n";
            flush();
            continue;
        }

        $filename = $prefix . '-' . $chap . '-' . urldecode(basename($img_url));
        echo '<a href="' . $img_url . '">' . $filename . "</a><br />\n";
        flush();
    }
}
$c = new Crawler($turl); if ($c->stream) { $lines = $c->getalllineswhere('>>'); $c->close(); unset($c); //echo '$lines:', htmlspecialchars(print_r($lines, true)), '<br />'; flush(); foreach ($lines as $line) { $bigC++; if ($bigC >= $fromC) { $link = Crawler::extract($line, 'href="', '"'); echo 'Opening ', $bigC, ' ', $link, '<br />'; flush(); $c = new Crawler($link); if ($c->stream) { $c->go2linewhere('time SG_txtc'); $time = $c->getbetween('>(', ')<'); $blines = $c->getalllineswhere('/orignal/'); $c->close(); unset($c); //echo '$blines:', htmlspecialchars(print_r($blines, true)),'<br />'; flush(); foreach ($blines as $bline) { if (strpos($bline, 'url=') === false) { $blink = Crawler::extract($bline, 'HREF="', '"'); } else { if (strpos($bline, 'url=') !== false) { $blink = Crawler::extract($bline, 'url=', '"'); } } $blink = str_replace('&690', '', $blink);
<body> <form action="" method="post"> Starting URL: <input type="text" name="start_url" value="<?php echo isset($start_url) ? $start_url : ''; ?> " /> <input type="submit" value="Submit" /> </form> <?php $masih = true; while ($masih) { echo "{$start_url}<br/>\n"; flush(); $craw = new Crawler($start_url); //first get the pictures $craw->go2linewhere('class="photogallery-celeb"'); $craw->readline(); $line = $craw->curline; //echo $line; //echo '<br />HOI<br />'; $ledakan = explode('<img src="', $line); for ($i = 1; $i < count($ledakan); $i++) { $imgurl = str_replace('/t/', '/', Crawler::cutuntil($ledakan[$i], '"')); $file = basename($imgurl); echo "<a href='{$imgurl}'>{$file}</a><br />\n"; } //then check the next link $craw->go_to('class="arrow">»</a>'); $url = $craw->getbetweenlast('<a href="', '"'); if ($url == '#') { $masih = false;
<?php
require_once 'crawler.php'; // provides class Crawler

// Prints one link per image file found in every sub-folder of the directory
// index at $base. Each link's text is the folder name without its last
// character.
$base = 'http://gravure.ecchi-squad.net/images/gravure/';

// Pass 1: collect folder hrefs from the top-level index, from the first
// folder-icon line up to the line containing </pre>.
$craw = new Crawler($base);
$craw->go2linewhere('<img src="/icons/folder.gif"');
for ($folders = array(); strpos($craw->curline, '</pre>') === false; $craw->readline()) {
    $folders[] = $craw->getbetween('<a href="', '"');
}
$craw->close();

// Pass 2: open each folder index and list its image entries the same way.
foreach ($folders as $folder) {
    unset($craw);
    $craw = new Crawler($base . $folder);
    $craw->go2linewhere('<img src="/icons/image2.gif"');
    for ($files = array(); strpos($craw->curline, '</pre>') === false; $craw->readline()) {
        $files[] = $craw->getbetween('<a href="', '"');
    }
    $craw->close();

    // Drop the final character of the folder href for the link text.
    $fold = substr($folder, 0, -1);
    foreach ($files as $file) {
        echo '<a href="' . $base . $folder . $file . '">' . $fold . "</a><br />\n";
    }
    flush();
}
<html>
<body>
<?php
require_once "crawler.php";

// Prints one link per picture for every profile page in an ID range.
// For each ID the page "$start<ID>" is fetched, the first line containing
// 'Thumbnail' is taken, and every FileName="..." chunk on it yields an image
// URL with 'Thumbnail' replaced by 'Viewer'. The link text is the name found
// in the lblName span on that same line.
$istart = 1;
$ifinish = 399;
$start = 'http://asianchicki.com/Girl.aspx?ID=';

// Only the ID range may be overridden by the request. Importing the whole
// request with extract() would let a caller overwrite any variable here,
// including the URL that is fetched. POST takes precedence over GET.
$request = array_merge($_GET, $_POST);
if (isset($request['istart']) && is_numeric($request['istart'])) {
    $istart = (int) $request['istart'];
}
if (isset($request['ifinish']) && is_numeric($request['ifinish'])) {
    $ifinish = (int) $request['ifinish'];
}

for ($i = $istart; $i <= $ifinish; $i++) {
    $turl = $start . $i;
    $c = new Crawler($turl);
    if ($c->stream) {
        $c->go2linewhere('Thumbnail');
        $c->close();
        // The matched line is still read through the crawler after close().
        $nama = $c->getbetween('ctl00_ContentPlaceHolder1_lblName">', '</span');
        $ledak = explode('FileName="', $c->curline);
        $ccount = count($ledak);
        // Chunk 0 precedes the first FileName attribute, so start at 1.
        for ($j = 1; $j < $ccount; $j++) {
            $iurl = Crawler::extract($ledak[$j], 'src="', '"');
            $iurl = str_replace('Thumbnail', 'Viewer', $iurl);
            echo '<a href="' . $iurl . '">' . $nama . '</a>' . "<br />\n";
        }
    }
    flush();
}
<?php require_once "crawler.php"; $base = 'http://www.ez-wallpaper.org'; $berhenti = 0; $url = $base; while (!$berhenti) { echo "\nURL:{$url}\n"; $c = new Crawler($url); $c->readline(); while ($line = $c->readline()) { if ($c->strpos('nodeTitle') !== false) { $href = $c->getbetween('<a href="', '"'); $c2 = new Crawler($base . $href); $c2->go2linewhere('pageTitle'); $title = $c2->getbetween('>', '<'); $c2->go2linewhere('node_images'); $ledak = explode('<a href="', $c2->curline); for ($i = 1; $i < count($ledak); $i++) { $ahref = substr($ledak[$i], 0, strpos($ledak[$i], '"')); echo "<a href='{$ahref}'>{$title}</a><br />\n"; } //echo $c2->curline; $c2->close(); } else { if ($c->strpos('Go to next page') !== false) { echo "\nADA NEXT\n"; $url = $base . $c->getbetweenlast('</span><a href="', '"'); break; } else { if ($c->strpos('Go to previous page') !== false) {
<body>
<form action="" method="post">
    Start url: <input type="text" name="start_url" value="<?php echo isset($start_url) ? htmlspecialchars($start_url, ENT_QUOTES) : ''; ?>" />
    <input type="submit" />
</form>
<?php
// Two-level link listing for bluelaguna.net.
// The first '<iframe id="rectangle"' line of $start_url supplies the
// first-level hrefs (site-relative, so $basename is prepended). Each of those
// pages is fetched and the hrefs on its own '<iframe id="rectangle"' line are
// printed as links named after their basename.
//
// The form value above is HTML-escaped because $start_url is echoed back
// into an attribute. $start_url is expected to be set before this point.
$basename = 'http://bluelaguna.net';

// !empty() rather than a bare test, so an unset $start_url is not an error.
if (!empty($start_url)) {
    $c = new Crawler($start_url);
    $c->go2linewhere('<iframe id="rectangle"');
    $c->close();
    $ledak = explode('<a href="', $c->curline);

    // Chunk 0 precedes the first anchor, so start at 1.
    for ($i = 1; $i < count($ledak); ++$i) {
        $aurl = $basename . Crawler::cutuntil($ledak[$i], '"');
        echo "{$aurl}<br />";

        $c = new Crawler($aurl);
        $c->go2linewhere('<iframe id="rectangle"');
        $c->close();
        $ledak2 = explode('<a href="', $c->curline);
        for ($j = 1; $j < count($ledak2); ++$j) {
            $burl = Crawler::cutuntil($ledak2[$j], '"');
            echo '<a href="' . htmlentities($burl) . '">' . basename($burl) . "</a><br />\n";
        }
        echo "<br />\n";
        flush();
    }
}
require_once "crawler.php";

// Follows the dilbert.com strip pages from $start_date through each page's
// 'STR_Prev' link, printing one image link per strip, until a page without
// such a link is reached.
$start_date = '2009-03-10';
$base_url = 'http://www.dilbert.com';
$middle_url = '/strips/comic/';

// Only the start date may come from the query string, and only in
// YYYY-MM-DD form. Importing all of $_GET with extract() would let a request
// overwrite $base_url and $middle_url, i.e. choose the host being fetched.
if (isset($_GET['start_date'])
    && is_string($_GET['start_date'])
    && preg_match('/^\\d{4}-\\d{2}-\\d{2}\\z/', $_GET['start_date'])
) {
    $start_date = $_GET['start_date'];
}

$selesai = false;
$url = $base_url . $middle_url . $start_date;
while (!$selesai) {
    $c = new Crawler($url);
    echo "URL is {$url}<br />\n";
    flush();

    // Stop at whichever marker comes first.
    $c->go2lineor(array('STR_Content', 'STR_Prev'));
    if ($c->strpos('STR_Prev') !== false) {
        // There is a further strip to visit: remember its URL, then move on
        // to this page's content marker.
        $url = $base_url . $c->getbetween('<a href="', '"');
        $c->go2linewhere('STR_Content');
    } else {
        // No such link on this page: this is the last iteration.
        $selesai = true;
    }

    // The image tag is read from the line after the content marker.
    $c->readline();
    $img = $c->getbetween('<img src="', '"');
    echo "<a href='{$base_url}{$img}'>{$start_date}</a><br />\n";

    // Label for the next iteration, taken from the URL just stored.
    $start_date = Crawler::extract($url, 'comic/', '/');
    $c->close();
    echo "Closed\n";
    flush();
}