Exemple #1
0
/**
 * Collects one value from every line of the section that follows the second
 * dashed separator on a page, stopping at the next separator line.
 *
 * NOTE(review): depends on the project's Crawler class. Presumably
 * go2linewhere() advances to a line containing the given text and
 * getbetween() returns the text between two markers on the current line;
 * confirm against crawler.php.
 *
 * @param string $url Address of the page to read.
 * @return array One entry per line: whatever getbetween(' ', '  ') yields.
 */
function crawl_one_page($url)
{
    $separator = '------------------------------------------';

    $page = new Crawler($url);
    // The wanted section starts right after the second separator line.
    $page->go2linewhere($separator);
    $page->go2linewhere($separator);

    $collected = array();
    for ($page->readline(); $page->strpos($separator) === false; $page->readline()) {
        $collected[] = $page->getbetween(' ', '  ');
    }
    $page->close();

    return $collected;
}
 /**
  * Prints one HTML link per anchor found on the first line of the page at
  * $this->url that contains '<p><a href="', with the site's redirect prefix
  * stripped from every target.
  */
 public function go()
 {
     $page = new Crawler($this->url);
     $page->go2linewhere('<p><a href="');
     $page->close();

     // The text before the first '<a href="' holds no link, so skip piece 0.
     $pieces = array_slice(explode('<a href="', $page->curline), 1);
     foreach ($pieces as $piece) {
         $target = str_replace(
             'http://hentaifromhell.net/redirect.html?',
             '',
             Crawler::cutuntil($piece, '"')
         );
         echo "<a href='{$target}'>{$target}</a><br />\n";
         flush();
     }
 }
Exemple #3
0
 /**
  * Prints a link to the comic image of one page, labelled
  * "<year>_<month>_<day>_<title><extension>".
  *
  * The title comes from the <title> line, the date from the line reached by
  * readline(2) after the 'date_left.gif' line, and the image address from
  * the '<td bgcolor=#FFFFFF' line.
  *
  * @param string $url Address of the comic page.
  * @return void
  */
 public function crawl_page($url)
 {
     $c = new Crawler($url);

     // Title, with every non-word character replaced so it is filename-safe.
     $c->go_to('<title>');
     $title = Crawler::extract($c->curline, 'PHD Comics: ', '</title>');
     $title = preg_replace('/\\W/', '_', $title);

     // Date in month/day/year form. Stay with empty parts when the line
     // does not contain one, instead of reading undefined match offsets.
     $c->go_to('date_left.gif');
     $c->readline(2);
     $month = $date = $year = '';
     if (preg_match('/([0-9]+)\\/([0-9]+)\\/([0-9]+)/', $c->curline, $matches)) {
         list(, $month, $date, $year) = $matches;
     }
     if (strlen($date) < 2) {
         $date = '0' . $date;
     }
     if (strlen($month) < 2) {
         $month = '0' . $month;
     }
     $fileprefix = "{$year}_{$month}_{$date}_{$title}";

     // Image address. The value may be quoted or bare, so stop at the first
     // space, quote or '>' rather than swallowing the closing delimiter.
     $c->go2linewhere('<td bgcolor=#FFFFFF');
     $img = '';
     if (preg_match('/<img src=["\']?([^ "\'>]+)/i', $c->curline, $matches)) {
         $img = $matches[1];
     }

     // Extension including the dot, or nothing when the name has no dot.
     $filename = basename($img);
     $dot = strrpos($filename, '.');
     $ext = $dot === false ? '' : substr($filename, $dot);

     echo "<a href='{$img}'>" . $fileprefix . $ext . "</a><br/>";
     flush();
     $c->close();
     unset($c);
 }
 /**
  * Walks a paginated thumbnail gallery starting at $this->url and prints a
  * link to the full-size image behind every thumbnail.
  *
  * Each page is scanned line by line after the first
  * '<li class="thumbnail">' line. Scanning of a page stops at the
  * 'class="pagNext"' link (which supplies the next page) or at '</table>'.
  * The walk ends when a page offers no next link.
  *
  * NOTE(review): the next-page href is used exactly as extracted, so it is
  * assumed to be an absolute address; confirm against the site's markup.
  *
  * @return void
  */
 public function go()
 {
     $start_url = $this->url;
     $finish = false;
     while (!$finish) {
         // Assume this is the last page until a next link proves otherwise.
         $finish = true;
         echo $start_url, "<br />\n";
         flush();
         $c = new Crawler($start_url);
         $c->go2linewhere('<li class="thumbnail">');
         while ($line = $c->readline()) {
             if (strpos($line, 'src="') !== false) {
                 // Thumbnail line: turn the thumbnail address into the image address.
                 $uri = Crawler::extract($line, 'src="', '"');
                 $uri = str_replace('/thumbs/', '/images/', $uri);
                 // Drop the "/small/<digits>-" part when present; addresses
                 // without it are left untouched.
                 if (preg_match('/(\\/small\\/\\d+-)/', $uri, $matches)) {
                     $uri = str_replace($matches[1], '/', $uri);
                 }
                 $file = basename(dirname($uri));
                 echo "<a href='{$uri}'>{$file}</a><br/>\n";
                 flush();
             } elseif (strpos($line, 'class="pagNext"') !== false) {
                 // Next page found: continue the outer loop from there.
                 $finish = false;
                 $start_url = html_entity_decode(Crawler::extract($line, 'class="pagNext" href="', '"'));
                 break;
             } elseif (strpos($line, '</table>') !== false) {
                 // End of the gallery table: nothing more on this page.
                 break;
             }
         }
         $c->close();
     }
 }
Exemple #5
0
/**
 * Prints links to the files listed in the directory that holds the first
 * gallery thumbnail target of a page.
 *
 * The function reads the href on the line after the first
 * '<div class="ngg-gallery-thumbnail"' line, derives its directory
 * (URL-encoding the last path segment), opens that directory address and
 * prints one link per line between '<ul>' and '</ul>'. It stops early at a
 * line mentioning "thumbs/".
 *
 * @param string $url Address of the gallery page.
 * @return void
 */
function crawl1page($url)
{
    echo 'Entering ' . $url . '<br/>';
    flush();

    $c = new Crawler($url);
    $c->go2linewhere('<div class="ngg-gallery-thumbnail"');
    $c->readline();
    $sample = $c->getbetween('href="', '"');
    $c->close();

    $dir = dirname($sample);
    if (!$dir) {
        // No thumbnail link was found, so there is nothing to list.
        return;
    }
    $folder = substr($dir, strrpos($dir, '/') + 1);
    $dir = dirname($dir) . '/' . rawurlencode($folder) . '/';
    echo 'Dir:' . $dir . '<br/>' . "\n";
    flush();

    $c = new Crawler($dir);
    $c->go2linewhere('<ul>');
    $c->readline();
    while ($line = $c->readline()) {
        // Compare against false explicitly: strpos() returns 0 for a match
        // at the very start of the line, which a truthiness test would miss.
        if (strpos($line, '</ul>') !== false || strpos($line, '"thumbs/"') !== false) {
            break;
        }
        $filename = Crawler::extract($line, 'href="', '"');
        echo '<a href="' . $dir . $filename . '">' . rawurldecode($filename) . '</a><br/>' . "\n";
        flush();
    }
    $c->close();
    echo '<br/>' . "\n";
    flush();
}
 /**
  * Walks a paginated gallery starting at $this->url and prints a link for
  * every 'showimg.php?c=' anchor, with that prefix removed from the target.
  *
  * The line containing 'Next&raquo;' decides how the walk continues: if it
  * holds a link, that link (prefixed with the gallery host) is the next
  * page; otherwise the walk ends. A page with no such line also ends the
  * walk.
  *
  * @return void
  */
 public function go()
 {
     $start_url = $this->url;
     if (preg_match('/gallery1\\.hentaifromhell\\.net/', $start_url)) {
         $base = 'http://gallery1.hentaifromhell.net';
     } else {
         $base = 'http://gallery.hentaifromhell.net';
     }
     $selesai = false;
     while (!$selesai) {
         // Treat the page as the last one unless a next link is found.
         // Without this default, a page lacking the 'Next&raquo;' line would
         // be fetched again and again.
         $selesai = true;
         echo "{$start_url}<br/>\n";
         $craw = new Crawler($start_url);
         $craw->go2linewhere('showimg.php?c=');
         while ($line = $craw->readline()) {
             if (strpos($line, 'showimg.php?c=') !== false) {
                 foreach (Crawler::extract_to_array($line, '<a href="', '"') as $r) {
                     $href = str_replace('showimg.php?c=', '', $r);
                     // Label each link with the name of the image's parent folder.
                     $text = basename(dirname($href));
                     echo '<a href="' . $href . '">' . $text . '</a>' . "<br />\n";
                 }
             } elseif (strpos($line, 'Next&raquo;') !== false) {
                 if (strpos($line, '<a href') !== false) {
                     $start_url = $base . Crawler::extract($line, '<a href="', '"');
                     $selesai = false;
                 }
                 break;
             }
         }
         $craw->close();
     }
 }
Exemple #7
0
// Prints links to the page images of a chapter, following the viewer's
// pagination for as long as the navigation line offers a "next" marker.
// Example chapter address:
// http://eatmanga.com/Manga-Scan/Berserk/Berserk-315
// NOTE(review): this listing ends before its loops and the `if` are closed.
$base = $_POST['base'];
$prefix = $_POST['prefix'];
$sitename = "http://eatmanga.com";
// Address used to build the "?page=N" links; rewritten to the /index.php/
// form unless it already contains it.
$pref = $_POST['base'];
if (!Crawler::is_there($pref, '/index.php/')) {
    $pref = str_replace($sitename . '/Manga', $sitename . '/index.php/Manga', $pref);
}
if ($base) {
    $finish = false;
    $page = 1;
    while (!$finish) {
        echo "{$base}<br/>\n";
        flush();
        $c = new Crawler($base);
        $c->go2linewhere('mangaviewer_toppest_navig');
        // A "next" arrow on the navigation line means another page follows.
        if (Crawler::is_there($c->curline, '&nbsp;&nbsp;&rsaquo;')) {
            $finish = false;
            $base = $pref . '/?page=' . ++$page;
        } else {
            $finish = true;
        }
        // Split the same line on image tags; piece 0 precedes the first tag.
        $ledak = explode('<img src="', $c->curline);
        $c->close();
        for ($i = 1; $i < count($ledak); ++$i) {
            $segm = $ledak[$i];
            $parturl = Crawler::cutuntil($segm, '"');
            // Turn the thumbnail address into the full image address.
            $parturl = str_replace('index.php', 'mangas', $parturl);
            $parturl = str_replace('?action=thumb', '', $parturl);
            // NOTE(review): $chapter is never assigned in the visible code; confirm where it comes from.
            echo '<a href="' . $sitename . $parturl . '">' . $prefix . '-' . Crawler::n($chapter, 3) . '-' . basename($parturl) . '</a><br/>' . "\n";
            flush();
/**
 * Prints a download link for every page image of one chapter.
 *
 * The chapter page's selector line (the line after the one containing
 * 'headerSelect') lists the reader pages as value="..." attributes. Each
 * listed page is opened and the image found on its 'id="readerPage"' line
 * (or the line after it) is printed as a link labelled
 * "<prefix>-<chapter>-<image file name>".
 *
 * Reads the globals $prefix (label prefix) and $bas (prepended to page
 * addresses that do not start with "http://").
 *
 * @param string $start_url    Address of the chapter; its last path segment
 *                             is the chapter number, optionally with a
 *                             ".x" suffix.
 * @param int    $max_attempts How many times a reader page is fetched
 *                             before giving up on it. A page that never
 *                             yields an image used to be retried forever.
 * @return void
 */
function crawl_1_page($start_url, $max_attempts = 10)
{
    global $prefix;
    global $bas;

    $cr = new Crawler($start_url);
    $cr->go2linewhere('headerSelect');
    $cr->readline();
    $line = $cr->curline;
    $cr->close();

    // Chapter label: pad the integer part via Crawler::n(…, 3), keep any ".x" suffix.
    $chap = Crawler::cutfromlast1($start_url, '/');
    if (strpos($chap, '.') === false) {
        $chap = Crawler::n($chap, 3);
    } else {
        $a = explode('.', $chap);
        $a[0] = Crawler::n($a[0], 3);
        $chap = implode('.', $a);
    }

    // Reader pages keyed by their last path segment, so repeated entries collapse.
    $ledak = explode('value="', $line);
    $pages = array();
    for ($i = 1; $i < count($ledak); $i++) {
        $uurl = Crawler::cutuntil($ledak[$i], '"');
        $key = Crawler::cutfromlast1($uurl, '/');
        $pages[$key] = strpos($uurl, 'http://') === 0 ? $uurl : $bas . $uurl;
    }

    foreach ($pages as $new_url) {
        $img_url = '';
        for ($attempt = 1; $attempt <= $max_attempts && empty($img_url); $attempt++) {
            $cr = new Crawler($new_url);
            $cr->go2linewhere('id="readerPage"');
            // The image tag sits either on the marker line or on the next one.
            if ($cr->strpos('<img src="') === false) {
                $cr->readline();
            }
            $line = $cr->curline;
            $cr->close();
            $img_url = Crawler::extract($line, '<img src="', '"');
        }

        if (empty($img_url)) {
            // Give up on this page and move on to the remaining ones.
            echo "No image found: {$new_url}<br />\n";
            flush();
            continue;
        }

        $filename = $prefix . '-' . $chap . '-' . urldecode(basename($img_url));
        echo "\t\t\t\t<a href=\"{$img_url}\">{$filename}</a><br />\n\t\t\t\t";
        flush();
    }
}
Exemple #9
0
 // Two-level crawl: collect every line of $turl containing '>>', open the
 // link on each of those lines, and gather that page's '/orignal/' lines.
 // NOTE(review): this listing is cut off before its blocks are closed;
 // $turl, $bigC and $fromC are set outside the visible code.
 $c = new Crawler($turl);
 if ($c->stream) {
     $lines = $c->getalllineswhere('>>');
     $c->close();
     unset($c);
     //echo '$lines:', htmlspecialchars(print_r($lines, true)), '<br />';
     flush();
     foreach ($lines as $line) {
         // Running counter over all links; entries before $fromC are skipped.
         $bigC++;
         if ($bigC >= $fromC) {
             $link = Crawler::extract($line, 'href="', '"');
             echo 'Opening ', $bigC, ' ', $link, '<br />';
             flush();
             $c = new Crawler($link);
             if ($c->stream) {
                 $c->go2linewhere('time SG_txtc');
                 $time = $c->getbetween('>(', ')<');
                 $blines = $c->getalllineswhere('/orignal/');
                 $c->close();
                 unset($c);
                 //echo '$blines:', htmlspecialchars(print_r($blines, true)),'<br />';
                 flush();
                 foreach ($blines as $bline) {
                     // Take the target from 'url=' when the line has one,
                     // otherwise from its HREF attribute.
                     if (strpos($bline, 'url=') === false) {
                         $blink = Crawler::extract($bline, 'HREF="', '"');
                     } else {
                         // This inner test is always true inside this branch.
                         if (strpos($bline, 'url=') !== false) {
                             $blink = Crawler::extract($bline, 'url=', '"');
                         }
                     }
                     $blink = str_replace('&amp;690', '', $blink);
Exemple #10
0
<body>
	<form action="" method="post">
		Starting URL: <input type="text" name="start_url" value="<?php 
echo isset($start_url) ? $start_url : '';
?>
" />
		<input type="submit" value="Submit" />
	</form>
<?php 
// Walks a paginated photo gallery: prints a link for every image on the
// gallery line of each page, then looks for the "next" arrow link.
// NOTE(review): this listing is cut off inside the final `if`; $start_url
// is expected to be set before this point.
// $masih ("still") stays true while there are more pages to visit.
$masih = true;
while ($masih) {
    echo "{$start_url}<br/>\n";
    flush();
    $craw = new Crawler($start_url);
    //first get the pictures
    $craw->go2linewhere('class="photogallery-celeb"');
    $craw->readline();
    $line = $craw->curline;
    //echo $line;
    //echo '<br />HOI<br />';
    // Split on image tags; piece 0 precedes the first tag and is skipped.
    $ledakan = explode('<img src="', $line);
    for ($i = 1; $i < count($ledakan); $i++) {
        // Removing the '/t/' path segment turns the address into the one that is linked.
        $imgurl = str_replace('/t/', '/', Crawler::cutuntil($ledakan[$i], '"'));
        $file = basename($imgurl);
        echo "<a href='{$imgurl}'>{$file}</a><br />\n";
    }
    //then check the next link
    $craw->go_to('class="arrow">&#187;</a>');
    $url = $craw->getbetweenlast('<a href="', '"');
    // A '#' target means there is no further page.
    if ($url == '#') {
        $masih = false;
<?php

require_once 'crawler.php';
//class Crawler
// Prints a link to every image in every folder of a web directory listing.
// Folder rows are recognised by the folder icon, image rows by the image
// icon; each listing is read until the line containing '</pre>'.
$base = 'http://gravure.ecchi-squad.net/images/gravure/';

// Pass 1: collect the folder names from the top-level listing.
$folders = array();
$craw = new Crawler($base);
$craw->go2linewhere('<img src="/icons/folder.gif"');
for (; strpos($craw->curline, '</pre>') === false; $craw->readline()) {
    $folders[] = $craw->getbetween('<a href="', '"');
}
$craw->close();

// Pass 2: list the images inside each folder.
foreach ($folders as $folder) {
    unset($craw);
    $craw = new Crawler($base . $folder);
    $craw->go2linewhere('<img src="/icons/image2.gif"');
    $files = array();
    for (; strpos($craw->curline, '</pre>') === false; $craw->readline()) {
        $files[] = $craw->getbetween('<a href="', '"');
    }
    $craw->close();

    // Link label: the folder name without its last character.
    $fold = substr($folder, 0, -1);
    foreach ($files as $file) {
        echo '<a href="' . $base . $folder . $file . '">' . $fold . "</a><br />\n";
    }
    flush();
}
Exemple #12
0
<html>
<body>
<?php 
require_once "crawler.php";
// Prints links to the viewer-size images of the profile pages numbered
// $istart..$ifinish, each labelled with the name shown on its page.
$istart = 1;
$ifinish = 399;
$start = 'http://asianchicki.com/Girl.aspx?ID=';

// Only these three settings may be overridden by the request. Importing the
// whole of $_GET/$_POST into the symbol table would let a request set any
// variable of the script. POST takes precedence over GET, as before.
// NOTE(review): 'start' still lets the caller choose the address that is
// fetched; drop it from the overrides if that is not intended.
$overrides = $_POST + $_GET;
if (isset($overrides['istart']) && is_scalar($overrides['istart'])) {
    $istart = (int) $overrides['istart'];
}
if (isset($overrides['ifinish']) && is_scalar($overrides['ifinish'])) {
    $ifinish = (int) $overrides['ifinish'];
}
if (isset($overrides['start']) && is_string($overrides['start'])) {
    $start = $overrides['start'];
}

for ($i = $istart; $i <= $ifinish; $i++) {
    $turl = $start . $i;
    $c = new Crawler($turl);
    if ($c->stream) {
        $c->go2linewhere('Thumbnail');
        $c->close();
        // Name and thumbnails are both read from the line the crawler stopped on.
        $nama = $c->getbetween('ctl00_ContentPlaceHolder1_lblName">', '</span');
        $ledak = explode('FileName="', $c->curline);
        //echo "<pre>{$c->curline}</pre><br />\n";
        $ccount = count($ledak);
        // Piece 0 precedes the first FileName attribute and holds no image.
        for ($j = 1; $j < $ccount; $j++) {
            $iurl = Crawler::extract($ledak[$j], 'src="', '"');
            $iurl = str_replace('Thumbnail', 'Viewer', $iurl);
            echo '<a href="' . $iurl . '">' . $nama . '</a>' . "<br />\n";
        }
    }
    flush();
}
Exemple #13
0
<?php

require_once "crawler.php";
// Walks the site's paginated index: for every 'nodeTitle' line, opens the
// linked page and prints its image links labelled with that page's title;
// a 'Go to next page' line supplies the next index page.
// NOTE(review): this listing is cut off inside the innermost `if`.
$base = 'http://www.ez-wallpaper.org';
// Loop flag ("berhenti" = stop); the code that sets it is not visible here.
$berhenti = 0;
$url = $base;
while (!$berhenti) {
    echo "\nURL:{$url}\n";
    $c = new Crawler($url);
    $c->readline();
    while ($line = $c->readline()) {
        if ($c->strpos('nodeTitle') !== false) {
            // Index entry: follow its link to the detail page.
            $href = $c->getbetween('<a href="', '"');
            $c2 = new Crawler($base . $href);
            $c2->go2linewhere('pageTitle');
            $title = $c2->getbetween('>', '<');
            $c2->go2linewhere('node_images');
            // Split on anchors; piece 0 precedes the first anchor and is skipped.
            $ledak = explode('<a href="', $c2->curline);
            for ($i = 1; $i < count($ledak); $i++) {
                $ahref = substr($ledak[$i], 0, strpos($ledak[$i], '"'));
                echo "<a href='{$ahref}'>{$title}</a><br />\n";
            }
            //echo $c2->curline;
            $c2->close();
        } else {
            if ($c->strpos('Go to next page') !== false) {
                echo "\nADA NEXT\n";
                // Continue with the next index page.
                $url = $base . $c->getbetweenlast('</span><a href="', '"');
                break;
            } else {
                if ($c->strpos('Go to previous page') !== false) {
Exemple #14
0
<body>
<form action="" method="post">
	Start url: <input type="text" name="start_url" value="<?php 
echo isset($start_url) ? $start_url : '';
?>
" />
	<input type="submit" />
</form>
<?php 
// Two-level listing: every link on the start page's
// '<iframe id="rectangle"' line is opened, and every link on the same kind
// of line of that sub-page is printed.
$basename = 'http://bluelaguna.net';
// $start_url may be unset on the first visit (the form above guards it with
// isset), so test it without raising a notice.
if (!empty($start_url)) {
    $c = new Crawler($start_url);
    $c->go2linewhere('<iframe id="rectangle"');
    $c->close();
    // Piece 0 precedes the first anchor and holds no link.
    $ledak = explode('<a href="', $c->curline);
    for ($i = 1; $i < count($ledak); ++$i) {
        // First-level links are site-relative, so prefix the host.
        $aurl = $basename . Crawler::cutuntil($ledak[$i], '"');
        echo "{$aurl}<br />";
        $c = new Crawler($aurl);
        $c->go2linewhere('<iframe id="rectangle"');
        $c->close();
        $ledak2 = explode('<a href="', $c->curline);
        for ($j = 1; $j < count($ledak2); ++$j) {
            $burl = Crawler::cutuntil($ledak2[$j], '"');
            echo '<a href="' . htmlentities($burl) . '">' . basename($burl) . "</a><br />\n";
        }
        echo "<br />\n";
        flush();
    }
}
Exemple #15
0
require_once "crawler.php";
// Prints a link to the strip image of every comic page, starting at
// $start_date and following each page's 'STR_Prev' link until a page has
// none.
$start_date = '2009-03-10';
$base_url = 'http://www.dilbert.com';
$middle_url = '/strips/comic/';

// Accept only the three settings above from the query string. Importing the
// whole of $_GET into the symbol table would let a request set any variable
// of the script.
// NOTE(review): 'base_url' still lets the caller choose the host that is
// fetched; drop it from the overrides if that is not intended.
if (isset($_GET['start_date']) && is_string($_GET['start_date'])) {
    $start_date = $_GET['start_date'];
}
if (isset($_GET['base_url']) && is_string($_GET['base_url'])) {
    $base_url = $_GET['base_url'];
}
if (isset($_GET['middle_url']) && is_string($_GET['middle_url'])) {
    $middle_url = $_GET['middle_url'];
}

$selesai = false;
$url = $base_url . $middle_url . $start_date;
while (!$selesai) {
    $c = new Crawler($url);
    echo "URL is {$url}<br />\n";
    flush();
    // Stop at whichever marker comes first.
    $c->go2lineor(array('STR_Content', 'STR_Prev'));
    if ($c->strpos('STR_Prev') !== false) {
        // Remember the linked page for the next round, then move on to
        // this page's strip.
        $url = $base_url . $c->getbetween('<a href="', '"');
        $c->go2linewhere('STR_Content');
    } else {
        // No such link: this is the last page to process.
        $selesai = true;
    }
    // The image tag is on the line after the 'STR_Content' marker.
    $c->readline();
    $img = $c->getbetween('<img src="', '"');
    echo "<a href='{$base_url}{$img}'>{$start_date}</a><br />\n";
    // The label for the next strip is the date segment of the next address.
    $start_date = Crawler::extract($url, 'comic/', '/');
    $c->close();
    echo "Closed\n";
    flush();
}