/**
 * Collect the image source of every page of a chapter.
 *
 * The page count is read from the "of N" counter located after the
 * `id="top_bar"` line of the first page. Pages 2..N are then fetched from
 * `<dirname of $chapter_url>/<n>.html`. On every page the image source is
 * the `src="..."` value found two lines below the `id="viewer"` line.
 *
 * @param string $chapter_url URL of the chapter's first page
 * @param string $prefix      leading part of every generated key
 * @param mixed  $infix       chapter identifier, passed through Text::pad(3)
 * @return array map of "{prefix}-{infix}-{basename of src}" => src, where
 *               src is whatever cut_between() returned
 */
public function get_images($chapter_url, $prefix, $infix)
 {
     $chapter = Text::create($infix)->pad(3)->to_s();
     // Shared step: locate the viewer block and pull out the image source.
     $viewer_src = function ($page) {
         $page->go_line('id="viewer"');
         $page->next_line(2);
         return $page->curr_line()->cut_between('src="', '"');
     };
     $first = new Page($chapter_url);
     // The total number of pages comes from the "of N" counter.
     $first->go_line('id="top_bar"');
     $first->go_line_regex('/of \\d+\\w+/');
     $counter = $first->curr_line()->regex_match('/of (\\d+)/');
     $total = $counter[1];
     // The first page is already loaded, so harvest it before fetching more.
     $images = array();
     $src = $viewer_src($first);
     $images["{$prefix}-{$chapter}-" . basename($src)] = $src;
     $folder = dirname($chapter_url);
     $n = 2;
     while ($n <= $total) {
         $src = $viewer_src(new Page($folder . '/' . $n . '.html'));
         $images["{$prefix}-{$chapter}-" . basename($src)] = $src;
         $n++;
     }
     return $images;
 }
Example #2
0
 /**
  * Print one HTML link per page image of a chapter.
  *
  * The page count is read from the "of N" counter after the `id="top_bar"`
  * line of the first page; the remaining pages are fetched from
  * `<dirname of url>/<n>.html`. Each link points at the `src` found two
  * lines below `id="viewer"` and is labelled
  * "{$this->prefix}-{infix}-{basename of src}".
  *
  * @param array $v chapter record; the 'url' and 'infix' keys are read
  * @return void output is echoed
  */
 public function crawl_chapter($v)
 {
     $chapter = Text::create($v['infix'])->pad(3)->to_s();
     $series = $this->prefix;
     $page = new Page($v['url']);
     // Read the total page count from the first page.
     $page->go_line('id="top_bar"');
     $page->go_line_regex('/of \\d+\\w+/');
     $counter = $page->curr_line()->regex_match('/of (\\d+)/');
     $total = $counter[1];
     // Emit the page in hand, then load the next one until the count is reached.
     $n = 1;
     do {
         $page->go_line('id="viewer"');
         $page->next_line(2);
         $src = $page->curr_line()->cut_between('src="', '"');
         $file = basename($src);
         echo "<a href='{$src}'>{$series}-{$chapter}-{$file}</a><br>\n";
         $n++;
         if (!($n <= $total)) {
             break;
         }
         $page = new Page(dirname($v['url']) . '/' . $n . '.html');
     } while (true);
 }
 /**
  * Walk a paginated thumbnail listing and download every original image.
  *
  * For each listing page, every line between the first `class="thumb"` line
  * and the line containing `<center>` that has an `href="` is treated as a
  * post link. The post page is opened, the "Original image" link is read
  * and handed to download_it(). Pagination follows the `alt="next"` link on
  * the `id="paginator"` line until none is present.
  *
  * @param string $url first listing page to open
  * @param string $dir destination prefix; the image basename is appended
  *                    directly, so it should end with a path separator
  * @return void progress is echoed
  */
 private function collect_images($url, $dir)
 {
     $domain = 'http://rule34.xxx/';
     $base = 'http://rule34.xxx/index.php';
     while (true) {
         echo $url . "\n";
         $listing = new Page($url);
         $listing->go_line('class="thumb"');
         do {
             if (!$listing->curr_line()->contain('href="')) {
                 // `continue` in a do-while still evaluates the loop condition.
                 continue;
             }
             $href = htmlspecialchars_decode(
                 $listing->curr_line()->cut_between('href="', '"')->to_s()
             );
             $post_url = $domain . $href;
             echo "{$post_url}\n";
             $post = new Page($post_url);
             $post->go_line('Original image');
             // The delimiter swallows the scheme, so put it back afterwards.
             $src = 'http:' . $post->curr_line()->cut_between('href="http:', '"')->to_s();
             download_it($src, $dir . basename($src), "--header=\"Accept: image/*\"");
         } while (!$listing->next_line()->contain('<center>'));
         // Rewind before searching for the paginator.
         $listing->reset_line();
         $listing->go_line('id="paginator"');
         if (!$listing->curr_line()->contain('alt="next"')) {
             break;
         }
         $m = $listing->curr_line()->regex_match('/href="([^"]+)" alt="next"/');
         $url = $base . html_entity_decode($m[1]);
     }
 }
Example #4
0
 /**
  * Build the chapter list from a download listing page.
  *
  * Scans from the `<!-- START FILE LOOP -->` line up to the line containing
  * `<!-- END DOWNLOADS -->`. For each line with an `href=`, the link target
  * becomes the chapter URL, the basename of its grandparent path becomes
  * the description, and the last three characters of that description
  * become the infix.
  *
  * @param string $base URL of the listing page
  * @return array list of array('url' => ..., 'desc' => ..., 'infix' => ...)
  */
 public function extract_info($base)
 {
     $page = new Page($base);
     $page->go_line('<!-- START FILE LOOP -->');
     $chapters = array();
     do {
         if (!$page->curr_line()->contain('href=')) {
             continue;
         }
         $link = $page->curr_line()->cut_between("</td><td><a href='", "'");
         $folder = $link->dirname()->dirname()->basename();
         $suffix = $folder->substr(-3);
         $chapters[] = array(
             'url' => $link->to_s(),
             'desc' => $folder->to_s(),
             'infix' => $suffix->to_s(),
         );
     } while (!$page->next_line()->contain('<!-- END DOWNLOADS -->'));
     return $chapters;
 }
 /**
  * Collect the images of a chapter by visiting every page in its selector.
  *
  * Page URLs are the `value="..."` attributes on the line after
  * `id="page_select"`. The first entry is dropped and the already loaded
  * chapter page is crawled in its place as page 1; the remaining entries
  * are crawled as pages 2, 3, ...
  *
  * @param string $chapter_url URL of the chapter's first page
  * @param string $prefix      forwarded to crawl_page()
  * @param mixed  $infix       chapter identifier, passed through Text::pad(3)
  * @return array union of the arrays returned by crawl_page(); on duplicate
  *               keys the earliest page wins (array `+` semantics)
  */
 public function get_images($chapter_url, $prefix, $infix)
 {
     $chapter = Text::create($infix)->pad(3)->to_s();
     $page = new Page($chapter_url);
     // Read the page list before crawling the current page.
     $page->go_line('id="page_select"');
     $urls = $page->next_line()->extract_to_array('value="', '"');
     $images = $this->crawl_page($page, $prefix, $chapter, 1);
     // array_shift() reindexes, so key 0 is page number 2.
     array_shift($urls);
     foreach ($urls as $offset => $page_url) {
         $next = new Page($page_url);
         $images += $this->crawl_page($next, $prefix, $chapter, $offset + 2);
     }
     return $images;
 }
Example #6
0
 /**
  * Build the chapter list from a series page.
  *
  * Scans from the `class="list"` line up to the line containing
  * `</article>`. A line counts as a chapter entry when it contains both
  * `class="title"` and `title=`.
  *
  * @param string $base URL of the series page
  * @return array list of array('url' => href, 'desc' => title attribute,
  *               'infix' => basename of href)
  */
 public function extract_info($base)
 {
     $page = new Page($base);
     $page->go_line('class="list"');
     $chapters = array();
     do {
         if (!$page->curr_line()->contain('class="title"')
             || !$page->curr_line()->contain('title=')) {
             continue;
         }
         // Each extraction works on its own copy of the line.
         $row = $page->curr_line()->dup();
         $link = $row->dup()->cut_between('href="', '"')->to_s();
         $title = $row->dup()->cut_between('title="', '">')->to_s();
         $chapters[] = array(
             'url' => $link,
             'desc' => $title,
             'infix' => basename($link),
         );
     } while (!$page->next_line()->contain('</article>'));
     return $chapters;
 }
 /**
  * Build the chapter list from a striped-table series page.
  *
  * Scans from the `<table class="table table-striped">` line up to the line
  * containing `</table>`. For each line with an `href="`, the link target
  * is the URL, the text between `">` and `</a` is the description, and the
  * first run of digits in the description is the infix.
  *
  * @param string $base URL of the series page
  * @return array list of array('url' => ..., 'desc' => ..., 'infix' => ...)
  */
 public function get_info($base)
 {
     $page = new Page($base);
     $page->go_line('<table class="table table-striped">');
     $chapters = array();
     do {
         if (!$page->curr_line()->contain('href="')) {
             continue;
         }
         $row = $page->curr_line();
         $link = $row->cut_between('href="', '"');
         $label = $row->cut_between('">', '</a');
         $digits = $label->regex_match('/(\\d+)/');
         $chapters[] = array(
             'url' => $link->to_s(),
             'desc' => $label->to_s(),
             'infix' => $digits[1],
         );
     } while (!$page->next_line()->contain('</table>'));
     return $chapters;
 }
Example #8
0
 /**
  * Print one HTML link per page listed in a chapter's page selector.
  *
  * Page file names are the `<option>` texts between the `data[pages]` line
  * and the line containing `</select>`. The image directory is the dirname
  * of the `<img src="...">` on the `scanlations` line. Every page is assumed
  * to live in that directory under its option text.
  *
  * @param string $url chapter page URL
  * @param string $ifx chapter infix used in the link label
  * @return void output is echoed
  */
 public function crawl_page($url, $ifx)
 {
     $page = new Page($url);
     $page->go_line('data[pages]');
     $names = array();
     do {
         $option = $page->curr_line();
         if (!$option->contain('</option>')) {
             continue;
         }
         $names[] = $option->cut_between('>', '</option')->to_s();
     } while (!$page->next_line()->contain('</select>'));
     $page->go_line('scanlations');
     $folder = dirname($page->curr_line()->cut_between('<img src="', '"')->to_s());
     foreach ($names as $name) {
         $target = $folder . '/' . $name;
         echo "<a href='{$target}'>{$this->prefix}-{$ifx}-{$name}</a><br/>\n";
     }
 }
 /**
  * Build the chapter list from a series page whose table sits on one line.
  *
  * The line following `<div class="divThickBorder" style="padding:7px">` is
  * split on `<tr>`. Every fragment containing an `href="` yields one entry:
  * the link target, the tag-stripped link text, and the chapter number that
  * precedes " :" in that text.
  *
  * @param string $base URL of the series page
  * @return array list of array('url' => ..., 'desc' => ..., 'infix' => ...);
  *               'infix' is null when the text has no "<number> :" part
  */
 public function get_info($base)
 {
     $p = new Page($base);
     $p->go_line('<div class="divThickBorder" style="padding:7px">');
     $raw = $p->next_line()->dup();
     $list = array();
     foreach (explode('<tr>', $raw->to_s()) as $line) {
         $tline = new Text($line);
         if (!$tline->contain('href="')) {
             continue;
         }
         $href = $tline->dup()->cut_between('href="', '"')->to_s();
         $desc = $tline->dup()->cut_between('">', '</a')->to_s();
         // preg_match() leaves $m empty when nothing matches, so reading
         // $m[1] unconditionally raised an undefined-offset notice.
         $infix = preg_match('/([\\.\\d]+) :/', $desc, $m) ? $m[1] : null;
         $list[] = array('url' => $href, 'desc' => strip_tags($desc), 'infix' => $infix);
     }
     return $list;
 }
Example #10
0
 /**
  * Crawl every page of a chapter through crawl_page().
  *
  * Page URLs are the `value="..."` attributes on the line after
  * `id="page_select"`. The first entry is discarded and the already loaded
  * chapter page is crawled in its place; each remaining URL is loaded and
  * crawled in order.
  *
  * @param array $v chapter record; the 'url' and 'infix' keys are read
  * @return void
  */
 public function crawl_chapter($v)
 {
     $chapter = Text::create($v['infix'])->pad(3)->to_s();
     $page = new Page($v['url']);
     // Read the page list before crawling the current page.
     $page->go_line('id="page_select"');
     $queue = $page->next_line()->extract_to_array('value="', '"');
     $this->crawl_page($page, $chapter);
     // Drop the first entry, then drain the rest in order.
     array_shift($queue);
     while ($queue) {
         $next = new Page(array_shift($queue));
         $this->crawl_page($next, $chapter);
     }
     // A parallel variant via Manga_Crawler::multiProcess(4, ...) was once
     // sketched here in the original but is not enabled.
 }
Example #11
0
 /**
  * Crawl every page of a chapter through crawl_page().
  *
  * Page ids are the `value="..."` attributes of the second line matching
  * `select class="cbo_wpm_pag"`. The first id is skipped and the already
  * loaded chapter page is crawled in its place, after rewinding it. Every
  * other page is fetched from `<chapter url><id>/`.
  *
  * @param array $v chapter record; the 'url' and 'infix' keys are read
  * @return void
  */
 public function crawl_chapter($v)
 {
     $chapter = Text::create($v['infix'])->pad(3)->to_s();
     $first = new Page($v['url']);
     // Skip past the first selector so the second search finds the next one.
     $first->go_line('select class="cbo_wpm_pag"');
     $first->next_line();
     $first->go_line('select class="cbo_wpm_pag"');
     $ids = $first->curr_line()->extract_to_array('value="', '"');
     // Rewind so crawl_page() sees the page from its first line.
     $first->reset_line();
     $this->crawl_page($first, $chapter);
     // The first id belongs to the page that was just crawled.
     foreach (array_slice($ids, 1) as $id) {
         $this->crawl_page(new Page($v['url'] . $id . '/'), $chapter);
     }
 }
 /**
  * Collect chapter images from the page's inline `lstImages` script.
  *
  * Scans from the `var lstImages` line up to the line containing
  * `new Array()`. Every `lstImages.push("...")` line contributes one image.
  * Images are numbered from 1 in order of appearance.
  *
  * The page is requested with the `become_firefox` option enabled.
  *
  * @param string $chapter_url URL of the chapter page
  * @param string $prefix      leading part of every generated key
  * @param mixed  $infix       chapter identifier, passed through Text::pad(3)
  * @return array map of "{prefix}-{infix}-{padded index}{suffix}" => image
  *               value as returned by cut_between(); the suffix is derived
  *               from the image basename via cut_rfrom('.') and
  *               cut_before('?') (presumably the file extension without any
  *               query string - TODO confirm against Text)
  */
 public function get_images($chapter_url, $prefix, $infix)
 {
     $ifx = Text::create($infix)->pad(3)->to_s();
     $p = new Page($chapter_url, array('become_firefox' => true));
     // grab list of pages
     $p->go_line('var lstImages');
     $i = 1;
     $result = array();
     do {
         if ($p->curr_line()->contain('lstImages.push')) {
             $line = $p->curr_line();
             $img = $line->cut_between('push("', '")');
             $iname = Text::create($i++)->pad(3)->to_s() . Text::create(basename($img))->cut_rfrom('.')->cut_before('?')->to_s();
             $name = "{$prefix}-{$ifx}-{$iname}";
             $result[$name] = $img;
         }
     } while (!$p->next_line()->contain('new Array()'));
     // The original extracted an unused $pages array here; it was dead code.
     return $result;
 }
 /**
  * Collect the images of a chapter by visiting every page in its selector.
  *
  * Page ids are the `value="..."` attributes of the second line matching
  * `select class="cbo_wpm_pag"`. The first id is skipped and the already
  * loaded chapter page is crawled in its place, after rewinding it. Every
  * other page is fetched from `<chapter url><id>/`.
  *
  * @param string $chapter_url URL of the chapter's first page
  * @param string $prefix      forwarded to crawl_page()
  * @param mixed  $infix       chapter identifier, passed through Text::pad(3)
  * @return array union of the arrays returned by crawl_page(); on duplicate
  *               keys the earliest page wins (array `+` semantics)
  */
 public function get_images($chapter_url, $prefix, $infix)
 {
     $chapter = Text::create($infix)->pad(3)->to_s();
     $first = new Page($chapter_url);
     // Skip past the first selector so the second search finds the next one.
     $first->go_line('select class="cbo_wpm_pag"');
     $first->next_line();
     $first->go_line('select class="cbo_wpm_pag"');
     $ids = $first->curr_line()->extract_to_array('value="', '"');
     // Rewind so crawl_page() sees the page from its first line.
     $first->reset_line();
     $images = $this->crawl_page($first, $prefix, $chapter);
     // The first id belongs to the page that was just crawled.
     foreach (array_slice($ids, 1) as $id) {
         $next = new Page($chapter_url . $id . '/');
         $images += $this->crawl_page($next, $prefix, $chapter);
     }
     return $images;
 }
 /**
  * Download every page image of the given chapters.
  *
  * Each chapter page is scanned from its `pages[1]=` line for as long as
  * consecutive lines contain `pages[`. The quoted value on each line is the
  * image URL. Files are named "{prefix}-{infix}-{padded index}.{ext}" and
  * existing files are not downloaded again.
  *
  * The prefix and target directory used to be hard-coded; they are now
  * optional parameters whose defaults are the former values.
  *
  * @param array  $chapters map of infix => chapter URL
  * @param string $prefix   leading part of every file name
  * @param string $dir      destination prefix; the file name is appended
  *                         directly, so it should end with a path separator
  * @return void progress is echoed
  */
 private function download_chapters($chapters, $prefix = 'Birth_of_Levi', $dir = '/home/khandar-gdp/tmp/birth of levi/')
 {
     foreach ($chapters as $infix => $url) {
         echo "Opening {$url}...\n";
         $p = new Page($url);
         $p->go_line('pages[1]=');
         // Running page number within the chapter, used as the file suffix.
         $i = 1;
         do {
             $line = $p->curr_line();
             $img = $line->cut_between('="', '"');
             $suffix = Text::create($i++)->pad(3)->to_s();
             $ext = $img->cut_rafter('.');
             $filename = "{$dir}{$prefix}-{$infix}-{$suffix}.{$ext}";
             // Skip files already on disk so an interrupted run can resume.
             if (!is_file($filename)) {
                 download_it($img->to_s(), $filename);
             }
         } while ($p->next_line()->contain('pages['));
     }
 }
Example #15
0
 /**
  * Map each volume to the last chapter ("Round") it contains.
  *
  * Starts from the Wikipedia chapter list, follows every "Main article:"
  * link, and on each sub-page walks the `<td id="vol...` volume cells. For
  * every volume, the number of the last `<li>Round N:` line seen before the
  * closing `</table>` is recorded.
  *
  * @return array map of volume label => last chapter number (string), or
  *               null when the volume block has no "Round" line
  */
 public function grab_volume_chapters()
 {
     $p = new Page('http://en.wikipedia.org/wiki/List_of_Hajime_no_Ippo_chapters');
     $list = array();
     while (!$p->end_of_line()) {
         $p->go_line('Main article:');
         if ($p->end_of_line()) {
             break;
         }
         $href = 'http://en.wikipedia.org' . $p->curr_line()->dup()->cut_between('href="', '"')->to_s();
         $p2 = new Page($href);
         while (!$p2->end_of_line()) {
             try {
                 $p2->go_line('<td id="vol');
                 $vol = $p2->curr_line()->dup()->cut_between('">', '<')->to_s();
                 // Reset per volume. Without this, a volume with no "Round"
                 // line got the previous volume's chapter, or an undefined
                 // variable on the first volume.
                 $last_chapter = null;
                 do {
                     if ($p2->curr_line()->contain('<li>Round ')) {
                         $last_chapter = $p2->curr_line()->dup()->cut_between('Round ', ':')->to_s();
                     }
                 } while (!$p2->next_line()->contain('</table>'));
                 $list[$vol] = $last_chapter;
             } catch (Exception $e) {
                 // Any failure while scanning ends this sub-page; presumably
                 // raised once no further volume cell exists - TODO confirm.
                 break;
             }
         }
         // Step past the current "Main article:" line before searching again.
         $p->next_line();
     }
     return $list;
 }
Example #16
0
 /**
  * Import entries that appeared since the last update.
  *
  * Walks the listing starting at Hbrowse::$update, page by page, collecting
  * the last href of every `class="browseDescription"` line. It stops at the
  * first link that is_already_exist() reports as known, or when no
  * "jump to next page" link remains. The collected links are de-duplicated,
  * reversed, and each is loaded, parsed with extract_from_page() and stored
  * with add_hmanga().
  *
  * @return void progress is echoed
  */
 public function action_update()
 {
     $page_url = Hbrowse::$update;
     $pending = array();
     $done = false;
     do {
         $listing = new Page($page_url);
         $listing->go_line('id="main"');
         do {
             $row = $listing->curr_line();
             if (!$row->contain('class="browseDescription"')) {
                 continue;
             }
             $hrefs = $row->extract_to_array('href="', '"');
             $link = rtrim(end($hrefs), '/');
             if ($this->is_already_exist($link)) {
                 // Reached an entry that is already stored: stop collecting.
                 $done = true;
                 break;
             }
             $pending[] = $link;
         } while (!$listing->next_line()->contain('jump to next page'));
         // Check whether there is a next page to follow.
         $nav = $listing->curr_line();
         if (!$nav->contain('"jump to next page')) {
             $done = true;
         } else {
             $hrefs = $nav->extract_to_array('href="', '"');
             $page_url = end($hrefs);
         }
     } while (!$done);
     // $pending now holds the links ready to be inserted, in reverse order
     // of discovery.
     $pending = array_reverse(array_unique($pending));
     foreach ($pending as $link) {
         echo $link . "<br>\n";
         flush();
         // Link shape: http://www.hbrowse.com/10001/c00001
         $entry = new Page($link);
         $data = $this->extract_from_page($entry);
         // Store it in the database.
         $this->add_hmanga($data);
     }
 }
Example #17
0
/**
 * Insert gallery images uploaded since the last run.
 *
 * Walks http://www.redflava.com/gallery/thumbnails.php?album=lastup&cat=0
 * from its first page. Every image not yet in the `image` table is inserted
 * until one is found that already exists; everything after that pivot is
 * assumed to be stored already. The walk also ends when there is no "Next"
 * link.
 *
 * When an image belongs to an album missing from the `album` table, the
 * album is handed to fetch_new_album() and the image itself is skipped here.
 *
 * @return void progress is echoed
 */
function stage_update()
{
    $url = 'http://www.redflava.com/gallery/thumbnails.php?album=lastup&cat=0';
    // Becomes true once an already stored image, or the last page, is reached.
    $hit_pivot = false;
    do {
        $listing = new Page($url);
        $listing->go_line('class="maintable');
        do {
            if (!$listing->curr_line()->contain('class="image')) {
                continue;
            }
            // One thumbnail cell: detail link plus thumbnail source.
            $m = $listing->curr_line()->regex_match('/href="([^"]+)".+src="([^"]+)"/');
            list(, $href, $src) = $m;
            $full = str_replace('/thumb_', '/', $src);
            echo "{$full}\n";
            // Stop as soon as an image that is already stored shows up.
            $known = G::$db->from('image')->where('full', $full)->doFetchRow();
            if ($known) {
                $hit_pivot = true;
                continue;
            }
            // The detail page carries the link identifying the album.
            $detail = new Page(G::$base . htmlspecialchars_decode($href));
            $detail->go_line('class="alblink"');
            $m = $detail->curr_line()->regex_match('/href="(thumb[^"]+)"/');
            $link = $m[1];
            $album = G::$db->from('album')->where('link', $link)->doFetchRow();
            if (!$album) {
                // Album not stored yet: fetch it and skip this image.
                echo "ALBUM BARU!! {$link}\n";
                fetch_new_album($link);
                continue;
            }
            G::$db->from('image')->set(array('album_id' => $album['id'], 'full' => $full))->doInsert();
        } while (!$hit_pivot && !$listing->next_line()->contain('<script'));
        // Move on to the next listing page, if any.
        $listing->go_line('title="Next"');
        if ($listing->curr_line()->contain('title="Next"')) {
            $m = $listing->curr_line()->regex_match('/href="([^"]+)"/');
            $url = G::$base . htmlspecialchars_decode($m[1]);
        } else {
            // No next page: finish regardless of the flag's previous value.
            $hit_pivot = true;
        }
    } while (!$hit_pivot);
}
Example #18
0
    ?>
<h1>3</h1>
<?php 
    foreach ($info as $k => $v) {
        $ifx = Text::create($v['infix'])->pad(3)->to_s();
        $b = break_url($v['url']);
        extract($b);
        $P = new Page($v['url']);
        // Grab all pages
        $pages = array();
        $P->go_line('id="Serie_pages"');
        do {
            if ($P->curr_line()->contain('<option')) {
                $pages[] = $P->curr_line()->dup()->cut_between('">', '</')->to_s();
            }
        } while (!$P->next_line()->contain('</select>'));
        array_shift($pages);
        // Grab this page's image
        $P->go_line('id="manga_img"');
        $src = $P->curr_line()->dup()->cut_between('src="', '"')->to_s();
        $name = basename($src);
        echo "<a href='{$domain}{$src}'>{$prefix}-{$ifx}-{$name}</a><br/>\n";
        // Now for the other pages
        foreach ($pages as $p) {
            $the_url = "{$basic_url}{$title}/{$chapter_id}/{$chapter_text}/{$p}/";
            $P = new Page($the_url);
            $P->go_line('id="manga_img"');
            $src = $P->curr_line()->dup()->cut_between('src="', '"')->to_s();
            $name = basename($src);
            echo "<a href='{$domain}{$src}'>{$prefix}-{$ifx}-{$name}</a><br/>\n";
        }
 /**
  * List the images of a chapter from its page selector.
  *
  * Page file names are the `<option>` texts between the `data[pages]` line
  * and the line containing `</select>`. The image directory is the dirname
  * of the `<img src="...">` on the `scanlations` line. Every page is assumed
  * to live in that directory under its option text.
  *
  * @param string $url    chapter page URL
  * @param string $prefix leading part of every generated key
  * @param string $ifx    chapter infix used in every generated key
  * @return array map of "{prefix}-{ifx}-{page}" => "{image dir}/{page}"
  */
 public function crawl_page($url, $prefix, $ifx)
 {
     $page = new Page($url);
     $page->go_line('data[pages]');
     $names = array();
     do {
         $option = $page->curr_line();
         if (!$option->contain('</option>')) {
             continue;
         }
         $names[] = $option->cut_between('>', '</option')->to_s();
     } while (!$page->next_line()->contain('</select>'));
     $page->go_line('scanlations');
     $folder = dirname($page->curr_line()->cut_between('<img src="', '"')->to_s());
     $images = array();
     foreach ($names as $name) {
         $key = $prefix . '-' . $ifx . '-' . $name;
         $images[$key] = $folder . '/' . $name;
     }
     return $images;
 }
Example #20
0
/**
 * Print one HTML link per full-size image of a pururin gallery.
 *
 * URL shapes seen on the site:
 *   http://pururin.com/hentai-manga/1673/pai-zuri.html              entry
 *   http://pururin.com/hentai-manga/1673/gallery/pai-zuri_1.html    gallery
 *   http://pururin.com/hentai-manga/1673/view/pai-zuri_1.html       full image
 *
 * `/gallery/` in $url is rewritten to `/thumbs/`. All thumbnails are now on
 * a single page, so exactly one listing is fetched: the hrefs on the line
 * after `class="thumblist"` are opened one by one and the `src` on each
 * `class="b"` line is printed. The link text is the basename of the
 * incoming URL minus its last five characters.
 *
 * @param string $url gallery URL
 * @return void output is echoed
 */
function pururin($url)
{
    $base = 'http://pururin.com';
    $title = substr(basename($url), 0, -5);
    $listing = new Page(str_replace('/gallery/', '/thumbs/', $url));
    $listing->go_line('class="thumblist"');
    $hrefs = $listing->next_line()->extract_to_array('href="', '"');
    foreach ($hrefs as $href) {
        $view = new Page($base . $href);
        $view->go_line('class="b"');
        $src = $view->curr_line()->cut_between('src="', '"');
        echo "<a href='{$base}{$src}'>{$title}</a><br>\n";
    }
}
Example #21
0
 /**
  * Print image links for every gallery linked from this page.
  *
  * Scans this page from the `Hayley Marie` line up to the line containing
  * `id="vr_nav"`. Every line with an `href=` is matched against
  * self::REG_HREF followed by a title attribute. The linked page is opened
  * and every `href='...'` between its `id='form'` line and `</form>` is
  * printed as a link labelled with the title. When the title is empty the
  * label falls back to 'asdf' followed by $this->current_i.
  *
  * Links containing `imageboss.net` are rewritten in place from the view
  * form to the image form.
  *
  * @return void output is echoed
  */
 public function process()
 {
     $this->go_line('Hayley Marie');
     do {
         if (!$this->curr_line()->exist('href=')) {
             continue;
         }
         $m = $this->curr_line()->regex_match('/' . self::REG_HREF . '\\s+title=["\']([^"\']*)["\']/');
         $target = $m[1];
         $label = $m[2] ? $m[2] : 'asdf' . $this->current_i;
         $gallery = new Page($target);
         $gallery->go_line("id='form'");
         do {
             if (!$gallery->curr_line()->exist("href='")) {
                 continue;
             }
             $img = $gallery->curr_line()->dup()->cut_between("href='", "'");
             if ($img->exist('imageboss.net')) {
                 // The result is not reassigned: replace() is relied on to
                 // modify $img itself.
                 $img->replace('/view/', '/img/')->replace('-', '/');
             }
             echo "<a href='{$img}'>{$label}</a><br />\n";
         } while (!$gallery->next_line()->exist('</form>'));
     } while (!$this->next_line()->exist('id="vr_nav"'));
 }