/**
 * Build a map of download names to image sources for one chapter.
 *
 * The first page supplies the "of N" page counter; pages 2..N are then
 * loaded as "<dirname of $chapter_url>/<number>.html".
 *
 * @param string $chapter_url URL of the chapter's first page
 * @param string $prefix      leading part of every generated name
 * @param mixed  $infix       chapter identifier, passed through Text::pad(3)
 * @return array "{prefix}-{infix}-{image basename}" => image source as cut from the page
 */
public function get_images($chapter_url, $prefix, $infix) {
    $padded = Text::create($infix)->pad(3)->to_s();
    $page = new Page($chapter_url);

    // Read the total page count from the "of N" line following the top bar.
    $page->go_line('id="top_bar"');
    $page->go_line_regex('/of \\d+\\w+/');
    $counter = $page->curr_line()->regex_match('/of (\\d+)/');
    $total = $counter[1];

    $images = array();
    $number = 1;
    while (true) {
        // The line holding the image source is two lines past the viewer marker.
        $page->go_line('id="viewer"');
        $page->next_line(2);
        $source = $page->curr_line()->cut_between('src="', '"');
        $images["{$prefix}-{$padded}-" . basename($source)] = $source;

        $number++;
        if ($number > $total) {
            break;
        }
        $page = new Page(dirname($chapter_url) . '/' . $number . '.html');
    }

    return $images;
}
/**
 * Print one HTML link per page image of a chapter.
 *
 * The first page supplies the "of N" page counter; pages 2..N are then
 * loaded as "<dirname of chapter URL>/<number>.html".
 *
 * @param array $v chapter record; the keys 'url' and 'infix' are read
 * @return void
 */
public function crawl_chapter($v) {
    $padded = Text::create($v['infix'])->pad(3)->to_s();
    $prefix = $this->prefix;
    $page = new Page($v['url']);

    // Read the total page count from the "of N" line following the top bar.
    $page->go_line('id="top_bar"');
    $page->go_line_regex('/of \\d+\\w+/');
    $counter = $page->curr_line()->regex_match('/of (\\d+)/');
    $total = $counter[1];

    $number = 1;
    while (true) {
        // The line holding the image source is two lines past the viewer marker.
        $page->go_line('id="viewer"');
        $page->next_line(2);
        $source = $page->curr_line()->cut_between('src="', '"');
        $name = basename($source);
        echo "<a href='{$source}'>{$prefix}-{$padded}-{$name}</a><br>\n";

        $number++;
        if ($number > $total) {
            break;
        }
        $page = new Page(dirname($v['url']) . '/' . $number . '.html');
    }
}
/**
 * Walk a paginated thumbnail listing and download every original image.
 *
 * For each listing page, every line containing an href between the first
 * 'class="thumb"' line and the '<center>' line is followed to its detail
 * page, whose "Original image" link is handed to download_it(). The walk
 * continues while the paginator line offers an alt="next" link.
 *
 * @param string $url first listing page to visit
 * @param string $dir target directory; the image basename is appended to it
 *                    directly, so it presumably must end with a separator
 * @return void
 */
private function collect_images($url, $dir) {
    $domain = 'http://rule34.xxx/';
    $base = 'http://rule34.xxx/index.php';

    while (true) {
        echo $url . "\n";
        $listing = new Page($url);
        $listing->go_line('class="thumb"');

        do {
            if ($listing->curr_line()->contain('href="')) {
                $href = htmlspecialchars_decode(
                    $listing->curr_line()->cut_between('href="', '"')->to_s()
                );
                echo "{$domain}{$href}\n";

                $detail = new Page($domain . $href);
                $detail->go_line('Original image');
                // The scheme is part of the cut marker, so it is put back afterwards.
                $src = 'http:' . $detail->curr_line()->cut_between('href="http:', '"')->to_s();
                $outpath = $dir . basename($src);
                download_it($src, $outpath, "--header=\"Accept: image/*\"");
            }
        } while (!$listing->next_line()->contain('<center>'));

        // Rewind before searching for the paginator.
        $listing->reset_line();
        $listing->go_line('id="paginator"');
        if (!$listing->curr_line()->contain('alt="next"')) {
            return;
        }
        $next = $listing->curr_line()->regex_match('/href="([^"]+)" alt="next"/');
        $url = $base . html_entity_decode($next[1]);
    }
}
/**
 * Collect download entries listed between the START FILE LOOP and
 * END DOWNLOADS markers of a page.
 *
 * The description is the basename of the URL's grandparent directory and
 * the infix is that description's last three characters.
 *
 * @param string $base URL of the listing page
 * @return array list of arrays with the keys 'url', 'desc' and 'infix'
 */
public function extract_info($base) {
    $page = new Page($base);
    $page->go_line('<!-- START FILE LOOP -->');

    $entries = array();
    while (true) {
        if ($page->curr_line()->contain('href=')) {
            $url = $page->curr_line()->cut_between("</td><td><a href='", "'");
            $desc = $url->dirname()->dirname()->basename();
            $infix = $desc->substr(-3);
            $entries[] = array(
                'url' => $url->to_s(),
                'desc' => $desc->to_s(),
                'infix' => $infix->to_s(),
            );
        }
        if ($page->next_line()->contain('<!-- END DOWNLOADS -->')) {
            break;
        }
    }

    return $entries;
}
/**
 * Gather the images of every page of a chapter via crawl_page().
 *
 * The page URLs come from the line after the 'id="page_select"' marker.
 * The already-loaded first page is crawled as page 1; the remaining URLs
 * are crawled as pages 2, 3, ...
 *
 * @param string $chapter_url URL of the chapter's first page
 * @param string $prefix      forwarded to crawl_page()
 * @param mixed  $infix       chapter identifier, passed through Text::pad(3)
 * @return array union of the arrays returned by crawl_page(); on duplicate
 *               keys the earlier page's entry is kept
 */
public function get_images($chapter_url, $prefix, $infix) {
    $padded = Text::create($infix)->pad(3)->to_s();

    $first = new Page($chapter_url);
    $first->go_line('id="page_select"');
    $page_urls = $first->next_line()->extract_to_array('value="', '"');

    $images = $this->crawl_page($first, $prefix, $padded, 1);

    // Drop the first entry: that page has just been crawled.
    array_shift($page_urls);
    foreach ($page_urls as $index => $page_url) {
        $page = new Page($page_url);
        $images += $this->crawl_page($page, $prefix, $padded, $index + 2);
    }

    return $images;
}
/**
 * Collect chapter links from the 'class="list"' section of a page.
 *
 * Only lines containing both 'class="title"' and 'title=' are used; the
 * scan stops at the first line containing '</article>'.
 *
 * @param string $base URL of the chapter listing page
 * @return array list of arrays with the keys 'url', 'desc' (the title
 *               attribute) and 'infix' (basename of the URL)
 */
public function extract_info($base) {
    $page = new Page($base);
    $page->go_line('class="list"');

    $chapters = array();
    do {
        $is_title_line = $page->curr_line()->contain('class="title"')
            && $page->curr_line()->contain('title=');
        if (!$is_title_line) {
            continue;
        }

        // Work on copies so each cut starts from the full line.
        $line = $page->curr_line()->dup();
        $href = $line->dup()->cut_between('href="', '"')->to_s();
        $desc = $line->dup()->cut_between('title="', '">')->to_s();
        $chapters[] = array('url' => $href, 'desc' => $desc, 'infix' => basename($href));
    } while (!$page->next_line()->contain('</article>'));

    return $chapters;
}
/**
 * Collect chapter links from the striped chapter table of a page.
 *
 * Every line containing an href, up to the first line containing
 * '</table>', yields one entry; the infix is the first run of digits in
 * the link text.
 *
 * @param string $base URL of the chapter listing page
 * @return array list of arrays with the keys 'url', 'desc' and 'infix'
 */
public function get_info($base) {
    $page = new Page($base);
    $page->go_line('<table class="table table-striped">');

    $chapters = array();
    while (true) {
        if ($page->curr_line()->contain('href="')) {
            $line = $page->curr_line();
            $href = $line->cut_between('href="', '"');
            $desc = $line->cut_between('">', '</a');
            $digits = $desc->regex_match('/(\\d+)/');
            $infix = $digits[1];
            $chapters[] = array(
                'url' => $href->to_s(),
                'desc' => $desc->to_s(),
                'infix' => $infix,
            );
        }
        if ($page->next_line()->contain('</table>')) {
            break;
        }
    }

    return $chapters;
}
/**
 * Print one HTML link per image of a chapter.
 *
 * The image file names are the option labels found between the
 * 'data[pages]' line and the closing '</select>'; the image directory is
 * the dirname of the '<img src' on the first 'scanlations' line.
 *
 * @param string $url chapter page URL
 * @param string $ifx chapter infix used in the link text
 * @return void
 */
public function crawl_page($url, $ifx) {
    $page = new Page($url);
    $page->go_line('data[pages]');

    $names = array();
    while (true) {
        $line = $page->curr_line();
        if ($line->contain('</option>')) {
            $names[] = $line->cut_between('>', '</option')->to_s();
        }
        if ($page->next_line()->contain('</select>')) {
            break;
        }
    }

    $page->go_line('scanlations');
    $imgbase = dirname($page->curr_line()->cut_between('<img src="', '"')->to_s());

    foreach ($names as $name) {
        echo "<a href='{$imgbase}/{$name}'>{$this->prefix}-{$ifx}-{$name}</a><br/>\n";
    }
}
/**
 * Collect chapter links from the line following the "divThickBorder" div.
 *
 * That line is split on '<tr>'; every fragment containing an href yields
 * one entry. The infix is the number (digits and dots) that precedes " :"
 * in the link text.
 *
 * @param string $base URL of the chapter listing page
 * @return array list of arrays with the keys 'url', 'desc' (tags stripped)
 *               and 'infix' (null when the link text has no "N :" part)
 */
public function get_info($base) {
    $p = new Page($base);
    $p->go_line('<div class="divThickBorder" style="padding:7px">');
    $raw = $p->next_line()->dup();

    $list = array();
    foreach (explode('<tr>', $raw->to_s()) as $line) {
        $tline = new Text($line);
        if (!$tline->contain('href="')) {
            continue;
        }

        $href = $tline->dup()->cut_between('href="', '"')->to_s();
        $desc = $tline->dup()->cut_between('">', '</a')->to_s();

        // Only read the capture group when the pattern matched; otherwise
        // $m is empty and $m[1] would raise an undefined-offset notice.
        $infix = preg_match('/([\\.\\d]+) :/', $desc, $m) ? $m[1] : null;

        $list[] = array('url' => $href, 'desc' => strip_tags($desc), 'infix' => $infix);
    }

    return $list;
}
/**
 * Crawl every page of a chapter with crawl_page().
 *
 * The page URLs come from the line after the 'id="page_select"' marker.
 * The already-loaded first page is crawled directly; the first URL of the
 * list is dropped and the rest are loaded and crawled in order.
 *
 * @param array $v chapter record; the keys 'url' and 'infix' are read
 * @return void
 */
public function crawl_chapter($v) {
    $padded = Text::create($v['infix'])->pad(3)->to_s();

    $first = new Page($v['url']);
    $first->go_line('id="page_select"');
    $page_urls = $first->next_line()->extract_to_array('value="', '"');

    $this->crawl_page($first, $padded);

    // Drop the first entry: that page has just been crawled.
    array_shift($page_urls);
    foreach ($page_urls as $page_url) {
        $page = new Page($page_url);
        $this->crawl_page($page, $padded);
    }
}
/**
 * Crawl every page of a chapter with crawl_page().
 *
 * Page suffixes are read from a 'select class="cbo_wpm_pag"' line. Each
 * page after the first is loaded from "<chapter url><suffix>/".
 *
 * @param array $v chapter record; the keys 'url' and 'infix' are read
 * @return void
 */
public function crawl_chapter($v) {
    $padded = Text::create($v['infix'])->pad(3)->to_s();
    $first = new Page($v['url']);

    // Search, step one line, search again — presumably to land on the
    // second selector on the page (TODO confirm against Page::go_line).
    $first->go_line('select class="cbo_wpm_pag"');
    $first->next_line();
    $first->go_line('select class="cbo_wpm_pag"');
    $suffixes = $first->curr_line()->extract_to_array('value="', '"');

    // Rewind so crawl_page() starts from the top of the first page.
    $first->reset_line();
    $this->crawl_page($first, $padded);

    // Drop the first entry: that page has just been crawled.
    array_shift($suffixes);
    foreach ($suffixes as $suffix) {
        $page = new Page($v['url'] . $suffix . '/');
        $this->crawl_page($page, $padded);
    }
}
/**
 * Build a map of download names to image URLs from a chapter's inline
 * "lstImages.push(...)" script lines.
 *
 * Scanning starts at the 'var lstImages' line and stops at the first later
 * line containing 'new Array()'. Names are
 * "{prefix}-{infix}-{NNN}{ext}", where NNN is a running counter passed
 * through Text::pad(3) and ext is taken from the image basename via
 * cut_rfrom('.') / cut_before('?').
 *
 * @param string $chapter_url URL of the chapter page
 * @param string $prefix      leading part of every generated name
 * @param mixed  $infix       chapter identifier, passed through Text::pad(3)
 * @return array generated name => image URL as cut from the page
 */
public function get_images($chapter_url, $prefix, $infix) {
    $ifx = Text::create($infix)->pad(3)->to_s();
    $p = new Page($chapter_url, array('become_firefox' => true));
    $p->go_line('var lstImages');

    $i = 1;
    $result = array();
    do {
        if ($p->curr_line()->contain('lstImages.push')) {
            $line = $p->curr_line();
            $img = $line->cut_between('push("', '")');
            $iname = Text::create($i++)->pad(3)->to_s()
                . Text::create(basename($img))->cut_rfrom('.')->cut_before('?')->to_s();
            $result["{$prefix}-{$ifx}-{$iname}"] = $img;
        }
    } while (!$p->next_line()->contain('new Array()'));

    // The former trailing "$pages = ...extract_to_array(...)" was never read
    // and has been removed.
    return $result;
}
/**
 * Gather the images of every page of a chapter via crawl_page().
 *
 * Page suffixes are read from a 'select class="cbo_wpm_pag"' line. Each
 * page after the first is loaded from "<chapter url><suffix>/".
 *
 * @param string $chapter_url URL of the chapter's first page
 * @param string $prefix      forwarded to crawl_page()
 * @param mixed  $infix       chapter identifier, passed through Text::pad(3)
 * @return array union of the arrays returned by crawl_page(); on duplicate
 *               keys the earlier page's entry is kept
 */
public function get_images($chapter_url, $prefix, $infix) {
    $padded = Text::create($infix)->pad(3)->to_s();
    $first = new Page($chapter_url);

    // Search, step one line, search again — presumably to land on the
    // second selector on the page (TODO confirm against Page::go_line).
    $first->go_line('select class="cbo_wpm_pag"');
    $first->next_line();
    $first->go_line('select class="cbo_wpm_pag"');
    $suffixes = $first->curr_line()->extract_to_array('value="', '"');

    // Rewind so crawl_page() starts from the top of the first page.
    $first->reset_line();
    $images = $this->crawl_page($first, $prefix, $padded);

    // Drop the first entry: that page has just been crawled.
    array_shift($suffixes);
    foreach ($suffixes as $suffix) {
        $page = new Page($chapter_url . $suffix . '/');
        $images += $this->crawl_page($page, $prefix, $padded);
    }

    return $images;
}
/**
 * Download every page image of the given chapters.
 *
 * Each chapter page is scanned from its 'pages[1]=' line for as long as
 * consecutive lines contain 'pages['. Files are written to
 * "{dir}{prefix}-{infix}-{NNN}.{ext}" and skipped when that file already
 * exists.
 *
 * @param array  $chapters map of chapter infix => chapter URL
 * @param string $prefix   file name prefix; defaults to the previously
 *                         hard-coded value
 * @param string $dir      target directory, used verbatim as a path prefix
 *                         (so it must end with a separator); defaults to
 *                         the previously hard-coded value
 * @return void
 */
private function download_chapters($chapters, $prefix = 'Birth_of_Levi', $dir = '/home/khandar-gdp/tmp/birth of levi/') {
    foreach ($chapters as $infix => $url) {
        echo "Opening {$url}...\n";
        $p = new Page($url);
        $p->go_line('pages[1]=');

        $i = 1;
        do {
            $line = $p->curr_line();
            $img = $line->cut_between('="', '"');
            $suffix = Text::create($i++)->pad(3)->to_s();
            $ext = $img->cut_rafter('.');
            $filename = "{$dir}{$prefix}-{$infix}-{$suffix}.{$ext}";
            // Re-runs only fetch files that are not on disk yet.
            if (!is_file($filename)) {
                download_it($img->to_s(), $filename);
            }
        } while ($p->next_line()->contain('pages['));
    }
}
/**
 * Map each volume to the last "Round" chapter listed for it on Wikipedia.
 *
 * The index article is scanned for 'Main article:' lines; each linked
 * article is scanned for '<td id="vol' cells, and within each volume table
 * the last '<li>Round N:' seen before '</table>' is recorded.
 *
 * Note: the last-round value is not reset between volumes, so a volume
 * table without any 'Round' entry reuses the previous volume's value.
 * An Exception raised while scanning an article ends the scan of that
 * article.
 *
 * @return array volume label => last round number (as a string)
 */
public function grab_volume_chapters() {
    $index = new Page('http://en.wikipedia.org/wiki/List_of_Hajime_no_Ippo_chapters');
    $volumes = array();

    while (!$index->end_of_line()) {
        $index->go_line('Main article:');
        if ($index->end_of_line()) {
            break;
        }

        $article_url = 'http://en.wikipedia.org'
            . $index->curr_line()->dup()->cut_between('href="', '"')->to_s();
        $article = new Page($article_url);

        while (!$article->end_of_line()) {
            try {
                $article->go_line('<td id="vol');
                $volume = $article->curr_line()->dup()->cut_between('">', '<')->to_s();

                while (true) {
                    if ($article->curr_line()->contain('<li>Round ')) {
                        $last_round = $article->curr_line()->dup()->cut_between('Round ', ':')->to_s();
                    }
                    if ($article->next_line()->contain('</table>')) {
                        break;
                    }
                }

                $volumes[$volume] = $last_round;
            } catch (Exception $e) {
                break;
            }
        }

        $index->next_line();
    }

    return $volumes;
}
/**
 * Collect not-yet-known entries from the update listing and store them.
 *
 * Listing pages are walked from Hbrowse::$update, following the
 * "jump to next page" link, until an entry that is_already_exist()
 * reports as known is met or no next page is offered. The collected links
 * are de-duplicated, reversed, and each one is loaded, parsed with
 * extract_from_page() and passed to add_hmanga().
 *
 * @return void
 */
public function action_update() {
    $listing_url = Hbrowse::$update;
    $done = false;
    $links = array();

    while (!$done) {
        $page = new Page($listing_url);
        $page->go_line('id="main"');

        do {
            $line = $page->curr_line();
            if (!$line->contain('class="browseDescription"')) {
                continue;
            }
            $hrefs = $line->extract_to_array('href="', '"');
            $href = rtrim(end($hrefs), '/');
            if ($this->is_already_exist($href)) {
                $done = true;
                break;
            }
            $links[] = $href;
        } while (!$page->next_line()->contain('jump to next page'));

        // Check whether a next page is offered.
        $line = $page->curr_line();
        if ($line->contain('"jump to next page')) {
            $hrefs = $line->extract_to_array('href="', '"');
            $listing_url = end($hrefs);
        } else {
            $done = true;
        }
    }

    // $links now holds the links that are ready to be inserted.
    foreach (array_reverse(array_unique($links)) as $link) {
        echo $link . "<br>\n";
        flush();
        // Example link: http://www.hbrowse.com/10001/c00001
        $page = new Page($link);
        $data = $this->extract_from_page($page);
        // Insert into the database.
        $this->add_hmanga($data);
    }
}
/**
 * Import newly uploaded images from the gallery's "last uploads" listing.
 *
 * Source page: http://www.redflava.com/gallery/thumbnails.php?album=lastup&cat=0
 * Starting from the top, images not yet in the database are taken until
 * one that already exists is met (the pivot); everything after it is
 * assumed to exist already. Images whose album is not yet in the database
 * are not inserted here; fetch_new_album() is called for that album
 * instead.
 *
 * @TODO (carried over from the original comment)
 * @return void
 */
function stage_update() {
    $url = 'http://www.redflava.com/gallery/thumbnails.php?album=lastup&cat=0';
    $hit_pivot = false; // has the boundary of new images been reached?

    while (!$hit_pivot) {
        // Fetch the listing page.
        $listing = new Page($url);
        $listing->go_line('class="maintable');

        do {
            if ($listing->curr_line()->contain('class="image')) {
                // One thumbnail item: detail link plus thumbnail source.
                $m = $listing->curr_line()->regex_match('/href="([^"]+)".+src="([^"]+)"/');
                list($all, $href, $src) = $m;
                $full = str_replace('/thumb_', '/', $src);

                // Make sure the image is not in the database yet.
                echo "{$full}\n";
                $known = G::$db->from('image')->where('full', $full)->doFetchRow();

                if ($known) {
                    // Found one that already exists: time to stop.
                    $hit_pivot = true;
                } else {
                    // We have the full image URL and the detail URL;
                    // now find out which album the image belongs to.
                    $detail = new Page(G::$base . htmlspecialchars_decode($href));
                    $detail->go_line('class="alblink"');
                    $m = $detail->curr_line()->regex_match('/href="(thumb[^"]+)"/');
                    $link = $m[1];
                    $album = G::$db->from('album')->where('link', $link)->doFetchRow();

                    if ($album) {
                        // Finally, insert the image.
                        G::$db->from('image')->set(array('album_id' => $album['id'], 'full' => $full))->doInsert();
                    } else {
                        // The album does not exist yet; this image is skipped.
                        echo "ALBUM BARU!! {$link}\n";
                        $album = fetch_new_album($link);
                    }
                }
            }
        } while (!$hit_pivot && !$listing->next_line()->contain('<script'));

        // Move on to the next listing page, if any.
        $listing->go_line('title="Next"');
        if ($listing->curr_line()->contain('title="Next"')) {
            $m = $listing->curr_line()->regex_match('/href="([^"]+)"/');
            $url = G::$base . htmlspecialchars_decode($m[1]);
        } else {
            // No next page: stop regardless of the previous pivot state.
            $hit_pivot = true;
        }
    }
}
?> <h1>3</h1> <?php foreach ($info as $k => $v) { $ifx = Text::create($v['infix'])->pad(3)->to_s(); $b = break_url($v['url']); extract($b); $P = new Page($v['url']); // Grab all pages $pages = array(); $P->go_line('id="Serie_pages"'); do { if ($P->curr_line()->contain('<option')) { $pages[] = $P->curr_line()->dup()->cut_between('">', '</')->to_s(); } } while (!$P->next_line()->contain('</select>')); array_shift($pages); // Grab this page's image $P->go_line('id="manga_img"'); $src = $P->curr_line()->dup()->cut_between('src="', '"')->to_s(); $name = basename($src); echo "<a href='{$domain}{$src}'>{$prefix}-{$ifx}-{$name}</a><br/>\n"; // Now for the other pages foreach ($pages as $p) { $the_url = "{$basic_url}{$title}/{$chapter_id}/{$chapter_text}/{$p}/"; $P = new Page($the_url); $P->go_line('id="manga_img"'); $src = $P->curr_line()->dup()->cut_between('src="', '"')->to_s(); $name = basename($src); echo "<a href='{$domain}{$src}'>{$prefix}-{$ifx}-{$name}</a><br/>\n"; }
/**
 * Build a map of download names to image URLs for one chapter.
 *
 * The image file names are the option labels found between the
 * 'data[pages]' line and the closing '</select>'; the image directory is
 * the dirname of the '<img src' on the first 'scanlations' line.
 *
 * @param string $url    chapter page URL
 * @param string $prefix leading part of every generated name
 * @param string $ifx    chapter infix used in the generated name
 * @return array "{prefix}-{ifx}-{file name}" => "{image dir}/{file name}"
 */
public function crawl_page($url, $prefix, $ifx) {
    $page = new Page($url);
    $page->go_line('data[pages]');

    $names = array();
    while (true) {
        $line = $page->curr_line();
        if ($line->contain('</option>')) {
            $names[] = $line->cut_between('>', '</option')->to_s();
        }
        if ($page->next_line()->contain('</select>')) {
            break;
        }
    }

    $page->go_line('scanlations');
    $imgbase = dirname($page->curr_line()->cut_between('<img src="', '"')->to_s());

    $images = array();
    foreach ($names as $name) {
        $images["{$prefix}-{$ifx}-{$name}"] = "{$imgbase}/{$name}";
    }

    return $images;
}
/**
 * Print one HTML link per full-size image of a pururin gallery.
 *
 * URL shapes, for reference:
 *   http://pururin.com/hentai-manga/1673/pai-zuri.html
 *   http://pururin.com/hentai-manga/1673/gallery/pai-zuri_1.html  (gallery)
 *   http://pururin.com/hentai-manga/1673/gallery/pai-zuri_2.html  (gallery, next page)
 *   http://pururin.com/hentai-manga/1673/view/pai-zuri_1.html     (full image view)
 *
 * The '/gallery/' segment is replaced by '/thumbs/'; per the original
 * comment, all thumbnails are now on that single page, so only one page is
 * read. The previous pagination loop always ran exactly once; it and its
 * unused counter have been removed.
 *
 * @param string $url gallery URL; the link text is its basename minus the
 *                    last five characters (".html")
 * @return void
 */
function pururin($url) {
    $base = 'http://pururin.com';
    $title = substr(basename($url), 0, -5);
    $url = str_replace('/gallery/', '/thumbs/', $url);

    $p = new Page($url);
    $p->go_line('class="thumblist"');
    $hrefs = $p->next_line()->extract_to_array('href="', '"');

    foreach ($hrefs as $href) {
        // Each thumbnail links to a view page that holds the full image.
        $q = new Page($base . $href);
        $q->go_line('class="b"');
        $src = $q->curr_line()->cut_between('src="', '"');
        echo "<a href='{$base}{$src}'>{$title}</a><br>\n";
    }
}
/**
 * Print one HTML link per image found on every page linked from this page.
 *
 * Lines from the first 'Hayley Marie' line up to the 'id="vr_nav"' line
 * are scanned for links; each linked page is scanned from its "id='form'"
 * line up to '</form>' for image hrefs. imageboss.net view URLs are
 * rewritten ('/view/' to '/img/', '-' to '/') before printing.
 *
 * @return void
 */
public function process() {
    $this->go_line('Hayley Marie');

    do {
        if (!$this->curr_line()->exist('href=')) {
            continue;
        }

        $m = $this->curr_line()->regex_match('/' . self::REG_HREF . '\\s+title=["\']([^"\']*)["\']/');
        $url = $m[1];
        $name = $m[2];
        if (!$name) {
            // No title on the link: fall back to a generated name.
            $name = 'asdf' . $this->current_i;
        }

        $gallery = new Page($url);
        $gallery->go_line("id='form'");
        do {
            if (!$gallery->curr_line()->exist("href='")) {
                continue;
            }
            $img = $gallery->curr_line()->dup()->cut_between("href='", "'");
            if ($img->exist('imageboss.net')) {
                // The return value is discarded, so replace() is relied on
                // to modify $img in place.
                $img->replace('/view/', '/img/')->replace('-', '/');
            }
            echo "<a href='{$img}'>{$name}</a><br />\n";
        } while (!$gallery->next_line()->exist('</form>'));
    } while (!$this->next_line()->exist('id="vr_nav"'));
}