/**
 * Echo one download link per page of a chapter.
 *
 * The chapter's first page is loaded to read the page count (the "of N"
 * text after the top bar). Page 1 reuses that document; pages 2..N are
 * loaded from "<dirname of the chapter URL>/<page number>.html".
 *
 * @param array $v Chapter descriptor; reads $v['url'] and $v['infix'].
 * @return void Output is echoed as "<a href='SRC'>PREFIX-INFIX-FILE</a><br>".
 */
public function crawl_chapter($v)
{
    $chapter_url = $v['url'];
    $infix = Text::create($v['infix'])->pad(3)->to_s();
    $prefix = $this->prefix;

    $doc = new Page($chapter_url);

    // Total number of pages in this chapter.
    $doc->go_line('id="top_bar"');
    $doc->go_line_regex('/of \\d+\\w+/');
    $count_match = $doc->curr_line()->regex_match('/of (\\d+)/');
    $last_page = $count_match[1];

    // The first pass always runs on the already-loaded document.
    for ($page_no = 1; $page_no === 1 || $page_no <= $last_page; $page_no++) {
        if ($page_no > 1) {
            $doc = new Page(dirname($chapter_url) . '/' . $page_no . '.html');
        }

        // The image tag sits two lines below the viewer container.
        $doc->go_line('id="viewer"');
        $doc->next_line(2);
        $image = $doc->curr_line()->cut_between('src="', '"');
        $file = basename($image);

        echo "<a href='{$image}'>{$prefix}-{$infix}-{$file}</a><br>\n";
    }
}
/**
 * Collect every image of a mangahead.com chapter, following pagination.
 *
 * Each listing page is scanned on its "<blockquote>" line for thumbnail
 * "<img src>" values, which are rewritten into full-image paths. While that
 * line contains the " ›" marker the next "?page=N" listing is fetched.
 *
 * @param string $chapter_url URL of the first listing page to read.
 * @param string $prefix      Leading part of every generated name.
 * @param mixed  $infix       Chapter identifier, padded via Text::pad(3).
 * @return array Map of "PREFIX-INFIX-FILE" => absolute image URL.
 */
public function get_images($chapter_url, $prefix, $infix)
{
    $sitename = "http://mangahead.com";
    $padded = Text::create($infix)->pad(3)->to_s();

    // Base used to build the follow-up "?page=N" URLs.
    $base = Text::create($chapter_url);
    if (!$base->contain('index.php')) {
        $base = $base->replace($sitename . '/Manga', $sitename . '/index.php/Manga');
    }

    $page_no = 1;
    if ($base->contain('?page=')) {
        $page_no = (int) $base->cut_after('?page=')->to_s();
        $base = $base->cut_until('?page=');
    }

    $images = array();
    do {
        $listing = new Page($chapter_url);
        $listing->go_line('<blockquote>');

        // The " ›" marker on this line signals that another page follows.
        $has_next = $listing->curr_line()->contain(' ›');
        if ($has_next) {
            $chapter_url = $base . '/?page=' . ++$page_no;
        }

        $thumbs = $listing->curr_line()->extract_to_array('<img src="', '"');
        foreach ($thumbs as $thumb) {
            // Thumbnail path -> full image path.
            $path = Text::create($thumb)
                ->replace('index.php', 'mangas')
                ->replace('?action=thumb', '')
                ->to_s();
            $file = basename($path);
            $images["{$prefix}-{$padded}-{$file}"] = $sitename . $path;
        }
    } while ($has_next);

    return $images;
}
/**
 * Read the chapter identifier from a chapter page.
 *
 * Looks at the line containing id="gotoMangaInfo" and captures the word
 * characters that follow "Chapter " up to the next closing tag.
 *
 * @param string $url Chapter page URL.
 * @return mixed First capture group of the match.
 */
public function grab_chapter_infix($url)
{
    $doc = new Page($url);
    $doc->go_line('id="gotoMangaInfo"');

    $heading = $doc->curr_line();
    $match = $heading->regex_match('/Chapter (\\w*)<\\//');

    return $match[1];
}
/**
 * Walk a paginated thumbnail listing and download every original image.
 *
 * For each listing page: every line from the first class="thumb" line up to
 * the "<center>" line that carries an href is opened as a post page, the
 * "Original image" link is read from it, and the file is passed to
 * download_it(). The paginator's "next" link selects the following listing
 * page; the walk stops when there is none.
 *
 * @param string $url Listing page to start from.
 * @param string $dir Destination directory; the file name is appended
 *                    directly, so it is expected to end with a separator.
 * @return void
 */
private function collect_images($url, $dir)
{
    $domain = 'http://rule34.xxx/';
    $base = 'http://rule34.xxx/index.php';

    while (true) {
        echo $url . "\n";

        $listing = new Page($url);
        $listing->go_line('class="thumb"');
        do {
            $row = $listing->curr_line();
            if ($row->contain('href="')) {
                $href = htmlspecialchars_decode($row->cut_between('href="', '"')->to_s());
                $post_url = $domain . $href;
                echo $post_url . "\n";

                $post = new Page($post_url);
                $post->go_line('Original image');
                // The scheme is part of the cut marker, so it is re-added here.
                $src = 'http:' . $post->curr_line()->cut_between('href="http:', '"')->to_s();

                download_it($src, $dir . basename($src), "--header=\"Accept: image/*\"");
            }
        } while (!$listing->next_line()->contain('<center>'));

        // Rewind before searching for the paginator.
        $listing->reset_line();
        $listing->go_line('id="paginator"');
        $pager = $listing->curr_line();
        if (!$pager->contain('alt="next"')) {
            return;
        }

        $next = $pager->regex_match('/href="([^"]+)" alt="next"/');
        $url = $base . html_entity_decode($next[1]);
    }
}
/**
 * Collect the image of every page in a chapter.
 *
 * The page count comes from the "of N" text after the top bar of the first
 * page. Page 1 reuses the already-loaded document; pages 2..N are loaded
 * from "<dirname of $chapter_url>/<page number>.html".
 *
 * @param string $chapter_url URL of the chapter's first page.
 * @param string $prefix      Leading part of every generated name.
 * @param mixed  $infix       Chapter identifier, padded via Text::pad(3).
 * @return array Map of "PREFIX-INFIX-FILE" => image source as cut from the page.
 */
public function get_images($chapter_url, $prefix, $infix)
{
    $padded = Text::create($infix)->pad(3)->to_s();
    $doc = new Page($chapter_url);

    // Total number of pages.
    $doc->go_line('id="top_bar"');
    $doc->go_line_regex('/of \\d+\\w+/');
    $count_match = $doc->curr_line()->regex_match('/of (\\d+)/');
    $total = $count_match[1];

    $images = array();
    $page_no = 1;
    do {
        // The image tag sits two lines below the viewer container.
        $doc->go_line('id="viewer"');
        $doc->next_line(2);
        $image = $doc->curr_line()->cut_between('src="', '"');
        $file = basename($image);
        $images["{$prefix}-{$padded}-{$file}"] = $image;

        $page_no++;
        $more = $page_no <= $total;
        if ($more) {
            $doc = new Page(dirname($chapter_url) . '/' . $page_no . '.html');
        }
    } while ($more);

    return $images;
}
/**
 * Build display rows for a range of nn4b.com webcomic pages.
 *
 * For every id from $s to $n inclusive the comic page is loaded, its
 * "og:image" content and its rel='next' link are read, and an HTML snippet
 * row is produced. $columns and $url are accepted but not read here.
 *
 * @param mixed $columns Unused in this method.
 * @param int   $s       First comic id (inclusive).
 * @param int   $n       Last comic id (inclusive).
 * @param mixed $url     Unused in this method.
 * @return array List of rows with keys 'image', 'link' and 'next'.
 */
public function extract($columns, $s, $n, $url)
{
    $rows = array();

    $id = $s;
    while ($id <= $n) {
        $page_url = sprintf('http://nn4b.com/?webcomic1=%s', $id);
        $doc = new Page($page_url);

        $doc->go_line('"og:image"');
        $image = $doc->curr_line()->cut_between('content="', '"')->to_s();

        // Rewind before searching for the next-page link.
        $doc->reset_line();
        $doc->go_line("link rel='next'");
        $next_url = $doc->curr_line()->cut_between("href='", "'")->to_s();

        $rows[] = array(
            'image' => "<img src='{$image}'>",
            'link' => "<a href='{$page_url}'>Link</a>",
            'next' => "<a href='{$next_url}'>Next</a>",
        );

        $id++;
    }

    return $rows;
}
/**
 * Echo a link for every page image listed in a chapter's page selector.
 *
 * File names are taken from the <option> entries between the "data[pages]"
 * line and the closing </select>. The image directory is derived from the
 * <img> on the "scanlations" line.
 *
 * @param string $url Chapter page URL.
 * @param string $ifx Chapter infix used in the link text.
 * @return void Output is echoed as "<a href='BASE/FILE'>PREFIX-IFX-FILE</a><br/>".
 */
public function crawl_page($url, $ifx)
{
    $doc = new Page($url);
    $doc->go_line('data[pages]');

    $files = array();
    do {
        $option = $doc->curr_line();
        if ($option->contain('</option>')) {
            $files[] = $option->cut_between('>', '</option')->to_s();
        }
    } while (!$doc->next_line()->contain('</select>'));

    $doc->go_line('scanlations');
    $sample = $doc->curr_line()->cut_between('<img src="', '"')->to_s();
    $image_dir = dirname($sample);

    foreach ($files as $file) {
        echo "<a href='{$image_dir}/{$file}'>{$this->prefix}-{$ifx}-{$file}</a><br/>\n";
    }
}
/**
 * Crawl every page of a chapter through $this->crawl_page().
 *
 * Page identifiers are the value="..." entries on the second line that
 * matches 'select class="cbo_wpm_pag"'. The first page is crawled from the
 * already-loaded document; each remaining page is loaded from
 * "<chapter url><id>/".
 *
 * @param array $v Chapter descriptor; reads $v['url'] and $v['infix'].
 * @return void
 */
public function crawl_chapter($v)
{
    $infix = Text::create($v['infix'])->pad(3)->to_s();
    $first = new Page($v['url']);

    // Step past the first selector so the search lands on the second one.
    $first->go_line('select class="cbo_wpm_pag"');
    $first->next_line();
    $first->go_line('select class="cbo_wpm_pag"');
    $page_ids = $first->curr_line()->extract_to_array('value="', '"');

    // First page: rewind and crawl the document we already have.
    $first->reset_line();
    $this->crawl_page($first, $infix);

    // Remaining pages: everything after the first entry.
    foreach (array_slice($page_ids, 1) as $page_id) {
        $page = new Page($v['url'] . $page_id . '/');
        $this->crawl_page($page, $infix);
    }
}
/**
 * Collect the images of every page in a chapter.
 *
 * Page identifiers are the value="..." entries on the second line that
 * matches 'select class="cbo_wpm_pag"'. Results of $this->crawl_page() are
 * combined with the array union operator, so a key found on an earlier
 * page is not overwritten by a later one.
 *
 * @param string $chapter_url URL of the chapter's first page.
 * @param string $prefix      Passed through to crawl_page().
 * @param mixed  $infix       Chapter identifier, padded via Text::pad(3).
 * @return array Union of the per-page crawl_page() results.
 */
public function get_images($chapter_url, $prefix, $infix)
{
    $padded = Text::create($infix)->pad(3)->to_s();
    $first = new Page($chapter_url);

    // Step past the first selector so the search lands on the second one.
    $first->go_line('select class="cbo_wpm_pag"');
    $first->next_line();
    $first->go_line('select class="cbo_wpm_pag"');
    $page_ids = $first->curr_line()->extract_to_array('value="', '"');

    // First page: rewind and crawl the document we already have.
    $first->reset_line();
    $images = $this->crawl_page($first, $prefix, $padded);

    // Remaining pages: everything after the first entry.
    foreach (array_slice($page_ids, 1) as $page_id) {
        $page = new Page($chapter_url . $page_id . '/');
        $images += $this->crawl_page($page, $prefix, $padded);
    }

    return $images;
}
/**
 * Extract the list of media streams embedded in a page as JSON.
 *
 * The text between '"streams":[{' and '}]' on the matching line is wrapped
 * back into a JSON array and decoded. Each decoded entry is reduced to an
 * object with three fields:
 *   - res: "<width>x<height>"
 *   - url: host followed by path
 *   - ext: the entry's format
 *
 * @param string $url Page to read.
 * @return array List of stdClass objects (res, url, ext); empty when the
 *               embedded JSON cannot be decoded into an array.
 */
private function collect_streams($url)
{
    $p = new Page($url);
    $p->go_line('"streams":[{');
    $json_part = $p->curr_line()->cut_between('"streams":[{', '}]');

    // The cut drops the surrounding brackets; restore them before decoding.
    $streams = json_decode('[{' . $json_part . '}]');

    // json_decode() yields null on malformed input; iterating that would
    // only raise a warning, so return the empty result explicitly.
    if (!is_array($streams)) {
        return array();
    }

    $result = array();
    foreach ($streams as $stream) {
        $result[] = (object) array(
            'res' => $stream->width . 'x' . $stream->height,
            'url' => $stream->host . $stream->path,
            'ext' => $stream->format,
        );
    }

    return $result;
}
/**
 * Crawl every page of a chapter through $this->crawl_page().
 *
 * Page URLs are the href="..." values on the line containing
 * '="changePage('. The first entry is skipped because the chapter document
 * itself is crawled in its place.
 *
 * @param array $v Chapter descriptor; reads $v['url'] and $v['infix'].
 * @return void
 */
public function crawl_chapter($v)
{
    $infix = Text::create($v['infix'])->pad(3)->to_s();

    $first = new Page($v['url']);
    $first->go_line('="changePage(');
    $links = $first->curr_line()->extract_to_array('href="', '"');

    // Current page, using the document already loaded.
    $this->crawl_page($first, $infix);

    $is_first = true;
    foreach ($links as $link) {
        if ($is_first) {
            // Already handled above.
            $is_first = false;
            continue;
        }
        $this->crawl_page(new Page($link), $infix);
    }
}
/**
 * Collect the images of every page in a chapter.
 *
 * Page URLs are the href="..." values on the line containing
 * '="changePage('. The first entry is skipped because the chapter document
 * itself is crawled in its place. Per-page results are combined with the
 * array union operator (earlier keys win).
 *
 * @param string $chapter_url URL of the chapter's first page.
 * @param string $prefix      Passed through to crawl_page().
 * @param mixed  $infix       Chapter identifier, padded via Text::pad(3).
 * @return array Union of the per-page crawl_page() results.
 */
public function get_images($chapter_url, $prefix, $infix)
{
    $padded = Text::create($infix)->pad(3)->to_s();

    $first = new Page($chapter_url);
    $first->go_line('="changePage(');
    $links = $first->curr_line()->extract_to_array('href="', '"');

    // Current page, using the document already loaded.
    $images = $this->crawl_page($first, $prefix, $padded);

    $is_first = true;
    foreach ($links as $link) {
        if ($is_first) {
            // Already handled above.
            $is_first = false;
            continue;
        }
        $images += $this->crawl_page(new Page($link), $prefix, $padded);
    }

    return $images;
}
/**
 * Crawl every page of a chapter through $this->crawl_page().
 *
 * The number of pages is the text inside "Last Page (...)" on the first
 * page. Page 1 reuses the loaded document; pages 2..N are loaded from
 * "<dirname of the chapter URL>/<page number>".
 *
 * @param array $v Chapter descriptor; reads $v['url'] and $v['infix'].
 * @return void
 */
public function crawl_chapter($v)
{
    $chapter_url = $v['url'];
    $infix = Text::create($v['infix'])->pad(3)->to_s();

    $doc = new Page($chapter_url);
    $doc->go_line('Last Page (');
    $last_page = $doc->curr_line()->cut_between('Last Page (', ')')->to_s();
    $dir_url = dirname($chapter_url);

    // Page 1 is the document we already hold.
    $this->crawl_page($doc, $infix, 1);

    $page_no = 2;
    while ($page_no <= $last_page) {
        $doc = new Page($dir_url . '/' . $page_no);
        $this->crawl_page($doc, $infix, $page_no);
        $page_no++;
    }
}
/**
 * Collect the images of every page in a chapter.
 *
 * Page URLs are the value="..." entries on the line right after the one
 * containing id="page_select". The first entry is dropped because the
 * chapter document itself is crawled as page 1; the remaining entries are
 * crawled as pages 2, 3, ... Per-page results are combined with the array
 * union operator (earlier keys win).
 *
 * @param string $chapter_url URL of the chapter's first page.
 * @param string $prefix      Passed through to crawl_page().
 * @param mixed  $infix       Chapter identifier, padded via Text::pad(3).
 * @return array Union of the per-page crawl_page() results.
 */
public function get_images($chapter_url, $prefix, $infix)
{
    $padded = Text::create($infix)->pad(3)->to_s();

    $doc = new Page($chapter_url);
    $doc->go_line('id="page_select"');
    $links = $doc->next_line()->extract_to_array('value="', '"');

    // Page 1 is the document we already hold.
    $images = $this->crawl_page($doc, $prefix, $padded, 1);

    // After dropping the first link, offset 0 corresponds to page 2.
    foreach (array_slice($links, 1) as $offset => $link) {
        $doc = new Page($link);
        $images += $this->crawl_page($doc, $prefix, $padded, $offset + 2);
    }

    return $images;
}
/**
 * Crawl every page of a chapter through $this->crawl_page().
 *
 * Page URLs are the value="..." entries on the line right after the one
 * containing id="page_select". The first entry is dropped because the
 * chapter document itself is crawled in its place.
 *
 * A parallel variant using Manga_Crawler::multiProcess(4, ...) was sketched
 * here previously and left disabled; pages are crawled one after another.
 *
 * @param array $v Chapter descriptor; reads $v['url'] and $v['infix'].
 * @return void
 */
public function crawl_chapter($v)
{
    $infix = Text::create($v['infix'])->pad(3)->to_s();

    $doc = new Page($v['url']);
    $doc->go_line('id="page_select"');
    $links = $doc->next_line()->extract_to_array('value="', '"');

    // Current page, using the document already loaded.
    $this->crawl_page($doc, $infix);

    foreach (array_slice($links, 1) as $link) {
        $doc = new Page($link);
        $this->crawl_page($doc, $infix);
    }
}
/**
 * Collect the image of every page in a chapter.
 *
 * The number of pages is the text inside "Last Page (...)" on the first
 * page. Page 1 reuses the loaded document; pages 2..N are loaded from
 * "<dirname of $chapter_url>/<page number>". Each crawl_page() call is
 * expected to return a two-element list: image name, image URL.
 *
 * @param string $chapter_url URL of the chapter's first page.
 * @param string $prefix      Passed through to crawl_page().
 * @param mixed  $infix       Chapter identifier, padded via Text::pad(3).
 * @return array Map of image name => image URL.
 */
public function get_images($chapter_url, $prefix, $infix)
{
    $padded = Text::create($infix)->pad(3)->to_s();

    $doc = new Page($chapter_url);
    $doc->go_line('Last Page (');
    $last_page = $doc->curr_line()->cut_between('Last Page (', ')')->to_s();
    $dir_url = dirname($chapter_url);

    $images = array();
    $page_no = 1;
    while (true) {
        list($name, $src) = $this->crawl_page($doc, $prefix, $padded, $page_no);
        $images[$name] = $src;

        $page_no++;
        if (!($page_no <= $last_page)) {
            break;
        }
        $doc = new Page($dir_url . '/' . $page_no);
    }

    return $images;
}
/**
 * Echo a link for every page described by the chapter's "var pages" JSON.
 *
 * The JSON is the text between ' = ' and ';' on the line containing
 * 'var pages'. For each entry:
 *   - a file name shorter than 15 characters is prefixed with
 *     "<prefix>-<infix>-";
 *   - a URL that goes through resize_img.php is replaced by the value of
 *     its "url" parameter (text between 'resize_img.php?url=' and '&width').
 *
 * @param array $v Chapter descriptor; reads $v['url'] and $v['infix'].
 * @return void Output is echoed as "<a href='URL'>NAME</a><br/>".
 */
public function crawl_chapter($v)
{
    $infix = Text::create($v['infix'])->pad(3)->to_s();

    $doc = new Page($v['url']);
    $doc->go_line('var pages');
    $entries = json_decode($doc->curr_line()->cut_between(' = ', ';'));

    foreach ($entries as $entry) {
        $link = new Text($entry->url);
        $label = new Text($entry->filename);

        if ($label->strlen() < 15) {
            $label = $this->prefix . '-' . $infix . '-' . $label;
        }

        if ($link->contain('resize_img.php')) {
            $link = $link->cut_between('resize_img.php?url=', '&width');
        }

        echo "<a href='{$link}'>{$label}</a><br/>\n";
    }
}
/**
 * List every page image named in a chapter's page selector.
 *
 * File names are taken from the <option> entries between the "data[pages]"
 * line and the closing </select>. The image directory is derived from the
 * <img> on the "scanlations" line.
 *
 * @param string $url    Chapter page URL.
 * @param string $prefix Leading part of every generated name.
 * @param string $ifx    Chapter infix used in the generated names.
 * @return array Map of "PREFIX-IFX-FILE" => "IMAGE_DIR/FILE".
 */
public function crawl_page($url, $prefix, $ifx)
{
    $doc = new Page($url);
    $doc->go_line('data[pages]');

    $files = array();
    while (true) {
        $option = $doc->curr_line();
        if ($option->contain('</option>')) {
            $files[] = $option->cut_between('>', '</option')->to_s();
        }
        if ($doc->next_line()->contain('</select>')) {
            break;
        }
    }

    $doc->go_line('scanlations');
    $image_dir = dirname($doc->curr_line()->cut_between('<img src="', '"')->to_s());

    $images = array();
    foreach ($files as $file) {
        $images["{$prefix}-{$ifx}-{$file}"] = "{$image_dir}/{$file}";
    }

    return $images;
}
/**
 * Collect the images pushed into the page's "lstImages" script array.
 *
 * Starting at the line containing 'var lstImages', every line up to the one
 * containing 'new Array()' is inspected; each 'lstImages.push("...")' entry
 * becomes one result item. Names are built from a running counter (padded
 * via Text::pad(3)) followed by a part of the image's base name produced by
 * Text::cut_rfrom('.') and Text::cut_before('?').
 *
 * The page is requested with the 'become_firefox' option enabled.
 *
 * @param string $chapter_url URL of the chapter page.
 * @param string $prefix      Leading part of every generated name.
 * @param mixed  $infix       Chapter identifier, padded via Text::pad(3).
 * @return array Map of "PREFIX-INFIX-NAME" => image URL (as cut from the page).
 */
public function get_images($chapter_url, $prefix, $infix)
{
    $ifx = Text::create($infix)->pad(3)->to_s();
    $p = new Page($chapter_url, array('become_firefox' => true));

    $p->go_line('var lstImages');
    $i = 1;
    $result = array();
    do {
        if ($p->curr_line()->contain('lstImages.push')) {
            $line = $p->curr_line();
            $img = $line->cut_between('push("', '")');
            $iname = Text::create($i++)->pad(3)->to_s()
                . Text::create(basename($img))->cut_rfrom('.')->cut_before('?')->to_s();
            $name = "{$prefix}-{$ifx}-{$iname}";
            $result[$name] = $img;
        }
    } while (!$p->next_line()->contain('new Array()'));

    // The href list that used to be extracted here was never read, so it
    // is no longer computed.
    return $result;
}
/**
 * Download every page image of the given chapters.
 *
 * Each chapter page is scanned from the 'pages[1]=' line onward for as long
 * as the following lines contain 'pages['. Every such line yields one image
 * URL (text between '="' and '"'), saved as
 * "<dir><prefix>-<infix>-<NNN>.<ext>". Files that already exist are skipped.
 *
 * @param array  $chapters Map of chapter infix => chapter URL.
 * @param string $prefix   Leading part of every file name. Defaults to the
 *                         previously hard-coded value.
 * @param string $dir      Destination directory; the file name is appended
 *                         directly, so it must end with a separator.
 *                         Defaults to the previously hard-coded path.
 * @return void
 */
private function download_chapters(
    $chapters,
    $prefix = 'Birth_of_Levi',
    $dir = '/home/khandar-gdp/tmp/birth of levi/'
) {
    foreach ($chapters as $infix => $url) {
        echo "Opening {$url}...\n";
        $p = new Page($url);
        $p->go_line('pages[1]=');

        $i = 1;
        do {
            $line = $p->curr_line();
            $img = $line->cut_between('="', '"');
            $suffix = Text::create($i++)->pad(3)->to_s();
            // NOTE(review): if Text::cut_rafter() modifies $img in place,
            // $img no longer holds the full URL below; confirm against Text.
            $ext = $img->cut_rafter('.');
            $filename = "{$dir}{$prefix}-{$infix}-{$suffix}.{$ext}";

            // Skip files fetched by an earlier run.
            if (!is_file($filename)) {
                download_it($img->to_s(), $filename);
            }
        } while ($p->next_line()->contain('pages['));
    }
}
/**
 * Collect the images of every page in a chapter.
 *
 * The page count is the text between ' of ' and '<' on the line containing
 * id="pageInfo". Page 1 reuses the loaded document (rewound first); the URL
 * of page i is $chapter_url with its trailing "/1/" replaced by "/i/".
 * Per-page results are combined with the array union operator (earlier
 * keys win).
 *
 * @param string $chapter_url URL of the chapter's first page.
 * @param string $prefix      Passed through to crawl_page().
 * @param mixed  $infix       Chapter identifier, padded via Text::pad(3).
 * @return array Union of the per-page crawl_page() results.
 */
public function get_images($chapter_url, $prefix, $infix)
{
    $padded = Text::create($infix)->pad(3)->to_s();

    $doc = new Page($chapter_url);
    $doc->go_line('id="pageInfo"');
    $last_page = $doc->curr_line()->cut_between(' of ', '<')->to_s();

    // Page 1: rewind and crawl the document we already hold.
    $doc->reset_line();
    $images = $this->crawl_page($doc, $prefix, $padded, 1);

    // Pages 2..N.
    for ($page_no = 2; $page_no <= $last_page; $page_no++) {
        $page_url = preg_replace('/\\/1\\/$/', '/' . $page_no . '/', $chapter_url);
        $doc = new Page($page_url);
        $images += $this->crawl_page($doc, $prefix, $padded, $page_no);
    }

    return $images;
}
/**
 * Read the thumbnail list and full-image URL pattern from the "/read" page
 * and store them on this record.
 *
 * Thumbnails are looked up in three page layouts, in this order:
 *   1. "var data = {...};"  (JSON object with a "thumbs" member)
 *   2. "var data={...};"    (same, without spaces)
 *   3. "window.params.thumbs = [...];" (JSON list)
 * If none is present and the page says the content is disabled,
 * unavailable in the visitor's country, or missing, the method returns
 * without changing or saving anything.
 *
 * On success $this->thumbs is set to the thumbnail base names joined by
 * '#', $this->pattern to the image path template (with '%s' where the
 * script used the variable x, and https rewritten to http), and
 * $this->save() is called.
 *
 * @return void
 * @throws Exception When the page is an error page (also sets
 *                   $this->is_deleted = true), or when no thumbnail source
 *                   and no known "unavailable" message is found.
 */
public function get_detail()
{
    $p = new Page(Fakku::$base . $this->url . '/read');
    $content = new Text($p->content());

    // Old URLs sometimes disappear.
    if ($content->contain('<title>Error Message</title>')) {
        $this->is_deleted = true;
        throw new Exception($this->url . ' url is gone');
    }

    if ($content->contain('var data = {')) {
        $p->go_line('var data = {');
        $json = $p->curr_line()->dup()->cut_between(' = ', ';')->to_s();
        $obj = json_decode($json);
        $js_thumbs = $obj->thumbs;
    } elseif ($content->contain('var data={')) {
        $p->go_line('var data={');
        $json = $p->curr_line()->dup()->cut_between('data=', ';')->to_s();
        $obj = json_decode($json);
        $js_thumbs = $obj->thumbs;
    } elseif ($content->contain('window.params.thumbs')) {
        $p->go_line('window.params.thumbs');
        $json = $p->curr_line()->cut_between('=', ';')->to_s();
        $js_thumbs = json_decode($json);
    } elseif (
        $content->contain('This content has been disabled.')
        || $content->contain('This content is not available in your country')
        || $content->contain('Content does not exist')
    ) {
        // Nothing to read; leave the record untouched.
        return;
    } else {
        echo $p->url();
        throw new Exception('where is thumbs?');
    }

    $thumbs = array();
    foreach ($js_thumbs as $tpath) {
        $thumbs[] = basename($tpath);
    }
    $this->thumbs = implode('#', $thumbs);

    // Full image pattern: the return statement of the page's imgpath()
    // function, written either with or without spaces.
    $p->go_line('function imgpath(');
    $p->go_line('return \'');
    if ($p->curr_line()->contain('return \'')) {
        $imgpath = $p->curr_line()->dup()->cut_between("return '", "';")->to_s();
        $imgpath = str_replace("' + x + '", '%s', $imgpath);
    } else {
        // Spaced form not found: rewind and look for the compact form.
        $p->reset_line();
        $p->go_line('function imgpath(');
        $p->go_line('return\'');
        $imgpath = $p->curr_line()->dup()->cut_between("return'", "';")->to_s();
        $imgpath = str_replace("'+x+'", '%s', $imgpath);
    }
    $imgpath = str_replace("https://", 'http://', $imgpath);

    $this->pattern = $imgpath;
    $this->save();
}
/**
 * Map each volume to the last chapter ("Round") listed for it on Wikipedia.
 *
 * The main chapter-list article is scanned for "Main article:" lines; each
 * one links to a sub-article. In every sub-article, each '<td id="vol'
 * cell starts a volume: the text between '">' and '<' is the volume label,
 * and the last '<li>Round N:' item seen before the closing </table> is
 * recorded as that volume's last chapter.
 *
 * If a volume table contains no "Round" item, the value recorded is
 * whatever was found for the previous volume (null if there was none).
 *
 * @return array Map of volume label => last chapter number (string).
 */
public function grab_volume_chapters()
{
    $p = new Page('http://en.wikipedia.org/wiki/List_of_Hajime_no_Ippo_chapters');
    $list = array();

    // Defined up front so a volume without any "Round" item does not read
    // an unset variable.
    $last_chapter = null;

    while (!$p->end_of_line()) {
        $p->go_line('Main article:');
        if ($p->end_of_line()) {
            break;
        }

        $href = 'http://en.wikipedia.org'
            . $p->curr_line()->dup()->cut_between('href="', '"')->to_s();
        $p2 = new Page($href);

        while (!$p2->end_of_line()) {
            try {
                $p2->go_line('<td id="vol');
                $vol = $p2->curr_line()->dup()->cut_between('">', '<')->to_s();

                // Keep overwriting so the final value is the last round in
                // this volume's table.
                do {
                    if ($p2->curr_line()->contain('<li>Round ')) {
                        $last_chapter = $p2->curr_line()->dup()->cut_between('Round ', ':')->to_s();
                    }
                } while (!$p2->next_line()->contain('</table>'));

                $list[$vol] = $last_chapter;
            } catch (Exception $e) {
                // Any failure while scanning ends this sub-article.
                break;
            }
        }

        $p->next_line();
    }

    return $list;
}
/**
 * Harvest the entries of every listing page and dump them to 'hmo.links'.
 *
 * The number of listing pages is read from the "Page 1 / N" text on the
 * first page (self::$update). Pages are visited from last to first and each
 * page's entries (from $this->extract_from_list()) are reversed before
 * being appended. The combined list is written as a PHP file that assigns
 * it to $links.
 *
 * @return void
 */
public function action_all_pages()
{
    $start = self::$update;

    // How many listing pages are there?
    $first = new Page($start);
    $first->go_line('Page 1 / ');
    $last_page = (int) $first->curr_line()->cut_between('Page 1 / ', '<')->to_s();

    $entries = array();
    $page_no = $last_page;
    while ($page_no >= 1) {
        // Page 1 lives at the start URL itself; later pages under "page/N/".
        $suffix = $page_no > 1 ? 'page/' . $page_no . '/' : '';
        $listing = new Page($start . $suffix);

        $chunk = array_reverse($this->extract_from_list($listing));
        $entries = array_merge($entries, $chunk);

        $page_no--;
    }

    $dump = '<?php $links=' . var_export($entries, true) . ';';
    file_put_contents('hmo.links', $dump);
}
$b = break_url($v['url']); extract($b); $P = new Page($v['url']); // Grab all pages $pages = array(); $P->go_line('id="Serie_pages"'); do { if ($P->curr_line()->contain('<option')) { $pages[] = $P->curr_line()->dup()->cut_between('">', '</')->to_s(); } } while (!$P->next_line()->contain('</select>')); array_shift($pages); // Grab this page's image $P->go_line('id="manga_img"'); $src = $P->curr_line()->dup()->cut_between('src="', '"')->to_s(); $name = basename($src); echo "<a href='{$domain}{$src}'>{$prefix}-{$ifx}-{$name}</a><br/>\n"; // Now for the other pages foreach ($pages as $p) { $the_url = "{$basic_url}{$title}/{$chapter_id}/{$chapter_text}/{$p}/"; $P = new Page($the_url); $P->go_line('id="manga_img"'); $src = $P->curr_line()->dup()->cut_between('src="', '"')->to_s(); $name = basename($src); echo "<a href='{$domain}{$src}'>{$prefix}-{$ifx}-{$name}</a><br/>\n"; } } } ?> </body></html>
/**
 * Read the page count from the line containing 'Pages|'.
 *
 * @param string $url Page to read.
 * @return mixed First capture group of '/(\d+) Pages/' on that line.
 */
private function get_last_page($url)
{
    $doc = new Page($url);
    $doc->go_line('Pages|');

    $count_line = $doc->curr_line();
    $match = $count_line->regex_match('/(\\d+) Pages/');

    return $match[1];
}
<?php
/*
 * Crawler for the webcomic at http://noneedforbushido.com/2002/comic/1/.
 * Supply the starting page; the script follows the "next" links until the
 * comic runs out.
 */
require_once 'class/page.php';
require_once 'class/text.php';

$start = 'http://noneedforbushido.com/2002/comic/1/';
$next = true;

do {
    $page = new Page($start);

    // Image of the current comic.
    $page->go_line('class="comic-item');
    $image = $page->curr_line()->dup()->cut_between('src="', '"')->to_s();

    // Link text: "<year>-comic<NNN>", both parts taken from the page URL.
    $number = Text::create(basename($start))->pad(3, 0)->to_s();
    $year = Text::create($start)->cut_between('.com/', '/')->to_s();
    $label = "{$year}-comic{$number}";

    echo "<a href='{$image}'>{$label}</a><br />\n";

    // The next-link line carries 'current-comic' on the last comic.
    $page->go_line('class="next-comic-link');
    $next = !$page->curr_line()->contain('current-comic');
    $start = $page->curr_line()->dup()->cut_between('href="', '"')->to_s();
} while ($next);
/**
 * Collect chapter URLs from a paginated listing.
 *
 * The number of listing pages is read from the "Page 1 / N" text on the
 * class='pages' line of the first page (1 when that line is absent). The
 * request parameter "limitpage", when present, overrides that count.
 * Every href on each listing page that matches the site's
 * "read-...-hentai-manga-online/" URL shape is collected.
 *
 * With $check_database enabled, collection stops at the first URL for
 * which $this->url_already_exist() is true, returning what was gathered
 * before it.
 *
 * @param string $start_page_url First listing page; later pages are
 *                               "<start>page/<n>/".
 * @param bool   $check_database Stop at the first already-known URL.
 * @return array Unique chapter URLs in reverse order of discovery.
 */
public function grab_chapter_urls($start_page_url, $check_database = false)
{
    $p = new Page($start_page_url);

    // Is there more than one listing page?
    $p->go_line("class='pages'");
    if ($p->curr_line()->exist("class='pages'")) {
        $m = $p->curr_line()->regex_match("/'>Page 1 \\/ (\\d+)<\\//");
        $tot_pages = $m[1];
    } else {
        $tot_pages = 1;
    }

    if (isset($_GET['limitpage'])) {
        // Request input: force an integer so a non-numeric string or an
        // array cannot turn the loop bound below into an always-true
        // comparison.
        $tot_pages = (int) $_GET['limitpage'];
    }

    $chapters = array();
    for ($i = 1; $i <= $tot_pages; $i++) {
        $p = new Page($start_page_url . ($i == 1 ? '' : 'page/' . $i . '/'));
        echo "Grabbing " . $p->url() . "<br/>\n";

        // Every distinct href on this listing page.
        $t_content = new Text($p->content());
        $raw = array_unique($t_content->extract_to_array('href="', '"'));

        foreach ($raw as $e) {
            if (!preg_match('/^http:\\/\\/hentaimangaonline\\.com\\/read-[^\\/]*-hentai-manga-online\\/$/', $e)) {
                continue;
            }
            if ($check_database && $this->url_already_exist($e)) {
                // Everything from here on is already known.
                return array_reverse(array_unique($chapters));
            }
            $chapters[] = $e;
        }
    }

    return array_reverse(array_unique($chapters));
}
/**
 * Echo a link to every full-size image of a gallery.
 *
 * The "/read" page is loaded ("/read" is appended when $url does not
 * already end with it). Thumbnail paths come from the JSON assigned on the
 * 'window.params.thumbs' line; each is turned into an image path by
 * replacing '.thumb.' with '.' and '/thumbs/' with '/images/'.
 *
 * @param string $url Gallery URL, with or without the trailing "/read".
 * @return void Output is echoed as "<a href='SRC'>TITLE</a><br>".
 */
function fakku($url)
{
    $is_reader = preg_match('/\\/read$/', $url);
    if (!$is_reader) {
        $url .= '/read';
    }

    // Gallery title: the path segment just before "/read".
    $title = basename(dirname($url));

    $page = new Page($url);
    // NOTE(review): $content is never read below.
    $content = new Text($page->content());

    $page->go_line('window.params.thumbs');
    $thumbs = json_decode($page->curr_line()->cut_between('=', ';')->to_s());

    foreach ($thumbs as $thumb) {
        $image = Text::create($thumb)
            ->replace('.thumb.', '.')
            ->replace('/thumbs/', '/images/')
            ->to_s();
        echo "<a href='{$image}'>{$title}</a><br>\n";
    }
}
/**
 * Echo image links for every gallery linked between the 'Hayley Marie'
 * line and the line containing id="vr_nav".
 *
 * Each line with an href in that range is matched for a URL (self::REG_HREF)
 * and a title attribute. The linked page is opened and every href='...'
 * between its id='form' line and </form> is echoed as a link labelled with
 * the title, or "asdf<current_i>" when the title is empty.
 *
 * @return void Output is echoed as "<a href='IMG'>NAME</a><br />".
 */
public function process()
{
    $this->go_line('Hayley Marie');
    do {
        $entry = $this->curr_line();
        if (!$entry->exist('href=')) {
            continue;
        }

        $match = $entry->regex_match('/' . self::REG_HREF . '\\s+title=["\']([^"\']*)["\']/');
        $url = $match[1];
        $name = $match[2] ? $match[2] : 'asdf' . $this->current_i;

        $gallery = new Page($url);
        $gallery->go_line("id='form'");
        do {
            if ($gallery->curr_line()->exist("href='")) {
                $img = $gallery->curr_line()->dup()->cut_between("href='", "'");
                if ($img->exist('imageboss.net')) {
                    // NOTE(review): the result is discarded, so this only
                    // has an effect if Text::replace() modifies $img in
                    // place; confirm against Text.
                    $img->replace('/view/', '/img/')->replace('-', '/');
                }
                echo "<a href='{$img}'>{$name}</a><br />\n";
            }
        } while (!$gallery->next_line()->exist('</form>'));
    } while (!$this->next_line()->exist('id="vr_nav"'));
}