/**
 * Download the original image behind every thumbnail of a paginated listing.
 *
 * Each listing page is scanned from the first line containing class="thumb"
 * until a line containing <center> is reached. Every line on the way that
 * carries an href is treated as a link to a post page; that page's
 * "Original image" line supplies the file URL, which is handed to
 * download_it(). The walk then follows the paginator's "next" link until
 * there is none.
 *
 * @param string $url First listing page to visit.
 * @param string $dir Destination prefix; the file name is appended directly,
 *                    so it presumably has to end with a directory separator.
 */
private function collect_images($url, $dir) {
    $domain = 'http://rule34.xxx/';
    $base = 'http://rule34.xxx/index.php';

    while (true) {
        echo $url . "\n";
        $listing = new Page($url);
        $listing->go_line('class="thumb"');

        // Walk the thumbnail area line by line; <center> marks its end.
        while (true) {
            if ($listing->curr_line()->contain('href="')) {
                $href = htmlspecialchars_decode(
                    $listing->curr_line()->cut_between('href="', '"')->to_s()
                );
                $post_url = $domain . $href;
                echo $post_url . "\n";

                $post = new Page($post_url);
                $post->go_line('Original image');
                // The scheme is consumed by the cut marker, so put it back.
                $src = 'http:' . $post->curr_line()->cut_between('href="http:', '"')->to_s();

                download_it($src, $dir . basename($src), "--header=\"Accept: image/*\"");
            }

            if ($listing->next_line()->contain('<center>')) {
                break;
            }
        }

        // Follow the "next" link of the paginator, or stop on the last page.
        $listing->reset_line();
        $listing->go_line('id="paginator"');
        if (!$listing->curr_line()->contain('alt="next"')) {
            break;
        }
        $match = $listing->curr_line()->regex_match('/href="([^"]+)" alt="next"/');
        $url = $base . html_entity_decode($match[1]);
    }
}
/**
 * Build one row of HTML snippets per comic page numbered $s through $n.
 *
 * For every number the page http://nn4b.com/?webcomic1=<number> is loaded;
 * the og:image content and the rel='next' link are read from it.
 *
 * @param mixed $columns Not used by this method.
 * @param int   $s       First page number (inclusive).
 * @param int   $n       Last page number (inclusive).
 * @param mixed $url     Not used by this method.
 * @return array List of arrays with the keys 'image', 'link' and 'next',
 *               each holding a ready-made HTML fragment.
 */
public function extract($columns, $s, $n, $url) {
    $pattern_url = 'http://nn4b.com/?webcomic1=%s';
    $rows = array();

    for ($number = $s; $number <= $n; $number++) {
        $page_url = sprintf($pattern_url, $number);
        $page = new Page($page_url);

        // Image address comes from the og:image meta tag.
        $page->go_line('"og:image"');
        $image_src = $page->curr_line()->cut_between('content="', '"')->to_s();

        // Rewind before searching for the link to the following comic.
        $page->reset_line();
        $page->go_line("link rel='next'");
        $next_href = $page->curr_line()->cut_between("href='", "'")->to_s();

        $rows[] = array(
            'image' => "<img src='{$image_src}'>",
            'link' => "<a href='{$page_url}'>Link</a>",
            'next' => "<a href='{$next_href}'>Next</a>",
        );
    }

    return $rows;
}
/**
 * Crawl every page of one chapter.
 *
 * The page list is taken from the option values on the second line that
 * contains select class="cbo_wpm_pag". The chapter URL itself is crawled as
 * the first page; every remaining value is appended to the chapter URL
 * (followed by a slash) and crawled in turn.
 *
 * @param array $v Chapter descriptor; the keys 'url' and 'infix' are read.
 */
public function crawl_chapter($v) {
    $chapter_url = $v['url'];
    $padded_infix = Text::create($v['infix'])->pad(3)->to_s();
    $first_page = new Page($chapter_url);

    // Locate the selector, step past it, then locate the next occurrence
    // and read its option values.
    $first_page->go_line('select class="cbo_wpm_pag"');
    $first_page->next_line();
    $first_page->go_line('select class="cbo_wpm_pag"');
    $page_values = $first_page->curr_line()->extract_to_array('value="', '"');

    // The page that is already loaded holds the first image.
    $first_page->reset_line();
    $this->crawl_page($first_page, $padded_infix);

    // Drop the first entry (just handled) and visit the others.
    array_shift($page_values);
    foreach ($page_values as $page_value) {
        $this->crawl_page(new Page($chapter_url . $page_value . '/'), $padded_infix);
    }
}
/**
 * Collect the images of every page in a chapter.
 *
 * The page list is taken from the option values on the second line that
 * contains select class="cbo_wpm_pag". The chapter URL itself serves as
 * the first page; each remaining value is appended to the chapter URL
 * (followed by a slash) to reach the other pages.
 *
 * @param string $chapter_url Address of the chapter's first page.
 * @param mixed  $prefix      Passed through to crawl_page() unchanged.
 * @param mixed  $infix       Padded via Text::pad(3) before being passed on.
 * @return mixed The crawl_page() results combined with the + operator
 *               (presumably arrays, i.e. a key-preserving union).
 */
public function get_images($chapter_url, $prefix, $infix) {
    $padded_infix = Text::create($infix)->pad(3)->to_s();
    $first_page = new Page($chapter_url);

    // Locate the selector, step past it, then locate the next occurrence
    // and read its option values.
    $first_page->go_line('select class="cbo_wpm_pag"');
    $first_page->next_line();
    $first_page->go_line('select class="cbo_wpm_pag"');
    $page_values = $first_page->curr_line()->extract_to_array('value="', '"');

    // The page that is already loaded holds the first image.
    $first_page->reset_line();
    $collected = $this->crawl_page($first_page, $prefix, $padded_infix);

    // Drop the first entry (just handled) and visit the others.
    array_shift($page_values);
    foreach ($page_values as $page_value) {
        $next_page = new Page($chapter_url . $page_value . '/');
        $collected += $this->crawl_page($next_page, $prefix, $padded_infix);
    }

    return $collected;
}
/**
 * Collect the images of every page in a chapter.
 *
 * The number of pages is the text between " of " and "<" on the line that
 * contains id="pageInfo". Page addresses are derived from $chapter_url by
 * replacing a trailing "/1/" with "/<page number>/".
 *
 * @param string $chapter_url Address of page 1 of the chapter.
 * @param mixed  $prefix      Passed through to crawl_page() unchanged.
 * @param mixed  $infix       Padded via Text::pad(3) before being passed on.
 * @return mixed The crawl_page() results combined with the + operator
 *               (presumably arrays, i.e. a key-preserving union).
 */
public function get_images($chapter_url, $prefix, $infix) {
    $padded_infix = Text::create($infix)->pad(3)->to_s();
    $first_page = new Page($chapter_url);

    // Read the page count and derive the address of every page from it.
    $first_page->go_line('id="pageInfo"');
    $total = $first_page->curr_line()->cut_between(' of ', '<')->to_s();
    $page_urls = array();
    for ($number = 1; $number <= $total; $number++) {
        $page_urls[] = preg_replace('/\\/1\\/$/', '/' . $number . '/', $chapter_url);
    }

    // Page 1 is the document that is already loaded.
    $first_page->reset_line();
    $collected = $this->crawl_page($first_page, $prefix, $padded_infix, 1);

    // The slice is re-indexed from 0, so offset 0 corresponds to page 2.
    foreach (array_slice($page_urls, 1) as $offset => $page_url) {
        $page = new Page($page_url);
        $collected += $this->crawl_page($page, $prefix, $padded_infix, $offset + 2);
    }

    return $collected;
}
/**
 * Print one HTML link per post found on a range of listing pages.
 *
 * For each page number the listing at $base_url . '&page=' . <number> is
 * loaded and every href value in it is inspected. Those containing
 * "/post/show" are opened; the href on the id="highres" line becomes the
 * link target and the value on the id="post_old_tags" line, cut to its
 * first 150 characters, becomes the link text.
 *
 * @param string $base_url Listing address to which "&page=N" is appended.
 * @param int    $from     First page number (inclusive).
 * @param int    $to       Last page number (inclusive).
 */
function idol_sankaku2($base_url, $from, $to) {
    $base = 'http://idol.sankakucomplex.com';

    for ($page_no = $from; $page_no <= $to; $page_no++) {
        $listing = new Page($base_url . '&page=' . $page_no);
        $listing_text = new Text($listing->content());
        $hrefs = $listing_text->extract_to_array('href="', '"');

        foreach ($hrefs as $href) {
            $candidate = new Text($href);
            if (!$candidate->contain('/post/show')) {
                continue;
            }

            $post = new Page($base . $href);

            $post->go_line('id="highres"');
            $img = $post->curr_line()->cut_between('href="', '"')->to_s();

            // Rewind before looking for the tag field.
            $post->reset_line();
            $post->go_line('id="post_old_tags"');
            // Keep only the first 150 characters of the tag list.
            $tag = $post->curr_line()->cut_between('value="', '"')->substr(0, 150)->to_s();

            echo "<a href='{$img}'>{$tag}</a><br />\n";
        }
    }
}
/**
 * Load the "/read" page of this entry and store its thumbnail names and
 * full-image URL pattern.
 *
 * The thumbnail list is looked up in three page layouts, tried in order:
 * a `var data = {...}` object, a `var data={...}` object (both expose a
 * `thumbs` member), or a bare `window.params.thumbs` assignment. If none
 * is present but the page carries one of the known "content unavailable"
 * notices, the method returns without changing or saving anything.
 *
 * On success $this->thumbs receives the thumbnail base names joined with
 * '#', $this->pattern receives the image path template with the page
 * placeholder replaced by %s (and https:// rewritten to http://), and
 * $this->save() is called.
 *
 * @throws Exception When the page is an error page ($this->is_deleted is
 *                   set to true first), or when no thumbnail data and no
 *                   known notice is found (the page URL is echoed first).
 */
public function get_detail() {
    $p = new Page(Fakku::$base . $this->url . '/read');
    $content = new Text($p->content());

    // hack: sometimes old urls gone
    if ($content->contain('<title>Error Message</title>')) {
        $this->is_deleted = true;
        throw new Exception($this->url . ' url is gone');
    }

    if ($content->contain('var data = {')) {
        $p->go_line('var data = {');
        $json = $p->curr_line()->dup()->cut_between(' = ', ';')->to_s();
        $obj = json_decode($json);
        $js_thumbs = $obj->thumbs;
    } elseif ($content->contain('var data={')) {
        // Same object as above, emitted without spaces around "=".
        $p->go_line('var data={');
        $json = $p->curr_line()->dup()->cut_between('data=', ';')->to_s();
        $obj = json_decode($json);
        $js_thumbs = $obj->thumbs;
    } elseif ($content->contain('window.params.thumbs')) {
        // Here the assigned value is the thumbnail list itself.
        $p->go_line('window.params.thumbs');
        $json = $p->curr_line()->cut_between('=', ';')->to_s();
        $js_thumbs = json_decode($json);
    } elseif (
        $content->contain('This content has been disabled.')
        || $content->contain('This content is not available in your country')
        || $content->contain('Content does not exist')
    ) {
        // Known "unavailable" notices: nothing to extract, nothing to save.
        return;
    } else {
        echo $p->url();
        throw new Exception('where is thumbs?');
    }

    $thumbs = array();
    foreach ($js_thumbs as $tpath) {
        $thumbs[] = basename($tpath);
    }
    $this->thumbs = implode('#', $thumbs);

    // grab full image pattern
    $p->go_line('function imgpath(');
    $p->go_line('return \'');
    if ($p->curr_line()->contain('return \'')) {
        $imgpath = $p->curr_line()->dup()->cut_between("return '", "';")->to_s();
        $imgpath = str_replace("' + x + '", '%s', $imgpath);
    } else {
        // Variant of the script written without whitespace; search again
        // from the top of the page.
        $p->reset_line();
        $p->go_line('function imgpath(');
        $p->go_line('return\'');
        $imgpath = $p->curr_line()->dup()->cut_between("return'", "';")->to_s();
        $imgpath = str_replace("'+x+'", '%s', $imgpath);
    }
    $imgpath = str_replace("https://", 'http://', $imgpath);
    $this->pattern = $imgpath;

    $this->save();
}
/**
 * Print an HTML link to the original image behind every thumbnail of a
 * paginated listing.
 *
 * Each listing page is scanned from the first line containing class="thumb"
 * until a line containing <center> is reached. Every line on the way that
 * carries an href is treated as a link to a post page, whose
 * "Original image" line supplies the link target. The link text is the
 * value of the tags= parameter of the initial $url. The walk follows the
 * paginator's "next" link until there is none.
 *
 * Note: the printed href is the text following 'href="http:' on the
 * "Original image" line, i.e. the scheme is not part of it.
 *
 * Example addresses kept from the original notes:
 *   img2.rule34.xxx/rule34/thumbnails/1202/thumbnail_fc0d335a14ffbdbb861bdabb8afd8bd6.jpeg?1228439
 *   http://img.rule34.xxx/rule34//images/1202/fc0d335a14ffbdbb861bdabb8afd8bd6.jpeg
 *
 * @param string $url First listing page; expected to contain "tags=".
 */
function rule34xxx($url) {
    $domain = 'http://rule34.xxx/';
    $base = 'http://rule34.xxx/index.php';

    $tag_match = Text::create($url)->regex_match('/tags=([^&]+)/');
    $tags = $tag_match[1];

    while (true) {
        echo $url . "<br>\n";
        $listing = new Page($url);
        $listing->go_line('class="thumb"');

        // Walk the thumbnail area line by line; <center> marks its end.
        while (true) {
            if ($listing->curr_line()->contain('href="')) {
                $href = htmlspecialchars_decode(
                    $listing->curr_line()->cut_between('href="', '"')->to_s()
                );
                $post_url = $domain . $href;
                echo $post_url . "<br>\n";

                $post = new Page($post_url);
                $post->go_line('Original image');
                $src = $post->curr_line()->cut_between('href="http:', '"');
                echo "<a href='{$src}'>{$tags}</a><br>\n";
            }

            if ($listing->next_line()->contain('<center>')) {
                break;
            }
        }

        // Follow the "next" link of the paginator, or stop on the last page.
        $listing->reset_line();
        $listing->go_line('id="paginator"');
        if (!$listing->curr_line()->contain('alt="next"')) {
            break;
        }
        $match = $listing->curr_line()->regex_match('/href="([^"]+)" alt="next"/');
        $url = $base . html_entity_decode($match[1]);
    }
}