/**
 * Walk a paginated thumbnail listing and download the original image of every linked post.
 *
 * For each listing page, the lines from the first `class="thumb"` line up to the
 * line containing `<center>` are scanned for `href="` links. Every linked post
 * page is fetched, the link on its "Original image" line is read, and the image
 * is handed to download_it(). The paginator's `alt="next"` link is then followed;
 * crawling stops on the first page without such a link.
 *
 * Each visited listing URL and post URL is echoed, one per line.
 *
 * @param string $url First listing page to crawl.
 * @param string $dir Prefix concatenated directly with the image base name to form the
 *                    output path (presumably ends with a directory separator — verify against callers).
 */
private function collect_images($url, $dir)
 {
     $site = 'http://rule34.xxx/';
     $index = 'http://rule34.xxx/index.php';
     while (true) {
         echo $url . "\n";
         $listing = new Page($url);
         $listing->go_line('class="thumb"');
         do {
             if ($listing->curr_line()->contain('href="')) {
                 $href = htmlspecialchars_decode(
                     $listing->curr_line()->cut_between('href="', '"')->to_s()
                 );
                 $post_url = $site . $href;
                 echo $post_url . "\n";
                 $post = new Page($post_url);
                 $post->go_line('Original image');
                 // The scheme is part of the cut marker, so it is put back in front of the result.
                 $image_url = 'http:' . $post->curr_line()->cut_between('href="http:', '"')->to_s();
                 download_it($image_url, $dir . basename($image_url), '--header="Accept: image/*"');
             }
         } while (!$listing->next_line()->contain('<center>'));
         // Rewind before looking for the paginator line.
         $listing->reset_line();
         $listing->go_line('id="paginator"');
         if (!$listing->curr_line()->contain('alt="next"')) {
             break;
         }
         $match = $listing->curr_line()->regex_match('/href="([^"]+)" alt="next"/');
         $url = $index . html_entity_decode($match[1]);
     }
 }
 /**
  * Build one row of HTML snippets for every comic page numbered $s through $n.
  *
  * For each number the page `http://nn4b.com/?webcomic1=<number>` is fetched, and
  * the `content="…"` value of its `"og:image"` line and the `href='…'` value of its
  * `link rel='next'` line are read.
  *
  * @param mixed $columns Not used by this method.
  * @param int   $s       First page number (inclusive).
  * @param int   $n       Last page number (inclusive).
  * @param mixed $url     Not used by this method.
  *
  * @return array List of rows; each row has the keys 'image', 'link' and 'next',
  *               holding an <img> tag and two <a> tags respectively.
  */
 public function extract($columns, $s, $n, $url)
 {
     $rows = [];
     for ($number = $s; $number <= $n; $number++) {
         $page_url = sprintf('http://nn4b.com/?webcomic1=%s', $number);
         $page = new Page($page_url);

         $page->go_line('"og:image"');
         $image_src = $page->curr_line()->cut_between('content="', '"')->to_s();

         // Rewind before searching for the "next" link line.
         $page->reset_line();
         $page->go_line("link rel='next'");
         $next_href = $page->curr_line()->cut_between("href='", "'")->to_s();

         $rows[] = [
             'image' => "<img src='{$image_src}'>",
             'link' => "<a href='{$page_url}'>Link</a>",
             'next' => "<a href='{$next_href}'>Next</a>",
         ];
     }
     return $rows;
 }
Example #3
0
 /**
  * Crawl every page of one chapter, handing each fetched page to crawl_page().
  *
  * The page list is read from the `value="…"` entries on a
  * `select class="cbo_wpm_pag"` line of the chapter's first page. The first entry
  * is skipped because the first page has already been fetched; every other entry
  * is appended to the chapter URL, followed by '/'.
  *
  * @param array $v Chapter descriptor; 'url' is the chapter URL and 'infix' is a
  *                 value passed through Text::pad(3) before being given to crawl_page().
  */
 public function crawl_chapter($v)
 {
     $infix = Text::create($v['infix'])->pad(3)->to_s();
     $chapter_url = $v['url'];
     $first = new Page($chapter_url);

     // Search, step one line, search again — presumably to land on the second
     // occurrence of the page selector (TODO confirm against Page::go_line).
     $first->go_line('select class="cbo_wpm_pag"');
     $first->next_line();
     $first->go_line('select class="cbo_wpm_pag"');
     $page_ids = $first->curr_line()->extract_to_array('value="', '"');

     // The already-loaded first page is processed from its beginning.
     $first->reset_line();
     $this->crawl_page($first, $infix);

     // Remaining pages: everything after the first list entry.
     foreach (array_slice($page_ids, 1) as $page_id) {
         $this->crawl_page(new Page($chapter_url . $page_id . '/'), $infix);
     }
 }
 /**
  * Collect the crawl_page() results for every page of a chapter.
  *
  * The page list is read from the `value="…"` entries on a
  * `select class="cbo_wpm_pag"` line of the chapter's first page. The first entry
  * is skipped because the first page has already been fetched; every other entry
  * is appended to the chapter URL, followed by '/'.
  *
  * @param string $chapter_url URL of the chapter's first page.
  * @param mixed  $prefix      Passed through unchanged to crawl_page().
  * @param mixed  $infix       Passed through Text::pad(3) before being given to crawl_page().
  *
  * @return mixed The per-page crawl_page() results combined with the `+` operator
  *               (presumably arrays, in which case entries from earlier pages win on
  *               duplicate keys — verify against crawl_page).
  */
 public function get_images($chapter_url, $prefix, $infix)
 {
     $padded = Text::create($infix)->pad(3)->to_s();
     $first = new Page($chapter_url);

     // Search, step one line, search again — presumably to land on the second
     // occurrence of the page selector (TODO confirm against Page::go_line).
     $first->go_line('select class="cbo_wpm_pag"');
     $first->next_line();
     $first->go_line('select class="cbo_wpm_pag"');
     $page_ids = $first->curr_line()->extract_to_array('value="', '"');

     // The already-loaded first page is processed from its beginning.
     $first->reset_line();
     $images = $this->crawl_page($first, $prefix, $padded);

     // Remaining pages: everything after the first list entry.
     foreach (array_slice($page_ids, 1) as $page_id) {
         $images += $this->crawl_page(new Page($chapter_url . $page_id . '/'), $prefix, $padded);
     }
     return $images;
 }
 /**
  * Collect the crawl_page() results for every page of a chapter.
  *
  * The page count is the text between ' of ' and '<' on the `id="pageInfo"` line
  * of the first page. The URL of page k is the chapter URL with its trailing
  * `/1/` replaced by `/k/`.
  *
  * @param string $chapter_url URL of the chapter's first page.
  * @param mixed  $prefix      Passed through unchanged to crawl_page().
  * @param mixed  $infix       Passed through Text::pad(3) before being given to crawl_page().
  *
  * @return mixed The per-page crawl_page() results combined with the `+` operator
  *               (presumably arrays, in which case entries from earlier pages win on
  *               duplicate keys — verify against crawl_page).
  */
 public function get_images($chapter_url, $prefix, $infix)
 {
     $padded = Text::create($infix)->pad(3)->to_s();
     $first = new Page($chapter_url);

     // Total number of pages, still as the string cut from the page.
     $first->go_line('id="pageInfo"');
     $total = $first->curr_line()->cut_between(' of ', '<')->to_s();

     // Page 1 is the page that is already loaded.
     $first->reset_line();
     $images = $this->crawl_page($first, $prefix, $padded, 1);

     // Pages 2..$total are fetched one by one.
     for ($number = 2; $number <= $total; $number++) {
         $page_url = preg_replace('/\\/1\\/$/', '/' . $number . '/', $chapter_url);
         $images += $this->crawl_page(new Page($page_url), $prefix, $padded, $number);
     }
     return $images;
 }
Example #6
0
/**
 * Echo an HTML link to the high-resolution image of every post found on a range of listing pages.
 *
 * Listing page i is fetched from `$base_url . '&page=' . i`. Every `href="…"` value
 * on it that contains '/post/show' is treated as a post path on
 * http://idol.sankakucomplex.com; the post page is fetched and one
 * `<a href='IMAGE'>TAGS</a><br />` line is printed for it.
 *
 * @param string $base_url Listing URL to which '&page=<number>' is appended.
 * @param int    $from     First listing page number (inclusive).
 * @param int    $to       Last listing page number (inclusive).
 */
function idol_sankaku2($base_url, $from, $to)
{
    $site = 'http://idol.sankakucomplex.com';
    for ($page_no = $from; $page_no <= $to; $page_no++) {
        $listing = new Page($base_url . '&page=' . $page_no);
        $listing_text = new Text($listing->content());
        $hrefs = $listing_text->extract_to_array('href="', '"');
        foreach ($hrefs as $href) {
            $href_text = new Text($href);
            if (!$href_text->contain('/post/show')) {
                continue;
            }
            $post = new Page($site . $href);
            $post->go_line('id="highres"');
            $image = $post->curr_line()->cut_between('href="', '"')->to_s();
            // Rewind before searching for the tag field.
            $post->reset_line();
            $post->go_line('id="post_old_tags"');
            // The tag list is truncated with substr(0, 150).
            $tags = $post->curr_line()->cut_between('value="', '"')->substr(0, 150)->to_s();
            echo "<a href='{$image}'>{$tags}</a><br />\n";
        }
    }
}
Example #7
0
 /**
  * Load this item's "/read" page and store its thumbnail list and full-image URL pattern.
  *
  * The thumbnail paths are taken from whichever of three inline-script layouts the
  * page uses (`var data = {…}`, `var data={…}` or `window.params.thumbs=…`). Their
  * base names are joined with '#' into $this->thumbs. The string returned by the
  * page's `imgpath(` function is turned into a pattern with '%s' in place of the
  * concatenated `x`, forced to http://, stored in $this->pattern, and the object
  * is saved via $this->save().
  *
  * Pages reporting disabled, region-blocked or non-existent content are left
  * untouched: the method returns without modifying or saving anything.
  *
  * @throws Exception When the page is an error page ($this->is_deleted is set to true
  *                   first), or when none of the known thumbnail layouts or notices is found
  *                   (the page URL is echoed first).
  */
 public function get_detail()
 {
     $p = new Page(Fakku::$base . $this->url . '/read');
     $content = new Text($p->content());
     // hack: sometimes old urls gone
     if ($content->contain('<title>Error Message</title>')) {
         $this->is_deleted = true;
         throw new Exception($this->url . ' url is gone');
     }

     // Locate the thumbnail list; the page has used several script layouts over time.
     if ($content->contain('var data = {')) {
         $p->go_line('var data = {');
         $json = $p->curr_line()->dup()->cut_between(' = ', ';')->to_s();
         $obj = json_decode($json);
         $js_thumbs = $obj->thumbs;
     } elseif ($content->contain('var data={')) {
         $p->go_line('var data={');
         $json = $p->curr_line()->dup()->cut_between('data=', ';')->to_s();
         $obj = json_decode($json);
         $js_thumbs = $obj->thumbs;
     } elseif ($content->contain('window.params.thumbs')) {
         $p->go_line('window.params.thumbs');
         $json = $p->curr_line()->cut_between('=', ';')->to_s();
         $js_thumbs = json_decode($json);
     } elseif (
         $content->contain('This content has been disabled.')
         || $content->contain('This content is not available in your country')
         || $content->contain('Content does not exist')
     ) {
         // Known "no content" notices: nothing to extract, nothing to save.
         return;
     } else {
         echo $p->url();
         throw new Exception('where is thumbs?');
     }

     // Only the file names of the thumbnails are stored.
     $thumbs = array();
     foreach ($js_thumbs as $tpath) {
         $thumbs[] = basename($tpath);
     }
     $this->thumbs = implode('#', $thumbs);

     // grab full image pattern
     $p->go_line('function imgpath(');
     $p->go_line('return \'');
     if ($p->curr_line()->contain('return \'')) {
         $imgpath = $p->curr_line()->dup()->cut_between("return '", "';")->to_s();
         $imgpath = str_replace("' + x + '", '%s', $imgpath);
     } else {
         // Fall back to the variant of the script written without spaces.
         $p->reset_line();
         $p->go_line('function imgpath(');
         $p->go_line('return\'');
         $imgpath = $p->curr_line()->dup()->cut_between("return'", "';")->to_s();
         $imgpath = str_replace("'+x+'", '%s', $imgpath);
     }
     $imgpath = str_replace("https://", 'http://', $imgpath);
     $this->pattern = $imgpath;
     $this->save();
 }
Example #8
0
/**
 * Walk a paginated thumbnail listing and echo an HTML link to the original image of every post.
 *
 * For each listing page, the lines from the first `class="thumb"` line up to the
 * line containing `<center>` are scanned for `href="` links. Every linked post
 * page is fetched and the link on its "Original image" line is printed as
 * `<a href='…'>TAGS</a>`, where TAGS is the `tags=` value of the initial $url.
 * The paginator's `alt="next"` link is then followed until a page has none.
 *
 * Each visited listing URL and post URL is echoed as well, followed by `<br>`.
 *
 * Example thumbnail and image locations:
 *   img2.rule34.xxx/rule34/thumbnails/1202/thumbnail_fc0d335a14ffbdbb861bdabb8afd8bd6.jpeg?1228439
 *   http://img.rule34.xxx/rule34//images/1202/fc0d335a14ffbdbb861bdabb8afd8bd6.jpeg
 *
 * @param string $url First listing page to crawl; expected to carry a `tags=` query value.
 */
function rule34xxx($url)
{
    $site = 'http://rule34.xxx/';
    $index = 'http://rule34.xxx/index.php';
    $tag_match = Text::create($url)->regex_match('/tags=([^&]+)/');
    $tags = $tag_match[1];
    while (true) {
        echo $url . "<br>\n";
        $listing = new Page($url);
        $listing->go_line('class="thumb"');
        do {
            if ($listing->curr_line()->contain('href="')) {
                $href = htmlspecialchars_decode(
                    $listing->curr_line()->cut_between('href="', '"')->to_s()
                );
                $post_url = $site . $href;
                echo $post_url . "<br>\n";
                $post = new Page($post_url);
                $post->go_line('Original image');
                // The cut starts after `href="http:`, so the printed target has no scheme.
                $image = $post->curr_line()->cut_between('href="http:', '"');
                echo "<a href='{$image}'>{$tags}</a><br>\n";
            }
        } while (!$listing->next_line()->contain('<center>'));
        // Rewind before looking for the paginator line.
        $listing->reset_line();
        $listing->go_line('id="paginator"');
        if (!$listing->curr_line()->contain('alt="next"')) {
            break;
        }
        $match = $listing->curr_line()->regex_match('/href="([^"]+)" alt="next"/');
        $url = $index . html_entity_decode($match[1]);
    }
}