コード例 #1
0
 /**
  * Walk the paginated post listing at $url and hand every post's
  * "highres" image link to download_if_not_exist().
  *
  * Pages $this->page_from .. $this->page_to (inclusive) are visited by
  * appending "&page=N" to $url. The walk stops early as soon as a listing
  * page contains no '/post/show' links.
  *
  * @param string $url Listing URL; "&page=N" is appended to it, so it is
  *                    expected to already carry a query string.
  * @param mixed  $dir Passed through unchanged to download_if_not_exist().
  * @return void
  */
 private function collect_images($url, $dir)
 {
     // strpos() yields 0 for a match at offset 0, which is falsy; compare
     // against false explicitly so the check means "contains".
     if (strpos($url, '/idol.') !== false) {
         $base = 'https://idol.sankakucomplex.com';
     } else {
         $base = 'https://chan.sankakucomplex.com';
     }
     // Running counter handed to download_if_not_exist(); only advanced
     // when a post actually yielded an image link.
     $id = 1;
     for ($page = $this->page_from; $page <= $this->page_to; $page++) {
         $purl = $url . '&page=' . $page;
         echo "{$purl}\n";
         // Re-fetch the listing until the response no longer carries the
         // rate-limit marker; the 3 second pause follows every request.
         do {
             $P = new Page($purl, array('become_firefox' => true));
             $T = new Text($P->content());
             sleep(3);
         } while ($T->contain('429 Too many requests'));
         // Keep only the hrefs that point at a post page.
         $posts = array();
         foreach ($T->extract_to_array('href="', '"') as $href) {
             $H = new Text($href);
             if ($H->contain('/post/show')) {
                 $posts[] = $href;
             }
         }
         if (!count($posts)) {
             // No posts on this page: treat it as the end of the listing.
             break;
         }
         foreach ($posts as $href) {
             $kurl = $base . $href;
             echo "{$kurl}\n";
             flush();
             // Same retry-on-429 loop for the individual post page.
             do {
                 $P = new Page($kurl, array('become_firefox' => true));
                 $T = new Text($P->content());
                 sleep(3);
             } while ($T->contain('429 Too many requests'));
             $P->go_line('id=highres');
             $img = $P->curr_line()->cut_between('href="', '"');
             if ($img->to_s()) {
                 $this->download_if_not_exist($img, $dir, $id);
                 $id++;
             } else {
                 echo "No id=highres\n";
             }
         }
     }
 }
コード例 #2
0
 /**
  * Build the list of links found in the table that follows the
  * "divThickBorder" container on the page at $base.
  *
  * The line after the container marker is split on '<tr>'; every fragment
  * containing an href contributes one entry.
  *
  * @param string $base URL of the page to read.
  * @return array List of arrays with keys:
  *               'url'   - the href value,
  *               'desc'  - the link text with tags stripped,
  *               'infix' - the "<digits/dots> :" prefix captured from the
  *                         link text, or null when the text has none.
  */
 public function get_info($base)
 {
     $p = new Page($base);
     $p->go_line('<div class="divThickBorder" style="padding:7px">');
     $raw = $p->next_line()->dup();
     $list = array();
     foreach (explode('<tr>', $raw->to_s()) as $line) {
         $tline = new Text($line);
         if (!$tline->contain('href="')) {
             continue;
         }
         // dup() before each cut so both extractions start from the full row.
         $href = $tline->dup()->cut_between('href="', '"')->to_s();
         $desc = $tline->dup()->cut_between('">', '</a')->to_s();
         // Only read the capture group when the pattern matched; otherwise
         // $m[1] does not exist and reading it raises an undefined-index error.
         $infix = preg_match('/([\\.\\d]+) :/', $desc, $m) ? $m[1] : null;
         $list[] = array('url' => $href, 'desc' => strip_tags($desc), 'infix' => $infix);
     }
     return $list;
 }
コード例 #3
0
ファイル: stoptazmo.php プロジェクト: JerryMaheswara/crawler
 /**
  * Print one HTML link per page of a chapter.
  *
  * The chapter page is expected to contain a line with "var pages = ...;"
  * whose right-hand side is JSON; each decoded entry is read through its
  * `url` and `filename` properties.
  *
  * @param array $v Chapter descriptor; the keys 'url' and 'infix' are read.
  * @return void Output is echoed; nothing is returned.
  */
 public function crawl_chapter($v)
 {
     $ifx = Text::create($v['infix'])->pad(3)->to_s();
     $p = new Page($v['url']);
     $p->go_line('var pages');
     $json = $p->curr_line()->cut_between(' = ', ';');
     // The cast makes explicit the string conversion json_decode() would
     // otherwise perform implicitly on the Text object.
     $list = json_decode((string) $json);
     if (!is_array($list) && !is_object($list)) {
         // No "var pages" line or invalid JSON: nothing to list. Without
         // this guard the foreach below would emit an invalid-argument warning.
         return;
     }
     foreach ($list as $page) {
         $purl = new Text($page->url);
         $name = new Text($page->filename);
         // Short file names get the series prefix and chapter infix prepended.
         if ($name->strlen() < 15) {
             $name = $this->prefix . '-' . $ifx . '-' . $name;
         }
         // Unwrap the original image URL from the resize proxy link.
         if ($purl->contain('resize_img.php')) {
             $purl = $purl->cut_between('resize_img.php?url=', '&width');
         }
         echo "<a href='{$purl}'>{$name}</a><br/>\n";
     }
 }
コード例 #4
0
ファイル: asdf.php プロジェクト: JerryMaheswara/crawler
/**
 * Print an HTML link (image URL, labelled with the post's tags) for every
 * post found on listing pages $from..$to (inclusive) of $base_url.
 *
 * @param string $base_url Listing URL; "&page=N" is appended to it.
 * @param int    $from     First page number to visit.
 * @param int    $to       Last page number to visit.
 * @return void Output is echoed; nothing is returned.
 */
function idol_sankaku2($base_url, $from, $to)
{
    $host = 'http://idol.sankakucomplex.com';
    $pageNo = $from;
    while ($pageNo <= $to) {
        $listing = new Page($base_url . '&page=' . $pageNo);
        $listingText = new Text($listing->content());
        foreach ($listingText->extract_to_array('href="', '"') as $href) {
            $hrefText = new Text($href);
            if (!$hrefText->contain('/post/show')) {
                // Not a link to a post page.
                continue;
            }
            $post = new Page($host . $href);
            $post->go_line('id="highres"');
            $img = $post->curr_line()->cut_between('href="', '"')->to_s();
            // Rewind before searching for the tags field.
            $post->reset_line();
            $post->go_line('id="post_old_tags"');
            // The tag string is truncated to its first 150 characters.
            $tagLine = $post->curr_line()->cut_between('value="', '"');
            $tag = $tagLine->substr(0, 150)->to_s();
            echo "<a href='{$img}'>{$tag}</a><br />\n";
        }
        $pageNo++;
    }
}
コード例 #5
0
ファイル: Fakku.php プロジェクト: JerryMaheswara/crawler
 /**
  * Re-read the tags of every stored Hmanga record from its page and save
  * the record when the tags changed.
  *
  * Records whose page shows the "Error Message" title are deleted.
  * Progress is echoed as HTML lines.
  *
  * @return void
  */
 public function action_update_tags()
 {
     $mangas = Model::factory('Hmanga')->order_by_asc('id')->find_many();
     foreach ($mangas as $manga) {
         echo "ID: {$manga->id} URL is: {$manga->url}<br>\n";
         echo "Old tags: {$manga->tags}<br>\n";
         $page = new Page(self::$base . $manga->url);
         $body = new Text($page->content());
         // An error page means the stored URL no longer exists: drop the record.
         if ($body->contain('<title>Error Message</title>')) {
             $manga->delete();
             echo "DELETED {$manga->url} is gone<br>\n";
             continue;
         }
         // Tags are stored as one '#'-delimited string with leading and
         // trailing '#'.
         $fresh = '#' . implode('#', $this->grab_all_tags($page)) . '#';
         echo "New tags: {$fresh}<br>\n";
         if ($fresh === $manga->tags) {
             continue;
         }
         echo "Tags are different! Save!<br>\n";
         $manga->tags = $fresh;
         $manga->save();
     }
 }
コード例 #6
0
ファイル: what.php プロジェクト: JerryMaheswara/crawler
/**
 * Print an HTML link for every thumbnail of a gallery, following the
 * "next" pagination link until there is none.
 *
 * @param string $gal URL of the first gallery page.
 * @return void Output is echoed; nothing is returned.
 */
function deviantart($gal)
{
    // The host is taken from between "://" and the next "/". Only read the
    // capture group when the pattern matched; otherwise $m[1] does not
    // exist and reading it raises an undefined-index error.
    $host = preg_match('/:\\/\\/([^\\/]+)\\//', $gal, $m) ? $m[1] : '';
    $base = 'http://' . $host;
    do {
        echo "{$gal}<br>";
        $p = new Page($gal);
        $p->go_line('id="gmi-ResourceStream"');
        $raw = explode('<a class="thumb', $p->curr_line()->to_s());
        // Element 0 is the text before the first thumbnail anchor; skip it.
        foreach (array_slice($raw, 1) as $chunk) {
            $line = new Text($chunk);
            $name = $line->dup()->cut_between('<b>', '</')->to_s();
            // Prefer the "super_fullimg" attribute when the thumbnail has one.
            if ($line->contain('super_fullimg=')) {
                $src = $line->dup()->cut_between('super_fullimg="', '"')->to_s();
            } else {
                $src = $line->dup()->cut_between('super_img="', '"')->to_s();
            }
            echo "<a href='{$src}'>{$name}</a><br>\n";
        }
        // Is there a next page?
        $p->go_line('<li class="next">');
        $next = $p->curr_line()->contain('<li class="next"><a class="away" href="');
        if ($next) {
            $part = $p->curr_line()->dup()->cut_between('<li class="next"><a class="away" href="', '"');
            $gal = $base . $part;
        }
    } while ($next);
}
コード例 #7
0
ファイル: h.php プロジェクト: JerryMaheswara/crawler
/**
 * Walk the paginated post listing at $url and print an HTML link to each
 * post's image, labelled with the searched tag.
 *
 * Reads two request parameters:
 *  - $_GET['limit']: when set, the highest page number to visit;
 *  - $_GET['hires']: when set, link the "highres" image instead of the
 *    "lowres" one.
 * Paging stops when the limit is passed or a listing page contains no
 * '/post/show' links.
 *
 * @param string $url Listing URL; "&page=N" is appended to it.
 * @return void Output is echoed; nothing is returned.
 */
function sankakucomplex($url)
{
    // strpos() yields 0 for a match at offset 0, which is falsy; compare
    // against false explicitly so the check means "contains".
    if (strpos($url, '/idol.') !== false) {
        $base = 'https://idol.sankakucomplex.com';
    } else {
        $base = 'https://chan.sankakucomplex.com';
    }
    // Link label: the decoded "tags=" value of the URL, or a unique id
    // when the URL has no tags parameter.
    $tag = uniqid();
    $Turl = Text::create($url);
    if ($Turl->contain('tags=')) {
        $tag = $Turl->cut_after('tags=')->urldecode()->to_s();
    }
    for ($page = 1; ; $page++) {
        if (isset($_GET['limit']) && $page > $_GET['limit']) {
            break;
        }
        $purl = $url . '&page=' . $page;
        echo "{$purl}<br>\n";
        // Re-fetch the listing until the response no longer carries the
        // rate-limit marker; the 3 second pause follows every request.
        do {
            $P = new Page($purl, array('become_firefox' => true));
            $T = new Text($P->content());
            sleep(3);
        } while ($T->contain('429 Too many requests'));
        // Keep only the hrefs that point at a post page.
        $posts = array();
        foreach ($T->extract_to_array('href="', '"') as $href) {
            $H = new Text($href);
            if ($H->contain('/post/show')) {
                $posts[] = $href;
            }
        }
        if (!count($posts)) {
            // No posts on this page: treat it as the end of the listing.
            break;
        }
        foreach ($posts as $href) {
            $kurl = $base . $href;
            echo "{$kurl}<br>\n";
            flush();
            // Same retry-on-429 loop for the individual post page.
            do {
                $P = new Page($kurl, array('become_firefox' => true));
                $T = new Text($P->content());
                sleep(3);
            } while ($T->contain('429 Too many requests'));
            $P->go_line(isset($_GET['hires']) ? 'id=highres' : 'id=lowres');
            if ($P->end_of_line()) {
                // Requested variant not found: rewind and fall back to highres.
                $P->reset_line();
                $P->go_line('id=highres');
            }
            $img = $P->curr_line()->cut_between('href="', '"')->to_s();
            if ($img) {
                echo "<a href='{$img}'>{$tag}</a><br />\n";
                flush();
            } else {
                echo "This is flash<br />\n";
            }
        }
    }
}
コード例 #8
0
ファイル: Hbrowse.php プロジェクト: JerryMaheswara/crawler
 /**
  * Collect the name="..." attribute values of the advanced-search page,
  * group them by the part before the first '_' separator, and store one
  * `reference` row per group.
  *
  * Each row gets the group key as `id` and the group's values as one
  * '#'-delimited string (with leading and trailing '#') as `val`.
  *
  * @return void
  */
 public function action_fill_reference()
 {
     $p = new Page('http://www.hbrowse.com/advance');
     $raw = new Text($p->content());
     $names = array_unique($raw->extract_to_array('name="', '"'));
     $ref = array();
     foreach ($names as $raw_name) {
         $name = new Text($raw_name);
         if (!$name->contain('_')) {
             continue;
         }
         // Cut from copies so the second cut always sees the full name.
         // NOTE(review): whether Text::cut_* modifies its receiver is not
         // visible here; other callers dup() first - confirm.
         $key = $name->dup()->cut_before('_')->to_s();
         $val = $name->dup()->cut_after('_')->to_s();
         $ref[$key][] = $val;
     }
     // Insert one row per group.
     foreach ($ref as $key => $val) {
         $reference = ORM::for_table('reference')->create();
         $reference->id = $key;
         $reference->val = '#' . implode('#', $val) . '#';
         $reference->save();
     }
 }