/**
 * Walks the paginated listing at $url, from $this->page_from up to and
 * including $this->page_to, and hands the "highres" link of every post found
 * on each listing page to download_if_not_exist().
 *
 * Iteration stops early as soon as a listing page contains no '/post/show' links.
 *
 * @param string $url Listing URL; '&page=N' is appended to it for every page.
 * @param mixed  $dir Passed through unchanged to download_if_not_exist().
 */
private function collect_images($url, $dir)
{
    // strpos() returns 0 for a match at the very start of the string, which is
    // falsy, so the result has to be compared against false explicitly.
    if (strpos($url, '/idol.') !== false) {
        $base = 'https://idol.sankakucomplex.com';
    } else {
        $base = 'https://chan.sankakucomplex.com';
    }

    // Running counter passed to download_if_not_exist(); only advanced for
    // posts that actually had a highres link.
    $id = 1;

    for ($page = $this->page_from; $page <= $this->page_to; $page++) {
        $purl = $url . '&page=' . $page;
        echo "{$purl}\n";

        // Fetch the listing page, repeating while the response body is the
        // rate-limit message. The pause runs after every request.
        do {
            $P = new Page($purl, array('become_firefox' => true));
            $T = new Text($P->content());
            sleep(3); // 429 too many requests
        } while ($T->contain('429 Too many requests'));

        // Keep only the hrefs that point at a post page.
        $a = $T->extract_to_array('href="', '"');
        foreach ($a as $i => $e) {
            $E = new Text($e);
            if (!$E->contain('/post/show')) {
                unset($a[$i]);
            }
        }

        // A listing page without posts ends the crawl.
        if (!count($a)) {
            break;
        }

        foreach ($a as $e) {
            $kurl = $base . $e;
            echo "{$kurl}\n";
            flush();

            // Same fetch-and-retry as for the listing page.
            do {
                $P = new Page($kurl, array('become_firefox' => true));
                $T = new Text($P->content());
                sleep(3); // 429 too many requests
            } while ($T->contain('429 Too many requests'));

            $P->go_line('id=highres');
            $img = $P->curr_line()->cut_between('href="', '"');

            if ($img->to_s()) {
                $this->download_if_not_exist($img, $dir, $id);
                $id++;
            } else {
                echo "No id=highres\n";
            }
        }
    }
}
/**
 * Builds the list of links found in the table row line that follows the
 * 'divThickBorder' marker on the page at $base.
 *
 * @param string $base URL of the page to read.
 *
 * @return array List of arrays with the keys 'url' (href value), 'desc'
 *               (link text with tags stripped) and 'infix' (the digits/dots
 *               that precede " :" in the link text, or null when the link
 *               text has no such prefix).
 */
public function get_info($base)
{
    $p = new Page($base);
    $p->go_line('<div class="divThickBorder" style="padding:7px">');
    $raw = $p->next_line()->dup();

    $list = array();
    foreach (explode('<tr>', $raw->to_s()) as $line) {
        $tline = new Text($line);
        if (!$tline->contain('href="')) {
            continue;
        }

        // dup() before each cut so both extractions start from the full row.
        $href = $tline->dup()->cut_between('href="', '"')->to_s();
        $desc = $tline->dup()->cut_between('">', '</a')->to_s();

        // Only read the capture group when the pattern actually matched;
        // otherwise $m[1] does not exist and PHP raises an undefined-offset
        // warning. null keeps the value the old code ended up with.
        $infix = preg_match('/([\\.\\d]+) :/', $desc, $m) ? $m[1] : null;

        $list[] = array(
            'url' => $href,
            'desc' => strip_tags($desc),
            'infix' => $infix,
        );
    }

    return $list;
}
/**
 * Prints one HTML anchor per entry of the `var pages = ...;` JSON found on a
 * chapter page.
 *
 * File names shorter than 15 characters are prefixed with
 * "<$this->prefix>-<infix>-". URLs that go through 'resize_img.php' are
 * replaced by the value of their 'url=' query parameter.
 *
 * @param array $v Entry with the keys 'url' (chapter page URL) and 'infix'.
 */
public function crawl_chapter($v)
{
    $ifx = Text::create($v['infix'])->pad(3)->to_s();

    $p = new Page($v['url']);
    $p->go_line('var pages');
    $json = $p->curr_line()->cut_between(' = ', ';');
    $list = json_decode($json->to_s());

    // json_decode() yields null when the line is missing or not valid JSON;
    // iterating that would only raise a warning, so stop here instead.
    if (!is_array($list) && !is_object($list)) {
        return;
    }

    foreach ($list as $page) {
        $purl = new Text($page->url);
        $name = new Text($page->filename);

        if ($name->strlen() < 15) {
            $name = $this->prefix . '-' . $ifx . '-' . $name;
        }
        if ($purl->contain('resize_img.php')) {
            $purl = $purl->cut_between('resize_img.php?url=', '&width');
        }

        echo "<a href='{$purl}'>{$name}</a><br/>\n";
    }
}
/**
 * Prints an HTML anchor (highres link + old tags as label) for every post
 * linked from listing pages $from..$to of $base_url.
 *
 * @param string $base_url Listing URL; '&page=N' is appended for each page.
 * @param int    $from     First page number (inclusive).
 * @param int    $to       Last page number (inclusive).
 */
function idol_sankaku2($base_url, $from, $to)
{
    $base = 'http://idol.sankakucomplex.com';

    for ($pageNo = $from; $pageNo <= $to; $pageNo++) {
        $listing = new Page($base_url . '&page=' . $pageNo);
        $listingText = new Text($listing->content());
        $links = $listingText->extract_to_array('href="', '"');

        foreach ($links as $href) {
            $hrefText = new Text($href);
            if (!$hrefText->contain('/post/show')) {
                continue;
            }

            $post = new Page($base . $href);

            // Link of the full-size image.
            $post->go_line('id="highres"');
            $img = $post->curr_line()->cut_between('href="', '"')->to_s();

            // Start over from the top before looking for the tags field.
            $post->reset_line();
            $post->go_line('id="post_old_tags"');
            $tagLine = $post->curr_line()->cut_between('value="', '"');
            // The label is limited to the first 150 characters of the tags.
            $tag = $tagLine->substr(0, 150)->to_s();

            echo "<a href='{$img}'>{$tag}</a><br />\n";
        }
    }
}
/**
 * Re-reads the tags of every Hmanga record (ordered by id) from its page and
 * saves the record when the tag string changed.
 *
 * Records whose page shows the "Error Message" title are deleted.
 * Progress is echoed as HTML lines.
 */
public function action_update_tags()
{
    $records = Model::factory('Hmanga')->order_by_asc('id')->find_many();

    foreach ($records as $manga) {
        echo "ID: {$manga->id} URL is: {$manga->url}<br>\n";
        echo "Old tags: {$manga->tags}<br>\n";

        $page = new Page(self::$base . $manga->url);
        $body = new Text($page->content());

        // Old URLs sometimes disappear; drop the record in that case.
        $gone = $body->contain('<title>Error Message</title>');
        if ($gone) {
            $manga->delete();
            echo "DELETED {$manga->url} is gone<br>\n";
            continue;
        }

        // Tags are stored as a single '#'-delimited string: #a#b#c#
        $fresh = '#' . implode('#', $this->grab_all_tags($page)) . '#';
        echo "New tags: {$fresh}<br>\n";

        if ($fresh === $manga->tags) {
            continue;
        }

        echo "Tags are different! Save!<br>\n";
        $manga->tags = $fresh;
        $manga->save();
    }
}
/**
 * Prints an HTML anchor for every thumbnail of a gallery page and keeps
 * following the "next" link until a page has none.
 *
 * @param string $gal URL of the first gallery page.
 */
function deviantart($gal)
{
    // Host part of the gallery URL; relative "next" links are appended to it.
    preg_match('/:\\/\\/([^\\/]+)\\//', $gal, $m);
    $base = 'http://' . $m[1];

    $nextPrefix = '<li class="next"><a class="away" href="';

    while (true) {
        echo "{$gal}<br>";

        $p = new Page($gal);
        $p->go_line('id="gmi-ResourceStream"');
        $chunks = explode('<a class="thumb', $p->curr_line()->to_s());

        // The first chunk is whatever precedes the first thumbnail, so skip it.
        foreach (array_slice($chunks, 1) as $chunk) {
            $line = new Text($chunk);
            $name = $line->dup()->cut_between('<b>', '</')->to_s();

            // Prefer the larger image attribute when the thumbnail has one.
            $attr = $line->contain('super_fullimg=') ? 'super_fullimg="' : 'super_img="';
            $src = $line->dup()->cut_between($attr, '"')->to_s();

            echo "<a href='{$src}'>{$name}</a><br>\n";
        }

        // Is there a next page?
        $p->go_line('<li class="next">');
        if (!$p->curr_line()->contain($nextPrefix)) {
            break;
        }

        $part = $p->curr_line()->dup()->cut_between($nextPrefix, '"');
        $gal = $base . $part;
    }
}
/**
 * Walks the paginated listing at $url starting from page 1 and prints an HTML
 * anchor for the image link of every post found.
 *
 * Reads two optional request parameters:
 *  - $_GET['limit']: stop once the page number exceeds this value;
 *  - $_GET['hires']: when set, look for the 'id=highres' line first instead
 *    of 'id=lowres'. If the first search finds nothing, 'id=highres' is
 *    searched from the top.
 *
 * Iteration also stops as soon as a listing page has no '/post/show' links.
 *
 * @param string $url Listing URL; '&page=N' is appended to it for every page.
 */
function sankakucomplex($url)
{
    // strpos() returns 0 for a match at the very start of the string, which is
    // falsy, so the result has to be compared against false explicitly.
    if (strpos($url, '/idol.') !== false) {
        $base = 'https://idol.sankakucomplex.com';
    } else {
        $base = 'https://chan.sankakucomplex.com';
    }

    // Label used for every printed link: the decoded 'tags=' value of the URL
    // when present, otherwise a generated unique id.
    $tag = uniqid();
    $Turl = Text::create($url);
    if ($Turl->contain('tags=')) {
        $tag = $Turl->cut_after('tags=')->urldecode()->to_s();
    }

    for ($page = 1; ; $page++) {
        if (isset($_GET['limit']) && $page > $_GET['limit']) {
            break;
        }

        $purl = $url . '&page=' . $page;
        echo "{$purl}<br>\n";

        // Fetch the listing page, repeating while the response body is the
        // rate-limit message. The pause runs after every request.
        do {
            $P = new Page($purl, array('become_firefox' => true));
            $T = new Text($P->content());
            sleep(3); // 429 too many requests
        } while ($T->contain('429 Too many requests'));

        // Keep only the hrefs that point at a post page.
        $a = $T->extract_to_array('href="', '"');
        foreach ($a as $i => $e) {
            $E = new Text($e);
            if (!$E->contain('/post/show')) {
                unset($a[$i]);
            }
        }

        // A listing page without posts ends the crawl.
        if (!count($a)) {
            break;
        }

        foreach ($a as $e) {
            $kurl = $base . $e;
            echo "{$kurl}<br>\n";
            flush();

            // Same fetch-and-retry as for the listing page.
            do {
                $P = new Page($kurl, array('become_firefox' => true));
                $T = new Text($P->content());
                sleep(3); // 429 too many requests
            } while ($T->contain('429 Too many requests'));

            if (isset($_GET['hires'])) {
                $P->go_line('id=highres');
            } else {
                $P->go_line('id=lowres');
            }
            // Nothing found: rewind and fall back to the highres line.
            if ($P->end_of_line()) {
                $P->reset_line();
                $P->go_line('id=highres');
            }

            $img = $P->curr_line()->cut_between('href="', '"')->to_s();

            if ($img) {
                echo "<a href='{$img}'>{$tag}</a><br />\n";
                flush();
            } else {
                echo "This is flash<br />\n";
            }
        }
    }
}
/**
 * Fills the 'reference' table from the form field names found on the
 * hbrowse "advance" page.
 *
 * Every distinct name="..." value containing an underscore is split at the
 * underscore into a key and a value. One row is created per key, with the
 * collected values stored as a '#'-delimited string (#a#b#c#).
 */
public function action_fill_reference()
{
    $p = new Page('http://www.hbrowse.com/advance');
    $raw = new Text($p->content());
    $names = array_unique($raw->extract_to_array('name="', '"'));

    $ref = array();
    foreach ($names as $raw_name) {
        $name = new Text($raw_name);
        if (!$name->contain('_')) {
            continue;
        }

        // Cut on copies so that both cuts see the complete name.
        // NOTE(review): other callers dup() a Text before cutting, which
        // suggests cut_* may modify the object in place; without the copies
        // the second cut would then run on the already shortened name.
        // Confirm against the Text class.
        $key = $name->dup()->cut_before('_')->to_s();
        $val = $name->dup()->cut_after('_')->to_s();

        $ref[$key][] = $val;
    }

    // Insert one row per key.
    foreach ($ref as $key => $val) {
        $reference = ORM::for_table('reference')->create();
        $reference->id = $key;
        $reference->val = '#' . implode('#', $val) . '#';
        $reference->save();
    }
}