/**
 * Scrape Wikipedia for the last "Round" (chapter) number of each Hajime no Ippo volume.
 *
 * Walks the chapter-list index page, follows every "Main article:" link found on it,
 * and on each linked page looks for volume cells (lines containing `<td id="vol`).
 * For each volume, the lines up to the next one containing `</table>` are scanned and
 * the number taken from the last `<li>Round N:` line is kept.
 *
 * @return array Map of volume label => last "Round" number found for that volume
 *               (values are whatever Text::to_s() yields). Volumes for which no
 *               "Round" line was found are omitted.
 */
public function grab_volume_chapters()
{
    $index = new Page('http://en.wikipedia.org/wiki/List_of_Hajime_no_Ippo_chapters');
    $list = array();

    while (!$index->end_of_line()) {
        $index->go_line('Main article:');
        if ($index->end_of_line()) {
            // No further "Main article:" link on the index page.
            break;
        }

        // The link on the index page is site-relative, so prefix the host.
        $href = 'http://en.wikipedia.org'
            . $index->curr_line()->dup()->cut_between('href="', '"')->to_s();
        $sub = new Page($href);

        while (!$sub->end_of_line()) {
            try {
                $sub->go_line('<td id="vol');
                $vol = $sub->curr_line()->dup()->cut_between('">', '<')->to_s();

                // Reset for every volume: otherwise a volume without any "Round" line
                // would be recorded with the previous volume's chapter (or with an
                // undefined variable on the very first volume).
                $last_chapter = null;
                do {
                    if ($sub->curr_line()->contain('<li>Round ')) {
                        $last_chapter = $sub->curr_line()->dup()->cut_between('Round ', ':')->to_s();
                    }
                } while (!$sub->next_line()->contain('</table>'));

                if ($last_chapter !== null) {
                    $list[$vol] = $last_chapter;
                }
            } catch (Exception $e) {
                // Any exception while scanning ends the scan of this sub-page.
                // NOTE(review): presumably raised by Page/Text when reading past the
                // last line — confirm against the Page implementation.
                break;
            }
        }

        $index->next_line();
    }

    return $list;
}
/**
 * Crawl a Sankaku Complex listing and print an HTML download link for every post.
 *
 * Requests `$url` with `page=1, 2, …` appended, collects every link containing
 * `/post/show` on each listing page, opens each post and echoes an anchor pointing
 * at the href found on the `id=lowres` line (the `id=highres` line is used instead
 * when `$_GET['hires']` is set, or as a fallback when no low-res line is found).
 * Posts without such an href are reported as "This is flash".
 *
 * Crawling stops at the first listing page that has no post links, or once the page
 * number exceeds `$_GET['limit']` when that parameter is set.
 *
 * All output is written with echo; nothing is returned.
 *
 * @param string $url Listing URL. When it contains `tags=`, the url-decoded text
 *                    after it is used as the link text; otherwise a uniqid() is used.
 */
function sankakucomplex($url)
{
    // strpos() returns an offset (0 is falsy), so compare against false explicitly.
    $base = strpos($url, '/idol.') !== false
        ? 'https://idol.sankakucomplex.com'
        : 'https://chan.sankakucomplex.com';

    $tag = uniqid();
    $Turl = Text::create($url);
    if ($Turl->contain('tags=')) {
        $tag = $Turl->cut_after('tags=')->urldecode()->to_s();
    }

    $html_flags = ENT_QUOTES | ENT_SUBSTITUTE;
    // The tag is decoded caller input that ends up inside HTML: escape it once here.
    $tag_html = htmlspecialchars($tag, $html_flags, 'UTF-8');

    // Append the page parameter with the right separator even if $url has no query yet.
    $separator = strpos($url, '?') === false ? '?' : '&';

    // Request parameters do not change during the crawl; read them once.
    $limit = isset($_GET['limit']) ? $_GET['limit'] : null;
    $line_marker = isset($_GET['hires']) ? 'id=highres' : 'id=lowres';

    // Fetch a URL, retrying for as long as the body contains the 429 marker.
    // The 3-second pause runs after every request, successful or not.
    $fetch = function ($target) {
        do {
            $doc = new Page($target, array('become_firefox' => true));
            $body = new Text($doc->content());
            sleep(3); // 429 too many requests
        } while ($body->contain('429 Too many requests'));

        return array($doc, $body);
    };

    $page = 1;
    while (true) {
        if ($limit !== null && $page > $limit) {
            break;
        }

        $purl = $url . $separator . 'page=' . $page;
        echo htmlspecialchars($purl, $html_flags, 'UTF-8') . "<br>\n";

        list(, $listing) = $fetch($purl);

        // Keep only the hrefs that point at a post page.
        $links = array();
        foreach ($listing->extract_to_array('href="', '"') as $href) {
            $candidate = new Text($href);
            if ($candidate->contain('/post/show')) {
                $links[] = $href;
            }
        }
        if (!count($links)) {
            break;
        }

        foreach ($links as $link) {
            $kurl = $base . $link;
            // Values cut out of fetched markup may already contain entities, so
            // escape without double-encoding them.
            echo htmlspecialchars($kurl, $html_flags, 'UTF-8', false) . "<br>\n";
            flush();

            list($post) = $fetch($kurl);

            $post->go_line($line_marker);
            if ($post->end_of_line()) {
                // Requested variant not present: fall back to the high-res line.
                $post->reset_line();
                $post->go_line('id=highres');
            }
            $img = $post->curr_line()->cut_between('href="', '"')->to_s();

            if ($img) {
                $img_html = htmlspecialchars($img, $html_flags, 'UTF-8', false);
                echo "<a href='{$img_html}'>{$tag_html}</a><br />\n";
                flush();
            } else {
                echo "This is flash<br />\n";
            }
        }

        $page++;
    }
}