/**
 * Scrape every ".textbox" entry found on a listing page.
 *
 * Relative link and image addresses are prefixed with "http://" plus the
 * host of $url.
 *
 * @param string $url Address of the listing page.
 * @return array List of items with the keys title, url, date, pages, thumb
 *               and tags (tags formatted as "#a#b#"), in the reverse of the
 *               order in which they appear on the page.
 */
private function scrap_page($url)
{
    $base_url = 'http://' . parse_url($url, PHP_URL_HOST);

    $p = new Page($url);
    $h = new simple_html_dom();
    $h->load($p->content());

    $result = array();
    foreach ($h->find('.textbox') as $box) {
        // image/url
        $content = $box->find('.textbox-content', 0);
        $item_url = $base_url . $content->find('a', 0)->href;
        $thumb = $base_url . $content->find('img', 0)->src;

        // other data
        $label = $box->find('.webcss-label', 0);
        $title = $label->find('p', 0)->find('a', 0)->innertext;
        $title = html_entity_decode($title, ENT_COMPAT, 'UTF-8');

        $h2 = $label->find('h2', 0);
        $date = Text::create($h2->innertext)->cut_after('>:')->to_s();

        $h5 = $label->find('h5', 0);
        $tags = Text::create($h5->innertext)->strip_tags()->cut_after(':')->to_s();
        // array_filter() with 'trim' as callback only decides which entries
        // survive; it never trims them. Trim first, then drop the empty ones.
        $tags = array_filter(array_map('trim', explode(',', $tags)));

        $view = $label->find('.webcss_view', 0);
        $m = Text::create($view->innertext)->regex_match('/(\\d+)/');
        $pages = $m[1];

        $result[] = array(
            'title' => $title,
            'url' => $item_url,
            'date' => $date,
            'pages' => $pages,
            'thumb' => $thumb,
            'tags' => '#' . implode('#', $tags) . '#',
        );
    }

    return array_reverse($result);
}
/**
 * Assemble a Page object for the given key from its file and its database record.
 *
 * Relies on the globals $dbPages, $dbUsers, $Parsedown and $Site.
 *
 * @param string $key Page key.
 * @return Page|false The populated page, or false (after logging) when the
 *                    file-backed page is not valid or no database record exists.
 */
function buildPage($key)
{
    global $dbPages;
    global $dbUsers;
    global $Parsedown;
    global $Site;

    // File-backed part of the page.
    $Page = new Page($key);
    if (!$Page->isValid()) {
        Log::set(__METHOD__ . LOG_SEP . 'Error occurred when trying build the page from file with key: ' . $key);
        return false;
    }

    // Database-backed part of the page.
    $record = $dbPages->getPageDB($key);
    if (!$record) {
        Log::set(__METHOD__ . LOG_SEP . 'Error occurred when trying build the page from database with key: ' . $key);
        return false;
    }

    // Copy every database field onto the page.
    foreach ($record as $name => $val) {
        $Page->setField($name, $val);
    }

    // Keep the untouched content next to the rendered one.
    $rawContent = $Page->content();
    $Page->setField('contentRaw', $Page->content(), true);

    // <pre> blocks to entities, then Markdown, then relative image paths to absolute ones.
    $html = Text::imgRel2Abs(
        $Parsedown->text(Text::pre2htmlentities($rawContent)),
        HTML_PATH_UPLOADS
    );
    $Page->setField('content', $html, true);

    // Everything before the first page break is the teaser.
    $pieces = explode(PAGE_BREAK, $html);
    $Page->setField('breakContent', $pieces[0], true);
    $Page->setField('readMore', !empty($pieces[1]), true);

    // Raw date first, then the date formatted with the site's format.
    $Page->setField('dateRaw', $Page->date(), true);
    $Page->setField('date', $Page->dateRaw($Site->dateFormat()), true);

    // Attach the author.
    $Page->setField('user', $dbUsers->getUser($Page->username()));

    return $Page;
}
/**
 * Return the content exactly as the parent implementation produces it.
 *
 * A cache lookup in front of the parent call is present only as
 * commented-out code, so the parent is always consulted.
 *
 * @return mixed Whatever parent::content() returns.
 */
final public function content()
{
    // Disabled cache path, kept for reference:
    //   if ($this->isCached($this->id('content'))) {
    //       return $this->get($this->id('content'));
    //   }
    return parent::content();
}
/**
 * Announce the URL on standard output, then load it into a DOM parser.
 *
 * @param string $url Address to open.
 * @return simple_html_dom Parser loaded with the content of the page.
 */
private function open($url)
{
    echo 'Opening ' . $url . "...\n";

    $page = new Page($url);
    $dom = new simple_html_dom();
    $dom->load($page->content());

    return $dom;
}
/**
 * Assemble a Page object for the given key from its file and its database record.
 *
 * Relies on the globals $dbPages, $dbUsers, $Parsedown and $Site.
 *
 * @param string $key Page key.
 * @return Page|false The populated page, or false (after logging) when the
 *                    file-backed page is not valid or no database record exists.
 */
function build_page($key)
{
    global $dbPages;
    global $dbUsers;
    global $Parsedown;
    global $Site;

    // File-backed part of the page.
    $Page = new Page($key);
    if (!$Page->isValid()) {
        Log::set(__METHOD__ . LOG_SEP . 'Error occurred when trying build the page from file with key: ' . $key);
        return false;
    }

    // Database-backed part of the page.
    $record = $dbPages->getDb($key);
    if (!$record) {
        Log::set(__METHOD__ . LOG_SEP . 'Error occurred when trying build the page from database with key: ' . $key);
        return false;
    }

    // Copy every database field onto the page.
    foreach ($record as $name => $val) {
        $Page->setField($name, $val);
    }

    // Keep the untouched content next to the rendered one.
    $rawContent = $Page->content();
    $Page->setField('contentRaw', $Page->content(), true);

    // <pre> blocks to entities, then Markdown, then relative image paths to absolute ones.
    $html = Text::imgRel2Abs(
        $Parsedown->text(Text::pre2htmlentities($rawContent)),
        HTML_PATH_UPLOADS
    );
    $Page->setField('content', $html, true);

    // Raw date first, then the date formatted with the site's format.
    $Page->setField('dateRaw', $Page->date(), true);
    $Page->setField('date', $Page->dateRaw($Site->dateFormat()), true);

    // Author names are only added when the page's user is known.
    if ($dbUsers->userExists($Page->username())) {
        $author = $dbUsers->getDb($Page->username());
        $Page->setField('authorFirstName', $author['firstName'], false);
        $Page->setField('authorLastName', $author['lastName'], false);
    }

    return $Page;
}
/**
 * Announce the URL on standard output, then load it into a DOM parser.
 *
 * The page is requested with the cookie "nw=1".
 *
 * @param string $url Address to open.
 * @return simple_html_dom Parser loaded with the content of the page.
 */
private function create_dom($url)
{
    echo 'Opening ' . $url . "\n";

    $options = array(CURLOPT_COOKIE => 'nw=1');
    $page = new Page($url, $options);

    $dom = new simple_html_dom();
    $dom->load($page->content());

    return $dom;
}
/**
 * Walk the listing pages $this->page_from .. $this->page_to of a search URL
 * and download the "highres" image of every post linked from them.
 *
 * Stops early when a listing page contains no "/post/show" links. Every
 * request is followed by a 3 second pause and is repeated for as long as the
 * response contains "429 Too many requests".
 *
 * @param string $url Listing URL; "&page=N" is appended to it.
 * @param string $dir Target directory, passed on to download_if_not_exist().
 * @return void
 */
private function collect_images($url, $dir)
{
    // strpos() yields 0 for a match at the very start and false for no match,
    // so the result has to be compared against false explicitly.
    if (strpos($url, '/idol.') !== false) {
        $base = 'https://idol.sankakucomplex.com';
    } else {
        $base = 'https://chan.sankakucomplex.com';
    }

    $id = 1;
    for ($page = $this->page_from; $page <= $this->page_to; $page++) {
        $purl = $url . '&page=' . $page;
        echo "{$purl}\n";

        do {
            $P = new Page($purl, array('become_firefox' => true));
            $T = new Text($P->content());
            sleep(3); // 429 too many requests
        } while ($T->contain('429 Too many requests'));

        // Keep only the links that lead to a post.
        $posts = array();
        foreach ($T->extract_to_array('href="', '"') as $href) {
            $H = new Text($href);
            if ($H->contain('/post/show')) {
                $posts[] = $href;
            }
        }
        if (!count($posts)) {
            break;
        }

        foreach ($posts as $href) {
            $kurl = $base . $href;
            echo "{$kurl}\n";
            flush();

            do {
                $P = new Page($kurl, array('become_firefox' => true));
                $T = new Text($P->content());
                sleep(3); // 429 too many requests
            } while ($T->contain('429 Too many requests'));

            $P->go_line('id=highres');
            $img = $P->curr_line()->cut_between('href="', '"');
            // A special case for '.webm' links existed here only as commented-out code.
            if ($img->to_s()) {
                $this->download_if_not_exist($img, $dir, $id);
                $id++;
            } else {
                echo "No id=highres\n";
            }
        }
    }
}
/**
 * Assemble a Page object for the given key from its file and its database record.
 *
 * Relies on the globals $dbPages, $dbUsers and $Parsedown.
 *
 * @param string $key Page key.
 * @return Page|false The populated page, or false when the file-backed page
 *                    is not valid or no database record exists.
 */
function build_page($key)
{
    global $dbPages;
    global $dbUsers;
    global $Parsedown;

    $Page = new Page($key);
    if (!$Page->isValid()) {
        return false;
    }

    $record = $dbPages->getDb($key);
    if (!$record) {
        return false;
    }

    // Every setField() in this loop passes false as the third argument.
    foreach ($record as $name => $val) {
        if ($name != 'unixTimeCreated') {
            $Page->setField($name, $val, false);
            continue;
        }

        // The creation time additionally yields a formatted date and a "time ago" text.
        $Page->setField('unixTimeCreated', $val, false);
        $Page->setField('date', Date::format($val, '%d %B'), false);
        $Page->setField('timeago', Date::timeago($val), false);
    }

    // Keep the untouched content next to the rendered one.
    $rawContent = $Page->content();
    $Page->setField('contentRaw', $Page->content(), true);

    // Markdown first, then relative image paths to absolute ones.
    $html = Text::imgRel2Abs($Parsedown->text($rawContent), HTML_PATH_UPLOADS);
    $Page->setField('content', $html, true);

    // Author names are only added when the page's user is known.
    if ($dbUsers->userExists($Page->username())) {
        $author = $dbUsers->getDb($Page->username());
        $Page->setField('authorFirstName', $author['firstName'], false);
        $Page->setField('authorLastName', $author['lastName'], false);
    }

    return $Page;
}
/**
 * Collect the markup of every image inside the ".pinbin-copy" element of a page.
 *
 * $columns, $s and $n are accepted but not used by this implementation.
 *
 * @param array  $columns Unused here.
 * @param int    $s       Unused here.
 * @param int    $n       Unused here.
 * @param string $url     Address of the page to read.
 * @return array List of array('image' => <img outer HTML>); empty when the
 *               page has no ".pinbin-copy" element.
 */
public function extract($columns, $s, $n, $url)
{
    $result = array();

    $p = new Page($url);
    $h = new simple_html_dom();
    $h->load($p->content());

    $pinbin = $h->find('.pinbin-copy', 0);
    if (!$pinbin) {
        // Nothing matched: calling find() on the missing element would be a fatal error.
        return $result;
    }

    foreach ($pinbin->find('img') as $img) {
        $result[] = array('image' => $img->outertext());
    }

    return $result;
}
/**
 * Read the posts of listing pages $s .. $n and return one row per post.
 *
 * Page 1 is the base URL itself; later pages live under "page/<number>/".
 * A URL starting with "http://www.rlsbb.com" is rewritten to "http://rlsbb.com".
 * $columns is accepted but not used: every row carries all keys.
 *
 * @param array  $columns Unused here.
 * @param int    $s       First page number.
 * @param int    $n       Last page number (inclusive).
 * @param string $url     Base listing URL.
 * @return array Rows with the keys link, title, date, categories, content,
 *               description, image, image2 and image3.
 */
public function extract($columns, $s, $n, $url)
{
    if (strpos($url, 'http://www.rlsbb.com') === 0) {
        $url = str_replace('http://www.rlsbb.com', 'http://rlsbb.com', $url);
    }
    $root = rtrim($url, '/') . '/';

    $rows = array();
    for ($page_no = $s; $page_no <= $n; $page_no++) {
        $page_url = $page_no > 1 ? $root . 'page/' . $page_no . '/' : $root;

        $page = new Page($page_url);
        $dom = new simple_html_dom();
        $dom->load($page->content());

        foreach ($dom->find('div.post') as $post) {
            $anchor = $post->find('.postTitle', 0)->find('a', 0);

            $sub = $post->find('.postSubTitle', 0);
            $posted = Text::create($sub->innertext)->regex_match('/Posted on (.*) in </');

            $names = array();
            foreach ($sub->find('a[rel=category tag]') as $cat) {
                $names[] = $cat->innertext;
            }

            // Some posts use ".entry-content" instead of ".postContent".
            $body = $post->find('.postContent', 0);
            if (!$body) {
                $body = $post->find('.entry-content', 0);
            }
            $text = strip_tags($body->innertext, '<br>');

            $first = $body->find('img', 0);
            $second = $body->find('img', 1);
            $third = $body->find('img', 2);

            $rows[] = array(
                'link' => "<a href='{$anchor->href}'>link</a>",
                'title' => $anchor->innertext,
                'date' => $posted[1],
                'categories' => implode(', ', $names),
                'content' => $text,
                'description' => $text,
                'image' => $first ? $first->outertext() : '',
                'image2' => $second ? $second->outertext() : '',
                'image3' => $third ? $third->outertext() : '',
            );
        }
    }

    return $rows;
}
/**
 * Assemble a Page object for the given key from its file and its database record.
 *
 * Relies on the globals $dbPages, $dbUsers and $Parsedown.
 *
 * @param string $key Page key.
 * @return Page|false The populated page, or false when the file-backed page
 *                    is not valid or no database record exists.
 */
function build_page($key)
{
    global $dbPages;
    global $dbUsers;
    global $Parsedown;

    // File-backed part of the page.
    $Page = new Page($key);
    if (!$Page->isValid()) {
        return false;
    }

    // Database-backed part of the page.
    $record = $dbPages->getDb($key);
    if (!$record) {
        return false;
    }

    // Copy every database field onto the page.
    foreach ($record as $name => $val) {
        $Page->setField($name, $val);
    }

    // Keep the untouched content next to the rendered one.
    $rawContent = $Page->content();
    $Page->setField('contentRaw', $Page->content(), true);

    // <pre> blocks to entities, then Markdown, then relative image paths to absolute ones.
    $html = Text::imgRel2Abs(
        $Parsedown->text(Text::pre2htmlentities($rawContent)),
        HTML_PATH_UPLOADS
    );
    $Page->setField('content', $html, true);

    // Author names are only added when the page's user is known.
    if ($dbUsers->userExists($Page->username())) {
        $author = $dbUsers->getDb($Page->username());
        $Page->setField('authorFirstName', $author['firstName'], false);
        $Page->setField('authorLastName', $author['lastName'], false);
    }

    return $Page;
}
/**
 * Read the posts of listing pages $s .. $n and return one row per post,
 * restricted to the requested columns.
 *
 * Page 1 is the base URL itself; later pages live under "page/<number>/".
 * Recognised column names: link, title, date, category, content, image.
 * "category" fills the key "categories"; "image" fills "image" and "image2".
 *
 * @param array  $columns Names of the columns to extract.
 * @param int    $s       First page number.
 * @param int    $n       Last page number (inclusive).
 * @param string $url     Base listing URL.
 * @return array One associative array per post.
 */
public function extract($columns, $s, $n, $url)
{
    $rows = array();
    $root = rtrim($url, '/') . '/';

    for ($page_no = $s; $page_no <= $n; $page_no++) {
        $page_url = $page_no > 1 ? $root . 'page/' . $page_no . '/' : $root;

        $page = new Page($page_url);
        $dom = new simple_html_dom();
        $dom->load($page->content());

        foreach ($dom->find('div.post') as $post) {
            $row = array();

            if (in_array('link', $columns)) {
                $anchor = $post->find('h2', 0)->find('a', 0);
                $row['link'] = "<a href='{$anchor->href}'>link</a>";
            }

            if (in_array('title', $columns)) {
                $anchor = $post->find('h2', 0)->find('a', 0);
                $row['title'] = $anchor->innertext;
            }

            if (in_array('date', $columns)) {
                $row['date'] = $post->find('span.date', 0)->innertext;
            }

            if (in_array('category', $columns)) {
                $names = array();
                foreach ($post->find('div.meta', 0)->find('a[rel=category tag]') as $cat) {
                    $names[] = $cat->innertext;
                }
                $row['categories'] = implode(', ', $names);
            }

            if (in_array('content', $columns)) {
                $story = $post->find('.storycontent', 0);
                $row['content'] = strip_tags($story->innertext, '<br>');
            }

            if (in_array('image', $columns)) {
                $story = $post->find('.storycontent', 0);
                $first = $story->find('img', 0);
                $row['image'] = $first ? $first->outertext() : '';
                $second = $story->find('img', 1);
                $row['image2'] = $second ? $second->outertext() : '';
            }

            $rows[] = $row;
        }
    }

    return $rows;
}
/**
 * List the page images of a chapter.
 *
 * The page numbers come from the "pagejump" select box; the image directory
 * is taken from the image shown inside "#page".
 *
 * @param string $chapter_url Address of the chapter.
 * @param string $prefix      First part of every target file name.
 * @param string $infix       Chapter identifier; padded to 3 before use.
 * @return array Map "<prefix>-<padded infix>-<page>.jpg" => image URL.
 */
public function get_images($chapter_url, $prefix, $infix)
{
    $ifx = Text::create($infix)->pad(3)->to_s();

    $p = new Page($chapter_url);
    $h = new simple_html_dom();
    $h->load($p->content());

    $select = $h->find('select[name="pagejump"]', 0);
    $img = $h->find('#page', 0)->find('img', 0);
    $srcdir = dirname($img->src);

    $pages = array();
    foreach ($select->find('option') as $opt) {
        // Use the padded infix: it was computed above but the raw $infix was
        // used in the key, unlike the other get_images() implementations.
        $pages["{$prefix}-{$ifx}-{$opt->value}.jpg"] = $srcdir . '/' . $opt->value . '.jpg';
    }

    return $pages;
}
/**
 * List the chapters linked from a series page.
 *
 * @param string $base Address of the series page.
 * @return array One array('url', 'desc', 'infix') per ".chapterLink" element;
 *               the URL is the link's href prefixed with the site domain.
 */
public function get_info($base)
{
    $domain = 'http://www.mangaeden.com';

    $page = new Page($base);
    $dom = new simple_html_dom();
    $dom->load($page->content());

    $chapters = array();
    foreach ($dom->find('.chapterLink') as $link) {
        $chapter_url = $domain . $link->href;
        $chapters[] = array(
            'url' => $chapter_url,
            'desc' => $link->find('b', 0)->innertext(),
            'infix' => $this->get_infix($chapter_url),
        );
    }

    return $chapters;
}
/**
 * List the chapters linked from the "table.listing" element of a series page.
 *
 * The page is requested with the option 'become_firefox' => true.
 *
 * @param string $base Address of the series page.
 * @return array One array('url', 'desc', 'infix') per link; the URL is the
 *               link's href prefixed with the site domain, the description
 *               is the link's title attribute.
 */
public function get_info($base)
{
    $DOMAIN = 'http://kissmanga.com';

    // crawl chapters
    $page = new Page($base, array('become_firefox' => true));
    $dom = new simple_html_dom();
    $dom->load($page->content());

    $chapters = array();
    foreach ($dom->find('table.listing', 0)->find('a') as $link) {
        $chapter_url = $DOMAIN . $link->href;
        $title = $link->title;
        $chapters[] = array(
            'url' => $chapter_url,
            'desc' => $title,
            'infix' => $this->get_infix($chapter_url),
        );
    }

    return $chapters;
}
/**
 * Crawl from $this->home page by page, collecting images, then download them.
 *
 * The crawl continues while $this->active stays true; this method only sets
 * it to true, so collect_images() / next_page() are presumably what clear it
 * (on an existing file or the last page — confirm there).
 *
 * @return void
 */
public function download()
{
    $page_url = $this->home;
    $this->active = true;
    $this->images = array();

    // The flag was just set, so the first pass always runs.
    do {
        echo "Opening {$page_url}...\n";

        $page = new Page($page_url);
        $dom = new simple_html_dom();
        $dom->load($page->content());

        $this->collect_images($dom);
        $page_url = $this->next_page($dom);
    } while ($this->active);

    $this->download_images();
}
/**
 * List the page images of a chapter by visiting each of its pages.
 *
 * The number of pages is the value of the last child of "#id_page". Page 1
 * is the already loaded chapter page; pages 2..n are addressed by inserting
 * "-page-<n>" in front of ".html".
 *
 * @param string $chapter_url Address of the chapter (any of its pages).
 * @param string $prefix      Passed on to crawl_page().
 * @param string $infix       Chapter identifier; padded to 3 before use.
 * @return array Map image name => image URL, in reverse page order.
 */
public function get_images($chapter_url, $prefix, $infix)
{
    $ifx = Text::create($infix)->pad(3)->to_s();

    $p = new Page($chapter_url);
    $h = new simple_html_dom();
    $h->load($p->content());

    // The URL becomes a sprintf() template, so a literal "%" in it (as in
    // URL-encoded characters such as "%20") must be doubled first.
    $pattern_url = Text::create(str_replace('%', '%%', $chapter_url))
        ->regex_replace('#-page-\\d.*.html$#', '.html')
        ->regex_replace('#\\.html$#', '-page-%s.html')
        ->to_s();

    $pages = $h->find('#id_page', 0);
    $n = $pages->last_child()->value;

    $result = array();

    list($img_name, $img_url) = $this->crawl_page($p, $prefix, $ifx, 1);
    $result[$img_name] = $img_url;

    for ($i = 2; $i <= $n; $i++) {
        $p = new Page(sprintf($pattern_url, $i));
        list($img_name, $img_url) = $this->crawl_page($p, $prefix, $ifx, $i);
        $result[$img_name] = $img_url;
    }

    return array_reverse($result);
}
/**
 * Debug helper: parse every "div.post" of a page and print_r() the result.
 *
 * @param string $page_url Address of the page to parse.
 * @return void
 */
public function test_parse($page_url)
{
    $page = new Page($page_url);
    $dom = new simple_html_dom();
    $dom->load($page->content());

    foreach ($dom->find('div.post') as $post) {
        $anchor = $post->find('h3.postTitle', 0)->find('a', 0);
        $sub = $post->find('div.postSubTitle', 0);

        $posted = Text::create($sub->innertext)->regex_match('/Posted on (.*) in </');

        $names = array();
        foreach ($sub->find('a[rel=category tag]') as $cat) {
            $names[] = $cat->innertext;
        }

        $body = $post->find('div.postContent', 0);

        print_r(array(
            'url' => $anchor->href,
            'title' => $anchor->innertext,
            'date' => $posted[1],
            'category' => $names,
            'content' => htmlspecialchars($body->innertext),
        ));
    }
}
/**
 * List the page images of a chapter, derived from its first image.
 *
 * The address of the first "img.picture" is turned into a pattern by
 * replacing "01" in its file name; every option of the "page" select box is
 * then padded to 2 and substituted into that pattern.
 *
 * @param string $chapter_url Address of the chapter.
 * @param string $prefix      First part of every target file name.
 * @param string $infix       Chapter identifier; padded to 3 before use.
 * @return array Map "<prefix>-<padded infix>-<file name>" => image URL.
 */
public function get_images($chapter_url, $prefix, $infix)
{
    $ifx = Text::create($infix)->pad(3)->to_s();

    $page = new Page($chapter_url);
    $dom = new simple_html_dom();
    $dom->load($page->content());

    $DOMAIN = 'http://hugemanga.com/';

    $first_src = $dom->find('img.picture', 0)->src;
    $file_pattern = str_replace('01', '###', basename($first_src));
    $pattern = $DOMAIN . dirname($first_src) . '/' . $file_pattern;

    $images = array();
    foreach ($dom->find('select[name="page"]', 0)->find('option') as $option) {
        $number = Text::create($option->value)->pad(2)->to_s();
        $image_url = str_replace('###', $number, $pattern);
        $images["{$prefix}-{$ifx}-" . basename($image_url)] = $image_url;
    }

    return $images;
}
/**
 * Collect the chapters linked from the ".box" elements of a page.
 *
 * Boxes for which determine_infix() returns a falsy value are skipped.
 *
 * @param string $start Address of the page to read.
 * @return array Map infix => chapter URL (href prefixed with the site domain).
 */
private function grab_chapters($start)
{
    $domain = 'http://www.horizonscans.com';

    $page = new Page($start);
    $dom = new simple_html_dom();
    $dom->load($page->content());

    $chapters = array();
    foreach ($dom->find('.box') as $box) {
        $link = $box->find('a', 0);
        $heading = $box->find('h3', 0)->plaintext;
        $chapter_url = $domain . $link->href;

        $infix = $this->determine_infix($heading);
        if (!$infix) {
            continue;
        }
        $chapters[$infix] = $chapter_url;
    }

    return $chapters;
}
/**
 * Open every URL of a list and extract its article.
 *
 * @param array $to_extract Addresses of the article pages.
 * @return array One row per URL with the keys url, link, title, image,
 *               content and description.
 */
private function grab_from_list($to_extract)
{
    $rows = array();

    foreach ($to_extract as $purl) {
        $page = new Page($purl);
        $dom = new simple_html_dom();
        $dom->load($page->content());

        $article = $dom->find('.main-area', 0)->find('article', 0);
        $heading = $article->find('h1', 0)->innertext;
        $detail = $article->find('.artdet', 0);

        // "../" inside the image markup is replaced by the site's address.
        $image = str_replace(
            '../',
            'http://www.videogamesindonesia.com/',
            $detail->find('img', 0)->outertext()
        );

        $rows[] = array(
            'url' => $purl,
            'link' => "<a href='{$purl}'>link</a>",
            'title' => $heading,
            'image' => $image,
            'content' => $detail->innertext,
            'description' => strip_tags($detail->innertext, '<br><p>'),
        );
    }

    return $rows;
}
/**
 * List the page images of a chapter from the thumbnails on its index page.
 *
 * The index page is the chapter URL with "&page=0" appended. Every
 * "img.PageLink" thumbnail inside "#main" is turned into an image address by
 * removing "/thumbs/" from its path and a trailing ".jpg".
 *
 * @param string $chapter_url Address of the chapter.
 * @param string $prefix      First part of every target file name.
 * @param string $infix       Chapter identifier; padded to 3 before use.
 * @return array Map "<prefix>-<padded infix>-<alt text>" => image URL.
 */
public function get_images($chapter_url, $prefix, $infix)
{
    $basename = 'http://haven-reader.net';
    $ifx = Text::create($infix)->pad(3)->to_s();

    // open index page
    $index = new Page($chapter_url . '&page=0');
    $dom = new simple_html_dom();
    $dom->load($index->content());

    $images = array();
    foreach ($dom->find('#main', 0)->find('img.PageLink') as $thumb) {
        // convert thumb to img
        $path = Text::create($thumb->src)
            ->replace('/thumbs/', '/')
            ->regex_replace('/\\.jpg$/', '')
            ->to_s();

        $key = "{$prefix}-{$ifx}-{$thumb->alt}";
        $images[$key] = $basename . $path;
    }

    return $images;
}
/**
 * List the page images of a chapter by visiting each of its pages.
 *
 * The pages are the options of the "#e1" select box; each one is opened by
 * replacing the trailing "/p1" of the chapter URL with "/p<value>", and its
 * image is the first <img> inside ".coverIssue".
 *
 * @param string $chapter_url Address of the chapter's first page.
 * @param string $prefix      First part of every target file name.
 * @param string $infix       Chapter identifier; padded to 3 before use.
 * @return array Map "<prefix>-<padded infix>-<padded page>.<ext>" => image URL.
 */
public function get_images($chapter_url, $prefix, $infix)
{
    $ifx = Text::create($infix)->pad(3)->to_s();

    $p = new Page($chapter_url);
    $h = new simple_html_dom();
    $h->load($p->content());

    // The URL becomes a sprintf() template, so a literal "%" in it (as in
    // URL-encoded characters such as "%20") must be doubled first.
    $pattern_url = Text::create(str_replace('%', '%%', $chapter_url))
        ->regex_replace('#/p1$#', '/p%s')
        ->to_s();

    $select = $h->find('#e1', 0);

    $result = array();
    foreach ($select->find('option') as $option) {
        $p2 = new Page(sprintf($pattern_url, $option->value));
        $h2 = new simple_html_dom();
        $h2->load($p2->content());

        $img = $h2->find('.coverIssue', 0)->find('img', 0)->src;
        $ext = pathinfo($img, PATHINFO_EXTENSION);
        $iname = Text::create($option->value)->pad(3)->to_s() . '.' . $ext;

        $result["{$prefix}-{$ifx}-{$iname}"] = $img;
    }

    return $result;
}
/**
 * List the page images of a chapter by crawling each of its pages.
 *
 * The page addresses are the option values of "select.wid60". The first one
 * is not fetched again: the already loaded chapter page stands in for it.
 * Results are combined with "+", so earlier keys win over later ones.
 *
 * @param string $chapter_url Address of the chapter.
 * @param string $prefix      Passed on to crawl_page().
 * @param string $infix       Chapter identifier; padded to 3 before use.
 * @return array Union of the arrays returned by crawl_page() for every page.
 */
public function get_images($chapter_url, $prefix, $infix)
{
    $ifx = Text::create($infix)->pad(3)->to_s();

    $first_page = new Page($chapter_url);
    $dom = new simple_html_dom();
    $dom->load($first_page->content());

    $page_urls = array();
    foreach ($dom->find('select.wid60', 0)->find('option') as $option) {
        $page_urls[] = $option->value;
    }

    // grab current image
    $images = $this->crawl_page($first_page, $prefix, $ifx);

    // grab the rest of pages
    foreach (array_slice($page_urls, 1) as $page_url) {
        $next_page = new Page($page_url);
        $images = $images + $this->crawl_page($next_page, $prefix, $ifx);
    }

    return $images;
}
/**
 * Ask for an album URL and a target directory on the console, then download
 * the album page by page.
 *
 * Each page is fetched, decoded as JSON and handed to download_images(); the
 * crawl follows "paging->next" until a response has none.
 *
 * @return void
 */
public function download()
{
    echo "Album URL: ";
    $album_url = trim(fgets(STDIN));

    echo "Save Dir: ";
    // Exactly one trailing slash on the directory.
    $this->dir = rtrim(trim(fgets(STDIN)), '/') . '/';

    $current_url = $this->album_to_gallery($album_url);

    while (true) {
        echo "Opening {$current_url}...\n";

        $page = new Page($current_url);
        $json = json_decode($page->content());
        $this->download_images($json);

        if (!isset($json->paging->next)) {
            break;
        }
        $current_url = $json->paging->next;
    }
}
/**
 * Recursively walk a link listing and build a tree of its entries.
 *
 * Links are the texts found between $start_sign and $end_sign; the first
 * $n_skip of them are ignored. A link ending in "/" is a directory and is
 * descended into when traverse_dir() accepts it; any other link is a file
 * and is kept when traverse_file() accepts it.
 *
 * @param string $url        Address of the listing.
 * @param string $start_sign Text that precedes every link.
 * @param string $end_sign   Text that terminates every link.
 * @param int    $n_skip     Number of leading links to ignore.
 * @return array Map link => nested array (directory) or link => link (file).
 */
public function traverse($url, $start_sign = '<a href="', $end_sign = '"', $n_skip = 5)
{
    if ($this->traverse_opts['debug']) {
        echo "In {$url} <br />\n";
    }

    $p = new Page($url);
    $r = array();
    $l = Text::factory($p->content())->extract_to_array($start_sign, $end_sign);
    $n = count($l);

    for ($i = $n_skip; $i < $n; $i++) {
        $link = $l[$i];

        // Check the last character directly. The previous test,
        // strrpos($link, '/') == strlen($link) - 1, evaluated "false == 0"
        // as true for a one-character link without a slash (such as "#"),
        // which was then treated as a directory and recursed into.
        if (substr((string) $link, -1) === '/') {
            if ($this->traverse_dir($link)) {
                $r[$link] = $this->traverse($url . $link, $start_sign, $end_sign, $n_skip);
            }
        } else {
            if ($this->traverse_file($link)) {
                $r[$link] = $link;
            }
        }
    }

    return $r;
}
/**
 * Read the articles of listing pages $s .. $n and collect their images.
 *
 * Listing pages are addressed as "<url>/page:<number>". Every article is
 * opened; when it has a ".pagination" block, each linked page except the
 * link with class "next" is opened too and its images are appended.
 * $columns is accepted but not used.
 *
 * @param array  $columns Unused here.
 * @param int    $s       First page number.
 * @param int    $n       Last page number (inclusive).
 * @param string $url     Base listing URL.
 * @return array Rows with the keys title, url, link and image (the images
 *               joined with "<br>").
 */
public function extract($columns, $s, $n, $url)
{
    $domain = 'http://www.dorkly.com';
    $rows = array();

    for ($page_no = $s; $page_no <= $n; $page_no++) {
        $listing = new Page(rtrim($url, '/') . '/page:' . $page_no);
        $listing_dom = new simple_html_dom();
        $listing_dom->load($listing->content());

        foreach ($listing_dom->find('.browse_article') as $post) {
            $anchor = $post->find('h3', 0)->find('a', 0);

            $row = array();
            $row['title'] = $anchor->innertext;
            $row['url'] = $domain . $anchor->href;
            $row['link'] = $domain . $anchor->outertext();

            // First page of the article.
            $article = new Page($row['url']);
            $article_dom = new simple_html_dom();
            $article_dom->load($article->content());
            $images = $this->extract_images($article_dom);

            // Remaining pages of the article, if it is paginated.
            $body = $article_dom->find('.the_comic', 0)->find('.article-content', 0);
            $pagination = $body->find('.pagination', 0);
            if ($pagination) {
                foreach ($pagination->find('a') as $a) {
                    if ($a->{'class'} == 'next') {
                        continue;
                    }
                    $sub = new Page($domain . $a->href);
                    $sub_dom = new simple_html_dom();
                    $sub_dom->load($sub->content());
                    $images = array_merge($images, $this->extract_images($sub_dom));
                }
            }

            $row['image'] = implode('<br>', $images);
            $rows[] = $row;
        }
    }

    return $rows;
}
/**
 * Print one HTML link per post found on listing pages $from .. $to.
 *
 * Every "/post/show" link of a listing page is opened; the link target is
 * taken from the line containing id="highres", the link text from the line
 * containing id="post_old_tags".
 *
 * @param string $base_url Listing URL; "&page=N" is appended to it.
 * @param int    $from     First page number.
 * @param int    $to       Last page number (inclusive).
 * @return void
 */
function idol_sankaku2($base_url, $from, $to)
{
    $base = 'http://idol.sankakucomplex.com';

    for ($page_no = $from; $page_no <= $to; $page_no++) {
        $listing = new Page($base_url . '&page=' . $page_no);
        $listing_text = new Text($listing->content());

        foreach ($listing_text->extract_to_array('href="', '"') as $href) {
            $candidate = new Text($href);
            if (!$candidate->contain('/post/show')) {
                continue;
            }

            $post = new Page($base . $href);

            $post->go_line('id="highres"');
            $img = $post->curr_line()->cut_between('href="', '"')->to_s();

            $post->reset_line();
            $post->go_line('id="post_old_tags"');
            // Tag text is cut to at most 150 characters (the previous comment said 100).
            $tag = $post->curr_line()->cut_between('value="', '"')->substr(0, 150)->to_s();

            echo "<a href='{$img}'>{$tag}</a><br />\n";
        }
    }
}
// otherwise will go looking for /home/root/.subversion or some other user
// (this chunk starts mid-script; HOME is pointed at CONTENTDIR just below, before svn runs)
$where = CONTENTDIR . 'static';
putenv('HOME=' . CONTENTDIR);
// Shell out to update the checkout in CONTENTDIR/static; the command's output is not captured.
`cd {$where} && /usr/bin/svn update`;
// make troubleshooting page
// NOTE(review): the comment above does not match the code — the pages written below are
// Overview, Foundation and People, and the message printed at the end says "About page generation".
$source = CONTENTDIR . "static/";
#$path = BASEDIR;
// Each page: wrap a static HTML file from $source in a Page and write the result of out() to <name>/index.html.
$page = new Page("Overview", "Overview");
$page->content(file_get_contents($source . "overview.html"));
writeFile('overview/index.html', $page->out());
#copydirr($source.'/images', $path.'/images');
$page = new Page("Foundation", "Foundation");
$page->content(file_get_contents($source . "foundation.html"));
writeFile('foundation/index.html', $page->out());
$page = new Page("People", "People");
$page->content(file_get_contents($source . "people.html"));
writeFile('people/index.html', $page->out());
// Elapsed time since $benchmark_start (set before this chunk), rounded to 4 decimals.
$benchmark_end = microtime_float();
$execution_time = round($benchmark_end - $benchmark_start, 4);
?> <h2>About page generation Successful</h2> <p>Generated files in <?php echo $execution_time; ?> seconds.</p> <!--<p>Page put here: <?php echo $source . "faq.html"; ?> </p>--> <!--<p>Page put here: <?php
// (fragment) Tail of a conditional opened before this chunk: prints the header title,
// i.e. the site name linked to the site root.
    echo "\t" . '<p id="header_title"><a href="' . $context['url_to_root'] . '" title="' . encode_field(i18n::s('Front page')) . '" accesskey="1">' . $context['site_name'] . '</a></p>' . "\n";
}
// site slogan (printed only when one is set)
if ($context['site_slogan']) {
    echo "\t" . '<p id="header_slogan">' . $context['site_slogan'] . '</p>' . "\n";
}
// end of the header panel
echo '</div>' . "\n";
// the main panel
echo '<div id="main_panel">' . "\n";
// display bread crumbs if not at the front page; if not defined, only the 'Home' link will be displayed
if ($context['skin_variant'] != 'home') {
    Page::bread_crumbs(0);
}
// display main content
Page::content();
// end of the main panel
echo '</div>' . "\n";
// the side panel
echo '<div id="side_panel">' . "\n";
// display side content
Page::side();
// end of the side panel
echo '</div>' . "\n";
// the footer panel comes after everything else
echo '<div id="footer_panel">';
// display standard footer
Page::footer();
// end of the footer panel
echo '</div>' . "\n";
// end of the wrapper