/**
 * Scrapes story metadata from an HP Fanfic Archive story page.
 *
 * The story ID is taken from the `sid` query parameter of $url. The page is
 * then fetched through a normalised `/stories/viewstory.php?sid=<id>` URL on
 * the same scheme and host, and the title, author, description and chapter
 * links are extracted from it.
 *
 * @param string $url Story URL carrying a numeric `sid` query parameter.
 * @return Story Story with id, url, title, author, description, chapters
 *               (number of chapter links) and metadata (chapter link texts).
 * @throws FicSaveException When the story ID is missing or not numeric, or
 *                          when a required page element cannot be found.
 */
function getHPFanficArchiveInfo($url)
{
    $urlParts = parse_url($url);

    // parse_url() returns false for seriously malformed URLs and omits the
    // 'query' key when the URL has no query string; treat both as "no sid".
    $queryString = (is_array($urlParts) && isset($urlParts['query'])) ? $urlParts['query'] : '';
    parse_str($queryString, $query);

    if (!isset($query['sid'])) {
        throw new FicSaveException("URL is missing story ID.");
    }

    $storyId = $query['sid'];
    if (!is_numeric($storyId)) {
        throw new FicSaveException("URL has an invalid story ID: {$storyId}.");
    }

    // Rebuild a canonical URL so that extra query parameters are dropped.
    $url = "{$urlParts['scheme']}://{$urlParts['host']}/stories/viewstory.php?sid={$storyId}";
    $response = cURL($url);
    $parser = new HTML5();
    $html = $parser->loadHTML($response);

    $story = new Story();
    $story->id = $storyId;
    $story->url = $url;

    $title = qp($html, '#pagetitle')->find('a[href^="viewstory"]')->first()->text();
    if (empty($title)) {
        throw new FicSaveException("Could not retrieve title for story at {$url}.");
    }
    $story->title = $title;

    $author = qp($html, '#pagetitle')->find('a[href^="viewuser"]')->first()->text();
    if (empty($author)) {
        throw new FicSaveException("Could not retrieve author for story at {$url}.");
    }
    $story->author = $author;

    // The second ".block" inside #mainpage is read as the description block.
    $description = qp($html, '#mainpage')->find('.block')->get(1);
    if ($description == NULL) {
        throw new FicSaveException("Could not retrieve description for story at {$url}.");
    }
    $descriptionHtml = trim(qp($description)->find('.content')->first()->innerHTML());
    // Unwrap anchors (keep their inner text) before stripping attributes.
    $story->description = stripAttributes(preg_replace('/<a(.*?)>(.*?)<\\/a>/', '\\2', $descriptionHtml));

    // The fourth ".block" inside #mainpage is read as the chapter list.
    $chaptersBlock = qp($html, '#mainpage')->find('.block')->get(3);
    if ($chaptersBlock == NULL) {
        throw new FicSaveException("Could not get number of chapters for story at {$url}.");
    }

    $chapterLinks = qp($chaptersBlock)->find('a[href^="viewstory"]');
    $numChapters = $chapterLinks->count();
    if ($numChapters <= 0) {
        throw new FicSaveException("Could not get number of chapters for story at {$url}.");
    }

    $story->chapters = $numChapters;
    $story->metadata = array();
    foreach ($chapterLinks as $chapterLink) {
        $story->metadata[] = $chapterLink->text();
    }

    return $story;
}
/**
 * Scrapes story metadata from a FanFiction.net story page.
 *
 * The story ID is the third path segment of $url (the segment after `/s/`).
 * The page at $url is fetched as given; the stored story URL is normalised to
 * `<scheme>://<host>/s/<id>`.
 *
 * @param string $url Story URL whose path looks like `/s/<numeric id>/...`.
 * @return Story Story with id, url, title, author, description, chapters and,
 *               when a cover image is present, coverImageUrl.
 * @throws FicSaveException When the story ID is missing or not numeric, or
 *                          when title, author or description cannot be found.
 */
function getFanfictionNetInfo($url)
{
    $urlParts = parse_url($url);

    // parse_url() returns false for malformed URLs and omits 'path' when the
    // URL has none; both cases mean there is no story ID to read.
    $path = (is_array($urlParts) && isset($urlParts['path'])) ? $urlParts['path'] : '';
    $pathParts = explode('/', $path);

    if (!isset($pathParts[2])) {
        throw new FicSaveException("URL is missing story ID.");
    }

    $storyId = $pathParts[2];
    if (!is_numeric($storyId)) {
        throw new FicSaveException("URL has an invalid story ID: {$storyId}.");
    }

    $response = cURL($url);
    $parser = new HTML5();
    $html = $parser->loadHTML($response);

    $story = new Story();
    $story->id = $storyId;
    $story->url = "{$urlParts['scheme']}://{$urlParts['host']}/s/{$storyId}";

    $title = qp($html, '#profile_top')->find('b')->first()->text();
    if (empty($title)) {
        throw new FicSaveException("Could not retrieve title for story at {$url}.");
    }
    $story->title = $title;

    $author = qp($html, '#profile_top')->find('a')->first()->text();
    if (empty($author)) {
        throw new FicSaveException("Could not retrieve author for story at {$url}.");
    }
    $story->author = $author;

    // The third <div> inside #profile_top is read as the description; the
    // markup of its next sibling is appended to it.
    $description = qp($html, '#profile_top')->find('div')->get(2);
    if ($description == NULL) {
        throw new FicSaveException("Could not retrieve description for story at {$url}.");
    }
    $descriptionHtml = trim(qp($description)->html() . qp($description)->next()->html());
    // Unwrap anchors (keep their inner text) before stripping attributes.
    $story->description = stripAttributes(preg_replace('/<a(.*?)>(.*?)<\\/a>/', '\\2', $descriptionHtml));

    // value is always doubled for some reason
    $numChapters = qp($html, '#chap_select')->find('option')->count() / 2;
    // No chapter selector at all is treated as a single-chapter story.
    $story->chapters = $numChapters == 0 ? 1 : $numChapters;

    $coverImageUrl = qp($html, '#profile_top')->find('img')->first()->attr('src');
    if ($coverImageUrl != NULL) {
        $coverImageUrlParts = parse_url($coverImageUrl);
        // Protocol-relative URL ("//host/..."): borrow the story URL's scheme.
        if (!isset($coverImageUrlParts['scheme']) && substr($coverImageUrl, 0, 2) == '//') {
            $coverImageUrl = $urlParts['scheme'] . ":" . $coverImageUrl;
        }
        // Swap the "/75/" path segment for "/180/" (presumably a larger
        // thumbnail size - confirm against the site).
        $coverImageUrl = str_replace('/75/', '/180/', $coverImageUrl);
        $story->coverImageUrl = $coverImageUrl;
    }

    return $story;
}
/**
 * Downloads one chapter of an AdultFanfiction.org story.
 *
 * The chapter page is requested by appending `&chapter=<n>` to $url, so $url
 * must already contain a query string. The title comes from the selected
 * option of the `chapnav` select, with the "<n>." prefix removed.
 *
 * @param string     $url           Story URL (already containing a query string).
 * @param int|string $chapterNumber Chapter number to fetch.
 * @return Chapter Chapter with number, title and content set.
 */
function getAdultFanfictionOrgChapter($url, $chapterNumber)
{
    $response = cURL($url . "&chapter=" . $chapterNumber);
    $parser = new HTML5();
    $html = $parser->loadHTML($response);

    $chapter = new Chapter();
    $chapter->number = $chapterNumber;

    $chapterTitle = trim((string) qp($html, 'select[name=chapnav]')->find('option[selected]')->text());
    $numberPrefix = $chapterNumber . '.';
    if (strpos($chapterTitle, $numberPrefix) === 0) {
        // Remove only the leading "<n>." so that a title such as
        // "1. Version 1.0" keeps the later "1." intact.
        $chapterTitle = substr($chapterTitle, strlen($numberPrefix));
    } else {
        // Title does not start with the prefix: keep the previous behaviour
        // of removing the prefix wherever it occurs.
        $chapterTitle = str_replace($numberPrefix, '', $chapterTitle);
    }
    $chapter->title = trim($chapterTitle);

    // The chapter text is taken from the cells of the fourth <tr> of the
    // form named "form".
    $contentRow = qp($html, 'form[name=form]')->find('tr')->get(3);
    $chapter->content = stripAttributes(trim(qp($contentRow)->find('td')->innerHTML()));

    return $chapter;
}
/**
 * Scrapes story metadata from an Asianfanfics story page.
 *
 * The story ID is the fourth path segment of $url (as in
 * `/story/view/<id>`). The page is fetched through a normalised
 * `<scheme>://<host>/story/view/<id>` URL.
 *
 * @param string $url Story URL whose path looks like `/story/view/<numeric id>`.
 * @return Story Story with id, url, title, author, description and chapters.
 * @throws FicSaveException When the story ID is missing or not numeric, or
 *                          when title, author or description cannot be found.
 */
function getAsianFanficsInfo($url)
{
    $urlParts = parse_url($url);

    // parse_url() returns false for malformed URLs and omits 'path' when the
    // URL has none; both cases mean there is no story ID to read.
    $path = (is_array($urlParts) && isset($urlParts['path'])) ? $urlParts['path'] : '';
    $pathParts = explode('/', $path);

    if (!isset($pathParts[3])) {
        throw new FicSaveException("URL is missing story ID.");
    }

    $storyId = $pathParts[3];
    if (!is_numeric($storyId)) {
        throw new FicSaveException("URL has an invalid story ID: {$storyId}.");
    }

    $url = "{$urlParts['scheme']}://{$urlParts['host']}/story/view/{$storyId}";
    $response = cURL($url);
    $parser = new HTML5();
    $html = $parser->loadHTML($response);

    $story = new Story();
    $story->id = $storyId;
    $story->url = $url;

    $title = trim(qp($html, 'h1.title')->first()->text());
    if (empty($title)) {
        throw new FicSaveException("Could not retrieve title for story at {$url}.");
    }
    $story->title = $title;

    // The author is the text of the element following the first
    // "span.text--info".
    $author = qp(qp($html, 'span.text--info')->get(0))->next()->text();
    if (empty($author)) {
        throw new FicSaveException("Could not retrieve author for story at {$url}.");
    }
    $story->author = $author;

    // The description is the sibling right after the first <h2> in #bodyText.
    // next() hands back a query object even when nothing matched, so a null
    // comparison alone can never detect a missing description; check the
    // match count as well.
    $description = qp($html, '#bodyText')->find('h2')->first()->next();
    if ($description == NULL || $description->count() == 0) {
        throw new FicSaveException("Could not retrieve description for story at {$url}.");
    }
    $story->description = stripAttributes(trim($description->innerHTML()));

    // One option is subtracted from the chapter navigation count (presumably
    // a non-chapter entry such as a foreword - TODO confirm against the site).
    $story->chapters = qp($html, 'select[name="chapterNav"]')->find('option')->count() - 1;

    return $story;
}
/**
 * Callback used by a preg_replace_callback in nextPage to turn wiki table
 * markup into an HTML table.
 *
 * The table source in $matches[2] is split on "|". The first piece supplies
 * the attributes of the <table> tag. Of the remaining pieces, an empty piece
 * or one starting with "-" begins a new row, "+" starts a caption, and "!"
 * (rewritten internally to "#") starts a header cell; anything else is a
 * data cell. A piece that yields cell attributes is held back and applied to
 * the following piece.
 *
 * @param array $matches regex match: $matches[1] is emitted unchanged before
 *      the table, $matches[2] is the wiki source of the table cells
 * @return string $matches[1] followed by the generated HTML table
 */
function makeTableCallback($matches)
{
    // Normalise header markers ("!" / "!!") to "|#" so every cell is "|"-delimited,
    // and put same-line cells ("||") on their own lines.
    $table = str_replace("\n!", "\n|#", $matches[2]);
    $table = str_replace("!!", "||#", $table);
    $table = str_replace("||", "\n|", $table);
    $row_data = explode("|", $table);

    $first = true;
    $out = $matches[1];
    $state = "";
    $skip = false;
    $type = "td";
    $old_type = "td";
    // "rowspace" is kept for backward compatibility although it is not an
    // HTML attribute; "rowspan" is the attribute table cells actually use.
    // NOTE(review): assumes the second argument of stripAttributes is the
    // list of attributes to keep - confirm against its definition.
    $table_cell_attributes = array("align", "colspan", "rowspan", "style",
        "scope", "rowspace", "valign");

    foreach ($row_data as $item) {
        crawlTimeoutLog("..Making Wiki Tables..");

        if ($first) {
            // First piece: attributes for the <table> tag itself.
            $item = trim(str_replace("\n", " ", $item));
            $item = str_replace("&quot;", "\"", $item);
            $item = stripAttributes($item, array('id', 'class', 'style'));
            $out .= "<table {$item} >\n<tr>";
            $first = false;
            continue;
        }

        // Last four characters tell whether the current row is still empty.
        $end = substr($out, -4);

        if ($item == "" || $item[0] == "-") {
            // Row separator: close any open cell, then start a new row.
            if ($end != "<tr>") {
                $out .= "</{$old_type}>";
            }
            $out .= "</tr>\n<tr>";
            continue;
        }

        if ($item[0] == "+") {
            $type = "caption";
            $item = substr($item, 1);
            // A caption is not inside a row: drop the just-opened <tr>.
            if ($end == "<tr>") {
                $out = substr($out, 0, -4);
            }
        } elseif ($item[0] == "#") {
            $type = "th";
            $item = substr($item, 1);
        } else {
            $type = "td";
        }

        $trim_item = trim($item);
        $attribute_trim = str_replace("\n", " ", $trim_item);
        $attribute_trim = str_replace("&quot;", "\"", $attribute_trim);

        // If this piece yields cell attributes, remember them in $state and
        // apply them to the next piece. $skip stops two consecutive pieces
        // from both being consumed as attributes.
        if (!$skip && ($state = trim(stripAttributes($attribute_trim, $table_cell_attributes)))) {
            $old_type = $type;
            $skip = true;
            continue;
        }
        $skip = false;

        if ($end != "<tr>") {
            $out .= "</{$old_type}>";
            if ($old_type == "caption") {
                // Re-open the row that was dropped when the caption started.
                $out .= "<tr>";
            }
        }
        $out .= "<{$type} {$state}>\n{$trim_item}";
        $state = "";
        $old_type = $type;
    }

    $out .= "</{$old_type}></tr></table>";
    return $out;
}
/**
 * Converts an HTML fragment into a plain-text version.
 *
 * Entities are decoded, block-level tags are turned into line breaks,
 * list items become "- " lines, headings become "= Title =" style lines
 * (one "=" per heading level), <hr> becomes a line of dashes, links are
 * rewritten by formatTextLink() and remaining markup is handed to
 * stripAttributes(). Runs of blank lines are collapsed; intentional line
 * breaks are carried through as [[[BR]]] placeholders until the end.
 *
 * @param string $content HTML to convert
 * @return string plain-text rendering of $content
 */
function formatText($content)
{
    $content = html_entity_decode($content, ENT_COMPAT, 'UTF-8');
    $content = str_replace("\r", "\n", $content);

    // Applied in order. [[[BR]]] and [[[HR]]] are placeholders that survive
    // the newline collapsing below and are expanded at the very end.
    $tagMap = array(
        // html_entity_decode() turns &nbsp; into U+00A0 under UTF-8, so
        // normalise both the character and any leftover literal entity.
        '&nbsp;' => ' ',
        "\xC2\xA0" => ' ',
        "</p><p>" => "[[[BR]]]\n",
        "<p>" => "\n",
        '</p>' => "\n",
        "\n\n\n\n<ol>" => "[[[BR]]]",
        "</ol>\n\n\n\n" => "[[[BR]]]",
        "\n\n\n\n<ul>" => "[[[BR]]]",
        "</ul>\n\n\n\n" => "[[[BR]]]",
        "\n\n<ol>" => "[[[BR]]]",
        "</ol>\n\n" => "[[[BR]]]",
        "\n\n<ul>" => "[[[BR]]]",
        "</ul>\n\n" => "[[[BR]]]",
        "<li>\n" => "<li>",
        '<li>' => '- ',
        '</li>' => "\n",
        "<hr>" => "[[[HR]]]",
        "<br />\n" => "[[[BR]]]",
        '<br />' => "[[[BR]]]",
        '<br>' => '[[[BR]]]',
        '<h1>' => "\n= ",
        '</h1>' => " =\n",
        '<h2>' => "\n== ",
        '</h2>' => " ==\n",
        '<h3>' => "\n=== ",
        '</h3>' => " ===\n",
        '<h4>' => "\n==== ",
        '</h4>' => " ====\n",
        '<h5>' => "\n===== ",
        '</h5>' => " =====\n",
    );
    $content = str_replace(array_keys($tagMap), array_values($tagMap), $content);

    // get proper links in text version
    $content = formatTextLink($content);
    $content = stripAttributes($content);

    // turn returns to newlines and tabs to spaces
    $content = str_replace("\r", "\n", $content);
    $content = str_replace("\t", " ", $content);

    // remove spaces that directly follow a newline
    $content = preg_replace("/\n +/", "\n", $content);
    // collapse runs of newlines into a single newline
    $content = preg_replace("/\n{2,}/", "\n", $content);

    // Give headings a blank line above them and expand the placeholders.
    $markerMap = array(
        "\n= " => "\n\n= ",
        "\n== " => "\n\n== ",
        "\n=== " => "\n\n=== ",
        "\n==== " => "\n\n==== ",
        "\n===== " => "\n\n===== ",
        "\n[[[HR]]]" => "\n--------------------------------------------------------------------------------",
        "[[[BR]]]" => "\n",
    );
    $content = str_replace(array_keys($markerMap), array_values($markerMap), $content);

    return $content;
}