/**
 * Build a Manga record from a single item of markup.
 *
 * The item is wrapped in a Crawler and the elements `id`, `title`, `english`,
 * `synonyms`, `chapters`, `volumes`, `score`, `status`, `start_date`,
 * `end_date`, `synopsis` and `image` are read from it.
 *
 * @param mixed $item Markup (or node) accepted by the Crawler constructor.
 *
 * @return Manga The populated record.
 */
private static function parseManga($item)
{
    $crawler = new Crawler($item);
    $manga = new Manga();

    $manga->setId($crawler->filter('id')->text());
    $manga->setTitle($crawler->filter('title')->text());

    // Alternative titles are "; "-separated lists. explode() always returns at
    // least one element, so an empty source shows up as a single '' entry and
    // must be detected on the first element, not on the array itself.
    $otherTitles = array();

    $english = explode('; ', $crawler->filter('english')->text());
    if ($english[0] !== '') {
        $otherTitles['english'] = $english;
    }

    $synonyms = explode('; ', $crawler->filter('synonyms')->text());
    if ($synonyms[0] !== '') {
        $otherTitles['synonyms'] = $synonyms;
    }

    $manga->setOtherTitles($otherTitles);

    $manga->setChapters($crawler->filter('chapters')->text());
    $manga->setVolumes($crawler->filter('volumes')->text());
    $manga->setMembersScore($crawler->filter('score')->text());
    $manga->setStatus($crawler->filter('status')->text());

    // '0000-00-00' is the placeholder for "no date"; leave the field unset then.
    // A value that cannot be parsed at all (createFromFormat() === false) is
    // skipped as well rather than handing `false` to the setter.
    $startDate = $crawler->filter('start_date')->text();
    if ($startDate !== '0000-00-00') {
        $parsedStart = \DateTime::createFromFormat('Y-m-d', $startDate);
        if ($parsedStart !== false) {
            $manga->setStartDate($parsedStart);
        }
    }

    $endDate = $crawler->filter('end_date')->text();
    if ($endDate !== '0000-00-00') {
        $parsedEnd = \DateTime::createFromFormat('Y-m-d', $endDate);
        if ($parsedEnd !== false) {
            $manga->setEndDate($parsedEnd);
        }
    }

    $manga->setSynopsis($crawler->filter('synopsis')->text());
    $manga->setImageUrl($crawler->filter('image')->text());

    return $manga;
}
/**
 * Build an Anime or Manga record from one listing item.
 *
 * @param mixed  $item Markup (or node) accepted by the Crawler constructor.
 * @param string $type Either 'anime' or 'manga'.
 *
 * @return Anime|Manga The populated record.
 *
 * @throws \InvalidArgumentException When $type is neither 'anime' nor 'manga'.
 */
private static function parseRecord($item, $type)
{
    $crawler = new Crawler($item);

    // Initialize our object based on the record type we were passed.
    // Reject anything else up front: without this, $media would be undefined
    // and the first setter call below would fail on null.
    switch ($type) {
        case 'anime':
            $media = new Anime();
            break;
        case 'manga':
            $media = new Manga();
            break;
        default:
            throw new \InvalidArgumentException('Unsupported record type: ' . var_export($type, true));
    }

    // Separate all the details: the "detail" div is read line by line.
    // Line index 1 carries the space-separated sub-details and line index 3 the
    // member count (assumes the div always has at least four lines - TODO confirm).
    $details = explode("\n", trim($crawler->filter('div[class="detail"]')->text()));
    $subDetails = explode(' ', trim($details[1]));

    // Pull out all the common parts.
    $media->setId((int) str_replace('#area', '', $crawler->filter('a')->attr('id')));
    $media->setTitle($crawler->filter('a')->eq(1)->text());

    // The regex replaces everything from the first "r<something>/" up to the end
    // of the URL with just the part between that slash and the "?", i.e. it drops
    // that path segment and the query string (presumably the thumbnail resize
    // segment - verify against a live data-src value).
    $media->setImageUrl(preg_replace('/r(.+?)\\/(.+?)\\?(.+?)$/', '$2', $crawler->filter('img')->attr('data-src')));

    $memberText = str_replace('members', '', $details[3]);
    $media->setMembersCount((int) trim(str_replace(',', '', $memberText)));

    // The score is read from the third table cell for both record types.
    $media->setMembersScore((float) $crawler->filter('td')->eq(2)->text());

    // Anime and manga have different details; a "?" in the count means unknown.
    $countIsUnknown = strpos($subDetails[1], '?') !== false;

    switch ($type) {
        case 'anime':
            $media->setType($subDetails[0]);
            $media->setEpisodes(
                $countIsUnknown ? null : (int) trim(str_replace('eps', '', $subDetails[1]), '()')
            );
            break;
        case 'manga':
            $media->setVolumes(
                $countIsUnknown ? null : (int) trim(str_replace('vols', '', $subDetails[1]), '()')
            );
            break;
    }

    return $media;
}
/**
 * A value passed to setVolumes() must be returned unchanged by getVolumes().
 */
public function testVolumes()
{
    $expected = rand();

    $subject = new Manga();
    $subject->setVolumes($expected);

    $actual = $subject->getVolumes();
    $this->assertEquals($expected, $actual);
}
/**
 * Build an Anime or Manga record from one result row.
 *
 * Cells are addressed by position (td[2]..td[9]); the column layout differs
 * between anime and manga rows, so each type is parsed separately after the
 * common fields.
 *
 * @param mixed  $item Markup (or node) accepted by the Crawler constructor.
 * @param string $type Either 'anime' or 'manga'.
 *
 * @return Anime|Manga The populated record.
 *
 * @throws \InvalidArgumentException When $type is neither 'anime' nor 'manga'.
 */
private static function parserecord($item, $type)
{
    $crawler = new Crawler($item);

    // Get the record type. Anything else is rejected here; otherwise $media
    // would be undefined and the first setter call would fail on null.
    switch ($type) {
        case 'anime':
            $media = new Anime();
            break;
        case 'manga':
            $media = new Manga();
            break;
        default:
            throw new \InvalidArgumentException('Unsupported record type: ' . var_export($type, true));
    }

    // Reads the trimmed text of the first node matched by an XPath expression.
    $cellText = function ($xpath) use ($crawler) {
        return trim($crawler->filterXPath($xpath)->text());
    };

    // Turns a "mm-dd-yy" cell into array($literal, $dateTime, $accuracy), or null
    // when no usable date is present. Unknown parts are written with "?".
    $parseDate = function ($raw) {
        if ($raw === '-') {
            return null;
        }

        $parts = explode('-', $raw);
        if (count($parts) < 3) {
            // Not in the expected three-part form; nothing reliable to extract.
            return null;
        }

        $month = $parts[0];
        $day = $parts[1];
        $year = $parts[2];

        // We must have a year. If we don't even have that, don't set a date.
        if (strpos($year, '?') !== false) {
            return null;
        }
        if (strlen($year) == 2) {
            $year = (string) self::fixMalShortYear($year);
        }

        // The leading "!" resets every field that the format does not mention.
        // Without it the missing day/time are taken from "now", and a month-only
        // date parsed on the 29th-31st can overflow into the following month.
        if (strpos($month, '?') !== false) {
            // If we don't know the month, then we can only be accurate to a year.
            $literal = null;
            $accuracy = 'year';
            $date = DateTime::createFromFormat('!Y', $year);
        } elseif (strpos($day, '?') !== false) {
            $literal = null;
            $accuracy = 'month';
            $date = DateTime::createFromFormat('!Y m', "{$year} {$month}");
        } else {
            $literal = "{$year}-{$month}-{$day}";
            $accuracy = 'day';
            $date = DateTime::createFromFormat('!Y m d', "{$year} {$month} {$day}");
        }

        if ($date === false) {
            return null;
        }

        return array($literal, $date, $accuracy);
    };

    // Pull out all the common parts.
    $media->setId((int) str_replace('sarea', '', $crawler->filter('a[class="hoverinfo_trigger"]')->attr('id')));
    $media->setTitle($crawler->filter('strong')->text());

    // Title image. We need to do some string manipulation here so it doesn't
    // return a tiny image: the regex keeps only the part between the first
    // "r<something>/" segment and the "?" of the query string.
    $media->setImageUrl(preg_replace('/r(.+?)\\/(.+?)\\?(.+?)$/', '$2', $crawler->filter('img')->attr('data-src')));

    $media->setType($cellText('//td[3]'));

    switch ($type) {
        case 'anime':
            // Custom parsing for anime.
            $media->setEpisodes((int) $cellText('//td[4]'));

            $start = $parseDate($cellText('//td[6]'));
            if ($start !== null) {
                $media->setLiteralStartDate($start[0], $start[1], $start[2]);
            }

            $end = $parseDate($cellText('//td[7]'));
            if ($end !== null) {
                $media->setLiteralEndDate($end[0], $end[1], $end[2]);
            }

            $classification = $cellText('//td[9]');
            if ($classification !== '-') {
                $media->setClassification($classification);
            }

            $media->setMembersScore((float) $cellText('//td[5]'));

            $synopsis = $crawler->filterXPath('//td[2]/div[2]')->text();
            if ($synopsis !== '') {
                $media->setSynopsis(str_replace('read more.', '', trim($synopsis)));
            }
            break;

        case 'manga':
            // Custom parsing for manga (the type was already set from td[3] above).
            $media->setChapters((int) $cellText('//td[5]'));
            $media->setVolumes((int) $cellText('//td[4]'));
            $media->setMembersScore((float) $cellText('//td[6]'));
            $media->setSynopsis(str_replace('read more.', '', $cellText('//td[2]/div[2]')));
            break;
    }

    return $media;
}
/**
 * Parse the HTML of a manga detail page into a Manga record.
 *
 * Reads, in order: id, title, rank, image, alternative titles, information
 * (type, volumes, chapters, status, genres), statistics (score, popularity,
 * members, favorites), synopsis, related titles and - when the "Edit Status"
 * box is present - the viewing user's personal list data.
 *
 * @param string $contents HTML source of the page (loaded as UTF-8).
 * @param bool   $mine     Not used by this method; kept so existing callers
 *                         that pass it keep working.
 *
 * @return Manga The populated record.
 */
public static function parse($contents, $mine = false)
{
    $crawler = new Crawler();
    $crawler->addHTMLContent($contents, 'UTF-8');

    $mangarecord = new Manga();

    // Manga ID.
    // Example: <input type="hidden" value="104" name="mid" />
    $mangarecord->setId((int) $crawler->filter('input[name="mid"]')->attr('value'));

    // Title and rank.
    // Example: <span itemprop="name">One Punch-Man</span>
    $mangarecord->setTitle(trim($crawler->filter('span[itemprop="name"]')->text()));

    $rank = $crawler->filterXPath('//span[contains(@class, "ranked")]');
    if (count($rank) > 0) {
        $mangarecord->setRank((int) str_replace('Ranked #', '', $rank->text()));
    }

    // Title image.
    // Example: <img src="http://cdn.myanimelist.net/images/manga/4/90029.jpg" alt="Yotsubato!">
    // A trailing "t.jpg" is rewritten to ".jpg".
    $mangarecord->setImageUrl(
        str_replace('t.jpg', '.jpg', $crawler->filter('div#content tr td div img')->attr('src'))
    );

    // Left column - alt titles, info, stats, tags.
    $leftColumn = $crawler->filterXPath('//div[@id="content"]/table/tr/td[@class="borderClass"]');

    // Every left-column entry has the shape
    //     <div><span class="dark_text">Label:</span> value</div>
    // This returns the text of the element wrapping the span with the label text
    // removed (not trimmed), or null when no span with exactly that text exists.
    $readField = function ($label) use ($leftColumn) {
        $span = $leftColumn->filterXPath('//span[text()="' . $label . '"]');
        if ($span->count() === 0) {
            return null;
        }

        return str_replace($span->text(), '', $span->parents()->text());
    };

    // Alternative Titles section.
    // Example:
    //   <div class="spaceit_pad"><span class="dark_text">English:</span> Yotsuba&!</div>
    //   <div class="spaceit_pad"><span class="dark_text">Synonyms:</span> Yotsubato!, Yotsuba and !</div>
    //   <div class="spaceit_pad"><span class="dark_text">Japanese:</span> ...</div>
    $otherTitles = array();
    $titleLabels = array('english' => 'English:', 'synonyms' => 'Synonyms:', 'japanese' => 'Japanese:');
    foreach ($titleLabels as $key => $label) {
        $value = $readField($label);
        if ($value !== null) {
            $otherTitles[$key] = explode(', ', trim($value));
        }
    }
    if (count($otherTitles) > 0) {
        $mangarecord->setOtherTitles($otherTitles);
    }

    // Information section.
    // Example:
    //   <div><span class="dark_text">Type:</span> Manga</div>
    //   <div class="spaceit"><span class="dark_text">Volumes:</span> Unknown</div>
    //   <div><span class="dark_text">Chapters:</span> Unknown</div>
    //   <div class="spaceit"><span class="dark_text">Status:</span> Publishing</div>
    //   <div class="spaceit"><span class="dark_text">Genres:</span> <a>Comedy</a>, <a>Slice of Life</a></div>

    // Type:
    $value = $readField('Type:');
    if ($value !== null) {
        $mangarecord->setType(trim($value));
    }

    // Volumes: null unless a concrete number is listed ("Unknown" stays null).
    $mangarecord->setVolumes(null);
    $value = $readField('Volumes:');
    if ($value !== null && trim($value) !== 'Unknown') {
        $mangarecord->setVolumes((int) trim($value));
    }

    // Chapters: same rule as volumes.
    $mangarecord->setChapters(null);
    $value = $readField('Chapters:');
    if ($value !== null && trim($value) !== 'Unknown') {
        $mangarecord->setChapters((int) trim($value));
    }

    // Status:
    $value = $readField('Status:');
    if ($value !== null) {
        $mangarecord->setStatus(strtolower(trim($value)));
    }

    // Genres:
    $value = $readField('Genres:');
    if ($value !== null) {
        $mangarecord->setGenres(explode(', ', trim($value)));
    }

    // Statistics.
    // Example:
    //   <div><span class="dark_text">Score:</span> 8.90<sup><small>1</small></sup> <small>(scored by 4899 users)</small></div>
    //   <div><span class="dark_text">Popularity:</span> #32</div>
    //   <div class="spaceit"><span class="dark_text">Members:</span> 8,344</div>
    //   <div><span class="dark_text">Favorites:</span> 1,700</div>
    //TODO: Rewrite to properly clean up excess tags.

    // Score:
    $value = $readField('Score:');
    if ($value !== null) {
        // Remove the parenthetical at the end of the string.
        $parenAt = strpos($value, '(');
        if ($parenAt !== false) {
            $value = substr($value, 0, $parenAt);
        }
        $value = trim($value);

        // Sometimes there is a superscript number at the end from a note.
        // Scores are only two decimals, so number_format() drops the excess.
        // number_format() needs a real number; anything non-numeric is stored
        // as 0.0 instead of being passed through as a string.
        $score = is_numeric($value) ? (float) number_format((float) $value, 2) : 0.0;
        $mangarecord->setMembersScore($score);
    }

    // Popularity:
    $value = $readField('Popularity:');
    if ($value !== null) {
        // Remove the hash at the front and trim whitespace so we can cast to an int.
        $mangarecord->setPopularityRank((int) trim(str_replace('#', '', $value)));
    }

    // Members:
    $value = $readField('Members:');
    if ($value !== null) {
        // Thousands separators would stop the int cast at the comma. Remove them.
        $mangarecord->setMembersCount((int) trim(str_replace(',', '', $value)));
    }

    // Favorites:
    $value = $readField('Favorites:');
    if ($value !== null) {
        $mangarecord->setFavoritedCount((int) trim(str_replace(',', '', $value)));
    }

    // -
    // Extract from sections on the right column: Synopsis, Related Manga
    // -
    $rightColumn = $crawler->filterXPath('//div[@id="content"]/table/tr/td[2]');

    // Synopsis.
    // Compatibility note: extended characters are not converted to HTML
    // entities; the markup is used as-is, which is fine for a UTF-8 response.
    $synopsis = $crawler->filterXPath('//span[@itemprop="description"]');
    $mangarecord->setSynopsis('There is currently no synopsis for this title.');
    if ($synopsis->count() > 0) {
        $mangarecord->setSynopsis($synopsis->html());
    }

    // Related Manga.
    // Example:
    //   <table class="anime_detail_related_anime">
    //     <tr>
    //       <td class="ar fw-n borderClass">Side story:</td>
    //       <td class="borderClass"><a href="/manga/13992/Azumanga_Daioh:_Hoshuu-hen">Azumanga Daioh: Hoshuu-hen</a></td>
    //     </tr>
    //   </table>
    // NOTE: Not all relations are currently supported.
    $related = $rightColumn->filter('table.anime_detail_related_anime');
    if ($related->count() > 0) {
        foreach ($related->children() as $row) {
            // First cell holds the relation label, the next one the linked items.
            $labelCell = $row->firstChild;
            if ($labelCell === null || $labelCell->nextSibling === null) {
                continue;
            }

            $relationType = rtrim($labelCell->nodeValue, ':');

            for ($node = $labelCell->nextSibling->firstChild; $node !== null; $node = $node->nextSibling) {
                // Only <a> elements are items; separators are plain text nodes.
                if (!($node instanceof \DOMElement) || $node->tagName !== 'a') {
                    continue;
                }

                $url = $node->getAttribute('href');
                if (preg_match('/\\/(anime|manga)\\/(\\d+)\\/.*?/', $url, $urlParts) !== 1) {
                    continue;
                }

                $itemArray = array();
                if ($urlParts[1] == 'anime') {
                    $itemArray['anime_id'] = (int) $urlParts[2];
                } else {
                    $itemArray['manga_id'] = (int) $urlParts[2];
                }
                $itemArray['title'] = $node->textContent;
                $itemArray['url'] = 'https://myanimelist.net' . $url;

                switch ($relationType) {
                    case 'Adaptation':
                        $mangarecord->setAnimeAdaptations($itemArray);
                        break;
                    case 'Alternative version':
                        $mangarecord->setAlternativeVersions($itemArray);
                        break;
                    case 'Other':
                    default:
                        $mangarecord->setRelatedManga($itemArray);
                        break;
                }
            }
        }
    }

    // Personal info. Only try to parse it if the "Edit Status" box is there
    // (it is only rendered when the user is authenticated).
    $userPersonalInfo = $crawler->filterXPath('//h2[text()="Edit Status"]');
    if ($userPersonalInfo->count() > 0) {
        // Read status.
        $statusSelect = $crawler->filter('select#myinfo_status');
        if ($statusSelect->count() > 0) {
            $selected = $statusSelect->filter('option[selected="selected"]');
            if ($selected->count() > 0) {
                $mangarecord->setReadStatus($selected->attr('value'));
            }
        }

        // Chapters read.
        $chaptersInput = $crawler->filter('input#myinfo_chapters');
        if ($chaptersInput->count() > 0) {
            $mangarecord->setChaptersRead((int) $chaptersInput->attr('value'));
        }

        // Volumes read.
        $volumesInput = $crawler->filter('input#myinfo_volumes');
        if ($volumesInput->count() > 0) {
            $mangarecord->setVolumesRead((int) $volumesInput->attr('value'));
        }

        // User's score.
        $scoreSelect = $crawler->filter('select#myinfo_score');
        if ($scoreSelect->count() > 0) {
            $selected = $scoreSelect->filter('option[selected="selected"]');
            if ($selected->count() > 0) {
                $mangarecord->setScore((int) $selected->attr('value'));
            }
        }

        // Listed ID, taken from the "id=<digits>" part of the "Edit Details" link.
        $editLink = $crawler->filterXPath('//a[text()="Edit Details"]');
        if ($editLink->count() > 0) {
            if (preg_match('/id=(\\d+)/', $editLink->attr('href'), $listedId)) {
                $mangarecord->setListedMangaId((int) $listedId[1]);
            }
        }
    }

    return $mangarecord;
}