/**
 * Convert a list of DOM nodes into Anime records.
 *
 * For every node the title link, producers, episode count, genres, image,
 * synopsis, type, member count and member score are read and stored on a
 * fresh Anime instance.
 *
 * @param iterable $rows Items that can each be wrapped in a Crawler
 *                       (presumably DOMNode instances - confirm against the caller).
 *
 * @return Anime[] One record per row, in input order.
 */
public static function parseDay($rows)
{
    $result = array();

    foreach ($rows as $item) {
        $crawler = new Crawler($item);
        $anime = new Anime();

        $titleLink = $crawler->filter('a[class="link-title"]');
        $url = $titleLink->attr('href');

        // preg_match() returns 1 only when the pattern matched. The previous check
        // ("!== false || !== 0") was always true and read $urlParts[2] even when
        // nothing had been captured.
        if (preg_match('/\\/(anime|manga)\\/(\\d+)\\/.*?/', (string) $url, $urlParts) === 1) {
            $anime->setId((int) $urlParts[2]);
        }

        $anime->setTitle(trim($titleLink->text()));

        // Crawler::text() only returns the text of the FIRST matched node, so every
        // producer link is visited individually to avoid dropping all but the first.
        $producerLinks = $crawler->filter('span[class="producer"] a');
        if ($producerLinks->count() > 0) {
            $producers = array();
            foreach ($producerLinks as $producerLink) {
                $producers[] = trim($producerLink->nodeValue);
            }
            $anime->setProducers($producers);
        }

        // The episode label looks like "12 eps"; strip the suffix before casting.
        $anime->setEpisodes((int) str_replace(' eps', '', $crawler->filter('div[class="eps"] span')->text()));

        $genreArray = array();
        foreach ($crawler->filter('div[class="genres-inner js-genre-inner"] a') as $genre) {
            $genreCrawler = new Crawler($genre);
            $genreArray[] = $genreCrawler->text();
        }
        $anime->setGenres($genreArray);

        $anime->setImageUrl($crawler->filter('div[class="image lazyload"]')->attr('data-bg'));
        $anime->setSynopsis(trim($crawler->filter('div[class="synopsis js-synopsis"]')->text()));

        // The info line is hyphen separated; only its first segment is used as the type.
        $detail = explode('-', $crawler->filter('div[class="info"]')->text());
        $anime->setType(trim($detail[0]));

        // Thousands separators have to go before the integer cast.
        $members = trim($crawler->filter('span[class="member fl-r"]')->text());
        $anime->setMembersCount((int) str_replace(',', '', $members));

        $anime->setMembersScore((float) trim($crawler->filter('span[class="score"]')->text()));

        $result[] = $anime;
    }

    return $result;
}
/**
 * Build an Anime record from a single entry node.
 *
 * The child elements id, title, english, synonyms, episodes, score, status,
 * start_date, end_date, synopsis and image are read from the node.
 *
 * @param mixed $item A node accepted by the Crawler constructor
 *                    (presumably a DOMNode of an XML search result - confirm against the caller).
 *
 * @return Anime The populated record.
 */
private static function parseAnime($item)
{
    $crawler = new Crawler($item);
    $anime = new Anime();

    $anime->setId($crawler->filter('id')->text());
    $anime->setTitle($crawler->filter('title')->text());

    // Alternative titles are "; " separated. explode() always yields at least one
    // element, so an empty source string shows up as a single empty first element.
    // The previous English check compared the whole array with '' and therefore
    // always passed, storing [''] for titles without an English name.
    $otherTitles = array();

    $english = explode('; ', $crawler->filter('english')->text());
    if ($english[0] !== '') {
        $otherTitles['english'] = $english;
    }

    $synonyms = explode('; ', $crawler->filter('synonyms')->text());
    if ($synonyms[0] !== '') {
        $otherTitles['synonyms'] = $synonyms;
    }

    $anime->setOtherTitles($otherTitles);

    $anime->setEpisodes($crawler->filter('episodes')->text());
    $anime->setMembersScore($crawler->filter('score')->text());
    $anime->setStatus($crawler->filter('status')->text());

    // '0000-00-00' is the placeholder for an unknown date and is left unset.
    // createFromFormat() is static, so no throwaway DateTime instance is needed.
    $startDate = $crawler->filter('start_date')->text();
    if ($startDate !== '0000-00-00') {
        $anime->setStartDate(\DateTime::createFromFormat('Y-m-d', $startDate));
    }

    $endDate = $crawler->filter('end_date')->text();
    if ($endDate !== '0000-00-00') {
        $anime->setEndDate(\DateTime::createFromFormat('Y-m-d', $endDate));
    }

    $anime->setSynopsis($crawler->filter('synopsis')->text());
    $anime->setImageUrl($crawler->filter('image')->text());

    return $anime;
}
/**
 * Build an Anime record from the HTML of an anime details page.
 *
 * The left column supplies the alternative titles, the information block and the
 * statistics; the right column supplies the synopsis and the related titles. When
 * the "Edit Status" box is present the viewer's own list data is read as well.
 * Background, broadcast, duration, external links, preview, theme songs and
 * recommendations are only extracted when $apiVersion is at least '2.1'.
 *
 * @param string $contents   HTML source of the details page.
 * @param string $apiVersion Requested API version, compared with '2.1' using PHP's
 *                           loose ">=" comparison.
 *
 * @return Anime The populated record.
 */
public static function parse($contents, $apiVersion)
{
    $crawler = new Crawler();
    $crawler->addHTMLContent($contents, 'UTF-8');

    $animerecord = new Anime();

    # Anime ID.
    # Example:
    # <input type="hidden" name="aid" value="790">
    $animerecord->setId((int) $crawler->filter('input[name="aid"]')->attr('value'));

    # Title and rank.
    # Example:
    # <span itemprop="name">One Piece</span>
    $animerecord->setTitle(trim($crawler->filter('span[itemprop="name"]')->text()));

    $rank = $crawler->filterXPath('//span[contains(@class, "ranked")]');
    if ($rank->count() > 0) {
        $animerecord->setRank((int) str_replace('Ranked #', '', $rank->text()));
    }

    # Title Image
    # Example:
    # <a href="http://myanimelist.net/anime/16353/Love_Lab/pic&pid=50257"><img src="http://cdn.myanimelist.net/images/anime/12/50257.jpg" alt="Love Lab" align="center"></a>
    $animerecord->setImageUrl(str_replace('t.jpg', '.jpg', $crawler->filter('div#content tr td div img')->attr('src')));

    $leftcolumn = $crawler->filterXPath('//div[@id="content"]/table/tr/td[@class="borderClass"]');

    // Returns the text of the element that contains the given "Label:" span, with the
    // label itself removed, or null when the left column has no such label.
    // The result is NOT trimmed; callers trim where they need to.
    $labelText = function ($label) use ($leftcolumn) {
        $span = $leftcolumn->filterXPath('//span[text()="' . $label . '"]');
        if ($span->count() === 0) {
            return null;
        }

        return str_replace($span->text(), '', $span->parents()->text());
    };

    # Alternative Titles section.
    # Example:
    # <h2>Alternative Titles</h2>
    # <div class="spaceit_pad"><span class="dark_text">English:</span> Lucky Star</div>
    # <div class="spaceit_pad"><span class="dark_text">Synonyms:</span> Lucky Star, Raki ☆ Suta</div>
    # <div class="spaceit_pad"><span class="dark_text">Japanese:</span> らき すた</div>
    $other_titles = array();
    $titleLabels = array('english' => 'English:', 'synonyms' => 'Synonyms:', 'japanese' => 'Japanese:');
    foreach ($titleLabels as $titleKey => $titleLabel) {
        $text = $labelText($titleLabel);
        if ($text !== null) {
            $other_titles[$titleKey] = explode(', ', trim($text));
            $animerecord->setOtherTitles($other_titles);
        }
    }

    # Information section.
    # Example:
    # <h2>Information</h2>
    # <div><span class="dark_text">Type:</span> TV</div>
    # <div class="spaceit"><span class="dark_text">Episodes:</span> 24</div>
    # <div><span class="dark_text">Status:</span> Finished Airing</div>
    # <div class="spaceit"><span class="dark_text">Aired:</span> Apr 9, 2007 to Sep 17, 2007</div>
    # <div>
    #   <span class="dark_text">Producers:</span>
    #   <a href="http://myanimelist.net/anime.php?p=2">Kyoto Animation</a>,
    #   <a href="http://myanimelist.net/anime.php?p=104">Lantis</a>,
    #   <a href="http://myanimelist.net/anime.php?p=262">Kadokawa Pictures USA</a><sup><small>L</small></sup>,
    #   <a href="http://myanimelist.net/anime.php?p=286">Bang Zoom! Entertainment</a>
    # </div>
    # <div class="spaceit">
    #   <span class="dark_text">Genres:</span>
    #   <a href="http://myanimelist.net/anime.php?genre[]=4">Comedy</a>,
    #   <a href="http://myanimelist.net/anime.php?genre[]=20">Parody</a>,
    #   <a href="http://myanimelist.net/anime.php?genre[]=23">School</a>,
    #   <a href="http://myanimelist.net/anime.php?genre[]=36">Slice of Life</a>
    # </div>
    # <div><span class="dark_text">Duration:</span> 24 min. per episode</div>
    # <div class="spaceit"><span class="dark_text">Rating:</span> PG-13 - Teens 13 or older</div>

    # Type:
    $text = $labelText('Type:');
    if ($text !== null) {
        $animerecord->setType(trim($text));
    }

    # Episodes:
    $text = $labelText('Episodes:');
    if ($text !== null) {
        $episodeCount = trim($text);
        // Non-numeric values (e.g. an unknown episode count) are left unset.
        if (is_numeric($episodeCount)) {
            $animerecord->setEpisodes((int) $episodeCount);
        }
    }

    # Status:
    $text = $labelText('Status:');
    if ($text !== null) {
        $animerecord->setStatus(strtolower(trim($text)));
    }

    # Aired:
    $text = $labelText('Aired:');
    if ($text !== null) {
        /*
         * NOTE: The Ruby API has a bug where yet-to-air shows that only have one date
         * get that listed as the "end_date", not the "start_date". The code below fixes
         * this and in doing so deliberately breaks compatibility in order to present the
         * data properly.
         */
        $daterange = explode(' to ', trim($text));

        //MAL always provides record dates in US-style format.
        if (strpos($daterange[0], ',') === false) {
            if (strlen($daterange[0]) === 4) {
                //Year only. Example ID 6535 or 9951
                $animerecord->setStartDate(DateTime::createFromFormat('Y m d', $daterange[0] . ' 01 01'), 'year');
            } elseif ($daterange[0] !== 'Not available') {
                //Month and year. Example ID 22535 (check upcoming list)
                $animerecord->setStartDate(DateTime::createFromFormat('M Y d', $daterange[0] . ' 01'), 'month');
            }
        } else {
            $dateComponents = explode(' ', $daterange[0]);
            if (count($dateComponents) == 2) {
                //MAL has been showing a comma with month and year (Jan, 2016), so catch that
                $month = substr($dateComponents[0], 0, -1);
                $year = $dateComponents[1];
                $animerecord->setStartDate(DateTime::createFromFormat('M Y d', $month . ' ' . $year . ' 01'), 'month');
            } elseif (strlen($daterange[0]) !== 7 && strlen($daterange[0]) !== 8) {
                $animerecord->setStartDate(DateTime::createFromFormat('M j, Y', $daterange[0]), 'day');
            }
        }

        //Series not yet to air won't list a range at all while currently airing series will use a "?"
        //For these, the end date is left unset.
        if (count($daterange) > 1 && $daterange[1] !== '?') {
            //MAL always provides record dates in US-style format.
            //Not all dates are full, so we have to figure out how to properly parse them
            $dateParts = explode(' ', $daterange[1]);
            $firstIsNumber = is_numeric($dateParts[0]);
            $hasComma = strpos($dateParts[0], ',');

            if (count($dateParts) == 3) {
                //Full date, normal processing
                $endDate = DateTime::createFromFormat('M j, Y', $daterange[1]);
                $animerecord->setEndDate($endDate, 'day');
            } elseif (count($dateParts) == 2) {
                //We only have two parts, figure out what we were given
                if ($firstIsNumber === false && $hasComma !== false) {
                    //So, it looks like month and year, because MAL adds the comma regardless.
                    $endDate = DateTime::createFromFormat('M, Y d', $daterange[1] . ' 01'); //Example ID 21275
                    $animerecord->setEndDate($endDate, 'month');
                }
            } elseif (count($dateParts) == 1 && $firstIsNumber) {
                //Most likely just a year.
                $endDate = DateTime::createFromFormat('Y m d', $daterange[1] . ' 01 01');
                $animerecord->setEndDate($endDate, 'year'); //Example ID 11836
            }
        }
    }

    # Producers:
    $extracted = $leftcolumn->filterXPath('//span[text()="Producers:"]');
    // The count check has to come first: asking an empty node list for its parents
    // fails, which is what happened before when the label was missing.
    if ($extracted->count() > 0 && strpos($extracted->parents()->text(), 'None found') === false) {
        $producers = array();
        foreach ($extracted->parents()->first()->filter('a') as $rItem) {
            $producers[] = $rItem->nodeValue;
        }
        $animerecord->setProducers($producers);
    }

    # Genres:
    $extracted = $leftcolumn->filterXPath('//span[text()="Genres:"]');
    if ($extracted->count() > 0) {
        $genres = array();
        foreach ($extracted->parents()->first()->filter('a') as $rItem) {
            $genres[] = $rItem->nodeValue;
        }
        if (count($genres) > 0) {
            $animerecord->setGenres($genres);
        }
    }

    # Classification:
    $text = $labelText('Rating:');
    if ($text !== null) {
        $animerecord->setClassification(trim($text));
    }

    # Statistics
    # Example:
    # <h2>Statistics</h2>
    # <div>
    #   <span class="dark_text">Score:</span> 8.41<sup><small>1</small></sup>
    #   <small>(scored by 22601 users)</small>
    # </div>
    # <div class="spaceit"><span class="dark_text">Ranked:</span> #96<sup><small>2</small></sup></div>
    # <div><span class="dark_text">Popularity:</span> #15</div>
    # <div class="spaceit"><span class="dark_text">Members:</span> 36,961</div>
    # <div><span class="dark_text">Favorites:</span> 2,874</div>

    //TODO: Rewrite to properly clean up excess tags.

    # Score:
    $text = $labelText('Score:');
    if ($text !== null) {
        //Remove the parenthetical at the end of the string
        $score = trim(str_replace(strstr($text, '('), '', $text));
        //Sometimes there is a superscript number at the end from a note.
        //Scores are only two decimals, so number_format should chop off the excess, hopefully.
        if (strpos($score, 'N/A') === false) {
            // Explicit float cast so number_format() is never handed a string.
            $animerecord->setMembersScore((float) number_format((float) $score, 2));
        }
    }

    # Popularity:
    $text = $labelText('Popularity:');
    if ($text !== null) {
        //Remove the hash at the front of the string and trim whitespace. Needed so we can cast to an int.
        $animerecord->setPopularityRank((int) trim(str_replace('#', '', $text)));
    }

    # Members:
    $text = $labelText('Members:');
    if ($text !== null) {
        //PHP doesn't like commas in integers. Remove it.
        $animerecord->setMembersCount((int) trim(str_replace(',', '', $text)));
    }

    # Favorites:
    $text = $labelText('Favorites:');
    if ($text !== null) {
        //PHP doesn't like commas in integers. Remove it.
        $animerecord->setFavoritedCount((int) trim(str_replace(',', '', $text)));
    }

    # -
    # Extract from sections on the right column: Synopsis, Related Anime, Characters & Voice Actors, Reviews
    # Recommendations.
    # -
    $rightcolumn = $crawler->filterXPath('//div[@id="content"]/table/tr/td[2]');

    # Synopsis
    # Example:
    # <td>
    # <h2>Synopsis</h2>
    # Having fun in school, doing homework together, cooking and eating, playing videogames, watching anime. All those little things make up the daily life of the anime- and chocolate-loving Izumi Konata and her friends. Sometimes relaxing but more than often simply funny! <br />
    # -From AniDB
    $description = $crawler->filterXPath('//span[@itemprop="description"]');

    //Compatibility Note: We don't convert extended characters to HTML entities, we just
    //use the output directly from MAL. This should be okay as our return charset is UTF-8.
    $animerecord->setSynopsis('There is currently no synopsis for this title.');
    if ($description->count() > 0) {
        $animerecord->setSynopsis($description->html());
    }

    # Related Anime
    # Example:
    #<table class="anime_detail_related_anime" style="border-spacing:0px;">
    #  <tr>
    #    <td class="ar fw-n borderClass" nowrap="" valign="top">Adaptation:</td>
    #    <td class="borderClass" width="100%"><a href="/manga/587/Lucky☆Star">Lucky☆Star</a></td>
    #  </tr>
    #  <tr>
    #    <td class="ar fw-n borderClass" nowrap="" valign="top">Character:</td>
    #    <td class="borderClass" width="100%"><a href="/anime/3080/Anime_Tenchou">Anime Tenchou</a></td>
    #  </tr>
    #</table>
    $related = $rightcolumn->filter('table.anime_detail_related_anime');

    //NOTE: Not all relations are currently supported.
    if ($related->count() > 0) {
        foreach ($related->children() as $row) {
            $rowItem = $row->firstChild;
            // Rows without a label cell or without a following items cell carry no relation.
            if ($rowItem === null || $rowItem->nextSibling === null) {
                continue;
            }

            // "Alternative version:" becomes "alternative_version".
            $relationType = strtr(strtolower(rtrim($rowItem->nodeValue, ':')), ' ', '_');

            //Walk every child of the next td, which contains the related items
            for ($relatedItem = $rowItem->nextSibling->firstChild; $relatedItem !== null; $relatedItem = $relatedItem->nextSibling) {
                // Only anchor elements describe a related title; text/comment nodes are separators.
                if ($relatedItem->nodeType !== XML_ELEMENT_NODE || $relatedItem->nodeName !== 'a') {
                    continue;
                }

                $url = $relatedItem->getAttribute('href');

                // preg_match() returns 1 only on a real match, in which case both the
                // type (anime|manga) and the numeric ID have been captured.
                if (preg_match('/\\/(anime|manga)\\/(\\d+)\\/.*?/', $url, $urlParts) !== 1) {
                    continue;
                }

                $itemId = (int) $urlParts[2];

                $itemArray = array();
                if ($urlParts[1] == 'anime') {
                    $itemArray['anime_id'] = $itemId;
                } else {
                    $itemArray['manga_id'] = $itemId;
                }
                $itemArray['title'] = $relatedItem->textContent;
                $itemArray['url'] = 'https://myanimelist.net' . $url;

                $animerecord->addRelation($itemArray, $relationType);
            }
        }
    }

    # Personal Info
    $userPersonalInfo = $crawler->filterXPath('//h2[text()="Edit Status"]');

    // Only try to parse personal info if the box is there
    if ($userPersonalInfo->count() > 0) {
        #Watched Status - Only available when user is authenticated
        $my_data = $crawler->filter('select#myinfo_status');
        if ($my_data->count() > 0) {
            $selected = $my_data->filter('option[selected="selected"]');
            if ($selected->count() > 0) {
                $animerecord->setWatchedStatus($selected->attr('value'));
            }
        }

        #Watched Episodes - Only available when user is authenticated
        $my_data = $crawler->filter('input#myinfo_watchedeps');
        if ($my_data->count() > 0) {
            $animerecord->setWatchedEpisodes((int) $my_data->attr('value'));
        }

        #User's Score - Only available when user is authenticated
        $my_data = $crawler->filter('select#myinfo_score');
        if ($my_data->count() > 0) {
            $selected = $my_data->filter('option[selected="selected"]');
            if ($selected->count() > 0) {
                $animerecord->setScore((int) $selected->attr('value'));
            }
        }

        #Listed ID (?) - Only available when user is authenticated
        $my_data = $crawler->filterXPath('//a[text()="Edit Details"]');
        if ($my_data->count() > 0) {
            // A separate variable receives the match so the crawler is not overwritten.
            if (preg_match('/id=(\\d+)/', (string) $my_data->attr('href'), $listedIdParts)) {
                $animerecord->setListedAnimeId((int) $listedIdParts[1]);
            }
        }
    }

    if ($apiVersion >= '2.1') {
        # Background
        // Guarded: pages without a description span (or without a recognisable
        // Background block) leave the background unset instead of failing.
        $descriptionSpan = $crawler->filter('span[itemprop="description"]');
        if ($descriptionSpan->count() > 0
            && preg_match('/div>Background<\\/h2>(.+?)<div/s', $descriptionSpan->parents()->html(), $matches) === 1
        ) {
            if (strpos($matches[0], 'No background information') !== false) {
                $animerecord->setBackground('No background information has been added to this title.');
            } else {
                $animerecord->setBackground(trim($matches[1]));
            }
        }

        # Broadcast:
        $text = $labelText('Broadcast:');
        if ($text !== null) {
            $animerecord->setBroadcast(trim(preg_replace('/(\\w.+)s at(\\s\\d.+)\\((\\w.+)\\)/', '$1$2$3', $text)));
        }

        # Duration:
        $text = $labelText('Duration:');
        if ($text !== null) {
            $duration = trim($text);

            // Handle variations that include minutes
            if (strpos($duration, 'min.') !== false) {
                if (strpos($duration, 'hr.') !== false) {
                    //contains hours and minutes
                    if (preg_match('/([0-9]+) hr\\. ([0-9]+) min\\./', $duration, $durationParts) === 1) {
                        $hours = (int) $durationParts[1];
                        $minutes = (int) $durationParts[2];
                        $animerecord->setDuration($hours * 60 + $minutes);
                    }
                } elseif (preg_match('/([0-9]+) min\\./', $duration, $durationParts) === 1) {
                    //contains only minutes
                    $animerecord->setDuration((int) $durationParts[1]);
                }
            } elseif (strpos($duration, 'hr.') !== false) {
                //Handle hour-only durations
                if (preg_match('/([0-9]+) hr\\./', $duration, $durationParts) === 1) {
                    $animerecord->setDuration((int) $durationParts[1] * 60);
                }
            }
            // Any other format (such as just "Unknown") isn't understood and is ignored
        }

        # External Links:
        // External links are only visible when a user has logged in and may be hidden on some records.
        $externalLinks = $crawler->filterXPath('//h2[text()="External Links"]');
        if ($externalLinks->count() > 0) {
            foreach ($externalLinks->nextAll()->filter('a') as $externalLinkRow) {
                $animerecord->setExternalLinks($externalLinkRow->nodeValue, $externalLinkRow->getAttribute('href'));
            }
        }

        # Preview:
        $extracted = $crawler->filter('div[class="video-promotion"] a');
        if ($extracted->count() > 0) {
            // Strip the query string. (The former replacement '$2' referenced a group
            // that does not exist and therefore also evaluated to an empty string.)
            $animerecord->setPreview(preg_replace('/\\?(.+?)$/', '', (string) $extracted->attr('href')));
        }

        # Opening Theme:
        // NOTE(review): "opnening" is spelled this way in the selector; presumably it
        // mirrors the class name used by the page markup - confirm before "fixing" it.
        foreach ($crawler->filter('div[class="theme-songs js-theme-songs opnening"] span') as $openingRow) {
            $animerecord->setOpeningTheme($openingRow->nodeValue);
        }

        # Ending Theme:
        foreach ($crawler->filter('div[class="theme-songs js-theme-songs ending"] span') as $endingRow) {
            $animerecord->setEndingTheme($endingRow->nodeValue);
        }

        # Recommendations
        // The current ID is matched only as a whole number (no digit directly before or
        // after it). Plain substring removal would mangle IDs that merely contain the
        // same digits, e.g. current ID 1 and recommended ID 21 ("1-21" became "2").
        $currentIdPattern = '(?<!\\d)' . preg_quote((string) $animerecord->getId(), '/') . '(?!\\d)';

        foreach ($crawler->filter('div[id="anime_recommendation"] li[class="btn-anime"]') as $recommendationsRow) {
            $recommendationsCrawler = new Crawler($recommendationsRow);
            $anime = new Anime();

            // The last path segment holds both IDs separated by a hyphen;
            // the recommended ID can be on either side of it.
            $tempId = preg_replace('/.*\\//', '', (string) $recommendationsCrawler->filter('a')->attr('href'));

            // Remove the hyphen and the current anime's ID, leaving the desired ID
            $anime->setId(preg_replace('/-|' . $currentIdPattern . '/', '', $tempId));
            $anime->setTitle($recommendationsCrawler->filter('span')->text());
            $anime->setImageUrl(preg_replace('/r(.+?)\\/(.+?)\\?(.+?)$/', '$2', $recommendationsCrawler->filter('img')->attr('data-src')));

            $animerecord->setRecommendations($anime);
        }
    }

    return $animerecord;
}
/**
 * The episode count handed to setEpisodes() must be returned unchanged by getEpisodes().
 */
public function testEpisodes()
{
    // Arrange: any integer will do, the accessor pair is expected to round-trip it.
    $expectedEpisodes = rand();
    $subject = new Anime();

    // Act
    $subject->setEpisodes($expectedEpisodes);
    $actualEpisodes = $subject->getEpisodes();

    // Assert
    $this->assertEquals($expectedEpisodes, $actualEpisodes);
}