/**
 * Fetches the data for a given IMDB-ID
 *
 * Downloads the title page, the full-credits page and the plot-summary page
 * from $imdbServer and scrapes them with regular expressions. Transport
 * failures are appended to the global $CLIENTERROR; scraping then continues on
 * whatever content is available, so fields that cannot be found end up as
 * empty strings.
 *
 * Keys written to the result: encoding, title, subtitle, year (for episodes
 * only when found), origtitle, coverurl, mpaa, runtime, director, rating,
 * country, language, genres (only when at least one was found), plot, cast
 * (only when a cast table was found), plus istv and tvseries_id for TV
 * episodes.
 *
 * @author  Tiago Fonseca <*****@*****.**>
 * @author  Victor La <*****@*****.**>
 * @author  Roland Obermayer <*****@*****.**>
 * @param   int     IMDB-ID (a leading $imdbIdPrefix is stripped)
 * @return  array   Result data
 */
function imdbData($imdbID)
{
    global $imdbServer;
    global $imdbIdPrefix;
    global $CLIENTERROR;
    global $cache;

    // the prefix is configuration data, not a regex: quote it before embedding
    $imdbID   = preg_replace('/^' . preg_quote($imdbIdPrefix, '/') . '/', '', $imdbID);
    $titleUrl = $imdbServer . '/title/tt' . $imdbID . '/'; // trailing / avoids a redirect
    $data     = array(); // result
    $ary      = array(); // temp

    // section / link patterns shared by the episode page and the series page
    $countrySection  = '/Country:\\s*<\\/h4>(.+?)<\\/div>/si';
    $countryLink     = '/<a.*?href="\\/country\\/.+?".*?>(.+?)<\\/a>/si';
    $languageSection = '/Languages?:\\s*<\\/h4>(.+?)<\\/div>/si';
    $languageLink    = '/<a.*?href="\\/language\\/.+?".*?>(.+?)<\\/a>/si';

    // fetch mainpage
    $resp = httpClient($titleUrl, $cache);
    if (!$resp['success']) {
        $CLIENTERROR .= $resp['error'] . "\n";
    }
    // a failed request may carry no body; scrape an empty string in that case
    $html = isset($resp['data']) ? (string) $resp['data'] : '';

    // add encoding
    $data['encoding'] = get_response_encoding($resp);

    // Check if it is a TV series episode
    $isTv = (bool) preg_match('/<div id="titleTVEpisodes"/i', $html);
    if ($isTv) {
        $data['istv'] = 1;
        // find id of the series the episode belongs to
        $data['tvseries_id'] = trim(imdbDataMatch('/<a href="\\/title\\/tt(\\d+)\\/episodes.*?" title="Full Episode List.*"/i', $html));
    }

    // Titles and Year
    if ($isTv) {
        // NOTE(review): the doubled quote after content= looks like an HTML
        // entity (&quot;?) that was decoded somewhere along the way; as
        // written the pattern expects an empty content attribute - confirm.
        preg_match('/<meta name="title" content=""(.*?)"\\s+(.*?)(.TV episode .*?)?( - IMDB)?"/si', $html, $ary);
        $data['title']    = isset($ary[1]) ? trim($ary[1]) : '';
        $data['subtitle'] = isset($ary[2]) ? trim($ary[2]) : '';

        $year = imdbDataMatch('/<h1 class="header".*?>.*?<span class="nobr">\\(.*?(\\d\\d\\d\\d)\\)</si', $html);
        if ($year !== '') {
            $data['year'] = $year;
        }
    } else {
        preg_match('/<meta name="title" content="(IMDb - )?(.*?) \\(.*?(\\d\\d\\d\\d).*?\\)( - IMDb)?" \\/>/si', $html, $ary);
        $data['year'] = isset($ary[3]) ? trim($ary[3]) : '';

        // split "title - subtitle"; pad so a title without subtitle is safe
        $fullTitle = isset($ary[2]) ? trim($ary[2]) : '';
        $parts     = array_pad(explode(' - ', $fullTitle, 2), 2, '');
        $data['title']    = trim($parts[0]);
        $data['subtitle'] = trim($parts[1]);
    }

    // orig. title
    $data['origtitle'] = trim(imdbDataMatch('/<span class="title-extra".+?>\\s*"?(.*?)"?\\s*<i>\\(original title\\)<\\/i>\\s*</si', $html));

    // Cover URL
    $data['coverurl'] = imdbGetCoverURL($html);

    // MPAA Rating
    $data['mpaa'] = trim(imdbDataMatch('/<span\\s?itemprop="contentRating">(.*?)</is', $html));

    // UK BBFC Rating: no longer appears on the main page, so it is not scraped

    // Runtime
    $data['runtime'] = imdbDataRuntime($html);

    // Director
    // TODO: Update templates to use multiple directors
    $data['director'] = trim(join(', ', imdbDataLinkTexts(
        $html,
        '/Directors?:\\s*<\\/h4>(.+?)<\\/div>/si',
        '/<a.*?href="\\/name\\/nm.+?".*?>(.+?)<\\/a>/si'
    )));

    // Rating
    $data['rating'] = trim(imdbDataMatch('/<span .*? itemprop="ratingValue">([\\d\\.]+)<\\/span>/si', $html));

    // Countries
    $data['country'] = trim(join(', ', imdbDataLinkTexts($html, $countrySection, $countryLink)));

    // Languages
    $data['language'] = trim(strtolower(join(', ', imdbDataLinkTexts($html, $languageSection, $languageLink))));

    // Genres (as Array)
    $genres = imdbDataLinkTexts(
        $html,
        '/Genres:\\s*<\\/h4>(.+?)<\\/div>/si',
        '/<a.*?href="\\/genres?\\/.+?".*?>(.+?)<\\/a>/si'
    );
    foreach ($genres as $genre) {
        $data['genres'][] = trim($genre);
    }

    // for Episodes - try to get some missing stuff from the main series page
    // (skipped when no series id was found: there is no page to ask)
    if ($isTv && $data['tvseries_id'] !== ''
        && (!$data['runtime'] || !$data['country'] || !$data['language'] || !$data['coverurl'])) {
        $sresp = httpClient($imdbServer . '/title/tt' . $data['tvseries_id'] . '/', $cache);
        if (!$sresp['success']) {
            // report the error of the series request, not of the episode request
            $CLIENTERROR .= $sresp['error'] . "\n";
        }
        $seriesHtml = isset($sresp['data']) ? (string) $sresp['data'] : '';

        if (!$data['runtime']) {
            $data['runtime'] = imdbDataRuntime($seriesHtml);
        }
        if (!$data['country']) {
            $data['country'] = trim(join(', ', imdbDataLinkTexts($seriesHtml, $countrySection, $countryLink)));
        }
        if (!$data['language']) {
            $data['language'] = trim(strtolower(join(', ', imdbDataLinkTexts($seriesHtml, $languageSection, $languageLink))));
        }
        if (!$data['coverurl']) {
            $data['coverurl'] = imdbGetCoverURL($seriesHtml);
        }
    }

    // Plot (short storyline from the main page; replaced below when the
    // plot-summary page provides one)
    $data['plot'] = imdbDataMatch('/<h2>Storyline<\\/h2>.*?<p>(.*?)</si', $html);

    // Fetch credits
    $resp = imdbFixEncoding($data, httpClient($titleUrl . 'fullcredits', $cache));
    if (!$resp['success']) {
        $CLIENTERROR .= $resp['error'] . "\n";
    }
    $creditsHtml = isset($resp['data']) ? (string) $resp['data'] : '';

    // Cast
    if (preg_match('#<table class="cast_list">(.*)#si', $creditsHtml, $match)) {
        // The table is cut at the first closing tag by hand instead of using
        // (.*?)</table in the pattern (the original author found that
        // unreliable). Without a closing tag the remainder is used as is;
        // substr() with a false length would otherwise return ''.
        $tableEnd = strpos($match[1], '</table');
        $casthtml = ($tableEnd === false) ? $match[1] : substr($match[1], 0, $tableEnd);

        $cast = '';
        if (preg_match_all('#<td .*? itemprop="actor".*?>\\s+<a href="/name/(nm\\d+)/?.*?".*?>(.*?)</a>.*?<td class="character">(.*?)</td>#si', $casthtml, $ary, PREG_PATTERN_ORDER)) {
            $total = count($ary[0]);
            for ($i = 0; $i < $total; $i++) {
                $actorid = trim(strip_tags($ary[1][$i]));
                $actor   = trim(strip_tags($ary[2][$i]));
                // NOTE(review): replacing ' ' by ' ' is a no-op as written;
                // presumably the pattern once was an entity such as &nbsp; -
                // confirm against the upstream source.
                $character = trim(preg_replace('/\\s+/', ' ', strip_tags(preg_replace('/ /', ' ', $ary[3][$i]))));
                // one line per actor: name::character::prefixed person id
                $cast .= "{$actor}::{$character}::{$imdbIdPrefix}{$actorid}\n";
            }
        }

        $data['cast'] = html_clean_utf8($cast);

        // sometimes appearing in series (e.g. Scrubs)
        $data['cast'] = preg_replace('#/ ... #', '', $data['cast']);
    }

    // Fetch plot
    $resp = imdbFixEncoding($data, httpClient($titleUrl . 'plotsummary', $cache));
    if (!$resp['success']) {
        $CLIENTERROR .= $resp['error'] . "\n";
    }
    $plotHtml = isset($resp['data']) ? (string) $resp['data'] : '';

    // Plot
    $summary = imdbDataMatch('/<P CLASS="plotSummary">(.+?)<\\/P>/is', $plotHtml);
    if ($summary) {
        $data['plot'] = trim($summary);
        // NOTE(review): replacing '"' by '"' is a no-op as written; the
        // pattern presumably targeted an HTML quote entity - confirm.
        $data['plot'] = preg_replace('/"/', '"', $data['plot']);

        // Begin removal of 'Written by' section
        $data['plot'] = preg_replace('/<a href="\\/SearchPlotWriters.*?<\\/a>/', '', $data['plot']);
        $data['plot'] = preg_replace('/Written by/', '', $data['plot']);
        $data['plot'] = preg_replace('/<i>\\s+<\\/i>/', ' ', $data['plot']);
        // End of removal of 'Written by' section

        // collapse runs of whitespace
        $data['plot'] = preg_replace('/\\s+/s', ' ', $data['plot']);
    }
    $data['plot'] = html_clean($data['plot']);

    return $data;
}

/**
 * Returns one capture group of the first match of a pattern (helper of imdbData).
 *
 * @param   string  $pattern  PCRE pattern
 * @param   string  $subject  text to search
 * @param   int     $group    capture group to return
 * @return  string  the captured text (untrimmed), '' when nothing matched
 */
function imdbDataMatch($pattern, $subject, $group = 1)
{
    if (preg_match($pattern, (string) $subject, $found) && isset($found[$group])) {
        return $found[$group];
    }
    return '';
}

/**
 * Returns the link texts found inside one labelled section (helper of imdbData).
 *
 * @param   string  $html            page content
 * @param   string  $sectionPattern  pattern whose group 1 captures the section body
 * @param   string  $linkPattern     pattern whose group 1 captures a link text
 * @return  array   captured link texts in document order, empty when none
 */
function imdbDataLinkTexts($html, $sectionPattern, $linkPattern)
{
    $section = imdbDataMatch($sectionPattern, $html);
    if ($section === '') {
        return array();
    }
    preg_match_all($linkPattern, $section, $links, PREG_PATTERN_ORDER);
    return isset($links[1]) ? $links[1] : array();
}

/**
 * Extracts the runtime in minutes from a title page (helper of imdbData).
 *
 * Tries the itemprop="duration" markup first and falls back to the
 * "Runtime:" heading, both on the same page.
 *
 * @param   string  $html  page content
 * @return  string  digits of the runtime, '' when not found
 */
function imdbDataRuntime($html)
{
    // many but not all pages have the <time itemprop="duration"> tag
    $minutes = imdbDataMatch('/itemprop="duration".*?>(\\d+)\\s+min<\\//si', $html);
    if ($minutes === '') {
        $minutes = imdbDataMatch('/Runtime:?<\\/h4>.*?>(\\d+)\\s+min/si', $html);
    }
    return trim($minutes);
}
/**
 * Clean HTML tags from hierarchical associative array
 *
 * Walks the array in place: nested arrays are processed recursively, every
 * other element is passed through html_to_text() and then html_clean_utf8()
 * and written back under the same key. A non-array argument is left untouched.
 *
 * @param   array   $data   hierarchical array to convert (modified by reference)
 */
function engine_clean_input(&$data)
{
    if (!is_array($data)) {
        return;
    }

    foreach (array_keys($data) as $field) {
        if (is_array($data[$field])) {
            // descend into the nested level
            engine_clean_input($data[$field]);
            continue;
        }

        $data[$field] = html_clean_utf8(html_to_text($data[$field]));
    }
}
/**
 * Fetches the data for a given Allocine-ID
 *
 * Downloads the film page, the casting page and the technical-details page
 * from $allocineServer and scrapes them with regular expressions. Transport
 * failures are appended to the global $CLIENTERROR; scraping then continues on
 * whatever content is available.
 *
 * Keys written to the result: encoding, id, title, subtitle, coverurl,
 * runtime (minutes, 0 when not found), director, rating (scaled to 0-10,
 * 0 when not found), origtitle and language always; year, country, plot,
 * genres, cast and comment only when the corresponding markup was found.
 *
 * @author  Douglas Mayle <*****@*****.**>
 * @author  Tiago Fonseca <*****@*****.**>
 * @param   int     Allocine-ID (a leading $allocineIdPrefix is stripped; the
 *                  parameter keeps its historical name $imdbID)
 * @return  array   Result data
 */
function allocineData($imdbID)
{
    global $allocineServer;
    global $allocineIdPrefix;

    // the prefix is configuration data, not a regex: quote it before embedding
    $allocineID = preg_replace('/^' . preg_quote($allocineIdPrefix, '/') . '/', '', $imdbID);

    $data = array(); // result
    $ary  = array(); // temp

    // fetch mainpage
    $resp = allocineDataFetch($allocineServer . '/film/fichefilm_gen_cfilm=' . $allocineID . '.html');

    // add encoding
    $data['encoding'] = get_response_encoding($resp);

    // Allocine ID
    $data['id'] = "allocine:" . $allocineID;

    // We remove all the multiple spaces and line breaks
    $html = allocineDataSqueeze($resp);

    /* Title and subtitle (pad so a heading without " - " is safe) */
    $heading = array_pad(explode(" - ", trim(allocineDataMatch('#<h1.*?>(.*?)</h1>#si', $html)), 2), 2, '');
    // Some bugs when using html_clean function --> using html_clean_utf8
    $data['title']    = html_clean_utf8($heading[0]);
    $data['subtitle'] = html_clean_utf8($heading[1]);

    /* Year */
    $year = allocineDataMatch('/<a.*? href="\\/film\\/tous\\/decennie.*?year=(\\d+)">(\\d+)<\\/a>/i', $html);
    if (!empty($year)) {
        $data['year'] = trim($year);
    }

    /* Release date, appended to the comment further down */
    $release_date = "";
    $released = allocineDataMatch('#<a.*? href="/film/agenda\\.html\\?week=\\d+\\-\\d+\\-\\d+">(.*)</a>#i', $html);
    if (!empty($released)) {
        $release_date = "\r\nDate de sortie cinéma : " . html_clean_utf8($released);
    }

    /* Cover URL */
    $data['coverurl'] = trim(allocineDataMatch('#<div class="colleft">\\s*?<div class="vmargin20b">\\s*?<div class=\\"poster\\">\\s*?<em class=\\"imagecontainer\\">\\s*?<a .*?>\\s*?<img.*?src=\'(.*?)\'.*?>#si', $html));

    /* Runtime, page shows e.g. "Durée : 02h13min" */
    // stays 0 when the page has no duration; no arithmetic on empty strings
    $data['runtime'] = 0;
    if (preg_match('/Durée :\\s*?(\\d+)h(\\d+)\\s*?min/i', $html, $ary)) {
        $data['runtime'] = (int) $ary[1] * 60 + (int) $ary[2];
    }

    /* Director */
    $data['director'] = trim(allocineDataMatch('#Réalisé par\\s*<span.*?><a.*?rel="v:directedBy".*?href=\'/personne/fichepersonne_gen_cpersonne=\\d+\\.html\' title=\'.*\'>(.*)</a></span>#i', $html));

    /* Rating */
    $rating = trim(allocineDataMatch('#<p class="withstars"><a.*?href="/film/critiquepublic_gen_cfilm=\\d+\\.html"><img.*?class="stareval.*?".*?<span class=\\"moreinfo\\">\\((.*)\\)</span></p>#i', $html));
    $rating = str_replace(",", ".", $rating); // decimal comma -> decimal point
    // Allocine rating is based on 5, imdb is based on 10.
    // The explicit cast turns a missing rating into 0 instead of doing
    // arithmetic on an empty string.
    $data['rating'] = (float) $rating * 2;

    /* Countries */
    // Countries in English
    $map_countries = array(
        'allemand' => 'Germany', 'américain' => 'USA', 'arménien' => 'Armenia',
        'argentin' => 'Argentina', 'sud-africain' => 'South Africa',
        'australien' => 'Australia', 'belge' => 'Belgium', 'britannique' => 'UK',
        'bulgare' => 'Bulgaria', 'canadien' => 'Canada', 'chinois' => 'China',
        'coréen' => 'South Korea', 'danois' => 'Denmark', 'espagnol' => 'Spain',
        'français' => 'France', 'grec' => 'Greece', 'hollandais' => 'Netherlands',
        'hong-kongais' => 'Hong-Kong', 'hongrois' => 'Hungary', 'indien' => 'India',
        'irlandais' => 'Republic of Ireland', 'islandais' => 'Iceland',
        'israëlien' => 'Israel', 'italien' => 'Italy', 'japonais' => 'Japan',
        'luxembourgeois' => 'Luxembourg', 'mexicain' => 'Mexico',
        'norvégien' => 'Norway', 'néo-zélandais' => 'New Zealand',
        'polonais' => 'Poland', 'portugais' => 'Portugal', 'roumain' => 'Romania',
        'russe' => 'Russia', 'serbe' => 'Serbia', 'suédois' => 'Sweden',
        'taïwanais' => 'Taiwan', 'tchèque' => 'Czech Republic',
        'thaïlandais' => 'Thailand', 'turc' => 'Turkey', 'ukrainien' => 'Ukraine',
        'vietnamien' => 'Vietnam',
    );
    if (preg_match_all('#Long\\-métrage\\s*?<a.*?href=".*?">(.*?)</a>#si', $html, $ary, PREG_PATTERN_ORDER) > 0) {
        $countries = array();
        foreach (explode(",", trim(join(', ', $ary[1]))) as $origin) {
            $country = allocineDataMapTerm($origin, $map_countries);
            // Skip unmapped origins and exact duplicates. Whole names are
            // compared, so "UK" is not swallowed by "Ukraine".
            if ($country !== '' && !in_array($country, $countries, true)) {
                $countries[] = $country;
            }
        }
        $data['country'] = join(', ', $countries);
    }

    /* Plot */
    $synopsis = allocineDataMatch('#<div id="synopsis_full">\\s*?<p>\\s*?<span class=\\"bold\\">Synopsis \\: </span>\\s*?<span property="v:summary">(.*?)</span>#is', $html);
    if (!empty($synopsis)) {
        $data['plot'] = html_clean_utf8($synopsis);
        // And cleanup
        $data['plot'] = trim($data['plot']);
        $data['plot'] = preg_replace('/[\\n\\r]/', ' ', $data['plot']);
        // NOTE(review): replacing ' ' by ' ' is a no-op as written; the
        // pattern presumably was a double or non-breaking space - confirm.
        $data['plot'] = preg_replace('/ /', ' ', $data['plot']);
    }

    /* Genres (as Array) */
    // '-' marks genres that have no counterpart and keep their scraped text.
    // Order matters: the first key found in the text wins.
    $map_genres = array(
        'Action' => 'Action', 'Animation' => 'Animation', 'Arts Martiaux' => 'Action',
        'Aventure' => 'Adventure', 'Biopic' => 'Biography', 'Bollywood' => 'Musical',
        'Classique' => '-', 'Comédie Dramatique' => 'Drama',
        'Comédie musicale' => 'Musical', 'Comédie' => 'Comedy',
        'Dessin animé' => 'Animation', 'Divers' => '-',
        'Documentaire' => 'Documentary', 'Drame' => 'Drama',
        'Epouvante-horreur' => 'Horror', 'Erotique' => 'Adult', 'Espionnage' => '-',
        'Famille' => 'Family', 'Fantastique' => 'Fantasy', 'Guerre' => 'War',
        'Historique' => 'History', 'Horreur' => 'Horror', 'Musique' => 'Musical',
        'Policier' => 'Crime', 'Péplum' => 'History', 'Romance' => 'Romance',
        'Science fiction' => 'Sci-Fi', 'Thriller' => 'Thriller', 'Western' => 'Western',
    );
    if (preg_match_all('#Genre :(.*?)</a>\\s*?<br#si', $html, $ary, PREG_PATTERN_ORDER) > 0) {
        foreach (explode(",", trim(join(', ', $ary[1]))) as $genre) {
            $mapped_genre = allocineDataMapTerm($genre, $map_genres);
            $data['genres'][] = $mapped_genre != '-' ? $mapped_genre : trim($genre);
        }
    }

    /* Original Title */
    $data['origtitle'] = trim(allocineDataMatch('#Titre original : <span class=\\"purehtml\\"><em>(.*)</em></span>#', $html));

    /*
       Title and Subtitle
       If sub-title is blank, we'll try to fill in the original title for
       foreign films.
    */
    if (empty($data['subtitle']) && $data['origtitle']) {
        $data['subtitle'] = $data['title'];
        $data['title']    = $data['origtitle'];
    }

    /* CREDITS AND CAST */
    // Another HTML page
    $html = allocineDataSqueeze(allocineDataFetch($allocineServer . '/film/casting_gen_cfilm=' . $allocineID . '.html'));

    if (preg_match('#<h2>Acteurs, rôles, personnages</h2>(.*?)<div class="titlebar">\\s*?<a class="anchor" id=\'actors\'></a>\\s*?<h2>#is', $html, $Section)) {
        $cast = '';

        /*
           the big ones with image:
           <div class="titlebar"><h3>
             <a href="/personne/fichepersonne_gen_cpersonne=5568.html">Liam Neeson</a>
           </h3></div>
           <p> Rôle : Qui-Gon Jinn </p>
        */
        preg_match_all('#<div class="titlebar">\\s*?<h3>\\s*?<a href="/personne/fichepersonne_gen_cpersonne=(\\d+?).html">(.*?)</a>\\s*?</h3>\\s*?</div>\\s*?<p>\\s*Rôle : (.*?)\\s*</p>#is', $Section[1], $ary, PREG_PATTERN_ORDER);
        foreach ($ary[1] as $i => $personId) {
            // one line per actor: name::role::allocine:person id
            $cast .= $ary[2][$i] . "::" . $ary[3][$i] . "::allocine:" . $personId . "\n";
        }

        /*
           extended cast - without image:
           <tr class="odd">
             <td> Shmi Skywalker </td>
             <td> <a href="/personne/fichepersonne_gen_cpersonne=14279.html">Pernilla August</a> </td>
           </tr>
        */
        preg_match_all('#<tr.*?>\\s*?<td>\\s*(.*?)\\s*</td>\\s*?<td>\\s*?<a href="/personne/fichepersonne_gen_cpersonne=(\\d+).html">(.*?)</a>\\s*?</td>#si', $Section[1], $ary, PREG_PATTERN_ORDER);
        foreach ($ary[1] as $i => $role) {
            $cast .= $ary[3][$i] . "::" . $role . "::allocine:" . $ary[2][$i] . "\n";
        }

        $data['cast'] = trim($cast);
    }

    /* Comments */
    // By default
    $data['language'] = 'french';

    // Another HTML page
    $html = allocineDataSqueeze(allocineDataFetch($allocineServer . '/film/fichefilm-' . $allocineID . '/technique/'));

    // Technical information as comment
    $technical = allocineDataMatch('#<div class=\\"rubric\\">\\s*?<div class=\\"vpadding20b\\">\\s*(.*?)\\s*</div>\\s*?</div>#si', $html);
    if (!empty($technical)) {
        $data['comment'] = str_replace("Tourné en :", "Tourné en : ", $technical);
        // Adding the release date in theater
        $data['comment'] = $data['comment'] . $release_date;

        // Search the language; 'french' stays when the page does not name one
        if (preg_match('#<p>\\s*?<span class=\\"bold\\">Tourné en :</span>\\s*(.*?)\\s*</p>#si', $html, $ary)) {
            // Converting languages from french to english (all lower case)
            $map_languages = array(
                'Anglais' => 'english', 'Français' => 'french', 'Allemand' => 'german',
                'Italien' => 'italian', 'Espagnol' => 'spanish', 'Coréen' => 'korean',
                'Roumain' => 'romanian', 'Autre' => 'french', 'Hindi' => 'hindi',
                'Arabe' => 'arabic', 'Thaï' => 'thai', 'Danois' => 'danish',
                'Suédois' => 'swedish', 'Tchèque' => 'czech', 'Japonais' => 'japanese',
                'Portugais' => 'portuguese', 'Norvégien' => 'norwegian',
                'Bulgare' => 'bulgarian', 'Grec' => 'greek', 'Hongrois' => 'hungarian',
                'Turc' => 'turkish', 'Islandais' => 'icelandic', 'Polonais' => 'polish',
                'Russe' => 'russian', 'Ukrainien' => 'ukrainian', 'Serbe' => 'serbian',
                'Vietnamien' => 'vietnamese', 'Afrikaans' => 'afrikaans',
            );
            $data['language'] = str_replace(array_keys($map_languages), array_values($map_languages), $ary[1]);
        }
    }

    // Return the data collected
    return $data;
}

/**
 * Requests one Allocine page and records a transport error (helper of allocineData).
 *
 * @param   string  $url  absolute URL to request
 * @return  array   the response exactly as returned by httpClient()
 */
function allocineDataFetch($url)
{
    global $CLIENTERROR;

    // NOTE(review): the second argument is presumably the cache switch
    // (imdbData passes $cache in that position) - confirm against httpClient.
    $resp = httpClient($url, 1);
    if (!$resp['success']) {
        $CLIENTERROR .= $resp['error'] . "\n";
    }
    return $resp;
}

/**
 * Returns the body of a response with every run of two or more whitespace
 * characters removed (helper of allocineData).
 *
 * @param   array   $resp  response as returned by httpClient()
 * @return  string  squeezed body, '' when the response has no body
 */
function allocineDataSqueeze($resp)
{
    $body = isset($resp['data']) ? (string) $resp['data'] : '';
    return (string) preg_replace('/[\\s]{2,}/', '', $body);
}

/**
 * Returns capture group 1 of the first match of a pattern (helper of allocineData).
 *
 * @param   string  $pattern  PCRE pattern
 * @param   string  $subject  text to search
 * @return  string  the captured text (untrimmed), '' when nothing matched
 */
function allocineDataMatch($pattern, $subject)
{
    if (preg_match($pattern, (string) $subject, $found) && isset($found[1])) {
        return $found[1];
    }
    return '';
}

/**
 * Looks up the first key of a map that occurs in a text (helper of allocineData).
 *
 * Keys are tested in map order and matched literally and case-insensitively
 * (as far as PCRE folds case without the u modifier).
 *
 * @param   string  $text  scraped text fragment
 * @param   array   $map   search term => replacement value
 * @return  string  value of the first key found, '' when no key occurs
 */
function allocineDataMapTerm($text, $map)
{
    foreach ($map as $needle => $mapped) {
        if (preg_match('/' . preg_quote($needle, '/') . '/i', $text)) {
            return $mapped;
        }
    }
    return '';
}
<?php error_reporting(E_ALL ^ E_NOTICE); if (!$submit) { echo "<h2>Warning- be sure to backup your data before submitting the cleanup request!</h2>"; } $SQL = 'SELECT * FROM ' . TBL_DATA; $result = runSQL($SQL); $count = 0; foreach ($result as $video) { $SQL = ''; $keys = array(); foreach ($video as $key => $value) { if ($key == 'id') { continue; } $new = html_clean_utf8($value); if ($new != $value) { $keys[] = $key; if ($SQL) { $SQL .= ', '; } $SQL .= "{$key} = '" . mysql_escape_string($new) . "'"; } } if ($SQL) { $count++; echo ($submit ? 'Converting: ' : '<b>Conversion needed:</b> ') . $video['title'] . "<br/>\n"; // actually perform the conversion? if ($submit) { $SQL = "UPDATE " . TBL_DATA . " SET {$SQL} WHERE id = " . $video['id']; runSQL($SQL);