/**
 * Follows the redirects of a URL and returns the final one.
 *
 * HTTP redirects are followed by cURL itself (up to 10 per request). Two
 * fallbacks are layered on top of that:
 *  - a 301/302 that cURL ended on is resolved through the first "Location:"
 *    header reported by get_headers();
 *  - a JavaScript redirect (window.location.replace('...') or
 *    window.location="...") found in the response body is followed as well.
 *
 * Note: the target of a fallback redirect is used as-is, so a relative
 * target is not resolved against the current URL.
 *
 * @param string $url     URL to resolve; it is trimmed, URL-decoded and has "&amp;" unescaped first.
 * @param int    $timeout Connect timeout and total transfer timeout in seconds, reused for every hop.
 * @param int    $maxHops Number of fallback hops (Location header / JavaScript) that may still be
 *                        followed; protects against endless redirect loops.
 * @return mixed The effective URL cURL reports for the last request that was made.
 */
function get_final_url($url, $timeout = 120, $maxHops = 10)
{
    $userAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1";

    // Links scraped out of HTML carry "&amp;"; turn it back into a plain "&".
    $url = str_replace("&amp;", "&", urldecode(trim($url)));

    $cookie = tempnam(sys_get_temp_dir(), "CURLCOOKIE");

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
    curl_setopt($ch, CURLOPT_URL, $url);
    if ($cookie !== false) {
        // Setting a cookie jar also switches on cURL's cookie engine for the redirect chain.
        curl_setopt($ch, CURLOPT_COOKIEJAR, $cookie);
    }
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_ENCODING, "");
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_AUTOREFERER, true);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
    curl_setopt($ch, CURLOPT_MAXREDIRS, 10);

    $content = curl_exec($ch);
    $response = curl_getinfo($ch);
    curl_close($ch);

    // Release the handle first so the cookie jar is flushed before the
    // temporary file is removed; otherwise every call leaks one temp file.
    unset($ch);
    if ($cookie !== false && is_file($cookie)) {
        unlink($cookie);
    }

    // Hop budget used up: report where we got to instead of recursing forever.
    if ($maxHops <= 0) {
        return $response['url'];
    }

    // cURL stopped on a redirect status: look the target up ourselves.
    if (in_array($response['http_code'], array(301, 302), true)) {
        ini_set("user_agent", $userAgent);
        $headers = get_headers($response['url']);
        if (is_array($headers)) {
            foreach ($headers as $header) {
                if (strncasecmp($header, "location:", 9) === 0) {
                    // Surrounding whitespace is trimmed by the recursive call.
                    return get_final_url(substr($header, 9), $timeout, $maxHops - 1);
                }
            }
        }
    }

    // JavaScript redirects; the capture stops at the closing quote so that
    // trailing script on the same line is not swallowed into the URL.
    if (is_string($content)
        && (preg_match("/window\\.location\\.replace\\('([^'\\r\\n]+)'\\)/i", $content, $found)
            || preg_match("/window\\.location\\=\"([^\"\\r\\n]+)\"/i", $content, $found))
    ) {
        return get_final_url($found[1], $timeout, $maxHops - 1);
    }

    return $response['url'];
}
// Classify $url: if the whole string matches $embedPattern it is an embed
// link and $hash receives the match array (capture group 1 is used below).
// Otherwise $url must match $normalPattern, or error() is called with the
// message "Érvénytelen URL" (Hungarian: "Invalid URL").
// NOTE(review): $isEmbed is only ever set to true here — presumably it is
// initialised to false before this fragment; confirm. It is also not visible
// here whether error() stops the script; if it returns, execution continues.
if (preg_match("~^" . $embedPattern . "\$~", $url, $hash)) {
    $isEmbed = true;
} else {
    if (!preg_match($normalPattern, $url)) {
        error("Érvénytelen URL", $hotlink);
    }
}

// Get hash
if ($isEmbed) {
    // Embed link, it's simple: the hash is the first capture group of the
    // match made above.
    $hash = $hash[1];
} else {
    // Not an embed link, gotta scrape it from the html source

    // Follow redirects even when CURL sucks.
    // Might need to start using composer + Guzzle one day :)
    $url = get_final_url($url);

    // Load page; anything but HTTP 200 is reported through error() with
    // "Az oldal nem elérhető" (Hungarian: "The page is not available").
    $result = get_web_page($url);
    if ($result['http_code'] != 200) {
        error("Az oldal nem elérhető");
    }
    $page = $result['content'];

    // Page loaded, get the embed link. This search is unanchored, so the
    // pattern may match anywhere in the page source.
    preg_match("~" . $embedPattern . "~", $page, $hash);
    // Fewer than two entries means the capture group did not match:
    // "A videó hash nem található" (Hungarian: "The video hash was not found").
    if (sizeof($hash) < 2) {
        error("A videó hash nem található", $hotlink);
    }
    $hash = $hash[1];
}

// Get video URL: request INDA_AMFPHP with the hash appended.
$result = get_web_page(INDA_AMFPHP . $hash);
/**
 * Extracts the links and hashtags found in a message.
 *
 * Keys of the returned array (each one is present only if something was
 * collected for it):
 *  - "url_raw":      every link exactly as it appears in the text;
 *  - "url":          only with $redirect: for links shorter than 30 characters,
 *                    the result of get_redirect_url(), or the link itself when
 *                    that returns a falsy value (longer links are not added);
 *  - "final_url":    only with $follow_redirect: get_final_url() of every link;
 *  - "dc:subject":   every hashtag without its leading "#";
 *  - "rdfs:seeAlso": URL_TWITTER4RDF_HASHTAG followed by the hashtag;
 *  - "dc:relation":  URL_TWITTERLOGIC_HASHTAG followed by the lower-cased hashtag.
 *
 * @param string $text            Message text to scan.
 * @param bool   $redirect        Expand short links through get_redirect_url().
 * @param bool   $follow_redirect Resolve every link through get_final_url().
 * @return array|null The collected data, or null when $text is empty.
 */
static function parse_message($text, $redirect = false, $follow_redirect = false)
{
    if (empty($text)) {
        return;
    }

    $twitter = array();

    // Extract links. preg_match_all() groups its result by capture group, so
    // the full matches live in $matches[0]; iterating $matches itself would
    // only ever yield the first link.
    if (preg_match_all('/https?:[^\\s<>"\',]+/', $text, $matches)) {
        foreach ($matches[0] as $link) {
            $twitter["url_raw"][] = $link;

            // Only short links are treated as shortener URLs worth expanding.
            if ($redirect && strlen($link) < 30) {
                $expanded = get_redirect_url($link);
                $twitter["url"][] = $expanded ? $expanded : $link;
            }

            if ($follow_redirect) {
                $twitter["final_url"][] = get_final_url($link);
            }
        }
    }

    // Extract hashtags.
    if (preg_match_all('/#[A-Za-z0-9-_]+/', $text, $matches)) {
        foreach ($matches[0] as $hashtag) {
            $tag = substr($hashtag, 1);
            $twitter["dc:subject"][] = $tag;
            $twitter["rdfs:seeAlso"][] = URL_TWITTER4RDF_HASHTAG . $tag;
            $twitter["dc:relation"][] = URL_TWITTERLOGIC_HASHTAG . strtolower($tag);
        }
    }

    return $twitter;
}
echo 'Failed to find the course url from ' . $url . "\n"; continue; } $data = fetch_url($courses_url, LONG_CACHE_EXPIRY_TIMESPAN); if (!$data) { echo 'Failed to grab the course url data from ' . $url . "\n"; continue; } $html = str_get_html($data); $elm = $html->find('a.Level2Group'); foreach ($elm as $e) { $facultyUrl = $rootUrl . $e->href; preg_match_all('/(.+) \\((.+)\\)/', $e->innertext, $matches); $facultyName = $matches[1][0]; $facultyAcronym = $matches[2][0]; $faculties[$facultyAcronym] = array('name' => $facultyName, 'url' => get_final_url($facultyUrl)); } } } $html->__destruct(); unset($html); } /////////////////////////////////////////////////////////////////////////////////////////////////// // Great, we have all of the faculties. Now let's cross reference it with the existing db. /////////////////////////////////////////////////////////////////////////////////////////////////// /* // Don't prune anymore. $results = $db->query('SELECT * FROM faculties;'); while ($row = mysql_fetch_assoc($results)) { // Check if this faculty no longer exists. if (!isset($faculties[$row['acronym']])) {
/**
 * Looks up the article name DBpedia finally resolves an article to.
 *
 * Builds "http://dbpedia.org/page/<article>", resolves it with
 * get_final_url(), URL-decodes the result and passes it to removeLink().
 * Every space in what removeLink() returns is replaced by an underscore.
 * NOTE(review): removeLink() is defined elsewhere; presumably it strips the
 * URL prefix and leaves the bare article name — confirm.
 *
 * @param string $article Article name appended to the DBpedia page URL.
 * @return string The resolved article name with "_" in place of spaces.
 */
function searchCorrectDBPediaArticle($article)
{
    $resolvedUrl = urldecode(get_final_url("http://dbpedia.org/page/" . $article));
    $articleName = removeLink($resolvedUrl);

    return str_replace(' ', '_', $articleName);
}