Beispiel #1
0
/**
 * Follows the redirects of a URL and returns the final one.
 *
 * cURL follows HTTP redirects itself (up to 10). On top of that, this function
 * performs "manual" hops in two situations:
 *  - the transfer still ended on a 301/302, in which case the Location header
 *    is read via get_headers() and resolved recursively;
 *  - the response body contains a JavaScript redirect of the form
 *    window.location.replace('...') or window.location="...".
 *
 * Side effect: when the 301/302 fallback is taken, the "user_agent" ini
 * setting is changed for the remainder of the request.
 *
 * @param string  $url       URL to resolve; surrounding whitespace is trimmed, the
 *                           value is URL-decoded and "&amp;" entities become "&".
 * @param int  $timeout      Connect timeout and total transfer timeout, in seconds.
 * @param int  $max_hops     Maximum number of manual hops (recursive calls) that may
 *                           still be taken; guards against redirect loops.
 * @return mixed  The last effective URL reported by cURL.
 */
function get_final_url($url, $timeout = 120, $max_hops = 10)
{
    $user_agent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1";

    // Links scraped from HTML frequently carry "&amp;" instead of a bare "&".
    $url = str_replace("&amp;", "&", urldecode(trim($url)));

    // Throw-away cookie jar so cookies set during the redirect chain are kept.
    $cookie = tempnam(sys_get_temp_dir(), "CURLCOOKIE");

    $ch = curl_init();
    curl_setopt_array($ch, array(
        CURLOPT_USERAGENT => $user_agent,
        CURLOPT_URL => $url,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_ENCODING => "",
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_AUTOREFERER => true,
        CURLOPT_CONNECTTIMEOUT => $timeout,
        CURLOPT_TIMEOUT => $timeout,
        CURLOPT_MAXREDIRS => 10,
    ));
    if ($cookie !== false) {
        curl_setopt($ch, CURLOPT_COOKIEJAR, $cookie);
    } else {
        // No temp file available: an empty COOKIEFILE still enables in-memory cookies.
        curl_setopt($ch, CURLOPT_COOKIEFILE, "");
    }

    $content = curl_exec($ch);
    $response = curl_getinfo($ch);
    curl_close($ch);
    // The jar is written when the handle is released; drop the last reference
    // before deleting the file so it is not re-created afterwards.
    unset($ch);
    if ($cookie !== false && file_exists($cookie)) {
        unlink($cookie);
    }

    // Hop budget exhausted: report where we ended up instead of recursing forever.
    if ($max_hops <= 0) {
        return $response['url'];
    }

    if ($response['http_code'] == 301 || $response['http_code'] == 302) {
        ini_set("user_agent", $user_agent);
        $headers = get_headers($response['url']);
        // get_headers() returns false on failure.
        if (is_array($headers)) {
            foreach ($headers as $value) {
                if (strncasecmp($value, "location:", 9) === 0) {
                    return get_final_url(trim(substr($value, 9)), $timeout, $max_hops - 1);
                }
            }
        }
    }

    // curl_exec() returns false on failure; only scan a real body for JS redirects.
    if (is_string($content)
        && (preg_match("/window\\.location\\.replace\\('(.*)'\\)/i", $content, $value)
            || preg_match("/window\\.location\\=\"(.*)\"/i", $content, $value))
    ) {
        return get_final_url($value[1], $timeout, $max_hops - 1);
    }

    return $response['url'];
}
Beispiel #2
0
// Classify the URL: either it matches the embed pattern in full (which also
// captures the hash into $hash), or it has to match the normal page pattern.
if (preg_match("~^" . $embedPattern . "\$~", $url, $hash)) {
    $isEmbed = true;
} elseif (!preg_match($normalPattern, $url)) {
    error("Érvénytelen URL", $hotlink);
}
// Get hash
if (!$isEmbed) {
    // Not an embed link, so the embed link has to be scraped from the HTML source.
    // Resolve redirects up front with get_final_url() rather than relying on
    // the page loader to follow them.
    // Might need to start using composer + Guzzle one day :)
    $url = get_final_url($url);
    // Load page
    $result = get_web_page($url);
    if ($result['http_code'] != 200) {
        error("Az oldal nem elérhető");
    }
    $page = $result['content'];
    // Page loaded, look for the embed link anywhere in the markup
    preg_match("~" . $embedPattern . "~", $page, $hash);
    if (count($hash) < 2) {
        error("A videó hash nem található", $hotlink);
    }
}
// In both cases $hash now holds the match array; the hash is capture group 1.
$hash = $hash[1];
// Get video URL
$result = get_web_page(INDA_AMFPHP . $hash);
Beispiel #3
0
 /**
  * Extracts links and hashtags from a message text.
  *
  * The result may contain the following keys, each holding a list:
  *  - "url_raw":      every http/https link found in the text, as written;
  *  - "url":          only when $redirect is set, and only for links shorter
  *                    than 30 characters: the result of get_redirect_url(),
  *                    or the raw link when that result is falsy;
  *  - "final_url":    only when $follow_redirect is set: get_final_url() of
  *                    each link;
  *  - "dc:subject", "rdfs:seeAlso", "dc:relation": one entry per hashtag
  *                    (the tag without "#", and the tag appended to the
  *                    URL_TWITTER4RDF_HASHTAG / URL_TWITTERLOGIC_HASHTAG
  *                    prefixes, lower-cased for the latter).
  *
  * @param string $text             Message text to scan.
  * @param bool   $redirect         Expand short links via get_redirect_url().
  * @param bool   $follow_redirect  Resolve every link via get_final_url().
  * @return array|null  The extracted data, or null when $text is empty.
  */
 static function parse_message($text, $redirect = false, $follow_redirect = false)
 {
     if (empty($text)) {
         return;
     }
     $twitter = array();
     //extract links
     if (preg_match_all('/https?:[^\\s<>"\',]+/', $text, $matches)) {
         // $matches[0] is the list of full matches; iterating $matches itself
         // would walk the capture groups and only ever see the first link.
         foreach ($matches[0] as $link) {
             $twitter["url_raw"][] = $link;
             // Only short links are treated as candidates for expansion.
             if ($redirect && strlen($link) < 30) {
                 $expanded = get_redirect_url($link);
                 $twitter["url"][] = $expanded ? $expanded : $link;
             }
             if ($follow_redirect) {
                 $twitter["final_url"][] = get_final_url($link);
             }
         }
     }
     //extract hashtags
     if (preg_match_all('/#[A-Za-z0-9-_]+/', $text, $matches)) {
         foreach ($matches[0] as $match) {
             // Drop the leading "#".
             $tag = substr($match, 1);
             $twitter["dc:subject"][] = $tag;
             $twitter["rdfs:seeAlso"][] = URL_TWITTER4RDF_HASHTAG . $tag;
             $twitter["dc:relation"][] = URL_TWITTERLOGIC_HASHTAG . strtolower($tag);
         }
     }
     return $twitter;
 }
Beispiel #4
0
                echo 'Failed to find the course url from ' . $url . "\n";
                continue;
            }
            $data = fetch_url($courses_url, LONG_CACHE_EXPIRY_TIMESPAN);
            if (!$data) {
                echo 'Failed to grab the course url data from ' . $url . "\n";
                continue;
            }
            $html = str_get_html($data);
            $elm = $html->find('a.Level2Group');
            foreach ($elm as $e) {
                $facultyUrl = $rootUrl . $e->href;
                preg_match_all('/(.+) \\((.+)\\)/', $e->innertext, $matches);
                $facultyName = $matches[1][0];
                $facultyAcronym = $matches[2][0];
                $faculties[$facultyAcronym] = array('name' => $facultyName, 'url' => get_final_url($facultyUrl));
            }
        }
    }
    $html->__destruct();
    unset($html);
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Great, we have all of the faculties. Now let's cross reference it with the existing db.
///////////////////////////////////////////////////////////////////////////////////////////////////
/*
// Don't prune anymore.
$results = $db->query('SELECT * FROM faculties;');
while ($row = mysql_fetch_assoc($results)) {
  // Check if this faculty no longer exists.
  if (!isset($faculties[$row['acronym']])) {
Beispiel #5
0
/**
 * Resolves the DBpedia article name that an article identifier redirects to.
 *
 * Builds the DBpedia page URL for $article, follows its redirects with
 * get_final_url(), URL-decodes the result, passes it through removeLink()
 * and replaces every space with an underscore.
 *
 * @param string $article  Article identifier appended to "http://dbpedia.org/page/".
 * @return string  The resolved article name with spaces replaced by underscores.
 */
function searchCorrectDBPediaArticle($article)
{
    $dbpediaUrl = "http://dbpedia.org/page/" . $article;
    $correctUrl = urldecode(get_final_url($dbpediaUrl));
    // NOTE(review): removeLink() is defined elsewhere; presumably it strips the
    // URL prefix and leaves the bare article name - confirm.
    $tempArticle = removeLink($correctUrl);

    // URL-decoding can leave spaces in the name; swap them for underscores.
    return str_replace(' ', '_', $tempArticle);
}