Exemplo n.º 1
0
<?php

// Pull in the project's other files. NOTE(review): presumably config.php declares
// the `config` class and functions.php the helpers called below — confirm.
require 'config.php';
require 'functions.php';
// The path of the third file is read from the class constant config::redirect_file.
require config::redirect_file;
// Redirect pipeline: each step hands its result to the next helper.
// NOTE(review): none of these helpers are defined in this file; their exact
// contracts (return types, failure values) must be checked where they are declared.
$redirect_key = get_redirect_key();
$redirect_key_formatted = format_redirect_key($redirect_key);
$redirect_target = get_redirect_target($redirect_key_formatted);
$redirect_url = build_url($redirect_target);
// Final step: the built URL is passed to send_header().
send_header($redirect_url);
/**
 * Resolve ift.tt short URLs and collect the resource URLs found on the pages they point to.
 *
 * Short URLs are taken from $direct_url when it is given, otherwise from tumblr_likes.txt.
 * For every short URL found:
 *     1 resolve it with get_redirect_target(); on failure append the short URL
 *       to invalid_urls.txt and continue with the next one
 *     2 fetch the page with get_page_src()
 *     3 extract the resource URLs according to get_post_type()
 *       (photo/photoset, audio, video; anything else is parsed as images)
 *     4 drop URLs whose file name occurs in unwanted_files.txt
 *     5 append the remaining URLs to resource_urls.txt, one per line
 * Progress and per-URL errors are echoed; a failing URL never aborts the whole run.
 *
 * @param string|null $direct_url text containing the short URL(s) to process
 *                                instead of reading tumblr_likes.txt
 * @return void
 */
function main($direct_url = null)
{
    if ($direct_url) {
        $txt = $direct_url;
    } else {
        # a missing/unreadable list is reported once instead of raising warnings
        $txt = is_readable('tumblr_likes.txt') ? file_get_contents('tumblr_likes.txt') : false;
        if ($txt === false) {
            echo "Cannot read tumblr_likes.txt\n";
            return;
        }
    }

    # \S+ stops at the first whitespace so a trailing "\r" or text following the
    # URL on the same line is not treated as part of it; the host dot is escaped
    if (!preg_match_all('#http://ift\.tt/\S+#', $txt, $matches)) {
        return;
    }

    # the unwanted list is optional: without it nothing is filtered out
    $unwanted = is_readable('unwanted_files.txt') ? (string) file_get_contents('unwanted_files.txt') : '';

    #deal with each short url
    foreach ($matches[0] as $ori_url) {
        try {
            echo str_repeat('-', 30), "\n";
            echo "Start: {$ori_url}\n";

            #get long url
            $redirect_url = get_redirect_target($ori_url);
            if (!$redirect_url) {
                file_put_contents('invalid_urls.txt', "{$ori_url}\n", FILE_APPEND);
                throw new Exception("invalid original URL {$ori_url}");
            }
            echo "Location fetched: {$redirect_url}\n";

            #get html page content
            $page_src = get_page_src($redirect_url);
            if (!$page_src) {
                throw new Exception("zero length page_src");
            }
            printf("Page fetched: length(%d)\n", strlen($page_src));

            #fetch out resource urls
            $post_type = get_post_type($page_src);
            switch ($post_type) {
                case 'photo':
                case 'photoset':
                    $resource_urls = parse_img_urls($page_src);
                    break;
                case 'audio':
                    echo "fetching audio\n";
                    $resource_urls = parse_audio_url($page_src);
                    break;
                case 'video':
                    echo "fetching video\n";
                    $resource_urls = parse_video_url($page_src);
                    break;
                default:
                    echo 'unknown resource, trying images', "\n";
                    $resource_urls = parse_img_urls($page_src);
            }

            #filter out unwanted files (only list results are filtered, as before)
            if (is_array($resource_urls)) {
                $kept = array();
                foreach ($resource_urls as $url) {
                    $name = basename($url);
                    # an empty needle makes strpos() warn on PHP < 8 and match at
                    # offset 0 on PHP >= 8, so URLs without a file name are kept
                    if ($name !== '' && strpos($unwanted, $name) !== false) {
                        continue;
                    }
                    $kept[] = $url;
                }
                $resource_urls = implode("\n", $kept);
            }

            # do not append blank lines to resource_urls.txt when nothing is left
            $resource_urls = (string) $resource_urls;
            if ($resource_urls === '') {
                throw new Exception("no resource urls found");
            }

            $resource_urls .= "\n";
            echo $resource_urls;
            file_put_contents('resource_urls.txt', $resource_urls, FILE_APPEND);
        } catch (Exception $e) {
            # report and carry on with the next short url
            echo 'Exception: ', $e->getMessage(), "\n";
        }
    }
}