Exemplo n.º 1
0
// Script prologue: obtain the query to work on, declare the crawl configuration
// and construct the helper objects used by the processing loop that follows.
// quickly bail out if there is no job for me
// NOTE(review): loose comparison - any value that == null (e.g. '' or false) also ends the script; confirm what work_getOneForMe() returns when idle
$workingQuery = work_getOneForMe();
if ($workingQuery == null) {
    die("no jobs for me\n");
}
// configuration
// NOTE(review): the meanings below are inferred from the constant names only - confirm against the code that reads them
// presumably restricts searches to HD videos
const QUERY_HD_ONLY = true;
// presumably the number of results requested per search
const QUERY_RESULTS_COUNT = 100;
// ordering criteria; the search loop iterates over these, one search per entry
const QUERY_SORTING_SEQUENCE = ['relevance', 'viewCount', 'rating'];
// presumably the minimum view count for a video to be considered
const MIN_VIDEO_VIEWS = 2000;
// presumably forces already-indexed videos to be indexed again when true
const FORCE_REINDEX = false;
// name of the set tracking processed videos (storage backend not visible here)
define('VIDEOS_PROCESSED_SET', 'jam_videos_processed');
//define('VIDEOS_INDEXED_SET_NAME', 'jam_videos_indexed');
// create the global objects
require_once 'YTMachine.php';
$ytMachine = new \YTMachine();
// the Algolia backend below is disabled; ElasticSearch is the active index
//require_once 'IndexMachine_Algolia.php';
//$indexMachine = new \IndexMachine_Algolia(isset($_GET['index']) ? 'yt_' . $_GET['index'] : '');
require_once 'IndexMachine_ElasticSearch.php';
// NOTE(review): the host:port is hard-coded here; consider moving it to configuration
$indexMachine = new \IndexMachine_ElasticSearch(['173.230.144.120:9199']);
// loop until there's work
do {
    // let's go with the current query
    $outPrefix = 'Q: ' . $workingQuery . ': ';
    echo $outPrefix . "started\n";
    $someQueries = [$workingQuery];
    // search youtube for N queries, for M (3) ordering criteria
    /* @var $videoLeads YTVideo[] */
    $videoLeads = [];
    foreach ($someQueries as $query) {
        foreach (QUERY_SORTING_SEQUENCE as $order) {
Exemplo n.º 2
0
 /**
  * Resolve the best caption track for this video, at most once per instance.
  *
  * Lists the caption tracks of $this->videoId through the YouTube API and, for each
  * track, applies a chain of filters: it must belong to this video, have status
  * 'serving', be of kind 'standard' and, when it declares a language, match
  * $this->language. Surviving tracks are downloaded in the srv1 XML format (looked up
  * in CacheMachine first), split into timed lines and checked against the size, length,
  * duration and content heuristics. The largest remaining track is stored in $this->ytCC.
  *
  * Each rejected track appends a short code to $this->lastResolveCaptionsIssue:
  *   i (wrong video id), s (not serving), k-<kind> (unsupported track kind),
  *   l-<lang> (other language), h1-<status> / h2-"<message>" (download failure),
  *   S-<bytes> (too small), x-1 / x-2 (unexpected root / unparsable XML),
  *   e (too few usable lines).
  *
  * When YT_VIOLENT is truthy several of these conditions end the script via die()
  * instead of being recorded. Exceptions thrown by the captions listing call itself
  * are not handled here.
  *
  * @return bool True if successful
  */
 public function resolveCaptions()
 {
     // do it at most once
     if ($this->resolvedCaptions) {
         return $this->ytCC != null;
     }
     $this->resolvedCaptions = true;
     $this->lastResolveCaptionsIssue = '';
     // perform the resolution
     // FIXME: SPEED BOTTLENECK (1/second)
     $captionsList = YTMachine::getYoutube()->captions->listCaptions('snippet', $this->videoId);
     // collect every track that passes all the filters below
     $ytCCs = [];
     foreach ($captionsList->getItems() as $item) {
         /* @var $item Google_Service_YouTube_Caption */
         /* @var $cc Google_Service_YouTube_CaptionSnippet */
         $cc = $item->getSnippet();
         $ccId = $item->getId();
         // Sanity: abort if the returned CC is for a different video
         $ccVideoId = $cc->getVideoId();
         if ($ccVideoId != $this->videoId) {
             if (YT_VIOLENT) {
                 die('wrong video id, got ' . $ccVideoId . ' expecting ' . $this->videoId);
             }
             $this->lastResolveCaptionsIssue .= ' i';
             continue;
         }
         // Sanity: check constant attributes, or stop if unexpected
         $ccStatus = $cc->getStatus();
         if ($ccStatus != 'serving') {
             if (YT_VERBOSE) {
                 echo '{wrong cc status: ' . $ccStatus;
                 if ($ccStatus == 'failure') {
                     echo '(' . $cc->getFailureReason() . ')';
                 }
                 echo '}';
             }
             $this->lastResolveCaptionsIssue .= ' s';
             continue;
         }
         // request parameters, collected in the order they are sent
         $fetchParams = ['v' => $ccVideoId];
         // check kind: only 'standard' tracks are usable, and they need no extra parameter
         $ccKind = $cc->getTrackKind();
         if ($ccKind == 'ASR') {
             // FILTER: TODO: we don't support the ASR format yet, at all. Always fails.
             if (YT_VERBOSE) {
                 echo $ccVideoId . ',  skipping ASR tracks (unsupported yet)' . "\n";
             }
             $this->lastResolveCaptionsIssue .= ' k-asr';
             continue;
         }
         if ($ccKind != 'standard') {
             if (YT_VIOLENT) {
                 die('unknown track type ' . $ccKind);
             }
             $this->lastResolveCaptionsIssue .= ' k-' . $ccKind;
             continue;
         }
         // add language
         $ccLang = $cc->getLanguage();
         if (!empty($ccLang)) {
             // FILTER: skip if the language is not what we asked for
             if ($ccLang != $this->language) {
                 // NOTE: in the future we could also stash other languages for later
                 if (YT_VERBOSE) {
                     echo $ccVideoId . ',  skipping CC for different language: ' . $ccLang . "\n";
                 }
                 $this->lastResolveCaptionsIssue .= ' l-' . $ccLang;
                 continue;
             }
             $fetchParams['lang'] = $ccLang;
         }
         // add 'name'
         $ccTrackName = $cc->getName();
         if (!empty($ccTrackName)) {
             $fetchParams['name'] = $ccTrackName;
         }
         // customize the output format. available formats:
         // srv1: <text start="2.501" dur="3.671">
         // srv2: <timedtext><window t="0" id="1" op="define" rc="15" cc="32" ap="7" ah="50" av="95"/><text w="2" t="5538" d="2536">RE</text>
         // srv3: <timedtext format="3"><p t="2501" d="3671" w="2"><s>TH</s><s t="33">E</s>
         // sbv:  [SubViewer] 0:00:02.501,0:00:06.172 \n THE WASHINGTON CORRESPONDENT
         // srt:  [SubRip   ] 1 \n 00:00:02,501 --> 00:00:06,172 \n THE WASHINGTON CORRESPONDENT
         // ttml: [TTML     ] <p begin="00:00:02.501" end="00:00:06.172" region="r3" style="s2"><span begin="00:00:00.000">TH</span>
         // vtt:  [WebVTT   ] 00:00:02.501 --> 00:00:06.172 align:start position:0% line:7% \n THE WASHINGTON CORRESPONDENT
         $fetchParams['fmt'] = 'srv1';
         // Raw (unencoded) query: kept only as cache key and for diagnostics, in the exact
         // historical form so entries cached by earlier runs are still found.
         $rawPairs = [];
         foreach ($fetchParams as $paramName => $paramValue) {
             $rawPairs[] = $paramName . '=' . $paramValue;
         }
         $fetchQuery = implode('&', $rawPairs);
         // Encoded query: what is actually sent. Track names are free text, so characters
         // such as '&', '#', '+' or '=' must be percent-encoded or they corrupt the request.
         $requestQuery = http_build_query($fetchParams, '', '&', PHP_QUERY_RFC3986);
         // Fetch the Caption from cache
         $ccString = CacheMachine::retrieveValue('cc_' . $fetchQuery);
         // Fetch the Caption (and expect a 200:OK code)
         if ($ccString == null) {
             try {
                 $response = YTMachine::getGuzzle()->get('https://www.youtube.com/api/timedtext?' . $requestQuery);
                 if ($response->getStatusCode() != 200) {
                     if (YT_VIOLENT) {
                         die('wrong status code ' . $response->getStatusCode() . ' on ' . $fetchQuery);
                     }
                     $this->lastResolveCaptionsIssue .= ' h1-' . $response->getStatusCode();
                     continue;
                 }
                 $ccString = $response->getBody()->getContents();
                 CacheMachine::storeValue('cc_' . $fetchQuery, $ccString, null);
             } catch (\GuzzleHttp\Exception\TransferException $exception) {
                 // TransferException is the common base of Guzzle's client (4xx), server (5xx)
                 // and connection errors: one failing download only disqualifies this track
                 // instead of aborting the whole resolution.
                 if (YT_VIOLENT) {
                     die('HTTP request failed: ' . $exception);
                 }
                 $this->lastResolveCaptionsIssue .= ' h2-"' . $exception->getMessage() . '"';
                 continue;
             }
         }
         // FILTER: Size Heuristic (FIXME): reject semi-empty captions (usually with not much more than the title)
         $ccStringSize = $ccString != null ? strlen($ccString) : 0;
         if ($ccStringSize < self::SUB_OK_THRESHOLD) {
             if (YT_VERBOSE) {
                 echo $ccVideoId . ',  skipping for small size: ' . $ccStringSize . "\n";
             }
             $this->lastResolveCaptionsIssue .= ' S-' . $ccStringSize;
             continue;
         }
         // FILTER: parse and validate XML
         try {
             $ccTranscript = new SimpleXMLElement($ccString);
             if ($ccTranscript->getName() != 'transcript') {
                 if (YT_VIOLENT) {
                     die('expected a transcript root element, got a ' . $ccTranscript->getName() . ' instead ');
                 }
                 $this->lastResolveCaptionsIssue .= ' x-1';
                 continue;
             }
         } catch (Exception $e) {
             if (YT_VERBOSE) {
                 echo 'skipping for xml parsing error' . $ccVideoId . "\n";
             }
             $this->lastResolveCaptionsIssue .= ' x-2';
             continue;
         }
         // Break the srv1 transcript into individual Lines
         $lines = [];
         $maxLength = 0;
         foreach ($ccTranscript->text as $line) {
             $text = $this->fixSrv1Caption(strval($line));
             // fix quoted strings: strip one pair of surrounding double quotes
             if (strlen($text) > YT_MIN_VALID_CHARS && substr($text, 0, 1) == '"' && substr($text, -1) == '"') {
                 $text = substr($text, 1, -1);
             }
             // skip lines shorter than YT_MIN_VALID_CHARS or longer than YT_MAX_VALID_CHARS (byte lengths)
             $textLength = strlen($text);
             if ($textLength < YT_MIN_VALID_CHARS || $textLength > YT_MAX_VALID_CHARS) {
                 continue;
             }
             // FILTER: videos that have start but not duration are usually 1-liners
             $attributes = $line->attributes();
             $start = $attributes['start'];
             $duration = $attributes['dur'];
             if (empty($start) || empty($duration)) {
                 if (YT_VERBOSE) {
                     echo 'skipping cc line for start or duration empty on ' . $ccVideoId . " body: " . $ccString . "\n";
                 }
                 continue;
             }
             if (floatval($duration) > YT_MAX_VALID_DURATION) {
                 if (YT_VERBOSE) {
                     echo 'skipping cc line for duration ' . $ccVideoId . "\n";
                 }
                 continue;
             }
             // skip "( ... )" strings, since parenthesis are not real speech
             if (substr($text, 0, 1) == '(' && substr($text, -1) == ')') {
                 if (YT_VERBOSE) {
                     echo 'skipping cc line for (parenthesis) ' . $ccVideoId . "\n";
                 }
                 continue;
             }
             // add the line: t = text, s = start, d = duration, e = end (start + duration)
             $lines[] = ['t' => $text, 's' => floatval($start), 'd' => floatval($duration), 'e' => floatval($start) + floatval($duration)];
             if ($textLength > $maxLength) {
                 $maxLength = $textLength;
             }
         }
         // FILTER: almost-empty docs, or docs whose longest line is below YT_MIN_VALID_CHARS
         if ($maxLength < YT_MIN_VALID_CHARS || sizeof($lines) < YT_MIN_VALID_LINES) {
             if (YT_VERBOSE) {
                 echo 'skipping for emptiness  ' . sizeof($lines) . " lines and " . $maxLength . " max chars per line\n";
             }
             $this->lastResolveCaptionsIssue .= ' e';
             continue;
         }
         // save the fully-fetched caption
         $ytCCs[] = new YTCC($ccId, $ccVideoId, $ccTrackName, $ccStringSize, $ccString, $lines, $cc->getLastUpdated());
     }
     // use just best caption amongst those available, chosen by size (largest first)
     usort($ytCCs, function ($a, $b) {
         return $b->ccSize - $a->ccSize;
     });
     // pick the best (if any), or null
     $this->ytCC = empty($ytCCs) ? null : $ytCCs[0];
     return $this->ytCC != null;
 }