// quickly bail out if there is no job for me $workingQuery = work_getOneForMe(); if ($workingQuery == null) { die("no jobs for me\n"); } // configuration const QUERY_HD_ONLY = true; const QUERY_RESULTS_COUNT = 100; const QUERY_SORTING_SEQUENCE = ['relevance', 'viewCount', 'rating']; const MIN_VIDEO_VIEWS = 2000; const FORCE_REINDEX = false; define('VIDEOS_PROCESSED_SET', 'jam_videos_processed'); //define('VIDEOS_INDEXED_SET_NAME', 'jam_videos_indexed'); // create the global objects require_once 'YTMachine.php'; $ytMachine = new \YTMachine(); //require_once 'IndexMachine_Algolia.php'; //$indexMachine = new \IndexMachine_Algolia(isset($_GET['index']) ? 'yt_' . $_GET['index'] : ''); require_once 'IndexMachine_ElasticSearch.php'; $indexMachine = new \IndexMachine_ElasticSearch(['173.230.144.120:9199']); // loop until there's work do { // let's go with the current query $outPrefix = 'Q: ' . $workingQuery . ': '; echo $outPrefix . "started\n"; $someQueries = [$workingQuery]; // search youtube for N queries, for M (3) ordering criteria /* @var $videoLeads YTVideo[] */ $videoLeads = []; foreach ($someQueries as $query) { foreach (QUERY_SORTING_SEQUENCE as $order) {
/**
 * Resolves the caption track to use for this video and stores it in $this->ytCC.
 *
 * The work is done at most once per instance: later calls only report whether a
 * caption was found by the first call. Every caption track listed for the video is
 * checked (video id, status, track kind, language), its 'srv1' XML body is loaded
 * from the cache or downloaded, and the body is split into validated lines.
 * Among the tracks that survive all filters, the one with the largest body wins.
 *
 * Each rejected track appends a short code to $this->lastResolveCaptionsIssue:
 *   ' i'          caption belongs to a different video
 *   ' s'          caption status is not 'serving'
 *   ' k-asr'      ASR track (not supported yet)
 *   ' k-<kind>'   unknown track kind
 *   ' l-<lang>'   language differs from the requested one
 *   ' h1-<code>'  download answered with a non-200 status code
 *   ' h2-"<msg>"' download raised a Guzzle ClientException
 *   ' S-<size>'   body smaller than self::SUB_OK_THRESHOLD
 *   ' x-1'        XML root element is not 'transcript'
 *   ' x-2'        XML could not be parsed
 *   ' e'          too few usable lines after filtering
 *
 * When YT_VIOLENT is set, several of the unexpected conditions above terminate the
 * script with die() instead of skipping the track.
 *
 * @return bool True if successful
 */
public function resolveCaptions() {
    // do it at most once
    if ($this->resolvedCaptions) {
        return $this->ytCC != null;
    }
    $this->resolvedCaptions = true;
    $this->lastResolveCaptionsIssue = '';

    // perform the resolution
    // FIXME: SPEED BOTTLENECK (1/second)
    $captionsList = YTMachine::getYoutube()->captions->listCaptions('snippet', $this->videoId);

    // collect every caption track that matches the language and passes all the filters
    $ytCCs = [];
    foreach ($captionsList->getItems() as $item) {
        /* @var $item Google_Service_YouTube_Caption */
        /* @var $cc Google_Service_YouTube_CaptionSnippet */
        $cc = $item->getSnippet();
        $ccId = $item->getId();

        // Sanity: abort if the returned CC is for a different video.
        // Compared strictly as strings, so that numeric-looking ids (e.g. '0e12...')
        // are never considered equal through PHP's numeric string juggling.
        $ccVideoId = $cc->getVideoId();
        if ((string) $ccVideoId !== (string) $this->videoId) {
            if (YT_VIOLENT) {
                die('wrong video id, got ' . $ccVideoId . ' expecting ' . $this->videoId);
            }
            $this->lastResolveCaptionsIssue .= ' i';
            continue;
        }

        // Sanity: check constant attributes, or stop if unexpected
        $ccStatus = $cc->getStatus();
        if ($ccStatus != 'serving') {
            if (YT_VERBOSE) {
                echo '{wrong cc status: ' . $ccStatus;
                if ($ccStatus == 'failure') {
                    echo '(' . $cc->getFailureReason() . ')';
                }
                echo '}';
            }
            $this->lastResolveCaptionsIssue .= ' s';
            continue;
        }

        // Base fetching query. $fetchQuery is the raw (unencoded) form and is used as
        // the cache key and in diagnostics; $fetchParams holds the same parameters and
        // is percent-encoded when the request URL is built.
        $fetchQuery = 'v=' . $ccVideoId;
        $fetchParams = ['v' => $ccVideoId];

        // check the kind: only 'standard' tracks are processed
        $ccKind = $cc->getTrackKind();
        if ($ccKind == 'ASR') {
            // FILTER: TODO: we don't support the ASR format yet, at all. Always fails.
            if (YT_VERBOSE) {
                echo $ccVideoId . ', skipping ASR tracks (unsupported yet)' . "\n";
            }
            $this->lastResolveCaptionsIssue .= ' k-asr';
            continue;
        }
        if ($ccKind != 'standard') {
            if (YT_VIOLENT) {
                die('unknown track type ' . $ccKind);
            }
            $this->lastResolveCaptionsIssue .= ' k-' . $ccKind;
            continue;
        }

        // add language
        $ccLang = $cc->getLanguage();
        if (!empty($ccLang)) {
            // FILTER: skip if the language is not what we asked for
            if ($ccLang != $this->language) {
                // NOTE: in the future we could also stash other languages for later
                if (YT_VERBOSE) {
                    echo $ccVideoId . ', skipping CC for different language: ' . $ccLang . "\n";
                }
                $this->lastResolveCaptionsIssue .= ' l-' . $ccLang;
                continue;
            }
            $fetchQuery .= '&lang=' . $ccLang;
            $fetchParams['lang'] = $ccLang;
        }

        // add 'name'
        $ccTrackName = $cc->getName();
        if (!empty($ccTrackName)) {
            $fetchQuery .= '&name=' . $ccTrackName;
            $fetchParams['name'] = $ccTrackName;
        }

        // customize the output format. available formats:
        //  srv1: <text start="2.501" dur="3.671">
        //  srv2: <timedtext><window t="0" id="1" op="define" rc="15" cc="32" ap="7" ah="50" av="95"/><text w="2" t="5538" d="2536">RE</text>
        //  srv3: <timedtext format="3"><p t="2501" d="3671" w="2"><s>TH</s><s t="33">E</s>
        //  sbv:  [SubViewer] 0:00:02.501,0:00:06.172 \n THE WASHINGTON CORRESPONDENT
        //  srt:  [SubRip   ] 1 \n 00:00:02,501 --> 00:00:06,172 \n THE WASHINGTON CORRESPONDENT
        //  ttml: [TTML     ] <p begin="00:00:02.501" end="00:00:06.172" region="r3" style="s2"><span begin="00:00:00.000">TH</span>
        //  vtt:  [WebVTT   ] 00:00:02.501 --> 00:00:06.172 align:start position:0% line:7% \n THE WASHINGTON CORRESPONDENT
        $fetchQuery .= '&fmt=srv1';
        $fetchParams['fmt'] = 'srv1';

        // Fetch the Caption from cache (the key is built from the raw query string)
        $cacheKey = 'cc_' . $fetchQuery;
        $ccString = CacheMachine::retrieveValue($cacheKey);

        // Fetch the Caption (and expect a 200:OK code)
        if ($ccString == null) {
            // Percent-encode every value: a track name containing '&', '#', '+', '='
            // or '%' would otherwise corrupt the query string sent to the server.
            $encodedQuery = http_build_query($fetchParams, '', '&', PHP_QUERY_RFC3986);
            try {
                $response = YTMachine::getGuzzle()->get('https://www.youtube.com/api/timedtext?' . $encodedQuery);
                if ($response->getStatusCode() != 200) {
                    if (YT_VIOLENT) {
                        die('wrong status code ' . $response->getStatusCode() . ' on ' . $fetchQuery);
                    }
                    $this->lastResolveCaptionsIssue .= ' h1-' . $response->getStatusCode();
                    continue;
                }
                $ccString = $response->getBody()->getContents();
                CacheMachine::storeValue($cacheKey, $ccString, null);
            } catch (\GuzzleHttp\Exception\ClientException $exception) {
                if (YT_VIOLENT) {
                    die('HTTP request failed: ' . $exception);
                }
                $this->lastResolveCaptionsIssue .= ' h2-"' . $exception->getMessage() . '"';
                continue;
            }
        }

        // FILTER: Size Heuristic (FIXME): reject semi-empty captions (usually with not much more than the title)
        $ccStringSize = $ccString != null ? strlen($ccString) : 0;
        if ($ccStringSize < self::SUB_OK_THRESHOLD) {
            if (YT_VERBOSE) {
                echo $ccVideoId . ', skipping for small size: ' . $ccStringSize . "\n";
            }
            $this->lastResolveCaptionsIssue .= ' S-' . $ccStringSize;
            continue;
        }

        // FILTER: parse and validate XML
        try {
            $ccTranscript = new SimpleXMLElement($ccString);
            if ($ccTranscript->getName() != 'transcript') {
                if (YT_VIOLENT) {
                    die('expected a transcript root element, got a ' . $ccTranscript->getName() . ' instead ');
                }
                $this->lastResolveCaptionsIssue .= ' x-1';
                continue;
            }
        } catch (Exception $e) {
            if (YT_VERBOSE) {
                echo 'skipping for xml parsing error ' . $ccVideoId . "\n";
            }
            $this->lastResolveCaptionsIssue .= ' x-2';
            continue;
        }

        // Break the transcript into individual Lines
        $lines = [];
        $maxLength = 0;
        foreach ($ccTranscript->text as $line) {
            $text = $this->fixSrv1Caption(strval($line));

            // fix quoted strings: drop the surrounding double quotes
            if (strlen($text) > YT_MIN_VALID_CHARS && substr($text, 0, 1) == '"' && substr($text, -1) == '"') {
                $text = substr($text, 1, -1);
            }

            // skip lines shorter than YT_MIN_VALID_CHARS or longer than YT_MAX_VALID_CHARS
            $textLength = strlen($text);
            if ($textLength < YT_MIN_VALID_CHARS || $textLength > YT_MAX_VALID_CHARS) {
                continue;
            }

            // FILTER: videos that have start but not duration are usually 1-liners
            $attributes = $line->attributes();
            $start = $attributes['start'];
            $duration = $attributes['dur'];
            if (empty($start) || empty($duration)) {
                if (YT_VERBOSE) {
                    echo 'skipping cc line for start or duration empty on ' . $ccVideoId . " body: " . $ccString . "\n";
                }
                continue;
            }

            // convert the timing attributes once; they are reused below
            $startSeconds = floatval($start);
            $durationSeconds = floatval($duration);
            if ($durationSeconds > YT_MAX_VALID_DURATION) {
                if (YT_VERBOSE) {
                    echo 'skipping cc line for duration ' . $ccVideoId . "\n";
                }
                continue;
            }

            // skip "( ... )" strings, since parenthesis are not real speech
            if (substr($text, 0, 1) == '(' && substr($text, -1) == ')') {
                if (YT_VERBOSE) {
                    echo 'skipping cc line for (parenthesis) ' . $ccVideoId . "\n";
                }
                continue;
            }

            // add the line: t = text, s = start, d = duration, e = end (start + duration)
            $lines[] = [
                't' => $text,
                's' => $startSeconds,
                'd' => $durationSeconds,
                'e' => $startSeconds + $durationSeconds,
            ];
            if ($textLength > $maxLength) {
                $maxLength = $textLength;
            }
        }

        // FILTER: almost-empty docs, or docs whose longest line is still too short
        $lineCount = count($lines);
        if ($maxLength < YT_MIN_VALID_CHARS || $lineCount < YT_MIN_VALID_LINES) {
            if (YT_VERBOSE) {
                echo 'skipping for emptiness ' . $lineCount . " lines and " . $maxLength . " max chars per line\n";
            }
            $this->lastResolveCaptionsIssue .= ' e';
            continue;
        }

        // save the fully-fetched caption
        $ytCCs[] = new YTCC($ccId, $ccVideoId, $ccTrackName, $ccStringSize, $ccString, $lines, $cc->getLastUpdated());
    }

    // use just the best caption amongst those available, chosen by size;
    // on equal sizes the first one encountered is kept
    $bestCC = null;
    foreach ($ytCCs as $candidate) {
        if ($bestCC === null || $candidate->ccSize > $bestCC->ccSize) {
            $bestCC = $candidate;
        }
    }

    // pick the best (if any), or null
    $this->ytCC = $bestCC;
    return $this->ytCC !== null;
}