/**
 * Returns the post-editing effort percentage of this segment, computing it on first use.
 *
 * When the pe_effort_perc property is still null it is set to
 * round((1 - MyMemory::TMS_MATCH(suggestion, translation)) * 100) and kept for
 * later calls; otherwise the stored value is handed back untouched.
 *
 * @return float|string
 */
public function getPeePerc()
{
    if (null !== $this->pe_effort_perc) {
        return $this->pe_effort_perc;
    }

    $match_score = MyMemory::TMS_MATCH($this->suggestion, $this->translation);
    $this->pe_effort_perc = round((1 - $match_score) * 100);

    return $this->pe_effort_perc;
}
/**
 * Builds the editing log of a job: one decorated row per segment plus aggregate statistics.
 *
 * Every row returned by getEditLog() is enriched in place with display fields
 * (formatted time to edit, seconds per word, validity flags, post-editing
 * effort, TER, diff and the CSV / HTML renderings of source, suggestion and
 * translation).
 *
 * @param mixed  $jid          job id; forwarded to getEditLog() and copied into each row as 'jid'
 * @param string $password     job password; forwarded to getEditLog()
 * @param bool   $use_ter_diff when true, diff and TER come from MyMemory::diff_tercpp();
 *                             otherwise the diff comes from MyMemory::diff_html() and TER is 0
 *
 * @return array|false array($data, $stats), or false when getEditLog() returns nothing
 */
public static function getEditingLogData($jid, $password, $use_ter_diff = false)
{
    $data = getEditLog($jid, $password);
    if (!$data) {
        return false;
    }

    // Segments are "valid" for timing statistics only when strictly between these bounds (secs per word).
    $slow_cut = 30;
    $fast_cut = 0.25;

    $stats = array();
    $stats['total-word-count'] = 0;

    // Every accumulator starts as an empty array so array_sum() never receives an undefined variable.
    $stat_rwc = array();
    $stat_valid_rwc = array();
    $stat_valid_tte = array();
    $stat_too_slow = array();
    $stat_too_fast = array();
    $stat_pee = array();
    $stat_ter = array();
    $stat_mt = array();

    // Loop-invariant helpers, built once.
    $lh = Langs_Languages::getInstance();

    // rtrim(..., 'g') strips trailing "g" characters from the placeholder regex constants.
    $array_patterns = array(
        rtrim(self::lfPlaceholderRegex, 'g'),
        rtrim(self::crPlaceholderRegex, 'g'),
        rtrim(self::crlfPlaceholderRegex, 'g'),
        rtrim(self::tabPlaceholderRegex, 'g'),
        rtrim(self::nbspPlaceholderRegex, 'g'),
    );
    $array_replacements_csv = array('\\n', '\\r', '\\r\\n', '\\t', Utils::unicode2chr(0xa0));
    $array_replacements = array(
        '<span class="_0A"></span><br />',
        '<span class="_0D"></span><br />',
        '<span class="_0D0A"></span><br />',
        '<span class="_tab">	</span>',
        '<span class="_nbsp"> </span>',
    );

    foreach ($data as &$seg) {
        $seg['sm'] .= "%";
        $seg['jid'] = $jid;

        $tte = self::parse_time_to_edit($seg['tte']);
        $seg['time_to_edit'] = "{$tte['1']}m:{$tte['2']}s";

        // The raw (unclamped) word count feeds the edited-word total.
        $stat_rwc[] = $seg['rwc'];

        // By definition we cannot have a 0 word sentence. It is probably a "-" or a tag,
        // so we want to consider at least a word.
        if ($seg['rwc'] < 1) {
            $seg['rwc'] = 1;
        }

        $seg['secs-per-word'] = round($seg['tte'] / 1000 / $seg['rwc'], 1);

        if ($seg['secs-per-word'] < $slow_cut && $seg['secs-per-word'] > $fast_cut) {
            $seg['stats-valid'] = 'Yes';
            $seg['stats-valid-color'] = '';
            $seg['stats-valid-style'] = '';
            $stat_valid_rwc[] = $seg['rwc'];
            $stat_valid_tte[] = $seg['tte'];
        } else {
            $seg['stats-valid'] = 'No';
            $seg['stats-valid-color'] = '#ee6633';
            $seg['stats-valid-style'] = 'border:2px solid #EE6633';
        }

        // Stats
        if ($seg['secs-per-word'] >= $slow_cut) {
            $stat_too_slow[] = $seg['rwc'];
        }
        if ($seg['secs-per-word'] <= $fast_cut) {
            $stat_too_fast[] = $seg['rwc'];
        }

        // Post-editing effort, clamped to [0, 100] and weighted by word count for the average.
        $seg['pe_effort_perc'] = round((1 - MyMemory::TMS_MATCH($seg['sug'], $seg['translation'])) * 100);
        if ($seg['pe_effort_perc'] < 0) {
            $seg['pe_effort_perc'] = 0;
        }
        if ($seg['pe_effort_perc'] > 100) {
            $seg['pe_effort_perc'] = 100;
        }
        $stat_pee[] = $seg['pe_effort_perc'] * $seg['rwc'];
        $seg['pe_effort_perc'] .= "%";

        $lang = $lh->getIsoCode($lh->getLocalizedName($seg['target_lang']));

        $sug_for_diff = self::placehold_xliff_tags($seg['sug']);
        $tra_for_diff = self::placehold_xliff_tags($seg['translation']);

        $ter = $use_ter_diff
            ? MyMemory::diff_tercpp($sug_for_diff, $tra_for_diff, $lang)
            : array();

        // Explicit checks replace the former "@" suppression: a missing or non-numeric
        // score counts as 0, a missing diff as null.
        $ter_is_array = is_array($ter);
        $ter_ratio = ($ter_is_array && isset($ter[1]) && is_numeric($ter[1])) ? $ter[1] : 0;
        $diff_ter = ($ter_is_array && isset($ter[0])) ? $ter[0] : null;

        $ter_score = $ter_ratio * 100;
        $stat_ter[] = $ter_score * $seg['rwc'];
        $seg['ter'] = round($ter_score) . "%";

        if ($seg['sug'] != $seg['translation']) {
            // force use of third party ter diff when requested,
            // otherwise use the post-editing HTML diff
            $seg['diff'] = $use_ter_diff
                ? $diff_ter
                : MyMemory::diff_html($sug_for_diff, $tra_for_diff);
        } else {
            $seg['diff'] = '';
        }
        $seg['diff'] = self::restore_xliff_tags_for_view($seg['diff']);

        // BUG: While suggestions source is not correctly set
        if ($seg['sm'] == "85%" || $seg['sm'] == "86%") {
            $seg['ss'] = 'Machine Translation';
            $stat_mt[] = $seg['rwc'];
        } else {
            $seg['ss'] = 'Translation Memory';
        }

        $seg['sug_view'] = trim(CatUtils::rawxliff2view($seg['sug']));
        $seg['source'] = trim(CatUtils::rawxliff2view($seg['source']));
        $seg['translation'] = trim(CatUtils::rawxliff2view($seg['translation']));

        // CSV renderings first: they must be derived before the HTML replacement below.
        $seg['source_csv'] = preg_replace($array_patterns, $array_replacements_csv, $seg['source']);
        $seg['translation_csv'] = preg_replace($array_patterns, $array_replacements_csv, $seg['translation']);
        $seg['sug_csv'] = preg_replace($array_patterns, $array_replacements_csv, $seg['sug_view']);
        $seg['diff_csv'] = preg_replace($array_patterns, $array_replacements_csv, $seg['diff']);

        $seg['source'] = preg_replace($array_patterns, $array_replacements, $seg['source']);
        $seg['translation'] = preg_replace($array_patterns, $array_replacements, $seg['translation']);
        $seg['sug_view'] = preg_replace($array_patterns, $array_replacements, $seg['sug_view']);
        $seg['diff'] = preg_replace($array_patterns, $array_replacements, $seg['diff']);

        if ($seg['mt_qe'] == 0) {
            $seg['mt_qe'] = 'N/A';
        }
    }
    // Break the reference so later use of $seg cannot overwrite the last row of $data.
    unset($seg);

    $stats['edited-word-count'] = array_sum($stat_rwc);
    $stats['valid-word-count'] = array_sum($stat_valid_rwc);

    $has_edited_words = $stats['edited-word-count'] > 0;

    if ($has_edited_words) {
        $stats['too-slow-words'] = round(array_sum($stat_too_slow) / $stats['edited-word-count'], 2) * 100;
        $stats['too-fast-words'] = round(array_sum($stat_too_fast) / $stats['edited-word-count'], 2) * 100;
        $stats['avg-pee'] = round(array_sum($stat_pee) / array_sum($stat_rwc)) . "%";
        $stats['avg-ter'] = round(array_sum($stat_ter) / array_sum($stat_rwc)) . "%";
    }

    // Guarded: the raw word counts can all be 0, which used to divide by zero here.
    $stats['mt-words'] = $has_edited_words
        ? round(array_sum($stat_mt) / $stats['edited-word-count'], 2) * 100
        : 0.0;
    $stats['tm-words'] = 100 - $stats['mt-words'];

    $stats['total-valid-tte'] = round(array_sum($stat_valid_tte) / 1000);

    // Weighted average; 0 when no segment passed the validity cut-offs.
    $stats['avg-secs-per-word'] = ($stats['valid-word-count'] > 0)
        ? round($stats['total-valid-tte'] / $stats['valid-word-count'], 1)
        : 0.0;

    $words_per_day = ($stats['avg-secs-per-word'] != 0)
        ? round(3600 * 8 / $stats['avg-secs-per-word'])
        : 0;
    $stats['est-words-per-day'] = number_format($words_per_day, 0, '.', ',');

    // Last minute formatting (after calculations)
    $temp = self::parse_time_to_edit(round(array_sum($stat_valid_tte)));
    $stats['total-valid-tte'] = "{$temp['0']}h:{$temp['1']}m:{$temp['2']}s";
    $stats['total-tte-seconds'] = $temp[0] * 3600 + $temp[1] * 60 + $temp[2];

    return array($data, $stats);
}
/**
 * Loads the edit-log segments of the current job and turns them into display structs,
 * together with aggregate statistics and pagination data.
 *
 * Segments whose id precedes self::$start_id are only used to find the previous-page
 * boundary; all others become EditLog_EditLogSegmentClientStruct rows.
 *
 * @param bool $use_ter_diff when true, diff and TER come from MyMemory::diff_tercpp();
 *                           otherwise the diff comes from MyMemory::diff_html() and TER is 0
 *
 * @return array array($output_data, $stats, $pagination)
 *
 * @throws Exception with code -1 when the DAO returns no segments
 */
private function getEditLogData($use_ter_diff = false)
{
    $editLogDao = new EditLog_EditLogDao(Database::obtain());
    $data = $editLogDao->getSegments($this->getJid(), $this->getPassword(), self::$start_id);

    // Get translation mismatches and convert the list into a hashmap keyed by segment hash.
    // The map is built in its own array: the previous code assigned into the per-iteration
    // row copy, so no lookup ever succeeded.
    $translationMismatchList = $editLogDao->getTranslationMismatches($this->getJid());
    $translationMismatchMap = array();
    foreach ($translationMismatchList as $translMismRow) {
        $translationMismatchMap[$translMismRow['segment_hash']] = (bool) $translMismRow['translation_mismatch'];
    }

    $__pagination_prev = PHP_INT_MAX;
    $__pagination_next = -2147483648; //PHP_INT_MIN

    if (!$data) {
        throw new Exception('There are no changes in this job', -1);
    }

    $stats = array();
    $stats['total-word-count'] = 0;

    $stat_too_slow = array();
    $stat_too_fast = array();
    $stat_mt = array();
    $stat_valid_rwc = array();
    $stat_rwc = array();
    $stat_pee = array();
    $stat_ter = array();
    $output_data = array();

    // Loop-invariant helpers, built once.
    $lh = Langs_Languages::getInstance();

    // rtrim(..., 'g') strips trailing "g" characters from the placeholder regex constants.
    $array_patterns = array(
        rtrim(CatUtils::lfPlaceholderRegex, 'g'),
        rtrim(CatUtils::crPlaceholderRegex, 'g'),
        rtrim(CatUtils::crlfPlaceholderRegex, 'g'),
        rtrim(CatUtils::tabPlaceholderRegex, 'g'),
        rtrim(CatUtils::nbspPlaceholderRegex, 'g'),
    );
    $array_replacements_csv = array('\\n', '\\r', '\\r\\n', '\\t', Utils::unicode2chr(0xa0));
    $array_replacements = array(
        '<span class="_0A"></span><br />',
        '<span class="_0D"></span><br />',
        '<span class="_0D0A"></span><br />',
        '<span class="_tab">	</span>',
        '<span class="_nbsp"> </span>',
    );

    foreach ($data as $seg) {
        // If the segment is before the current page, it only contributes to the "previous" boundary.
        if ($seg->id < self::$start_id) {
            if ($seg->id <= $__pagination_prev) {
                $__pagination_prev = $seg->id;
            }
            continue;
        }

        if ($seg->id > $__pagination_next) {
            $__pagination_next = $seg->id;
        }

        $displaySeg = new EditLog_EditLogSegmentClientStruct($seg->toArray());
        $displaySeg->suggestion_match .= "%";
        $displaySeg->job_id = $this->jid;

        $tte = CatUtils::parse_time_to_edit($displaySeg->time_to_edit);
        $displaySeg->display_time_to_edit = "{$tte['1']}m:{$tte['2']}s";

        $stat_rwc[] = $seg->raw_word_count;

        // By definition we cannot have a 0 word sentence. It is probably a "-" or a tag,
        // so we want to consider at least a word.
        if ($seg->raw_word_count < 1) {
            $displaySeg->raw_word_count = 1;
        }

        //todo: remove this
        $displaySeg->secs_per_word = $seg->getSecsPerWord();

        if ($displaySeg->secs_per_word < self::EDIT_TIME_SLOW_CUT && $displaySeg->secs_per_word > self::EDIT_TIME_FAST_CUT) {
            $displaySeg->stats_valid = true;
            $stat_valid_rwc[] = $seg->raw_word_count;
        } else {
            $displaySeg->stats_valid = false;
        }

        // Stats
        if ($displaySeg->secs_per_word >= self::EDIT_TIME_SLOW_CUT) {
            $stat_too_slow[] = $seg->raw_word_count;
        }
        if ($displaySeg->secs_per_word <= self::EDIT_TIME_FAST_CUT) {
            $stat_too_fast[] = $seg->raw_word_count;
        }
        $displaySeg->secs_per_word .= "s";

        // Post-editing effort, clamped to [0, 100].
        $displaySeg->pe_effort_perc = $displaySeg->getPeePerc();
        if ($displaySeg->pe_effort_perc < 0) {
            $displaySeg->pe_effort_perc = 0;
        }
        if ($displaySeg->pe_effort_perc > 100) {
            $displaySeg->pe_effort_perc = 100;
        }
        $stat_pee[] = $displaySeg->pe_effort_perc * $seg->raw_word_count;
        $displaySeg->pe_effort_perc .= "%";

        $lang = $lh->getIsoCode($lh->getLocalizedName($seg->job_target));

        $sug_for_diff = CatUtils::placehold_xliff_tags($seg->suggestion);
        $tra_for_diff = CatUtils::placehold_xliff_tags($seg->translation);

        $ter = $use_ter_diff
            ? MyMemory::diff_tercpp($sug_for_diff, $tra_for_diff, $lang)
            : array();

        // Explicit checks replace the former "@" suppression: a missing or non-numeric
        // score counts as 0, a missing diff as null.
        $ter_is_array = is_array($ter);
        $ter_ratio = ($ter_is_array && isset($ter[1]) && is_numeric($ter[1])) ? $ter[1] : 0;
        $diff_ter = ($ter_is_array && isset($ter[0])) ? $ter[0] : null;

        $ter_score = $ter_ratio * 100;
        $stat_ter[] = $ter_score * $seg->raw_word_count;
        $displaySeg->ter = round($ter_score) . "%";

        if ($seg->suggestion != $seg->translation) {
            // force use of third party ter diff when requested,
            // otherwise use the post-editing HTML diff
            $displaySeg->diff = $use_ter_diff
                ? $diff_ter
                : MyMemory::diff_html($sug_for_diff, $tra_for_diff);
        } else {
            $displaySeg->diff = '';
        }
        $displaySeg->diff = CatUtils::restore_xliff_tags_for_view($displaySeg->diff);

        // BUG: While suggestions source is not correctly set
        if ($displaySeg->suggestion_match == "85%" || $displaySeg->suggestion_match == "86%") {
            $displaySeg->suggestion_source = 'Machine Translation';
            $stat_mt[] = $seg->raw_word_count;
        } else {
            $displaySeg->suggestion_source = 'TM';
        }

        // View renderings are computed BEFORE the placeholder replacement. Previously these
        // assignments ran last and silently discarded the HTML replacements made on
        // source / translation / suggestion_view.
        $source_view = trim(CatUtils::rawxliff2view($seg->source));
        $suggestion_view = trim(CatUtils::rawxliff2view($seg->suggestion));
        $translation_view = trim(CatUtils::rawxliff2view($seg->translation));

        // source_csv / translation_csv keep being derived from the raw segment text, as before.
        $displaySeg->source_csv = preg_replace($array_patterns, $array_replacements_csv, $seg->source);
        $displaySeg->translation_csv = preg_replace($array_patterns, $array_replacements_csv, $seg->translation);
        // sug_csv used to read suggestion_view before this method had assigned it.
        $displaySeg->sug_csv = preg_replace($array_patterns, $array_replacements_csv, $suggestion_view);
        $displaySeg->diff_csv = preg_replace($array_patterns, $array_replacements_csv, $displaySeg->diff);

        $displaySeg->source = preg_replace($array_patterns, $array_replacements, $source_view);
        $displaySeg->translation = preg_replace($array_patterns, $array_replacements, $translation_view);
        $displaySeg->suggestion_view = preg_replace($array_patterns, $array_replacements, $suggestion_view);
        $displaySeg->diff = preg_replace($array_patterns, $array_replacements, $displaySeg->diff);

        if ($seg->mt_qe == 0) {
            $displaySeg->mt_qe = 'N/A';
        }

        $displaySeg->num_translation_mismatch = isset($translationMismatchMap[$displaySeg->segment_hash])
            ? (int) $translationMismatchMap[$displaySeg->segment_hash]
            : 0;

        $displaySeg->evaluateWarningString();

        $output_data[] = $displaySeg;
    }

    $pagination = $this->evaluatePagination($__pagination_prev, $__pagination_next + 1);
    $globalStats = $this->evaluateGlobalStats();

    $stats['valid-word-count'] = $globalStats['raw_words'];

    //TODO: this will not work anymore
    $stats['edited-word-count'] = array_sum($stat_rwc);
    $has_edited_words = $stats['edited-word-count'] > 0;

    if ($has_edited_words) {
        $stats['too-slow-words'] = round(array_sum($stat_too_slow) / $stats['edited-word-count'], 2) * 100;
        $stats['too-fast-words'] = round(array_sum($stat_too_fast) / $stats['edited-word-count'], 2) * 100;
        $stats['avg-pee'] = round(array_sum($stat_pee) / array_sum($stat_rwc)) . "%";
        $stats['avg-ter'] = round(array_sum($stat_ter) / array_sum($stat_rwc)) . "%";
    }

    // Guarded: the raw word counts of a page can all be 0, which used to divide by zero here.
    $stats['mt-words'] = $has_edited_words
        ? round(array_sum($stat_mt) / $stats['edited-word-count'], 2) * 100
        : 0.0;
    $stats['tm-words'] = 100 - $stats['mt-words'];

    $stats['total-valid-tte'] = round($globalStats['tot_tte']);

    // Weighted average taken from the job-wide statistics.
    $stats['avg-secs-per-word'] = round($globalStats['secs_per_word'] / 1000, 1);

    $words_per_day = ($stats['avg-secs-per-word'] != 0)
        ? round(3600 * 8 / $stats['avg-secs-per-word'])
        : 0;
    $stats['est-words-per-day'] = number_format($words_per_day, 0, '.', ',');

    // Last minute formatting (after calculations)
    $temp = CatUtils::parse_time_to_edit(round($stats['total-valid-tte']));
    $stats['total-valid-tte'] = "{$temp['0']}h:{$temp['1']}m:{$temp['2']}s";
    $stats['total-tte-seconds'] = $temp[0] * 3600 + $temp[1] * 60 + $temp[2];

    // The job-wide average replaces the page-local one computed above.
    $stats['avg-pee'] = round($globalStats['avg_pee'], 2);
    $stats['avg-pee'] .= "%";

    return array($output_data, $stats, $pagination);
}
/**
 * Daemon loop: for every completed job newer than the checkpoint, computes the
 * word-weighted average post-editing effort and the total time to edit, and
 * stores both on the job row.
 *
 * The id of each processed job is written to self::$last_job_file_name; the job
 * update is committed only after that write succeeds. The loop runs until
 * self::$RUNNING becomes false, sleeping self::$sleeptime seconds between cycles.
 *
 * @param mixed $args not used by this method
 */
function main($args)
{
    $db = Database::obtain();

    $queryMaxJob = "select min(id) as min, max(id) as max\n from jobs\n where completed = 1\n and id > %d";
    $queryFirst = "select id, password, job_first_segment, job_last_segment\n from jobs\n where completed = 1\n and id >= %d and id <= %d";
    $querySegments = "select suggestion,\n translation,\n raw_word_count,\n time_to_edit\n from segment_translations st\n join segments s on st.id_segment = s.id\n and s.id between %d and %d\n where status='translated'\n and id_job = %d\n and show_in_cattool = 1\n and id_segment > %d\n limit %d";
    // The password placeholder must be '%s': sprintf() below passes the job password as the
    // fourth argument, and a literal value here would never match the job row.
    // NOTE(review): the password is interpolated into SQL; it comes from the jobs table,
    // but it should be escaped/bound with whatever the Database class offers.
    $queryUpdateJob = "update jobs\n set avg_post_editing_effort = %f,\n total_time_to_edit = %f\n where id = %d and password = '%s'";

    do {
        // Re-read the checkpoint on every cycle; reading it once before the loop made each
        // cycle start again from the value found at process start-up.
        $lastProcessedJob = (int) file_get_contents(self::$last_job_file_name);

        $minJobMaxJob = $db->query_first(sprintf($queryMaxJob, $lastProcessedJob));

        // min()/max() are NULL when no completed job is newer than the checkpoint.
        $hasNewJobs = isset($minJobMaxJob['min'], $minJobMaxJob['max']);
        $maxJob = $hasNewJobs ? (int) $minJobMaxJob['max'] : 0;
        $minJob = $hasNewJobs ? (int) $minJobMaxJob['min'] : 0;

        $start = time();

        // "<=" so that a single new job (min == max) is processed too; the window upper bound
        // is exclusive of the next window's start so no job is handled twice.
        for ($firstJob = $minJob; $hasNewJobs && self::$RUNNING && $firstJob <= $maxJob; $firstJob += self::NR_OF_JOBS) {
            $jobs = $db->fetch_array(sprintf($queryFirst, $firstJob, $firstJob + self::NR_OF_JOBS - 1));

            // Iterate over completed jobs, evaluate PEE and save it in the job row.
            foreach ($jobs as $job) {
                if (!self::$RUNNING) {
                    break;
                }

                //BEGIN TRANSACTION
                $db->begin();

                $_jid = $job['id'];
                $_password = $job['password'];
                $_job_first_segment = (int) $job['job_first_segment'];
                $_job_last_segment = (int) $job['job_last_segment'];

                $segment_span = $_job_last_segment - $_job_first_segment;
                Log::doLog("job {$_jid} -> " . $segment_span . " segments");
                echo "job {$_jid} -> " . $segment_span . " segments\n";

                $raw_post_editing_effort_job = 0;
                $raw_wc_job = 0;
                $time_to_edit_job = 0;

                for ($firstSeg = $_job_first_segment; $firstSeg <= $_job_last_segment; $firstSeg += self::NR_OF_SEGS) {
                    Log::doLog("starting from segment {$firstSeg}");
                    echo "starting from segment {$firstSeg}\n";

                    // Each query is confined to its own id window [firstSeg, windowEnd]:
                    //  - "id_segment > firstSeg - 1" keeps the first segment of the job, which
                    //    the former "id_segment > firstSeg" bound always dropped;
                    //  - bounding the upper end stops consecutive windows from returning the
                    //    same rows when some segments are not translated.
                    $windowEnd = min($firstSeg + self::NR_OF_SEGS - 1, $_job_last_segment);

                    $segments = $db->fetch_array(sprintf(
                        $querySegments,
                        $firstSeg,
                        $windowEnd,
                        $_jid,
                        $firstSeg - 1,
                        self::NR_OF_SEGS
                    ));

                    foreach ($segments as $segment) {
                        $post_editing_effort = round((1 - MyMemory::TMS_MATCH($segment['suggestion'], $segment['translation'])) * 100);
                        if ($post_editing_effort < 0) {
                            $post_editing_effort = 0;
                        } elseif ($post_editing_effort > 100) {
                            $post_editing_effort = 100;
                        }

                        $raw_wc_job += $segment['raw_word_count'];
                        $time_to_edit_job += $segment['time_to_edit'];
                        $raw_post_editing_effort_job += $post_editing_effort * $segment['raw_word_count'];
                    }

                    // Pause 100 microseconds between batches.
                    usleep(100);
                }

                // A job without translated words has no effort to average.
                $job_pee = ($raw_wc_job > 0)
                    ? round($raw_post_editing_effort_job / $raw_wc_job, 3)
                    : 0.0;

                Log::doLog("job pee: {$job_pee}\njob time to edit: {$time_to_edit_job}\nWriting into DB");
                echo "job pee: {$job_pee}\njob time to edit: {$time_to_edit_job}\nWriting into DB\n";

                $db->query(sprintf($queryUpdateJob, $job_pee, $time_to_edit_job, $_jid, $_password));

                Log::doLog("done");
                echo "done.\n";

                // Persist the checkpoint before committing; if that fails, undo the update and stop.
                if (!file_put_contents(self::$last_job_file_name, $_jid)) {
                    $db->rollback();
                    Utils::sendErrMailReport("", "[JobPostEditingEffortRunner] Failed to process job {$_jid}");
                    self::$RUNNING = false;
                    break;
                }

                //COMMIT TRANSACTION
                $db->commit();
            }
        }

        // Reported in seconds, as the message says (the value used to be divided by 60).
        $elapsed_seconds = time() - $start;
        Log::doLog("took " . $elapsed_seconds . " seconds");
        echo "took " . $elapsed_seconds . " seconds\n";

        if (self::$RUNNING) {
            Log::doLog("sleeping for 1 month");
            echo "sleeping for 1 month\n";
            sleep(self::$sleeptime);
        }
    } while (self::$RUNNING);
}
/**
 * Builds the editing log of a job: one decorated row per segment plus aggregate statistics.
 *
 * Every row returned by getEditLog() is enriched in place with display fields
 * (formatted time to edit, seconds per word, validity flags, post-editing
 * effort, suggestion view, HTML diff and suggestion source).
 *
 * @param mixed  $jid      job id; forwarded to getEditLog() and copied into each row as 'jid'
 * @param string $password job password; forwarded to getEditLog()
 *
 * @return array|false array($data, $stats), or false when getEditLog() returns nothing
 */
public static function getEditingLogData($jid, $password)
{
    $data = getEditLog($jid, $password);
    if (!$data) {
        return false;
    }

    // Segments are "valid" for timing statistics only when strictly between these bounds (secs per word).
    $slow_cut = 30;
    $fast_cut = 0.25;

    $stats = array();
    $stats['total-word-count'] = 0;

    // Every accumulator starts as an empty array so array_sum() never receives an undefined variable.
    $stat_rwc = array();
    $stat_valid_rwc = array();
    $stat_valid_tte = array();
    $stat_too_slow = array();
    $stat_too_fast = array();
    $stat_pee = array();
    $stat_mt = array();

    foreach ($data as &$seg) {
        $seg['source'] = trim($seg['source']);
        $seg['sm'] .= "%";
        $seg['jid'] = $jid;

        $tte = self::parse_time_to_edit($seg['tte']);
        $seg['time_to_edit'] = "{$tte['1']}m:{$tte['2']}s";

        // The raw (unclamped) word count feeds the edited-word total.
        $stat_rwc[] = $seg['rwc'];

        // By definition we cannot have a 0 word sentence. It is probably a "-" or a tag,
        // so we want to consider at least a word.
        if ($seg['rwc'] < 1) {
            $seg['rwc'] = 1;
        }

        $seg['secs-per-word'] = round($seg['tte'] / 1000 / $seg['rwc'], 1);

        if ($seg['secs-per-word'] < $slow_cut && $seg['secs-per-word'] > $fast_cut) {
            $seg['stats-valid'] = 'Yes';
            $seg['stats-valid-color'] = '';
            $seg['stats-valid-style'] = '';
            $stat_valid_rwc[] = $seg['rwc'];
            $stat_valid_tte[] = $seg['tte'];
        } else {
            $seg['stats-valid'] = 'No';
            $seg['stats-valid-color'] = '#ee6633';
            $seg['stats-valid-style'] = 'border:2px solid #EE6633';
        }

        // Stats
        if ($seg['secs-per-word'] >= $slow_cut) {
            $stat_too_slow[] = $seg['rwc'];
        }
        if ($seg['secs-per-word'] <= $fast_cut) {
            $stat_too_fast[] = $seg['rwc'];
        }

        // Post-editing effort, clamped to [0, 100] and weighted by word count for the average.
        $seg['pe_effort_perc'] = round((1 - MyMemory::TMS_MATCH($seg['sug'], $seg['translation'])) * 100);
        if ($seg['pe_effort_perc'] < 0) {
            $seg['pe_effort_perc'] = 0;
        }
        if ($seg['pe_effort_perc'] > 100) {
            $seg['pe_effort_perc'] = 100;
        }
        $stat_pee[] = $seg['pe_effort_perc'] * $seg['rwc'];
        $seg['pe_effort_perc'] .= "%";

        $seg['sug_view'] = html_entity_decode($seg['sug']);

        $seg['diff'] = ($seg['sug'] != $seg['translation'])
            ? MyMemory::diff_html($seg['sug'], $seg['translation'])
            : '';

        // BUG: While suggestions source is not correctly set
        if ($seg['sm'] == "85%" || $seg['sm'] == "86%") {
            $seg['ss'] = 'Machine Translation';
            $stat_mt[] = $seg['rwc'];
        } else {
            $seg['ss'] = 'Translation Memory';
        }
    }
    // Break the reference so later use of $seg cannot overwrite the last row of $data.
    unset($seg);

    $stats['edited-word-count'] = array_sum($stat_rwc);
    $stats['valid-word-count'] = array_sum($stat_valid_rwc);

    $has_edited_words = $stats['edited-word-count'] > 0;

    if ($has_edited_words) {
        $stats['too-slow-words'] = round(array_sum($stat_too_slow) / $stats['edited-word-count'], 2) * 100;
        $stats['too-fast-words'] = round(array_sum($stat_too_fast) / $stats['edited-word-count'], 2) * 100;
        $stats['avg-pee'] = round(array_sum($stat_pee) / array_sum($stat_rwc)) . "%";
    }

    // Guarded: the raw word counts can all be 0, which used to divide by zero here.
    $stats['mt-words'] = $has_edited_words
        ? round(array_sum($stat_mt) / $stats['edited-word-count'], 2) * 100
        : 0.0;
    $stats['tm-words'] = 100 - $stats['mt-words'];

    $stats['total-valid-tte'] = round(array_sum($stat_valid_tte) / 1000);

    // Weighted average; 0 when no segment passed the validity cut-offs.
    $stats['avg-secs-per-word'] = ($stats['valid-word-count'] > 0)
        ? round($stats['total-valid-tte'] / $stats['valid-word-count'], 1)
        : 0.0;

    $words_per_day = ($stats['avg-secs-per-word'] != 0)
        ? round(3600 * 8 / $stats['avg-secs-per-word'])
        : 0;
    $stats['est-words-per-day'] = number_format($words_per_day, 0, '.', ',');

    // Last minute formatting (after calculations)
    $temp = self::parse_time_to_edit(round(array_sum($stat_valid_tte)));
    $stats['total-valid-tte'] = "{$temp['0']}h:{$temp['1']}m:{$temp['2']}s";

    return array($data, $stats);
}
/**
 * Post-editing effort of this segment as a percentage.
 *
 * Computed as round((1 - MyMemory::TMS_MATCH(suggestion, translation)) * 100);
 * values below 0 yield 0 and values above 100 yield 100.
 *
 * @return float|int
 */
public function getPEE()
{
    $match_score = MyMemory::TMS_MATCH($this->suggestion, $this->translation);
    $effort = round((1 - $match_score) * 100);

    if ($effort < 0) {
        return 0;
    }

    if ($effort > 100) {
        return 100;
    }

    return $effort;
}