/**
 * Checks that a note carrying script markup is returned verbatim by the parser.
 *
 * Parses the fixture 'xliff/file-with-notes-and-malicious-code.xliff' and
 * compares the 'raw-content' of the first note of trans-unit 1 in file 3
 * with the literal script string.
 */
function testFileWithMaliciousNote()
{
    $fixture_path    = test_file_path('xliff/file-with-notes-and-malicious-code.xliff');
    $fixture_content = file_get_contents($fixture_path);

    $parser = new Xliff_Parser();
    $parsed = $parser->Xliff2Array($fixture_content);

    $first_note = $parsed['files'][3]['trans-units'][1]['notes'][0];

    $this->assertEquals(
        "<script>alert('This is malicious code');</script>",
        $first_note['raw-content']
    );
}
/**
 * Extract sources and pre-translations from an XLIFF file and put them in the database.
 *
 * The content is parsed with Xliff_Parser. One SQL value tuple per segment is
 * queued in $this->projectStructure['segments'][$fid] and then inserted into
 * the `segments` table in chunks of 200 rows. Targets that differ from their
 * source are stored in $this->projectStructure['translations'], keyed by the
 * trans-unit id (offset 2), and are completed afterwards with the row id
 * (offset 0), the internal id (offset 1) and the segment hash (offset 3).
 *
 * @param string $xliff_file_content Raw XLIFF content to parse
 * @param mixed  $fid                File id; interpolated unquoted into the SQL, so it is expected to be numeric
 *
 * @throws Exception With code -4 when parsing fails, -1 when no segment to show
 *                   is found and -2 when the database reports an error
 */
protected function _extractSegments($xliff_file_content, $fid)
{
    // Create the structure for this file (a project can hold multiple files)
    $this->projectStructure['segments']->offsetSet($fid, new ArrayObject(array()));

    $xliff_obj = new Xliff_Parser();
    $xliff     = $xliff_obj->Xliff2Array($xliff_file_content);

    // Checking that parsing went well
    if (isset($xliff['parser-errors']) or !isset($xliff['files'])) {
        // 'parser-errors' can be unset when only the 'files' key is missing
        $parser_errors = isset($xliff['parser-errors']) ? (array) $xliff['parser-errors'] : array();
        Log::doLog("Xliff Import: Error parsing. " . join("\n", $parser_errors));
        throw new Exception("Xliff Import: Error parsing. Check Log file.", -4);
    }

    // Number of queued segments that will be shown in the cattool;
    // for correctness: we could have more <file> tags in the xliff
    $fileCounter_Show_In_Cattool = 0;

    // Creating the Query
    foreach ($xliff['files'] as $xliff_file) {

        if (!array_key_exists('trans-units', $xliff_file)) {
            continue;
        }

        // Extract internal reference base64 files and store their index in $this->projectStructure
        $this->_extractFileReferences($fid, $xliff_file);

        // The reference does not change while the trans-units of this <file> are processed
        if ($this->projectStructure['file_references']->offsetExists($fid)) {
            $file_reference = (int) $this->projectStructure['file_references'][$fid];
        } else {
            $file_reference = 'NULL';
        }

        foreach ($xliff_file['trans-units'] as $xliff_trans_unit) {

            if (!isset($xliff_trans_unit['attr']['translate'])) {
                $xliff_trans_unit['attr']['translate'] = 'yes';
            }

            if ($xliff_trans_unit['attr']['translate'] == "no") {
                // No segments to translate: nothing is queued and the counter is not incremented
                continue;
            }

            // If the XLIFF is already segmented (has <seg-source>)
            if (isset($xliff_trans_unit['seg-source'])) {

                // NOTE(review): translations are keyed by trans-unit id, so with several
                // <mrk> in one trans-unit only the last stored target survives - confirm
                // this is intended.
                foreach ($xliff_trans_unit['seg-source'] as $position => $seg_source) {

                    // Reset the flag for every <mrk>: otherwise a single empty <mrk>
                    // would hide all the following ones of the same trans-unit
                    $show_in_cattool = 1;

                    $tempSeg = trim(strip_tags($seg_source['raw-content']));

                    // init tags
                    $seg_source['mrk-ext-prec-tags'] = '';
                    $seg_source['mrk-ext-succ-tags'] = '';

                    if ($tempSeg === '') {
                        $show_in_cattool = 0;
                    } else {
                        $extract_external                = $this->_strip_external($seg_source['raw-content']);
                        $seg_source['mrk-ext-prec-tags'] = $extract_external['prec'];
                        $seg_source['mrk-ext-succ-tags'] = $extract_external['succ'];
                        $seg_source['raw-content']       = $extract_external['seg'];

                        if (isset($xliff_trans_unit['seg-target'][$position]['raw-content'])) {
                            $target_extract_external = $this->_strip_external($xliff_trans_unit['seg-target'][$position]['raw-content']);

                            // A target equal to its source cannot be told apart from an
                            // untranslated one, so it is discarded; this means that the tag
                            // ids inside the target are lost if they differ from the source
                            $src = strip_tags(html_entity_decode($extract_external['seg'], ENT_QUOTES, 'UTF-8'));
                            $trg = strip_tags(html_entity_decode($target_extract_external['seg'], ENT_QUOTES, 'UTF-8'));

                            // A numeric source never stores a target here
                            if ($src != $trg && !is_numeric($src)) {
                                $target_extract_external['seg'] = CatUtils::raw2DatabaseXliff($target_extract_external['seg']);
                                $target                         = $this->dbHandler->escape($target_extract_external['seg']);

                                // Prepend an empty string to avoid casting to int: 0001 -> 1
                                // (useful for idiom internal xliff id)
                                $this->projectStructure['translations']->offsetSet("" . $xliff_trans_unit['attr']['id'], new ArrayObject(array(2 => $target)));

                                // seg-source and target translation can have different mrk id:
                                // override the seg-source surrounding mrk-id with those of the target
                                $seg_source['mrk-ext-prec-tags'] = $target_extract_external['prec'];
                                $seg_source['mrk-ext-succ-tags'] = $target_extract_external['succ'];
                            }
                        }
                    }

                    $mid               = $this->dbHandler->escape($seg_source['mid']);
                    $ext_tags          = $this->dbHandler->escape($seg_source['ext-prec-tags']);
                    $source            = $this->dbHandler->escape(CatUtils::raw2DatabaseXliff($seg_source['raw-content']));
                    $source_hash       = $this->dbHandler->escape(md5($seg_source['raw-content']));
                    $ext_succ_tags     = $this->dbHandler->escape($seg_source['ext-succ-tags']);
                    $num_words         = CatUtils::segment_raw_wordcount($seg_source['raw-content'], $xliff_file['attr']['source-language']);
                    $trans_unit_id     = $this->dbHandler->escape($xliff_trans_unit['attr']['id']);
                    $mrk_ext_prec_tags = $this->dbHandler->escape($seg_source['mrk-ext-prec-tags']);
                    $mrk_ext_succ_tags = $this->dbHandler->escape($seg_source['mrk-ext-succ-tags']);

                    $this->projectStructure['segments'][$fid]->append("('{$trans_unit_id}',{$fid},{$file_reference},'{$source}','{$source_hash}',{$num_words},'{$mid}','{$ext_tags}','{$ext_succ_tags}',{$show_in_cattool},'{$mrk_ext_prec_tags}','{$mrk_ext_succ_tags}')");

                    // Count every visible row, not only the state of the last <mrk>
                    $fileCounter_Show_In_Cattool += $show_in_cattool;
                }

            } else {

                $show_in_cattool = 1;

                $tempSeg = trim(strip_tags($xliff_trans_unit['source']['raw-content']));

                $prec_tags = null;
                $succ_tags = null;

                // NOTE(review): empty() also hides a source made only of "0", while the
                // <seg-source> branch compares against '' - confirm which one is intended.
                if (empty($tempSeg)) {
                    $show_in_cattool = 0;
                } else {
                    $extract_external = $this->_strip_external($xliff_trans_unit['source']['raw-content']);
                    $prec_tags        = empty($extract_external['prec']) ? null : $extract_external['prec'];
                    $succ_tags        = empty($extract_external['succ']) ? null : $extract_external['succ'];

                    $xliff_trans_unit['source']['raw-content'] = $extract_external['seg'];

                    if (isset($xliff_trans_unit['target']['raw-content'])) {
                        $target_extract_external = $this->_strip_external($xliff_trans_unit['target']['raw-content']);

                        // Keep the target only when it differs from the source
                        if ($xliff_trans_unit['source']['raw-content'] != $target_extract_external['seg']) {
                            $target = CatUtils::raw2DatabaseXliff($target_extract_external['seg']);
                            $target = $this->dbHandler->escape($target);

                            // Prepend an empty string to avoid casting to int: 0001 -> 1
                            // (useful for idiom internal xliff id)
                            $this->projectStructure['translations']->offsetSet("" . $xliff_trans_unit['attr']['id'], new ArrayObject(array(2 => $target)));
                        }
                    }
                }

                $source = $xliff_trans_unit['source']['raw-content'];

                // The word count is done on the raw source, before escaping
                $num_words = CatUtils::segment_raw_wordcount($source, $xliff_file['attr']['source-language']);

                // Applying escaping after raw count; the hash is computed on the escaped value
                $source        = $this->dbHandler->escape(CatUtils::raw2DatabaseXliff($source));
                $source_hash   = $this->dbHandler->escape(md5($source));
                $trans_unit_id = $this->dbHandler->escape($xliff_trans_unit['attr']['id']);

                if (!is_null($prec_tags)) {
                    $prec_tags = $this->dbHandler->escape($prec_tags);
                }
                if (!is_null($succ_tags)) {
                    $succ_tags = $this->dbHandler->escape($succ_tags);
                }

                $this->projectStructure['segments'][$fid]->append("('{$trans_unit_id}',{$fid}, {$file_reference},'{$source}','{$source_hash}',{$num_words},NULL,'{$prec_tags}','{$succ_tags}',{$show_in_cattool},NULL,NULL)");

                $fileCounter_Show_In_Cattool += $show_in_cattool;
            }
        }
    }

    // *NOTE*: PHP>=5.3 throws UnexpectedValueException, but PHP 5.2 throws ErrorException
    // use generic
    // count() is used because empty() on an ArrayObject instance is always false
    if (count($this->projectStructure['segments'][$fid]) == 0 || $fileCounter_Show_In_Cattool == 0) {
        Log::doLog("Segment import - no segments found\n");
        throw new Exception("Segment import - no segments found", -1);
    }

    $baseQuery = "INSERT INTO segments ( internal_id, id_file, id_file_part, segment, segment_hash, raw_word_count, xliff_mrk_id, xliff_ext_prec_tags, xliff_ext_succ_tags, show_in_cattool,xliff_mrk_ext_prec_tags,xliff_mrk_ext_succ_tags) values ";

    Log::doLog("Segments: Total Rows to insert: " . count($this->projectStructure['segments'][$fid]));

    // Split the rows into chunks of 200 if there are too many segments for a single query
    $this->projectStructure['segments'][$fid]->exchangeArray(array_chunk($this->projectStructure['segments'][$fid]->getArrayCopy(), 200));

    Log::doLog("Segments: Total Queries to execute: " . count($this->projectStructure['segments'][$fid]));

    foreach ($this->projectStructure['segments'][$fid] as $i => $chunk) {

        $values = join(",\n", $chunk);

        $this->dbHandler->query($baseQuery . $values);

        Log::doLog("Segments: Executed Query " . ($i + 1));

        if ($this->dbHandler->get_error_number()) {
            // NOTE(review): mysql_error() is used although the query goes through
            // $this->dbHandler - confirm both share the same connection.
            Log::doLog("Segment import - DB Error: " . mysql_error() . " - \n");
            // $chunk is an array: report the joined rows instead of the literal "Array"
            throw new Exception("Segment import - DB Error: " . mysql_error() . " - {$values}", -2);
        }
    }

    if (!empty($this->projectStructure['translations'])) {

        $last_segments_query = "SELECT id, internal_id, segment_hash from segments WHERE id_file = %u";
        $last_segments_query = sprintf($last_segments_query, $fid);

        $_last_segments = $this->dbHandler->fetch_array($last_segments_query);

        foreach ($_last_segments as $row) {

            $translation_key = "" . $row['internal_id'];

            if ($this->projectStructure['translations']->offsetExists($translation_key)) {
                $this->projectStructure['translations'][$translation_key]->offsetSet(0, $row['id']);
                $this->projectStructure['translations'][$translation_key]->offsetSet(1, $row['internal_id']);
                // WARNING: offset 2 holds the target translation
                $this->projectStructure['translations'][$translation_key]->offsetSet(3, $row['segment_hash']);
            }
        }
    }
}
/**
 * Legacy XLIFF importer: parses a file and inserts its segments, together
 * with any target already present, straight into the database through the
 * mysql_* API.
 *
 * One row is inserted into `segments` for every <seg-source> entry of a
 * translatable trans-unit, or a single row built from <source> when the unit
 * is not pre-segmented. When the trans-unit has a non-empty target, a
 * TRANSLATED row is added to `segment_translations` for the last segment
 * inserted for that unit.
 *
 * @param string $files_path Directory that contains the file
 * @param string $file       File name; only the xliff, sdlxliff and xlf extensions are handled
 * @param mixed  $pid        Not used by this function
 * @param mixed  $fid        File id written to segments.id_file (interpolated unquoted into the SQL)
 * @param mixed  $jid        Job id written to segment_translations.id_job
 *
 * @return bool true when every query succeeded; false when the extension is
 *              not managed, the file cannot be read or parsed, the database
 *              is unreachable, a query fails, or no translatable trans-unit
 *              is found
 */
function extractSegments($files_path, $file, $pid, $fid, $jid)
{
    $mysql_hostname = INIT::$DB_SERVER;   // Database Server machine
    $mysql_database = INIT::$DB_DATABASE; // Database Name
    $mysql_username = INIT::$DB_USER;     // Database User
    $mysql_password = INIT::$DB_PASS;     // Database Password

    $mysql_link = mysql_connect($mysql_hostname, $mysql_username, $mysql_password);
    if (!$mysql_link || !mysql_select_db($mysql_database, $mysql_link)) {
        Log::doLog("File import - DB Error: " . mysql_error() . "\n");

        return false;
    }

    // Checking the extension (pathinfo() omits the key when the file has none)
    $info      = pathinfo($file);
    $extension = isset($info['extension']) ? $info['extension'] : '';
    if (!in_array($extension, array('xliff', 'sdlxliff', 'xlf'), true)) {
        Log::doLog("Xliff Import: Extension " . $extension . " not managed");

        return false;
    }

    $content = file_get_contents("{$files_path}/{$file}");
    if ($content === false) {
        Log::doLog("Xliff Import: Unable to read {$files_path}/{$file}");

        return false;
    }

    $xliff_obj = new Xliff_Parser();
    $xliff     = $xliff_obj->Xliff2Array($content);

    // Checking that parsing went well
    if (isset($xliff['parser-errors']) or !isset($xliff['files'])) {
        // 'parser-errors' can be unset when only the 'files' key is missing
        $parser_errors = isset($xliff['parser-errors']) ? (array) $xliff['parser-errors'] : array();
        Log::doLog("Xliff Import: Error parsing. " . join("\n", $parser_errors));

        return false;
    }

    $segment_insert_head     = "INSERT INTO segments (internal_id,id_file, segment, raw_word_count, xliff_mrk_id, xliff_ext_prec_tags, xliff_ext_succ_tags, show_in_cattool)\n values ";
    $translation_insert_head = "INSERT INTO segment_translations (id_segment, id_job,status, translation, translation_date, time_to_edit, match_type, context_hash, eq_word_count, suggestions_array, suggestion, suggestion_match, suggestion_source, suggestion_position)\n values ";

    // A failure is sticky: a later successful query must not hide an earlier error
    $ret                = true;
    $translatable_units = 0;

    foreach ($xliff['files'] as $xliff_file) {

        if (!isset($xliff_file['trans-units'])) {
            continue;
        }

        foreach ($xliff_file['trans-units'] as $xliff_trans_unit) {

            $translate = isset($xliff_trans_unit['attr']['translate']) ? $xliff_trans_unit['attr']['translate'] : 'yes';

            if ($translate == "no") {
                Log::doLog("Xliff Import: Skipping segment marked as non-translatable: " . $xliff_trans_unit['source']['raw-content']);
                continue;
            }

            $translatable_units++;

            // Value tuples to insert for this trans-unit, in document order
            $segment_rows  = array();
            $trans_unit_id = mysql_real_escape_string($xliff_trans_unit['attr']['id']);

            // If the XLIFF is already segmented (has <seg-source>)
            if (isset($xliff_trans_unit['seg-source'])) {

                foreach ($xliff_trans_unit['seg-source'] as $seg_source) {
                    $tempSeg         = trim(stripTagsFromSource2($seg_source['raw-content']));
                    $show_in_cattool = empty($tempSeg) ? 0 : 1;

                    $mid           = mysql_real_escape_string($seg_source['mid']);
                    $ext_tags      = mysql_real_escape_string($seg_source['ext-prec-tags']);
                    $source        = mysql_real_escape_string($seg_source['raw-content']);
                    $ext_succ_tags = mysql_real_escape_string($seg_source['ext-succ-tags']);
                    $num_words     = CatUtils::segment_raw_wordcount($seg_source['raw-content']);

                    // Collect every <mrk>: a single overwritten variable would keep only the last one
                    $segment_rows[] = "('{$trans_unit_id}',{$fid},'{$source}',{$num_words},'{$mid}','{$ext_tags}','{$ext_succ_tags}',{$show_in_cattool})";
                }

            } else {

                $tempSeg         = trim(stripTagsFromSource2($xliff_trans_unit['source']['raw-content']));
                $show_in_cattool = empty($tempSeg) ? 0 : 1;

                $source    = mysql_real_escape_string($xliff_trans_unit['source']['raw-content']);
                $num_words = CatUtils::segment_raw_wordcount($xliff_trans_unit['source']['raw-content']);

                $segment_rows[] = "('{$trans_unit_id}',{$fid},'{$source}',{$num_words},NULL,NULL,NULL,{$show_in_cattool})";
            }

            // Rows are inserted one by one so that mysql_insert_id() below refers
            // to the last segment of this trans-unit
            $last_insert_ok = false;
            foreach ($segment_rows as $segment_row) {
                $query_segment  = $segment_insert_head . $segment_row;
                $last_insert_ok = (bool) mysql_query($query_segment, $mysql_link);

                if (!$last_insert_ok) {
                    Log::doLog("File import - DB Error: " . mysql_error($mysql_link) . " - {$query_segment}\n");
                    $ret = false;
                }
            }

            // Without a freshly inserted segment there is no reliable id to attach the target to
            if (!$last_insert_ok || !isset($xliff_trans_unit['target'])) {
                continue;
            }

            $target = mysql_real_escape_string($xliff_trans_unit['target']['raw-content']);
            if (empty($target)) {
                continue;
            }

            $last_id = mysql_insert_id($mysql_link);

            $query_segment_translations = $translation_insert_head . "('{$last_id}', '{$jid}','TRANSLATED','{$target}',NULL,NULL,NULL,NULL, NULL, NULL, '{$target}',NULL,NULL,NULL)";

            if (!mysql_query($query_segment_translations, $mysql_link)) {
                Log::doLog("File import - DB Error: " . mysql_error($mysql_link) . " - {$query_segment_translations}\n");
                $ret = false;
            }
        }
    }

    if ($translatable_units == 0) {
        Log::doLog("Xliff Import: no translatable segments found");

        return false;
    }

    return $ret;
}