Ejemplo n.º 1
0
 /**
  * Parses the fixture "file-with-notes-and-malicious-code.xliff" and checks that
  * the raw content of the first note of trans-unit 1 in file 3 is exposed by
  * the parser exactly as the <script> snippet expected below.
  */
 function testFileWithMaliciousNote()
 {
     // Load the fixture from the test resources
     $fixture_path = test_file_path('xliff/file-with-notes-and-malicious-code.xliff');
     $raw_xliff = file_get_contents($fixture_path);

     // Run the parser on the raw file content
     $parser = new Xliff_Parser();
     $parsed = $parser->Xliff2Array($raw_xliff);

     // The note content must come back untouched
     $note_content = $parsed['files'][3]['trans-units'][1]['notes'][0]['raw-content'];
     $this->assertEquals("<script>alert('This is malicious code');</script>", $note_content);
 }
Ejemplo n.º 2
0
 /**
  * Extract sources and pre-translations from an XLIFF / SDLXLIFF file and put them in the database.
  *
  * Steps performed:
  *  1. parse the content with Xliff_Parser::Xliff2Array();
  *  2. build one SQL value tuple per segment (one per <seg-source> entry when the
  *     trans-unit is already segmented, otherwise one per trans-unit);
  *  3. insert the tuples into the `segments` table in chunks of 200 rows;
  *  4. for every collected pre-translation, look up the freshly inserted segment
  *     rows of this file and store their id / internal_id / segment_hash in
  *     $this->projectStructure['translations'].
  *
  * @param string $xliff_file_content raw XLIFF content handed to the parser
  * @param int    $fid                id of the file; it is interpolated unquoted into the SQL,
  *                                   so it is expected to be numeric
  *
  * @throws Exception code -4 when parsing fails, -1 when no visible segment is found,
  *                   -2 when an INSERT fails
  */
 protected function _extractSegments($xliff_file_content, $fid)
 {
     //create Structure for multiple files
     $this->projectStructure['segments']->offsetSet($fid, new ArrayObject(array()));
     $xliff_obj = new Xliff_Parser();
     $xliff = $xliff_obj->Xliff2Array($xliff_file_content);
     // Checking that parsing went well
     if (isset($xliff['parser-errors']) or !isset($xliff['files'])) {
         // 'parser-errors' may be missing when only the 'files' key is absent:
         // never hand an undefined value to join()
         $parser_errors = isset($xliff['parser-errors']) ? (array) $xliff['parser-errors'] : array();
         Log::doLog("Xliff Import: Error parsing. " . join("\n", $parser_errors));
         throw new Exception("Xliff Import: Error parsing. Check Log file.", -4);
     }
     //needed to check if a file has only one segment
     //for correctness: we could have more tag files in the xliff
     $fileCounter_Show_In_Cattool = 0;
     // Creating the Query
     foreach ($xliff['files'] as $xliff_file) {
         if (!array_key_exists('trans-units', $xliff_file)) {
             continue;
         }
         //extract internal reference base64 files and store their index in $this->projectStructure
         $this->_extractFileReferences($fid, $xliff_file);
         foreach ($xliff_file['trans-units'] as $xliff_trans_unit) {
             if (!isset($xliff_trans_unit['attr']['translate'])) {
                 $xliff_trans_unit['attr']['translate'] = 'yes';
             }
             if ($xliff_trans_unit['attr']['translate'] == "no") {
                 //No segments to translate: no row is built and the global counter
                 //'$fileCounter_Show_In_Cattool' is not incremented
                 continue;
             }
             //number of visible (show_in_cattool = 1) rows produced by this trans-unit
             $visible_in_unit = 0;
             // If the XLIFF is already segmented (has <seg-source>)
             if (isset($xliff_trans_unit['seg-source'])) {
                 foreach ($xliff_trans_unit['seg-source'] as $position => $seg_source) {
                     //reset the flag for every segment: otherwise one empty mrk would
                     //hide all the following segments of the same trans-unit
                     $show_in_cattool = 1;
                     $tempSeg = strip_tags($seg_source['raw-content']);
                     $tempSeg = trim($tempSeg);
                     //init tags
                     $seg_source['mrk-ext-prec-tags'] = '';
                     $seg_source['mrk-ext-succ-tags'] = '';
                     if (is_null($tempSeg) || $tempSeg === '') {
                         //nothing but tags / white space: store the row but hide it
                         $show_in_cattool = 0;
                     } else {
                         //_strip_external() returns the keys 'prec', 'succ' and 'seg'
                         $extract_external = $this->_strip_external($seg_source['raw-content']);
                         $seg_source['mrk-ext-prec-tags'] = $extract_external['prec'];
                         $seg_source['mrk-ext-succ-tags'] = $extract_external['succ'];
                         $seg_source['raw-content'] = $extract_external['seg'];
                         if (isset($xliff_trans_unit['seg-target'][$position]['raw-content'])) {
                             $target_extract_external = $this->_strip_external($xliff_trans_unit['seg-target'][$position]['raw-content']);
                             //we don't want THE CONTENT OF TARGET TAG IF PRESENT and EQUAL TO SOURCE???
                             //AND IF IT IS ONLY A CHAR? like "*" ?
                             //we can't distinguish if it is translated or not
                             //this means that we lose the tags id inside the target if different from source
                             $src = strip_tags(html_entity_decode($extract_external['seg'], ENT_QUOTES, 'UTF-8'));
                             $trg = strip_tags(html_entity_decode($target_extract_external['seg'], ENT_QUOTES, 'UTF-8'));
                             //a target is kept as pre-translation only when its text differs
                             //from the source text and the source text is not numeric
                             if ($src != $trg && !is_numeric($src)) {
                                 $target_extract_external['seg'] = CatUtils::raw2DatabaseXliff($target_extract_external['seg']);
                                 $target = $this->dbHandler->escape($target_extract_external['seg']);
                                 //add an empty string to avoid casting to int: 0001 -> 1
                                 //useful for idiom internal xliff id
                                 $this->projectStructure['translations']->offsetSet("" . $xliff_trans_unit['attr']['id'], new ArrayObject(array(2 => $target)));
                                 //seg-source and target translation can have different mrk id
                                 //override the seg-source surrounding mrk-id with them of target
                                 $seg_source['mrk-ext-prec-tags'] = $target_extract_external['prec'];
                                 $seg_source['mrk-ext-succ-tags'] = $target_extract_external['succ'];
                             }
                         }
                     }
                     $mid = $this->dbHandler->escape($seg_source['mid']);
                     $ext_tags = $this->dbHandler->escape($seg_source['ext-prec-tags']);
                     $source = $this->dbHandler->escape(CatUtils::raw2DatabaseXliff($seg_source['raw-content']));
                     //here the hash is computed on the raw content, before DB conversion and escaping
                     $source_hash = $this->dbHandler->escape(md5($seg_source['raw-content']));
                     $ext_succ_tags = $this->dbHandler->escape($seg_source['ext-succ-tags']);
                     $num_words = CatUtils::segment_raw_wordcount($seg_source['raw-content'], $xliff_file['attr']['source-language']);
                     $trans_unit_id = $this->dbHandler->escape($xliff_trans_unit['attr']['id']);
                     $mrk_ext_prec_tags = $this->dbHandler->escape($seg_source['mrk-ext-prec-tags']);
                     $mrk_ext_succ_tags = $this->dbHandler->escape($seg_source['mrk-ext-succ-tags']);
                     if ($this->projectStructure['file_references']->offsetExists($fid)) {
                         $file_reference = (int) $this->projectStructure['file_references'][$fid];
                     } else {
                         $file_reference = 'NULL';
                     }
                     $this->projectStructure['segments'][$fid]->append("('{$trans_unit_id}',{$fid},{$file_reference},'{$source}','{$source_hash}',{$num_words},'{$mid}','{$ext_tags}','{$ext_succ_tags}',{$show_in_cattool},'{$mrk_ext_prec_tags}','{$mrk_ext_succ_tags}')");
                     $visible_in_unit += $show_in_cattool;
                 }
             } else {
                 //not segmented: the whole <source> is one segment
                 $show_in_cattool = 1;
                 $tempSeg = strip_tags($xliff_trans_unit['source']['raw-content']);
                 $tempSeg = trim($tempSeg);
                 $prec_tags = null;
                 $succ_tags = null;
                 //strict comparison, as in the <seg-source> branch: empty() would also
                 //hide a segment whose whole text is "0"
                 if ($tempSeg === '') {
                     $show_in_cattool = 0;
                 } else {
                     $extract_external = $this->_strip_external($xliff_trans_unit['source']['raw-content']);
                     $prec_tags = empty($extract_external['prec']) ? null : $extract_external['prec'];
                     $succ_tags = empty($extract_external['succ']) ? null : $extract_external['succ'];
                     $xliff_trans_unit['source']['raw-content'] = $extract_external['seg'];
                     if (isset($xliff_trans_unit['target']['raw-content'])) {
                         $target_extract_external = $this->_strip_external($xliff_trans_unit['target']['raw-content']);
                         //a target identical to the source is not stored as pre-translation
                         if ($xliff_trans_unit['source']['raw-content'] != $target_extract_external['seg']) {
                             $target = CatUtils::raw2DatabaseXliff($target_extract_external['seg']);
                             $target = $this->dbHandler->escape($target);
                             //add an empty string to avoid casting to int: 0001 -> 1
                             //useful for idiom internal xliff id
                             $this->projectStructure['translations']->offsetSet("" . $xliff_trans_unit['attr']['id'], new ArrayObject(array(2 => $target)));
                         }
                     }
                 }
                 $source = $xliff_trans_unit['source']['raw-content'];
                 //the word count is computed on the raw source, before DB conversion and escaping
                 $num_words = CatUtils::segment_raw_wordcount($source, $xliff_file['attr']['source-language']);
                 //applying escaping after raw count
                 $source = $this->dbHandler->escape(CatUtils::raw2DatabaseXliff($source));
                 //NOTE(review): here the hash is computed on the converted + escaped source, while
                 //the <seg-source> branch hashes the raw content - confirm which one is intended
                 $source_hash = $this->dbHandler->escape(md5($source));
                 $trans_unit_id = $this->dbHandler->escape($xliff_trans_unit['attr']['id']);
                 if (!is_null($prec_tags)) {
                     $prec_tags = $this->dbHandler->escape($prec_tags);
                 }
                 if (!is_null($succ_tags)) {
                     $succ_tags = $this->dbHandler->escape($succ_tags);
                 }
                 if ($this->projectStructure['file_references']->offsetExists($fid)) {
                     $file_reference = (int) $this->projectStructure['file_references'][$fid];
                 } else {
                     $file_reference = 'NULL';
                 }
                 //null prec/succ tags are interpolated as empty strings
                 $this->projectStructure['segments'][$fid]->append("('{$trans_unit_id}',{$fid}, {$file_reference},'{$source}','{$source_hash}',{$num_words},NULL,'{$prec_tags}','{$succ_tags}',{$show_in_cattool},NULL,NULL)");
                 $visible_in_unit = $show_in_cattool;
             }
             //increment the counter for not empty segments
             $fileCounter_Show_In_Cattool += $visible_in_unit;
         }
     }
     //fail when the file produced no visible segment at all
     if (empty($this->projectStructure['segments'][$fid]) || $fileCounter_Show_In_Cattool == 0) {
         Log::doLog("Segment import - no segments found\n");
         throw new Exception("Segment import - no segments found", -1);
     }
     $baseQuery = "INSERT INTO segments ( internal_id, id_file, id_file_part, segment, segment_hash, raw_word_count, xliff_mrk_id, xliff_ext_prec_tags, xliff_ext_succ_tags, show_in_cattool,xliff_mrk_ext_prec_tags,xliff_mrk_ext_succ_tags) values ";
     Log::doLog("Segments: Total Rows to insert: " . count($this->projectStructure['segments'][$fid]));
     //split the query in to chunks if there are too much segments
     $this->projectStructure['segments'][$fid]->exchangeArray(array_chunk($this->projectStructure['segments'][$fid]->getArrayCopy(), 200));
     Log::doLog("Segments: Total Queries to execute: " . count($this->projectStructure['segments'][$fid]));
     foreach ($this->projectStructure['segments'][$fid] as $i => $chunk) {
         $this->dbHandler->query($baseQuery . join(",\n", $chunk));
         Log::doLog("Segments: Executed Query " . ($i + 1));
         if ($this->dbHandler->get_error_number()) {
             Log::doLog("Segment import - DB Error: " . mysql_error() . " - \n");
             //$chunk is an array of value tuples: join it explicitly instead of
             //interpolating it (which would only print "Array")
             $chunk_dump = join(",\n", $chunk);
             throw new Exception("Segment import - DB Error: " . mysql_error() . " - {$chunk_dump}", -2);
         }
     }
     //complete the collected pre-translations with the data of the inserted rows
     if (!empty($this->projectStructure['translations'])) {
         $last_segments_query = "SELECT id, internal_id, segment_hash from segments WHERE id_file = %u";
         $last_segments_query = sprintf($last_segments_query, $fid);
         $_last_segments = $this->dbHandler->fetch_array($last_segments_query);
         foreach ($_last_segments as $row) {
             if ($this->projectStructure['translations']->offsetExists("" . $row['internal_id'])) {
                 $this->projectStructure['translations']["" . $row['internal_id']]->offsetSet(0, $row['id']);
                 $this->projectStructure['translations']["" . $row['internal_id']]->offsetSet(1, $row['internal_id']);
                 //WARNING offset 2 are the target translations
                 $this->projectStructure['translations']["" . $row['internal_id']]->offsetSet(3, $row['segment_hash']);
             }
         }
     }
 }
Ejemplo n.º 3
0
 /**
  * Escape everything in $content that is not a recognised XLIFF inline tag.
  *
  * This function exists because many developers started adding html tags
  * directly into the XLIFF source since:
  *  1) XLIFF tag remapping is too complex for them
  *  2) Trados does not lock tags within the <source> that are expressed as
  *     &lt;b&gt; but is tolerant to html tags in <source>
  *
  * In short, people typed:
  *   <source>The <b>red</b> house</source> or, worse, <source>5 > 3</source>
  * instead of
  *   <source>The <g id="1">red</g> house.</source> and <source>5 &gt; 3</source>
  *
  * What the function does:
  *   <g id="1">Hello</g>, 4 > 3         -> <g id="1">Hello</g>, 4 &gt; 3
  *   <g id="1">Hello</g>, 4 > 3 &gt; 2  -> <g id="1">Hello</g>, 4 &gt; 3 &gt; 2
  * (entities that are already encoded are not encoded a second time).
  *
  * How: every XLIFF tag is swapped with a placeholder, the remaining text is
  * passed through htmlspecialchars(), then the placeholders are swapped back.
  *
  * @param string $content segment content, possibly containing raw html / special chars
  *
  * @return string the content with only the XLIFF tags left unescaped
  */
 public static function fix_non_well_formed_xml($content)
 {
     // The regexp is built once and cached in a static property
     if (self::$find_xliff_tags_reg === null) {
         // Tags that must survive the escaping, as a regexp alternation ("g|x|bx|...")
         $xliff_tags_reg_list = implode('|', array('g', 'x', 'bx', 'ex', 'bpt', 'ept', 'ph', 'it', 'mrk'));
         // Anatomy of the regexp:
         //   </?          tag start, opening or closing (optional slash)
         //   (g|x|...)    one of the XLIFF tag names listed above
         //   (\s[^>]*)?   optional attributes; the mandatory leading white space
         //                keeps "g" from matching a tag like "gblabla"
         //   /?>          tag end, with optional slash for self-closing tags
         // About spaces inside tags see http://www.w3.org/TR/REC-xml/#sec-starttags :
         // no space is allowed between '<' (or '</') and the tag name, nor inside
         // '/>', but white space may follow the tag name.
         self::$find_xliff_tags_reg = "#</?({$xliff_tags_reg_list})(\\s[^>]*)?/?>#si";
     }

     // Collect every XLIFF tag occurrence
     preg_match_all(self::$find_xliff_tags_reg, $content, $found);

     // Map each distinct tag to a placeholder; a repeated tag keeps its first
     // position in the map and takes the index of its last occurrence
     $placeholder_by_tag = array();
     foreach ((array) $found[0] as $index => $xliff_tag) {
         $placeholder_by_tag[$xliff_tag] = "#@!XLIFF-TAG-{$index}!@#";
     }
     $xliff_tags = array_keys($placeholder_by_tag);
     $placeholders = array_values($placeholder_by_tag);

     // Hide the XLIFF tags (array-form str_replace applies the pairs one after the other, in order)
     $masked = str_replace($xliff_tags, $placeholders, $content);

     // Escape whatever is left, without double-encoding existing entities
     $escaped = htmlspecialchars($masked, ENT_QUOTES, 'UTF-8', false);

     // Bring the original XLIFF tags back
     return str_replace($placeholders, $xliff_tags, $escaped);
     /*
     Informal test table left by the original author (input => expected output).
     TODO: move this code to a real unit-test ASAP.

     $tests = array(
         '' => '',
         'just text' => 'just text',
         '<gap>Hey</gap>' => '&lt;gap&gt;Hey&lt;/gap&gt;',
         '<mrk>Hey</mrk>' => '<mrk>Hey</mrk>',
         '<g >Hey</g >' => '<g >Hey</g >',
         '<g    >Hey</g   >' => '<g    >Hey</g   >',
         '<g id="99">Hey</g>' => '<g id="99">Hey</g>',
         'Hey<x/>' => 'Hey<x/>',
         'Hey<x />' => 'Hey<x />',
         'Hey<x   />' => 'Hey<x   />',
         'Hey<x id="15"/>' => 'Hey<x id="15"/>',
         'Hey<bx id="1"/>' => 'Hey<bx id="1"/>',
         'Hey<ex id="1"/>' => 'Hey<ex id="1"/>',
         '<bpt id="1">Hey</bpt>' => '<bpt id="1">Hey</bpt>',
         '<ept id="1">Hey</ept>' => '<ept id="1">Hey</ept>',
         '<ph id="1">Hey</ph>' => '<ph id="1">Hey</ph>',
         '<it id="1">Hey</it>' => '<it id="1">Hey</it>',
         '<mrk mid="3" mtype="seg"><g id="2">Hey man! <x id="1"/><b id="dunno">Hey man & hey girl!</b></mrk>' => '<mrk mid="3" mtype="seg"><g id="2">Hey man! <x id="1"/>&lt;b id=&quot;dunno&quot;&gt;Hey man &amp; hey girl!&lt;/b&gt;</mrk>',
     );

     foreach ($tests as $in => $expected) {
         $out = self::fix_non_well_formed_xml($in);
         if (strcmp($out, $expected) !== 0) {
             echo "ERROR!\nInput:    $in\nOutput:   $out\nExpected: $expected\n";
         }
     }
     */
 }
Ejemplo n.º 4
0
/**
 * Parse an XLIFF file and insert its segments (and existing targets) in the database.
 *
 * For every translatable trans-unit one row per segment is inserted into
 * `segments`: one row per <seg-source> entry when the unit is already segmented,
 * otherwise a single row for the whole <source>. When the trans-unit carries a
 * non-empty <target>, a TRANSLATED row is also inserted into
 * `segment_translations`, linked to the last segment inserted for that unit.
 *
 * NOTE(review): this function relies on the legacy mysql_* extension.
 *
 * @param string $files_path directory containing the file
 * @param string $file       file name; only .xliff, .sdlxliff and .xlf are accepted
 * @param mixed  $pid        not used by this function
 * @param int    $fid        file id; interpolated unquoted into the SQL, so it is expected to be numeric
 * @param mixed  $jid        job id stored with the pre-translations; interpolated unescaped
 *
 * @return bool true when at least one translatable trans-unit was processed and every
 *              query succeeded; false on unsupported extension, unreadable file,
 *              parse error, DB error, or when nothing translatable was found
 */
function extractSegments($files_path, $file, $pid, $fid, $jid)
{
    $mysql_hostname = INIT::$DB_SERVER;
    // Database Server machine
    $mysql_database = INIT::$DB_DATABASE;
    // Database Name
    $mysql_username = INIT::$DB_USER;
    // Database User
    $mysql_password = INIT::$DB_PASS;
    // Database Password
    $mysql_link = mysql_connect($mysql_hostname, $mysql_username, $mysql_password);
    // Stop right away when the database is not reachable: every following
    // escape / query call would fail anyway
    if (!$mysql_link || !mysql_select_db($mysql_database, $mysql_link)) {
        log::doLog("File import - DB Error: " . mysql_error());
        return false;
    }
    // Checking Extentions (a file without extension has no 'extension' key)
    $info = pathinfo($file);
    $extension = isset($info['extension']) ? $info['extension'] : '';
    if (!in_array($extension, array('xliff', 'sdlxliff', 'xlf'), true)) {
        log::doLog("Xliff Import: Extension " . $extension . " not managed");
        return false;
    }
    $content = file_get_contents("{$files_path}/{$file}");
    if ($content === false) {
        log::doLog("Xliff Import: Unable to read file {$files_path}/{$file}");
        return false;
    }
    $xliff_obj = new Xliff_Parser();
    $xliff = $xliff_obj->Xliff2Array($content);
    // Checking that parsing went well
    if (isset($xliff['parser-errors']) or !isset($xliff['files'])) {
        // 'parser-errors' may be missing when only the 'files' key is absent
        $parser_errors = isset($xliff['parser-errors']) ? (array) $xliff['parser-errors'] : array();
        log::doLog("Xliff Import: Error parsing. " . join("\n", $parser_errors));
        return false;
    }
    $segment_insert_head = "INSERT INTO segments (internal_id,id_file, segment, raw_word_count, xliff_mrk_id, xliff_ext_prec_tags, xliff_ext_succ_tags, show_in_cattool)\n                             values ";
    $translation_insert_head = "INSERT INTO segment_translations (id_segment, id_job,status, translation, translation_date, time_to_edit, match_type, context_hash, eq_word_count, suggestions_array, suggestion, suggestion_match, suggestion_source, suggestion_position)\n                                 values ";
    // $processed: at least one translatable trans-unit was handled
    // $failed:    at least one query failed (sticky, a later success does not clear it)
    $processed = false;
    $failed = false;
    foreach ($xliff['files'] as $xliff_file) {
        if (!isset($xliff_file['trans-units'])) {
            continue;
        }
        foreach ($xliff_file['trans-units'] as $xliff_trans_unit) {
            if (!isset($xliff_trans_unit['attr']['translate'])) {
                $xliff_trans_unit['attr']['translate'] = 'yes';
            }
            if ($xliff_trans_unit['attr']['translate'] == "no") {
                log::doLog("Xliff Import: Skipping segment marked as non-translatable: " . $xliff_trans_unit['source']['raw-content']);
                continue;
            }
            $processed = true;
            // One value tuple per segment of this trans-unit
            $segment_tuples = array();
            // If the XLIFF is already segmented (has <seg-source>)
            if (isset($xliff_trans_unit['seg-source'])) {
                foreach ($xliff_trans_unit['seg-source'] as $seg_source) {
                    // A segment made only of tags / white space is stored but hidden.
                    // Strict comparison: empty() would also hide the text "0"
                    $tempSeg = trim(stripTagsFromSource2($seg_source['raw-content']));
                    $show_in_cattool = ($tempSeg === '') ? 0 : 1;
                    $mid = mysql_real_escape_string($seg_source['mid']);
                    $ext_tags = mysql_real_escape_string($seg_source['ext-prec-tags']);
                    $source = mysql_real_escape_string($seg_source['raw-content']);
                    $ext_succ_tags = mysql_real_escape_string($seg_source['ext-succ-tags']);
                    $num_words = CatUtils::segment_raw_wordcount($seg_source['raw-content']);
                    $trans_unit_id = mysql_real_escape_string($xliff_trans_unit['attr']['id']);
                    // Collect every segment: assigning to a single variable here
                    // would keep only the last one
                    $segment_tuples[] = "('{$trans_unit_id}',{$fid},'{$source}',{$num_words},'{$mid}','{$ext_tags}','{$ext_succ_tags}',{$show_in_cattool})";
                }
            } else {
                $tempSeg = trim(stripTagsFromSource2($xliff_trans_unit['source']['raw-content']));
                $show_in_cattool = ($tempSeg === '') ? 0 : 1;
                $source = mysql_real_escape_string($xliff_trans_unit['source']['raw-content']);
                $num_words = CatUtils::segment_raw_wordcount($xliff_trans_unit['source']['raw-content']);
                $trans_unit_id = mysql_real_escape_string($xliff_trans_unit['attr']['id']);
                $segment_tuples[] = "('{$trans_unit_id}',{$fid},'{$source}',{$num_words},NULL,NULL,NULL,{$show_in_cattool})";
            }
            // Executing the Queries, one INSERT per segment so that mysql_insert_id()
            // below refers to the last segment of this trans-unit
            $last_insert_ok = false;
            foreach ($segment_tuples as $segment_tuple) {
                $query_segment = $segment_insert_head . $segment_tuple;
                $res = mysql_query($query_segment, $mysql_link);
                $last_insert_ok = (bool) $res;
                if (!$res) {
                    log::doLog("File import - DB Error: " . mysql_error() . " - {$query_segment}\n");
                    $failed = true;
                }
            }
            // Without a freshly inserted segment there is no valid id to attach a translation to
            if (!$last_insert_ok || !isset($xliff_trans_unit['target'])) {
                continue;
            }
            $target = mysql_real_escape_string($xliff_trans_unit['target']['raw-content']);
            if (!empty($target)) {
                $last_id = mysql_insert_id($mysql_link);
                // The target is stored both as translation and as suggestion
                $query_segment_translations = $translation_insert_head . "('{$last_id}', '{$jid}','TRANSLATED','{$target}',NULL,NULL,NULL,NULL, NULL, NULL, '{$target}',NULL,NULL,NULL)";
                $res2 = mysql_query($query_segment_translations, $mysql_link);
                if (!$res2) {
                    log::doLog("File import - DB Error: " . mysql_error() . " - {$query_segment_translations}\n");
                    $failed = true;
                }
            }
        }
    }
    return $processed && !$failed;
}