/**
 * Return a cleaned IPTC value.
 *
 * Leading NUL bytes are dropped and any remaining NUL byte is replaced by a
 * space. When the value still contains bytes >= 0x80 it is handed to the
 * 'clean_iptc_value' trigger and, unless qualify_utf8() reports 0 for the
 * result, converted to the charset returned by get_pwg_charset().
 *
 * @param string $value raw IPTC field value
 * @return string cleaned value
 */
function clean_iptc_value($value)
{
    // strip leading NUL bytes (weird Kodak Scanner software); ltrim() does
    // this in one pass instead of re-copying the string once per NUL byte
    $value = ltrim($value, "\0");

    // the remaining binary nulls become plain spaces
    $value = str_replace("\0", ' ', $value);

    if (!preg_match('/[\\x80-\\xff]/', $value)) {
        // 7-bit content only: no charset handling needed
        return $value;
    }

    // apparently mac uses some MacRoman encoding. There is no detection for
    // it here, so a plugin hooked on this trigger should do the trick.
    $value = trigger_change('clean_iptc_value', $value);

    $qual = qualify_utf8($value);
    if ($qual == 0) {
        // the trigger handlers left nothing but ascii
        return $value;
    }

    if ($qual > 0) {
        $input_encoding = 'utf-8';
    } elseif (function_exists('iconv') or function_exists('mb_convert_encoding')) {
        // Prefer windows-1252 over iso-8859-1 when a converter is available:
        // it assigns printable characters (such as the single-character "oe"
        // ligature) to 0x80-0x9F. The price is that genuine ISO-8859-1 bytes
        // in 0x80-0x9F will not convert correctly, but those are control
        // characters which are almost never used.
        $input_encoding = 'windows-1252';
    } else {
        $input_encoding = 'iso-8859-1';
    }

    return convert_charset($value, $input_encoding, get_pwg_charset());
}
// Example #2
// 0
/**
 * Remove accents from a UTF-8 or ISO-8859-1 string (from wordpress)
 *
 * The result of qualify_utf8() selects the code path: 0 is treated as plain
 * ascii (string returned untouched), a positive value selects the UTF-8
 * table, anything else selects the single-byte table.
 *
 * @param string $string
 * @return string
 */
function remove_accents($string)
{
    $utf = qualify_utf8($string);
    if ($utf == 0) {
        // ascii
        return $string;
    }

    if ($utf > 0) {
        // Keys are UTF-8 byte sequences; this file must stay UTF-8 encoded.
        // Characters that editors/normalizers tend to decompose are written
        // as explicit byte escapes so the keys cannot silently degrade.
        $chars = array(
            // Latin-1 Supplement
            "À" => 'A', "Á" => 'A', "Â" => 'A', "Ã" => 'A', "Ä" => 'A', "Å" => 'A',
            "Ç" => 'C',
            "È" => 'E', "É" => 'E', "Ê" => 'E', "Ë" => 'E',
            "Ì" => 'I', "Í" => 'I', "Î" => 'I', "Ï" => 'I',
            "Ñ" => 'N',
            "Ò" => 'O', "Ó" => 'O', "Ô" => 'O', "Õ" => 'O', "Ö" => 'O',
            "Ù" => 'U', "Ú" => 'U', "Û" => 'U', "Ü" => 'U',
            "Ý" => 'Y', "ß" => 's',
            "à" => 'a', "á" => 'a', "â" => 'a', "ã" => 'a', "ä" => 'a', "å" => 'a',
            "ç" => 'c',
            "è" => 'e', "é" => 'e', "ê" => 'e', "ë" => 'e',
            "ì" => 'i', "í" => 'i', "î" => 'i', "ï" => 'i',
            "ñ" => 'n',
            "ò" => 'o', "ó" => 'o', "ô" => 'o', "õ" => 'o', "ö" => 'o',
            "ù" => 'u', "ú" => 'u', "û" => 'u', "ü" => 'u',
            "ý" => 'y', "ÿ" => 'y',
            // Latin Extended-A
            "Ā" => 'A', "ā" => 'a', "Ă" => 'A', "ă" => 'a', "Ą" => 'A', "ą" => 'a',
            "Ć" => 'C', "ć" => 'c', "Ĉ" => 'C', "ĉ" => 'c', "Ċ" => 'C', "ċ" => 'c', "Č" => 'C', "č" => 'c',
            "Ď" => 'D', "ď" => 'd', "Đ" => 'D', "đ" => 'd',
            "Ē" => 'E', "ē" => 'e', "Ĕ" => 'E', "ĕ" => 'e', "Ė" => 'E', "ė" => 'e',
            "Ę" => 'E', "ę" => 'e', "Ě" => 'E', "ě" => 'e',
            "Ĝ" => 'G', "ĝ" => 'g', "Ğ" => 'G', "ğ" => 'g', "Ġ" => 'G', "ġ" => 'g', "Ģ" => 'G', "ģ" => 'g',
            "Ĥ" => 'H', "ĥ" => 'h', "Ħ" => 'H', "ħ" => 'h',
            "Ĩ" => 'I', "ĩ" => 'i', "Ī" => 'I', "ī" => 'i', "Ĭ" => 'I', "ĭ" => 'i',
            "Į" => 'I', "į" => 'i', "İ" => 'I', "ı" => 'i',
            // U+0132 / U+0133: the IJ ligatures (not the ASCII letter pairs)
            "\xC4\xB2" => 'IJ', "\xC4\xB3" => 'ij',
            "Ĵ" => 'J', "ĵ" => 'j',
            "Ķ" => 'K', "ķ" => 'k', "ĸ" => 'k',
            "Ĺ" => 'L', "ĺ" => 'l', "Ļ" => 'L', "ļ" => 'l', "Ľ" => 'L', "ľ" => 'l',
            "Ŀ" => 'L', "ŀ" => 'l', "Ł" => 'L', "ł" => 'l',
            "Ń" => 'N', "ń" => 'n', "Ņ" => 'N', "ņ" => 'n', "Ň" => 'N', "ň" => 'n',
            // U+0149 (n preceded by apostrophe) is a lowercase letter; also
            // accept its decomposed spelling U+02BC + "n"
            "\xC5\x89" => 'n', "\xCA\xBCn" => 'n',
            // U+014A is capital Eng, U+014B is small eng
            "\xC5\x8A" => 'N', "\xC5\x8B" => 'n',
            "Ō" => 'O', "ō" => 'o', "Ŏ" => 'O', "ŏ" => 'o', "Ő" => 'O', "ő" => 'o',
            "Œ" => 'OE', "œ" => 'oe',
            "Ŕ" => 'R', "ŕ" => 'r', "Ŗ" => 'R', "ŗ" => 'r', "Ř" => 'R', "ř" => 'r',
            "Ś" => 'S', "ś" => 's', "Ŝ" => 'S', "ŝ" => 's', "Ş" => 'S', "ş" => 's', "Š" => 'S', "š" => 's',
            "Ţ" => 'T', "ţ" => 't', "Ť" => 'T', "ť" => 't', "Ŧ" => 'T', "ŧ" => 't',
            "Ũ" => 'U', "ũ" => 'u', "Ū" => 'U', "ū" => 'u', "Ŭ" => 'U', "ŭ" => 'u',
            "Ů" => 'U', "ů" => 'u', "Ű" => 'U', "ű" => 'u', "Ų" => 'U', "ų" => 'u',
            "Ŵ" => 'W', "ŵ" => 'w',
            "Ŷ" => 'Y', "ŷ" => 'y', "Ÿ" => 'Y',
            "Ź" => 'Z', "ź" => 'z', "Ż" => 'Z', "ż" => 'z', "Ž" => 'Z', "ž" => 'z',
            "ſ" => 's',
            // Latin Extended-B (comma-below letters)
            "Ș" => 'S', "ș" => 's', "Ț" => 'T', "ț" => 't',
            // currency signs: euro becomes "E", pound is dropped
            "€" => 'E', "£" => '',
        );

        return strtr($string, $chars);
    }

    // Assume ISO-8859-1 if not UTF-8.
    // Single-byte characters that map to exactly one ascii letter; 'in' and
    // 'out' are parallel strings (same length, same order).
    $chars = array();
    $chars['in'] = chr(128) . chr(131) . chr(138) . chr(142) . chr(154) . chr(158)
        . chr(159) . chr(162) . chr(165) . chr(181) . chr(192) . chr(193) . chr(194)
        . chr(195) . chr(196) . chr(197) . chr(199) . chr(200) . chr(201) . chr(202)
        . chr(203) . chr(204) . chr(205) . chr(206) . chr(207) . chr(209) . chr(210)
        . chr(211) . chr(212) . chr(213) . chr(214) . chr(216) . chr(217) . chr(218)
        . chr(219) . chr(220) . chr(221) . chr(224) . chr(225) . chr(226) . chr(227)
        . chr(228) . chr(229) . chr(231) . chr(232) . chr(233) . chr(234) . chr(235)
        . chr(236) . chr(237) . chr(238) . chr(239) . chr(241) . chr(242) . chr(243)
        . chr(244) . chr(245) . chr(246) . chr(248) . chr(249) . chr(250) . chr(251)
        . chr(252) . chr(253) . chr(255);
    $chars['out'] = "EfSZszYcYuAAAAAACEEEEIIIINOOOOOOUUUUYaaaaaaceeeeiiiinoooooouuuuyy";
    $string = strtr($string, $chars['in'], $chars['out']);

    // Single-byte characters that expand to two ascii letters.
    $double_chars = array();
    $double_chars['in'] = array(chr(140), chr(156), chr(198), chr(208), chr(222), chr(223), chr(230), chr(240), chr(254));
    $double_chars['out'] = array('OE', 'oe', 'AE', 'DH', 'TH', 'ss', 'ae', 'dh', 'th');

    return str_replace($double_chars['in'], $double_chars['out'], $string);
}