示例#1
0
/**
 * Reduce a string to ASCII letters, digits, "_" and "-".
 *
 * If the needed charset conversions are available, a short fixed list of
 * accented Latin characters is transliterated first ("Ä" => "Ae", "ß" => "ss", ...).
 * Whatever htmlentities() then turns into an entity is cut down to the first
 * character of the entity name, every run of other characters becomes a
 * single "-", and "-" is stripped from both ends of the result.
 *
 * @todo dh> IMHO this function should not be included in a file that gets used often/always.
 * @param string Text to convert (handled as being encoded in the global $evo_charset)
 * @return string The converted text
 */
function replace_special_chars($str)
{
    global $evo_charset;

    // Charset handed to htmlentities(); stays at the site charset unless the text gets converted below.
    $entity_charset = $evo_charset;

    if (can_convert_charsets('UTF-8', $evo_charset) && can_convert_charsets('UTF-8', 'ISO-8859-1')) {
        $str = convert_charset($str, 'UTF-8', $evo_charset);
        $entity_charset = 'UTF-8';

        // TODO: add more...?!
        // Characters to transliterate (iso-8859-1) and, at the same position, their ASCII substitutes:
        $accented = array('Ä', 'ä', 'Ö', 'ö', 'Ü', 'ü', 'ß', 'à', 'ç', 'è', 'é', 'ì', 'ò', 'ô', 'ù');
        $ascii = array('Ae', 'ae', 'Oe', 'oe', 'Ue', 'ue', 'ss', 'a', 'c', 'e', 'e', 'i', 'o', 'o', 'u');

        // The text is UTF-8 by now, so the characters to look for have to be converted to UTF-8 as well:
        $needles = array();
        foreach ($accented as $char) {
            $needles[] = convert_charset($char, 'UTF-8', 'ISO-8859-1');
        }
        $str = str_replace($needles, $ascii, $str);
    }

    // Replace HTML entities
    $str = htmlentities($str, ENT_NOQUOTES, $entity_charset);

    // Keep only one char in entities!
    $str = preg_replace('/&(.).+?;/', '$1', $str);

    // Replace non acceptable chars
    $str = preg_replace('/[^A-Za-z0-9_]+/', '-', $str);

    // Remove '-' at start and end:
    return trim($str, '-');
}
 /**
  * Test {@link bpost_count_words()} with ISO-8859-1 and UTF-8 input.
  *
  * The global $evo_charset is switched to the charset under test and put back
  * to its previous value at the end. If charsets cannot be converted, a notice
  * is printed and no assertion is run.
  */
 function test_bpost_count_words()
 {
     global $evo_charset;

     if (!can_convert_charsets('ISO-8859-1', 'UTF-8')) {
         echo 'Skipping tests (cannot convert charsets)...<br />' . "\n";
         return;
     }

     // Remember the configured charset so it can be restored once we are done.
     $charset_backup = $evo_charset;

     // Part 1: the sample texts get converted to ISO-8859-1 before being counted.
     $evo_charset = 'ISO-8859-1';

     $latin1_equation = convert_charset('eine gleichung wie 1 + 2 = 9 /', 'ISO-8859-1', 'UTF-8');
     $this->assertEqual(bpost_count_words($latin1_equation), 3);

     $latin1_umlauts = convert_charset('mixed with the 3 ümläuts: äää ööö üüü ÄÄÄ ÖÖÖ	ÜÜÜ', 'ISO-8859-1', 'UTF-8');
     $this->assertEqual(bpost_count_words($latin1_umlauts), 10);

     // Part 2: the sample texts are passed on as they are.
     $evo_charset = 'UTF-8';

     $utf8_umlauts = 'möre (again 3) ümläüts... öö üü ää ÄÄ ÖÖ ÜÜ';
     $this->assertEqual(bpost_count_words($utf8_umlauts), 9);

     $utf8_russian = 'russian: Расширенные возможности - это удобный';
     $this->assertEqual(bpost_count_words($utf8_russian), 5);

     $utf8_portuguese = 'A versão foi apelidade de Tilqi, porque era aniversário dele. numbers: 42';
     $this->assertEqual(bpost_count_words($utf8_portuguese), 11);

     $utf8_markup = 'HTML tags -> <a href="http://b2evolution.net" target="_blank">visit b2evo!</a>. Some other chars: "\' \' " <<< < >>> > ``` -- versão удобный überladen';
     $this->assertEqual(bpost_count_words($utf8_markup), 10);

     $evo_charset = $charset_backup;
 }
示例#3
0
/**
 * Convert special chars (like german umlauts) to ASCII characters.
 *
 * Processing order:
 *  1. HTML entities in the input are decoded.
 *  2. The 'transliteration_map' of the effective locale (post locale, else
 *     current locale, else default locale) is applied, if there is one.
 *  3. evo_iconv_transliterate() is tried; if it returns false, a small built-in
 *     list of accented chars is replaced instead (provided charsets can be converted).
 *  4. Remaining special chars are turned into entities and reduced to a single
 *     char, all other non [A-Za-z0-9_] runs become "-", and "-" is trimmed
 *     from both ends.
 *
 * @param string Input string to operate on
 * @param NULL|string The post locale or NULL if there is no specific locale.
 *                    Gets passed to evo_iconv_transliterate().
 * @return string The input string with replaced chars.
 */
function replace_special_chars($str, $post_locale = NULL)
{
    global $evo_charset, $default_locale, $current_locale, $locales;

    // Decode entities to be able to transliterate the associated chars:
    // Tblue> TODO: Check if this could have side effects.
    $str = html_entity_decode($str, ENT_NOQUOTES, $evo_charset);

    $our_locale = $post_locale;
    if ($our_locale === NULL) {
        // post locale is not set, try to guess current locale
        if (!empty($default_locale)) {
            $our_locale = $default_locale;
        }
        if (!empty($current_locale)) {
            // Override with current locale if available
            $our_locale = $current_locale;
        }
    }

    if ($our_locale !== NULL && isset($locales[$our_locale]) && !empty($locales[$our_locale]['transliteration_map'])) {
        // Use locale 'transliteration_map' if present
        $transliteration_map = $locales[$our_locale]['transliteration_map'];
        if (!array_key_exists('', $transliteration_map)) {
            // Make sure there's no empty string key, otherwise strtr() returns false
            $tmp_str = strtr($str, $transliteration_map);
            if ($tmp_str !== false) {
                // Use newly transliterated string; a failed strtr() must not wipe out the input
                $str = $tmp_str;
            }
        }
    }

    if (($newstr = evo_iconv_transliterate($str, $post_locale)) !== false) {
        // iconv allows us to get nice URL titles by transliterating non-ASCII chars.
        // Tblue> htmlentities() does not know anything about ASCII?! ISO-8859-1 will work too, though.
        $newstr_charset = 'ISO-8859-1';
    } elseif (can_convert_charsets('UTF-8', $evo_charset) && can_convert_charsets('UTF-8', 'ISO-8859-1')) {
        // Fallback to the limited old method: Transliterate only a few known chars.
        $newstr = convert_charset($str, 'UTF-8', $evo_charset);
        $newstr_charset = 'UTF-8';
        $search = array('Ä', 'ä', 'Ö', 'ö', 'Ü', 'ü', 'ß', 'à', 'ç', 'è', 'é', 'ì', 'ò', 'ô', 'ù');
        // iso-8859-1
        $replace = array('Ae', 'ae', 'Oe', 'oe', 'Ue', 'ue', 'ss', 'a', 'c', 'e', 'e', 'i', 'o', 'o', 'u');
        foreach ($search as $k => $v) {
            // convert $search to UTF-8
            $search[$k] = convert_charset($v, 'UTF-8', 'ISO-8859-1');
        }
        $newstr = str_replace($search, $replace, $newstr);
    } else {
        // Replace HTML entities only.
        $newstr = $str;
        $newstr_charset = $evo_charset;
    }

    // Replace HTML entities
    $newstr = htmlentities($newstr, ENT_NOQUOTES, $newstr_charset);
    // Handle special entities (e.g., use "-" instead of "a" for "&"):
    $newstr = str_replace(array('&amp;', '&laquo;', '&raquo;'), '-', $newstr);
    // Keep only one char in entities!
    $newstr = preg_replace('/&(.).+?;/', '$1', $newstr);
    // Replace non acceptable chars
    $newstr = preg_replace('/[^A-Za-z0-9_]+/', '-', $newstr);
    // Remove '-' at start and end (only [A-Za-z0-9_-] is left at this point):
    $newstr = trim($newstr, '-');

    return $newstr;
}
 /**
  * Check the content of a given URL (referer), if the requested URI (with different hostname variations)
  * is present.
  *
  * The referring page is fetched with a plain "GET ... HTTP/1.0" request over fsockopen()
  * (reading stops after roughly 500kb) and searched for an href pointing to one of our hostnames
  * ($_SERVER['HTTP_HOST'], $_SERVER['SERVER_NAME'], their variants without "www." and their
  * IDN-decoded forms), followed by $uri. If the referer starts with $ReqHost, finding $uri
  * anywhere in the page is enough.
  *
  * @todo Use DB cache to avoid checking the same page again and again! (Plugin DB table)
  *
  * @param string URL of the referring page
  * @param string URI to append to matching pattern for hostnames
  * @return boolean true if a link to us was found; false if not, or if the page could not be fetched
  */
 function is_referer_linking_us($referer, $uri)
 {
     global $ReqHost;
     if (empty($referer)) {
         return false;
     }
     // Load page content (max. 500kb), using fsockopen:
     $url_parsed = @parse_url($referer);
     if (!$url_parsed) {
         return false;
     }
     if (empty($url_parsed['scheme'])) {
         // No scheme given: parse again with an explicit one so that the host part gets recognized.
         $url_parsed = @parse_url('http://' . $referer);
     }
     if (!$url_parsed || empty($url_parsed['host'])) {
         // Without a host there is nothing we could connect to.
         return false;
     }
     $host = $url_parsed['host'];
     $port = empty($url_parsed['port']) ? 80 : $url_parsed['port'];
     $path = empty($url_parsed['path']) ? '/' : $url_parsed['path'];
     if (!empty($url_parsed['query'])) {
         $path .= '?' . $url_parsed['query'];
     }
     $fp = @fsockopen($host, $port, $errno, $errstr, 30);
     if (!$fp) {
         // could not access referring page
         $this->debug_log('is_referer_linking_us(): could not access &laquo;' . $referer . '&raquo; (host: ' . $host . '): ' . $errstr . ' (#' . $errno . ')');
         return false;
     }
     // Set timeout for data:
     if (function_exists('stream_set_timeout')) {
         stream_set_timeout($fp, 20);
     } else {
         // PHP 4
         socket_set_timeout($fp, 20);
     }
     // Send request:
     $out = "GET {$path} HTTP/1.0\r\n";
     $out .= "Host: {$host}:{$port}\r\n";
     $out .= "Connection: Close\r\n\r\n";
     fwrite($fp, $out);
     // Skip headers (giving up after 100 lines), picking up the charset on the way:
     $i = 0;
     $source_charset = 'iso-8859-1';
     // default
     while (($s = fgets($fp, 4096)) !== false) {
         $i++;
         // An empty line ends the headers; accept a bare LF as well as CRLF.
         if (rtrim($s, "\r\n") === '' || $i > 100) {
             break;
         }
         if (preg_match('~^Content-Type:.*?charset=([\\w-]+)~i', $s, $match)) {
             $source_charset = $match[1];
         }
     }
     // Get the refering page's content
     $content_ref_page = '';
     $bytes_read = 0;
     while (($s = fgets($fp, 4096)) !== false) {
         $content_ref_page .= $s;
         $bytes_read += strlen($s);
         if ($bytes_read > 512000) {
             // do not pull more than 500kb of data!
             break;
         }
     }
     fclose($fp);
     if (!strlen($content_ref_page)) {
         $this->debug_log('is_referer_linking_us(): empty $content_ref_page (' . bytesreadable($bytes_read) . ' read)');
         return false;
     }
     $have_idn_name = false;
     // Build the search pattern:
     // We match for basically for 'href="[SERVER][URI]', where [SERVER] is a list of possible hosts (especially IDNA)
     $search_pattern = '~\\shref=["\']?https?://(';
     $possible_hosts = array($_SERVER['HTTP_HOST']);
     if ($_SERVER['SERVER_NAME'] != $_SERVER['HTTP_HOST']) {
         $possible_hosts[] = $_SERVER['SERVER_NAME'];
     }
     $search_pattern_hosts = array();
     foreach ($possible_hosts as $l_host) {
         if (preg_match('~^([^.]+\\.)(.*?)([^.]+\\.[^.]+)$~', $l_host, $match)) {
             // we have subdomains in this hostname
             if (stristr($match[1], 'www')) {
                 // search also for hostname without 'www.'
                 $search_pattern_hosts[] = $match[2] . $match[3];
             }
         }
         $search_pattern_hosts[] = $l_host;
     }
     // array_unique() keeps the original keys, which may leave gaps; re-index to get a plain list again.
     $search_pattern_hosts = array_values(array_unique($search_pattern_hosts));
     // add IDN, because this could be linked:
     $idn_hosts = array();
     foreach ($search_pattern_hosts as $l_host) {
         $l_idn_host = idna_decode($l_host);
         // the decoded puny-code ("xn--..") name (utf8)
         if ($l_idn_host != $l_host) {
             $have_idn_name = true;
             $idn_hosts[] = $l_idn_host;
         }
     }
     // add hosts to pattern, preg_quoted
     $quoted_hosts = array();
     foreach (array_merge($search_pattern_hosts, $idn_hosts) as $l_host) {
         $quoted_hosts[] = preg_quote($l_host, '~');
     }
     $search_pattern .= implode('|', $quoted_hosts) . ')';
     if (empty($uri)) {
         // host(s) should end with "/", "'", '"', "?" or whitespace
         $search_pattern .= '[/"\'\\s?]';
     } else {
         $search_pattern .= preg_quote($uri, '~');
         // URI should end with "'", '"' or whitespace
         $search_pattern .= '["\'\\s]';
     }
     $search_pattern .= '~i';
     if ($have_idn_name) {
         // Convert charset to UTF-8, because the decoded domain name is UTF-8, too:
         if (can_convert_charsets('utf-8', $source_charset)) {
             $content_ref_page = convert_charset($content_ref_page, 'utf-8', $source_charset);
         } else {
             $this->debug_log('is_referer_linking_us(): warning: cannot convert charset of referring page');
         }
     }
     if (preg_match($search_pattern, $content_ref_page)) {
         $this->debug_log('is_referer_linking_us(): found current URL in page (' . bytesreadable($bytes_read) . ' read)');
         return true;
     }
     if (strpos($referer, $ReqHost) === 0 && !empty($uri)) {
         // Referer is the same host.. just search for $uri
         if (strpos($content_ref_page, $uri) !== false) {
             $this->debug_log('is_referer_linking_us(): found current URI in page (' . bytesreadable($bytes_read) . ' read)');
             return true;
         }
     }
     $this->debug_log('is_referer_linking_us(): ' . sprintf('did not find &laquo;%s&raquo; in &laquo;%s&raquo; (%s bytes read).', $search_pattern, $referer, bytesreadable($bytes_read)));
     return false;
 }