/** sanitise a string to make it acceptable as a filename/directoryname
 *
 * This routine more or less borrowed from {@link waslib.php}.
 *
 * Note that this routine too is very ASCII-centric: in the end only
 * ASCII-characters (52 letters, 10 digits and dash, dot and underscore)
 * are allowed in the resulting file/directory name. However, by first
 * mapping UTF-8 to ASCII (getting rid of diacriticals) we can make names
 * more readable.
 *
 * The steps are, in this order:
 *  - convert UTF-8 to ASCII via utf8_strtoascii();
 *  - map spaces, backslashes, slashes, at-signs and colons to underscores;
 *  - drop every character that is not a letter, digit, dot, dash or underscore;
 *  - collapse runs of underscores into a single underscore;
 *  - strip dots, dashes and underscores from both ends of the result;
 *  - prefix an underscore if the result is empty or if the part before the
 *    first dot is a reserved DOS/Windows device name (e.g. 'aux', 'con.txt').
 *
 * The stripping is deliberately done AFTER the filtering: dropping or
 * mapping characters can expose a dot, dash or underscore at either end
 * (e.g. '*..*' would otherwise end up as '..' and '*.htaccess' as '.htaccess').
 * As a consequence the result never starts or ends with a dot or a dash,
 * and it only starts with an underscore when that underscore was added to
 * escape an empty or reserved name.
 *
 * @param string $filename the string to sanitise
 * @return string sanitised filename which is never empty
 */
function sanitise_filename($filename)
{
    // get rid of all diacriticals etc.
    $s = utf8_strtoascii($filename);
    // replace spaces/backslashes/slashes/at-signs/colons with underscores
    $s = strtr($s, ' \\/@:', '_____');
    // keep only letters/digits and dots/dashes/underscores
    $s = preg_replace('/[^0-9A-Za-z.\\-_]/', '', $s);
    // replace sequences of underscores with a single underscore
    $s = preg_replace('/__+/', '_', $s);
    // strip leading/trailing dot/dash/underscore; this must happen after the
    // filtering above, otherwise names like '..' or '.htaccess' can slip through
    $s = trim($s, '.-_');
    // 'forbidden' words: the empty string and the DOS/Windows device names
    $forbidden = array('', 'aux', 'con', 'nul', 'prn',
        'com0', 'com1', 'com2', 'com3', 'com4', 'com5', 'com6', 'com7', 'com8', 'com9',
        'lpt0', 'lpt1', 'lpt2', 'lpt3', 'lpt4', 'lpt5', 'lpt6', 'lpt7', 'lpt8', 'lpt9');
    // a device name followed by an extension is still a device name, so only
    // the part before the first dot counts ($s cannot start with a dot here)
    $dot = strpos($s, '.');
    $base = ($dot === FALSE) ? $s : substr($s, 0, $dot);
    if (in_array(utf8_strtolower($base), $forbidden, TRUE)) {
        $s = '_' . $s;
    }
    return $s;
}
/** construct an alphanumeric string from a (node) title yielding a readable bookmark filename
 *
 * This strips everything from $title except ASCII letters and digits. Runs of
 * other characters are translated to a single underscore. Length of result is
 * limited to a length of $maxlen bytes (default 50). This includes the length
 * of the extension $ext.
 *
 * Note that the $title is UTF-8 and may contain non-ASCII characters.
 * This routine deals with that situation by first converting the UTF-8
 * string to ASCII as much as possible (e.g. convert 'e-aigu' to plain 'e')
 * and subsequently converting all remaining non-letters/digits to underscores.
 * The test for letters/digits is a fixed ASCII character class, so the outcome
 * does not depend on the current locale.
 *
 * Finally the result is stripped from leading/trailing underscores. If this
 * yields a non-empty string, the extension $ext (default '.html') is appended.
 * If $maxlen leaves no room for at least one character next to $ext, the
 * result is an empty string.
 *
 * Note: this routine works best with latin-like text; if $title is completely
 * written in Chinese (or other UTF-8 characters without a corresponding ASCII
 * replacement) we end up with a single underscore which is subsequently trim()'ed,
 * yielding an empty string and no $ext added. I am not sure what to do about that.
 *
 * Note: the extension is not checked for non-alphanumerics because it is the
 * responsibility of the caller to provide a decent $ext if the default '.html' is
 * not used.
 *
 * @param string $title input text
 * @param int $maxlen the maximum length of the result (including $ext)
 * @param string $ext the filename extension added to a non-empty result
 * @return string string with only alphanumerics and underscores plus $ext, max $maxlen bytes, or an empty string
 */
function friendly_bookmark($title, $maxlen = 50, $ext = '.html')
{
    // the extension length is counted against maxlen up front
    $room = $maxlen - strlen($ext);
    if ($room <= 0) {
        return '';
    }
    $src = utf8_strtoascii($title);
    // every run of anything but ASCII letters/digits becomes one underscore;
    // bytes of leftover multi-byte characters are part of such a run too
    $tgt = preg_replace('/[^0-9A-Za-z]+/', '_', $src);
    // $tgt is pure ASCII now, so a byte-wise cut can never split a character
    $tgt = trim(substr($tgt, 0, $room), '_');
    if ($tgt !== '') {
        $tgt .= $ext;
    }
    return $tgt;
}