/** sanitise a string to make it acceptable as a filename/directoryname
 *
 * This routine more or less borrowed from {@link waslib.php}.
 *
 * Note that this routine too is very ASCII-centric: in the end only
 * ASCII-characters (52 letters, 10 digits and dash, dot and underscore)
 * are allowed in the resulting file/directory name. However, by first
 * mapping UTF-8 to ASCII (getting rid of diacriticals) we can make names
 * more readable.
 *
 * Processing steps, in this order:
 *  - map UTF-8 to ASCII via utf8_strtoascii();
 *  - strip leading/trailing spaces, dots, dashes, underscores and (back)slashes;
 *  - replace spaces, (back)slashes, at-signs and colons with underscores;
 *  - drop every character that is not a letter, digit, dot, dash or underscore;
 *  - strip any leading/trailing dots and dashes that the previous step exposed
 *    (e.g. '?.htaccess' must not end up as the hidden file '.htaccess');
 *  - collapse runs of underscores into a single underscore;
 *  - prefix the result with an underscore if it is empty or if the part before
 *    the first dot is a reserved DOS/Windows device name (e.g. 'con', 'nul.txt',
 *    'com7'), compared case-insensitively.
 *
 * @param string $filename the string to sanitise
 * @return string sanitised filename which is never empty
 */
function sanitise_filename($filename) {
    // get rid of all diacriticals etc.
    $s = utf8_strtoascii($filename);

    // strip leading space/dot/dash/underscore/backslash/slash
    $s = preg_replace('/^[ .\\-_\\\\\\/]*/', '', $s);

    // strip trailing space/dot/dash/underscore/backslash/slash
    $s = preg_replace('/[ .\\-_\\\\\\/]*$/', '', $s);

    // replace embedded spaces/backslashes/slashes/at-signs/colons with underscores
    $s = strtr($s, ' \\/@:', '_____');

    // keep only letters/digits and embedded dots/dashes/underscores
    $s = preg_replace('/[^0-9A-Za-z.\\-_]/', '', $s);

    // Dropping characters in the previous step can uncover a new leading or
    // trailing dot/dash (e.g. '?.htaccess' => '.htaccess', 'name.?' => 'name.').
    // Strip those again; underscores are harmless and are left untouched so
    // that names which were already produced before stay the same.
    $s = trim($s, '.-');

    // replace sequences of underscores with a single underscore
    $s = preg_replace('/__+/', '_', $s);

    // 'forbidden' words: the empty string and the reserved device names.
    // A reserved name followed by an extension ('nul.txt', 'nul.tar.gz') is
    // just as reserved as the bare name, so only the part before the first
    // dot is compared.
    $forbidden = array('',
        'aux', 'con', 'nul', 'prn',
        'com0', 'com1', 'com2', 'com3', 'com4', 'com5', 'com6', 'com7', 'com8', 'com9',
        'lpt0', 'lpt1', 'lpt2', 'lpt3', 'lpt4', 'lpt5', 'lpt6', 'lpt7', 'lpt8', 'lpt9');
    $parts = explode('.', $s, 2);
    if (in_array(utf8_strtolower($parts[0]), $forbidden, TRUE)) {
        $s = '_' . $s;
    }
    return $s;
}
/** construct an alphanumeric string from a (node) title yielding a readable bookmark filename
 *
 * This strips everything from $title except ASCII letters and digits. Runs of
 * other characters are translated to a single underscore. The length of the
 * result is limited to $maxlen bytes (default 50). This includes the length of
 * the extension $ext.
 *
 * Note that the $title is UTF-8 and may contain non-ASCII characters.
 * This routine deals with that situation by first converting the UTF-8
 * string to ASCII as much as possible (e.g. convert 'e-aigu' to plain 'e')
 * and subsequently converting all remaining non-letters/digits to underscores.
 * The letter/digit test is a plain ASCII test ([0-9A-Za-z]) and therefore does
 * not depend on the current locale.
 *
 * Finally the result is stripped from leading/trailing underscores. If this
 * yields a non-empty string, the extension $ext (default '.html') is appended.
 *
 * Note: this routine works best with latin-like text; if $title is completely
 * written in Chinese (or other UTF-8 characters without a corresponding ASCII
 * replacement) we end up with a single underscore which is subsequently trim()'ed,
 * yielding an empty string and no $ext added. The same happens when $maxlen
 * leaves no room for anything but the extension. Callers must be prepared for
 * an empty result.
 *
 * Note: the extension is not checked for non-alphanumerics because it is the
 * responsibility of the caller to provide a decent $ext if the default '.html' is
 * not used.
 *
 * @param string $title input text
 * @param int $maxlen the maximum length of the result (including $ext)
 * @param string $ext the filename extension added to a non-empty result
 * @return string string with only alphanumerics and underscores plus $ext, max $maxlen bytes, or '' if nothing usable remains
 */
function friendly_bookmark($title, $maxlen = 50, $ext = '.html') {
    $src = utf8_strtoascii($title);

    // Collapse every run of bytes outside [0-9A-Za-z] into one underscore.
    // Working byte-wise (no /u modifier) is intentional: every byte of a
    // multi-byte UTF-8 character lies outside the ASCII range, so a run of
    // non-ASCII characters becomes a single underscore, and malformed
    // UTF-8 cannot make the expression fail.
    $tgt = preg_replace('/[^0-9A-Za-z]+/', '_', $src);

    // the extension length already counts against $maxlen
    $room = (int) $maxlen - strlen($ext);
    if ($room > 0) {
        // $tgt is pure ASCII at this point so a byte-wise cut is safe
        $tgt = (string) substr($tgt, 0, $room);
    } else {
        $tgt = '';
    }

    // no leading/trailing underscores; only a non-empty name gets the extension
    $tgt = trim($tgt, '_');
    if ($tgt !== '') {
        $tgt .= $ext;
    }
    return $tgt;
}