/**
 * Sanitizes a string so that it can be used as a filename.
 *
 * Steps, in order: strip NULL bytes and invisible formatting characters,
 * drop invalid UTF-8 sequences, trim separators/control characters, remove
 * ASCII control characters, collapse relative path segments and repeated
 * dots, optionally replace special characters, trim leading/trailing '.'
 * and '-', check the configured length limits, optionally lowercase and
 * optionally run a spoofcheck on the result.
 *
 * @param mixed $value value to sanitize; anything but a string is rejected
 * @param EntityInterface $entity not used by this method
 *
 * @return bool true when the sanitized value was set via setSanitizedValue();
 *              false after an incident was reported via throwError()
 *
 * @throws InvalidConfigException when the min/max length option is not an integer
 */
protected function execute($value, EntityInterface $entity = null)
{
    if (!is_string($value)) {
        $this->throwError('non_string_value', ['value' => $value], IncidentInterface::CRITICAL);
        return false;
    }

    // Invisible characters are spelled as explicit UTF-8 byte escapes: raw
    // literals are indistinguishable from an empty string in editors and get
    // lost easily, which silently turns the replacement into a no-op.
    // The replacements are applied in the listed order.
    $value = str_replace(
        [
            // poison NULL byte, @see http://hakipedia.com/index.php/Poison_Null_Byte
            "\x00",
            // U+200B ZERO WIDTH SPACE
            "\xE2\x80\x8B",
            // U+202E RIGHT-TO-LEFT OVERRIDE, can be used to turn
            // 'image[RTLO]gpj.exe' into 'imageexe.jpg'
            "\xE2\x80\xAE",
            // U+202D LEFT-TO-RIGHT OVERRIDE
            "\xE2\x80\xAD",
        ],
        '',
        $value
    );

    /**
     * Some links for illformed byte sequences etc.:
     *
     * @see http://php.net/manual/de/function.mb-check-encoding.php
     * @see http://www.w3.org/International/questions/qa-forms-utf-8.en.php
     * @see http://unicode.org/reports/tr36/#Ill-Formed_Subsequences
     * @see http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
     */

    // strip invalid utf8 characters
    // use mbstring here instead of iconv with '//ignore' – https://bugs.php.net/bug.php?id=61484
    $prev = ini_set('mbstring.substitute_character', 'none');
    $value = mb_convert_encoding($value, 'UTF-8', 'UTF-8');
    if ($prev !== false) {
        // only restore when the previous setting could actually be read
        ini_set('mbstring.substitute_character', $prev);
    }

    // trim the input string
    // \pZ: separators (whitespace), \pC: control/format/unassigned characters
    // "*+" is a possessive quantifier, not a typo
    // note: '/(*UTF8)[[:alnum:]]/' matches 'é' while '/[[:alnum:]]/' does not
    $pattern = '/(*UTF8)^[\\pZ\\pC]*+(?P<trimmed>.*?)[\\pZ\\pC]*+$/usDS';
    if (preg_match($pattern, $value, $matches)) {
        $value = $matches['trimmed'];
    }

    // trim zero-width non-joiner and zero-width joiner at the end of the text
    // https://en.wikipedia.org/wiki/Zero-width_non-joiner
    $value = preg_replace("/\xE2\x80\x8C\$/", '', $value); // U+200C ZERO WIDTH NON-JOINER
    $value = preg_replace("/\xE2\x80\x8D\$/", '', $value); // U+200D ZERO WIDTH JOINER

    // æ is a ligature in english but a distinct letter in icelandic and other languages

    // remove non-printable control characters including TAB, LINE FEED and
    // CARRIAGE RETURN, i.e. the bytes 0x01-0x1F and 0x7F
    $remove_chars = array_map('chr', array_merge(range(0x01, 0x1F), [0x7F]));
    $value = str_replace($remove_chars, '', $value);
    if (!is_string($value)) {
        $this->throwError('control_character_stripping_failed', [], IncidentInterface::CRITICAL);
        return false;
    }

    // TODO urldecode or similar to replace common %20 etc. patterns from URL downloading with characters?

    // solve relative paths like 'folder/../file.ext' – as the '/' is probably
    // replaced later on this might seem unnecessary, but it leads to nicer
    // filenames with less replacement characters; repeat until nothing changes
    // to handle nested segments like 'a/b/../../c'
    do {
        $value = preg_replace('#[^/\\.]+/\\.\\./#', '', $value, -1, $count);
    } while ($count);

    $value = str_replace(['/./', '//'], '/', $value);

    // replace multiple occurrences of '.' with one '.'
    $value = preg_replace('/\\.{2,}/', '.', $value);

    $replace_special_chars = $this->getOption(self::OPTION_REPLACE_SPECIAL_CHARS, true);
    if ($replace_special_chars) {
        $replace_chars = [
            '#', '<', '$', '+', '%', '>', '!', '`', '&', '*', '‘',
            '|', '{', '?', '“', '=', '}', '/', ':', '\\', ' ', '@'
        ];
        $replace_with = $this->getOption(self::OPTION_REPLACE_WITH, '-');
        $value = str_replace($replace_chars, $replace_with, $value);
        if (!is_string($value)) {
            $this->throwError('character_replacing_failed', [], IncidentInterface::CRITICAL);
            return false;
        }
    }

    // trim '.' and '-' on both ends so that, regardless of LTR or RTL script,
    // the filename doesn't start with a dot (no hidden dotfile filenames)
    $value = trim($value, '.-');

    // check minimum string length
    if ($this->hasOption(self::OPTION_MIN_LENGTH)) {
        $min = filter_var($this->getOption(self::OPTION_MIN_LENGTH, -PHP_INT_MAX - 1), FILTER_VALIDATE_INT);
        if ($min === false) {
            throw new InvalidConfigException('Minimum string length specified is not interpretable as integer.');
        }

        if (mb_strlen($value) < $min) {
            $this->throwError(self::OPTION_MIN_LENGTH, [self::OPTION_MIN_LENGTH => $min, 'value' => $value]);
            return false;
        }
    }

    // check maximum string length
    if ($this->hasOption(self::OPTION_MAX_LENGTH)) {
        $max = filter_var($this->getOption(self::OPTION_MAX_LENGTH, PHP_INT_MAX), FILTER_VALIDATE_INT);
        if ($max === false) {
            throw new InvalidConfigException('Maximum string length specified is not interpretable as integer.');
        }

        if (mb_strlen($value) > $max) {
            $this->throwError(self::OPTION_MAX_LENGTH, [self::OPTION_MAX_LENGTH => $max, 'value' => $value]);
            return false;
        }
    }

    $lowercase = $this->getOption(self::OPTION_LOWERCASE, false);
    if ($lowercase) {
        $value = mb_strtolower($value, 'UTF-8');
        // TODO it's probably advisable to manually lowercase some more variants as mentioned
        // in this comment: http://php.net/manual/de/function.mb-strtolower.php#105753
    }

    $spoofcheck_resulting_value = $this->getOption(self::OPTION_SPOOFCHECK_RESULT, false);
    if ($spoofcheck_resulting_value) {
        $rule = new SpoofcheckerRule('spoofcheck-resulting-text', $this->getOptions());
        if (!$rule->apply($value)) {
            // forward the incidents of the nested rule as incidents of this rule
            foreach ($rule->getIncidents() as $incident) {
                $this->throwError($incident->getName(), $incident->getParameters(), $incident->getSeverity());
            }
            return false;
        }

        $value = $rule->getSanitizedValue();
    }

    $this->setSanitizedValue($value);

    return true;
}
/**
 * Validates and sanitizes a text value according to the configured options.
 *
 * Steps, in order (each one controlled by an option read via getOption()):
 * optional spoofcheck of the incoming value, stripping of NULL bytes,
 * zero-width spaces and direction overrides, rejection and/or stripping of
 * invalid UTF-8, trimming, removal of ASCII control characters (TAB and
 * CR/LF may be allowed), newline normalization, length checks and an
 * optional spoofcheck of the resulting value.
 *
 * @param mixed $value value to validate; anything but a string is rejected
 * @param EntityInterface $entity not used by this method
 *
 * @return bool true when the sanitized value was set via setSanitizedValue();
 *              false after an incident was reported via throwError()
 *
 * @throws InvalidConfigException when the min/max length option is not an integer
 */
protected function execute($value, EntityInterface $entity = null)
{
    if (!is_string($value)) {
        $this->throwError('non_string_value', ['value' => $value], IncidentInterface::CRITICAL);
        return false;
    }

    $spoofcheck_incoming_value = $this->getOption(self::OPTION_SPOOFCHECK_INCOMING, false);
    if ($spoofcheck_incoming_value) {
        $rule = new SpoofcheckerRule('spoofcheck-incoming-text', $this->getOptions());
        if (!$rule->apply($value)) {
            // forward the incidents of the nested rule as incidents of this rule
            foreach ($rule->getIncidents() as $incident) {
                $this->throwError($incident->getName(), $incident->getParameters(), $incident->getSeverity());
            }
            return false;
        }

        $value = $rule->getSanitizedValue();
    }

    // Invisible characters below are spelled as explicit UTF-8 byte escapes:
    // raw literals are indistinguishable from an empty string in editors and
    // get lost easily, which silently turns the replacement into a no-op.

    // @see http://hakipedia.com/index.php/Poison_Null_Byte
    $strip_null_bytes = $this->getOption(self::OPTION_STRIP_NULL_BYTES, true);
    if ($strip_null_bytes) {
        $value = str_replace("\x00", '', $value);
    }

    // remove zero-width space character from text
    $strip_zero_width_space = $this->getOption(self::OPTION_STRIP_ZERO_WIDTH_SPACE, false);
    if ($strip_zero_width_space) {
        $value = str_replace("\xE2\x80\x8B", '', $value); // U+200B ZERO WIDTH SPACE
    }

    // strip unicode characters 'RIGHT-TO-LEFT OVERRIDE' and 'LEFT-TO-RIGHT OVERRIDE' if necessary
    $strip_direction_overrides = $this->getOption(self::OPTION_STRIP_DIRECTION_OVERRIDES, false);
    if ($strip_direction_overrides) {
        $value = str_replace("\xE2\x80\xAE", '', $value); // U+202E RIGHT-TO-LEFT OVERRIDE
        $value = str_replace("\xE2\x80\xAD", '', $value); // U+202D LEFT-TO-RIGHT OVERRIDE
    }

    // TODO should one allow trimming of zero-width non-joiner (only at the end of text)?

    /**
     * Some links for illformed byte sequences etc.:
     *
     * @see http://php.net/manual/de/function.mb-check-encoding.php
     * @see http://www.w3.org/International/questions/qa-forms-utf-8.en.php
     * @see http://unicode.org/reports/tr36/#Ill-Formed_Subsequences
     * @see http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
     */

    // check for a valid utf8 string without certain byte sequences
    $reject_invalid_utf8 = $this->getOption(self::OPTION_REJECT_INVALID_UTF8, true);
    if ($reject_invalid_utf8 && !mb_check_encoding($value, 'UTF-8')) {
        $this->throwError(
            'invalid_utf8',
            [
                'value' => $value,
                'converted_value' => mb_convert_encoding($value, 'UTF-8', 'UTF-8')
            ],
            IncidentInterface::CRITICAL
        );
        return false;
    }

    // strip invalid utf8 characters
    // the stripping might not work as good as expected depending on php bugs etc.
    $strip_invalid_utf8 = $this->getOption(self::OPTION_STRIP_INVALID_UTF8, true);
    if ($strip_invalid_utf8) {
        // use mbstring here instead of iconv with '//ignore' – https://bugs.php.net/bug.php?id=61484
        // might be relevant as well: https://bugs.php.net/bug.php?id=65045
        $prev = ini_set('mbstring.substitute_character', 'none');
        $value = mb_convert_encoding($value, 'UTF-8', 'UTF-8');
        if ($prev !== false) {
            // only restore when the previous setting could actually be read
            ini_set('mbstring.substitute_character', $prev);
        }
    }

    // trim the input string if necessary
    // this might actually not trim a lot when invalid utf8 is left from prior steps
    if ($this->getOption(self::OPTION_TRIM, true)) {
        // note: '/(*UTF8)[[:alnum:]]/' matches 'é' while '/[[:alnum:]]/' does not
        // \p{Z}: any kind of whitespace or invisible separator
        // \p{C}: invisible control characters and unused code points
        // "*+" is not a mistake, but a possessive quantifier
        // @see http://www.regular-expressions.info/unicode.html
        $pattern = '/(*UTF8)^[\\pZ\\pC]*+(?P<trimmed>.*?)[\\pZ\\pC]*+$/usDS';
        if (preg_match($pattern, $value, $matches)) {
            $value = $matches['trimmed'];
        }
    }

    $sanitized_value = $value;

    // additionally remove some control characters
    $strip_ctrl_chars = $this->getOption(self::OPTION_STRIP_CONTROL_CHARACTERS, true);
    if ($strip_ctrl_chars) {
        // non-printable control characters: the bytes 0x01-0x1F and 0x7F
        $control_chars = array_map('chr', array_merge(range(0x01, 0x1F), [0x7F]));

        // TAB, LINE FEED and CARRIAGE RETURN may be kept depending on the options
        $allowed_chars = [];
        $allow_tab = $this->getOption(self::OPTION_ALLOW_TAB, true);
        if ($allow_tab) {
            $allowed_chars[] = "\t";
        }
        $allow_crlf = $this->getOption(self::OPTION_ALLOW_CRLF, false);
        if ($allow_crlf) {
            $allowed_chars[] = "\n";
            $allowed_chars[] = "\r";
        }

        $remove_chars = array_diff($control_chars, $allowed_chars);

        $sanitized_value = str_replace($remove_chars, '', $value);
        if (!is_string($sanitized_value)) {
            $this->throwError('control_character_stripping_failed', [], IncidentInterface::CRITICAL);
            return false;
        }
    }

    // convert windows ("\r\n") and old mac ("\r") line endings to "\n"
    $normalize_newlines = $this->getOption(self::OPTION_NORMALIZE_NEWLINES, false);
    if ($normalize_newlines) {
        $sanitized_value = str_replace(["\r\n", "\r"], "\n", $sanitized_value);
        if (!is_string($sanitized_value)) {
            $this->throwError('normalizing_newlines_failed', [], IncidentInterface::CRITICAL);
            return false;
        }
    }

    // check minimum string length
    if ($this->hasOption(self::OPTION_MIN_LENGTH)) {
        $min = filter_var($this->getOption(self::OPTION_MIN_LENGTH, -PHP_INT_MAX - 1), FILTER_VALIDATE_INT);
        if ($min === false) {
            throw new InvalidConfigException('Minimum string length specified is not interpretable as integer.');
        }

        if (mb_strlen($sanitized_value) < $min) {
            $this->throwError(
                self::OPTION_MIN_LENGTH,
                [self::OPTION_MIN_LENGTH => $min, 'value' => $sanitized_value]
            );
            return false;
        }
    }

    // check maximum string length
    if ($this->hasOption(self::OPTION_MAX_LENGTH)) {
        $max = filter_var($this->getOption(self::OPTION_MAX_LENGTH, PHP_INT_MAX), FILTER_VALIDATE_INT);
        if ($max === false) {
            throw new InvalidConfigException('Maximum string length specified is not interpretable as integer.');
        }

        if (mb_strlen($sanitized_value) > $max) {
            $this->throwError(
                self::OPTION_MAX_LENGTH,
                [self::OPTION_MAX_LENGTH => $max, 'value' => $sanitized_value]
            );
            return false;
        }
    }

    $spoofcheck_resulting_value = $this->getOption(self::OPTION_SPOOFCHECK_RESULT, false);
    if ($spoofcheck_resulting_value) {
        $rule = new SpoofcheckerRule('spoofcheck-resulting-text', $this->getOptions());
        if (!$rule->apply($sanitized_value)) {
            // forward the incidents of the nested rule as incidents of this rule
            foreach ($rule->getIncidents() as $incident) {
                $this->throwError($incident->getName(), $incident->getParameters(), $incident->getSeverity());
            }
            return false;
        }

        $sanitized_value = $rule->getSanitizedValue();
    }

    $this->setSanitizedValue($sanitized_value);

    return true;
}