<?php
/**
 * Replace every ill-formed UTF-8 byte sequence in a string with a substitute.
 *
 * The input is walked byte-wise (no /u modifier) by a single alternation:
 *   - group 1 matches one well-formed UTF-8 character and is kept as-is;
 *   - group 2 matches a truncated 3- or 4-byte sequence (a valid lead byte
 *     followed by too few continuation bytes) and is replaced as ONE unit;
 *   - group 3 matches any other single byte and is replaced.
 *
 * Note: the range comments inside the pattern are approximate labels; for
 * example the [\xE1-\xEC\xEE\xEF] alternative also covers lead bytes
 * \xEE and \xEF (U+E000 - U+FFFF).
 *
 * @param string $str        Byte string to clean.
 * @param string $substitute Replacement emitted for each invalid unit;
 *                           defaults to the bytes E3 80 93 (U+3013).
 *
 * @return string|null Whatever preg_replace_callback() returns: the cleaned
 *                     string, or null if PCRE reports an error.
 */
function utf8_scrub($str, $substitute = "\xE3\x80\x93")
{
    // The /x modifier allows the layout and "#" comments inside the pattern;
    // /s lets "." in the catch-all group also match a newline byte.
    $pattern = '/
([\x00-\x7F] # U+0000 - U+007F
|[\xC2-\xDF][\x80-\xBF] # U+0080 - U+07FF
| \xE0[\xA0-\xBF][\x80-\xBF] # U+0800 - U+0FFF
|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # U+1000 - U+CFFF
| \xED[\x80-\x9F][\x80-\xBF] # U+D000 - U+D7FF
| \xF0[\x90-\xBF][\x80-\xBF]{2} # U+10000 - U+3FFFF
|[\xF1-\xF3][\x80-\xBF]{3} # U+40000 - U+FFFFF
| \xF4[\x80-\x8F][\x80-\xBF]{2}) # U+100000 - U+10FFFF
|(\xE0[\xA0-\xBF] # U+0800 - U+0FFF (invalid)
|[\xE1-\xEC\xEE\xEF][\x80-\xBF] # U+1000 - U+CFFF (invalid)
| \xED[\x80-\x9F] # U+D000 - U+D7FF (invalid)
| \xF0[\x90-\xBF][\x80-\xBF]? # U+10000 - U+3FFFF (invalid)
|[\xF1-\xF3][\x80-\xBF]{1,2} # U+40000 - U+FFFFF (invalid)
| \xF4[\x80-\x8F][\x80-\xBF]?) # U+100000 - U+10FFFF (invalid)
|(.) # invalid 1-byte
/xs';

    // $groups[1]: well-formed character
    // $groups[2]: truncated 3-byte or 4-byte sequence
    // $groups[3]: stray single byte
    $scrubOne = function ($groups) use ($substitute) {
        // Trailing groups that did not participate are absent from the match
        // array, so neither index 2 nor 3 exists when group 1 matched.
        $isWellFormed = !isset($groups[2]) && !isset($groups[3]);
        if ($isWellFormed) {
            return $groups[1];
        }

        return $substitute;
    };

    return preg_replace_callback($pattern, $scrubOne, $str);
}
// Demonstration inputs: description => raw byte string. Each one is passed
// through utf8_scrub() below and printed under its description.
$examples = array(
    'Valid ASCII'                               => "a",
    'Valid 2 Octet Sequence'                    => "\xc3\xb1",
    'Invalid 2 Octet Sequence'                  => "\xc3\x28",
    'Invalid Sequence Identifier'               => "\xa0\xa1",
    'Valid 3 Octet Sequence'                    => "\xe2\x82\xa1",
    'Invalid 3 Octet Sequence (in 2nd Octet)'   => "\xe2\x28\xa1",
    'Invalid 3 Octet Sequence (in 3rd Octet)'   => "\xe2\x82\x28",
    'Valid 4 Octet Sequence'                    => "\xf0\x90\x8c\xbc",
    'Invalid 4 Octet Sequence (in 2nd Octet)'   => "\xf0\x28\x8c\xbc",
    'Invalid 4 Octet Sequence (in 3rd Octet)'   => "\xf0\x90\x28\xbc",
    'Invalid 4 Octet Sequence (in 4th Octet)'   => "\xf0\x28\x8c\x28",
    'Valid 5 Octet Sequence (but not Unicode!)' => "\xf8\xa1\xa1\xa1\xa1",
    'Valid 6 Octet Sequence (but not Unicode!)' => "\xfc\xa1\xa1\xa1\xa1\xa1",
);

// Print the description on one line and the scrubbed bytes on the next.
foreach ($examples as $label => $bytes) {
    echo $label, "\n", utf8_scrub($bytes), "\n";
}
- Output for 5.3.0 - 5.3.29, 5.4.0 - 5.4.45, 5.5.24 - 5.5.35, 5.6.8 - 5.6.28, 7.0.0 - 7.0.20, 7.1.0 - 7.1.20, 7.2.0 - 7.2.33, 7.3.12 - 7.3.33, 7.4.0 - 7.4.33, 8.0.0 - 8.0.30, 8.1.0 - 8.1.27, 8.2.0 - 8.2.18, 8.3.0 - 8.3.6
- Valid ASCII
a
Valid 2 Octet Sequence
ñ
Invalid 2 Octet Sequence
〓(
Invalid Sequence Identifier
〓〓
Valid 3 Octet Sequence
₡
Invalid 3 Octet Sequence (in 2nd Octet)
〓(〓
Invalid 3 Octet Sequence (in 3rd Octet)
〓(
Valid 4 Octet Sequence
𐌼
Invalid 4 Octet Sequence (in 2nd Octet)
〓(〓〓
Invalid 4 Octet Sequence (in 3rd Octet)
〓(〓
Invalid 4 Octet Sequence (in 4th Octet)
〓(〓(
Valid 5 Octet Sequence (but not Unicode!)
〓〓〓〓〓
Valid 6 Octet Sequence (but not Unicode!)
〓〓〓〓〓〓
- Output for 4.4.2 - 4.4.9, 5.1.0 - 5.1.6, 5.2.0 - 5.2.17
- Parse error: syntax error, unexpected T_FUNCTION in /in/t6HIV on line 28
Process exited with code 255.
- Output for 4.3.0 - 4.3.1, 4.3.5 - 4.3.11, 4.4.0 - 4.4.1, 5.0.0 - 5.0.5
- Parse error: parse error, unexpected T_FUNCTION in /in/t6HIV on line 28
Process exited with code 255.
- Output for 4.3.2 - 4.3.4
- Parse error: parse error in /in/t6HIV on line 28
Process exited with code 255.
preferences:
204.27 ms | 401 KiB | 313 Q