<?php
/*
* Some sophisticated tests against PHP's mbstring encoding validation (mb_check_encoding()),
* testing ASCII, ISO-8859-1, Windows-1252, UTF-8 and UTF-16 in both byte orders.
*
* Tim Weber, 2014-07-04
*/
// Test samples, keyed by a human-readable description.
//   'string' - the raw bytes to validate.
//   'valid'  - space-separated abbreviations of every encoding in which those
//              bytes are expected to be valid: A (ASCII), 1252 (Windows-1252),
//              L1 (ISO-8859-1), U8 (UTF-8), U16B (UTF-16BE), U16L (UTF-16LE).
$samples = array(
    'ASCII only' => array(
        'string' => 'Hallo',
        'valid' => 'A 1252 L1 U8', // invalid in UTF-16: odd number of bytes
    ),
    'ASCII with null in the middle' => array(
        'string' => "Hal\x00lo.",
        'valid' => 'A 1252 L1 U8', // invalid in UTF-16: odd number of bytes
    ),
    '8-bit string with character not in Windows-1252' => array(
        'string' => "Hal\x81lo", // 0x80 and 0x82 are used in 1252, 0x81 is not
        'valid' => 'L1 U16B U16L', // even number of bytes, so valid in UTF-16
    ),
    'UTF-8 with a 2-byte BMP character' => array(
        'string' => "Hall\xc3\xb6.", // Hallö.
        'valid' => '1252 L1 U8',
    ),
    'UTF-8 with a 3-byte BMP character' => array(
        'string' => "Hal\xef\xbb\xbflo.", // Hal<U+FEFF>lo.
        'valid' => '1252 L1 U8',
    ),
    'UTF-8 with a 4-byte SP character' => array(
        'string' => "Oh \xf0\x9f\x92\xa9!", // Oh <U+1F4A9 (pile of poo)>!
        'valid' => '1252 L1 U8 U16B U16L',
    ),
    'UTF-8 with a really high PUA-B character' => array(
        'string' => "(\xf4\x8f\xbf\xba)", // (<U+10FFFA>)
        'valid' => 'L1 U8 U16B U16L', // invalid in 1252 because of 0x8f
    ),
    'single surrogate lead in UTF-8' => array(
        'string' => "\xed\xa0\x82", // <U+D802>
        'valid' => '1252 L1',
    ),
    'single surrogate lead in UTF-16BE' => array(
        'string' => "\xd8\x02", // <U+D802>
        'valid' => '1252 L1 U16L',
    ),
    'single surrogate trail in UTF-8' => array(
        'string' => "\xed\xb0\x82", // <U+DC02>
        'valid' => '1252 L1',
    ),
    'single surrogate trail in UTF-16LE' => array(
        'string' => "\x02\xdc", // <U+DC02>
        'valid' => '1252 L1 U16B',
    ),
    'Plane 17 UTF-8 character' => array(
        'string' => "\xf4\x92\x8d\x85", // <U+112345>, beyond the U+10FFFF limit
        'valid' => 'L1 U16B U16L', // invalid in 1252 because of 0x8d
    ),
    'null character encoded in two UTF-8 bytes' => array(
        'string' => "\xc0\x80", // overlong <U+0000>
        'valid' => '1252 L1 U16B U16L',
    ),
    'UTF-8 with a BMP non-shortest sequence (3 instead of 2 bytes)' => array(
        'string' => "Hall\xe0\x83\xb6.", // overlong Hallö.
        'valid' => '1252 L1 U16B U16L',
    ),
    'UTF-16BE of an ASCII character' => array(
        'string' => "\x00\x79", // y
        'valid' => 'A 1252 L1 U8 U16B U16L',
    ),
    'UTF-16LE of an SMP character' => array(
        'string' => "\x34\xd8\x1e\xdd", // <U+1D11E (musical symbol g clef)>
        'valid' => '1252 L1 U16B U16L',
    ),
    'UTF-16BE BOM' => array(
        'string' => "\xfe\xff", // <U+FEFF>
        'valid' => '1252 L1 U16B U16L', // U+FFFE is _not_ an invalid codepoint, therefore valid U16L
    ),
    'UTF-16LE SP character' => array(
        'string' => "\x3d\xd8\xa9\xdc", // <U+1F4A9>
        'valid' => '1252 L1 U16L U16B',
    ),
    'UTF-16BE SP character that is lone surrogate lead in UTF-16LE' => array(
        'string' => "\xd8\x34\xdd\xd8", // <U+1D1D8 (musical symbol torculus)>
        'valid' => '1252 L1 U16B',
    ),
);
// Full mbstring encoding name => abbreviation used in the samples' 'valid' lists.
$encoding_mappings = array(
    'ASCII'        => 'A',
    'Windows-1252' => '1252',
    'ISO-8859-1'   => 'L1',
    'UTF-8'        => 'U8',
    'UTF-16BE'     => 'U16B',
    'UTF-16LE'     => 'U16L',
);

// Every encoding to check, taken from the keys of the map in declaration order.
$encodings = array_keys($encoding_mappings);
// Check every sample against every encoding and print one line for each case
// where mb_check_encoding() disagrees with the sample's expected 'valid' list.
// Side effect: each sample's 'valid' entry is normalized in place to an array
// of full encoding names.
foreach ($samples as $name => &$data) {
    // 'valid' may be given as a space-separated string; turn it into an array.
    if (is_string($data['valid'])) {
        $data['valid'] = explode(' ', $data['valid']);
    }

    // Expand abbreviated encodings to their full names.
    foreach ($data['valid'] as &$short) {
        $found = array_search($short, $encoding_mappings, true);
        if ($found !== false) {
            $short = $found;
        } elseif ($short !== '' && !in_array($short, $encodings, true)) {
            // Neither a known abbreviation nor a full encoding name: most likely
            // a typo, which would otherwise silently count as "valid nowhere".
            printf("\"%s\" lists unknown encoding \"%s\"\n", $name, $short);
        }
    }
    unset($short); // Release the reference so later writes cannot hit the last element.

    // Compare the actual result with the expectation for each encoding.
    foreach ($encodings as $encoding) {
        $valid = mb_check_encoding($data['string'], $encoding);
        $expected = in_array($encoding, $data['valid'], true);
        if ($valid !== $expected) {
            printf("\"%s\" should be %s in %s, but is detected as %s\n",
                $name,
                $expected ? 'valid' : 'invalid',
                $encoding,
                $valid ? 'valid' : 'invalid'
            );
        }
    }
}
unset($data); // Release the reference; otherwise $data keeps aliasing the last sample.
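// NOTE(review): the two lines below are not PHP and break parsing; they look like
// page-footer residue from wherever this script was copied. Kept as a comment.
// preferences:
// 32.49 ms | 402 KiB | 5 Q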