Online PHP editor | refs for 7Xo9j

<?php
/*
 * Demonstrates how to reproduce JavaScript's String.length in PHP.
 *
 * Reference values observed in JavaScript (Firefox):
 *   "🤡".length    === 2
 *   '🤡abc'.length === 5
 *
 * Notation used in the comments below:
 *   .   = one byte
 *   ( ) = the bytes that together encode a single code point
 */

// The clown emoji lies outside the Basic Multilingual Plane (BMP).
$x = '🤡';    // 4 bytes in UTF-8:         (....)
$y = '🤡abc'; // 4 + 3 = 7 bytes in UTF-8: (....) . . .

// PHP strings are plain byte sequences with no attached encoding. The two
// literals above are UTF-8 only because that is how they were pasted into
// this source file (copied from a browser).
echo "--- These are UTF-8 ---"."\n";
echo "\$x Bytes: ".strlen($x)."\n";
echo "\$x Unicode Codepoint Count (\"characters\"): ".mb_strlen($x, "UTF-8")."\n";
echo "\$x Hex Representation: ".bin2hex($x)."\n";
echo "\$y Bytes: ".strlen($y)."\n";
echo "\$y Unicode Codepoint Count (\"characters\"): ".mb_strlen($y, "UTF-8")."\n";
echo "\$y Hex Representation: ".bin2hex($y)."\n";
echo "--- End ---"."\n";

// Convert to UTF-16. Every UTF-16 code unit is 2 bytes: a BMP code point
// takes one code unit, anything outside the BMP takes a surrogate pair
// (two code units, 4 bytes).
$x1 = mb_convert_encoding($x, "UTF-16", "UTF-8"); // still 4 bytes:     (.. ..)
$y1 = mb_convert_encoding($y, "UTF-16", "UTF-8"); // 4 + 6 = 10 bytes:  (.. ..) .. .. ..

echo "--- These are UTF-16 ---"."\n";
echo "\$x1 Bytes: ".strlen($x1)."\n";
echo "\$x1 Unicode Codepoint Count (\"characters\"): ".mb_strlen($x1, "UTF-16")."\n";
echo "\$x1 Hex Representation: ".bin2hex($x1)."\n";
echo "\$y1 Bytes: ".strlen($y1)."\n";
echo "\$y1 Unicode Codepoint Count (\"characters\"): ".mb_strlen($y1, "UTF-16")."\n";
echo "\$y1 Hex Representation: ".bin2hex($y1)."\n";
echo "--- End ---"."\n";

/*
 * A JavaScript String resembles PHP's raw byte string, except that its unit
 * is the UTF-16 code unit rather than the byte. Quoting
 * https://mathiasbynens.be/notes/javascript-encoding :
 *
 *   JavaScript treats code units as individual characters, while humans
 *   generally think in terms of Unicode characters. This has some
 *   unfortunate consequences for Unicode characters outside the BMP. Since
 *   surrogate pairs consist of two code units, '𝌆'.length == 2, even though
 *   there’s only one Unicode character there. The individual surrogate
 *   halves are being exposed as if they were characters:
 *   '𝌆' == '\uD834\uDF06'.
 *
 * In other words: a proper code point count treats a surrogate pair (.. ..)
 * as length 1, whereas JavaScript counts each half separately, giving
 * length 2. So JavaScript sees our strings as:
 *
 *   $x1 | .. ..          = 2
 *   $y1 | .. .. .. .. .. = 5
 *
 * To emulate JavaScript's behaviour we therefore count the bytes of the
 * UTF-16 encoding and divide by two (one code unit = 2 bytes).
 */
echo "--- These are UTF-16 ---"."\n";
echo "\$x1 Javascript Emulated strlen/2: ".(strlen($x1) / 2)."\n";
echo "\$y1 Javascript Emulated strlen/2: ".(strlen($y1) / 2)."\n";
echo "--- End ---"."\n";
// The two values printed above match JavaScript's .length for these strings.