<?php
// Lookup table: short mnemonic => raw byte sequence (UTF-8) of one code point.
// The same table is later flipped by display() to render invisible characters.
$charmap = [
    // C0 control characters
    'NUL'  => "\x00",             // U+0000 NULL
    'SOH'  => "\x01",             // U+0001 START OF HEADING
    'STX'  => "\x02",             // U+0002 START OF TEXT
    'ETX'  => "\x03",             // U+0003 END OF TEXT
    'EOT'  => "\x04",             // U+0004 END OF TRANSMISSION
    'ENQ'  => "\x05",             // U+0005 ENQUIRY
    'HT'   => "\x09",             // U+0009 HORIZONTAL TAB
    'LF'   => "\x0a",             // U+000A LINE FEED
    'VT'   => "\x0b",             // U+000B VERTICAL TAB
    'CR'   => "\x0d",             // U+000D CARRIAGE RETURN
    'ETB'  => "\x17",             // U+0017 END OF TRANSMISSION BLOCK

    // Spaces
    'SP'   => "\x20",             // U+0020 SPACE
    'ZWS'  => "\xe2\x80\x8b",     // U+200B ZERO WIDTH SPACE

    // Multi-byte characters from the musical symbols block
    'MSBS' => "\xf0\x9d\x85\xb7", // U+1D177 MUSICAL SYMBOL BEGIN SLUR
    'MSBP' => "\xf0\x9d\x85\xb9", // U+1D179 MUSICAL SYMBOL BEGIN PHRASE
];

// Expose every mnemonic as a global constant so the fixtures below can be
// written as readable sequences such as LF . SP . 'a'.
foreach ($charmap as $mnemonic => $bytes) {
    define($mnemonic, $bytes);
}
// Benchmark fixtures. Each value is assembled piece by piece from the mnemonic
// constants (LF, SP, ZWS, ...) so invisible characters stay readable in the
// source; implode('') simply glues the pieces together in order.
$strings = [
    // Blank lines, repeated and trailing spaces, zero-width and format characters.
    'user6003859' => implode('', [
        LF,
        LF,
        LF,
        'a', SP, 'b', SP, SP, 'c', SP, SP, SP, LF,
        SP, 'd', SP, SP, SP, SP, 'e', LF,
        MSBS, 'f', MSBS, 'g', MSBS, MSBS, 'h', MSBS, MSBS, ZWS, ZWS, MSBP, MSBP, 'i', MSBP, SP, MSBP, SP, 'j', LF,
        LF,
        LF,
        LF,
        LF,
        'k', SP, 'l', SP, 'm', SP, 'n', SP, SP, SP, LF,
        MSBP, 'o', MSBP, MSBP, 'p', LF,
        LF,
        LF,
        LF,
    ]),
    // C0 control characters, mixed CR / CRLF / LF line endings, tabs and
    // non-ASCII letters.
    'mickmackusa' => implode('', [
        NUL, LF,
        LF,
        SOH, LF,
        CR, LF,
        VT,
        ETB, 'a', SP, 'ab', CR, LF,
        HT, HT, CR,
        CR, LF,
        'cà', SOH, 'ê߀', NUL, NUL, 'abcbc', SP, SP, SP, 'd', LF,
        LF,
        HT, CR, LF,
        ENQ, SP, SP, SP, 'e', STX, LF,
        ETX, LF,
        EOT, LF,
    ]),
];
/**
 * Prints $str line by line, replacing every character listed in $charmap by
 * a visible "{NAME}" tag.
 *
 * Lines are split on LF by fgets(); the LF itself is kept and therefore shown
 * as its tag (when present in $charmap) before the PHP_EOL that ends each
 * printed line.
 *
 * @param string $str     Text to dump; may contain arbitrary bytes.
 * @param array  $charmap Map of mnemonic => byte sequence.
 * @return void
 * @throws RuntimeException If the in-memory stream cannot be opened.
 */
function display($str, $charmap) {
    // Turn [name => bytes] into [bytes => "{name}"] for strtr().
    $converter = array_map(function ($name) { return '{' . $name . '}'; }, array_flip($charmap));

    // A php://memory stream stores the bytes verbatim. Embedding $str in a
    // "data:text/plain," URI instead would URL-decode the payload (mangling
    // "%41"-style sequences) and cannot carry NUL bytes in the fopen() path.
    $handle = fopen('php://memory', 'w+');
    if ($handle === false) {
        throw new RuntimeException('unable to open an in-memory stream');
    }
    fwrite($handle, $str);
    rewind($handle);

    while ( false !== $line = fgets($handle) ) {
        echo strtr($line, $converter), PHP_EOL;
    }
    fclose($handle);
}
/**
 * Uniform wrapper around one text-transformation strategy so that different
 * strategies can be executed (and timed) through the same execute() call.
 *
 * A strategy is either
 *  - an ordered map of regex pattern => replacement, handed to preg_replace(), or
 *  - a callable that receives the string and returns the transformed string.
 */
class Replacements {
    /** Strategy kind: user-supplied callable. */
    const FUNC = 0;
    /** Strategy kind: list of regex replacements. */
    const REGEX = 1;

    /** Regex patterns, in application order (REGEX kind only). */
    protected $patterns;
    /** Replacement strings, parallel to $patterns (REGEX kind only). */
    protected $replacements;
    /** The callable to invoke (FUNC kind only). */
    protected $func;
    /** One of self::FUNC / self::REGEX; declared so it is not a dynamic property. */
    protected $type;

    /**
     * @param array|callable $arg Map of pattern => replacement, or a callable
     *                            taking the input string. Arrays are tested
     *                            first, so an array is always treated as a
     *                            pattern map.
     * @throws InvalidArgumentException When $arg is neither an array nor callable.
     */
    public function __construct($arg) {
        if ( is_array($arg) ) {
            $this->type = self::REGEX;
            $this->patterns = [];
            $this->replacements = [];
            $this->addPatterns($arg);
        } elseif ( is_callable($arg) ) {
            $this->type = self::FUNC;
            $this->addFunction($arg);
        } else {
            throw new InvalidArgumentException('invalid argument type');
        }
    }

    /**
     * Appends each pattern/replacement pair to the two parallel lists.
     *
     * @param array $replacements Map of regex pattern => replacement string.
     */
    protected function addPatterns($replacements) {
        foreach ($replacements as $pattern => $replacement) {
            $this->patterns[] = $pattern;
            $this->replacements[] = $replacement;
        }
    }

    /**
     * Stores the callable used by execute().
     *
     * @param callable $func
     */
    protected function addFunction($func) {
        $this->func = $func;
    }

    /**
     * Applies the strategy to $str.
     *
     * @param string $str Input text.
     * @return mixed Whatever preg_replace() or the callable returns (for
     *               preg_replace() that is the resulting string, or null if
     *               the regex engine reports an error).
     */
    public function execute($str) {
        if ( $this->type === self::REGEX ) {
            return preg_replace($this->patterns, $this->replacements, $str);
        }
        // Passed inside an array by reference so callables declared with a
        // by-reference parameter (function (&$str) { ... }) are accepted.
        return call_user_func_array($this->func, [&$str]);
    }
}
// Three regex-only strategies. Each is an ordered pattern => replacement map;
// preg_replace() applies the entries one after another to the whole string.
// In the patterns, \pZ is the Unicode "separator" category and \pC the
// "other" category (control, format, ...).

// Strategy "original": four passes.
$original = new Replacements([
// 1. every newline sequence matched by \R becomes a single LF
'~\R~u' => "\n",
// 2. per line (m flag): delete separator / non-LF control runs at the start
//    of a line, right before the end of a line, and any such run that is
//    itself followed by another separator or non-LF control character
'/(?:^((\pZ)+|((?!\n)\pC)+)(?1)*)|((?1)$)|(?:((?2)+|(?3)+)(?=(?2)|(?3)))/um' => '',
// 3. what is left (a separator run, or a single non-LF control char) becomes one space
'/(\pZ+)|((?!\n)\pC)/u' => ' ',
// 4. no m flag here: drop newlines at the very start / end of the string and
//    any newline that is followed by two more newlines
'/(^\n+)|(\n+$)|(\n(?=\n{2}))/u' => ''
]);
// Strategy "simple": one small, single-purpose pattern per step.
$simple = new Replacements([
'~\A[\pZ\pC]+|[\pZ\pC]+\z~u' => '', # trim the whole string
'~\R~u' => "\n", # normalize newlines to LF
'~\pZ+|[^\n\PC]+~u' => ' ', # replace runs of Z, and runs of C other than LF, with one space
'~^ +| +$| \K +~m' => '', # trim each line, collapse consecutive spaces to one
'~\n\n\K\n+~' => '' # after two newlines, remove any further newlines
]);
// Strategy "optimized": fewer passes, one large anchored pattern.
$optimized = new Replacements([
// Byte-level newline normalisation (no u flag): CR LF, CR, VT, FF and the
// UTF-8 byte sequences e2 80 a8 / e2 80 a9 (U+2028 / U+2029) become LF.
'~\r\n?|\x0b|\f|\xe2\x80[\xa8\xa9]~S' => "\n",
// Flags: A anchors every match at the current offset, u = UTF-8, x = free
// spacing. A match is either a run of non-Z/non-C characters (kept, because
// of \K) plus the separator/control material that follows it, or a
// separator/control run on its own. Up to two LFs are captured as $1 and $2,
// so the matched material is rewritten as "$1$2 ".
'~
[^\pZ\pC]+ \K
\pZ* (?:[^\PC\n]+\pZ*)*
(?: (\n) \pZ*+ (?:[^\PC\n]+\pZ*)*+ (?: (\n) [\pZ\pC]* )?+ (?!\z) | [\pZ\pC]+ )?
|
[\pZ\pC]+
~Aux' => '$1$2 ',
// Remove a space that sits at the end of a line, or that is the first
// character of a line (the lookbehind requires the line to start with it).
'~ (?:$|(?<=^ ))~m' => ''
]);
// Strategy "function": split the text into pieces, clean each piece, re-join with LF.
$func = new Replacements(function (&$str) {
    // Split points: a separator/control run at the start or before the end of
    // the string, or a newline. When a second newline follows (possibly
    // surrounded by separator/control characters) it is captured and, thanks
    // to PREG_SPLIT_DELIM_CAPTURE, returned as a piece of its own.
    $boundary = '~^[\pC\pZ]+|[\pC\pZ]+$|\R(?:[\pC\pZ]*?(\R)[\pC\pZ]*)?~u';

    // Collapse every separator/control run to one space, then trim the piece.
    $tidy = function ($piece) {
        $spaced = preg_replace('~[\pC\pZ]+~u', ' ', $piece);
        return trim($spaced);
    };

    $pieces = preg_split($boundary, $str, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

    return implode("\n", array_map($tidy, $pieces));
});
// tests
// 1) Run every strategy once on the same input and check that they all
//    produce the same output.
$tests = ['original' => $original, 'simple' => $simple, 'function' => $func, 'optimized' => $optimized];
$str = $strings['user6003859'] . $strings['mickmackusa'];
$str = str_repeat($str, 10);

$res = [];
// Iterate by value: nothing is modified through the loop variable, and a
// by-reference loop would leave $test aliased to the last element of $tests.
foreach ($tests as $k => $test) {
    $res[$k] = $test->execute($str);
}

// var_dump() prints by itself, so it is called as its own statement.
echo 'same result: ';
var_dump(count(array_unique($res)) === 1);

// 2) Time each strategy. The execution order is shuffled on every round so
//    that no strategy systematically benefits from running first or last.
$names = array_keys($tests);
$times = array_fill_keys($names, 0);
define('REPETITIONS', 100);

for ($i = 0; $i < REPETITIONS; $i++) {
    shuffle($names);
    foreach ($names as $name) {
        $start = microtime(true);
        $tests[$name]->execute($str);
        $stop = microtime(true);
        $times[$name] += $stop - $start;
    }
}

// Report the mean wall-clock time per call, in seconds.
foreach ($times as $k => $v) {
    printf("%-12s: %.2es\n", $k, $v / REPETITIONS);
}

// Uncomment to inspect a result with invisible characters made visible:
// display($res['optimized'], $charmap);
// preferences:
// 37.15 ms | 402 KiB | 5 Q