<?php

declare(strict_types=1);

/*
 * Demonstrates how preg_match() treats the euro sign (a three-byte UTF-8
 * sequence, E2 82 AC) under three configurations of the same password rule:
 *   1. no UTF-8 option at all,
 *   2. the (*UTF8) verb at the start of the pattern,
 *   3. the "u" pattern modifier.
 *
 * Each configuration is run against the same five subjects and the raw
 * preg_match() result is dumped, so a false result (preg_match() reporting an
 * error instead of a match/no-match) stays visible in the output.
 */

// Pattern body shared by all three variants; only the delimiters/flags differ.
// Lookaheads: a lowercase letter, two uppercase letters, a digit and one of the
// listed symbols; then at least 8 characters drawn from the allowed set.
$rule = '^(?=(.*[a-z])+)(?=(.*[A-Z]){2,})(?=(.*[0-9])+)(?=(.*[&#@=€$%*?\/:!\-+])+)([a-zA-Z0-9&#@=€$%*?\/:!\-+]){8,}$';

$divider = '###########################################################';

// One entry per demonstration. 'banner' is null for the first section, which
// is printed without a heading.
$sections = [
    [
        'banner' => null,
        'regex'  => '/' . $rule . '/',
        'intro'  => "Using one byte from €:\\xe2 \\x82 \\xac",
        'outro'  => "Since the regex engine reads the string byte by byte, the length of the string isn't what you expected.",
    ],
    [
        'banner' => 'With the (*UTF8) verb',
        'regex'  => '/(*UTF8)' . $rule . '/',
        'intro'  => "(*UTF8): Now € is seen as an character, no more as a set of bytes",
        'outro'  => "Test with '€aBC01'. Now the regex engine reads the string unicode point by unicode point (encoded in UTF-8), the length of the string is the number of characters. Since there's less than 8 characters the pattern fails..",
    ],
    [
        'banner' => 'With the u modifier',
        'regex'  => '/' . $rule . '/u',
        'intro'  => "The u modifier that is synonym of (*UTF8)(*UCP)",
        'outro'  => "Test with '€aBC01'",
    ],
];

// The individual bytes that make up the UTF-8 encoding of €. Each one on its
// own is not a complete UTF-8 sequence.
$euroBytes = ["\xe2", "\x82", "\xac"];

foreach ($sections as $section) {
    if ($section['banner'] !== null) {
        echo PHP_EOL, PHP_EOL, $section['banner'], PHP_EOL, $divider, PHP_EOL;
    }

    echo $section['intro'], PHP_EOL;

    // Subjects containing a single stray byte of the euro sign.
    foreach ($euroBytes as $strayByte) {
        var_dump(preg_match($section['regex'], "aAB0{$strayByte}ABC"));
    }

    // Same subject with the complete euro sign.
    echo "Avec €:", PHP_EOL;
    var_dump(preg_match($section['regex'], "aAB0€ABC"));

    // Six characters, but eight bytes: the result depends on whether the
    // {8,} quantifier counts bytes or characters.
    echo PHP_EOL, $section['outro'], PHP_EOL;
    var_dump(preg_match($section['regex'], "€aBC01"));
}
Using one byte from €:\xe2 \x82 \xac
int(1)
int(1)
int(1)
Avec €:
int(1)
Since the regex engine reads the string byte by byte, the length of the string isn't what you expected.
int(1)
With the (*UTF8) verb
###########################################################
(*UTF8): Now € is seen as an character, no more as a set of bytes
int(0)
bool(false)
bool(false)
Avec €:
int(1)
Test with '€aBC01'. Now the regex engine reads the string unicode point by unicode point (encoded in UTF-8), the length of the string is the number of characters. Since there's less than 8 characters the pattern fails..
int(0)
With the u modifier
###########################################################
The u modifier that is synonym of (*UTF8)(*UCP)
bool(false)
bool(false)
bool(false)
Avec €:
int(1)
Test with '€aBC01'
int(0)
Output for 8.3.5
Warning: PHP Startup: Unable to load dynamic library 'sodium.so' (tried: /usr/lib/php/8.3.5/modules/sodium.so (libsodium.so.23: cannot open shared object file: No such file or directory), /usr/lib/php/8.3.5/modules/sodium.so.so (/usr/lib/php/8.3.5/modules/sodium.so.so: cannot open shared object file: No such file or directory)) in Unknown on line 0
Using one byte from €:\xe2 \x82 \xac
int(1)
int(1)
int(1)
Avec €:
int(1)
Since the regex engine reads the string byte by byte, the length of the string isn't what you expected.
int(1)
With the (*UTF8) verb
###########################################################
(*UTF8): Now € is seen as an character, no more as a set of bytes
int(0)
bool(false)
bool(false)
Avec €:
int(1)
Test with '€aBC01'. Now the regex engine reads the string unicode point by unicode point (encoded in UTF-8), the length of the string is the number of characters. Since there's less than 8 characters the pattern fails..
int(0)
With the u modifier
###########################################################
The u modifier that is synonym of (*UTF8)(*UCP)
bool(false)
bool(false)
bool(false)
Avec €:
int(1)
Test with '€aBC01'
int(0)
Output for 7.1.0 - 7.1.33
Using one byte from €:\xe2 \x82 \xac
int(1)
int(1)
int(1)
Avec €:
int(1)
Since the regex engine reads the string byte by byte, the length of the string isn't what you expected.
int(1)
With the (*UTF8) verb
###########################################################
(*UTF8): Now € is seen as an character, no more as a set of bytes
bool(false)
bool(false)
bool(false)
Avec €:
int(1)
Test with '€aBC01'. Now the regex engine reads the string unicode point by unicode point (encoded in UTF-8), the length of the string is the number of characters. Since there's less than 8 characters the pattern fails..
int(0)
With the u modifier
###########################################################
The u modifier that is synonym of (*UTF8)(*UCP)
bool(false)
bool(false)
bool(false)
Avec €:
int(1)
Test with '€aBC01'
int(0)