<?php
const TK_WHITESPACES = 0,
TK_OPENING_PARENTHESIS = 2, // 0b00000010
TK_CLOSING_PARENTHESIS = 4, // 0b00000100
TK_LOGICAL_CONJUNCTION = 8, // 0b00001000
TK_LOGICAL_DISJUNCTION = 16, // 0b00010000
TK_LOGICAL_NEGATION = 32, // etc.
TK_STRING = 64,
TK_EQUAL = 128,
TK_NOT_EQUAL = 256,
TK_COLUMN_NAME = 512,
TK_END = 1024;
const ERR_NONE = 0,
ERROR = 1; // 0b00000001
const TOKENIZATION_ERROR = 3, // 0b00000011
ERR_UNEXPECTED_CHAR = 7, // 0b00000111
ERR_UNCLOSED_STRING = 11, // 0b00001011
ERR_BAD_UTF8 = 19, // 0b00010011
ERR_EMPTY_STRING = 35; // TE
const LOGIC_ERROR = 129, // 0b10000001
ERR_UNEXPECTED_TOKEN = 133, // 0b10000101
ERR_UNEXPECTED_CLOSING_PARENTHESIS = 137, // 0b10001001
ERR_MISSING_CLOSING_PARENTHESIS = 145, // 0b10010001
ERR_UNKNOW_COLUMN_NAME = 161, // 0b10100001
ERR_EMPTY_QUERY = 193; // L E
const RULES = [
TK_COLUMN_NAME => TK_EQUAL | TK_NOT_EQUAL,
TK_EQUAL => TK_STRING,
TK_NOT_EQUAL => TK_STRING,
TK_STRING => TK_END | TK_CLOSING_PARENTHESIS | TK_LOGICAL_DISJUNCTION | TK_LOGICAL_CONJUNCTION,
TK_CLOSING_PARENTHESIS => TK_END | TK_CLOSING_PARENTHESIS | TK_LOGICAL_DISJUNCTION | TK_LOGICAL_CONJUNCTION,
TK_LOGICAL_DISJUNCTION => TK_LOGICAL_NEGATION | TK_OPENING_PARENTHESIS | TK_COLUMN_NAME,
TK_LOGICAL_CONJUNCTION => TK_LOGICAL_NEGATION | TK_OPENING_PARENTHESIS | TK_COLUMN_NAME,
TK_LOGICAL_NEGATION => TK_OPENING_PARENTHESIS | TK_COLUMN_NAME,
TK_OPENING_PARENTHESIS => TK_OPENING_PARENTHESIS | TK_COLUMN_NAME
];
const COLUMN_NAMES = ['family', 'conditions_of_use', 'taxonomical_group'];
const ERROR_MESSAGES = [
ERR_UNEXPECTED_CHAR => 'caractère inattendu à la position %d.',
ERR_UNCLOSED_STRING => 'chaîne de caractères non fermée à la position %d.',
ERR_BAD_UTF8 => 'erreur d\'encodage UTF-8 à la position %d.',
ERR_EMPTY_STRING => 'chaîne vide à la position %d.', // *****
ERR_UNEXPECTED_TOKEN => 'élément inattendu à la position %d.',
ERR_UNEXPECTED_CLOSING_PARENTHESIS => 'parenthèse fermante inattendue à la position %d.',
ERR_MISSING_CLOSING_PARENTHESIS => 'parenthèse fermante manquante à la position %d.',
ERR_UNKNOW_COLUMN_NAME => 'nom de colonne inconnu à la position %d.',
ERR_EMPTY_QUERY => 'requête vide à la position %d.'
];
const TOKEN2SQL = [
TK_EQUAL => ' = ',
TK_NOT_EQUAL => ' <> ',
TK_CLOSING_PARENTHESIS => ')',
TK_OPENING_PARENTHESIS => '(',
TK_LOGICAL_DISJUNCTION => ' OR ',
TK_LOGICAL_CONJUNCTION => ' AND ',
TK_LOGICAL_NEGATION => 'NOT ',
TK_STRING => '?' // placeholder
];
function tokenize(string $str) {
$pattern = <<<'REGEX'
~
\s+ (*:TK_WHITESPACES)
| \( (*:TK_OPENING_PARENTHESIS)
| \) (*:TK_CLOSING_PARENTHESIS)
| \b ET \b (*:TK_LOGICAL_CONJUNCTION)
| \b OU \b (*:TK_LOGICAL_DISJUNCTION)
| \b NON \b (*:TK_LOGICAL_NEGATION)
| \B " [^"\\]* (?s: \\ . [^"\\]* )*
(?: " \B (*:TK_STRING) | " (*:ERR_UNEXPECTED_CHAR) | \z (*:ERR_UNCLOSED_STRING) )
| \b \w+ (*:TK_COLUMN_NAME)
| = (*:TK_EQUAL)
| (?: != | <> ) (*:TK_NOT_EQUAL)
| \z (*:TK_END)
| (*COMMIT) (*FAIL) # UNEXPECTED CHARACTER ⮕ tokenization aborted
~xu
REGEX;
$count = preg_match_all($pattern, $str, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
if ( preg_last_error() === PREG_BAD_UTF8_ERROR ) {
preg_match('~(?: # séquences UTF-8 valides
[\x00-\x7F]
| [\xC2-\xDF][\x80-\xBF]
| \xE0[\xA0-\xBF][\x80-\xBF]
| \xED[\x80-\x9F][\x80-\xBF]
| [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}
| \xF0[\x90-\xBF][\x80-\xBF]{2}
| \xF4[\x80-\x8F][\x80-\xBF]{2}
| [\xF1-\xF3][\x80-\xBF]{3}
)*~x', $str, $m);
return [ 'error' => ['type' => ERR_BAD_UTF8, 'offset' => strlen($m[0])] ];
}
if ( $count === 0 ) // (*COMMIT) (*FAIL) a été atteint dés la première position
return [ 'error' => ['type' => ERR_UNEXPECTED_CHAR, 'offset' => 0] ];
$errors = array_filter($matches, fn($m) => constant($m['MARK']) & ERROR);
if ( !empty($errors) ) { // la fin a été atteinte mais il y a des marqueurs d'erreurs parmi les tokens
$error = array_shift($errors);
list($match, $offset) = $error[0];
return [ 'error' => ['type' => constant($error['MARK']), 'offset' => $offset + strlen($match)] ];
}
$lastMatch = end($matches);
if ( constant($lastMatch['MARK']) !== TK_END ) { // (*COMMIT) (*FAIL) a été atteint avant la fin
list($match, $offset) = $lastMatch[0];
return [ 'error' => ['type' => ERR_UNEXPECTED_CHAR, 'offset' => $offset + strlen($match)] ];
}
if ( $count === 1 ) // l'unique token est TK_END, la chaîne est vide
return [ 'error' => ['type' => ERR_EMPTY_STRING, 'offset' => 0] ];
return [
'error' => ERR_NONE,
'tokens' => array_map(
fn($m) => [ 'value' => $m[0][0], 'offset' => $m[0][1], 'type' => constant($m['MARK']) ],
$matches
)
];
}
function checkLogic($tokens) {
$parenthesis = 0;
$current = current($tokens);
if ( $current['type'] === TK_END )
return [ 'type' => ERR_EMPTY_QUERY, 'offset' => $current['offset'] ];
do {
if ( $current['type'] === TK_COLUMN_NAME && !in_array($current['value'], COLUMN_NAMES) )
return [ 'type' => ERR_UNKNOW_COLUMN_NAME, 'offset' => $current['offset'] ];
if ( $current['type'] === TK_OPENING_PARENTHESIS )
$parenthesis++;
elseif ( $current['type'] === TK_CLOSING_PARENTHESIS && --$parenthesis < 0 )
return [ 'type' => ERR_UNEXPECTED_CLOSING_PARENTHESIS, 'offset' => $current['offset'] ];
$next = next($tokens);
if ( RULES[$current['type']] & $next['type'] ) // le token courant est suivi par un token autorisé
$current = $next;
else
return [ 'type' => ERR_UNEXPECTED_TOKEN, 'offset' => $next['offset'] ];
} while ( $next['type'] !== TK_END );
return $parenthesis ? [ 'type' => ERR_MISSING_CLOSING_PARENTHESIS, 'offset' => $current['offset'] ]
: true ;
}
function displayError($error, $query) {
$position = grapheme_strlen(substr($query, 0, $error['offset']));
$format = ERROR_MESSAGES[$error['type']]."\n%s\n" . str_repeat(' ', $position) . '^';
printf($format, $position, $query);
}
function buildQuery($tokens, $prefix = '') {
$SQLQuery = $prefix;
$params = [];
foreach ($tokens as $token) {
switch($token['type']) {
case TK_COLUMN_NAME:
$SQLQuery .= $token['value'];
break;
case TK_END: break;
case TK_STRING:
$param = substr($token['value'], 1, -1);
$params[] = strtr($param, ['\\\\' => '\\\\', '\\"' => '"']);
default:
$SQLQuery .= TOKEN2SQL[$token['type']];
}
}
return [$SQLQuery, $params];
}
$query = "".'family="_\\"bîd'."u\xCC\x8A".'le\\"_" OU (((taxonomical_group="machin")))';
$tokenization = tokenize($query);
$error = $tokenization['error'];
if ( $error !== ERR_NONE ) {
displayError($error, $query);
} else {
$tokens = array_filter($tokenization['tokens'], fn($token) => $token['type'] !== TK_WHITESPACES);
if ( true !== $error = checkLogic($tokens) ) {
displayError($error, $query);
} else {
$prefix = 'SELECT * FROM matable WHERE ';
list($SQLQuery, $params) = buildQuery($tokens, $prefix);
echo "requête: $query", PHP_EOL,
"requête préparée: $SQLQuery", PHP_EOL,
'paramètres: ', print_r($params, true);
// $sth = $dbh->prepare($SQLQuery);
// $sth->execute($params);
}
}
- Output for 7.4.0 - 7.4.33, 8.0.1 - 8.0.30, 8.1.0 - 8.1.33, 8.2.0 - 8.2.29, 8.3.0 - 8.3.25, 8.4.1 - 8.4.12
- requête: family="_\"bîdůle\"_" OU (((taxonomical_group="machin")))
requête préparée: SELECT * FROM matable WHERE family = ? OR (((taxonomical_group = ?)))
paramètres: Array
(
[0] => _"bîdůle"_
[1] => machin
)
preferences:
320.4 ms | 407 KiB | 5 Q