<?php
/**
 * Strips diacritic marks (¨, ´, ~, `, ^, etc.) from a string.
 *
 * When the intl extension is available the ICU transform 'Any-Latin;Latin-ASCII' is used.
 * If intl is missing, or the transform fails, a best-effort fallback is applied: it only
 * folds the accented Latin letters and ligatures that have a named HTML 4.01 entity
 * (e.g. á, ñ, ü, ç, å, ø, š, æ, œ, ß) and leaves every other character untouched.
 *
 * @param string $string The string from which the diacritic marks will be removed.
 * @return string String without diacritic marks.
 */
function removeDiacritics($string) {
    // ext-intl is optional; calling the function blindly is a fatal error without it.
    if (function_exists('transliterator_transliterate')) {
        $transliterated = transliterator_transliterate('Any-Latin;Latin-ASCII', $string);
        // The transform reports failure by returning false; fall through in that case.
        if ($transliterated !== false) {
            return $transliterated;
        }
    }

    // Fallback: accented letters become named entities such as "&eacute;", from which the
    // base letter is recovered. A literal "&" in the input is encoded as "&amp;" first, so
    // entity-looking text in the input cannot be mistaken for a real entity.
    $encoded = htmlentities($string, ENT_NOQUOTES | ENT_SUBSTITUTE, 'UTF-8');
    $encoded = str_replace('&szlig;', 'ss', $encoded);
    $encoded = preg_replace(
        '/&([A-Za-z])(?:acute|grave|circ|tilde|uml|cedil|ring|slash|caron);/',
        '$1',
        $encoded
    );
    $encoded = preg_replace('/&([A-Za-z]{2})lig;/', '$1', $encoded);

    return html_entity_decode($encoded, ENT_NOQUOTES, 'UTF-8');
}
/**
 * Returns $string as UTF-8.
 *
 * The string is first checked strictly against UTF-8. Only if that check fails is the
 * encoding guessed with mb_detect_encoding()'s 'auto' list, and the string is converted
 * from the guessed encoding when that encoding is not UTF-8.
 *
 * @param string $string The string whose encoding should be ensured
 * @return string The UTF-8 encoded string
 * @throws Exception If no encoding could be detected for $string
 */
function ensureUTF8Encoding($string) {
    // Strict UTF-8 check first; fall back to the 'auto' candidate list when it fails.
    $detected = mb_detect_encoding($string, 'UTF-8', true) ?: mb_detect_encoding($string, 'auto');

    if ($detected === false) {
        throw new Exception('No valid encoding detected');
    }

    if ($detected === 'UTF-8') {
        return $string;
    }

    return mb_convert_encoding($string, 'UTF-8', $detected);
}
/**
 * Lower-cases a string with multibyte awareness.
 *
 * The input is passed through ensureUTF8Encoding() first, so the result is UTF-8.
 *
 * @param string $string The string to lower-case
 * @return string Lower-cased, UTF-8 encoded value of $string
 */
function lowerCase($string) {
    $utf8 = ensureUTF8Encoding($string);

    return mb_strtolower($utf8, 'UTF-8');
}
/**
 * Performs a word comparison between $s1 and $s2. Every word is counted only once, so 'hello hello'
 * is 100% similar to 'hello'. Both strings are converted to lowercase and have their diacritical marks
 * (´, ¨, ^, `, ~, etc.) stripped out. This behaviour is very much like a 'keywords search'.
 *
 * Skip words are put into the same canonical form (diacritics removed, lower-cased, whitespace collapsed)
 * as the compared texts before they are removed, so 'The' in a text is dropped by the skip word 'the'.
 *
 * @param string $s1 First string to compare
 * @param string $s2 Second string to compare
 * @param null|string[] $skipWords Value-only array of words that shouldn't be taken into account when
 * comparing $s1 and $s2. Matching ignores letter case and diacritics.
 * @param null|string[] $skipText Value-only array of text that shouldn't be taken into account when comparing
 * $s1 and $s2. Please note that this text is stripped from the end, start and middle parts of words, and
 * that it is matched literally (case-sensitively) against the raw input.
 * @return float|int Percent of similarity between $s1 and $s2, where 1 represents 100% and 0 represents 0%.
 * 0 is also returned when either string is null or has no words left after filtering.
 */
function compareWords($s1, $s2, $skipWords = [
    'en', 'de', 'del', 'los', 'la', 'in', 'from', 'the'
], $skipText = [
    '.', ',', ';', ':'
])
{
    if ($s1 === null || $s2 === null) {
        return 0;
    }

    // Canonical form shared by the compared texts and the skip words: no diacritics,
    // lower case, runs of whitespace collapsed to a single space, no outer whitespace.
    $normalize = static function ($text) {
        return trim(lowerCase(preg_replace('/\s+/', ' ', removeDiacritics($text))));
    };

    // Build the skip-word pattern from normalized words. After normalization the only
    // separator left is a single space, so a word boundary is "not next to a non-space".
    $skipWordsRegex = null;
    if (!empty($skipWords)) {
        $alternatives = [];
        foreach ($skipWords as $skipWord) {
            $skipWord = $normalize($skipWord);
            // An empty alternative would match everywhere; ignore such entries.
            if ($skipWord !== '') {
                $alternatives[] = preg_quote($skipWord, '/');
            }
        }
        if (count($alternatives) > 0) {
            $skipWordsRegex = '/(?<![^ ])(?:' . implode('|', $alternatives) . ')(?![^ ])/';
        }
    }

    // Turns one input text into its list of distinct, relevant words.
    $toWords = static function ($text) use ($normalize, $skipText, $skipWordsRegex) {
        if (!empty($skipText)) {
            $text = str_replace($skipText, '', $text);
        }
        $text = $normalize($text);
        if ($skipWordsRegex !== null) {
            $stripped = preg_replace($skipWordsRegex, '', $text);
            // preg_replace() yields null on a PCRE error; keep the unfiltered text then.
            if ($stripped !== null) {
                $text = $stripped;
            }
        }

        // Removing skip words leaves extra spaces behind; splitting on runs drops them.
        return array_unique(preg_split('/ +/', $text, -1, PREG_SPLIT_NO_EMPTY));
    };

    $s1Words = $toWords($s1);
    $s2Words = $toWords($s2);

    $s1WordsCount = count($s1Words);
    $s2WordsCount = count($s2Words);
    if ($s1WordsCount === 0 || $s2WordsCount === 0) {
        return 0;
    }

    // Both lists are already de-duplicated, so the intersection size is the match count.
    $matches = count(array_intersect($s1Words, $s2Words));

    return $matches / max($s1WordsCount, $s2WordsCount);
}
// Example usage: dump the similarity score computed for two sample phrases.
var_dump(compareWords('The tomato sauce.', 'tomato sauce'));
/*
Recorded output of running this script (copied verbatim):

Warning: PHP Startup: Unable to load dynamic library 'sodium.so' (tried: /usr/lib/php/8.3.5/modules/sodium.so (libsodium.so.23: cannot open shared object file: No such file or directory), /usr/lib/php/8.3.5/modules/sodium.so.so (/usr/lib/php/8.3.5/modules/sodium.so.so: cannot open shared object file: No such file or directory)) in Unknown on line 0
float(0.6666666666666666)
Output for 7.3.32 - 7.3.33, 7.4.33, 8.0.13
Fatal error: Uncaught Error: Call to undefined function transliterator_transliterate() in /in/fqaEA:8
Stack trace:
#0 /in/fqaEA(67): removeDiacritics('The tomato sauc...')
#1 /in/fqaEA(97): compareWords('The tomato sauc...', 'tomato sauce')
#2 {main}
thrown in /in/fqaEA on line 8
Process exited with code 255.
*/