<?php
/**
 * Performs a word comparison between $s1 and $s2. Every word is counted only once, so 'hello hello'
 * is 100% similar to 'hello'. Both strings are converted to lowercase and have their diacritical marks
 * (´, ¨, ^, `, ~, etc.) stripped out before being compared. This behaviour is very much like a
 * 'keywords search'.
 *
 * Processing order: $skipText is removed from the raw strings, the strings are normalised (diacritics
 * stripped, lowercased), $skipWords are removed, whitespace is collapsed and, finally, the unique words
 * of both strings are intersected.
 *
 * @param string|null $s1 First string to compare
 * @param string|null $s2 Second string to compare
 * @param null|string[] $skipWords Value-only array of words that shouldn't be taken into account when
 * comparing $s1 and $s2. They are matched as whole, whitespace-delimited words; both the words and the
 * compared strings are normalised first, so 'The' is skipped when 'the' is listed.
 * @param null|string[] $skipText Value-only array of text that shouldn't be taken into account when
 * comparing $s1 and $s2. Please note that this text is stripped (case-sensitively) from the end, start
 * and middle parts of words.
 * @return float|int Percent of similarity between $s1 and $s2, where 1 represents 100% and 0 represents 0%.
 * 0 is also returned when either string is null or has no words left after filtering.
 */
function compareWords($s1, $s2, $skipWords = [
    'en', 'de', 'del', 'los', 'la', 'in', 'from', 'the'
], $skipText = [
    '.', ',', ';', ':'
])
{
    if ($s1 === null || $s2 === null) {
        return 0;
    }

    // Strip unwanted text fragments from the raw strings, wherever they appear.
    if ($skipText !== null && count($skipText) > 0) {
        $s1 = str_replace($skipText, '', $s1);
        $s2 = str_replace($skipText, '', $s2);
    }

    // Normalise BEFORE removing skip words: the skip-word regex is case-sensitive, so matching against
    // the raw text would leave capitalised or accented variants ('The', 'Él') in place.
    $normalise = function ($text) {
        return UString::lowerCase(UString::removeDiacritics($text));
    };
    $s1 = $normalise($s1);
    $s2 = $normalise($s2);

    if ($skipWords !== null && count($skipWords) > 0) {
        // Skip words get the same normalisation as the text so both sides stay comparable.
        $quotedWords = array_map(function ($word) use ($normalise) {
            return preg_quote($normalise($word), '/');
        }, $skipWords);
        // Lookarounds restrict matches to whole words delimited by whitespace or the string edges,
        // without consuming the delimiter (so consecutive skip words are all removed).
        $skipWordsRegex = '/(?:(?<=\s)|^)(?:' . implode('|', $quotedWords) . ')(?:(?=\s)|$)/';
        $s1 = preg_replace($skipWordsRegex, '', $s1);
        $s2 = preg_replace($skipWordsRegex, '', $s2);
    }

    // Collapse the whitespace runs left behind by the removals so explode() yields no empty words.
    $s1 = trim(preg_replace('/\s+/', ' ', $s1));
    $s2 = trim(preg_replace('/\s+/', ' ', $s2));
    if ($s1 === '' || $s2 === '') {
        return 0;
    }

    // Flipping turns each word into a key: this de-duplicates the words and lets the shared ones be
    // counted with a single key intersection.
    $s1Words = array_flip(explode(' ', $s1));
    $s2Words = array_flip(explode(' ', $s2));
    $matches = count(array_intersect_key($s1Words, $s2Words));

    // The score is relative to the larger vocabulary, so extra words on either side lower it.
    return $matches / max(count($s1Words), count($s2Words));
}
// Ad-hoc smoke check: dumps the similarity score computed for two sample phrases.
var_dump(compareWords('The tomato sauce.', 'tomato sauce'));