Online PHP editor | output for RSgGe

<?php

/**
 * Strips diacritic marks (¨, ´, ~, `, ^, etc.) from a string by transliterating it
 * with the ICU rule 'Any-Latin;Latin-ASCII'.
 *
 * @param string $string The string from which the diacritic marks will be removed.
 * @return string|false String without diacritic marks, or false when the transliteration fails
 *                      (this is the failure value of transliterator_transliterate()).
 */
function removeDiacritics($string)
{
    return transliterator_transliterate('Any-Latin;Latin-ASCII', $string);
}

/**
 * Ensures that $string is UTF-8 encoded.
 *
 * A strict UTF-8 check is attempted first; only when it fails is the encoding guessed
 * with the 'auto' detection list, and the string converted from whatever was detected.
 *
 * @param string $string The string to ensure encoding
 * @return string The UTF-8 encoded string
 * @throws Exception If $string has no valid encoding
 */
function ensureUTF8Encoding($string)
{
    $encoding = mb_detect_encoding($string, 'UTF-8', true);
    if ($encoding === false) {
        $encoding = mb_detect_encoding($string, 'auto');
    }
    if ($encoding === false) {
        throw new Exception('No valid encoding detected');
    }

    if ($encoding === 'UTF-8') {
        return $string;
    }

    return mb_convert_encoding($string, 'UTF-8', $encoding);
}

/**
 * Lower-cases a string after making sure it is UTF-8 encoded.
 *
 * @param string $string The string to get lower-cased
 * @return string Lower-cased and UTF-8 encoded value of $string
 * @throws Exception If $string has no valid encoding
 */
function lowerCase($string)
{
    return mb_strtolower(ensureUTF8Encoding($string), 'UTF-8');
}

/**
 * Helper for compareWords(): turns a raw string into the list of words that take part
 * in the comparison.
 *
 * Steps, in order: remove every $skipText fragment, strip diacritics, collapse whitespace
 * runs into a single space, lower-case, trim, remove the $skipWords, split on whitespace.
 *
 * @param string $string The raw string
 * @param null|string[] $skipWords Whole words to drop (matched case-sensitively against the lower-cased text)
 * @param null|string[] $skipText Text fragments to strip from anywhere in the string
 * @return string[] The remaining words; never contains empty strings, may contain duplicates
 * @throws Exception If the string has no valid encoding
 */
function extractComparableWords($string, $skipWords, $skipText)
{
    if (!empty($skipText)) {
        $string = str_replace($skipText, '', $string);
    }

    $string = trim(lowerCase(preg_replace('/\s+/', ' ', removeDiacritics($string))));

    if ($string !== '' && !empty($skipWords)) {
        $quoted = array_map(
            static function ($item) {
                return preg_quote($item, '/');
            },
            $skipWords
        );
        // A skip word only matches as a whole token: delimited by whitespace or by the string boundaries.
        $skipWordsRegex = '/(?:(?<=\s)|^)(?:' . implode('|', $quoted) . ')(?:(?=\s)|$)/';
        $string = preg_replace($skipWordsRegex, '', $string);
    }

    // Removing a skip word leaves its surrounding spaces behind (e.g. ' tomato sauce').
    // PREG_SPLIT_NO_EMPTY keeps those gaps from turning into empty-string "words".
    return preg_split('/\s+/', $string, -1, PREG_SPLIT_NO_EMPTY);
}

/**
 * Performs a words comparison between $s1 and $s2. This function counts every word only once, so 'hello hello'
 * would be 100% similar to 'hello'. Both strings will get converted to lowercase, with its diacritical marks
 * (´, ¨, ^, `, ~, etc.) stripped out. This behaviour is very much like a 'keywords search'.
 *
 * The result is the number of distinct words present in both strings divided by the distinct word count
 * of the string that has more of them.
 *
 * @param string $s1 First string to compare
 * @param string $s2 Second string to compare
 * @param null|string[] $skipWords Value-only array of words that shouldn't be taken into account when comparing $s1 and $s2
 * @param null|string[] $skipText Value-only array of text that shouldn't be taken into account when comparing $s1 and $s2.
 *                                Please note that this text will be stripped from the end, start and middle parts of words.
 * @return float|int Percent of similarity between $s1 and $s2, where 1 represents 100% and 0 represents 0%.
 *                   0 is also returned when either argument is null or has no words left to compare.
 * @throws Exception If one of the strings has no valid encoding
 */
function compareWords($s1, $s2, $skipWords = [ 'en', 'de', 'del', 'los', 'la', 'in', 'from', 'the' ], $skipText = [ '.', ',', ';', ':' ])
{
    if ($s1 === null || $s2 === null) {
        return 0;
    }

    $s1Words = extractComparableWords($s1, $skipWords, $skipText);
    $s2Words = extractComparableWords($s2, $skipWords, $skipText);

    if (count($s1Words) === 0 || count($s2Words) === 0) {
        return 0;
    }

    // Flipping makes each word a key: duplicates collapse and the intersection is a key lookup.
    $s1Set = array_flip($s1Words);
    $s2Set = array_flip($s2Words);

    $matches = count(array_intersect_key($s1Set, $s2Set));
    $maxWords = max(count($s1Set), count($s2Set));

    return $matches / $maxWords;
}

var_dump(compareWords('The tomato sauce.', 'tomato sauce'));