<?php
/**
 * Over-engineered solution to most capitalisation issues.
 *
 * Heuristic capitalisation of names, titles and addresses. Every word is
 * classified (always-lowercase word, abbreviation, honorific, apostrophe
 * prefixed name, ordinary word) and a handful of whole-string fix-ups are
 * applied afterwards (Mc prefixes, Roman numerals, surname prefixes,
 * ordinal numbers).
 *
 * Requires ext-mbstring. ext-intl is used for diacritic folding when it is
 * loaded; without it an approximation is used (see foldToAscii()).
 *
 * Version 1.0
 */
class str {
    /**
     * Words or abbreviations that should always be all uppercase
     */
    public const ALL_UPPERCASE = [
        "UK",
        "VAT",
    ];

    /**
     * Words or abbreviations that should always be all lowercase
     */
    public const ALL_LOWERCASE = [
        "and",
        "as",
        "by",
        "in",
        "of",
        "or",
        "to",
    ];

    /**
     * Honorifics that only contain consonants.
     *
     * Words without vowels are normally treated as abbreviations and
     * upper-cased; the entries in this list are exempt from that rule.
     */
    public const CONSONANT_ONLY_HONORIFICS = [
        # English
        "Mr",
        "Mrs",
        "Ms",
        "Dr",
        "Br",
        "Sr",
        "Fr",
        "Pr",
        "St",
        # Afrikaans
        "Mnr",
    ];

    /**
     * Surname prefixes that should be lowercase,
     * unless not following another word (firstname).
     */
    public const SURNAME_PREFIXES = [
        "de la",
        "de las",
        "van de",
        "van der",
        "vit de",
        "von",
        "van",
        "del",
        "der",
    ];

    /**
     * Capitalises every (appropriate) word in a given string.
     *
     * Steps, in order:
     *  1. Runs of two or more whitespace characters (line breaks included)
     *     collapse to one space, and a space is forced after every comma.
     *  2. Every word (a run of characters other than whitespace, "-" and ".")
     *     is re-cased by capitaliseWord().
     *  3. Whole-string fix-ups: "Mc" + consonant, Roman numerals, surname
     *     prefixes between two words, ordinal numbers.
     *
     * @param string|null $string
     *
     * @return string|null The capitalised string. Null, "" and "0" are
     *                     returned unchanged.
     */
    public static function capitalise(?string $string): ?string
    {
        # Nothing to capitalise in null, "" or "0"
        if (!$string) {
            return $string;
        }

        # Match in Unicode mode whenever the input is valid UTF-8, so that \b, \w and \s
        # treat accented letters as letters (otherwise "Lévi" is split at the "é" and
        # its "vi" is mistaken for a Roman numeral). Invalid UTF-8 keeps byte-wise matching.
        $u = preg_match('//u', $string) === 1 ? 'u' : '';

        # Strip away multi-spaces
        $string = preg_replace('/\s{2,}/' . $u, ' ', $string);

        # Ensure there is always a space after a comma
        $string = preg_replace('/,([^\s])/' . $u, ', $1', $string);

        # A word is anything separated by spaces, a dash or a full stop
        $string = preg_replace_callback('/([^\s\-\.]+)/' . $u, static function (array $matches) use ($u): string {
            return self::capitaliseWord($matches[1], $u);
        }, $string);

        # Cater for the Mc prefix (Mc followed by a consonant)
        $string = preg_replace_callback('/(Mc)([b-df-hj-np-tv-z])/', static function (array $matches): string {
            return 'Mc' . ucfirst($matches[2]);
        }, $string);

        # Cater for Roman numerals (need to be in all caps)
        $pattern = '/\b((?<![MDCLXVI])(?=[MDCLXVI])M{0,3}(?:C[MD]|D?C{0,3})(?:X[CL]|L?X{0,3})(?:I[XV]|V?I{0,3}))\b/i' . $u;
        $string = preg_replace_callback($pattern, static function (array $matches): string {
            return strtoupper($matches[1]);
        }, $string);

        # Cater for surname prefixes (must be after the Roman numerals).
        # A prefix only counts when it is bookended by words.
        $pattern = '/\b (' . implode('|', self::SURNAME_PREFIXES) . ') \b/i' . $u;
        $string = preg_replace_callback($pattern, static function (array $matches): string {
            return strtolower(" {$matches[1]} ");
        }, $string);

        # Cater for ordinal numbers (a number suffixed with an ordinal)
        $pattern = '/\b(\d+(?:st|nd|rd|th))\b/i' . $u;
        $string = preg_replace_callback($pattern, static function (array $matches): string {
            return strtolower($matches[1]);
        }, $string);

        # And we're done
        return $string;
    }

    /**
     * Decides the casing of a single word.
     *
     * @param string $rawWord  The word exactly as it appeared in the input.
     * @param string $modifier "u" when the input is valid UTF-8, "" otherwise;
     *                         appended to the regex modifiers used here.
     *
     * @return string The re-cased word.
     */
    private static function capitaliseWord(string $rawWord, string $modifier): string
    {
        # Make the word lowercase
        $word = mb_strtolower($rawWord);

        # If the word needs to be all lowercase
        if (in_array($word, self::ALL_LOWERCASE, true)) {
            return $word;
        }

        # If the word needs to be all uppercase
        $upper = mb_strtoupper($word);
        if (in_array($upper, self::ALL_UPPERCASE, true)) {
            return $upper;
        }

        # Create a version without diacritics
        $ascii_word = self::foldToAscii($word);

        # If the word contains non-alpha characters (numbers, &, etc), with exceptions (comma, '), assume it's an abbreviation
        if (preg_match('/[^a-z,\']/i', $ascii_word)) {
            return $upper;
        }

        # If the word doesn't contain any vowels, assume it's an abbreviation, unless the word is an honorific
        if (!preg_match('/[aeiouy]/i', $ascii_word)
            && !in_array(self::upperFirst($word), self::CONSONANT_ONLY_HONORIFICS, true)) {
            return $upper;
        }

        # NOTE: an earlier rule meant to upper-case words of three characters or fewer that
        # contain a doubled vowel never fired: its pattern was written in double quotes as
        # "/([aeiouy])\1/", where "\1" is the control character chr(1) rather than a
        # back-reference. It is deliberately left out instead of being switched on, because
        # it would start shouting everyday words such as "Lee", "Zee" or "aan".

        # Ensure O'Connor, L'Oreal, etc, are double capitalised, with exceptions (d').
        # The replacement happens in place so that whatever surrounds the name
        # (a trailing comma, a possessive 's) is kept.
        $hits = 0;
        $prefixed = preg_replace_callback(
            '/\b([a-z]\')(\w+)\b/i' . $modifier,
            static function (array $match): string {
                # Some prefixes (like d') are not capitalised, everything else is
                $prefix = $match[1] === "d'" ? $match[1] : strtoupper($match[1]);

                return $prefix . self::upperFirst($match[2]);
            },
            $word,
            -1,
            $hits
        );
        if ($prefixed !== null && $hits > 0) {
            return $prefixed;
        }

        # Otherwise, return the word with the first letter (only) capitalised.
        # This is the most common outcome.
        return self::upperFirst($word);
    }

    /**
     * Multibyte-safe ucfirst(): title-cases the first character and leaves the rest alone.
     *
     * @param string $text
     *
     * @return string
     */
    private static function upperFirst(string $text): string
    {
        return mb_convert_case(mb_substr($text, 0, 1), MB_CASE_TITLE) . mb_substr($text, 1);
    }

    /**
     * Returns a lowercase version of the word without diacritics, used only to
     * classify the word (never returned to the caller of capitalise()).
     *
     * With ext-intl the word is transliterated to plain Latin letters; the
     * transliterator is built once and reused for every word. Without ext-intl
     * (or when transliteration fails) every non-ASCII letter or combining mark
     * is replaced by "a". That is only an approximation, but it keeps accented
     * words from being mistaken for abbreviations.
     *
     * @param string $word Lowercased word.
     *
     * @return string
     */
    private static function foldToAscii(string $word): string
    {
        static $transliterator = null;
        static $initialised = false;

        if (!$initialised) {
            $initialised = true;
            if (class_exists(\Transliterator::class)) {
                # createFromRules() yields null when the rules cannot be compiled
                $transliterator = \Transliterator::createFromRules(':: Any-Latin; :: Latin-ASCII; :: NFD; :: [:Nonspacing Mark:] Remove; :: Lower(); :: NFC;', \Transliterator::FORWARD);
            }
        }

        if ($transliterator !== null) {
            $folded = $transliterator->transliterate($word);
            if ($folded !== false) {
                return $folded;
            }
        }

        # preg_replace() returns null on malformed UTF-8; classify the raw word in that case
        $approximated = preg_replace('/(?![\x00-\x7F])[\p{L}\p{M}]/u', 'a', $word);

        return $approximated === null ? $word : $approximated;
    }
}
# Demo input: one name, title or place per line. Mostly shouted in upper case,
# followed by a batch of Dutch names that exercise the surname-prefix handling.
$sample_names = "
DONALD MCDONALD
SINEAD O'CONNOR
JOHAN VAN ZYL
OSCAR DE LA HOYA
P.F. CHANG
KFC
ST. JOHN
DR ZEUZ
PROF. GREEN
VAN DER BERG THE 3RD
SÃO JOÃO DOS SANTOS
KING HENRY VII
KUJE'S HIGH,ROAD
FLUG-HAFEN
FLUGIG-O'DONNALD
MARY O'CALLAHAN
JOHN O'DONALD
THE O'CALLAHAN-O'DONALD RESIDENCE
2ND NOVEMBER STREET
The 15th king of scotland
FCT
MICHAEL VIVA
GINA C.A. KOTOR
DUTCH NAMES
van der vaart
van vollenhoven
van 't zandt
van het zand
el hamdoie
van der Rooi-van Velzen
Zuidewijn - van rooien
teggelen onder t boven
guido op 't drooge
friso van drooge
Zuidewijn - van rooien
teggelen onder t boven
ZUID-HOLLAND
's hertogen-bosch
De Rooi Van Zuidewijn
van onder
Van Der Wijk-Zeewuster
de Vries-van der Leest
Den Oudsten - van 't Veldt
Hare Koninklijke Hoogheid Alexia Juliana Marcela Laurentien Prinses der Nederlanden, Prinses van Oranje-Nassau
Hare Koninklijke Hoogheid Máxima, Prinses der Nederlanden, Prinses van Oranje-Nassau, Mevrouw van Amsberg
van Lippe-Biesterfeld van Vollenhoven
";

# Run the whole list through the capitaliser in one go and dump the result
$capitalised_names = str::capitalise($sample_names);
var_dump($capitalised_names);
string(978) "
Donald McDonald
Sinead O'Connor
Johan van Zyl
Oscar de la Hoya
P.F. Chang
KFC
St. John
Dr Zeuz
Prof. Green
Van der Berg The 3rd
São João Dos Santos
King Henry VII
Kuje's High, Road
Flug-Hafen
Flugig-O'Donnald
Mary O'Callahan
John O'Donald
The O'Callahan-O'Donald Residence
2nd November Street
The 15th King of Scotland
FCT
Michael Viva
Gina C.A. Kotor
Dutch Names
Van der Vaart
Van Vollenhoven
Van 'T Zandt
Van Het Zand
El Hamdoie
Van der Rooi-Van Velzen
Zuidewijn - Van Rooien
Teggelen Onder T Boven
Guido Op 'T Drooge
Friso van Drooge
Zuidewijn - Van Rooien
Teggelen Onder T Boven
Zuid-Holland
'S Hertogen-Bosch
De Rooi van Zuidewijn
Van Onder
Van der Wijk-Zeewuster
De Vries-Van der Leest
Den Oudsten - Van 'T Veldt
Hare Koninklijke Hoogheid Alexia Juliana Marcela Laurentien Prinses der Nederlanden, Prinses van Oranje-Nassau
Hare Koninklijke Hoogheid Máxima, Prinses der Nederlanden, Prinses van Oranje-Nassau, Mevrouw van Amsberg
Van Lippe-Biesterfeld van Vollenhoven
"
Output for 8.3.5
Warning: PHP Startup: Unable to load dynamic library 'sodium.so' (tried: /usr/lib/php/8.3.5/modules/sodium.so (libsodium.so.23: cannot open shared object file: No such file or directory), /usr/lib/php/8.3.5/modules/sodium.so.so (/usr/lib/php/8.3.5/modules/sodium.so.so: cannot open shared object file: No such file or directory)) in Unknown on line 0
string(978) "
Donald McDonald
Sinead O'Connor
Johan van Zyl
Oscar de la Hoya
P.F. Chang
KFC
St. John
Dr Zeuz
Prof. Green
Van der Berg The 3rd
São João Dos Santos
King Henry VII
Kuje's High, Road
Flug-Hafen
Flugig-O'Donnald
Mary O'Callahan
John O'Donald
The O'Callahan-O'Donald Residence
2nd November Street
The 15th King of Scotland
FCT
Michael Viva
Gina C.A. Kotor
Dutch Names
Van der Vaart
Van Vollenhoven
Van 'T Zandt
Van Het Zand
El Hamdoie
Van der Rooi-Van Velzen
Zuidewijn - Van Rooien
Teggelen Onder T Boven
Guido Op 'T Drooge
Friso van Drooge
Zuidewijn - Van Rooien
Teggelen Onder T Boven
Zuid-Holland
'S Hertogen-Bosch
De Rooi van Zuidewijn
Van Onder
Van der Wijk-Zeewuster
De Vries-Van der Leest
Den Oudsten - Van 'T Veldt
Hare Koninklijke Hoogheid Alexia Juliana Marcela Laurentien Prinses der Nederlanden, Prinses van Oranje-Nassau
Hare Koninklijke Hoogheid Máxima, Prinses der Nederlanden, Prinses van Oranje-Nassau, Mevrouw van Amsberg
Van Lippe-Biesterfeld van Vollenhoven
"