Skip to content

Instantly share code, notes, and snippets.

@flacle
Last active February 13, 2016 20:07
Show Gist options
  • Save flacle/97a5baf72888d0f7b9c9 to your computer and use it in GitHub Desktop.
Save flacle/97a5baf72888d0f7b9c9 to your computer and use it in GitHub Desktop.
Replaces end quotes with the appropriate accent (Italian)
<?php
/**
* The MIT License (MIT)
*
* Copyright (c) 2016 Francis Laclé
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
/**
 * Replaces an end quote that stands in for an accent mark with the
 * appropriate accented vowel for Italian (e.g. "citta'" becomes "città" and
 * "poiche'" becomes "poiché").
 *
 * The text is split on runs of ASCII whitespace (space, tab, CR, LF); the
 * separators are copied to the result untouched and every word in between is
 * converted on its own, so the spacing of the input is preserved exactly.
 *
 * Diacritical rules taken from:
 * http://italian.about.com/od/pronunciation/fl/italian-accent-marks.htm
 * Limitations:
 * - does not take da' (imperative of dare) into account
 * - does not take di' (imperative form of dire) into account
 *
 * This file must be saved as UTF-8, as the accented letters are written
 * literally in the code.
 *
 * @param string $str UTF-8 text to convert. It is taken by value and never
 *                    modified, so literals and expressions may be passed too.
 * @return string The text with the end quotes replaced by accented vowels.
 * Author: Francis Laclé
 */
function replaceEndQuote($str) {
    $str = (string) $str;
    // The capture group makes preg_split() return the separators as well:
    // even entries are words (possibly empty), odd entries are whitespace.
    $tokens = preg_split('/([ \t\r\n]+)/', $str, -1, PREG_SPLIT_DELIM_CAPTURE);
    if ($tokens === false) {
        // Splitting failed, hand the text back rather than losing it
        return $str;
    }
    $out = '';
    foreach ($tokens as $index => $token) {
        $out .= ($index % 2 === 1) ? $token : replaceEndQuoteInWord($token);
    }
    return $out;
}

/**
 * Converts a single whitespace-free word for replaceEndQuote().
 *
 * The word is returned unchanged unless it ends (ignoring at most one
 * trailing non-letter character such as a comma or a dot) with one of the
 * vowels a, e, i, o, u followed by a single quote. The case of the vowel is
 * carried over to the accented letter; all other characters are kept as is.
 *
 * @param string $word A word without any whitespace.
 * @return string The converted word, or the word itself if nothing applies.
 */
function replaceEndQuoteInWord($word) {
    $encoding = 'UTF-8';
    // Vowels other than 'e' always take the grave accent
    $graveVowels = array('a' => 'à', 'i' => 'ì', 'o' => 'ò', 'u' => 'ù');
    // The plain vowels, used to decide which accent an 'e' takes
    $vowels = array('a', 'e', 'i', 'o', 'u');

    // Nothing to do for empty words. Words starting with a single quote are
    // discarded (quoted text such as 'ROMA'), and words that are not valid
    // UTF-8 are left alone instead of being cut in the middle of a character.
    if ($word === '' || $word[0] === '\'' || !mb_check_encoding($word, $encoding)) {
        return $word;
    }

    // Set aside one trailing non-letter character (comma, exclamation, dot,
    // etc.) so that the quote in front of it can still be recognised.
    $core = $word;
    $suffix = '';
    $lastChar = mb_substr($core, -1, 1, $encoding);
    if ($lastChar !== '\'' && preg_match('/\p{L}/u', $lastChar) !== 1) {
        $suffix = $lastChar;
        $core = mb_substr($core, 0, -1, $encoding);
    }

    // Only process words that end with a single quote
    if (mb_substr($core, -1, 1, $encoding) !== '\'') {
        return $word;
    }

    // The word cannot start with the quote here, so it has at least 2 characters
    $length = mb_strlen($core, $encoding);
    $lowerCore = mb_strtolower($core, $encoding);
    $vowel = mb_substr($core, -2, 1, $encoding);
    $lowerVowel = mb_strtolower($vowel, $encoding);

    if ($lowerCore === 'de\'') {
        // Special case: de' keeps its quote
        return $word;
    }

    if ($lowerCore === 'e\'' || $lowerCore === 'te\'') {
        // Special cases: e' as a verb and te' (tea) take the grave accent
        $accented = 'è';
    } elseif ($lowerVowel === 'e') {
        // 'e' takes a grave accent after a vowel and an acute accent otherwise.
        // The word has at least 3 characters here, as e' was handled above.
        $before = mb_strtolower(mb_substr($core, -3, 1, $encoding), $encoding);
        $accented = in_array($before, $vowels, true) ? 'è' : 'é';
    } elseif (isset($graveVowels[$lowerVowel])) {
        $accented = $graveVowels[$lowerVowel];
    } else {
        // The quote does not follow a vowel, so it is not an accent
        return $word;
    }

    // An upper case vowel gets an upper case accented letter
    if ($vowel !== $lowerVowel) {
        $accented = mb_strtoupper($accented, $encoding);
    }

    // Swap only the trailing vowel + quote; the rest of the word is untouched
    return mb_substr($core, 0, $length - 2, $encoding) . $accented . $suffix;
}
// Demo run: sends a sample sentence with lower and upper case words plus a few
// exceptions/edge-cases through replaceEndQuote() and prints the input
// followed by the converted text as plain UTF-8.
header('Content-type: text/plain; charset=utf-8');
$str = "Questa CitTA' citta' di nome 'ROMA' E' e' davvero bella "
    . "poiche' Poiche' molto te' TE', antica piu' PIU'.";
echo 'Input: ', $str, PHP_EOL, '- - - - - -', PHP_EOL;
echo 'Result: ', replaceEndQuote($str);
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment