Last active
February 25, 2022 13:56
-
-
Save brunoro/e9ab9c1e662f7e8db188c3a29515246a to your computer and use it in GitHub Desktop.
Serbilian Cyrillic: a Cyrillic script for Brazilian Portuguese based on Serbian Cyrillic
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/ruby
require 'logger'
require 'open3'
require 'pragmatic_tokenizer'
require 'unicode_utils'
# Transliterates Brazilian Portuguese text into "Serbilian", a Cyrillic-based
# script. Each word token is converted to IPA phonemes by the external
# `espeak` program and every phoneme is then mapped to its spelling through
# PHONEME_TABLE. All functionality is exposed as class methods.
class Serbilian
  # One-letter words whose phoneme is given directly instead of being asked
  # from espeak. The values are fed to ipa_to_serbilian, so every value must
  # be a key of PHONEME_TABLE.
  SINGLE_LETTER_WORDS = {
    "à" => "a",
    "ã" => "ɐ̃",
    "â" => "ɐ̃",
    # Was the Cyrillic letter "э", which is not a PHONEME_TABLE key and made
    # the word "é" vanish with a "missing phoneme" warning.
    "é" => "ɛ",
    "ê" => "e",
    "o" => "у",
    "ó" => "ɔ",
    "ô" => "o",
    "õ" => "õ",
    "í" => "i",
    "u" => "у"
  }.freeze

  # Maps a phoneme to its [capitalized, lowercase] Serbilian spelling.
  PHONEME_TABLE = {
    "" => ["", ""],
    "a" => ["А", "а"],
    "æ" => ["А", "а"],
    "ɐ" => ["∀", "ɐ"],
    "ɐ̃" => ["∀", "ɐ"],
    "ã" => ["∀", "ɐ"],
    "aɪ" => ["Aи", "aи"],
    "ɐ̃ʊ̃" => ["∀o", "ɐo"],
    "aʊ" => ["Aу", "aу"],
    "b" => ["Б", "б"],
    "d" => ["Д", "д"],
    "dʒ" => ["Џ", "џ"],
    "e" => ["Е", "е"],
    "eɪ" => ["E", "e"],
    "ɛ" => ["Э", "э"],
    "ə" => ["", ""],
    "f" => ["Ф", "ф"],
    "ɡ" => ["Г", "г"],
    "i" => ["И", "и"],
    "iʊ" => ["Иу", "иу"],
    "ɪ" => ["Ј", "ј"],
    "j" => ["Ј", "ј"],
    "ʒ" => ["Ж", "ж"],
    "k" => ["К", "к"],
    "l" => ["Л", "л"],
    "ʎ" => ["Љ", "љ"],
    "m" => ["М", "м"],
    "n" => ["Н", "н"],
    "ŋ" => ["Н", "н"],
    "ɲ" => ["Њ", "њ"],
    "o" => ["О", "о"],
    "õ" => ["Õ", "õ"],
    "oɪ" => ["Oи", "oи"],
    "ɔ" => ["Ɔ", "ɔ"],
    "p" => ["П", "п"],
    "ɾ" => ["Р", "р"],
    "r" => ["Р", "р"],
    "x" => ["X", "x"],
    "ʁ" => ["X", "x"],
    "s" => ["С", "с"],
    "ʃ" => ["Ш", "ш"],
    "t" => ["Т", "т"],
    "tʃ" => ["Ч", "ч"],
    # NOTE(review): the original listed a visually identical "ũ" key twice,
    # presumably once per Unicode normalization form. Both forms are spelled
    # out explicitly here so they cannot be collapsed by an editor.
    "\u0169" => ["У", "у"],  # precomposed u-with-tilde
    "u\u0303" => ["У", "у"], # u followed by a combining tilde
    "ʊ" => ["У", "у"],
    "ʊ̃" => ["У", "у"],
    # Not produced by espeak: SINGLE_LETTER_WORDS maps "o" and "u" to this key.
    "у" => ["У", "у"],
    "v" => ["В", "в"],
    # Capitalized form was lowercase in the original, so a capitalized word
    # starting with /w/ lost its capital.
    "w" => ["У", "у"],
    "y" => ["I", "i"],
    "z" => ["S", "s"]
  }.freeze

  # IPA primary and secondary stress marks, removed before the table lookup.
  STRESS_MARKS = "ˈˌ".freeze

  TOKENIZER = PragmaticTokenizer::Tokenizer.new(language: "pt-br", downcase: false)
  LOGGER = Logger.new(STDOUT)
  LOGGER.level = Logger::INFO

  # @param token [String]
  # @return [Boolean] true when the token contains at least one word character
  def self.word?(token)
    token.match?(/[[:word:]]/)
  end

  # @param letter [String]
  # @return [String] the text lowercased by UnicodeUtils with the :pt language
  def self.downcase(letter)
    UnicodeUtils.downcase(letter, :pt)
  end

  # Converts one word to an IPA string whose phonemes are separated by "_".
  #
  # Words listed in SINGLE_LETTER_WORDS are answered from that table; every
  # other word is passed to `espeak`. The word is handed over as a separate
  # argv entry (no shell is involved), so quotes, `$(...)` or backticks in the
  # input text cannot be executed as shell syntax.
  #
  # @param word [String]
  # @return [String] the phonemes of the word
  # @raise [Errno::ENOENT] when the `espeak` executable cannot be found
  def self.word_to_ipa(word)
    replacement = SINGLE_LETTER_WORDS[downcase(word.strip)]
    return replacement if replacement

    ipa, _status = Open3.capture2("espeak", "-v", "pt-br", "--ipa=3", "-q", word)
    # Tag the bytes as UTF-8 so they compare equal to the PHONEME_TABLE keys
    # even when the process locale is not UTF-8.
    # NOTE(review): assumes espeak writes UTF-8 -- confirm.
    ipa.force_encoding(Encoding::UTF_8).strip
  end

  # Spells a "_"-separated IPA string with the letters from PHONEME_TABLE.
  #
  # Stress marks are stripped from every segment. A segment without a table
  # entry is reported through LOGGER.warn and skipped. When +capitalize+ is
  # true, the capitalized form is used for the first segment that actually
  # produces a letter, so a leading silent or unknown phoneme does not swallow
  # the capital.
  #
  # @param ipa [String] phonemes separated by "_"
  # @param capitalize [Boolean] whether the result starts with a capital
  # @return [String] the Serbilian spelling
  def self.ipa_to_serbilian(ipa, capitalize)
    capital_pending = capitalize
    ipa.split("_").each_with_object(+"") do |segment, serbilian|
      phoneme = segment.delete(STRESS_MARKS)
      letters = lookup_phoneme(phoneme)
      unless letters
        LOGGER.warn("missing phoneme table entry for /#{phoneme}/")
        next
      end

      upper, lower = letters
      if capital_pending && !upper.empty?
        serbilian << upper
        capital_pending = false
      else
        serbilian << lower
      end
    end
  end

  # Finds the PHONEME_TABLE entry for a phoneme, also trying its NFC and NFD
  # normalized spellings so that a precomposed and a decomposed accent match
  # the same entry.
  #
  # @param phoneme [String]
  # @return [Array<String>, nil] the [capitalized, lowercase] pair, or nil
  def self.lookup_phoneme(phoneme)
    PHONEME_TABLE[phoneme] ||
      PHONEME_TABLE[phoneme.unicode_normalize(:nfc)] ||
      PHONEME_TABLE[phoneme.unicode_normalize(:nfd)]
  end
  private_class_method :lookup_phoneme

  # @param letter [String]
  # @return [Boolean] true when the text is uppercase; characters without a
  #   distinct lowercase form (digits, punctuation) are not uppercase
  def self.upcase?(letter)
    UnicodeUtils.upcase(letter, :pt) == letter && downcase(letter) != letter
  end

  # Converts a single word, keeping the capitalization of its first character.
  #
  # @param word [String] a non-empty word token
  # @return [String] the Serbilian spelling of the word
  def self.word_to_serbilian(word)
    ipa = word_to_ipa(word)
    serbilian = ipa_to_serbilian(ipa, upcase?(word[0]))
    LOGGER.debug("'#{word}' -> '#{ipa}' -> '#{serbilian}'")
    serbilian
  end

  # Concatenates tokens, inserting a space only in front of tokens that are
  # words, so that punctuation stays attached to the preceding token.
  #
  # @param tokens [Array<String>]
  # @return [String]
  def self.join_tokens(tokens)
    joined = +""
    tokens.each_with_index do |token, index|
      joined << token
      following = tokens[index + 1] # nil after the last token
      joined << " " if following && word?(following)
    end
    joined
  end

  # Transliterates a Brazilian Portuguese text: the text is tokenized, every
  # word token is converted, other tokens are kept, and the result is rejoined.
  #
  # @param ptbr [String]
  # @return [String] the Serbilian text
  def self.ptbr_to_serbilian(ptbr)
    LOGGER.debug("pt-br: #{ptbr}")
    converted = TOKENIZER.tokenize(ptbr).map do |token|
      word?(token) ? word_to_serbilian(token) : token
    end
    join_tokens(converted)
  end

  # Converts a fixed list of Portuguese pangrams.
  #
  # @return [Array<String>] the Serbilian version of each pangram
  def self.test_pangrams
    pangrams = [
      "Um pequeno jabuti xereta viu dez cegonhas felizes.",
      "Blitz prende ex-vesgo com cheque fajuto.",
      "Gazeta publica hoje no jornal uma breve nota de faxina na quermesse.",
      "Zebras caolhas de Java querem passar fax para moças gigantes de New York.",
      "Luís argüia à Júlia que brações, fé, chá, óxido, pôr, zângão eram palavras do português.",
      "À noite, vovô Kowalsky vê o ímã cair no pé do pingüim queixoso e vovó põe açúcar no chá de tâmaras do jabuti feliz."
    ]
    pangrams.map { |pangram| ptbr_to_serbilian(pangram) }
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment