Skip to content

Instantly share code, notes, and snippets.

@brunoro
Last active February 25, 2022 13:56
Show Gist options
  • Save brunoro/e9ab9c1e662f7e8db188c3a29515246a to your computer and use it in GitHub Desktop.
Save brunoro/e9ab9c1e662f7e8db188c3a29515246a to your computer and use it in GitHub Desktop.
Serbilian Cyrillic: a Cyrillic script for Brazilian Portuguese based on Serbian Cyrillic
#!/usr/bin/ruby
require 'logger'
require 'pragmatic_tokenizer'
require 'unicode_utils'
# Transliterates Brazilian Portuguese text into "Serbilian", a Cyrillic-based
# script. Each word token is first turned into IPA (through the external
# `espeak` command, or a fixed table for one-letter words) and every IPA
# phoneme is then mapped to its letters through PHONEME_TABLE.
class Serbilian
  # IPA transcription for words made of a single letter, keyed by the
  # downcased word. These words bypass espeak. Every value has to be a
  # PHONEME_TABLE key, otherwise the word is transliterated to nothing.
  SINGLE_LETTER_WORDS = {
    "à" => "a",
    "ã" => "ɐ̃",
    "â" => "ɐ̃",
    # IPA open e; the Cyrillic letter "э" is not a PHONEME_TABLE key.
    "é" => "ɛ",
    "ê" => "e",
    "o" => "у",
    "ó" => "ɔ",
    "ô" => "o",
    "õ" => "õ",
    "í" => "i",
    "u" => "у"
  }.freeze

  # Maps one IPA phoneme (stress marks removed) to its Serbilian spelling as
  # [uppercase, lowercase].
  PHONEME_TABLE = {
    "" => ["", ""],
    "a" => ["А", "а"],
    "æ" => ["А", "а"],
    "ɐ" => ["∀", "ɐ"],
    "ɐ̃" => ["∀", "ɐ"],
    "ã" => ["∀", "ɐ"],
    "aɪ" => ["Aи", "aи"],
    "ɐ̃ʊ̃" => ["∀o", "ɐo"],
    "aʊ" => ["Aу", "aу"],
    "b" => ["Б", "б"],
    "d" => ["Д", "д"],
    "dʒ" => ["Џ", "џ"],
    "e" => ["Е", "е"],
    "eɪ" => ["E", "e"],
    "ɛ" => ["Э", "э"],
    "ə" => ["", ""],
    "f" => ["Ф", "ф"],
    "ɡ" => ["Г", "г"],
    "i" => ["И", "и"],
    "iʊ" => ["Иу", "иу"],
    "ɪ" => ["Ј", "ј"],
    "j" => ["Ј", "ј"],
    "ʒ" => ["Ж", "ж"],
    "k" => ["К", "к"],
    "l" => ["Л", "л"],
    "ʎ" => ["Љ", "љ"],
    "m" => ["М", "м"],
    "n" => ["Н", "н"],
    "ŋ" => ["Н", "н"],
    "ɲ" => ["Њ", "њ"],
    "o" => ["О", "о"],
    "õ" => ["Õ", "õ"],
    "oɪ" => ["Oи", "oи"],
    "ɔ" => ["Ɔ", "ɔ"],
    "p" => ["П", "п"],
    "ɾ" => ["Р", "р"],
    "r" => ["Р", "р"],
    "x" => ["X", "x"],
    "ʁ" => ["X", "x"],
    "s" => ["С", "с"],
    "ʃ" => ["Ш", "ш"],
    "t" => ["Т", "т"],
    "tʃ" => ["Ч", "ч"],
    # Nasal u in both Unicode spellings, written as escapes because the two
    # keys are indistinguishable on screen.
    "\u0169" => ["У", "у"],  # precomposed
    "u\u0303" => ["У", "у"], # u + combining tilde
    "ʊ" => ["У", "у"],
    "ʊ̃" => ["У", "у"],
    # Cyrillic "у": produced by SINGLE_LETTER_WORDS, not by espeak.
    "у" => ["У", "у"],
    "v" => ["В", "в"],
    # Uppercase form matches the other "у" phonemes so that words starting
    # with /w/ keep their capital letter.
    "w" => ["У", "у"],
    "y" => ["I", "i"],
    "z" => ["S", "s"],
  }.freeze

  TOKENIZER = PragmaticTokenizer::Tokenizer.new(language: "pt-br", downcase: false)
  LOGGER = Logger.new(STDOUT)
  LOGGER.level = Logger::INFO

  # @param token [String]
  # @return [Boolean] true when the token contains at least one word character
  def self.word?(token)
    token.match?(/[[:word:]]+/)
  end

  # @param letter [String]
  # @return [String] the text downcased with Portuguese rules
  def self.downcase(letter)
    UnicodeUtils.downcase(letter, :pt)
  end

  # Returns the IPA transcription of a word: the SINGLE_LETTER_WORDS entry
  # when there is one, otherwise whatever espeak prints for it.
  #
  # @param word [String]
  # @return [String] phonemes; the rest of the class expects them separated by "_"
  def self.word_to_ipa(word)
    normalized_fetch(SINGLE_LETTER_WORDS, downcase(word.strip)) || espeak_ipa(word)
  end

  # Converts an IPA transcription into Serbilian letters.
  #
  # @param ipa [String] phonemes separated by "_"
  # @param capitalize [Boolean] use the uppercase spelling for the first phoneme
  # @return [String] phonemes missing from PHONEME_TABLE are logged and skipped
  def self.ipa_to_serbilian(ipa, capitalize)
    letters = ipa.split("_").each_with_index.map do |raw, index|
      # Primary and secondary stress marks carry no spelling.
      phoneme = raw.delete("ˈˌ")
      spelling = normalized_fetch(PHONEME_TABLE, phoneme)
      unless spelling
        LOGGER.warn("missing phoneme table entry for /#{phoneme}/")
        next ""
      end
      spelling[capitalize && index.zero? ? 0 : 1]
    end
    letters.join
  end

  # @param letter [String]
  # @return [Boolean] true when upcasing leaves the text unchanged; this also
  #   holds for characters without case, such as digits
  def self.upcase?(letter)
    UnicodeUtils.upcase(letter, :pt) == letter
  end

  # Transliterates one word, capitalizing the result when the word's first
  # character passes upcase?.
  #
  # @param word [String] a non-empty word token
  # @return [String]
  def self.word_to_serbilian(word)
    ipa = word_to_ipa(word)
    serbilian = ipa_to_serbilian(ipa, upcase?(word[0]))
    LOGGER.debug("'#{word}' -> '#{ipa}' -> '#{serbilian}'")
    serbilian
  end

  # Concatenates tokens, inserting a space only in front of word tokens so
  # that punctuation stays attached to the preceding token.
  #
  # @param tokens [Array<String>]
  # @return [String]
  def self.join_tokens(tokens)
    pieces = tokens.each_with_index.map do |token, index|
      following = tokens[index + 1] # nil after the last token
      following && word?(following) ? "#{token} " : token
    end
    pieces.join
  end

  # Transliterates a Brazilian Portuguese text: word tokens are converted,
  # every other token is passed through unchanged.
  #
  # @param ptbr [String]
  # @return [String]
  def self.ptbr_to_serbilian(ptbr)
    LOGGER.debug("pt-br: #{ptbr}")
    converted = TOKENIZER.tokenize(ptbr).map do |token|
      word?(token) ? word_to_serbilian(token) : token
    end
    join_tokens(converted)
  end

  # Transliterates a fixed list of Portuguese pangrams.
  #
  # @return [Array<String>] one transliteration per pangram
  def self.test_pangrams
    pangrams = [
      "Um pequeno jabuti xereta viu dez cegonhas felizes.",
      "Blitz prende ex-vesgo com cheque fajuto.",
      "Gazeta publica hoje no jornal uma breve nota de faxina na quermesse.",
      "Zebras caolhas de Java querem passar fax para moças gigantes de New York.",
      "Luís argüia à Júlia que brações, fé, chá, óxido, pôr, zângão eram palavras do português.",
      "À noite, vovô Kowalsky vê o ímã cair no pé do pingüim queixoso e vovó põe açúcar no chá de tâmaras do jabuti feliz."
    ]
    pangrams.map { |pangram| ptbr_to_serbilian(pangram) }
  end

  # Looks +key+ up in +table+; when the exact string is absent, retries with
  # its NFC and NFD Unicode normalization forms so that a hit does not depend
  # on whether accents arrive precomposed or as combining marks.
  def self.normalized_fetch(table, key)
    return table[key] if table.key?(key)
    # Normalization is only attempted on valid UTF-8 text.
    return nil unless key.encoding == Encoding::UTF_8 && key.valid_encoding?

    table[key.unicode_normalize(:nfc)] || table[key.unicode_normalize(:nfd)]
  end
  private_class_method :normalized_fetch

  # Runs espeak on +word+ and returns its stripped standard output. The
  # command is given as an argument vector, so no shell is involved and the
  # word cannot be interpreted as shell syntax.
  def self.espeak_ipa(word)
    IO.popen(["espeak", "-v", "pt-br", "--ipa=3", word, "-q"], &:read).strip
  rescue Errno::ENOENT
    # Without the executable there is no transcription; an empty IPA string
    # transliterates to an empty word.
    LOGGER.error("espeak executable not found; cannot transcribe '#{word}'")
    ""
  end
  private_class_method :espeak_ipa
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment