Skip to content

Instantly share code, notes, and snippets.

@kimmobrunfeldt
Created March 15, 2012 19:44
Show Gist options
  • Select an option

  • Save kimmobrunfeldt/2046377 to your computer and use it in GitHub Desktop.

Select an option

Save kimmobrunfeldt/2046377 to your computer and use it in GitHub Desktop.
#!/usr/bin/python
# -*- coding: UTF-8 -*-
from unicodedata import category
class Text(object):
    """Consonant/vowel pair statistics for a piece of unicode text.

    The text is split on whitespace into words, every character that is not
    a letter is dropped from each word, and adjacent character pairs inside
    each word are classified by the consonant/vowel type of their members.
    Pairs never span two words.
    """

    # Characters treated as consonants and vowels. Words are lower-cased
    # before matching (see count_pairs_), so only lower case is listed.
    # Letters found in neither string (e.g. 'é') never match any pair type.
    CONSONANTS_ = u'bcdfghjklmnpqrstvwxyz'
    VOWELS_ = u'aeiouåäö'

    # The unicode text type: ``unicode`` on Python 2, ``str`` on Python 3.
    TEXT_TYPE_ = type(u'')

    def __init__(self, text):
        """Store *text* and pre-compute its letter-only words.

        Args:
            text: the unicode string to analyse.

        Raises:
            ValueError: if *text* is not a unicode string.
        """
        if not isinstance(text, self.TEXT_TYPE_):
            raise ValueError("text parameter must be unicode.")

        self.text_ = text
        # Keep letters only: every Unicode general category whose name
        # starts with 'L' (Lu, Ll, Lt, Lm, Lo).
        self.words_ = [
            u''.join(ch for ch in word if category(ch)[0] == 'L')
            for word in text.split()
        ]
        self.text_character_count_ = sum(len(word) for word in self.words_)

    def get_ratios(self):
        """Return the ratio of every pair type to the total letter count.

        Ratio means (pair count) / (amount of total characters).
        c means consonant, v means vowel. c-v means consonant-vowel pair.

        Returns:
            dict mapping 'c-v', 'v-c', 'c-c' and 'v-v' to a float ratio.
        """
        return {
            'c-v': self.get_pair_ratio_(self.is_consonant_vowel_pair_),
            'v-c': self.get_pair_ratio_(self.is_vowel_consonant_pair_),
            'c-c': self.get_pair_ratio_(self.is_consonant_consonant_pair_),
            'v-v': self.get_pair_ratio_(self.is_vowel_vowel_pair_),
        }

    # Non-public:

    def get_pair_ratio_(self, pair_compare_function):
        """Return matching pair count divided by the total letter count.

        Returns 0.0 when the text contains no letters at all, instead of
        failing with a division by zero.
        """
        if not self.text_character_count_:
            return 0.0
        pair_count = self.count_pairs_(pair_compare_function)
        return pair_count / float(self.text_character_count_)

    def count_pairs_(self, pair_compare_function):
        """Count adjacent character pairs accepted by a filter.

        pair_compare_function takes 2 arguments, character1 and character2.
        It returns 1 (or True) if the characters match the filter and
        0 (or False) if not.
        """
        pairs = 0
        for word in self.words_:
            lowered = word.lower()
            # zip(word, word[1:]) yields every adjacent pair inside the word.
            pairs += sum(pair_compare_function(first, second)
                         for first, second in zip(lowered, lowered[1:]))
        return pairs

    def is_consonant_vowel_pair_(self, character1, character2):
        """Return True for a consonant followed by a vowel."""
        return (character1 in self.CONSONANTS_ and
                character2 in self.VOWELS_)

    def is_vowel_consonant_pair_(self, character1, character2):
        """Return True for a vowel followed by a consonant."""
        return (character1 in self.VOWELS_ and
                character2 in self.CONSONANTS_)

    def is_consonant_consonant_pair_(self, character1, character2):
        """Return True for two consecutive consonants."""
        return (character1 in self.CONSONANTS_ and
                character2 in self.CONSONANTS_)

    def is_vowel_vowel_pair_(self, character1, character2):
        """Return True for two consecutive vowels."""
        return (character1 in self.VOWELS_ and
                character2 in self.VOWELS_)
#!/usr/bin/python
# -*- coding: UTF-8 -*-
#
# Tries to detect if language is Finnish or English
#
from optparse import OptionParser
import extended_text
# Reference pair ratios that English text has on average. The keys use
# 'c' for consonant and 'v' for vowel, so 'c-v' is a consonant followed by
# a vowel. main() compares a text's measured ratios against these values.
ENG_RATIOS = {
    'c-v': 0.291695004912,   # consonant followed by vowel
    'v-c': 0.288974533364,   # vowel followed by consonant
    'c-c': 0.173807904481,   # two consecutive consonants
    'v-v': 0.0470037028641,  # two consecutive vowels
}
def get_multiline_input(label):
    """Prompt with *label* and read lines until a line holding only '.'.

    Args:
        label: text shown at the start of the prompt.

    Returns:
        The entered lines joined with line breaks, without the terminating
        '.' line. An empty string if the very first line is the terminator.

    End of input (EOF) is treated like the terminating line, so piped input
    that lacks the final '.' is still returned instead of raising EOFError.
    """
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3

    lines = []
    # Only the first read shows the prompt; later reads are silent.
    prompt = label + ', . in empty line stops input:\n'
    while True:
        try:
            line = read_line(prompt)
        except EOFError:
            break
        if line == '.':
            break
        lines.append(line)
        prompt = ''
    return '\n'.join(lines)
def main():
    """Read text from a file or from stdin and print the guessed language.

    With -i/--input the text is read from that file, decoded as UTF-8;
    otherwise it is typed in interactively. The text is reported as Finnish
    when both its consonant-vowel and its vowel-vowel ratios are above the
    English averages in ENG_RATIOS, and as English otherwise.
    """
    # Local import: io.open decodes while reading on Python 2 and 3 alike.
    import io

    parser = OptionParser(usage="usage: %prog [options]",
                          version="%prog 0.1")
    parser.add_option("-i", "--input",
                      action="store",
                      type="string",
                      dest="input",
                      help="input file to read text from.")
    options, _args = parser.parse_args()

    if options.input is None:
        text = get_multiline_input('Input text')
        # Python 2 console input arrives as bytes; Python 3 gives text.
        if isinstance(text, bytes):
            text = text.decode('UTF-8')
    else:
        # The context manager closes the file even if reading fails.
        with io.open(options.input, encoding='UTF-8') as input_file:
            text = input_file.read()

    ratios = extended_text.Text(text).get_ratios()

    looks_finnish = (ENG_RATIOS['c-v'] < ratios['c-v'] and
                     ENG_RATIOS['v-v'] < ratios['v-v'])
    if looks_finnish:
        print('Text is Finnish!')
    else:
        print('Text is English!')
# Run the detector only when this file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment