Created
March 15, 2012 19:44
-
-
Save kimmobrunfeldt/2046377 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/python | |
| # -*- coding: UTF-8 -*- | |
| from unicodedata import category | |
class Text(object):
    """Consonant/vowel pair statistics for a unicode text.

    The text is split on whitespace into words, every character that is not
    a Unicode letter is dropped from each word, and adjacent letters inside
    each word are then classified as consonant/vowel pairs.
    """

    # Letters recognised as consonants and vowels (matched in lower case).
    # A letter that is in neither set never contributes to a counted pair.
    CONSONANTS_ = u'bcdfghjklmnpqrstvwxyz'
    VOWELS_ = u'aeiouåäö'

    def __init__(self, text):
        """Store *text* and pre-compute its letter-only words.

        Raises:
            ValueError: if *text* is not a unicode string.
        """
        # type(u'') is `unicode` on Python 2 and `str` on Python 3, so the
        # same check works on both interpreters.
        if not isinstance(text, type(u'')):
            raise ValueError("text parameter must be unicode.")
        self.text_ = text
        # Keep only characters whose Unicode general category starts with
        # 'L' (letters); digits, punctuation and symbols are removed.
        self.words_ = [
            u''.join(ch for ch in word if category(ch)[0] == 'L')
            for word in text.split()
        ]
        self.text_character_count_ = sum(len(word) for word in self.words_)

    def get_ratios(self):
        """Return the ratio of each pair type found in the text.

        Ratio means (pair count) / (amount of total letters).
        c means consonant, v means vowel; e.g. 'c-v' is a consonant
        immediately followed by a vowel inside one word.

        Returns:
            dict with the keys 'c-v', 'v-c', 'c-c' and 'v-v' mapped to
            floats. Every ratio is 0.0 when the text contains no letters.
        """
        return {
            'c-v': self.get_pair_ratio_(self.is_consonant_vowel_pair_),
            'v-c': self.get_pair_ratio_(self.is_vowel_consonant_pair_),
            'c-c': self.get_pair_ratio_(self.is_consonant_consonant_pair_),
            'v-v': self.get_pair_ratio_(self.is_vowel_vowel_pair_),
        }

    # Non-public:

    def get_pair_ratio_(self, pair_compare_function):
        """Return matching pair count divided by the total letter count."""
        # A text without any letters has no pairs; report 0.0 instead of
        # dividing by zero.
        if not self.text_character_count_:
            return 0.0
        return (self.count_pairs_(pair_compare_function) /
                float(self.text_character_count_))

    def count_pairs_(self, pair_compare_function):
        """Count adjacent letter pairs accepted by pair_compare_function.

        pair_compare_function takes 2 arguments, character1 and character2.
        It returns a true value (1/True) if the characters match its filter
        and a false value (0/False) if not. Pairs never span two words.
        """
        pairs = 0
        for word in self.words_:
            word = word.lower()
            # zip(word, word[1:]) yields every pair of neighbouring letters.
            pairs += sum(pair_compare_function(first, second)
                         for first, second in zip(word, word[1:]))
        return pairs

    def is_consonant_vowel_pair_(self, character1, character2):
        """Return True for a consonant followed by a vowel."""
        return (character1 in self.CONSONANTS_ and
                character2 in self.VOWELS_)

    def is_vowel_consonant_pair_(self, character1, character2):
        """Return True for a vowel followed by a consonant."""
        return (character1 in self.VOWELS_ and
                character2 in self.CONSONANTS_)

    def is_consonant_consonant_pair_(self, character1, character2):
        """Return True for two consecutive consonants."""
        return (character1 in self.CONSONANTS_ and
                character2 in self.CONSONANTS_)

    def is_vowel_vowel_pair_(self, character1, character2):
        """Return True for two consecutive vowels."""
        return (character1 in self.VOWELS_ and
                character2 in self.VOWELS_)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/python | |
| # -*- coding: UTF-8 -*- | |
| # | |
| # Tries to detect if language is Finnish or English | |
| # | |
| from optparse import OptionParser | |
| import extended_text | |
# Average pair ratios of English text, used as the reference point when
# classifying input. Keys follow the notation of
# extended_text.Text.get_ratios(): c = consonant, v = vowel.
ENG_RATIOS = {
    'c-v': 0.291695004912,   # consonant followed by vowel
    'v-c': 0.288974533364,   # vowel followed by consonant
    'c-c': 0.173807904481,   # two consonants
    'v-v': 0.0470037028641,  # two vowels
}
def get_multiline_input(label):
    """Read multi-line text interactively from standard input.

    The prompt is *label* followed by an instruction line. Reading stops at
    the first line that consists of a single '.', or at end of input (EOF).

    Args:
        label: text shown at the start of the prompt.

    Returns:
        The entered lines joined with newlines, without the terminating
        '.' line. An empty string if nothing was entered.
    """
    # raw_input only exists on Python 2; Python 3 renamed it to input.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    lines = []
    try:
        line = read_line(label + ', . in empty line stops input:\n')
        while line != '.':
            lines.append(line)
            line = read_line()
    except EOFError:
        # Input ended without a '.' line (Ctrl-D or piped input): keep
        # whatever was read so far instead of crashing.
        pass
    return '\n'.join(lines)
def main():
    """Command-line entry point: guess whether a text is Finnish or English.

    The text is read from the file given with -i/--input, or interactively
    via get_multiline_input() when no file is given. The text is reported
    as Finnish when both its consonant-vowel and vowel-vowel ratios exceed
    the English averages in ENG_RATIOS, and as English otherwise.
    """
    parser = OptionParser(usage="usage: %prog [options]",
                          version="%prog 0.1")
    parser.add_option("-i", "--input",
                      action="store",
                      type="string",
                      dest="input",
                      help="input file to read text from.")
    options, _ = parser.parse_args()

    if options.input is None:
        text = get_multiline_input('Input text')
    else:
        # Read raw bytes and let the with-block close the file.
        with open(options.input, 'rb') as input_file:
            text = input_file.read()

    # Byte strings (file contents, Python 2 console input) are treated as
    # UTF-8; text that is already unicode is left untouched.
    if isinstance(text, bytes):
        text = text.decode('UTF-8')

    # Nothing to analyse: exit with a usage error instead of computing
    # ratios over an empty text.
    if not text.strip():
        parser.error('no text to analyse.')

    t = extended_text.Text(text)
    ratios = t.get_ratios()

    if (ENG_RATIOS['c-v'] < ratios['c-v'] and
            ENG_RATIOS['v-v'] < ratios['v-v']):
        print('Text is Finnish!')
    else:
        print('Text is English!')


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment