kimmobrunfeldt · March 15, 2012 19:44
diff --git a/extended_text.py b/extended_text.py
 #!/usr/bin/python
 # -*- coding: UTF-8 -*-


 from unicodedata import category


 class Text(object):
    """Unicode text with consonant/vowel letter-pair statistics.

    Every whitespace-separated word is reduced to its letters (Unicode
    general category ``L*``).  Adjacent letters inside a word form the
    pairs that the ``is_*_pair_`` predicates classify.
    """

    # Letter sets used by the pair predicates.  Words are lower-cased
    # before comparison, so only lower-case letters are listed.  A letter
    # that is in neither set never takes part in a counted pair.
    CONSONANTS_ = u'bcdfghjklmnpqrstvwxyz'
    VOWELS_ = u'aeiouåäö'

    def __init__(self, text):
        """Store *text* and pre-compute its words and total letter count.

        Raises ValueError if *text* is not a unicode string.
        """
        # type(u'') is ``unicode`` on Python 2 and ``str`` on Python 3.
        if not isinstance(text, type(u'')):
            raise ValueError("text parameter must be unicode.")

        self.text_ = text

        # Strip everything that is not a letter from each word.
        self.words_ = [
            u''.join(ch for ch in word if category(ch)[0] == 'L')
            for word in text.split()
        ]

        self.text_character_count_ = sum(len(word) for word in self.words_)

    def get_ratios(self):
        """Return a dict with the ratio of every pair type in the text.

        Ratio means (pair count) / (amount of total letters).
        c means consonant, v means vowel; 'c-v' is a consonant followed
        by a vowel.  All ratios are 0.0 when the text has no letters.
        """
        return {
            'c-v': self.get_pair_ratio_(self.is_consonant_vowel_pair_),
            'v-c': self.get_pair_ratio_(self.is_vowel_consonant_pair_),
            'c-c': self.get_pair_ratio_(self.is_consonant_consonant_pair_),
            'v-v': self.get_pair_ratio_(self.is_vowel_vowel_pair_),
        }

    # Non-public:

    def get_pair_ratio_(self, pair_compare_function):
        """Return the matching pair count divided by the letter count."""
        # A text without letters has no pairs; avoid dividing by zero.
        if not self.text_character_count_:
            return 0.0

        return (self.count_pairs_(pair_compare_function) /
                float(self.text_character_count_))

    def count_pairs_(self, pair_compare_function):
        """Count the adjacent letter pairs accepted by the given filter.

        pair_compare_function takes 2 arguments, character1 and
        character2.  It returns a true value (1) if the characters match
        its filter and a false value (0) if not.  Pairs are formed inside
        a word only, never across two words.
        """
        pairs = 0

        for word in self.words_:
            word = word.lower()
            # zip(word, word[1:]) yields every adjacent pair of the word.
            for first, second in zip(word, word[1:]):
                pairs += pair_compare_function(first, second)

        return pairs

    def is_consonant_vowel_pair_(self, character1, character2):
        """Return True for a consonant followed by a vowel."""
        return (character1 in self.CONSONANTS_ and character2 in self.VOWELS_)

    def is_vowel_consonant_pair_(self, character1, character2):
        """Return True for a vowel followed by a consonant."""
        return (character1 in self.VOWELS_ and character2 in self.CONSONANTS_)

    def is_consonant_consonant_pair_(self, character1, character2):
        """Return True for two consecutive consonants."""
        return (character1 in self.CONSONANTS_ and
                character2 in self.CONSONANTS_)

    def is_vowel_vowel_pair_(self, character1, character2):
        """Return True for two consecutive vowels."""
        return (character1 in self.VOWELS_ and character2 in self.VOWELS_)
diff --git a/language_detect.py b/language_detect.py
 #!/usr/bin/python
 # -*- coding: UTF-8 -*-
 #
 # Tries to detect if language is Finnish or English
 #

 from optparse import OptionParser

 import extended_text


 # Average pair ratios of English text, used as the reference values when
 # classifying input.  The keys are the same ones returned by
 # extended_text.Text.get_ratios(): 'c' is a consonant and 'v' a vowel, so
 # 'c-v' is a consonant followed by a vowel.
 ENG_RATIOS = {
    'c-v': 0.291695004912,
    'v-c': 0.288974533364,
    'c-c': 0.173807904481,
    'v-v': 0.0470037028641,
 }


 def get_multiline_input(label):
    """Read lines from standard input until a line containing only '.'.

    label is shown as the start of the prompt.  Returns the lines typed
    before the terminating '.', joined with line breaks; the '.' itself
    is not part of the result.  End of input (EOF) ends the text in the
    same way as the '.' line instead of raising EOFError.
    """
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3

    lines = []
    # Only the first read shows the prompt; later reads are silent.
    prompt = label + ', . in empty line stops input:\n'
    while True:
        try:
            line = read_line(prompt)
        except EOFError:
            break
        if line == '.':
            break
        lines.append(line)
        prompt = ''

    return '\n'.join(lines)


 def main():
    """Report whether the input text looks Finnish or English.

    The text is read from the file given with -i/--input, or from
    standard input when no file is given.  It is reported as Finnish
    when both its 'c-v' and 'v-v' ratios are above the English averages
    in ENG_RATIOS, otherwise as English.
    """
    parser = OptionParser(usage="usage: %prog [options]",
                          version="%prog 0.1")

    parser.add_option("-i", "--input",
                      action="store",
                      type="string",
                      dest="input",
                      help="input file to read text from.")

    options, _ = parser.parse_args()

    if options.input is None:
        text = get_multiline_input('Input text')
    else:
        # Read raw bytes and make sure the file is closed again.
        with open(options.input, 'rb') as input_file:
            text = input_file.read()

    # Byte strings are decoded as UTF-8; text that is already unicode
    # is used as it is.
    if isinstance(text, bytes):
        text = text.decode('UTF-8')
    t = extended_text.Text(text)

    ratios = t.get_ratios()

    if ENG_RATIOS['c-v'] < ratios['c-v'] and ENG_RATIOS['v-v'] < ratios['v-v']:
        print('Text is Finnish!')
    else:
        print('Text is English!')

 # Run the detector only when this file is executed as a script.
 if __name__ == '__main__':
    main()
	#!/usr/bin/python
	# -*- coding: UTF-8 -*-


	from unicodedata import category


	class Text(object):
	    """Unicode text with consonant/vowel letter-pair statistics.

	    Every whitespace-separated word is reduced to its letters (Unicode
	    general category ``L*``).  Adjacent letters inside a word form the
	    pairs that the ``is_*_pair_`` predicates classify.
	    """

	    # Letter sets used by the pair predicates.  Words are lower-cased
	    # before comparison, so only lower-case letters are listed.  A
	    # letter that is in neither set never takes part in a counted pair.
	    CONSONANTS_ = u'bcdfghjklmnpqrstvwxyz'
	    VOWELS_ = u'aeiouåäö'

	    def __init__(self, text):
	        """Store *text* and pre-compute its words and letter count.

	        Raises ValueError if *text* is not a unicode string.
	        """
	        # type(u'') is ``unicode`` on Python 2 and ``str`` on Python 3.
	        if not isinstance(text, type(u'')):
	            raise ValueError("text parameter must be unicode.")

	        self.text_ = text

	        # Strip everything that is not a letter from each word.
	        self.words_ = [
	            u''.join(ch for ch in word if category(ch)[0] == 'L')
	            for word in text.split()
	        ]

	        self.text_character_count_ = sum(
	            len(word) for word in self.words_)

	    def get_ratios(self):
	        """Return a dict with the ratio of every pair type in the text.

	        Ratio means (pair count) / (amount of total letters).
	        c means consonant, v means vowel; 'c-v' is a consonant followed
	        by a vowel.  All ratios are 0.0 when the text has no letters.
	        """
	        return {
	            'c-v': self.get_pair_ratio_(self.is_consonant_vowel_pair_),
	            'v-c': self.get_pair_ratio_(self.is_vowel_consonant_pair_),
	            'c-c': self.get_pair_ratio_(
	                self.is_consonant_consonant_pair_),
	            'v-v': self.get_pair_ratio_(self.is_vowel_vowel_pair_),
	        }

	    # Non-public:

	    def get_pair_ratio_(self, pair_compare_function):
	        """Return the matching pair count divided by the letter count."""
	        # A text without letters has no pairs; avoid dividing by zero.
	        if not self.text_character_count_:
	            return 0.0

	        return (self.count_pairs_(pair_compare_function) /
	                float(self.text_character_count_))

	    def count_pairs_(self, pair_compare_function):
	        """Count the adjacent letter pairs accepted by the given filter.

	        pair_compare_function takes 2 arguments, character1 and
	        character2.  It returns a true value (1) if the characters
	        match its filter and a false value (0) if not.  Pairs are
	        formed inside a word only, never across two words.
	        """
	        pairs = 0

	        for word in self.words_:
	            word = word.lower()
	            # zip(word, word[1:]) yields every adjacent pair.
	            for first, second in zip(word, word[1:]):
	                pairs += pair_compare_function(first, second)

	        return pairs

	    def is_consonant_vowel_pair_(self, character1, character2):
	        """Return True for a consonant followed by a vowel."""
	        return (character1 in self.CONSONANTS_ and
	                character2 in self.VOWELS_)

	    def is_vowel_consonant_pair_(self, character1, character2):
	        """Return True for a vowel followed by a consonant."""
	        return (character1 in self.VOWELS_ and
	                character2 in self.CONSONANTS_)

	    def is_consonant_consonant_pair_(self, character1, character2):
	        """Return True for two consecutive consonants."""
	        return (character1 in self.CONSONANTS_ and
	                character2 in self.CONSONANTS_)

	    def is_vowel_vowel_pair_(self, character1, character2):
	        """Return True for two consecutive vowels."""
	        return (character1 in self.VOWELS_ and
	                character2 in self.VOWELS_)
	#!/usr/bin/python
	# -*- coding: UTF-8 -*-
	#
	# Tries to detect if language is Finnish or English
	#

	from optparse import OptionParser

	import extended_text


	# Average pair ratios of English text, used as the reference values when
	# classifying input.  The keys are the same ones returned by
	# extended_text.Text.get_ratios(): 'c' is a consonant and 'v' a vowel, so
	# 'c-v' is a consonant followed by a vowel.
	ENG_RATIOS = {
	'c-v': 0.291695004912,
	'v-c': 0.288974533364,
	'c-c': 0.173807904481,
	'v-v': 0.0470037028641,
	}


	def get_multiline_input(label):
	    """Read lines from standard input until a line containing only '.'.

	    label is shown as the start of the prompt.  Returns the lines typed
	    before the terminating '.', joined with line breaks; the '.' itself
	    is not part of the result.  End of input (EOF) ends the text in the
	    same way as the '.' line instead of raising EOFError.
	    """
	    try:
	        read_line = raw_input  # Python 2
	    except NameError:
	        read_line = input  # Python 3

	    lines = []
	    # Only the first read shows the prompt; later reads are silent.
	    prompt = label + ', . in empty line stops input:\n'
	    while True:
	        try:
	            line = read_line(prompt)
	        except EOFError:
	            break
	        if line == '.':
	            break
	        lines.append(line)
	        prompt = ''

	    return '\n'.join(lines)


	def main():
	    """Report whether the input text looks Finnish or English.

	    The text is read from the file given with -i/--input, or from
	    standard input when no file is given.  It is reported as Finnish
	    when both its 'c-v' and 'v-v' ratios are above the English averages
	    in ENG_RATIOS, otherwise as English.
	    """
	    parser = OptionParser(usage="usage: %prog [options]",
	                          version="%prog 0.1")

	    parser.add_option("-i", "--input",
	                      action="store",
	                      type="string",
	                      dest="input",
	                      help="input file to read text from.")

	    options, _ = parser.parse_args()

	    if options.input is None:
	        text = get_multiline_input('Input text')
	    else:
	        # Read raw bytes and make sure the file is closed again.
	        with open(options.input, 'rb') as input_file:
	            text = input_file.read()

	    # Byte strings are decoded as UTF-8; text that is already unicode
	    # is used as it is.
	    if isinstance(text, bytes):
	        text = text.decode('UTF-8')
	    t = extended_text.Text(text)

	    ratios = t.get_ratios()

	    if (ENG_RATIOS['c-v'] < ratios['c-v'] and
	            ENG_RATIOS['v-v'] < ratios['v-v']):
	        print('Text is Finnish!')
	    else:
	        print('Text is English!')

	# Run the detector only when this file is executed as a script.
	if __name__ == '__main__':
	    main()