Combinatorial question: how many segmentations of a word of length n are there?
Intuition:
helloworld
h|elloworld
he|lloworld
...
h|e|lloworld
...
h|e|l|l|o|w|o|r|l|d
You can place a single bar in any of the n - 1 gaps between characters (n - 1 choose 1), two bars in n - 1 choose 2 ways, and so forth. The total is the number of k-combinations of the gaps, summed over all k.
Another way to look at it: we have n - 1 slots to place bars into. For each slot, we can independently decide whether it stays empty or not, so the total number of segmentations amounts to 2 ^ (n - 1).
You can also view the space as the set of all binary numbers of length n - 1.
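To make the count concrete, here is a small sketch (the all_segmentations helper is hypothetical, not part of the text) that enumerates every segmentation of a short word and checks the result against 2 ^ (n - 1):

def all_segmentations(word):
    # Keep the first i + 1 characters as one piece, then recursively
    # segment whatever remains; the empty word has one (empty) segmentation.
    if not word:
        return [[]]
    return [[word[:i + 1]] + tail
            for i in range(len(word))
            for tail in all_segmentations(word[i + 1:])]

assert len(all_segmentations("hello")) == 2 ** (len("hello") - 1)  # 16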
The number of segmentations is exponential in the word length, so a plain brute force method does not scale.
We can't be too greedy here, for example:
lovesnails
should not automatically be resolved to
love snails
but
loves nails
needs to be considered, too.
One can use a metric (which is introduced later) to find the most probable segmentation. The computation is simple, fewer than ten lines:
def split_pairs(word):
    # All ways to split a word into a non-empty prefix and the remaining suffix.
    return [(word[:i + 1], word[i + 1:]) for i in range(len(word))]

def segment(word, word_seq_fitness=None):
    # Return the segmentation that scores best under word_seq_fitness.
    if not word or not word_seq_fitness:
        return []
    all_segmentations = [[first] + segment(rest, word_seq_fitness)
                         for (first, rest) in split_pairs(word)]
    return max(all_segmentations, key=word_seq_fitness)
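For illustration only, here is one way segment could be called with a toy fitness function (the word list and the scoring are made up and are not the metric introduced later):

# Toy fitness: only segmentations whose pieces are all in a small word
# list are acceptable; among those, prefer fewer pieces.
KNOWN = {"hello", "world", "love", "loves", "snails", "nails"}

def toy_fitness(words):
    if not all(w in KNOWN for w in words):
        return float("-inf")
    return -len(words)

print(segment("helloworld", toy_fitness))  # ['hello', 'world']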
The problem now shifts to the question of which strings are actual words. Choosing a plain dictionary list might end up with a poor model, since a lot of real-world strings do not make it into the dictionary; a string like bbconline, for example, could be problematic.
A solution is to create a probabilistic model of the English language.
In the simplest case, we need a list of words and associated counts, representing each word's frequency in some corpus. The task is then to calculate the likelihood that a given sequence of words has been drawn from the distribution that generated this corpus (and whose word frequencies we know).
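A minimal sketch of such a unigram model, assuming a plain-text file with one word and its count per line (the file name and format are assumptions, not from the text):

from collections import Counter

def load_counts(path):
    # Expects lines of the form "word<TAB>count"; adjust to the actual corpus format.
    counts = Counter()
    with open(path) as handle:
        for line in handle:
            word, count = line.split("\t")
            counts[word.strip()] = int(count)
    return counts

counts = load_counts("word_counts.tsv")  # hypothetical corpus file
total = sum(counts.values())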
Determine the probability of each word in the candidate segmentation and multiply these probabilities. This is Bayes' rule, with the assumption that each event (the occurrence of a word) is independent of the others.
The independence assumption is strong and quite wrong; words are not independent of each other at all. (Is this alone enough to call it a bag-of-words model?)
This is also called naive Bayes, and it is called naive because it makes the wrong, but useful, independence assumption: a data point has a number of features, all of which are treated as independent.
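Under this assumption, the fitness of a candidate segmentation is simply the product of its word probabilities; summing log probabilities maximizes the same quantity and avoids floating point underflow. A sketch, reusing the hypothetical counts and total from above:

import math

def word_seq_fitness(words):
    # Naive Bayes score: sum of log unigram probabilities
    # (equivalent to maximizing their product).
    score = 0.0
    for word in words:
        if counts[word] == 0:
            return float("-inf")  # unseen word; see the note on corpus coverage below
        score += math.log(counts[word] / total)
    return score

print(segment("lovesnails", word_seq_fitness))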
For natural language, n-grams should also be considered, since words tend to appear in groups. One can also model occurrences within texts (without a fixed order, e.g. with a sequence memoizer). We restrict n to 5 here.
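A bigram variant, again just as a sketch: the back-off to unigram probabilities for unseen pairs is a crude choice made here for illustration, and bigram_counts would have to be collected from the same corpus.

import math

def conditional_probability(prev, word, unigram_counts, bigram_counts, total):
    # P(word | prev); fall back to the plain unigram estimate when the
    # bigram was never observed in the corpus.
    if bigram_counts[(prev, word)] > 0 and unigram_counts[prev] > 0:
        return bigram_counts[(prev, word)] / unigram_counts[prev]
    return unigram_counts[word] / total

def bigram_fitness(words, unigram_counts, bigram_counts, total):
    # Log of P(w1) * P(w2 | w1) * ...; assumes every candidate word has a
    # nonzero unigram count (unseen words are discussed below).
    score = math.log(unigram_counts[words[0]] / total)
    for prev, word in zip(words, words[1:]):
        score += math.log(conditional_probability(prev, word,
                                                  unigram_counts,
                                                  bigram_counts, total))
    return score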
See segmentation.py.
A bigger corpus does not mean that the analysis gets better. There is also the problem of words that are not in the corpus.
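One common workaround (not from the text, just a hedge against zero probabilities) is to give unseen words a small, length-dependent probability instead of zero, so that long nonsense strings are penalized more than short plausible ones:

def probability(word, counts, total):
    # Seen words get their relative frequency; unseen words get a tiny,
    # length-dependent estimate so the product never collapses to zero.
    if counts[word] > 0:
        return counts[word] / total
    return 10.0 / (total * 10 ** len(word))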