tsuchm · December 24, 2023 02:59
diff --git a/textseg.py b/textseg.py
 # Python implementation of https://aclanthology.org/P01-1064.pdf
 #
 # How to use:
 #     from textseg import textseg
 #     segments = textseg(texts)

 from math import log2
 import unittest


 class TextSegParser():
    __tagger = None

    def __call__(self, text):
        cls = type(self)
        if not cls.__tagger:
            import MeCab
            cls.__tagger = MeCab.Tagger()
        node = cls.__tagger.parseToNode(text)
        nouns = []
        while node:
            features = node.feature.split(',')
            if features[0] == '名詞' and features[1] != '非自立':
                nouns.append(node.surface)
            node = node.next
        return nouns


 class TextSegUnit():
    def __init__(self, text, words):
        vocab = dict()
        for word in words:
            vocab[word] = vocab.get(word, 0) + 1
        self.text = text
        self.vocab = vocab
        self.length = len(words)
        self.bestpath = 0
        self.bestcost = 0

    def merge(self, other):
        self.length += other.length
        for word, freq in other.vocab.items():
            self.vocab[word] = self.vocab.get(word, 0) + freq

    # Note that we need to use the total vocabulary size of input
    # texts instead of the vocabulary size of each segment.  If we use
    # the vocabulary size of each segment, we obtain too small segments.
    def cost(self, vocabsize):
        cost = 0
        for freq in self.vocab.values():
            cost -= freq * log2(freq + 1)
        if self.length > 0:
            cost += self.length * log2(self.length + vocabsize)
            cost += log2(self.length)
        return cost


 def textseg(texts, parser=TextSegParser()):
    # calculate vocabulary size of input texts
    inputs = [TextSegUnit(text, parser(text)) for text in texts]
    vocab = dict()
    for unit in inputs:
        for word in unit.vocab.keys():
            vocab[word] = 1
    vocabsize = len(vocab.keys())

    # forward search
    units = []
    for last in inputs:
        for unit in units:
            unit.merge(last)
        units.append(last)
        bestpath = 0
        bestcost = float('inf')
        for i, unit in enumerate(units):
            cost = unit.cost(vocabsize)
            if i > 0:
                cost += units[i-1].bestcost
            if cost < bestcost:
                bestpath = i
                bestcost = cost
        last.bestpath = bestpath
        last.bestcost = bestcost

    # backward search
    segments = []
    while len(units) > 0:
        last = units[-1]
        segments.append([u.text for u in units[last.bestpath:]])
        units = units[:last.bestpath]
    segments.reverse()
    return segments


 class TextSegTest(unittest.TestCase):
    def test_parser(self):
        parser = TextSegParser()
        result = parser("形態素解析は重要な処理である")
        expected = ['形態素', '解析', '重要', '処理']
        self.assertEqual(result, expected)

    def test_unit(self):
        parser = TextSegParser()
        text = "ある単語と別の単語が繰り返し出現する"
        unit = TextSegUnit(text, parser(text))
        self.assertEqual(unit.text, text)
        self.assertEqual(unit.length, 4)
        self.assertEqual(len(unit.vocab.keys()), 3)
        expected_cost = -2 * log2(3/7) -2 * log2(2/7) + log2(4)
        self.assertAlmostEqual(unit.cost(3), expected_cost)
        other_text = "他の単語が出現する"
        other_unit = TextSegUnit(other_text, parser(other_text))
        self.assertEqual(other_unit.text, other_text)
        self.assertEqual(other_unit.length, 3)
        self.assertEqual(len(other_unit.vocab.keys()), 3)
        unit.merge(other_unit)
        self.assertEqual(unit.length, 7)
        self.assertEqual(len(unit.vocab.keys()), 4)

    def test_textseg(self):
        with open('samples/gijiroku.txt', 'r') as fp:
            segments = textseg(fp)
        self.assertEqual(len(segments), 10)


 # COPYRIGHT

 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.

 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.

 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
	# Python implementation of https://aclanthology.org/P01-1064.pdf
	#
	# How to use:
	# from textseg import textseg
	# segments = textseg(texts)

	from math import log2
	import unittest


	class TextSegParser():
	__tagger = None

	def __call__(self, text):
	cls = type(self)
	if not cls.__tagger:
	import MeCab
	cls.__tagger = MeCab.Tagger()
	node = cls.__tagger.parseToNode(text)
	nouns = []
	while node:
	features = node.feature.split(',')
	if features[0] == '名詞' and features[1] != '非自立':
	nouns.append(node.surface)
	node = node.next
	return nouns


	class TextSegUnit():
	def __init__(self, text, words):
	vocab = dict()
	for word in words:
	vocab[word] = vocab.get(word, 0) + 1
	self.text = text
	self.vocab = vocab
	self.length = len(words)
	self.bestpath = 0
	self.bestcost = 0

	def merge(self, other):
	self.length += other.length
	for word, freq in other.vocab.items():
	self.vocab[word] = self.vocab.get(word, 0) + freq

	# Note that we need to use the total vocabulary size of input
	# texts instead of the vocabulary size of each segment. If we use
	# the vocabulary size of each segment, we obtain too small segments.
	def cost(self, vocabsize):
	cost = 0
	for freq in self.vocab.values():
	cost -= freq * log2(freq + 1)
	if self.length > 0:
	cost += self.length * log2(self.length + vocabsize)
	cost += log2(self.length)
	return cost


	def textseg(texts, parser=TextSegParser()):
	# calculate vocabulary size of input texts
	inputs = [TextSegUnit(text, parser(text)) for text in texts]
	vocab = dict()
	for unit in inputs:
	for word in unit.vocab.keys():
	vocab[word] = 1
	vocabsize = len(vocab.keys())

	# forward search
	units = []
	for last in inputs:
	for unit in units:
	unit.merge(last)
	units.append(last)
	bestpath = 0
	bestcost = float('inf')
	for i, unit in enumerate(units):
	cost = unit.cost(vocabsize)
	if i > 0:
	cost += units[i-1].bestcost
	if cost < bestcost:
	bestpath = i
	bestcost = cost
	last.bestpath = bestpath
	last.bestcost = bestcost

	# backward search
	segments = []
	while len(units) > 0:
	last = units[-1]
	segments.append([u.text for u in units[last.bestpath:]])
	units = units[:last.bestpath]
	segments.reverse()
	return segments


	class TextSegTest(unittest.TestCase):
	def test_parser(self):
	parser = TextSegParser()
	result = parser("形態素解析は重要な処理である")
	expected = ['形態素', '解析', '重要', '処理']
	self.assertEqual(result, expected)

	def test_unit(self):
	parser = TextSegParser()
	text = "ある単語と別の単語が繰り返し出現する"
	unit = TextSegUnit(text, parser(text))
	self.assertEqual(unit.text, text)
	self.assertEqual(unit.length, 4)
	self.assertEqual(len(unit.vocab.keys()), 3)
	expected_cost = -2 * log2(3/7) -2 * log2(2/7) + log2(4)
	self.assertAlmostEqual(unit.cost(3), expected_cost)
	other_text = "他の単語が出現する"
	other_unit = TextSegUnit(other_text, parser(other_text))
	self.assertEqual(other_unit.text, other_text)
	self.assertEqual(other_unit.length, 3)
	self.assertEqual(len(other_unit.vocab.keys()), 3)
	unit.merge(other_unit)
	self.assertEqual(unit.length, 7)
	self.assertEqual(len(unit.vocab.keys()), 4)

	def test_textseg(self):
	with open('samples/gijiroku.txt', 'r') as fp:
	segments = textseg(fp)
	self.assertEqual(len(segments), 10)


	# COPYRIGHT

	# This program is free software: you can redistribute it and/or modify
	# it under the terms of the GNU General Public License as published by
	# the Free Software Foundation, either version 3 of the License, or
	# (at your option) any later version.

	# This program is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	# GNU General Public License for more details.

	# You should have received a copy of the GNU General Public License
	# along with this program. If not, see <http://www.gnu.org/licenses/>.