dreampuf · July 1, 2013 06:11
diff --git a/make_plda_data.py b/make_plda_data.py
 #!/usr/bin/env python
 #vi: fileencoding=utf-8

 __twitter__ = "dreampuf"

 """
 Make the text data suit PLDA+

 This script usage:
  $ python make_plda_data.py ../sgyy.utf-8.txt > data.txt

 Data usage: https://code.google.com/p/plda/wiki/PLDAManual
 """

 import re
 import sys
 from collections import Counter

 import jieba
 jieba.enable_parallel(4)

 # Punctuation marks that are dropped from every document before counting.
 _PUNCTUATION = u"\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b"


 def _plda_line(text):
     """Return *text* as one PLDA document line: word_0 count_0 word_1 count_1 ...

     The text is segmented with jieba, tokens found in _PUNCTUATION are
     discarded, and the occurrences of every remaining token are counted.
     """
     # `not in` against a string is a substring test, so empty tokens are
     # dropped here as well as the punctuation marks themselves.
     counts = Counter(
         token for token in jieba.cut(text)
         if token not in _PUNCTUATION
     )
     # Skip whitespace-only tokens (e.g. the Chinese space): they would break
     # the space-separated "word count" layout of the output line.
     return " ".join(
         u"{} {}".format(word, count)
         for word, count in counts.iteritems()
         if word.strip()
     )


 # Read the whole corpus named on the command line; the `with` block closes
 # the file handle as soon as the contents are in memory.
 with open(sys.argv[1]) as corpus:
     sgyy = corpus.read()

 # Every run of four or more CR/LF characters separates two documents.
 sgyy_ls = re.split("[\r\n]{4,}", sgyy)
 sgyy_m = map(_plda_line, sgyy_ls)

 # One document per output line, written as UTF-8 bytes. The single-argument
 # parenthesised form behaves exactly like the Python 2 print statement.
 for line in sgyy_m:
     print(line.encode("u8"))
	#!/usr/bin/env python
	#vi: fileencoding=utf-8

	__twitter__ = "dreampuf"

	"""
	Make the text data suit PLDA+

	This script usage:
	$ python make_plda_data.py ../sgyy.utf-8.txt > data.txt

	Data usage: https://code.google.com/p/plda/wiki/PLDAManual
	"""

	import re
	import sys
	from collections import Counter

	import jieba
	jieba.enable_parallel(4)

	# Punctuation marks that are dropped from every document before counting.
	_PUNCTUATION = u"\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b"


	def _plda_line(text):
		"""Return *text* as one PLDA document line: word_0 count_0 word_1 count_1 ...

		The text is segmented with jieba, tokens found in _PUNCTUATION are
		discarded, and the occurrences of every remaining token are counted.
		"""
		# `not in` against a string is a substring test, so empty tokens are
		# dropped here as well as the punctuation marks themselves.
		counts = Counter(
			token for token in jieba.cut(text)
			if token not in _PUNCTUATION
		)
		# Skip whitespace-only tokens (e.g. the Chinese space): they would break
		# the space-separated "word count" layout of the output line.
		return " ".join(
			u"{} {}".format(word, count)
			for word, count in counts.iteritems()
			if word.strip()
		)


	# Read the whole corpus named on the command line; the `with` block closes
	# the file handle as soon as the contents are in memory.
	with open(sys.argv[1]) as corpus:
		sgyy = corpus.read()

	# Every run of four or more CR/LF characters separates two documents.
	sgyy_ls = re.split("[\r\n]{4,}", sgyy)
	sgyy_m = map(_plda_line, sgyy_ls)

	# One document per output line, written as UTF-8 bytes. The single-argument
	# parenthesised form behaves exactly like the Python 2 print statement.
	for line in sgyy_m:
		print(line.encode("u8"))