Skip to content

Instantly share code, notes, and snippets.

@dreampuf
Created July 1, 2013 06:11
Show Gist options
  • Save dreampuf/5898698 to your computer and use it in GitHub Desktop.
Save dreampuf/5898698 to your computer and use it in GitHub Desktop.
make the data suit PLDA+
#!/usr/bin/env python
#vi: fileencoding=utf-8
__twitter__ = "dreampuf"
"""
Convert a plain-text corpus into the input format expected by PLDA+.
Usage:
$ python make_data.py ../sgyy.utf-8.txt > data.txt
How to use the generated data: https://code.google.com/p/plda/wiki/PLDAManual
"""
import re
import sys
from collections import Counter
import jieba
# Full-width punctuation marks that must not be counted as words.
PUNCTUATION = frozenset(
    u"\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b"
)

# Documents are separated by a run of four or more CR/LF characters.
DOCUMENT_SEPARATOR = re.compile(u"[\r\n]{4,}")


def split_documents(text):
    """Split *text* into documents on runs of four or more CR/LF characters.

    Shorter runs of line breaks stay inside a document.
    """
    return DOCUMENT_SEPARATOR.split(text)


def format_counts(tokens):
    """Return one PLDA+ input line: ``"word_0 count_0 word_1 count_1 ..."``.

    Tokens listed in ``PUNCTUATION`` are dropped before counting, and
    tokens made up only of whitespace (including the ideographic space
    U+3000) are left out of the result.  An iterable with no countable
    tokens yields an empty string.
    """
    counts = Counter(token for token in tokens if token not in PUNCTUATION)
    return u" ".join(
        u"{} {}".format(word, count)
        for word, count in counts.items()
        if word.strip()  # skip whitespace-only tokens
    )


def main(argv):
    """Segment the file named by ``argv[1]`` and print one line per document.

    Returns the process exit status: 0 on success, 2 when the input path
    is missing from *argv*.
    """
    if len(argv) < 2:
        sys.stderr.write("usage: python make_data.py INPUT.utf-8.txt > data.txt\n")
        return 2

    jieba.enable_parallel(4)

    # Read bytes and decode explicitly so the result does not depend on the
    # interpreter version or the locale; ``with`` closes the file promptly.
    with open(argv[1], "rb") as source:
        text = source.read().decode("utf-8")

    # Write UTF-8 bytes directly: Python 3 exposes the byte stream as
    # ``sys.stdout.buffer``, Python 2's ``sys.stdout`` already accepts bytes.
    out = getattr(sys.stdout, "buffer", sys.stdout)
    for document in split_documents(text):
        line = format_counts(jieba.cut(document))
        out.write(line.encode("utf-8") + b"\n")
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment