Created
November 7, 2017 02:32
-
-
Save wklken/f58bdcafa509f3ae2fefb00d55088ef9 to your computer and use it in GitHub Desktop.
Get Chinese words and sentences from js/py files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
# just quick & dirty job
import codecs
import io
import re
import sys
# Quoted literals.  ``.*?`` (rather than ``.+?``) lets an empty literal such
# as "" match on its own; with ``.+?`` it swallowed the opening quote of the
# next literal on the same line and that literal's text was lost.
_DOUBLE_QUOTED = re.compile(u'".*?"')
_SINGLE_QUOTED = re.compile(u"'.*?'")

# Chinese characters are taken to be the range U+4E00..U+9FA5.
_CHINESE = re.compile(u'[\u4e00-\u9fa5]+')
_LEADING_NON_CHINESE = re.compile(u'^[^\u4e00-\u9fa5]+')
_TRAILING_NON_CHINESE = re.compile(u'[^\u4e00-\u9fa5]+$')


def process_one_file(file_path):
    """Collect the Chinese text found inside quoted literals of one file.

    Every line is scanned for double-quoted and then single-quoted literals.
    A literal that contains at least one Chinese character is trimmed so that
    it starts and ends with a Chinese character, and the result is recorded.
    Literals are matched per line, so strings spanning several lines are not
    recognised, and a literal nested inside the other quote style may be
    reported twice.

    Args:
        file_path: Path of a UTF-8 encoded text file.

    Returns:
        A list of text strings in order of appearance (for each line, the
        double-quoted matches come before the single-quoted ones).

    Raises:
        IOError / OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    results = []
    # Decode once at the I/O boundary; io.open behaves the same on Python 2
    # and Python 3, unlike calling .decode() on every line.
    with io.open(file_path, encoding="utf-8") as source:
        for line in source:
            line = line.strip()
            for pattern in (_DOUBLE_QUOTED, _SINGLE_QUOTED):
                for literal in pattern.findall(line):
                    text = literal.strip("'\"")
                    if not _CHINESE.search(text):
                        continue
                    # Drop everything before the first and after the last
                    # Chinese character.
                    text = _LEADING_NON_CHINESE.sub(u'', text)
                    results.append(_TRAILING_NON_CHINESE.sub(u'', text))
    return results
def _to_text(value):
    """Return *value* as text, decoding byte strings as UTF-8."""
    if isinstance(value, bytes):
        return value.decode("utf-8")
    return value


def _csv_field(value):
    """Return *value* as one CSV field, quoted and escaped when necessary."""
    text = _to_text(value)
    if any(ch in text for ch in u',"\r\n'):
        # Embedded quotes are doubled; quoting on commas alone produced
        # broken rows for values that also contained a double quote.
        return u'"%s"' % text.replace(u'"', u'""')
    return text


def generate_csv(file_words, is_filter=True):
    """Write the collected words to ``result.csv`` in the working directory.

    The output has two columns.  Each file contributes one row with its path
    in the first column, followed by one row per word with the word in the
    second column.  The file starts with a UTF-8 byte order mark and uses
    ``\\n`` line endings.

    Args:
        file_words: Iterable of ``(file_path, words)`` pairs.
        is_filter: When true, a word that was already written (for any file)
            is skipped; when false, every occurrence is written.

    Raises:
        IOError / OSError: If ``result.csv`` cannot be written.
        UnicodeDecodeError: If a byte-string path or word is not valid UTF-8.
    """
    seen = set()
    lines = []
    for path, words in file_words:
        lines.append(u"%s," % _csv_field(path))
        for word in words:
            word = _to_text(word)
            if is_filter and word in seen:
                continue
            seen.add(word)
            lines.append(u",%s" % _csv_field(word))

    payload = u"".join(u"%s\n" % line for line in lines)
    # Binary mode so the same bytes are produced on Python 2 and Python 3.
    with open("result.csv", "wb") as out:
        out.write(codecs.BOM_UTF8)
        out.write(payload.encode("utf-8"))
# Path suffixes that are never scanned, paired with the label shown in the
# "filtered" log line.
_SKIPPED_SUFFIXES = (
    (".min.js", "min.js"),
    ("models.py", "models.py"),
    (".html.py", ".html.py"),
)


def _skip_label(file_path):
    """Return the label of the excluded suffix *file_path* ends with, or None."""
    for suffix, label in _SKIPPED_SUFFIXES:
        if file_path.endswith(suffix):
            return label
    return None


def main(argv):
    """Scan every path in ``argv[1:]`` and write the findings to result.csv.

    Paths ending in an excluded suffix and files without Chinese text are
    reported and left out.  Exits with status 1 when no path is given.

    Args:
        argv: Full argument vector, program name first (e.g. ``sys.argv``).
            The shell is expected to expand globs such as ``/tmp/*.js``.
    """
    print(argv)
    # A single path is a valid invocation; the previous ``<= 2`` check
    # wrongly demanded at least two.
    if len(argv) < 2:
        print("glob path required")
        sys.exit(1)

    files = argv[1:]
    print("files: %s" % files)

    file_words = []
    for file_path in files:
        label = _skip_label(file_path)
        if label is not None:
            print("%s is %s, filtered" % (file_path, label))
            continue

        words = process_one_file(file_path)
        if not words:
            print("%s has no chinese words" % file_path)
            continue

        print("process %s ......" % file_path)
        file_words.append((file_path, words))

    print("generate csv")
    generate_csv(file_words)
    print("done!")


if __name__ == '__main__':
    main(sys.argv)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment