Skip to content

Instantly share code, notes, and snippets.

@wklken
Created November 7, 2017 02:32
Show Gist options
  • Save wklken/f58bdcafa509f3ae2fefb00d55088ef9 to your computer and use it in GitHub Desktop.
Save wklken/f58bdcafa509f3ae2fefb00d55088ef9 to your computer and use it in GitHub Desktop.
Get Chinese words and sentences from js/py files
# -*- coding: utf-8 -*-
# just quick & dirty job
import sys
import re
# Quoted literals: a non-greedy run of at least one character between a
# pair of matching quote characters on the same line.
_DOUBLE_QUOTED = re.compile(u'"(.+?)"')
_SINGLE_QUOTED = re.compile(u"'(.+?)'")
# Span running from the first to the last character in U+4E00..U+9FA5;
# anything outside that range at either end of the literal is dropped.
_CHINESE_SPAN = re.compile(u'[\u4e00-\u9fa5](?:.*[\u4e00-\u9fa5])?', re.DOTALL)


def process_one_file(file_path):
    """Collect the Chinese text found inside quoted literals of one file.

    Every line is scanned first for "..." literals and then for '...'
    literals. Both scans run over the whole line, so nested or overlapping
    quotes can yield the same text more than once. For each literal that
    contains at least one character in the range U+4E00..U+9FA5, the text
    from the first such character to the last one is kept; leading and
    trailing characters outside that range are trimmed, inner ones stay.

    Args:
        file_path: path of a UTF-8 encoded text file.

    Returns:
        A list of text strings in file order; within one line the
        double-quoted matches come before the single-quoted ones.

    Raises:
        IOError/OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    # Local import keeps this block self-contained. io.open decodes once at
    # the I/O boundary and behaves the same on Python 2 and Python 3, unlike
    # the per-line str.decode() call, which only exists on Python 2.
    import io

    results = []
    with io.open(file_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            for pattern in (_DOUBLE_QUOTED, _SINGLE_QUOTED):
                for literal in pattern.findall(line):
                    span = _CHINESE_SPAN.search(literal)
                    if span:
                        results.append(span.group(0))
    return results
def generate_csv(file_words, is_filter=True):
has_showed = dict()
lines = []
for file, words in file_words:
lines.append("%s," % file)
for word in words:
if "," in word:
word = '"%s"' % word
if word not in has_showed:
lines.append(",%s" % word)
has_showed[word] = ""
else:
if is_filter:
continue
else:
lines.append(",%s" % word)
import codecs
with open("result.csv", "w") as f:
f.write(codecs.BOM_UTF8)
for line in lines:
f.write('%s\n' % line.encode("utf-8"))
if __name__ == '__main__':
    # Script entry point: every command-line argument after the script name
    # is treated as one file path (e.g. the expansion of /tmp/*.js); the
    # script itself does not expand glob patterns.
    print(sys.argv)
    # sys.argv[0] is the script name, so a single path gives a length of 2;
    # only a call with no path at all is rejected.
    if len(sys.argv) < 2:
        print("glob path required")
        sys.exit(1)

    files = sys.argv[1:]
    print("files: %s" % files)

    # (path suffix to skip, label used in the log message)
    skipped_suffixes = (
        (".min.js", "min.js"),
        ("models.py", "models.py"),
        (".html.py", ".html.py"),
    )

    file_words = []
    for file_path in files:
        skip_label = None
        for suffix, label in skipped_suffixes:
            if file_path.endswith(suffix):
                skip_label = label
                break
        if skip_label is not None:
            print("%s is %s, filtered" % (file_path, skip_label))
            continue

        words = process_one_file(file_path)
        if not words:
            print("%s has no chinese words" % file_path)
            continue

        print("process %s ......" % file_path)
        file_words.append((file_path, words))

    print("generate csv")
    generate_csv(file_words)
    print("done!")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment