wklken · November 7, 2017 02:32
diff --git a/get_chinese_words.py b/get_chinese_words.py
 # -*- coding: utf-8 -*-
 # just quick & dirty job

 import sys
 import re


 def process_one_file(file_path):
     """Collect the Chinese text found inside quoted strings of one file.

     Every line is scanned for "double-quoted" and then 'single-quoted'
     spans.  Spans containing at least one character in the range
     U+4E00..U+9FA5 are kept, and any non-Chinese characters at the very
     start and end of each kept span are trimmed off.

     :param file_path: path of a UTF-8 encoded text file.
     :return: list of unicode strings, in file order; for each line the
         double-quoted matches come before the single-quoted ones.
     :raises IOError: if the file cannot be opened.
     :raises UnicodeDecodeError: if the file is not valid UTF-8.
     """
     # io.open decodes once at the I/O boundary and behaves the same on
     # Python 2 and Python 3 (plain open() + str.decode is Python 2 only).
     import io

     # 1. all "" and '' (non-greedy, at least one character inside)
     quoted_patterns = (re.compile(u'".+?"'), re.compile(u"'.+?'"))
     # 2. spans that contain chinese chars
     chinese = re.compile(u'[\u4e00-\u9fa5]+')
     # 3. leading / trailing runs of non-chinese chars to trim
     leading_other = re.compile(u'^[^\u4e00-\u9fa5]+')
     trailing_other = re.compile(u'[^\u4e00-\u9fa5]+$')

     candidates = []
     with io.open(file_path, encoding="utf-8") as f:
         for line in f:
             line = line.strip()
             for pattern in quoted_patterns:
                 # Drop the surrounding quote characters of each match.
                 candidates.extend(
                     found.strip("'\"") for found in pattern.findall(line))

     results = []
     for text in candidates:
         if chinese.search(text):
             trimmed = leading_other.sub(u'', text)
             results.append(trailing_other.sub(u'', trimmed))
     return results


 def generate_csv(file_words, is_filter=True, output_path="result.csv"):
     """Write the collected words to a UTF-8 CSV file (with BOM).

     Layout: one row ``<file path>,`` per file, followed by one row
     ``,<word>`` for each of its words.

     :param file_words: iterable of ``(file_path, words)`` pairs.
     :param is_filter: when true, a word that was already written (for this
         or any earlier file) is skipped; when false every word is written.
     :param output_path: destination file; defaults to ``result.csv`` in the
         current working directory.
     """
     import io

     def csv_field(value):
         # Byte strings (e.g. Python 2 command-line paths) are turned into
         # text first; assumes UTF-8, undecodable bytes are replaced.
         if isinstance(value, bytes):
             value = value.decode("utf-8", "replace")
         # Quote fields containing a separator, quote or line break, and
         # double any embedded quotes so the row stays parseable.
         if any(ch in value for ch in u',"\r\n'):
             value = u'"%s"' % value.replace(u'"', u'""')
         return value

     seen = set()
     rows = []
     for path, words in file_words:
         rows.append(csv_field(path) + u",")
         for word in words:
             field = csv_field(word)
             if is_filter and field in seen:
                 continue
             seen.add(field)
             rows.append(u"," + field)

     # "utf-8-sig" emits the byte order mark itself.
     with io.open(output_path, "w", encoding="utf-8-sig") as f:
         f.write(u"".join(row + u"\n" for row in rows))


 if __name__ == '__main__':
     # Usage: pass one or more file paths, typically a shell-expanded glob
     # such as /tmp/*.js.  Paths ending in one of the suffixes below are
     # skipped; the second item is the label used in the log message.
     skipped_suffixes = (
         (".min.js", "min.js"),
         ("models.py", "models.py"),
         (".html.py", ".html.py"),
     )

     file_words = []
     print(sys.argv)
     # argv[0] is the script itself, so a single path gives len(argv) == 2;
     # only a call without any path is an error.
     if len(sys.argv) < 2:
         print("glob path required")
         sys.exit(1)

     files = sys.argv[1:]
     print("files: %s" % files)
     for file_path in files:
         skip_label = None
         for suffix, label in skipped_suffixes:
             if file_path.endswith(suffix):
                 skip_label = label
                 break
         if skip_label is not None:
             print("%s is %s, filtered" % (file_path, skip_label))
             continue

         words = process_one_file(file_path)
         if not words:
             print("%s has no chinese words" % file_path)
             continue
         print("process %s ......" % file_path)
         file_words.append((file_path, words))

     print("generate csv")
     generate_csv(file_words)
     print("done!")
	# -*- coding: utf-8 -*-
	# just quick & dirty job

	import sys
	import re


	def process_one_file(file_path):
	    """Collect the Chinese text found inside quoted strings of one file.

	    Every line is scanned for "double-quoted" and then 'single-quoted'
	    spans.  Spans containing at least one character in the range
	    U+4E00..U+9FA5 are kept, and any non-Chinese characters at the very
	    start and end of each kept span are trimmed off.

	    :param file_path: path of a UTF-8 encoded text file.
	    :return: list of unicode strings, in file order; for each line the
	        double-quoted matches come before the single-quoted ones.
	    :raises IOError: if the file cannot be opened.
	    :raises UnicodeDecodeError: if the file is not valid UTF-8.
	    """
	    # io.open decodes once at the I/O boundary and behaves the same on
	    # Python 2 and Python 3 (plain open() + str.decode is Python 2 only).
	    import io

	    # 1. all "" and '' (non-greedy, at least one character inside)
	    quoted_patterns = (re.compile(u'".+?"'), re.compile(u"'.+?'"))
	    # 2. spans that contain chinese chars
	    chinese = re.compile(u'[\u4e00-\u9fa5]+')
	    # 3. leading / trailing runs of non-chinese chars to trim
	    leading_other = re.compile(u'^[^\u4e00-\u9fa5]+')
	    trailing_other = re.compile(u'[^\u4e00-\u9fa5]+$')

	    candidates = []
	    with io.open(file_path, encoding="utf-8") as f:
	        for line in f:
	            line = line.strip()
	            for pattern in quoted_patterns:
	                # Drop the surrounding quote characters of each match.
	                candidates.extend(
	                    found.strip("'\"") for found in pattern.findall(line))

	    results = []
	    for text in candidates:
	        if chinese.search(text):
	            trimmed = leading_other.sub(u'', text)
	            results.append(trailing_other.sub(u'', trimmed))
	    return results


	def generate_csv(file_words, is_filter=True, output_path="result.csv"):
	    """Write the collected words to a UTF-8 CSV file (with BOM).

	    Layout: one row ``<file path>,`` per file, followed by one row
	    ``,<word>`` for each of its words.

	    :param file_words: iterable of ``(file_path, words)`` pairs.
	    :param is_filter: when true, a word that was already written (for this
	        or any earlier file) is skipped; when false every word is written.
	    :param output_path: destination file; defaults to ``result.csv`` in the
	        current working directory.
	    """
	    import io

	    def csv_field(value):
	        # Byte strings (e.g. Python 2 command-line paths) are turned into
	        # text first; assumes UTF-8, undecodable bytes are replaced.
	        if isinstance(value, bytes):
	            value = value.decode("utf-8", "replace")
	        # Quote fields containing a separator, quote or line break, and
	        # double any embedded quotes so the row stays parseable.
	        if any(ch in value for ch in u',"\r\n'):
	            value = u'"%s"' % value.replace(u'"', u'""')
	        return value

	    seen = set()
	    rows = []
	    for path, words in file_words:
	        rows.append(csv_field(path) + u",")
	        for word in words:
	            field = csv_field(word)
	            if is_filter and field in seen:
	                continue
	            seen.add(field)
	            rows.append(u"," + field)

	    # "utf-8-sig" emits the byte order mark itself.
	    with io.open(output_path, "w", encoding="utf-8-sig") as f:
	        f.write(u"".join(row + u"\n" for row in rows))


	if __name__ == '__main__':
	    # Usage: pass one or more file paths, typically a shell-expanded glob
	    # such as /tmp/*.js.  Paths ending in one of the suffixes below are
	    # skipped; the second item is the label used in the log message.
	    skipped_suffixes = (
	        (".min.js", "min.js"),
	        ("models.py", "models.py"),
	        (".html.py", ".html.py"),
	    )

	    file_words = []
	    print(sys.argv)
	    # argv[0] is the script itself, so a single path gives len(argv) == 2;
	    # only a call without any path is an error.
	    if len(sys.argv) < 2:
	        print("glob path required")
	        sys.exit(1)

	    files = sys.argv[1:]
	    print("files: %s" % files)
	    for file_path in files:
	        skip_label = None
	        for suffix, label in skipped_suffixes:
	            if file_path.endswith(suffix):
	                skip_label = label
	                break
	        if skip_label is not None:
	            print("%s is %s, filtered" % (file_path, skip_label))
	            continue

	        words = process_one_file(file_path)
	        if not words:
	            print("%s has no chinese words" % file_path)
	            continue
	        print("process %s ......" % file_path)
	        file_words.append((file_path, words))

	    print("generate csv")
	    generate_csv(file_words)
	    print("done!")