magicshui · December 31, 2015 19:58
diff --git a/tsji.py b/tsji.py
 #!/usr/bin/python
 # -*- coding: utf-8 -*-
 # 分别统计中文和英文的字数，不包括标点符号。
 # Author: Pan Junyong from zopen.cn, panjy at zopen dot cn
 import re
 import sys
 from types import StringType
 import operator
 import urllib2
 
 # See CJKSplitter
 # The pattern has two alternatives:
 #   1. runs of ASCII letters, digits, underscore and Greek letters
 #      (U+0392-U+03C9) -- each run is counted as one word;
 #   2. runs of CJK ideographs (U+4E00-U+9FFF, U+3400-U+4DBF, U+F900-U+FAFF),
 #      hiragana (U+3040-U+309F) and hangul syllables (U+AC00-U+D7AF) -- each
 #      character of such a run is counted individually.
 # Anything else (punctuation, whitespace, ...) is skipped by findall().
 rx = re.compile(u"[a-zA-Z0-9_\u0392-\u03c9]+|[\u4E00-\u9FFF\u3400-\u4dbf\uf900-\ufaff\u3040-\u309f\uac00-\ud7af]+", re.UNICODE)

 def caculateWords(s, encoding='utf-8'):
    """Count CJK characters and alphabetic words in *s*, ignoring punctuation.

    Args:
        s: Text to analyse, either a unicode string or an encoded byte string.
        encoding: Codec used to decode *s* when it is a byte string; bytes
            that cannot be decoded are dropped.

    Returns:
        A ``(cjk_len, asc_len)`` tuple: the total number of characters found
        in CJK runs, and the number of ASCII/Greek runs.
    """
    if isinstance(s, bytes):  # ``bytes`` is ``str`` on Python 2: not unicode yet
        s = s.decode(encoding, 'ignore')

    cjk_len = 0
    asc_len = 0
    for w in rx.findall(s):
        # A run never mixes the two character classes, so its first character
        # decides: the CJK class starts at U+3040, while the ASCII/Greek
        # class ends at U+03C9.
        if ord(w[0]) >= 0x3040:
            cjk_len += len(w)
        else:
            asc_len += 1
    return (cjk_len, asc_len)
 
 def main():
    """Print per-file and total word counts for the files named in sys.argv.

    For every path in ``sys.argv[1:]`` one line is printed with a running
    index, the file name (left-justified to 18 columns) and the
    ``(Chinese, English)`` tuple returned by ``caculateWords``. A final line
    reports the number of files and the element-wise sum of all tuples.
    """
    index = 0
    total_words = (0, 0)

    for filename in sys.argv[1:]:
        # Read raw bytes and close the file right away; caculateWords() is
        # the single place where the bytes get decoded.
        # TODO: check encoding
        with open(filename, 'rb') as f:
            s = f.read()
        words = caculateWords(s)
        index += 1

        # Materialise the sum so it stays a real tuple on every iteration.
        total_words = tuple(map(operator.add, total_words, words))
        # One pre-joined argument prints the same text whether ``print`` is
        # the Python 2 statement or the function.
        print(" ".join(["%2d" % index, filename.ljust(18), '(Chinese, English):', str(words)]))

    print(" ".join(["total: %2d files,     " % index, '(Chinese, English):', str(total_words)]))
 
 def get_words_count(url):
    """Return the CJK character count of the text served for *url*.

    The request goes to the ``queryit`` endpoint on infoqhelp.sinaapp.com
    with *url* appended as the ``url`` query parameter. The response body is
    passed to ``caculateWords`` and only the first element of its result
    (the CJK count) is returned. The request URL is printed before fetching.

    Args:
        url: Address handed to the service.

    Returns:
        The first element of ``caculateWords(body)``.
    """
    # NOTE(review): ``url`` is appended without percent-encoding; a value
    # containing '&' or '#' may be cut short by the service's query parsing.
    # Confirm what the service expects before changing this.
    query_url = "http://infoqhelp.sinaapp.com/queryit?url=" + url
    print(query_url)
    response = urllib2.urlopen(query_url)
    try:
        data = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()
    return caculateWords(data)[0]

 if __name__ == '__main__':
    # Ad-hoc run: fetch the CJK count for one hard-coded InfoQ article and
    # print it. main() is not invoked from here.
    article_count = get_words_count("http://www.infoq.com/cn/news/2013/11/yourkit-2013")
    print(article_count)
	#!/usr/bin/python
	# -*- coding: utf-8 -*-
	# 分别统计中文和英文的字数，不包括标点符号。
	# Author: Pan Junyong from zopen.cn, panjy at zopen dot cn
	import re
	import sys
	from types import StringType
	import operator
	import urllib2

	# See CJKSplitter
	# The pattern has two alternatives separated by a plain ``|``:
	#   1. runs of ASCII letters, digits, underscore and Greek letters
	#      (U+0392-U+03C9) -- each run is counted as one word;
	#   2. runs of CJK ideographs (U+4E00-U+9FFF, U+3400-U+4DBF, U+F900-U+FAFF),
	#      hiragana (U+3040-U+309F) and hangul syllables (U+AC00-U+D7AF) -- each
	#      character of such a run is counted individually.
	# The ``|`` must stay unescaped: ``\|`` would match a literal pipe and the
	# pattern would then find nothing in ordinary text.
	rx = re.compile(u"[a-zA-Z0-9_\u0392-\u03c9]+|[\u4E00-\u9FFF\u3400-\u4dbf\uf900-\ufaff\u3040-\u309f\uac00-\ud7af]+", re.UNICODE)

	def caculateWords(s, encoding='utf-8'):
	    """Count CJK characters and alphabetic words in *s*, ignoring punctuation.

	    Args:
	        s: Text to analyse, either a unicode string or an encoded byte string.
	        encoding: Codec used to decode *s* when it is a byte string; bytes
	            that cannot be decoded are dropped.

	    Returns:
	        A ``(cjk_len, asc_len)`` tuple: the total number of characters found
	        in CJK runs, and the number of ASCII/Greek runs.
	    """
	    if isinstance(s, bytes):  # ``bytes`` is ``str`` on Python 2: not unicode yet
	        s = s.decode(encoding, 'ignore')

	    cjk_len = 0
	    asc_len = 0
	    for w in rx.findall(s):
	        # A run never mixes the two character classes, so its first
	        # character decides: the CJK class starts at U+3040, while the
	        # ASCII/Greek class ends at U+03C9.
	        if ord(w[0]) >= 0x3040:
	            cjk_len += len(w)
	        else:
	            asc_len += 1
	    return (cjk_len, asc_len)

	def main():
	    """Print per-file and total word counts for the files named in sys.argv.

	    For every path in ``sys.argv[1:]`` one line is printed with a running
	    index, the file name (left-justified to 18 columns) and the
	    ``(Chinese, English)`` tuple returned by ``caculateWords``. A final
	    line reports the number of files and the element-wise sum of all tuples.
	    """
	    index = 0
	    total_words = (0, 0)

	    for filename in sys.argv[1:]:
	        # Read raw bytes and close the file right away; caculateWords() is
	        # the single place where the bytes get decoded.
	        # TODO: check encoding
	        with open(filename, 'rb') as f:
	            s = f.read()
	        words = caculateWords(s)
	        index += 1

	        # Materialise the sum so it stays a real tuple on every iteration.
	        total_words = tuple(map(operator.add, total_words, words))
	        # One pre-joined argument prints the same text whether ``print`` is
	        # the Python 2 statement or the function.
	        print(" ".join(["%2d" % index, filename.ljust(18), '(Chinese, English):', str(words)]))

	    print(" ".join(["total: %2d files, " % index, '(Chinese, English):', str(total_words)]))

	def get_words_count(url):
	    """Return the CJK character count of the text served for *url*.

	    The request goes to the ``queryit`` endpoint on infoqhelp.sinaapp.com
	    with *url* appended as the ``url`` query parameter. The response body
	    is passed to ``caculateWords`` and only the first element of its result
	    (the CJK count) is returned. The request URL is printed before fetching.

	    Args:
	        url: Address handed to the service.

	    Returns:
	        The first element of ``caculateWords(body)``.
	    """
	    # NOTE(review): ``url`` is appended without percent-encoding; a value
	    # containing '&' or '#' may be cut short by the service's query
	    # parsing. Confirm what the service expects before changing this.
	    query_url = "http://infoqhelp.sinaapp.com/queryit?url=" + url
	    print(query_url)
	    response = urllib2.urlopen(query_url)
	    try:
	        data = response.read()
	    finally:
	        # Release the connection even if reading fails.
	        response.close()
	    return caculateWords(data)[0]

	if __name__ == '__main__':
	    # Ad-hoc run: fetch the CJK count for one hard-coded InfoQ article and
	    # print it. main() is not invoked from here.
	    print(get_words_count("http://www.infoq.com/cn/news/2013/11/yourkit-2013"))
No results found