NLTK patch for zh characters.
#!/usr/bin/python
# -*- coding: gbk -*-
# http://hi.baidu.com/xiaopch/item/3a54eec3d93f9e2def466548
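# Pipeline: (1) optionally fetch a news page and strip its HTML, (2) convert
# the saved text file to the FileCode encoding using chardet's detector,
# (3) tokenize the text with NLTK and print/plot a word frequency distribution.
# Note: the script targets the Python 2 / NLTK 2.x APIs used below.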
from __future__ import division
import os
import codecs
import nltk
from nltk import *
##from nltk.book import *
import re, pprint
from urllib import urlopen
import sys, shutil, traceback, time
from chardet.universaldetector import UniversalDetector
import chardet
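# FileCode is the target encoding used throughout; encodes lists common
# Chinese encodings for reference (it is not referenced again in this script).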
FileCode = 'utf-8'
encodes = ["gb2312", "gbk", "gb18030"]
def ReadNews(url, filename):
    # Fetch the page and strip the HTML markup (nltk.clean_html is NLTK 2.x only;
    # it is no longer available in NLTK 3).
    html = urlopen(url).read()
    raw = nltk.clean_html(html)
    fileHandle = open(filename, 'wb')
    fileHandle.write(raw)
    fileHandle.close()
def ReadFile(filename):
    # Read the file and decode it from the target encoding (FileCode).
    s = open(filename).read()
    return s.decode(FileCode)
def TransferToEncoding(filename, toCode):
    '''
    Save the content of filename back to filename with the toCode text encoding.
    @param filename{string}: text file
    @param toCode{string}: target text encoding, e.g. gbk, utf-8
    @return{boolean}: True on success, False on failure
    '''
    if os.path.isdir(filename):
        print "error: not a file"
        return False
    try:
        detector = UniversalDetector()
        #print filename
        # Read the content.
        f = open(filename, "r")
        ls = f.readlines()
        f.close()
        # Detect the encoding.
        for l in ls:
            detector.feed(l)
            if detector.done: break
        detector.close()
        #print detector.result
        #print dir(detector.result)
        fromcode = detector.result['encoding']
        if fromcode is None:
            print "error: " + str(detector.result)
            return False
#print "original encoding:",encode
if(fromcode.lower()!= toCode.lower()):
## #backup orginal file
## if not os.path.exists(filename+".bak"):
## shutil.copy(filename, filename+".bak")
##
# convert the concent
fp = open( filename, 'rb' )
content = fp.read()
fp.close()
fp = open( filename, 'wb' )
new_content = content.decode( fromcode,'ignore' ).encode( toCode )
#save to another encoding
fp.write( new_content )
fp.close()
print "result encoding:"+fromcode+" to "+toCode
else:
print "result encoding:"+fromcode+" to "+toCode
pass
#print "same encoding"
    except BaseException, e:
        #print "error:", e
        traceback.print_exc()
        ## # Restore the original from the backup.
        ## if os.path.exists(filename + ".bak"):
        ##     shutil.copy(filename + ".bak", filename)
        return False
    finally:
        pass
    return True
filename = 'News.txt'
##ReadNews("http://news.baidu.com", filename)
##print chardet.detect(filename)
TransferToEncoding(filename, FileCode)
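# Tokenize the converted text and wrap it in an NLTK Text object; then print
# the first 50 vocabulary items (FreqDist keys come back frequency-sorted in
# NLTK 2.x) and plot a cumulative frequency distribution.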
raw = ReadFile(filename)
tokens = nltk.word_tokenize(raw)
##print raw
##text = nltk.Text(tokens)  # NLTK cannot handle Chinese here; alternatively, edit text.py and comment out the part of __init__ that auto-generates the name
text = nltk.Text(tokens, FileCode)  # pass FileCode as an extra argument to add Chinese support
print "Text Length: " + str(len(text))
fdist1 = FreqDist(text)
vocabulary1 = fdist1.keys()
print ' '.join(vocabulary1[:50]).encode(FileCode)
fdist1.plot(50, cumulative=True)
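# Why the FileCode argument above helps (assumption based on the NLTK 2.x
# source): without an explicit name, Text.__init__ builds one by str()-joining
# the first few tokens, which breaks on non-ASCII (Chinese) tokens; passing any
# explicit name, here FileCode, skips that auto-generation, so the text.py
# patch mentioned in the commented-out line is not needed.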
##text.concordance(u"新")
##text.similar(u"新")
##text.common_contexts([u"新", u"闻"])
##text.dispersion_plot([u"新", u"闻"])
##text.generate()
##lset =sorted(set(text))
##for c in vocabulary1[:50]:
## print c.encode(FileCode)
##
####t=set(text)
####for c in t:
#### print c.encode("gbk")
##print sys.getdefaultencoding()
##
##print text  # test
##text.collocations()
#http://news.baidu.com/
##proxies = {'http': 'http://www.someproxy.com:3128'}
##raw = urlopen(url, proxies=proxies).read()