NLTK patch for Chinese (zh) characters.
#!/usr/bin/python
# -*- coding: gbk -*-
# Adapted from http://hi.baidu.com/xiaopch/item/3a54eec3d93f9e2def466548
from __future__ import division

import os
import sys
import shutil
import traceback
import codecs
import re
import pprint

import nltk
from nltk import *
##from nltk.book import *
from urllib import urlopen
from chardet.universaldetector import UniversalDetector
import chardet

FileCode = 'utf-8'                      # encoding the news file is converted to
encodes = ["gb2312", "gbk", "gb18030"]  # candidate Chinese encodings (not used below)
def ReadNews(url, filename):
    """Fetch a news page, strip the HTML, and save the plain text to filename."""
    html = urlopen(url).read()
    raw = nltk.clean_html(html)
    fileHandle = open(filename, 'wb')
    fileHandle.write(raw)
    fileHandle.close()

def ReadFile(filename):
    """Read filename and return its contents decoded from FileCode."""
    s = open(filename).read()
    return s.decode(FileCode)
def TransferToEncoding(filename, toCode):
    '''
    Re-save the contents of filename in the toCode text encoding.
    @param filename{string}: path to a text file
    @param toCode{string}: target encoding, e.g. gbk, utf-8
    @return{boolean}: True on success, False otherwise
    '''
    if os.path.isdir(filename):
        print "error: not a file"
        return False
    try:
        detector = UniversalDetector()
        # read the raw content
        f = open(filename, "r")
        ls = f.readlines()
        f.close()
        # detect the current encoding with chardet
        for l in ls:
            detector.feed(l)
            if detector.done: break
        detector.close()
        fromcode = detector.result['encoding']
        if fromcode is None:
            print "error: " + str(detector.result)
            return False
        if fromcode.lower() != toCode.lower():
            ## # back up the original file
            ## if not os.path.exists(filename+".bak"):
            ##     shutil.copy(filename, filename+".bak")
            # read the raw bytes
            fp = open(filename, 'rb')
            content = fp.read()
            fp.close()
            # decode with the detected encoding, then re-encode and overwrite
            new_content = content.decode(fromcode, 'ignore').encode(toCode)
            fp = open(filename, 'wb')
            fp.write(new_content)
            fp.close()
            print "result encoding: " + fromcode + " to " + toCode
        else:
            # already in the target encoding; nothing to do
            print "result encoding: " + fromcode + " to " + toCode
    except BaseException:
        traceback.print_exc()
        ## # restore the backup
        ## if os.path.exists(filename+".bak"):
        ##     shutil.copy(filename+".bak", filename)
        return False
    return True
filename = 'News.txt'
##ReadNews("http://news.baidu.com", filename)
##print chardet.detect(filename)
TransferToEncoding(filename, FileCode)
raw = ReadFile(filename)
tokens = nltk.word_tokenize(raw)
##print raw
##text = nltk.Text(tokens)  # NLTK cannot handle Chinese here; one fix is to edit text.py and comment out the part of __init__ that auto-generates the name
text = nltk.Text(tokens, FileCode)  # pass FileCode as an extra argument to add Chinese support
print "Text Length: " + str(len(text))
fdist1 = FreqDist(text)
vocabulary1 = fdist1.keys()
print ' '.join(vocabulary1[:50]).encode(FileCode)
fdist1.plot(50, cumulative=True)
##text.concordance(u"新")
##text.similar(u"新")
##text.common_contexts([u"新", u"闻"])
##text.dispersion_plot([u"新", u"闻"])
##text.generate()
##lset = sorted(set(text))
##for c in vocabulary1[:50]:
##    print c.encode(FileCode)
##
####t = set(text)
####for c in t:
####    print c.encode("gbk")
##print sys.getdefaultencoding()
##
##print text  # test
##text.collocations()
# http://news.baidu.com/
##proxies = {'http': 'http://www.someproxy.com:3128'}
##raw = urlopen(url, proxies=proxies).read()
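For comparison, below is a minimal sketch of the same pipeline on Python 3 with a current NLTK release, where str is already Unicode and no encoding argument to nltk.Text is needed. It assumes beautifulsoup4 is installed for HTML stripping (nltk.clean_html() was removed in NLTK 3) and that the punkt tokenizer data has been downloaded; note that word_tokenize still does not segment Chinese into words, it only splits on whitespace and punctuation.

#!/usr/bin/env python3
# Minimal Python 3 / NLTK 3 sketch of the same workflow (assumes beautifulsoup4
# is installed and nltk.download('punkt') has been run).
from urllib.request import urlopen

import nltk
from bs4 import BeautifulSoup

url = "http://news.baidu.com"
html = urlopen(url).read()                           # raw bytes
raw = BeautifulSoup(html, "html.parser").get_text()  # decoded to str by BeautifulSoup

tokens = nltk.word_tokenize(raw)   # no Chinese word segmentation, only
                                   # whitespace/punctuation splitting
text = nltk.Text(tokens)           # str is Unicode in Python 3: no FileCode patch needed
fdist = nltk.FreqDist(text)

print("Text Length:", len(text))
print(" ".join(w for w, _ in fdist.most_common(50)))
fdist.plot(50, cumulative=True)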