Created
September 18, 2015 10:11
-
-
Save ItoTomoki/02a308e3a8fbd16f42fd to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #encoding:utf8 | |
| #commenttext = uni_text.decode('unicode_escape') | |
| import json | |
| import os | |
| from ast import literal_eval | |
| import re | |
| import MeCab | |
| import unicodedata | |
| import sys | |
| import ngram | |
| import jcconv | |
# Command-line handling: the first argument is echoed here and is used as ID further down.
# Running the script without an argument raises IndexError at argvs[1].
| argvs = sys.argv # list holding the command-line arguments | |
| argc = len(argvs) # number of arguments | |
| # debug print | |
| print argvs[1] | |
| #print argc | |
| #ID = '0002' | |
def n_gram(uni, n):
    """Return every contiguous length-``n`` window of ``uni``, left to right.

    ``uni`` may be any sliceable sequence with a length (the script passes
    unicode text); each window is produced by slicing, so the windows have
    the same type as ``uni``.

    An empty list is returned when ``uni`` is shorter than ``n``.  It is also
    returned when ``n`` is smaller than 1: without that guard ``n == 0``
    yields ``len(uni) + 1`` empty slices and a negative ``n`` yields slices
    of mismatched lengths, neither of which is an n-gram.
    """
    if n < 1:
        return []
    return [uni[k:k + n] for k in range(len(uni) - n + 1)]
# Script setup: the first command-line argument selects the thread directory whose files are processed.
| ID = str(argvs[1]) | |
# NOTE(review): "[email protected]" inside this path looks like e-mail obfuscation inserted by a web scraper rather than a real directory name — confirm the actual path.
| files = os.listdir('../tcserv.nii.ac.jp/access/[email protected]/832c5b059b15f647/nicocomm/data/thread/' + ID) | |
# thread[ID] maps each file name to the records parsed from that file.
| thread = {} | |
| thread[ID] = {} | |
# index (N=2) is used later for padding and bigram splitting; index3 appears only in commented-out code in the visible source.
| index = ngram.NGram(N=2) | |
| index3 = ngram.NGram(N=3) | |
# ASCII punctuation, digits and letters; not referenced anywhere in the visible source.
| kigou = "!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_abcdefghijklmnopqrstuvwxyz{|}~" | |
# Load every file of the thread directory into thread[ID][file name].
# Each line of a file is parsed with ast.literal_eval and stored in Lines2 under a running integer key (count).
# NOTE(review): indentation was lost in this copy of the script, so the nesting described in the comments below is inferred — confirm against the original file.
| for nfile in files: | |
| filepass = ('../tcserv.nii.ac.jp/access/[email protected]/832c5b059b15f647/nicocomm/data/thread/' + ID +'/' + str(nfile)) | |
| #filepass = ('../tcserv.nii.ac.jp/access/[email protected]/832c5b059b15f647/nicocomm/data/thread/' + ID +'/' + "sm20158.dat") | |
# NOTE(review): f is never closed in this loop (there is no f.close() and no with-block).
| f = open(filepass) | |
| print nfile | |
| lines2 = f.readlines() # read every line up to the end of the file (newline characters are kept) | |
| Lines2 = {} | |
| count = 0 | |
| for line in lines2: | |
| try: | |
| Lines2[count] = literal_eval(line) | |
# NOTE(review): bare except — any failure, not only a parse error, takes the retry path below.
| except: | |
# Parse failed: log the line, its index and the file name, then retry once with every "null" turned into the string literal "null".
| print line | |
| print count | |
| print nfile | |
| line = line.replace('null', '"null"') | |
| print line | |
| try: | |
| Lines2[count] = literal_eval(line) | |
| except: | |
# Still unparsable: the line is skipped (count is presumably left unchanged so keys stay contiguous — depends on the lost indentation).
| continue | |
# Decode backslash escapes in the comment text; if that fails, drop the last character of the undecoded text instead.
| try: | |
| Lines2[count]['comment'] = Lines2[count]['comment'].decode('unicode_escape') | |
| except: | |
| try: | |
| #print ("Eroor1" + Lines2[count]['comment']) | |
| Lines2[count]['comment'] = Lines2[count]['comment'][0:-1] | |
| except: | |
| print ("Eroor2" + line) | |
| #print Lines2[count]['comment'] | |
| count += 1 | |
# Presumably executed once per file, after its line loop has finished — confirm the nesting.
| thread[ID][nfile] = Lines2 | |
# For every loaded thread file, open an output text file and clean each comment before it is segmented with MeCab and written out.
# NOTE(review): indentation was lost in this copy of the script; the loop nesting described here is inferred — confirm against the original file.
# The tagger is created with MeCab's -Owakati output format and a user dictionary (-u) at a machine-specific path.
| tagger = MeCab.Tagger( '-Owakati -u /usr/local/Cellar/mecab-ipadic/2.7.0-20070801/lib/mecab/dic/ipadic/ncnc.dic') | |
| #commentfiles = os.listdir('comment') | |
| for j in thread[ID].keys(): | |
# Output path: directory "comment2_kai<ID>", input file name with its last three characters replaced by "txt".
# NOTE(review): nothing in the visible code creates that directory — presumably it must already exist.
| filename = ("comment2_kai" + ID + "/" + j[0:-3] +"txt") | |
| #filename = ("comment2_kai" + ID + "/" + "sm20158." +"txt") | |
| fo = file(filename,'w') | |
| print filename | |
# commenttext accumulates the comment strings of this file as they are read from thread[ID][j].
| commenttext = '' | |
| for i in range(0,len(thread[ID][j])): | |
# Stop once the index passes 20000, logging where processing stopped.
| if i > 20000: | |
| print i,j | |
| break | |
# NOTE(review): raises KeyError if record i has no "comment" key or if key i is missing — nothing here guards against that.
| commenttext += thread[ID][j][i]["comment"] | |
# NFKC-normalise the comment in place; a failure is only printed and processing continues with the unnormalised value.
| try: | |
| thread[ID][j][i]["comment"] = unicodedata.normalize('NFKC', thread[ID][j][i]["comment"]) | |
| except: | |
| print "normalize Eroor" | |
# From here on pluscomment is a UTF-8 encoded byte string.
| pluscomment = str(thread[ID][j][i]["comment"].encode('utf-8')) | |
| #pluscomment = jcconv.hira2kata(pluscomment) # to be added later | |
# Remove decorative symbols, every "w", the text "null", newlines, tabs and spaces.
| pluscomment = pluscomment.replace("█", "") | |
| pluscomment = pluscomment.replace("□", "") | |
| pluscomment = pluscomment.replace("※", "") | |
| pluscomment = pluscomment.replace("∴", "") | |
| pluscomment = pluscomment.replace("*", "") | |
| pluscomment = pluscomment.replace("+", "") | |
| pluscomment = pluscomment.replace("・", "") | |
| pluscomment = pluscomment.replace("°", "") | |
| pluscomment = pluscomment.replace("w", "") | |
| pluscomment = pluscomment.replace("null", "") | |
| #pluscomment = ((((pluscomment.replace("ーー","ー")).replace("ーー","ー")).replace("ーー","ー")).replace("ーー","ー")).replace("ーー","ー") | |
| pluscomment = pluscomment.replace("\n", "") | |
| pluscomment = pluscomment.replace("\t", "") | |
| pluscomment = pluscomment.replace(" ", "") | |
| pluscomment = pluscomment.replace(" ", "") | |
# Replace small "ぁ" with full-size "あ".
| pluscomment = pluscomment.replace("ぁ", "あ") | |
# Strip the ASCII punctuation ranges !-/, :-@, [-` and {-~; the pattern is recompiled on every pass.
| pluscomment = re.sub(re.compile("[!-/:-@[-`{-~]"), '', pluscomment) | |
# Collapse drawn-out sounds: split the padded comment into bigrams and rebuild the text from the first character of each kept bigram.
# A bigram is dropped when it is "ーー" or when it equals the previously kept bigram, which shortens runs of a repeated character.
# NOTE(review): indentation was lost in this copy of the script; the nesting is inferred — confirm against the original file.
| # countermeasure for shouted (drawn-out) sounds | |
# The bytes are decoded back to unicode and padded before splitting.
# NOTE(review): the later replace of "$" suggests "$" is the pad character — confirm against the ngram package.
| pluscommentlist = list(index.ngrams(index.pad(pluscomment.decode("utf-8")))) | |
| text = '' | |
| word1 = '' | |
# word2 is assigned but never read in the visible code.
| word2 ='' | |
| for word in pluscommentlist: | |
| if word == u"ーー": | |
| continue | |
| if word != word1: | |
| text += word[0] | |
| word1 = word | |
# Keep the rebuilt text only when it is non-empty.
| if len(text) > 0: | |
| pluscomment = text#[1::] | |
| # countermeasure for repetition | |
# The string literal below (opened with four quote characters, so its content starts with a quote) holds a disabled variant of the repetition handling; as a bare expression it has no effect.
| """" | |
| pluscommentlist = list(index.ngrams(index.pad(pluscomment))) | |
| text = '' | |
| word1 = '' | |
| word2 ='' | |
| for n in range(0,len(pluscommentlist)): | |
| word = pluscommentlist[n] | |
| if n >= 2: | |
| if word == pluscommentlist[n-2]: | |
| text += (" " + word[0]) | |
| continue | |
| text += word[0] | |
| if len(text) > 0: | |
| pluscomment = text | |
| """ | |
# Remove short repeated patterns, then segment the comment with MeCab and append it to the output file.
# NOTE(review): indentation was lost in this copy of the script; the nesting described here is inferred — confirm against the original file.
| pluscommentlist = n_gram(pluscomment,3) | |
| text = '' | |
# word1 and word2 are assigned but not read in this section.
| word1 = '' | |
| word2 ='' | |
| # countermeasure for repetition / delete | |
# A trigram is skipped when it equals the trigram 3, 4 or 5 positions earlier; otherwise its first character is kept.
# NOTE(review): only the first character of each trigram is appended, so the last two characters of pluscomment never reach text.
| for n in range(0,len(pluscommentlist)): | |
| word = pluscommentlist[n] | |
| if n >= 3: | |
| if pluscommentlist[n] == pluscommentlist[n-3]: | |
| continue | |
| if n >= 4: | |
| if pluscommentlist[n] == pluscommentlist[n-4]: | |
| continue | |
| if n >= 5: | |
| if pluscommentlist[n] == pluscommentlist[n-5]: | |
| continue | |
| text += word[0] | |
# Keep the rebuilt text only when it is non-empty (fewer than three characters yields no trigrams, leaving pluscomment unchanged).
| if len(text) > 0: | |
| pluscomment = text | |
# Turn "$" into a space, segment the UTF-8 bytes with the tagger, replace newlines in its output with spaces and write the result.
| if pluscomment != '': | |
| pluscomment = pluscomment.replace("$"," ") | |
| pluscomment = tagger.parse(pluscomment.encode("utf-8")) | |
| pluscomment = pluscomment.replace("\n"," ") | |
| fo.write(pluscomment) | |
# Presumably executed once per file after the comment loop: store the concatenated comments, end the output with a newline and close it — confirm the nesting.
| thread[ID][j]["comment"] = commenttext | |
| fo.write("\n") | |
| fo.close() | |
# Attach video titles: read "<ID>.dat" from the video directory and store each record's decoded title on the matching thread[ID] entry.
# NOTE(review): indentation was lost in this copy of the script; the nesting is inferred — confirm against the original file.
| files = os.listdir('../tcserv.nii.ac.jp/access/[email protected]/832c5b059b15f647/nicocomm/data/video') | |
# NOTE(review): the loop variable is overwritten on its first use, so the directory listing only decides whether the body runs (at most once, via the [1:2] slice).
| for nfile in files[1:2]: | |
| #print file | |
| nfile = (ID + ".dat") | |
| filepass = ('../tcserv.nii.ac.jp/access/[email protected]/832c5b059b15f647/nicocomm/data/video/' + str(nfile)) | |
| f = open(filepass) | |
| lines2 = f.readlines() # read every line up to the end of the file (newline characters are kept) | |
| f.close() | |
| Lines2 = {} | |
| count = 0 | |
| for line in lines2: | |
# Parse the line, print the video id and title, and store the title under thread[ID]["<video_id>.dat"].
# Any failure (unparsable line, missing key, video without a loaded thread file) falls into the bare except, which only prints the line.
| try: | |
| Lines2[count] = literal_eval(line) | |
| print Lines2[count]["video_id"], Lines2[count]["title"].decode('unicode_escape') | |
| thread[ID][(Lines2[count]["video_id"] + ".dat")]["title"] = Lines2[count]["title"].decode('unicode_escape') | |
| count += 1 | |
| except: | |
| print line | |
# Disabled experiment: the triple-quoted string below is a bare expression with no effect. It holds a trial of the trigram repetition filter on a sample string.
| #pluscommentlist = list(index.ngrams(index3.pad(u"HAHAHAHAHAHAHAHAHA")))#HAHAHAHAHAHAHAHAHA"))) | |
| """ | |
| def n_gram(uni,n): | |
| return [uni[k:k+n] for k in range(len(uni)-n+1)] | |
| pluscommentlist = n_gram(u"フンフンフンフンフンフンフンフンフンフンフン",3) | |
| text = "" | |
| for n in range(0,len(pluscommentlist)): | |
| word = pluscommentlist[n] | |
| if n > 3: | |
| if (pluscommentlist[n] == pluscommentlist[n-4]): | |
| continue | |
| if n >= 3: | |
| if ((pluscommentlist[n] == pluscommentlist[n-3]) & (n >= 3)): | |
| print word | |
| continue | |
| if n >= 5: | |
| if pluscommentlist[n] == pluscommentlist[n-5]: | |
| continue | |
| print word, word[0] | |
| text += word[0] | |
| """ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment