@ItoTomoki
Last active September 18, 2015 01:47
#encoding:utf8
import os
from ast import literal_eval
import re
import MeCab
import unicodedata
import sys
import ngram

argvs = sys.argv  # list holding the command-line arguments
argc = len(argvs)  # number of arguments
# debug print
print argvs[1]
#print argc
#ID = '0002'
ID = str(argvs[1])
files = os.listdir('../tcserv.nii.ac.jp/access/[email protected]/832c5b059b15f647/nicocomm/data/thread/' + ID)
thread = {}
thread[ID] = {}
kigou = "!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_abcdefghijklmnopqrstuvwxyz{|}~"
index = ngram.NGram(N=2)
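# A minimal sketch of what the bigram index yields (assuming the ngram
# package's default "$" pad character): index.pad(u"まじ") gives u"$まじ$",
# and index.ngrams then yields the character bigrams
# [u"$ま", u"まじ", u"じ$"].
#print list(index.ngrams(index.pad(u"まじ")))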
for nfile in files:
    filepass = ('../tcserv.nii.ac.jp/access/[email protected]/832c5b059b15f647/nicocomm/data/thread/' + ID + '/' + str(nfile))
    f = open(filepass)
    lines2 = f.readlines()  # read every line up to EOF (newlines included)
    #data1 = f.read()  # returns the whole file read to EOF
    f.close()
    Lines2 = {}
    count = 0
    # each line is a Python-literal dict; bare null tokens are quoted so
    # literal_eval can parse the line on a second attempt
    for line in lines2:
        try:
            Lines2[count] = literal_eval(line)
        except:
            print line
            print count
            print nfile
            line = line.replace('null', '"null"')
            print line
            try:
                Lines2[count] = literal_eval(line)
            except:
                continue
        try:
            Lines2[count]['comment'] = Lines2[count]['comment'].decode('unicode_escape')
        except:
            try:
                #print ("Error1" + Lines2[count]['comment'])
                Lines2[count]['comment'] = Lines2[count]['comment'][0:-1]
            except:
                print ("Error2" + line)
        #print Lines2[count]['comment']
        count += 1
    thread[ID][nfile] = Lines2
#tagger = MeCab.Tagger('-Owakati -u /usr/local/Cellar/mecab-ipadic/2.7.0-20070801/lib/mecab/dic/ipadic/ncnc.dic')
#commentfiles = os.listdir('comment')
for j in thread[ID].keys():
    filename = ("comment2_bigram" + ID + "/" + j[0:-3] + "txt")
    fo = file(filename, 'w')
    print filename
    commenttext = ''
    for i in range(0, len(thread[ID][j])):
        if i > 20000:
            print i, j
            break
        commenttext += thread[ID][j][i]["comment"]
        try:
            thread[ID][j][i]["comment"] = unicodedata.normalize('NFKC', thread[ID][j][i]["comment"])
        except:
            print "normalize Error"
        pluscomment = str(thread[ID][j][i]["comment"].encode('utf-8'))
        # strip decorative symbols, ASCII-art characters, and filler
        pluscomment = pluscomment.replace("█", "")
        pluscomment = pluscomment.replace("□", "")
        pluscomment = pluscomment.replace("※", "")
        pluscomment = pluscomment.replace("∴", "")
        pluscomment = pluscomment.replace("*", "")
        pluscomment = pluscomment.replace("+", "")
        pluscomment = pluscomment.replace("・", "")
        pluscomment = pluscomment.replace("°", "")
        pluscomment = pluscomment.replace("w", "")
        pluscomment = pluscomment.replace("null", "")
        # collapse runs of long-vowel marks ("ーー...") down to a single "ー"
        for _ in range(5):
            pluscomment = pluscomment.replace("ーー", "ー")
        pluscomment = pluscomment.replace("\n", "")
        pluscomment = pluscomment.replace("\t", "")
        pluscomment = pluscomment.replace(" ", "")
        pluscomment = pluscomment.replace(" ", "")
        pluscomment = re.sub(re.compile("[!-/:-@[-`{-~]"), '', pluscomment)
        if pluscomment != '':
            #pluscomment = tagger.parse(pluscomment)
            for text in list(index.ngrams(index.pad(pluscomment.decode("utf-8")))):
                fo.write(text.encode("utf-8") + " ")
    #thread[ID][j]["comment"] =
    fo.write("\n")
    fo.close()
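# Each comment2_bigram file now holds one long space-separated stream of
# character bigrams for a video (a sketch, assuming the layout written
# above): e.g. the comment u"まじ" contributes "$ま まじ じ$ ".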
files = os.listdir('../tcserv.nii.ac.jp/access/[email protected]/832c5b059b15f647/nicocomm/data/video')
for nfile in files[1:2]:
    #print file
    nfile = (ID + ".dat")  # only this ID's metadata file is actually read
    filepass = ('../tcserv.nii.ac.jp/access/[email protected]/832c5b059b15f647/nicocomm/data/video/' + str(nfile))
    f = open(filepass)
    lines2 = f.readlines()  # read every line up to EOF (newlines included)
    f.close()
    Lines2 = {}
    count = 0
    for line in lines2:
        try:
            Lines2[count] = literal_eval(line)
            print Lines2[count]["video_id"], Lines2[count]["title"].decode('unicode_escape')
            thread[ID][(Lines2[count]["video_id"] + ".dat")]["title"] = Lines2[count]["title"].decode('unicode_escape')
            count += 1
        except:
            print line
#encoding:utf-8
from gensim.models import word2vec
import numpy as np
import json
import os
from ast import literal_eval
import re
import sys
import MeCab
from collections import defaultdict
from mpl_toolkits.mplot3d.axes3d import Axes3D
import sklearn.decomposition
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.feature_extraction.text import TfidfVectorizer
modelnico = word2vec.Word2Vec.load("allcomment2.model")
model = modelnico
def wordvec(word, model=modelnico):
    # return the unit-normalized word2vec vector, or a zero vector for OOV words
    try:
        v = model[word] / np.linalg.norm(model[word])
        return v
    except:
        return np.zeros(len(model[model.vocab.keys()[0]]))
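# Usage sketch: for an in-vocabulary token, wordvec returns a unit vector
# (np.linalg.norm(v) == 1.0); unknown tokens fall through to the zero
# vector, so callers can sum results without special-casing OOV words.
#v = wordvec(u"猫")  # hypothetical example token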
def morphological_analysis(text):
    # count nouns, adjectives, and adjectival verbs in the text with MeCab
    word2freq = defaultdict(int)
    mecab = MeCab.Tagger('-u /usr/local/Cellar/mecab-ipadic/2.7.0-20070801/lib/mecab/dic/ipadic/ncnc.dic')
    node = mecab.parseToNode(text)
    while node:
        if (node.feature.split(",")[0] == "名詞") | (node.feature.split(",")[0] == "形容詞") | (node.feature.split(",")[0] == "形容動詞"):
            word2freq[node.surface] += 1
        node = node.next
    return word2freq
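# node.feature is a comma-separated string whose first field is the part
# of speech, e.g. "名詞,一般,*,*,*,*,猫,ネコ,ネコ" for a common noun, which
# is why split(",")[0] is compared against 名詞/形容詞/形容動詞 above.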
def output(word2freq):
    # print word frequencies in descending order
    for word, freq in sorted(word2freq.items(), key=lambda x: x[1], reverse=True):
        print str(freq), word
def makevec(word2freq):
    # frequency-weighted sum of word vectors, unit-normalized;
    # only words occurring more than 5 times contribute
    freqcount = 0
    v = np.zeros(len(model[model.vocab.keys()[0]]))
    for word, freq in sorted(word2freq.items(), key=lambda x: x[1], reverse=True):
        if int(freq) > 5:
            v += freq * wordvec(word.decode("utf-8"))
            freqcount += freq
    if (v == np.zeros(len(model[model.vocab.keys()[0]]))).all():
        return np.zeros(len(model[model.vocab.keys()[0]]))
    else:
        return (v / np.linalg.norm(v))
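# In effect: v = sum over words with freq > 5 of freq(w) * u_w, where u_w
# is the unit word2vec vector from wordvec, returned as v / ||v|| (or the
# zero vector when no word qualifies).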
def createvector(video_id, ID="0000"):
    # build a comment vector for one video; "sm9" is special-cased to zero
    if video_id == "sm9":
        return np.zeros(len(model[model.vocab.keys()[0]]))
    else:
        filename = ("comment" + ID + "/" + str(video_id) + ".txt")
        f = open(filename)
        data = f.read()
        f.close()
        v = makevec(morphological_analysis(data))
        return v
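# Usage sketch (hypothetical video id): createvector("sm12345", ID="0000")
# reads comment0000/sm12345.txt and returns a unit (or zero) vector.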
vectorinfo = {}
files = os.listdir('../tcserv.nii.ac.jp/access/[email protected]/832c5b059b15f647/nicocomm/data/video')
textinfo = {}
thread = {}
count = 0
#for file in files[1:2]:
for ID in ["0000","0001","0002","0003"]:
    #print file
    filename = ID + ".dat"
    filepass = '../tcserv.nii.ac.jp/access/[email protected]/832c5b059b15f647/nicocomm/data/video/' + str(filename)
    f = open(filepass)
    lines2 = f.readlines()  # read every line up to EOF (newlines included)
    data1 = f.read()  # empty here: readlines() above already consumed the file
    f.close()
    Lines2 = {}
    count = 0
    textinfo[ID] = {}
    thread[ID] = {}
    for line in lines2:
        try:
            Lines2[count] = literal_eval(line)
        except:
            line = line.replace('null', '"null"')
            Lines2[count] = literal_eval(line)
        thread[ID][(Lines2[count]["video_id"] + ".dat")] = Lines2[count]
        #thread["0000"][(Lines2[count]["video_id"] + ".dat")]["title"] = Lines2[count]["title"].decode('unicode_escape')
        textinfo[ID][Lines2[count]["video_id"]] = Lines2[count]["title"].decode('unicode_escape')
        count += 1
def makewordlist(ID, video_id):
    # return (word -> frequency dict, space-joined surface forms) for the
    # nouns/adjectives/adjectival verbs in one video's comment file
    filename = ("comment2_" + ID + "/" + str(video_id) + ".txt")
    f = open(filename)
    text = f.read()
    f.close()
    wordlist = ""
    word2freq = defaultdict(int)
    mecab = MeCab.Tagger('-u /usr/local/Cellar/mecab-ipadic/2.7.0-20070801/lib/mecab/dic/ipadic/ncnc.dic')
    node = mecab.parseToNode(text)
    while node:
        if (node.feature.split(",")[0] == "名詞") | (node.feature.split(",")[0] == "形容詞") | (node.feature.split(",")[0] == "形容動詞"):
            wordlist += node.surface
            wordlist += " "
            word2freq[node.surface] += 1
        node = node.next
    return word2freq, wordlist[0:-1]
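# wordlist[0:-1] drops the trailing space, so the second return value is a
# plain space-joined token string ready for the whitespace tokenizer that
# TfidfVectorizer uses below.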
word2freqlist = {}
wordlist = {}
for ID in ["0000","0001","0002","0003"]:
    vectorinfo[ID] = {}
    word2freqlist[ID] = {}
    wordlist[ID] = {}
    for j in textinfo[ID].keys():
        #print j
        try:
            vectorinfo[ID][j] = createvector(video_id=j, ID=ID)
        except:
            vectorinfo[ID][j] = np.zeros(len(model[model.vocab.keys()[0]]))
            print ID, j
        try:
            word2freqlist[ID][j], wordlist[ID][j] = makewordlist(ID, j)
        except:
            print ID, j
tfidfTextList = {}
voc = model.vocab.keys()
for ID in ["0000","0001","0002","0003"]:
    for n in wordlist[ID].keys():
        tfidfTextList[n] = ""
        # keep only words that are in the word2vec vocabulary
        for w in wordlist[ID][n].split(' '):
            try:
                k = model[w.decode("utf-8")]
                tfidfTextList[n] += w
                tfidfTextList[n] += " "
            except:
                print n, w
def tokenize(text):
    # the comment texts are already space-separated, so tokenizing is a split
    wakatilist = text.split(" ")
    return wakatilist

tfidf = TfidfVectorizer(tokenizer=tokenize)
tfs = tfidf.fit_transform(tfidfTextList.values())
feature_names = tfidf.get_feature_names()
tfsdic = {}
n = 0
idlist = tfidfTextList.keys()
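# Row n of tfs lines up with idlist[n] (an unmodified dict's .values() and
# .keys() share one ordering in Python 2), so a word -> tf-idf map for one
# video is, as a sketch:
#weights = dict(zip(feature_names, tfs.toarray()[0]))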
def maketfidfvec(number):
    # tf-idf-weighted version of makevec for the video at row `number`
    d = dict(zip(feature_names, tfs.toarray()[number]))
    videoid = idlist[number]
    # find which ID bucket holds this video's frequency dict
    for ID in ["0000","0001","0002","0003"]:
        try:
            k = word2freqlist[ID][videoid]
            break
        except:
            continue
    v = np.zeros(len(model[model.vocab.keys()[0]]))
    for word, freq in sorted(k.items(), key=lambda x: x[1], reverse=True):
        if int(freq) > 1:
            try:
                v += freq * wordvec(word.decode("utf-8")) * d[word.decode("utf-8")]
            except:
                print word, ID, videoid
    if np.linalg.norm(v) > 0:
        return v / np.linalg.norm(v)
    else:
        return v
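# Each summand above is freq(w) * tfidf(w) * u_w, i.e. makevec's frequency
# weighting rescaled by the word's tf-idf score for this video, with a
# final unit normalization when the sum is nonzero.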
tfidfvectorinfo = {}
for ID in ["0000","0001","0002","0003"]:
    tfidfvectorinfo[ID] = {}
    for n in range(0, tfs.toarray().shape[0]):
        tfidfvectorinfo[ID][idlist[n]] = maketfidfvec(n)
#encoding:utf8
#commenttext = uni_text.decode('unicode_escape')
import json
import os
from ast import literal_eval
import re
import MeCab
import unicodedata
import sys
import ngram
import jcconv

argvs = sys.argv  # list holding the command-line arguments
argc = len(argvs)  # number of arguments
# debug print
print argvs[1]
#print argc
#ID = '0002'
ID = str(argvs[1])
files = os.listdir('../data/tcserv.nii.ac.jp/access/[email protected]/832c5b059b15f647/nicocomm/data/thread/' + ID)
thread = {}
thread[ID] = {}
index = ngram.NGram(N=2)   # character bigrams
index3 = ngram.NGram(N=3)  # trigram padding, used by the repeat filter below
kigou = "!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_abcdefghijklmnopqrstuvwxyz{|}~"
for nfile in files:
    filepass = ('../data/tcserv.nii.ac.jp/access/[email protected]/832c5b059b15f647/nicocomm/data/thread/' + ID + '/' + str(nfile))
    f = open(filepass)
    lines2 = f.readlines()  # read every line up to EOF (newlines included)
    #data1 = f.read()  # returns the whole file read to EOF
    f.close()
    Lines2 = {}
    count = 0
    for line in lines2:
        try:
            Lines2[count] = literal_eval(line)
        except:
            print line
            print count
            print nfile
            line = line.replace('null', '"null"')
            print line
            try:
                Lines2[count] = literal_eval(line)
            except:
                continue
        try:
            Lines2[count]['comment'] = Lines2[count]['comment'].decode('unicode_escape')
        except:
            try:
                #print ("Error1" + Lines2[count]['comment'])
                Lines2[count]['comment'] = Lines2[count]['comment'][0:-1]
            except:
                print ("Error2" + line)
        #print Lines2[count]['comment']
        count += 1
    thread[ID][nfile] = Lines2
tagger = MeCab.Tagger('-Owakati -u /usr/local/Cellar/mecab/0.996/lib/mecab/dic/ipadic/ncnc.dic')
#commentfiles = os.listdir('comment')
for j in thread[ID].keys():
    filename = ("comment2_kai" + ID + "/" + j[0:-3] + "txt")
    fo = file(filename, 'w')
    print filename
    commenttext = ''
    for i in range(0, len(thread[ID][j])):
        if i > 20000:
            print i, j
            break
        commenttext += thread[ID][j][i]["comment"]
        try:
            thread[ID][j][i]["comment"] = unicodedata.normalize('NFKC', thread[ID][j][i]["comment"])
        except:
            print "normalize Error"
        pluscomment = str(thread[ID][j][i]["comment"].encode('utf-8'))
        #pluscomment = jcconv.hira2kata(pluscomment)  # to be added later
        # strip decorative symbols, ASCII-art characters, and filler
        pluscomment = pluscomment.replace("█", "")
        pluscomment = pluscomment.replace("□", "")
        pluscomment = pluscomment.replace("※", "")
        pluscomment = pluscomment.replace("∴", "")
        pluscomment = pluscomment.replace("*", "")
        pluscomment = pluscomment.replace("+", "")
        pluscomment = pluscomment.replace("・", "")
        pluscomment = pluscomment.replace("°", "")
        pluscomment = pluscomment.replace("w", "")
        pluscomment = pluscomment.replace("null", "")
        #pluscomment = ((((pluscomment.replace("ーー","ー")).replace("ーー","ー")).replace("ーー","ー")).replace("ーー","ー")).replace("ーー","ー")
        pluscomment = pluscomment.replace("\n", "")
        pluscomment = pluscomment.replace("\t", "")
        pluscomment = pluscomment.replace(" ", "")
        pluscomment = pluscomment.replace(" ", "")
        pluscomment = re.sub(re.compile("[!-/:-@[-`{-~]"), '', pluscomment)
        # guard against drawn-out "shouting": keep a bigram's first character
        # only when the bigram differs from the previous one
        pluscommentlist = list(index.ngrams(index.pad(pluscomment.decode("utf-8"))))
        text = ''
        word1 = ''
        word2 = ''
        for word in pluscommentlist:
            if word == u"ーー":
                continue
            if word != word1:
                text += word[0]
                word1 = word
        if len(text) > 0:
            pluscomment = text  #[1::]
        # second pass: a bigram equal to the one two positions back marks a
        # two-character cycle, so start a new space-separated token there
        pluscommentlist = list(index.ngrams(index.pad(pluscomment)))
        text = ''
        word1 = ''
        word2 = ''
        for n in range(0, len(pluscommentlist)):
            word = pluscommentlist[n]
            if n >= 2:
                if word == pluscommentlist[n - 2]:
                    text += (" " + word[0])
                    continue
            text += word[0]
        if len(text) > 0:
            pluscomment = text
        # guard against repeated phrases: bigrams (over trigram-width padding)
        # that recur at distance 3, 4, or 5 also start a new token
        pluscommentlist = list(index.ngrams(index3.pad(pluscomment)))
        text = ''
        word1 = ''
        word2 = ''
        for n in range(0, len(pluscommentlist)):
            word = pluscommentlist[n]
            if n >= 3 and word == pluscommentlist[n - 3]:
                text += (" " + word[0])
                continue
            if n >= 4 and word == pluscommentlist[n - 4]:
                text += (" " + word[0])
                continue
            if n >= 5 and word == pluscommentlist[n - 5]:
                text += (" " + word[0])
                continue
            text += word[0]
        if len(text) > 0:
            pluscomment = text
        if pluscomment != '':
            # pad characters and token breaks become spaces before segmenting
            pluscomment = pluscomment.replace("$", " ")
            pluscomment = tagger.parse(pluscomment.encode("utf-8"))
            pluscomment = pluscomment.replace("\n", " ")
            fo.write(pluscomment)
    thread[ID][j]["comment"] = commenttext
    fo.write("\n")
    fo.close()
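# At this point each comment2_kai file holds, per video, one long line of
# MeCab -Owakati tokens: comments were NFKC-normalized, stripped of symbols,
# run through the bigram/trigram repeat filters above, then segmented with
# the ncnc.dic user dictionary.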
"""
files = os.listdir('../data/tcserv.nii.ac.jp/access/[email protected]/832c5b059b15f647/nicocomm/data/video')
for nfile in files[1:2]:
#print file
nfile = (ID + ".dat")
filepass = ('../data/tcserv.nii.ac.jp/access/[email protected]/832c5b059b15f647/nicocomm/data/video/' + str(nfile))
f = open(filepass)
lines2 = f.readlines() # 1行毎にファイル終端まで全て読む(改行文字も含まれる)
f.close()
Lines2 = {}
count = 0
for line in lines2:
try:
Lines2[count] = literal_eval(line)
print Lines2[count]["video_id"], Lines2[count]["title"].decode('unicode_escape')
thread[ID][(Lines2[count]["video_id"] + ".dat")]["title"] = Lines2[count]["title"].decode('unicode_escape')
count += 1
except:
print line
"""