# Gist by @ItoTomoki (ItoTomoki/acf2710474a3960cad01), created September 15, 2015 10:05.
#encoding:utf-8
from gensim.models import word2vec
import numpy as np
import json
import os
from ast import literal_eval
import re
import sys
import MeCab
from collections import defaultdict
from mpl_toolkits.mplot3d.axes3d import Axes3D
import sklearn.decomposition
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.feature_extraction.text import TfidfVectorizer
# Load the pre-trained word2vec model once at import time. ``model`` and
# ``modelnico`` are two names bound to the same loaded object.
model = modelnico = word2vec.Word2Vec.load("allcomment2.model")
def wordvec(word, model=modelnico):
    """Return the unit-length embedding of *word*.

    Args:
        word: Vocabulary key to look up in *model*.
        model: Word2vec model to query; defaults to the module-level
            ``modelnico``.

    Returns:
        ``model[word]`` divided by its Euclidean norm, or an all-zero vector
        of the same dimensionality when *word* is not in the vocabulary.
    """
    try:
        vec = model[word]
    except KeyError:
        # Out-of-vocabulary word: fall back to a zero vector. The dimension is
        # read from an arbitrary vocabulary entry; next(iter(...)) avoids
        # materialising the whole key list just to pick one key.
        return np.zeros(len(model[next(iter(model.vocab))]))
    return vec / np.linalg.norm(vec)
def morphological_analysis(text):
    """Count the selected word classes that MeCab finds in *text*.

    Args:
        text: Raw text handed to ``MeCab.Tagger.parseToNode``.

    Returns:
        A ``defaultdict(int)`` mapping each kept surface form to the number
        of times it occurred.
    """
    # Only nodes whose first feature field is one of these are counted:
    # noun, adjective, adjectival noun.
    wanted = ("名詞", "形容詞", "形容動詞")
    tagger = MeCab.Tagger('-u /usr/local/Cellar/mecab-ipadic/2.7.0-20070801/lib/mecab/dic/ipadic/ncnc.dic')
    counts = defaultdict(int)
    current = tagger.parseToNode(text)
    # parseToNode returns the head of a linked list; follow .next to the end.
    while current:
        if current.feature.split(",")[0] in wanted:
            counts[current.surface] += 1
        current = current.next
    return counts
def output(word2freq):
    """Print every word with its count, highest count first.

    Each line has the form ``"<count> <word>"``.

    Args:
        word2freq: Mapping of word -> occurrence count.
    """
    ranked = sorted(word2freq.items(), key=lambda item: item[1], reverse=True)
    for word, freq in ranked:
        # Single-argument print(...) emits the same text on Python 2 and 3,
        # unlike the Python-2-only print statement.
        print("%s %s" % (freq, word))
def makevec(word2freq):
    """Build a unit-length, frequency-weighted sum of word vectors.

    Only words whose count exceeds 5 contribute; each contributes its
    ``wordvec`` scaled by its count.

    Args:
        word2freq: Mapping of UTF-8 encoded word -> occurrence count.

    Returns:
        The normalised sum, or an all-zero vector when nothing contributed.
    """
    dim = len(model[model.vocab.keys()[0]])
    total = np.zeros(dim)
    ranked = sorted(word2freq.items(), key=lambda item: item[1], reverse=True)
    for word, freq in ranked:
        if int(freq) <= 5:
            continue
        total += freq * wordvec(word.decode("utf-8"))
    if not total.any():
        # Nothing was added (or everything cancelled out): avoid dividing by 0.
        return np.zeros(dim)
    return total / np.linalg.norm(total)
def createvector(video_id, ID="0000"):
    """Return the comment vector for one video.

    Reads ``comment<ID>/<video_id>.txt``, runs it through
    ``morphological_analysis`` and feeds the word counts to ``makevec``.

    Args:
        video_id: Video identifier; used as the comment file's base name.
        ID: Suffix of the ``comment<ID>`` directory that holds the file.

    Returns:
        The vector produced by ``makevec``; an all-zero vector for ``"sm9"``.

    Raises:
        IOError: if the comment file cannot be opened or read.
    """
    if video_id == "sm9":
        # NOTE(review): "sm9" is special-cased to a zero vector without
        # touching its comment file; the reason is not visible in this file.
        return np.zeros(len(model[model.vocab.keys()[0]]))
    filename = "comment" + ID + "/" + str(video_id) + ".txt"
    # ``with`` closes the file even if read() raises.
    with open(filename) as f:
        data = f.read()
    return makevec(morphological_analysis(data))
# Directory that holds the per-ID video metadata files (0000.dat, 0001.dat, ...).
VIDEO_DATA_DIR = '../tcserv.nii.ac.jp/access/[email protected]/832c5b059b15f647/nicocomm/data/video'

vectorinfo = {}  # ID -> {video_id: comment vector}; filled in further down
files = os.listdir(VIDEO_DATA_DIR)
textinfo = {}    # ID -> {video_id: decoded title}
thread = {}      # ID -> {"<video_id>.dat": full metadata record}
for ID in ["0000", "0001", "0002", "0003"]:
    filepass = VIDEO_DATA_DIR + '/' + ID + ".dat"
    # Read every line (newline characters included); ``with`` closes the file
    # even if reading fails. The former extra f.read() after readlines()
    # always returned an empty string and has been dropped.
    with open(filepass) as f:
        lines2 = f.readlines()
    textinfo[ID] = {}
    thread[ID] = {}
    for line in lines2:
        if not line.strip():
            # A blank line can never be parsed as a record; skip it.
            continue
        try:
            record = literal_eval(line)
        except (ValueError, SyntaxError):
            # A bare ``null`` is not a Python literal; quote it and retry.
            record = literal_eval(line.replace('null', '"null"'))
        thread[ID][record["video_id"] + ".dat"] = record
        textinfo[ID][record["video_id"]] = record["title"].decode('unicode_escape')
def makewordlist(ID, video_id):
    """Tokenise one video's comment file into counts and a word string.

    Reads ``comment2_<ID>/<video_id>.txt`` and keeps the MeCab nodes whose
    first feature field is noun, adjective or adjectival noun.

    Args:
        ID: Suffix of the ``comment2_<ID>`` directory that holds the file.
        video_id: Video identifier; used as the comment file's base name.

    Returns:
        A tuple ``(word2freq, wordlist)``: a ``defaultdict(int)`` of surface
        form -> count, and the kept surface forms in order of appearance
        joined by single spaces.

    Raises:
        IOError: if the comment file cannot be opened or read.
    """
    filename = "comment2_" + ID + "/" + str(video_id) + ".txt"
    # ``with`` closes the file even if read() raises.
    with open(filename) as f:
        text = f.read()
    words = []
    word2freq = defaultdict(int)
    mecab = MeCab.Tagger('-u /usr/local/Cellar/mecab-ipadic/2.7.0-20070801/lib/mecab/dic/ipadic/ncnc.dic')
    node = mecab.parseToNode(text)
    while node:
        if node.feature.split(",")[0] in ("名詞", "形容詞", "形容動詞"):
            words.append(node.surface)
            word2freq[node.surface] += 1
        node = node.next
    # One join instead of repeated string +=; the result equals the old
    # "word + space, then strip the last character" construction.
    return word2freq, " ".join(words)
word2freqlist = {}  # ID -> {video_id: word -> count}
wordlist = {}       # ID -> {video_id: space-separated kept words}
for ID in ["0000", "0001", "0002", "0003"]:
    vectorinfo[ID] = {}
    word2freqlist[ID] = {}
    wordlist[ID] = {}
    for j in textinfo[ID]:
        # Best effort per video: a failure (e.g. a missing comment file) is
        # reported by printing "ID video_id" and processing continues.
        # ``except Exception`` still lets Ctrl-C / SystemExit through, which
        # the former bare ``except:`` did not.
        try:
            vectorinfo[ID][j] = createvector(video_id=j, ID=ID)
        except Exception:
            vectorinfo[ID][j] = np.zeros(len(model[model.vocab.keys()[0]]))
            print("%s %s" % (ID, j))
        try:
            word2freqlist[ID][j], wordlist[ID][j] = makewordlist(ID, j)
        except Exception:
            # No entry is stored for this video in either dictionary.
            print("%s %s" % (ID, j))
tfidfTextList = {}  # video_id -> kept words, each followed by one space
voc = model.vocab.keys()
for ID in ["0000", "0001", "0002", "0003"]:
    for n in wordlist[ID]:
        kept = []
        for w in wordlist[ID][n].split(' '):
            try:
                # The lookup is only a membership probe: words the model
                # does not know are reported and left out.
                model[w.decode("utf-8")]
            except (KeyError, UnicodeError):
                print("%s %s" % (n, w))
            else:
                kept.append(w)
        # Every kept word keeps its trailing space (the last one too), exactly
        # as before, so the text later handed to the vectorizer is unchanged.
        tfidfTextList[n] = "".join(word + " " for word in kept)
def tokenize(text):
    """Split *text* on single space characters.

    Consecutive, leading or trailing spaces produce empty-string tokens.

    Args:
        text: Space-separated words.

    Returns:
        The list of pieces between the spaces.
    """
    return text.split(" ")
# Fix the document order first so that row i of ``tfs`` belongs to
# ``idlist[i]``; the documents are then fed to the vectorizer in that order.
idlist = tfidfTextList.keys()
tfidf = TfidfVectorizer(tokenizer=tokenize)
tfs = tfidf.fit_transform([tfidfTextList[doc_id] for doc_id in idlist])
feature_names = tfidf.get_feature_names()
tfsdic = {}
n = 0
def maketfidfvec(number):
    """Return the TF-IDF-weighted, unit-length comment vector of one video.

    Every word of the video that occurs more than once contributes
    ``count * wordvec(word) * tfidf_weight(word)``.

    Args:
        number: Row index into ``tfs``, i.e. position of the video in
            ``idlist``.

    Returns:
        The normalised weighted sum, or the all-zero vector when nothing
        contributed.

    Raises:
        KeyError: if the video has no entry in ``word2freqlist`` under any ID.
    """
    # Densify only the requested row; converting the whole sparse matrix on
    # every call costs memory and time proportional to the entire corpus.
    weights = dict(zip(feature_names, tfs[number].toarray()[0]))
    videoid = idlist[number]
    # Find which ID the video was stored under.
    for ID in ["0000", "0001", "0002", "0003"]:
        try:
            freqs = word2freqlist[ID][videoid]
        except KeyError:
            continue
        break
    else:
        raise KeyError("no word counts recorded for video %r" % (videoid,))
    v = np.zeros(len(model[model.vocab.keys()[0]]))
    ranked = sorted(freqs.items(), key=lambda item: item[1], reverse=True)
    for word, freq in ranked:
        if int(freq) > 1:
            try:
                key = word.decode("utf-8")
                v += freq * wordvec(key) * weights[key]
            except (KeyError, UnicodeError):
                # The word is not a TF-IDF feature (or is not valid UTF-8):
                # report it and leave it out of the sum.
                print("%s %s %s" % (word, ID, videoid))
    norm = np.linalg.norm(v)
    if norm > 0:
        return v / norm
    return v
# The unfinished ``for k in`` statement that stood here was a syntax error
# that kept the whole file from parsing; it has been removed.
tfidfvectorinfo = {}  # ID -> {video_id: TF-IDF-weighted comment vector}
for ID in ["0000", "0001", "0002", "0003"]:
    tfidfvectorinfo[ID] = {}
    # NOTE(review): every ID receives the vectors of ALL videos, not only the
    # ones listed under that ID - confirm this is intended.
    # ``tfs.shape[0]`` gives the row count without densifying the matrix.
    for n in range(tfs.shape[0]):
        tfidfvectorinfo[ID][idlist[n]] = maketfidfvec(n)
# (GitHub page footer: "Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment")