@ItoTomoki
Last active September 18, 2015 01:47
#encoding:utf8
import os
from ast import literal_eval
import re
import MeCab
import unicodedata
import sys
import ngram

argvs = sys.argv  # list holding the command-line arguments
argc = len(argvs)  # number of arguments
# debug print
print argvs[1]
#print argc
#ID = '0002'
ID = str(argvs[1])
files = os.listdir('../tcserv.nii.ac.jp/access/[email protected]/832c5b059b15f647/nicocomm/data/thread/' + ID)
thread = {}
thread[ID] = {}
kigou = "!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_abcdefghijklmnopqrstuvwxyz{|}~"
index = ngram.NGram(N=2)
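# A minimal sketch of what the bigram index yields (assuming the ngram
# package's default "$" pad character): index.pad(u"まじ") gives u"$まじ$",
# and index.ngrams then yields the character bigrams
# [u"$ま", u"まじ", u"じ$"].
#print list(index.ngrams(index.pad(u"まじ")))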
for nfile in files:
    filepass = ('../tcserv.nii.ac.jp/access/[email protected]/832c5b059b15f647/nicocomm/data/thread/' + ID + '/' + str(nfile))
    f = open(filepass)
    lines2 = f.readlines()  # read every line up to EOF (newlines included)
    #data1 = f.read()  # returns the whole file read to EOF
    f.close()
    Lines2 = {}
    count = 0
    # each line is a Python-literal dict; bare null tokens are quoted so
    # literal_eval can parse the line on a second attempt
    for line in lines2:
        try:
            Lines2[count] = literal_eval(line)
        except:
            print line
            print count
            print nfile
            line = line.replace('null', '"null"')
            print line
            try:
                Lines2[count] = literal_eval(line)
            except:
                continue
        try:
            Lines2[count]['comment'] = Lines2[count]['comment'].decode('unicode_escape')
        except:
            try:
                #print ("Error1" + Lines2[count]['comment'])
                Lines2[count]['comment'] = Lines2[count]['comment'][0:-1]
            except:
                print ("Error2" + line)
        #print Lines2[count]['comment']
        count += 1
    thread[ID][nfile] = Lines2
#tagger = MeCab.Tagger('-Owakati -u /usr/local/Cellar/mecab-ipadic/2.7.0-20070801/lib/mecab/dic/ipadic/ncnc.dic')
#commentfiles = os.listdir('comment')
for j in thread[ID].keys():
    filename = ("comment2_bigram" + ID + "/" + j[0:-3] + "txt")
    fo = file(filename, 'w')
    print filename
    commenttext = ''
    for i in range(0, len(thread[ID][j])):
        if i > 20000:
            print i, j
            break
        commenttext += thread[ID][j][i]["comment"]
        try:
            thread[ID][j][i]["comment"] = unicodedata.normalize('NFKC', thread[ID][j][i]["comment"])
        except:
            print "normalize Error"
        pluscomment = str(thread[ID][j][i]["comment"].encode('utf-8'))
        # strip decorative symbols, ASCII-art characters, and filler
        pluscomment = pluscomment.replace("█", "")
        pluscomment = pluscomment.replace("□", "")
        pluscomment = pluscomment.replace("※", "")
        pluscomment = pluscomment.replace("∴", "")
        pluscomment = pluscomment.replace("*", "")
        pluscomment = pluscomment.replace("+", "")
        pluscomment = pluscomment.replace("・", "")
        pluscomment = pluscomment.replace("°", "")
        pluscomment = pluscomment.replace("w", "")
        pluscomment = pluscomment.replace("null", "")
        # collapse runs of long-vowel marks ("ーー...") down to a single "ー"
        for _ in range(5):
            pluscomment = pluscomment.replace("ーー", "ー")
        pluscomment = pluscomment.replace("\n", "")
        pluscomment = pluscomment.replace("\t", "")
        pluscomment = pluscomment.replace(" ", "")
        pluscomment = pluscomment.replace(" ", "")
        pluscomment = re.sub(re.compile("[!-/:-@[-`{-~]"), '', pluscomment)
        if pluscomment != '':
            #pluscomment = tagger.parse(pluscomment)
            for text in list(index.ngrams(index.pad(pluscomment.decode("utf-8")))):
                fo.write(text.encode("utf-8") + " ")
    #thread[ID][j]["comment"] =
    fo.write("\n")
    fo.close()
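# Each comment2_bigram file now holds one long space-separated stream of
# character bigrams for a video (a sketch, assuming the layout written
# above): e.g. the comment u"まじ" contributes "$ま まじ じ$ ".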
files = os.listdir('../tcserv.nii.ac.jp/access/[email protected]/832c5b059b15f647/nicocomm/data/video')
for nfile in files[1:2]:
    #print file
    nfile = (ID + ".dat")  # only this ID's metadata file is actually read
    filepass = ('../tcserv.nii.ac.jp/access/[email protected]/832c5b059b15f647/nicocomm/data/video/' + str(nfile))
    f = open(filepass)
    lines2 = f.readlines()  # read every line up to EOF (newlines included)
    f.close()
    Lines2 = {}
    count = 0
    for line in lines2:
        try:
            Lines2[count] = literal_eval(line)
            print Lines2[count]["video_id"], Lines2[count]["title"].decode('unicode_escape')
            thread[ID][(Lines2[count]["video_id"] + ".dat")]["title"] = Lines2[count]["title"].decode('unicode_escape')
            count += 1
        except:
            print line
#encoding:utf-8
from gensim.models import word2vec
import numpy as np
import json
import os
from ast import literal_eval
import re
import sys
import MeCab
from collections import defaultdict
from mpl_toolkits.mplot3d.axes3d import Axes3D
import sklearn.decomposition
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.feature_extraction.text import TfidfVectorizer
modelnico = word2vec.Word2Vec.load("allcomment2.model")
model = modelnico
def wordvec(word, model=modelnico):
    # return the unit-normalized word2vec vector, or a zero vector for OOV words
    try:
        v = model[word] / np.linalg.norm(model[word])
        return v
    except:
        return np.zeros(len(model[model.vocab.keys()[0]]))
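# Usage sketch: for an in-vocabulary token, wordvec returns a unit vector
# (np.linalg.norm(v) == 1.0); unknown tokens fall through to the zero
# vector, so callers can sum results without special-casing OOV words.
#v = wordvec(u"猫")  # hypothetical example token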
def morphological_analysis(text):
    # count nouns, adjectives, and adjectival verbs in the text with MeCab
    word2freq = defaultdict(int)
    mecab = MeCab.Tagger('-u /usr/local/Cellar/mecab-ipadic/2.7.0-20070801/lib/mecab/dic/ipadic/ncnc.dic')
    node = mecab.parseToNode(text)
    while node:
        if (node.feature.split(",")[0] == "名詞") | (node.feature.split(",")[0] == "形容詞") | (node.feature.split(",")[0] == "形容動詞"):
            word2freq[node.surface] += 1
        node = node.next
    return word2freq
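# node.feature is a comma-separated string whose first field is the part
# of speech, e.g. "名詞,一般,*,*,*,*,猫,ネコ,ネコ" for a common noun, which
# is why split(",")[0] is compared against 名詞/形容詞/形容動詞 above.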
def output(word2freq):
    # print word frequencies in descending order
    for word, freq in sorted(word2freq.items(), key=lambda x: x[1], reverse=True):
        print str(freq), word
def makevec(word2freq):
    # frequency-weighted sum of word vectors, unit-normalized;
    # only words occurring more than 5 times contribute
    freqcount = 0
    v = np.zeros(len(model[model.vocab.keys()[0]]))
    for word, freq in sorted(word2freq.items(), key=lambda x: x[1], reverse=True):
        if int(freq) > 5:
            v += freq * wordvec(word.decode("utf-8"))
            freqcount += freq
    if (v == np.zeros(len(model[model.vocab.keys()[0]]))).all():
        return np.zeros(len(model[model.vocab.keys()[0]]))
    else:
        return (v / np.linalg.norm(v))
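# In effect: v = sum over words with freq > 5 of freq(w) * u_w, where u_w
# is the unit word2vec vector from wordvec, returned as v / ||v|| (or the
# zero vector when no word qualifies).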
def createvector(video_id, ID="0000"):
    # build a comment vector for one video; "sm9" is special-cased to zero
    if video_id == "sm9":
        return np.zeros(len(model[model.vocab.keys()[0]]))
    else:
        filename = ("comment" + ID + "/" + str(video_id) + ".txt")
        f = open(filename)
        data = f.read()
        f.close()
        v = makevec(morphological_analysis(data))
        return v
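# Usage sketch (hypothetical video id): createvector("sm12345", ID="0000")
# reads comment0000/sm12345.txt and returns a unit (or zero) vector.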
vectorinfo = {}
files = os.listdir('../tcserv.nii.ac.jp/access/[email protected]/832c5b059b15f647/nicocomm/data/video')
textinfo = {}
thread = {}
count = 0
#for file in files[1:2]:
for ID in ["0000","0001","0002","0003"]:
    #print file
    filename = ID + ".dat"
    filepass = '../tcserv.nii.ac.jp/access/[email protected]/832c5b059b15f647/nicocomm/data/video/' + str(filename)
    f = open(filepass)
    lines2 = f.readlines()  # read every line up to EOF (newlines included)
    data1 = f.read()  # empty here: readlines() above already consumed the file
    f.close()
    Lines2 = {}
    count = 0
    textinfo[ID] = {}
    thread[ID] = {}
    for line in lines2:
        try:
            Lines2[count] = literal_eval(line)
        except:
            line = line.replace('null', '"null"')
            Lines2[count] = literal_eval(line)
        thread[ID][(Lines2[count]["video_id"] + ".dat")] = Lines2[count]
        #thread["0000"][(Lines2[count]["video_id"] + ".dat")]["title"] = Lines2[count]["title"].decode('unicode_escape')
        textinfo[ID][Lines2[count]["video_id"]] = Lines2[count]["title"].decode('unicode_escape')
        count += 1
def makewordlist(ID, video_id):
    # return (word -> frequency dict, space-joined surface forms) for the
    # nouns/adjectives/adjectival verbs in one video's comment file
    filename = ("comment2_" + ID + "/" + str(video_id) + ".txt")
    f = open(filename)
    text = f.read()
    f.close()
    wordlist = ""
    word2freq = defaultdict(int)
    mecab = MeCab.Tagger('-u /usr/local/Cellar/mecab-ipadic/2.7.0-20070801/lib/mecab/dic/ipadic/ncnc.dic')
    node = mecab.parseToNode(text)
    while node:
        if (node.feature.split(",")[0] == "名詞") | (node.feature.split(",")[0] == "形容詞") | (node.feature.split(",")[0] == "形容動詞"):
            wordlist += node.surface
            wordlist += " "
            word2freq[node.surface] += 1
        node = node.next
    return word2freq, wordlist[0:-1]
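# wordlist[0:-1] drops the trailing space, so the second return value is a
# plain space-joined token string ready for the whitespace tokenizer that
# TfidfVectorizer uses below.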
word2freqlist = {}
wordlist = {}
for ID in ["0000","0001","0002","0003"]:
    vectorinfo[ID] = {}
    word2freqlist[ID] = {}
    wordlist[ID] = {}
    for j in textinfo[ID].keys():
        #print j
        try:
            vectorinfo[ID][j] = createvector(video_id=j, ID=ID)
        except:
            vectorinfo[ID][j] = np.zeros(len(model[model.vocab.keys()[0]]))
            print ID, j
        try:
            word2freqlist[ID][j], wordlist[ID][j] = makewordlist(ID, j)
        except:
            print ID, j
tfidfTextList = {}
voc = model.vocab.keys()
for ID in ["0000","0001","0002","0003"]:
    for n in wordlist[ID].keys():
        tfidfTextList[n] = ""
        # keep only words that are in the word2vec vocabulary
        for w in wordlist[ID][n].split(' '):
            try:
                k = model[w.decode("utf-8")]
                tfidfTextList[n] += w
                tfidfTextList[n] += " "
            except:
                print n, w
def tokenize(text):
    # the comment texts are already space-separated, so tokenizing is a split
    wakatilist = text.split(" ")
    return wakatilist

tfidf = TfidfVectorizer(tokenizer=tokenize)
tfs = tfidf.fit_transform(tfidfTextList.values())
feature_names = tfidf.get_feature_names()
tfsdic = {}
n = 0
idlist = tfidfTextList.keys()
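# Row n of tfs lines up with idlist[n] (an unmodified dict's .values() and
# .keys() share one ordering in Python 2), so a word -> tf-idf map for one
# video is, as a sketch:
#weights = dict(zip(feature_names, tfs.toarray()[0]))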
def maketfidfvec(number):
    # tf-idf-weighted version of makevec for the video at row `number`
    d = dict(zip(feature_names, tfs.toarray()[number]))
    videoid = idlist[number]
    # find which ID bucket holds this video's frequency dict
    for ID in ["0000","0001","0002","0003"]:
        try:
            k = word2freqlist[ID][videoid]
            break
        except:
            continue
    v = np.zeros(len(model[model.vocab.keys()[0]]))
    for word, freq in sorted(k.items(), key=lambda x: x[1], reverse=True):
        if int(freq) > 1:
            try:
                v += freq * wordvec(word.decode("utf-8")) * d[word.decode("utf-8")]
            except:
                print word, ID, videoid
    if np.linalg.norm(v) > 0:
        return v / np.linalg.norm(v)
    else:
        return v
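# Each summand above is freq(w) * tfidf(w) * u_w, i.e. makevec's frequency
# weighting rescaled by the word's tf-idf score for this video, with a
# final unit normalization when the sum is nonzero.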
tfidfvectorinfo = {}
for ID in ["0000","0001","0002","0003"]:
    tfidfvectorinfo[ID] = {}
    for n in range(0, tfs.toarray().shape[0]):
        tfidfvectorinfo[ID][idlist[n]] = maketfidfvec(n)
#encoding:utf8
#commenttext = uni_text.decode('unicode_escape')
import json
import os
from ast import literal_eval
import re
import MeCab
import unicodedata
import sys
import ngram
import jcconv

argvs = sys.argv  # list holding the command-line arguments
argc = len(argvs)  # number of arguments
# debug print
print argvs[1]
#print argc
#ID = '0002'
ID = str(argvs[1])
files = os.listdir('../data/tcserv.nii.ac.jp/access/[email protected]/832c5b059b15f647/nicocomm/data/thread/' + ID)
thread = {}
thread[ID] = {}
index = ngram.NGram(N=2)   # character bigrams
index3 = ngram.NGram(N=3)  # trigram padding, used by the repeat filter below
kigou = "!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_abcdefghijklmnopqrstuvwxyz{|}~"
for nfile in files:
    filepass = ('../data/tcserv.nii.ac.jp/access/[email protected]/832c5b059b15f647/nicocomm/data/thread/' + ID + '/' + str(nfile))
    f = open(filepass)
    lines2 = f.readlines()  # read every line up to EOF (newlines included)
    #data1 = f.read()  # returns the whole file read to EOF
    f.close()
    Lines2 = {}
    count = 0
    for line in lines2:
        try:
            Lines2[count] = literal_eval(line)
        except:
            print line
            print count
            print nfile
            line = line.replace('null', '"null"')
            print line
            try:
                Lines2[count] = literal_eval(line)
            except:
                continue
        try:
            Lines2[count]['comment'] = Lines2[count]['comment'].decode('unicode_escape')
        except:
            try:
                #print ("Error1" + Lines2[count]['comment'])
                Lines2[count]['comment'] = Lines2[count]['comment'][0:-1]
            except:
                print ("Error2" + line)
        #print Lines2[count]['comment']
        count += 1
    thread[ID][nfile] = Lines2
tagger = MeCab.Tagger('-Owakati -u /usr/local/Cellar/mecab/0.996/lib/mecab/dic/ipadic/ncnc.dic')
#commentfiles = os.listdir('comment')
for j in thread[ID].keys():
    filename = ("comment2_kai" + ID + "/" + j[0:-3] + "txt")
    fo = file(filename, 'w')
    print filename
    commenttext = ''
    for i in range(0, len(thread[ID][j])):
        if i > 20000:
            print i, j
            break
        commenttext += thread[ID][j][i]["comment"]
        try:
            thread[ID][j][i]["comment"] = unicodedata.normalize('NFKC', thread[ID][j][i]["comment"])
        except:
            print "normalize Error"
        pluscomment = str(thread[ID][j][i]["comment"].encode('utf-8'))
        #pluscomment = jcconv.hira2kata(pluscomment)  # to be added later
        # strip decorative symbols, ASCII-art characters, and filler
        pluscomment = pluscomment.replace("█", "")
        pluscomment = pluscomment.replace("□", "")
        pluscomment = pluscomment.replace("※", "")
        pluscomment = pluscomment.replace("∴", "")
        pluscomment = pluscomment.replace("*", "")
        pluscomment = pluscomment.replace("+", "")
        pluscomment = pluscomment.replace("・", "")
        pluscomment = pluscomment.replace("°", "")
        pluscomment = pluscomment.replace("w", "")
        pluscomment = pluscomment.replace("null", "")
        #pluscomment = ((((pluscomment.replace("ーー","ー")).replace("ーー","ー")).replace("ーー","ー")).replace("ーー","ー")).replace("ーー","ー")
        pluscomment = pluscomment.replace("\n", "")
        pluscomment = pluscomment.replace("\t", "")
        pluscomment = pluscomment.replace(" ", "")
        pluscomment = pluscomment.replace(" ", "")
        pluscomment = re.sub(re.compile("[!-/:-@[-`{-~]"), '', pluscomment)
        # guard against drawn-out "shouting": keep a bigram's first character
        # only when the bigram differs from the previous one
        pluscommentlist = list(index.ngrams(index.pad(pluscomment.decode("utf-8"))))
        text = ''
        word1 = ''
        word2 = ''
        for word in pluscommentlist:
            if word == u"ーー":
                continue
            if word != word1:
                text += word[0]
                word1 = word
        if len(text) > 0:
            pluscomment = text  #[1::]
        # second pass: a bigram equal to the one two positions back marks a
        # two-character cycle, so start a new space-separated token there
        pluscommentlist = list(index.ngrams(index.pad(pluscomment)))
        text = ''
        word1 = ''
        word2 = ''
        for n in range(0, len(pluscommentlist)):
            word = pluscommentlist[n]
            if n >= 2:
                if word == pluscommentlist[n - 2]:
                    text += (" " + word[0])
                    continue
            text += word[0]
        if len(text) > 0:
            pluscomment = text
        # guard against repeated phrases: bigrams (over trigram-width padding)
        # that recur at distance 3, 4, or 5 also start a new token
        pluscommentlist = list(index.ngrams(index3.pad(pluscomment)))
        text = ''
        word1 = ''
        word2 = ''
        for n in range(0, len(pluscommentlist)):
            word = pluscommentlist[n]
            if n >= 3 and word == pluscommentlist[n - 3]:
                text += (" " + word[0])
                continue
            if n >= 4 and word == pluscommentlist[n - 4]:
                text += (" " + word[0])
                continue
            if n >= 5 and word == pluscommentlist[n - 5]:
                text += (" " + word[0])
                continue
            text += word[0]
        if len(text) > 0:
            pluscomment = text
        if pluscomment != '':
            # pad characters and token breaks become spaces before segmenting
            pluscomment = pluscomment.replace("$", " ")
            pluscomment = tagger.parse(pluscomment.encode("utf-8"))
            pluscomment = pluscomment.replace("\n", " ")
            fo.write(pluscomment)
    thread[ID][j]["comment"] = commenttext
    fo.write("\n")
    fo.close()
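# At this point each comment2_kai file holds, per video, one long line of
# MeCab -Owakati tokens: comments were NFKC-normalized, stripped of symbols,
# run through the bigram/trigram repeat filters above, then segmented with
# the ncnc.dic user dictionary.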
"""
files = os.listdir('../data/tcserv.nii.ac.jp/access/[email protected]/832c5b059b15f647/nicocomm/data/video')
for nfile in files[1:2]:
#print file
nfile = (ID + ".dat")
filepass = ('../data/tcserv.nii.ac.jp/access/[email protected]/832c5b059b15f647/nicocomm/data/video/' + str(nfile))
f = open(filepass)
lines2 = f.readlines() # 1行毎にファイル終端まで全て読む(改行文字も含まれる)
f.close()
Lines2 = {}
count = 0
for line in lines2:
try:
Lines2[count] = literal_eval(line)
print Lines2[count]["video_id"], Lines2[count]["title"].decode('unicode_escape')
thread[ID][(Lines2[count]["video_id"] + ".dat")]["title"] = Lines2[count]["title"].decode('unicode_escape')
count += 1
except:
print line
"""