Last active
August 29, 2015 14:07
-
-
Save yoshi0309/1dbcd8ddea554f9b35af to your computer and use it in GitHub Desktop.
Connpass Event Classifier using Pocket - Jubatus Hackathon - Team Paper.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# coding: utf-8 | |
# jubatus server info.
# Address/port of the running jubatus classifier server; each user gets
# a separate model name on this one server (see __main__ below).
host = 'localhost'
port = 9199
import sys | |
import json | |
import random | |
import jubatus | |
from jubatus.common import Datum | |
# for connpass json data | |
# for connpass json data
def getTitleList(filepath):
    """Return the event titles from a connpass JSON export.

    filepath -- path to a JSON file shaped like {"events": [{"title": ...}, ...]}.
    Returns a list of title strings, in file order.
    """
    # `with` guarantees the file handle is released; the original wrote
    # `f.close` (no parentheses), which never actually closed the file.
    with open(filepath) as f:
        data = json.load(f)
    # NOTE: the original bound the events list to `list`, shadowing the builtin.
    return [event["title"] for event in data["events"]]
# for hatebu feed data | |
# for hatebu feed data
def getTitleFromFeed(path):
    """Parse an RSS/Atom feed at *path* and return its entry titles."""
    import feedparser
    parsed = feedparser.parse(path)
    return [entry['title'] for entry in parsed['entries']]
def getTitleFromTxt(path):
    """Return one title per line of a plain-text file.

    NOTE: lines are returned verbatim — including the trailing newline —
    to stay byte-compatible with the original behavior callers rely on.
    """
    # `with` fixes the original's `f.close` (missing parentheses), which
    # never closed the handle.
    with open(path) as f:
        return list(f)
def buildTrainData(titleList, classLabel):
    """Pair each title with *classLabel* as (label, Datum) training tuples."""
    return [(classLabel, Datum({'title': title})) for title in titleList]
def train(client,traindata):
    # Shuffle in place so the classifier does not see all examples of one
    # label in a row (the builders above append positives then negatives).
    # NOTE(review): this mutates the caller's list as a side effect.
    random.shuffle(traindata)
    client.train(traindata)
def traindata_ty():
    """Build labelled training data for user "ty".

    Positives ('興味あり') come from six hatebu feed dumps; negatives
    ('興味なし') from the generic "hot entry" feeds.
    """
    positives = []
    # the original repeated six copy-pasted calls; loop over the indices
    for i in range(1, 7):
        positives.extend(getTitleFromFeed('data/hatebu-%d.xml' % i))
    traindata = buildTrainData(positives, '興味あり')
    negatives = []
    # data/hot-economics.xml was deliberately left out in the original
    for topic in ('entertainment', 'general', 'lif', 'social'):
        negatives.extend(getTitleFromFeed('data/hot-%s.xml' % topic))
    traindata.extend(buildTrainData(negatives, '興味なし'))
    return traindata
def traindata_mo():
    """Build labelled training data for user "mo".

    Positives ('興味あり') are titles from a Pocket export; negatives
    ('興味なし') come from the generic "hot entry" feeds.
    """
    traindata = buildTrainData(getTitleFromTxt('data/pocket.txt'), '興味あり')
    negatives = []
    # data/hot-economics.xml was deliberately left out in the original
    for topic in ('entertainment', 'general', 'lif', 'social'):
        negatives.extend(getTitleFromFeed('data/hot-%s.xml' % topic))
    traindata.extend(buildTrainData(negatives, '興味なし'))
    return traindata
def traindata(userid):
    """Build labelled training data for a registered user.

    Positive titles (label '1' = 興味あり) are the user's crawled Pocket
    titles read from Postgres; negatives (label '0' = 興味なし) come from
    the generic "hot entry" feeds.
    """
    import psycopg2
    conn = psycopg2.connect("dbname=paper_development host=localhost user=postgres password=password")
    try:
        cur = conn.cursor()
        # Parameterized query: the original concatenated userid into the
        # SQL text, which is a SQL-injection risk.
        cur.execute("select given_title from Crawled_Pockets where user_id=%s", (userid,))
        titleList = [record[0] for record in cur.fetchall()]
        cur.close()
    finally:
        # the original never closed the cursor or connection
        conn.close()
    traindata = buildTrainData(titleList, '1')  # 興味あり
    negatives = []
    # data/hot-economics.xml was deliberately left out in the original
    for topic in ('entertainment', 'general', 'lif', 'social'):
        negatives.extend(getTitleFromFeed('data/hot-%s.xml' % topic))
    traindata.extend(buildTrainData(negatives, '0'))  # 興味なし
    return traindata
def predict(client): | |
#data = [ | |
# Datum({'title': u'ダイエー'}), | |
# Datum({'title': u'機械学習'}), | |
# Datum({'title': u'オムニ・チャネル'}), | |
# Datum({'title': u'AKB48'}), #OMG! | |
# Datum({'title': u'消費税増税'}), | |
# Datum({'title': u'Apache'}), | |
# Datum({'title': u'プロジェクト管理'}), | |
# Datum({'title': u'Webデザイン'}), | |
# Datum({'title': u'Ocaml'}), | |
# Datum({'title': u'Java'}), | |
# Datum({'title': u'Solr'}), | |
# Datum({'title': u'Elasticsearch'}) | |
# ] | |
eventIdList = [] | |
import psycopg2 | |
conn = psycopg2.connect("dbname=paper_development host=localhost user=postgres password=password") | |
cur = conn.cursor() | |
cur.execute("select * from connpasses") | |
records = cur.fetchall() | |
for record in records: | |
# print record[1] | |
# print record[2] | |
d = Datum({'title': record[2]}) | |
# print record[2] | |
# print d | |
res = client.classify([d]) | |
# sys.stdout.write(max(res[0], key=lambda x: x.score).label) | |
# sys.stdout.write(' ') | |
# sys.stdout.write(d.string_values[0][1].encode('utf-8')) | |
# sys.stdout.write('\n') | |
result = max(res[0], key=lambda x: x.score) | |
# print res[0] | |
# print d.string_values[0][1] | |
# max かつ score が 0.3 よりも大きい物 | |
if result.label == '1' and result.score >= 0.3: | |
eventIdList.append(record[1]) | |
print record[1], record[2], result.score # for debug | |
conn.close() | |
cur.close() | |
return eventIdList | |
def getUserIds():
    """Return (id, name) tuples for every row in the Users table."""
    import psycopg2
    connection = psycopg2.connect("dbname=paper_development host=localhost user=postgres password=password")
    cursor = connection.cursor()
    cursor.execute("select * from Users")
    pairs = [(row[0], row[1]) for row in cursor.fetchall()]
    cursor.close()
    connection.close()
    return pairs
def saveDataToTable(userid, eventIdList):
    """Replace the stored recommendations for *userid* with *eventIdList*.

    Deletes the user's old Recommends rows, then inserts one row per
    event id, all in a single transaction.
    """
    import psycopg2
    conn = psycopg2.connect("dbname=paper_development host=localhost user=postgres password=password")
    try:
        cur = conn.cursor()
        # Parameterized delete: the original concatenated userid into the
        # SQL text, which is a SQL-injection risk.
        cur.execute("delete from recommends where user_id=%s", (userid,))
        cur.executemany(
            "INSERT INTO Recommends(user_id,event_id) VALUES (%s,%s)",
            [(userid, eventId) for eventId in eventIdList])
        # one commit for delete + inserts: the table is never left empty
        # if an insert fails midway (the original committed per row)
        conn.commit()
        cur.close()
    finally:
        # the original leaked the connection on any error
        conn.close()
if __name__ == '__main__': | |
# connect to the jubatus | |
# client_ty = jubatus.Classifier(host, port, 'yoshida') | |
# client_ty.clear() | |
# train(client_ty, traindata_ty()) | |
# print 'predict for ty .............................' | |
# predict(client_ty) | |
# client_mo = jubatus.Classifier(host, port, 'morimoto') | |
# client_mo.clear() | |
# train(client_mo,traindata_mo()) | |
# print 'predict for mo .............................' | |
# predict(client_mo) | |
for user in getUserIds(): | |
client = jubatus.Classifier(host, port, str(user[0])) | |
client.clear() | |
train(client, traindata(user[0])) | |
print 'predict for ' + str(user[0]) + ' ' + str(user[1]) + ' .............................' | |
eventIdList = predict(client) | |
saveDataToTable(user[0],eventIdList) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"method": "AROW", | |
"converter": { | |
"num_filter_types": {}, | |
"num_filter_rules": [], | |
"string_filter_types": {}, | |
"string_filter_rules": [], | |
"num_types": {}, | |
"num_rules": [], | |
"string_types": { | |
"bigram": { "method": "ngram", "char_num": "3" }, | |
"mecab": { | |
"method": "dynamic", | |
"path": "libmecab_splitter.so", | |
"function": "create", | |
"arg": "-d /var/lib/mecab/dic/ipadic" | |
} | |
}, | |
"string_rules": [ | |
{ "key": "*", "type": "bigram", "sample_weight": "bin", "global_weight": "bin" } | |
] | |
}, | |
"parameter": { | |
"regularization_weight" : 1.0 | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment