Last active
December 21, 2015 13:09
-
-
Save mia-0032/6310570 to your computer and use it in GitHub Desktop.
wコメントとGJコメントの相関ってあるのか調べるためのスクリプト
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # -*- coding: utf-8 -*- | |
| import codecs | |
| import MySQLdb | |
| import sys | |
| import config # my_config | |
| sys.stdout = codecs.getwriter('utf_8')(sys.stdout) | |
def getDbCursor():
    """Open a MySQL connection from the ``config`` settings and return a cursor.

    The connection parameters (user, password, schema and character set) are
    read from the ``config`` module. Only the cursor is returned; the
    connection object itself is not handed back to the caller.
    """
    connect_args = {
        'user': config.MYSQL_USER,
        'passwd': config.MYSQL_PASS,
        'db': config.MYSQL_SCHEMA,
        'charset': config.MYSQL_CHARSET,
    }
    connection = MySQLdb.connect(**connect_args)
    return connection.cursor()
def countRegComment(c, regexp):
    """Count, per video, the comments whose regularized text matches a pattern.

    Args:
        c: DB-API cursor (``execute`` / ``fetchall``) on a database that has a
            ``comment`` table with ``video_id`` and ``regularized_comment``.
        regexp: MySQL REGEXP pattern to match against ``regularized_comment``.

    Returns:
        Whatever ``c.fetchall()`` yields: one ``(video_id, count)`` row per
        video that has at least one matching comment.
    """
    # The pattern is passed as a bound parameter instead of being spliced into
    # the SQL text, so a pattern containing a quote can neither break the
    # statement nor inject SQL. MySQLdb uses the "format" paramstyle (%s).
    sql_www = ("SELECT video_id, COUNT(*) AS count "
               "FROM comment "
               "WHERE regularized_comment REGEXP %s "
               "GROUP BY video_id;")
    # Debug echo only: shows the statement with the pattern inline, as before.
    # This rendered string is never sent to the server.
    print(sql_www % ("'" + regexp + "'",))
    c.execute(sql_www, (regexp,))
    return c.fetchall()
# Driver: count "w" comments and "GJ" comments per video and dump both counts
# to result.txt as tab-separated text.
c = getDbCursor()
www_video_list = countRegComment(c, 'w+')
gj_video_list = countRegComment(c, '(GJ)+')

# Each query returns (video_id, count) rows, so dict() gives video_id -> count.
www_videos = dict(www_video_list)
gj_videos = dict(gj_video_list)

# A video may appear in only one of the two result sets; take the union.
all_video_ids = set(www_videos) | set(gj_videos)

# 'with' guarantees the file is flushed and closed even if a write fails.
with open('result.txt', 'w') as f:
    # Header row: the companion R analysis reads this file with header=TRUE
    # and refers to the columns as `www` and `gj`.
    f.write("video_id\twww\tgj\n")
    # Sorted so that repeated runs produce the rows in the same order.
    for video_id in sorted(all_video_ids):
        # NOTE(review): assumes video_id is a string (join() requires it).
        result = [
            video_id,
            str(www_videos.get(video_id, 0)),  # 0 when no "w" comment matched
            str(gj_videos.get(video_id, 0)),   # 0 when no "GJ" comment matched
        ]
        f.write("\t".join(result) + "\n")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Explore the relationship between per-video "w" comment counts and "GJ"
# comment counts.
# Input: result.txt, tab-separated with a header row; the columns `www` and
# `gj` are used below.
data <- read.table('result.txt', header=TRUE, sep="\t", fileEncoding="utf8")
# print() so the summary is shown under source() as well as at the prompt.
print(summary(data))

# log(x + 1) keeps videos with a zero count finite on the log scale.
data$www <- log(data$www + 1)
data$gj <- log(data$gj + 1)

# Scatter plot. plot() opens its own frame, so no preceding plot.new() is
# needed (it would only add an extra empty frame).
plot(data$www, data$gj, xlim=c(0, 12), ylim=c(0, 12), xlab="log(www_count)", ylab="log(gj_count)")

# Try a linear regression of gj on www and draw the fitted line.
result <- lm(gj ~ www, data=data)
abline(result, col="black")
print(summary(result))

# Grid of www values over the plotted x range, used to draw the interval bands.
new <- data.frame(www = seq(0, 12, 0.1))

# Prediction interval (blue dashed). lines() adds to the existing plot in the
# same coordinate system, replacing the par(new=TRUE) + plot() overlays.
result.pre <- predict(result, new, interval="prediction")
lines(new$www, result.pre[, "lwr"], lty=2, col="blue")
lines(new$www, result.pre[, "upr"], lty=2, col="blue")

# Confidence interval (red dashed).
result.con <- predict(result, new, interval="confidence")
lines(new$www, result.con[, "lwr"], lty=2, col="red")
lines(new$www, result.con[, "upr"], lty=2, col="red")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment