Last active
November 26, 2017 17:14
-
-
Save ratsgo/75f40be1cc3efb76c3ea206146eae959 to your computer and use it in GitHub Desktop.
Word2Vec 분석
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(stringr)
# loading
# DTM: document-term matrix (one row per document, one column per word),
# restored from a serialized R object.
DTM <- readRDS('dtm.rds')
# vec: word embeddings in space-separated text form; the first line of the
# file is skipped, column 1 holds the word and the remaining columns hold the
# vector components.
#   quote = ""                 -> quote characters inside tokens are ordinary
#                                 text, not field delimiters
#   na.strings = character(0)  -> a literal token "NA" stays a word instead of
#                                 being read as a missing value
#   stringsAsFactors = FALSE   -> words stay character on every R version
vec <- read.csv('word2vec.txt', fileEncoding = 'utf-8', sep = " ",
                header = FALSE, skip = 1, quote = "",
                na.strings = character(0), stringsAsFactors = FALSE)
# distance matrix
# Pairwise distances between all word vectors (dist() default metric), taken
# over every column of `vec` except the first, expanded to a full square
# matrix whose rows and columns are both labelled with the words in column 1.
distance <- as.matrix(dist(vec[, -1]))
dimnames(distance) <- list(vec[, 1], vec[, 1])
# weight matrix
# Feature keywords: design, screen, sound quality, specs, camera, software,
# battery.
func <- c('디자인','화면','음질','스펙','카메라','소프트웨어','배터리')
funclocation <- which(rownames(distance) %in% func)
if (length(funclocation) == 0) {
  stop("none of the feature keywords in `func` are in the embedding vocabulary",
       call. = FALSE)
}
missing_func <- setdiff(func, rownames(distance))
if (length(missing_func) > 0) {
  warning("feature keywords not found in the embedding vocabulary: ",
          paste(missing_func, collapse = ", "), call. = FALSE)
}
# drop = FALSE keeps `weight` a matrix even when only one keyword is found.
weight <- distance[funclocation, , drop = FALSE]
# Turn distances into similarity weights: distance 0 maps to 1 and larger
# distances decay towards 0.
weight <- exp(-weight^2 / 100)
# Put the word columns in sorted order so they line up with the DTM columns.
location <- order(colnames(weight))
weight <- weight[, location, drop = FALSE]
# The words of `weight` and of `DTM` must match column by column, otherwise
# the inner product below is meaningless. The list of mismatching positions
# must be empty.
if (ncol(weight) != ncol(DTM)) {
  stop("weight has ", ncol(weight), " word columns but DTM has ", ncol(DTM),
       call. = FALSE)
}
mismatch <- which(colnames(weight) != colnames(DTM))
mismatch
if (length(mismatch) > 0) {
  stop(length(mismatch), " word column(s) of weight and DTM do not match; ",
       "first mismatch at column ", mismatch[1], call. = FALSE)
}
# inner-product
# One row per document, one column per feature keyword: each score is the sum
# of the document's word counts weighted by the word's similarity to the
# keyword.
result <- as.matrix(DTM) %*% t(weight)
# Number of words in each document.
doc.length <- rowSums(DTM)
# post-processing
# A missing length is treated as a single-word document.
doc.length[is.na(doc.length)] <- 1
# Documents with two words or fewer are excluded: their scores are set to 0.
too_short <- doc.length < 3
# Normalize every document's scores by its word count (row-wise division).
# Rows of short documents may hold NaN/Inf here when the length is 0; they are
# overwritten on the next line.
result <- sweep(result, 1, doc.length, "/")
result[too_short, ] <- 0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment