# The data can be found at https://github.com/tdhopper/topic-modeling-datasets/blob/master/README.md
# The stop word list can be found at https://github.com/stanfordnlp/CoreNLP/blob/master/data/edu/stanford/nlp/patterns/surface/stopwords.txt
# Data cleaning follows http://www.mjdenny.com/Text_Processing_In_R.html
library(stringr)
library(gtools)
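# Overview of this script:
# 1. createDataFile() extracts and normalizes paper abstracts from 'cgcbib.txt'.
# 2. The cleaned abstracts are turned into a document-term matrix with the tm package.
# 3. HDP() runs a collapsed Gibbs sampler for a Hierarchical Dirichlet Process topic
#    model (direct-assignment representation) on that matrix.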
createDataFile = function(){
  # Read the bibliography file line by line and emit one cleaned abstract per line
  processFile = function(filepath) {
    con = file(filepath, "r")
    inAbstract = FALSE
    abs = ""
    while ( TRUE ) {
      line = readLines(con, n = 1)
      if ( length(line) == 0 ) {
        break
      }
      if ( startsWith(line, " Abstract:")){
        inAbstract = TRUE
        line = substring(iconv(line, "latin1", "ASCII", sub=""), 13)
      }
      if ( startsWith(line, " -------------------")){
        inAbstract = FALSE
        #' Remove everything that is not a letter or whitespace (you may want to
        #' keep more in your actual analyses).
        temp <- stringr::str_replace_all(str_trim(abs), "[^a-zA-Z\\s]", " ")
        temp <- tolower(str_trim(temp))
        # Collapse runs of whitespace down to a single space
        temp <- stringr::str_replace_all(temp, "[\\s]+", " ")
        if(temp != ""){
          cat(temp)
          cat('\n')
        }
        abs = ""
      }
      if(inAbstract){
        abs = paste(abs, line, sep=" ")
      }
    }
    close(con)
  }
  processFile('cgcbib.txt')
}
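# processFile() writes the cleaned abstracts to standard output via cat(); to produce
# the 'data.txt' file read at the bottom of this script, one way (an assumption, not
# part of the original gist) is to redirect the output with sink():
#   sink("data.txt"); createDataFile(); sink()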
HDP = function(obs, test_data){
  # Hyperparameters
  init_K <- 62        # initial number of topics
  gamma_a <- .1       # used to initialize gamma as gamma_a / gamma_b
  gamma_b <- 1
  h_sym <- 0.5        # symmetric Dirichlet smoothing over words within a topic
  alpha0_a <- .1      # used to initialize alpha as alpha0_a / alpha0_b
  alpha0_b <- 1
  beta_u <- 1         # stick-breaking mass left over for as-yet-unseen topics
  maxIter <- 1000     # (not used; the main loop below runs 50 sweeps)
  # Document-term counts; drop documents with fewer than two words
  cntm <- as.matrix(obs)
  numw_doc <- rowSums(cntm)
  docs_sel <- as.vector(numw_doc > 1)
  cntm <- cntm[docs_sel, ]
  num_doc <- dim(cntm)[1]
  num_word <- dim(cntm)[2]
  numw_doc <- rowSums(cntm)
  word_occ <- colSums(cntm)
  K <- init_K
  topics <- vector()  # labels of the currently active topics
  # Randomly assign each word token in each document to one of init_K topics
  init.Z = function(){
    Z <- list()
    freq = rep(1./init_K, init_K)
    for(i in 1:num_doc){
      word_t <- rmultinom(numw_doc[i], 1, freq)
      Z[[i]] <- max.col(t(word_t))
    }
    return(Z)
  }
  # njk[[j]][[k]]: number of words in document j assigned to topic k
  init.njk = function(Z){
    njk <- list()
    for(i in 1:num_doc){
      tmp = list()
      for(j in 1:K)
        tmp[[as.character(j)]] <- 0
      for(j in seq_along(Z[[i]])){
        zij <- as.character(Z[[i]][[j]])
        tmp[[zij]] <- tmp[[zij]] + 1
      }
      njk[[i]] <- tmp
    }
    return(njk)
  }
  # nkw[[k]][w]: number of times word w is assigned to topic k
  init.nkw = function(Z){
    nkw <- list()
    for(i in 1:K){
      nkw[[as.character(i)]] <- array(0, num_word)
    }
    for(i in 1:num_doc){
      idx <- 0
      for(j in 1:num_word){
        if (cntm[[i,j]] != 0)
          for(k in 1:cntm[[i,j]]){
            idx <- idx + 1
            zij <- as.character(Z[[i]][[idx]])
            nkw[[zij]][[j]] <- nkw[[zij]][[j]] + 1
          }
      }
    }
    return(nkw)
  }
  # nk[[k]]: total number of words assigned to topic k
  init.nk = function(Z){
    nk <- list()
    for(i in 1:K){
      nk[[as.character(i)]] <- 0
    }
    for(i in 1:num_doc){
      idx <- 0
      for(j in 1:num_word)
        if (cntm[[i,j]] > 0)
          for(k in 1:cntm[[i,j]]){
            idx <- idx + 1
            zij <- as.character(Z[[i]][idx])
            nk[[zij]] <- nk[[zij]] + 1
          }
    }
    return(nk)
  }
  init.alpha = function(alpha0_a, alpha0_b){
    return(alpha0_a/alpha0_b)
  }
  init.gamma = function(gamma_a, gamma_b){
    return(gamma_a/gamma_b)
  }
  # Initialize the global topic weights beta by stick-breaking over the active topics
  init.beta = function(){
    beta <- list()
    new_K <- 0
    for(k in 1:K){
      if(nk[[as.character(k)]] > 0){
        new_K <- new_K + 1
        beta_l <- rbeta(1, 1, gamma)
        beta[[as.character(k)]] <- beta_l * beta_u
        beta_u <<- beta_u * (1 - beta_l)
        topics <<- c(topics, k)
      }
    }
    K <<- new_K
    return(beta)
  }
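  # Stick-breaking, as used above: for each active topic draw v_k ~ Beta(1, gamma) and
  # set beta_k = v_k * (remaining mass); the remaining mass shrinks by a factor (1 - v_k).
  # beta_u holds that leftover mass and acts as the prior weight of a new, unseen topic.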
  # One Gibbs sweep: resample the topic assignment of every word token in every document
  sample.Z = function(Z){
    for(j in 1:num_doc){
      w <- 0
      for(w1 in 1:num_word)
        if (cntm[[j,w1]] > 0)
          for(w2 in 1:cntm[[j,w1]])
          {
            w <- w + 1
            zij <- as.character(Z[[j]][w])
            if(njk[[j]][[zij]] == 0)
              stop("before error")
            # Unnormalized posterior over the K active topics plus one brand-new topic
            post <- array(0, K+1)
            for(i in 1:K){
              k <- as.character(topics[[i]])
              if(nk[[zij]]-(k==zij) == 0)
                post[[i]] <- 0
              else{
                if (!is.null(njk[[j]][[k]]) && !is.na(njk[[j]][[k]]))
                  njki = njk[[j]][[k]]
                else
                  njki = 0
                # CRP-style prior for topic k in document j (counts exclude the current word)
                prior <- njki - (k==zij) + alpha * beta[[k]]
                # Topic-word likelihood with symmetric Dirichlet smoothing h_sym
                likelihood <- (max(0,nkw[[k]][w1])-(k==zij)+h_sym)/(max(0, nk[[k]])-(k==zij)+h_sym*num_word)
                if(is.na(prior) || is.na(likelihood) || likelihood < 0 || prior < 0)
                {
                  #print(njk[[j]])
                  #print(numw_doc[j])
                  print(k)
                  print(j)
                  stop("error in likelihood or prior")
                }
                post[i] <- prior * likelihood
              }
            }
            # Probability of opening a brand-new topic
            post[[K+1]] <- alpha * beta_u * (1./num_word)
            if(is.na(post[[K+1]]))
              stop(c(alpha, "error", beta_u))
            if(sum(post) == 0)
            {
              print(njk[[j]])
              print(numw_doc[[j]])
              stop("post sum to 0")
            }
            if(anyNA(post)){
              print(post)
              stop("post contains NA")
            }
            post <- post / sum(post)
            if(anyNA(post)){
              print(post)
              stop("post contains NA II")
            }
            # Draw the new assignment; index K+1 means "open a new topic"
            topic <- which.max(rmultinom(1, 1, post))
            if (topic > K){
              # Break a new piece off the remaining stick for the new topic
              beta_l <- rbeta(1, 1, gamma)
              beta[[as.character(topic)]] <<- beta_l * beta_u
              beta_u <<- beta_u * (1 - beta_l)
              K <<- K + 1
              topics[K] <<- topic
              nkw[[as.character(K)]] <<- array(0, num_word)
              nk[[as.character(K)]] <<- 0
              for(jj in 1:num_doc)
                njk[[jj]][[as.character(K)]] <<- 0
              print(c("add topic", K))
            }
            else
              topic <- topics[[topic]]
            Z[[j]][[w]] <- topic
            topic <- as.character(topic)
            if (topic != zij){
              # Move the word's counts from its old topic to its new one
              njk[[j]][[zij]] <<- njk[[j]][[zij]] - 1
              nkw[[zij]][[w1]] <<- nkw[[zij]][[w1]] - 1
              nk[[zij]] <<- nk[[zij]] - 1
              #print(c(zij,"down to",nk[[zij]]))
              if(nk[[zij]] == 0){
                # The old topic is now empty: retire it
                for(tmp in 1:K)
                  if(topics[tmp] == zij){
                    topics[[tmp]] <<- topics[K]
                    topics <<- head(topics, -1)
                    break
                  }
                K <<- K - 1
                nk[[zij]] <<- NULL
                nkw[[zij]] <<- NULL
                for(jj in 1:num_doc)
                  njk[[jj]][[zij]] <<- NULL
                print(c("delete topic", K))
              }
              njk[[j]][[topic]] <<- njk[[j]][[topic]] + 1
              nkw[[topic]][[w1]] <<- nkw[[topic]][[w1]] + 1
              nk[[topic]] <<- nk[[topic]] + 1
            }
            if(njk[[j]][[topic]] == 0)
              stop("after error")
          }
    }
    print("sample z")
    return(Z)
  }
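  # The sweep above implements the direct-assignment Gibbs conditional:
  #   p(z_ji = k | rest)  is proportional to
  #     (njk[-ji] + alpha*beta_k) * (nkw[-ji] + h_sym) / (nk[-ji] + h_sym*num_word)
  # for each active topic k, and to alpha * beta_u / num_word for a brand-new topic,
  # where [-ji] means counts excluding the word currently being resampled.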
  # Sample m, the number of "tables" serving each topic across all documents;
  # m is used as the Dirichlet parameter vector when resampling beta
  sample.m = function(){
    m <- array(0, K+1)
    m[K+1] <- gamma
    for(i in 1:K){
      k <- topics[[i]]
      concen <- alpha * beta[[as.character(k)]]
      tmp <- 0
      for(j in 1:num_doc){
        if(k <= length(njk[[j]]) && !is.null(njk[[j]][[as.character(k)]]) && !is.na(njk[[j]][[as.character(k)]])){
          now <- njk[[j]][[as.character(k)]]
          tmp <- tmp + now
          if(now > 0){
            p <- concen / (concen + now)
            m[i] <- m[i] + rbinom(1, now, p)
          }
        }
      }
      # Consistency check: the per-document counts should add up to nk[[k]]
      if (tmp != nk[[as.character(k)]])
        stop(c(tmp, "nk error", nk[[as.character(k)]]))
    }
    return(m)
  }
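  # Note: the rbinom() draw above is a rough stand-in for the exact table-count
  # (Chinese restaurant / Antoniak) sampling. A minimal sketch of the exact draw,
  # not called by the sampler and not part of the original gist:
  sample.tables = function(concen, now){
    if(now == 0) return(0)
    # customer i opens a new table with probability concen / (concen + i - 1)
    sum(runif(now) < concen / (concen + seq_len(now) - 1))
  }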
  # Resample the global topic weights: beta ~ Dirichlet(m_1, ..., m_K, gamma)
  sample.beta = function(m){
    new_beta <- rdirichlet(1, m)
    for(i in 1:K){
      k <- as.character(topics[[i]])
      beta[[k]] <<- new_beta[[i]]
    }
    beta_u <<- new_beta[[K+1]]
  }
  #sample.alpha=function(){
  #}
  #sample.gamma=function(){
  #}
  #Initialize
  Z <- init.Z()       #Topic assignment of each word in each document
  njk <- init.njk(Z)  #Number of words in each topic k in each document j
  nkw <- init.nkw(Z)  #Number of occurrences of word w in topic k
  nk <- init.nk(Z)    #Number of words in topic k
  alpha <- init.alpha(alpha0_a, alpha0_b)
  gamma <- init.gamma(gamma_a, gamma_b)
  beta <- init.beta()
  # Gibbs sampling: alternate between the topic assignments, the table counts,
  # and the global topic weights
  for(cas in 1:50){
    Z <- sample.Z(Z)
    sample.beta(sample.m())
    #print(beta)
    #print(beta_u)
  }
  # Return the sampler state so the caller can inspect the learned topics
  return(list(Z = Z, beta = beta, beta_u = beta_u, topics = topics, nkw = nkw, nk = nk))
}
#createDataFile()
require(tm)
x <- read.csv('data.txt', header = FALSE)
# Note: recent versions of tm expect DataframeSource() to receive a data frame with
# 'doc_id' and 'text' columns; with older versions a one-column data frame works.
corp <- Corpus(DataframeSource(x))
corp <- tm_map(corp, removeWords, stopwords("english"))
# Keep only terms that appear in at least 10 documents
dtm <- DocumentTermMatrix(corp, control=list(bounds = list(global = c(10,Inf))))
res <- HDP(dtm, dtm)
#dim(dtm)
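# Example of inspecting the fitted topics (a sketch; it assumes HDP() returns its
# sampler state as above, with res$nkw aligned to the columns of dtm):
#   for(k in as.character(res$topics))
#     print(head(Terms(dtm)[order(res$nkw[[k]], decreasing = TRUE)], 10))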