Last active
April 12, 2021 10:27
-
-
Save benmarwick/4537873 to your computer and use it in GitHub Desktop.
R code to operate MALLET entirely from within R. Set variables, send commands to Windows' command console and get MALLET's result back into R for further analysis.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Set the working directory for this session.
# NOTE: the variable name `dir` is the same as base R's dir() function;
# the name is kept as-is so the script's variable names do not change.
dir <- "C:\\" # adjust to suit
setwd(dir)
# Configure variables and file names for MALLET.
## Here using MALLET's built-in example data and the variables from
## http://programminghistorian.org/lessons/topic-modeling-and-mallet

# folder containing txt files for MALLET to work on
importdir <- "C:\\mallet-2.0.7\\sample-data\\web\\en"
# name of file for MALLET to train model on
# (written by the import command, read by the train command below)
output <- "tutorial.mallet"
# set number of topics for MALLET to use
ntopics <- 20
# set optimisation interval for MALLET to use
optint <- 20
# set file names for output of model, extensions must be as shown
outputstate <- "topic-state.gz"
outputtopickeys <- "tutorial_keys.txt"
outputdoctopics <- "tutorial_composition.txt"

# Combine the variables into strings ready for the Windows command line.
# paste() joins its arguments with a single space by default.
cd <- "cd C:\\mallet-2.0.7" # location of the bin directory
import <- paste(
  "bin\\mallet import-dir",
  "--input", importdir,
  "--output", output,
  "--keep-sequence",
  "--remove-stopwords"
)
train <- paste(
  "bin\\mallet train-topics",
  "--input", output,
  "--num-topics", ntopics,
  "--optimize-interval", optint,
  "--output-state", outputstate,
  "--output-topic-keys", outputtopickeys,
  "--output-doc-topics", outputdoctopics
)
# Set up the system environment for R.
MALLET_HOME <- "c:/mallet-2.0.7" # location of the bin directory
# Export MALLET_HOME as an environment variable of the R process.
Sys.setenv(MALLET_HOME = MALLET_HOME)
# Put the Java bin directory at the front of PATH rather than replacing PATH
# outright, so every directory that was already on PATH stays reachable.
java_bin <- "c:/Program Files (x86)/Java/jre7/bin" # adjust to suit
path_entries <- c(java_bin, Sys.getenv("PATH"))
# Drop an empty existing PATH so no dangling separator is written.
path_entries <- path_entries[nzchar(path_entries)]
Sys.setenv(PATH = paste(path_entries, collapse = .Platform$path.sep))
# Send the commands to the Windows command prompt as a single chained
# command: cd, then import, then train, joined with " && ".
# Watch results scroll by in the R console...
mallet_cmd <- paste(cd, import, train, sep = " && ")
shell(shQuote(mallet_cmd), invisible = FALSE)
# Inspect results.
# NOTE: this changes the working directory; the output file names below are
# relative, so they are resolved against MALLET_HOME.
setwd(MALLET_HOME)
# outputstateresult <-
# Both files are read as tab-separated text without a header row.
# quote = "" turns off quote handling so that an apostrophe or double quote
# inside a field is read literally instead of opening a quoted string.
outputtopickeysresult <- read.table(
  outputtopickeys,
  header = FALSE, sep = "\t", quote = ""
)
outputdoctopicsresult <- read.table(
  outputdoctopics,
  header = FALSE, sep = "\t", quote = ""
)
# Manipulate outputdoctopicsresult to be more useful.
dat <- outputdoctopicsresult
# From column 3 onwards the columns are taken alternately: positions 3, 5, ...
# are stacked into one long column and positions 4, 6, ... into another
# (presumably topic ids and their proportions, going by the list names below
# -- confirm against the MALLET output).
# colnames(dat)[idx] is used instead of colnames(dat[, idx]) because the
# latter returns NULL when idx selects a single column.
topic_cols <- colnames(dat)[seq(3, ncol(dat) - 1, 2)]
prop_cols <- colnames(dat)[seq(4, ncol(dat), 2)]
# Wide -> long, identifying each record by the first two columns.
l_dat <- reshape(
  dat,
  idvar = 1:2,
  varying = list(topics = topic_cols, props = prop_cols),
  direction = "long"
)
library(reshape2)
# Long -> wide: one row per V2 value, one column per V3 value.
# No value.var is given, so dcast chooses the value column itself.
w_dat <- dcast(l_dat, V2 ~ V3)
rm(l_dat) # because this is very big but no longer needed
# Write the reshaped table to a CSV file for closer inspection.
write.csv(w_dat, "topic_model_table.csv")
# Find the location of that CSV file: open the current working directory,
# which should pop open a window of the folder where the CSV is.
shell.exec(getwd())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment