benmarwick · January 24, 2018 22:29
diff --git a/JSTOR2MALLET.r b/JSTOR2MALLET.r
 # JSTOR2MALLET: turn JSTOR DfR word-count CSV files into plain-text
 # 'bag of words' files (one txt file per CSV) for input into MALLET.
 
 # set working directory, i.e. the location of the JSTOR DfR CSV
 # files on the computer
 setwd("C:\\some directory with JSTOR DfR CSV files")
 
 # create a list of all the CSV files
 # `pattern` is a regular expression, not a glob, so anchor on the file
 # extension. The earlier "*.csv|CSV" matched ".csv" or "CSV" anywhere in
 # a name, so a second run also picked up the "<name>.csv.txt" files
 # written at the end of this script.
 myFiles <- list.files(pattern = "\\.csv$", ignore.case = TRUE)
 
 # read in all the CSV files to an R data object: a list holding one
 # data frame per file, each named after the file it came from
 myData <- lapply(myFiles, read.csv)
 names(myData) <- myFiles
 
 # Here's the step where we turn the JSTOR DfR 'wordcount' into
 # the 'bag of words' that's typically needed for topic modelling.
 # 'Untable-ing' repeats every entry of the WORDCOUNTS column as many
 # times as its WEIGHT, giving one vector of words per CSV file.
 # lapply() always returns a list; sapply() simplifies to a matrix when
 # every file expands to the same length (always the case with a single
 # file), which breaks the name-based lookup used when writing below.
 myUntabledData <- lapply(myData, function(wordTable) {
   rep(wordTable$WORDCOUNTS, times = wordTable$WEIGHT)
 })
 names(myUntabledData) <- myFiles
 
 # And here's the step where we create individual txt files, one per
 # CSV file, that should be suitable for input into MALLET. Words are
 # separated by single spaces (eol = " "). col.names = FALSE keeps the
 # column header (the file name) from becoming the first 'word' of
 # every document.
 for (csvName in myFiles) {
   write.table(data.frame(myUntabledData[[csvName]]),
               file = paste(csvName, "txt", sep = "."),
               quote = FALSE, row.names = FALSE, col.names = FALSE,
               eol = " ")
 }
 
 # Look in the working directory to find the txt files
	# JSTOR2MALLET: turn JSTOR DfR word-count CSV files into plain-text
	# 'bag of words' files (one txt file per CSV) for input into MALLET.

	# set working directory, i.e. the location of the JSTOR DfR CSV
	# files on the computer
	setwd("C:\\some directory with JSTOR DfR CSV files")

	# create a list of all the CSV files
	# `pattern` is a regular expression, not a glob, so anchor on the file
	# extension. The earlier "*.csv\|CSV" was not even a valid R string
	# ('\|' is an unrecognized escape), and an unanchored ".csv" pattern
	# would also match the "<name>.csv.txt" files written at the end of
	# this script on a second run.
	myFiles <- list.files(pattern = "\\.csv$", ignore.case = TRUE)

	# read in all the CSV files to an R data object: a list holding one
	# data frame per file, each named after the file it came from
	myData <- lapply(myFiles, read.csv)
	names(myData) <- myFiles

	# Here's the step where we turn the JSTOR DfR 'wordcount' into
	# the 'bag of words' that's typically needed for topic modelling.
	# 'Untable-ing' repeats every entry of the WORDCOUNTS column as many
	# times as its WEIGHT, giving one vector of words per CSV file.
	# lapply() always returns a list; sapply() simplifies to a matrix when
	# every file expands to the same length (always the case with a single
	# file), which breaks the name-based lookup used when writing below.
	myUntabledData <- lapply(myData, function(wordTable) {
	  rep(wordTable$WORDCOUNTS, times = wordTable$WEIGHT)
	})
	names(myUntabledData) <- myFiles

	# And here's the step where we create individual txt files, one per
	# CSV file, that should be suitable for input into MALLET. Words are
	# separated by single spaces (eol = " "). col.names = FALSE keeps the
	# column header (the file name) from becoming the first 'word' of
	# every document.
	for (csvName in myFiles) {
	  write.table(data.frame(myUntabledData[[csvName]]),
	              file = paste(csvName, "txt", sep = "."),
	              quote = FALSE, row.names = FALSE, col.names = FALSE,
	              eol = " ")
	}

	# Look in the working directory to find the txt files