Last active
August 29, 2015 14:25
-
-
Save ahebrank/fc32dfafe0b855584815 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Usage: source this file, then call
#
#   splitfile("workbook.xlsx", 1)
#
# to extract sheet 1 from workbook.xlsx, writing one text file per row.
# Settings read by splitfile(); change them here if your sheet is laid out
# differently.

# subj_col: index of the column whose value names each output file
subj_col <- 1
# content_col: index of the column holding the text written into each file
content_col <- 2
# is there a header row?
has_header <- TRUE
# directory that receives the output files (splitfile() creates it if missing)
output_directory <- "txt"

# library() stops with an error when gdata is not installed; require() would
# only return FALSE and let splitfile() fail later at read.xls().
library(gdata)
# Split one worksheet into per-row text files.
#
# Reads `sheet` of the workbook `filename` with gdata's read.xls() and, for
# every row, writes the cell in column `content_col` to
# "<output_directory>/<subject>.txt", where <subject> is the cell in column
# `subj_col`. Uses the file-level settings subj_col, content_col, has_header
# and output_directory, and the helper trim_quotes().
#
# Arguments:
#   filename  path of the workbook to read
#   sheet     sheet to read, passed on to read.xls() (default 1)
#
# Returns NULL invisibly; called for its side effects. Each output path is
# printed as it is written. Stops with an error if subj_col or content_col
# exceeds the number of columns read.
splitfile <- function(filename, sheet = 1) {
  if (!file.exists(output_directory)) {
    dir.create(output_directory)
  }

  # quote = '' switches off quote handling while reading, so cells arrive with
  # their raw quoting; trim_quotes() strips it cell by cell below.
  x <- gdata::read.xls(filename, sheet, header = has_header, quote = '',
                       method = 'tab')

  if (subj_col > ncol(x) || content_col > ncol(x)) {
    # stop() raises the error; the previous error() call is not an R function.
    stop(
      'Either subject col or content col are outside number of columns ',
      'available',
      call. = FALSE
    )
  }

  # seq_len() yields an empty sequence for a sheet without rows, whereas
  # 1:nrow(x) would iterate over c(1, 0).
  for (i in seq_len(nrow(x))) {
    sn <- trim_quotes(x[i, subj_col])
    if (is.na(sn)) {
      # Without a subject there is no file name to write to.
      warning('Row ', i, ' has no subject; skipped', call. = FALSE)
      next
    }
    if (nchar(sn) > 17) {
      # probably a bad conversion from numeric to string; re-format it, but
      # only when it really parses as a number so that long text subjects are
      # not turned into "NA"
      num <- suppressWarnings(as.numeric(sn))
      if (!is.na(num)) {
        sn <- as.character(num)
      }
    }
    outfn <- file.path(output_directory, paste0(sn, '.txt'))
    print(outfn)
    # Passing the path lets writeLines() open and close the file itself, so no
    # connection is left open if the write fails.
    writeLines(as.character(trim_quotes(x[i, content_col])), outfn)
  }

  invisible(NULL)
}
# Strip the quoting left around a cell value.
#
# Removes one double quote at the very start and one at the very end of each
# element, then turns escaped quotes (backslash followed by a double quote)
# back into plain double quotes. All other characters are left untouched.
#
# Arguments:
#   cell  vector of cell values; gsub() coerces it to character
#
# Returns a character vector of the same length as `cell`.
trim_quotes <- function(cell) {
  # Anchored alternation: only a leading or a trailing quote is removed. The
  # previous pattern was a bracket expression, which deleted every '(', ')',
  # '^', '$' and '"' wherever it occurred in the text.
  x <- gsub('^"|"$', '', cell)
  # Unescape embedded quotes: \" becomes ". A backslash that is not followed
  # by a quote is kept as is.
  gsub('\\"', '"', x, fixed = TRUE)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment