Created
March 31, 2016 18:07
-
-
Save rBatt/679bd7be02e4d07012ba1182e33ffc61 to your computer and use it in GitHub Desktop.
How to read a data file containing 2-dimensional data (matrices) interspersed with comment lines, in R
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Read JHo
#'
#' Read in a file formatted like Jennifer's data sets: several equally sized
#' matrices, each preceded by the same number of comment lines.
#'
#' @param file a character indicating the file name (and possibly path)
#' @param n_rows integer indicating number of rows for each matrix
#' @param n_matrices integer indicating number of matrices in the file
#' @param n_comment integer indicating the number of comment lines preceding
#'   each matrix of data; should include blank lines
#' @param n_top_comment integer indicating the number of comment lines that
#'   are unique to the start of the file; default is 0 lines
#' @param ... arguments to be passed to \code{scan}, such as
#'   \code{what=numeric()} and \code{sep=","}
#'
#' @details
#' Jennifer said she had a file that had several matrices interspersed with
#' text comments. This is an approach that I think should help read in that
#' data set.
#'
#' Arguments to \code{scan} need to be specified. To read in example data, I
#' used \code{read_jho(file_name, 7, 5, 3, 1, what=numeric(), sep=",")}.
#'
#' Each matrix is located by counting lines from the top of the file:
#' \code{n_top_comment} lines, plus all previous (comment + data) blocks,
#' plus the \code{n_comment} lines in front of the current matrix are skipped
#' before \code{n_rows} lines are read.
#'
#' @return
#' Returns an array whose first two dimensions are equal to the dimensions of
#' each matrix, and the size of the third dimension is the number of matrices.
#' Rows of each matrix correspond to lines of the file.
#'
#' @export
read_jho <- function(file, n_rows, n_matrices, n_comment, n_top_comment = 0,
                     ...) {
  stopifnot(file.exists(file)) # check that file exists
  stopifnot(
    n_rows >= 1, n_matrices >= 1,
    n_comment >= 0, n_top_comment >= 0
  )

  # Number of lines occupied by one (comment + matrix) block
  block_size <- n_comment + n_rows

  data_read <- vector("list", n_matrices)
  for (i in seq_len(n_matrices)) {
    # `skip` is counted from the top of the file on every call to scan(), so
    # the top-of-file comments must be skipped for every matrix, not just
    # the first one.
    line_start <- n_top_comment + (i - 1) * block_size + n_comment
    values <- scan(file, skip = line_start, nlines = n_rows, ...)

    # scan() returns values in the order they appear in the file (line by
    # line), so the matrix has to be filled by row to keep the file layout.
    data_read[[i]] <- matrix(values, nrow = n_rows, byrow = TRUE)
  }

  simplify2array(data_read)
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# =========
# = Setup =
# =========

# ---- Data Dimensions ----
# All three values are used to generate the fake file;
# only mat_rows and n_matrices are needed to read it back in.
n_matrices <- 5 # needed for reading
mat_rows <- 7 # needed for reading
mat_columns <- 3 # don't need to know for reading in

# ---- Prepare Content ----
# Numeric Content
# Shorthand generator for a random numeric matrix (nm).
# It takes no arguments: mat_rows and mat_columns are found through R's
# scoping rules (looked up in the enclosing environment when called).
make_nm <- function() {
  n_values <- mat_rows * mat_columns
  matrix(rnorm(n_values), nrow = mat_rows, ncol = mat_columns)
}
# Alternative that takes the two dimensions as explicit arguments instead.
# With it, make_nm() no longer works; call e.g. make_nm(7, 3).
# make_nm <- function(mat_rows, mat_columns){matrix(rnorm(mat_rows*mat_columns), nrow=mat_rows, ncol=mat_columns)}

# Gibberish / Header Content
# Placeholder text standing in for the comment lines that must be skipped
# when reading. Any text would do; very unusual characters might cause
# problems, but this is just a simple example.
animal_matrix <- matrix(data = c("cat", "dog", "sheep"), ncol = 1)
# ===============
# = Create File =
# ===============

# ---- Create File ----
# Define File Name
file_name <- "~/Desktop/jho_test.csv"

# Create file (no append)
# "V1" is an arbitrary fake header line that appears only once, at the top.
# append=FALSE so that a new file is written (any old one is replaced).
# No row or column names, because they would just add extra junk lines/fields.
write.table(
  "V1",
  file = file_name, append = FALSE,
  row.names = FALSE, col.names = FALSE
) # creates file, V1 only

# ---- Add to File ----
# Each pass appends one block: the gibberish lines, then one numeric matrix.
# seq_len() is used so that n_matrices == 0 writes no blocks at all
# (1:0 would iterate over c(1, 0)).
for (i in seq_len(n_matrices)) {
  # append=TRUE from here on, because lines are being added to the file
  write.table(
    animal_matrix,
    file = file_name, append = TRUE,
    row.names = FALSE, col.names = FALSE
  ) # gibberish
  # The numeric part is comma-separated; the reader must use the same `sep`
  write.table(
    make_nm(),
    file = file_name, append = TRUE,
    row.names = FALSE, col.names = FALSE, sep = ","
  ) # numeric
}
# =============
# = Read File =
# =============

# ---- Data Dimensions Needed for Read ----
# See "Setup" Section Above
top_header_lines <- 1 # the single "V1" line at the very top of the file
bonus_top_lines <- nrow(animal_matrix) # gibberish comment lines per matrix
good_lines <- mat_rows # these are the data lines

# ---- Read Data 1 Matrix at a Time ----
# Preallocate one list slot per matrix
data_read <- vector("list", n_matrices)

# Loop and Read
for (i in seq_len(n_matrices)) {
  # Determine how many lines of the file to skip for this iteration.
  # `skip` is counted from the top of the file on every scan() call, so the
  # top header is skipped every time (not just on the first iteration),
  # followed by all previous (gibberish + data) blocks, followed by the
  # gibberish lines in front of the current matrix.
  # If the file has no header unique to its start, set top_header_lines to 0;
  # if that header is 2 lines long, set it to 2; etc.
  lines_read_previously <- (i - 1) * (bonus_top_lines + good_lines)
  line_start <- top_header_lines + lines_read_previously + bonus_top_lines

  # Read in Desired Portion of File
  # The 'sep' argument should refer to what separates values in the numeric
  # part; see the `write.table(make_nm(), ...` call that wrote the file.
  data_read[[i]] <- scan(
    file_name,
    skip = line_start, nlines = good_lines,
    what = numeric(), sep = ","
  )

  # Format to Matrix
  # scan() returns a vector in file order (line by line), so fill the matrix
  # by row to reproduce the layout of the file.
  # The number of columns is inferred; can be set explicitly, if known.
  data_read[[i]] <- matrix(data_read[[i]], nrow = good_lines, byrow = TRUE)
}
# =============================================
# = Take Element-Wise Average Across Matrices =
# =============================================

# Stack the list of matrices into a 3-dimensional array
# (rows x columns x matrix index)
data_array <- simplify2array(data_read)

# Average over the third dimension, keeping dimensions 1 (rows) and
# 2 (columns), i.e. one mean per matrix cell
data_array_mean <- apply(X = data_array, MARGIN = c(1, 2), FUN = mean)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment