-
-
Save rBatt/540b98b17f607dd091d328a453dcc8ba to your computer and use it in GitHub Desktop.
#' Read JHo
#'
#' Read in a file formatted like Jennifer's data sets: several equally sized
#' matrices of numbers interspersed with text comment lines.
#'
#' @param file a character indicating the file name (and possibly path)
#' @param n_rows integer indicating number of rows for each matrix
#' @param n_matrices integer indicating number of matrices in the file
#' @param n_comment integer indicating the number of comment lines preceding
#'   each matrix of data; should include blank lines
#' @param n_top_comment integer indicating the number of comment lines that
#'   are unique to the start of the file; default is 0 lines
#' @param ... arguments to be passed to \code{scan}, such as
#'   \code{what=numeric()} and \code{sep=","}
#'
#' @details
#' Jennifer said she had a file that had several matrices interspersed with
#' text comments. The file is assumed to start with \code{n_top_comment}
#' lines, followed by \code{n_matrices} repetitions of \code{n_comment}
#' comment lines and \code{n_rows} data lines. Each matrix is read with its
#' own call to \code{scan}, skipping every line that precedes it.
#'
#' Arguments to \code{scan} need to be specified. To read in example data, I
#' used \code{read_jho(file_name, 7, 5, 3, 1, what=numeric(), sep=",")}.
#'
#' Whatever \code{scan} returns is passed through \code{as.numeric}, so
#' \code{what="character"} can be used and the result is still numeric.
#'
#' An error is raised if the file does not exist, if any of the counts is not
#' a single whole number (\code{n_rows} and \code{n_matrices} must be at least
#' 1; the comment counts at least 0), or if the values read for a matrix
#' cannot fill \code{n_rows} complete rows (e.g. the file holds fewer matrices
#' than \code{n_matrices}).
#'
#' @return
#' Returns an array whose first two dimensions are equal to the dimensions of
#' each matrix, and the size of the third dimension is the number of matrices
#'
#' @export
read_jho <- function(file, n_rows, n_matrices, n_comment, n_top_comment = 0,
                     ...) {
  # Conditions are checked in order, so file.exists() only sees a string
  stopifnot(is.character(file), length(file) == 1L, file.exists(file))

  # TRUE when x is a single, non-missing whole number that is >= lower
  is_count <- function(x, lower) {
    is.numeric(x) && length(x) == 1L && !is.na(x) &&
      x >= lower && x == round(x)
  }
  stopifnot(
    is_count(n_rows, 1),
    is_count(n_matrices, 1),
    is_count(n_comment, 0),
    is_count(n_top_comment, 0)
  )

  # Each repetition in the file is n_comment comment lines + n_rows data lines
  block_size <- n_comment + n_rows

  data_read <- vector("list", n_matrices)
  for (i in seq_len(n_matrices)) {
    # Lines to skip: top comments, all earlier blocks, this block's comments
    line_start <- (i - 1) * block_size + n_comment + n_top_comment
    values <- scan(file, skip = line_start, nlines = n_rows, ...)
    values <- as.numeric(values)

    # Without this check matrix() would recycle values (or build an empty
    # matrix), and the function would quietly return something that is not
    # the documented array
    if (length(values) == 0L || length(values) %% n_rows != 0L) {
      stop(
        "matrix ", i, ": read ", length(values),
        " value(s), which cannot fill ", n_rows, " complete row(s); ",
        "check n_rows, n_matrices, n_comment and n_top_comment",
        call. = FALSE
      )
    }

    # scan() reads row by row, so fill the matrix row-wise
    data_read[[i]] <- matrix(values, nrow = n_rows, byrow = TRUE)
  }

  simplify2array(data_read)
}
# =========
# = Setup =
# =========

# ---- Data Dimensions ----
# All three dimensions are used to generate the fake file;
# only the row and matrix counts are needed to read it back in
n_columns <- 3 # not needed when reading
n_rows <- 7 # needed when reading
n_matrices <- 5 # needed when reading

# ---- Prepare Content ----
# Numeric content
# Shorthand for generating one random numeric matrix (nm).
# The dimensions are not arguments: n_rows and n_columns are looked up
# through R's scoping each time the function is called.
make_nm <- function() {
  n_values <- n_rows * n_columns
  matrix(rnorm(n_values), nrow = n_rows, ncol = n_columns)
}
# Alternative that takes the two dimensions as arguments instead;
# with it, make_nm() no longer works and you call e.g. make_nm(7, 3):
# make_nm <- function(mat_rows, mat_columns){matrix(rnorm(mat_rows*mat_columns), nrow=mat_rows, ncol=mat_columns)}

# Gibberish / header content
# Text values that must be skipped when reading; any text would do.
# Text with really weird characters could maybe cause problems,
# but this is a simple example.
animal_matrix <- matrix(c("cat", "dog", "sheep"), nrow = 3, ncol = 1)
# ===============
# = Create File =
# ===============

# ---- Create File ----
# Define file name
file_name <- "~/Desktop/jho_test.csv"

# Create the file (append = FALSE, so a new file is written)
# "V1" is an arbitrary fake header text
# Row and column names are switched off (they just add extra junk)
write.table(
  "V1",
  file = file_name, append = FALSE, row.names = FALSE, col.names = FALSE
) # creates file, V1 only
n_top_comment <- 1 # because just 1 "V1" line is added at the top

# ---- Add to File ----
# Append one gibberish block followed by one numeric matrix per iteration.
# seq_len() is used so that a matrix count of 0 writes nothing
# (1:0 would run the loop twice).
for (i in seq_len(n_matrices)) {
  # append = TRUE from here on, because lines are added to the existing file
  write.table(
    animal_matrix,
    file = file_name, append = TRUE, row.names = FALSE, col.names = FALSE
  ) # gibberish
  write.table(
    make_nm(),
    file = file_name, append = TRUE, row.names = FALSE, col.names = FALSE,
    sep = ","
  ) # numeric
}
# =============
# = Read File =
# =============

# ---- Data Dimensions Needed for Read ----
# See "Setup" section above
n_comment <- nrow(animal_matrix) # gibberish comment lines to skip per matrix

# ---- Read Data 1 Matrix at a Time ----
# Preallocate one list slot per matrix
data_read <- vector("list", n_matrices)

# Loop and read
for (i in seq_len(n_matrices)) {
  # Determine the line of the file at which to start reading this iteration:
  # everything consumed by earlier iterations plus the top-of-file comments,
  # then this matrix's own comment lines
  lines_read_previously <- (i - 1) * (n_comment + n_rows) + n_top_comment
  line_start <- lines_read_previously + n_comment

  # Read in the desired portion of the file
  # The 'sep' argument should refer to what separates values in the numeric
  # part; see the `write.table(make_nm(), ...` call under 'Add to File'
  # nlines is the number of rows per matrix (n_rows)
  data_read[[i]] <- scan(
    file_name,
    skip = line_start, nlines = n_rows, what = numeric(), sep = ","
  )

  # Format to matrix
  # The value returned from `scan()` is a vector, read row by row, so the
  # matrix is filled row-wise
  # The number of columns is inferred; can be set explicitly if known
  data_read[[i]] <- matrix(data_read[[i]], nrow = n_rows, byrow = TRUE)
}
# =============================================
# = Take Element-Wise Average Across Matrices =
# =============================================

# Combine the list of matrices into a single array
data_array <- simplify2array(x = data_read)

# Apply mean() over every (row, column) position of the array
data_array_mean <- apply(X = data_array, MARGIN = c(1, 2), FUN = mean)
From Amphiprion
===== BAYENV2.0 =====
input file is set
number of populations is set
number of iterations is set
seed is set
TEST = 0 . So running matrix estimation
MCMC VER 0.71 (THREADED)
ITERATIONS = 100000
INPUT FILE = for_bayenv.txt
SEED = -4441
num_alleles = 2274.000000
number of loci = 1137
VAR-COVAR MATRIX: ITER = 500
3.658524e-03 5.511923e-05 -2.603048e-04
5.511923e-05 3.682494e-03 -2.102286e-04
-2.603048e-04 -2.102286e-04 3.842626e-03
VAR-COVAR MATRIX: ITER = 1000
3.773501e-03 -3.427948e-05 -2.089264e-05
-3.427948e-05 3.607002e-03 4.486956e-05
-2.089264e-05 4.486956e-05 3.636913e-03
@jahoey --In relation to your first comment, I figured out the problem, I think (untested): in matrix() there's an argument I forgot to set to TRUE (i.e., matrix(..., byrow=TRUE)) that allows it to be filled in row-wise.
The change was on line 91 of the script and line 29 of the function.
@jahoey OK I made changes that fixed the code. I was counting lines incorrectly, but was getting away with it in my toy example just due to a coincidence (that's what I get for not testing more scenarios!).
I changed some of the names of things in the script file to match what I called them in the function, so they're more consistent now.
I got the code to work for both the 'Excel' and the 'amphiprion' versions of the data --- if you're using the original format of the data (what you said came from amphiprion) DO NOT specify the sep= argument (It wasn't tab delimited). If it's the .csv from the Excel Save As, then use sep=",".
Here is the line of code I used to run in the example data set in the "from amphiprion" format:
read_jho(file="~/Desktop/jho_test.txt", n_rows=3, n_matrices=3, n_comment=2, n_top_comment=13, what="character")
Note that I had to make a weird formatting change once I saw your actual data: the values are expressed as -2.064e-05, which doesn't read in nicely if I specify what=numeric() like I had before. What I've done inside the function is call as.numeric() on whatever is read in, which allows me to specify what="character" and still output a numeric array.
The "jho_test.txt" test file should be attached to this comment; if not, I'll just paste it into the following comment.
I think this should all work now. Let me know.
EDIT: Example File
Open TextEdit and copy paste the following, and save as "jho_test.txt" to your Desktop, then my example above should run.
===== BAYENV2.0 =====
input file is set
number of populations is set
number of iterations is set
seed is set
TEST = 0 . So running matrix estimation
MCMC VER 0.71 (THREADED)
ITERATIONS = 100000
INPUT FILE = for_bayenv.txt
SEED = -4441
num_alleles = 2274.000000
number of loci = 1137
VAR-COVAR MATRIX: ITER = 500
3.658524e-03 5.511923e-05 -2.603048e-04
5.511923e-05 3.682494e-03 -2.102286e-04
-2.603048e-04 -2.102286e-04 3.842626e-03
VAR-COVAR MATRIX: ITER = 1000
3.773501e-03 -3.427948e-05 -2.089264e-05
-3.427948e-05 3.607002e-03 4.486956e-05
-2.089264e-05 4.486956e-05 3.636913e-03
VAR-COVAR MATRIX: ITER = 1000
3.771e-03 -3.448e-05 -2.064e-05
-3.448e-05 3.602e-03 4.486e-05
-2.064e-05 4.486e-05 3.613e-03
I'm playing around with the jho_test.csv fake data using both the function and the stand alone for loop. Seems like the jho_test.csv data is being read in like a book (row by row), but then populating the array in a column by column fashion (like how we talked about for arrays yesterday).