Last active
October 4, 2024 00:54
-
-
Save benmarwick/55ab2860d3bdb2a85478 to your computer and use it in GitHub Desktop.
Function to import Agilent GCMS Chemstation D files in R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
##' Function readDFile | |
##' | |
##' Function readDFile | |
##' @param pathname the pathname of the directory containing the data to import | |
##' @return outData Output is a matrix of ion counts with rows as scantime and | |
##' columns as mass, and the respective values as labels | |
##' @export | |
readDFile<-function(pathname){ | |
filename<-file.path(pathname,'DATA.MS') | |
cat('Opening Agilent file...\n') | |
to.read<-file(filename,'rb') | |
agilent<-readBin(to.read,integer(),size=1,signed=FALSE,n=20000000, endian='little') | |
close(to.read) | |
### preparing vector with counts per scan | |
cat('...extracting counts per scan...\n') | |
countsNumber<-agilent[5782] | |
counts<-agilent[5782] | |
currentPosition<-5782 | |
while(currentPosition<length(agilent)){ | |
jumpLength<-countsNumber*4+7*4 | |
currentPosition<-currentPosition+jumpLength | |
countsNumber<-agilent[currentPosition] | |
counts<-c(counts,agilent[currentPosition]) | |
} | |
### counts will be too long and the last entry is NA | |
### counts will be corrected when the number of scans | |
### is known. | |
#counts<-countNumber(agilent) | |
### cut away NA. | |
counts<-counts[-which(is.na(counts))] | |
### the second period is extracted. This one is currently | |
### used to determine the number of scans. As the useful length | |
### is not known yet, it is extracted in overlength. Na's have | |
### to be removed then | |
cat('...determine number of scans...\n') | |
secondPeriod<-agilent[seq(5772,2000000,4)] | |
### remove Na's | |
if(any(which(is.na(secondPeriod)))){ | |
secondPeriod<-secondPeriod[-which(is.na(secondPeriod))] | |
} | |
### inline function to extract the data paddings | |
betweenSequence<-function(period,counts){ | |
tempSequence<-period[1:4] | |
currentPosition<-4 | |
for(ii in counts){ | |
currentPosition<-(currentPosition+ii) | |
tempSequence<-c(tempSequence,period[currentPosition:(currentPosition+7)]) | |
currentPosition<-(currentPosition+7) | |
} | |
return(tempSequence) | |
} | |
### the betweenSequence removes the variable length ion count | |
### data. Left are 8 numbers of padding betweeen each scan | |
betweenSecond<-betweenSequence(secondPeriod,counts) | |
### in the betweenSequence of the second Period, the 8th | |
### field is currently used for determination of scan numbers | |
### when it is 3 times in sequence zero, | |
numberOfScans<-which.max(abs(diff(betweenSecond[seq(1,100000,8)]))>1) | |
counts<-counts[1:numberOfScans] | |
rawExtractLength<-sum(counts)*4+(numberOfScans*4*7)+5770 | |
### extract periods with correct length | |
cat('...separate rawdata in four sequences...\n') | |
firstPeriod<-agilent[seq(5771,rawExtractLength,4)] | |
secondPeriod<-agilent[seq(5772,rawExtractLength,4)] | |
thirdPeriod<-agilent[seq(5773,rawExtractLength,4)] | |
fourthPeriod<-c(agilent[seq(5770,rawExtractLength,4)][-1],0) | |
## extract second, third and fourth between for the SCAN TIME | |
cat('...extracting scantimes...\n') | |
betweenFirst<-betweenSequence(firstPeriod,counts) | |
betweenSecond<-betweenSequence(secondPeriod,counts) | |
betweenThird<-betweenSequence(thirdPeriod,counts) | |
betweenFourth<-betweenSequence(fourthPeriod,counts) | |
scanTime<-betweenFirst[seq(1,8*numberOfScans,8)]*16777216 | |
+betweenSecond[seq(1,8*numberOfScans,8)]*65536 | |
+betweenThird[seq(1,8*numberOfScans,8)]*256 | |
+betweenFourth[seq(1,8*numberOfScans,8)] | |
scanTime<-round(scanTime/1000/60,4) | |
## extract main sequence, reverse them scan wise | |
mainSequence<-function(period,counts){ | |
tempSequence<-NULL | |
currentPosition<-5 | |
for(ii in counts){ | |
tempSequence<-c(tempSequence,period[(currentPosition+ii-1):currentPosition]) | |
currentPosition<-(currentPosition+ii+7) | |
} | |
return(tempSequence) | |
} | |
### extract main data for INT and MZ | |
cat('...extract intensity and Mz data...\n') | |
mainFirst<-mainSequence(firstPeriod,counts) | |
mainSecond<-mainSequence(secondPeriod,counts) | |
mainThird<-mainSequence(thirdPeriod,counts) | |
mainFourth<-mainSequence(fourthPeriod,counts) | |
### calculate MZs. This will result in the *real*, used MZs. For the case that the | |
### detector is switched off, zero values are included. | |
importMz<-round(mainFirst*12.8+mainSecond*0.05) | |
### calculate intensity values | |
cat('...calculate intensity values...\n') | |
mainFourth[which(floor(mainThird/64)==3)]<-(mainFourth[which(floor(mainThird/64)==3)]*512) | |
mainFourth[which(floor(mainThird/64)==2)]<-(mainFourth[which(floor(mainThird/64)==2)]*64) | |
mainFourth[which(floor(mainThird/64)==1)]<-(mainFourth[which(floor(mainThird/64)==1)]*8) | |
mainThird[which(floor(mainThird/64)==3)]<-((mainThird[which(floor(mainThird/64)==3)] %% 192)*131072) | |
mainThird[which(floor(mainThird/64)==2)]<-((mainThird[which(floor(mainThird/64)==2)] %% 128)*16384) | |
mainThird[which(floor(mainThird/64)==1)]<-((mainThird[which(floor(mainThird/64)==1)] %% 64)*2048) | |
mainThird[which(floor(mainThird/64)==0)]<-(mainThird[which(floor(mainThird/64)==0)]*256) | |
importInt<-(mainThird+mainFourth) | |
cat('...assembling data matrix...\n') | |
DATA<-matrix(rep(0,numberOfScans*max(importMz)),numberOfScans,max(importMz)) | |
position<-1 | |
for(ii in 1:numberOfScans){ | |
for(jj in counts[ii]){ | |
if(counts[ii]){ | |
DATA[ii,importMz[position:(position+jj-1)]]<-importInt[position:(position+jj-1)] | |
position<-position+jj | |
}else{ | |
position<-position+1 | |
} | |
} | |
} | |
rownames(DATA)<-scanTime | |
colnames(DATA)<-seq(1:dim(DATA)[2]) | |
#### Output is a matrix of ion counts with rows as scantime and columns as mass, | |
#### and the respective values as labels | |
return(DATA) | |
} | |
# To use the function, setwd to the dir with the DATA.MS, then, | |
my_data <- readDFile(getwd()) | |
abundance <- rowSums(df) | |
# but all scan times are zeros! | |
# As this doesn't get the scan times for me, my workaround is to use Openchrom to open the Agilent DATA.MS files (with the plug-in), then 'save-as' CSV. The CSVs then need to be cleaned further before plots can be made. Or we can convert files batchwise to xy files (they are actually txt files, see attached). | |
# Step 1: | |
# Menu bar>File>Import > select "Convert MSD chromatograms to *.ocb format" | |
# | |
# Step 2: | |
# Menu bar>File>Import > select "Convert FID, ECD... chromatograms to *.xy or *.ocb format" | |
# then look for the xy file, which is a plain text, two column file | |
# read in CSV files from openchrom (assuming we have a batch) | |
the_csv_files_from_openchrom <- list.files(pattern = ".csv$") | |
the_chrom_data <- lapply(the_csv_files_from_openchrom, read.csv2) | |
# convert all cols to numeric | |
asNumeric <- function(x) as.numeric(as.character(x)) | |
factorsNumeric <- function(d) modifyList(d, lapply(d[, sapply(d, is.factor)], | |
asNumeric)) | |
the_chrom_data <- lapply(the_chrom_data, function(i) factorsNumeric(i)) | |
# get only 'abundance' and 'time' for plotting | |
abundance_time <- lapply(the_chrom_data, function(i) data.frame(time = i$RT.minutes....NOT.USED.BY.IMPORT, abundance = rowSums(i[, 4:ncol(i)]) )) | |
# add sample names to the data | |
names(abundance_time) <- gsub(".csv", "", the_csv_files_from_openchrom) | |
indices <- lapply(abundance_time, nrow) | |
sample_ID <- unlist(lapply(seq_along(names(abundance_time)), function(i) rep(names(abundance_time)[i], indices[i]))) | |
abundance_time <- do.call(rbind.data.frame, abundance_time) | |
abundance_time$sample_ID <- sample_ID | |
# make a grid of plots | |
library(ggplot2) | |
ggplot(abundance_time, aes(time, abundance)) + | |
geom_line() + | |
xlab("time (min)") + | |
facet_wrap(~sample_ID, scales = "free_y") | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment