Skip to content

Instantly share code, notes, and snippets.

@benmarwick
Last active October 4, 2024 00:54
Show Gist options
  • Save benmarwick/55ab2860d3bdb2a85478 to your computer and use it in GitHub Desktop.
Save benmarwick/55ab2860d3bdb2a85478 to your computer and use it in GitHub Desktop.
Function to import Agilent GCMS Chemstation D files in R
##' Function readDFile
##'
##' Function readDFile
##' @param pathname the pathname of the directory containing the data to import
##' @return outData Output is a matrix of ion counts with rows as scantime and
##' columns as mass, and the respective values as labels
##' @export
readDFile<-function(pathname){
filename<-file.path(pathname,'DATA.MS')
cat('Opening Agilent file...\n')
to.read<-file(filename,'rb')
agilent<-readBin(to.read,integer(),size=1,signed=FALSE,n=20000000, endian='little')
close(to.read)
### preparing vector with counts per scan
cat('...extracting counts per scan...\n')
countsNumber<-agilent[5782]
counts<-agilent[5782]
currentPosition<-5782
while(currentPosition<length(agilent)){
jumpLength<-countsNumber*4+7*4
currentPosition<-currentPosition+jumpLength
countsNumber<-agilent[currentPosition]
counts<-c(counts,agilent[currentPosition])
}
### counts will be too long and the last entry is NA
### counts will be corrected when the number of scans
### is known.
#counts<-countNumber(agilent)
### cut away NA.
counts<-counts[-which(is.na(counts))]
### the second period is extracted. This one is currently
### used to determine the number of scans. As the useful length
### is not known yet, it is extracted in overlength. Na's have
### to be removed then
cat('...determine number of scans...\n')
secondPeriod<-agilent[seq(5772,2000000,4)]
### remove Na's
if(any(which(is.na(secondPeriod)))){
secondPeriod<-secondPeriod[-which(is.na(secondPeriod))]
}
### inline function to extract the data paddings
betweenSequence<-function(period,counts){
tempSequence<-period[1:4]
currentPosition<-4
for(ii in counts){
currentPosition<-(currentPosition+ii)
tempSequence<-c(tempSequence,period[currentPosition:(currentPosition+7)])
currentPosition<-(currentPosition+7)
}
return(tempSequence)
}
### the betweenSequence removes the variable length ion count
### data. Left are 8 numbers of padding betweeen each scan
betweenSecond<-betweenSequence(secondPeriod,counts)
### in the betweenSequence of the second Period, the 8th
### field is currently used for determination of scan numbers
### when it is 3 times in sequence zero,
numberOfScans<-which.max(abs(diff(betweenSecond[seq(1,100000,8)]))>1)
counts<-counts[1:numberOfScans]
rawExtractLength<-sum(counts)*4+(numberOfScans*4*7)+5770
### extract periods with correct length
cat('...separate rawdata in four sequences...\n')
firstPeriod<-agilent[seq(5771,rawExtractLength,4)]
secondPeriod<-agilent[seq(5772,rawExtractLength,4)]
thirdPeriod<-agilent[seq(5773,rawExtractLength,4)]
fourthPeriod<-c(agilent[seq(5770,rawExtractLength,4)][-1],0)
## extract second, third and fourth between for the SCAN TIME
cat('...extracting scantimes...\n')
betweenFirst<-betweenSequence(firstPeriod,counts)
betweenSecond<-betweenSequence(secondPeriod,counts)
betweenThird<-betweenSequence(thirdPeriod,counts)
betweenFourth<-betweenSequence(fourthPeriod,counts)
scanTime<-betweenFirst[seq(1,8*numberOfScans,8)]*16777216
+betweenSecond[seq(1,8*numberOfScans,8)]*65536
+betweenThird[seq(1,8*numberOfScans,8)]*256
+betweenFourth[seq(1,8*numberOfScans,8)]
scanTime<-round(scanTime/1000/60,4)
## extract main sequence, reverse them scan wise
mainSequence<-function(period,counts){
tempSequence<-NULL
currentPosition<-5
for(ii in counts){
tempSequence<-c(tempSequence,period[(currentPosition+ii-1):currentPosition])
currentPosition<-(currentPosition+ii+7)
}
return(tempSequence)
}
### extract main data for INT and MZ
cat('...extract intensity and Mz data...\n')
mainFirst<-mainSequence(firstPeriod,counts)
mainSecond<-mainSequence(secondPeriod,counts)
mainThird<-mainSequence(thirdPeriod,counts)
mainFourth<-mainSequence(fourthPeriod,counts)
### calculate MZs. This will result in the *real*, used MZs. For the case that the
### detector is switched off, zero values are included.
importMz<-round(mainFirst*12.8+mainSecond*0.05)
### calculate intensity values
cat('...calculate intensity values...\n')
mainFourth[which(floor(mainThird/64)==3)]<-(mainFourth[which(floor(mainThird/64)==3)]*512)
mainFourth[which(floor(mainThird/64)==2)]<-(mainFourth[which(floor(mainThird/64)==2)]*64)
mainFourth[which(floor(mainThird/64)==1)]<-(mainFourth[which(floor(mainThird/64)==1)]*8)
mainThird[which(floor(mainThird/64)==3)]<-((mainThird[which(floor(mainThird/64)==3)] %% 192)*131072)
mainThird[which(floor(mainThird/64)==2)]<-((mainThird[which(floor(mainThird/64)==2)] %% 128)*16384)
mainThird[which(floor(mainThird/64)==1)]<-((mainThird[which(floor(mainThird/64)==1)] %% 64)*2048)
mainThird[which(floor(mainThird/64)==0)]<-(mainThird[which(floor(mainThird/64)==0)]*256)
importInt<-(mainThird+mainFourth)
cat('...assembling data matrix...\n')
DATA<-matrix(rep(0,numberOfScans*max(importMz)),numberOfScans,max(importMz))
position<-1
for(ii in 1:numberOfScans){
for(jj in counts[ii]){
if(counts[ii]){
DATA[ii,importMz[position:(position+jj-1)]]<-importInt[position:(position+jj-1)]
position<-position+jj
}else{
position<-position+1
}
}
}
rownames(DATA)<-scanTime
colnames(DATA)<-seq(1:dim(DATA)[2])
#### Output is a matrix of ion counts with rows as scantime and columns as mass,
#### and the respective values as labels
return(DATA)
}
# To use the function, setwd to the dir with the DATA.MS, then,
my_data <- readDFile(getwd())
abundance <- rowSums(df)
# but all scan times are zeros!
# As this doesn't get the scan times for me, my workaround is to use Openchrom to open the Agilent DATA.MS files (with the plug-in), then 'save-as' CSV. The CSVs then need to be cleaned further before plots can be made. Or we can convert files batchwise to xy files (they are actually txt files, see attached).
# Step 1:
# Menu bar>File>Import > select "Convert MSD chromatograms to *.ocb format"
#
# Step 2:
# Menu bar>File>Import > select "Convert FID, ECD... chromatograms to *.xy or *.ocb format"
# then look for the xy file, which is a plain text, two column file
# read in CSV files from openchrom (assuming we have a batch)
the_csv_files_from_openchrom <- list.files(pattern = ".csv$")
the_chrom_data <- lapply(the_csv_files_from_openchrom, read.csv2)
# convert all cols to numeric
asNumeric <- function(x) as.numeric(as.character(x))
factorsNumeric <- function(d) modifyList(d, lapply(d[, sapply(d, is.factor)],
asNumeric))
the_chrom_data <- lapply(the_chrom_data, function(i) factorsNumeric(i))
# get only 'abundance' and 'time' for plotting
abundance_time <- lapply(the_chrom_data, function(i) data.frame(time = i$RT.minutes....NOT.USED.BY.IMPORT, abundance = rowSums(i[, 4:ncol(i)]) ))
# add sample names to the data
names(abundance_time) <- gsub(".csv", "", the_csv_files_from_openchrom)
indices <- lapply(abundance_time, nrow)
sample_ID <- unlist(lapply(seq_along(names(abundance_time)), function(i) rep(names(abundance_time)[i], indices[i])))
abundance_time <- do.call(rbind.data.frame, abundance_time)
abundance_time$sample_ID <- sample_ID
# make a grid of plots
library(ggplot2)
ggplot(abundance_time, aes(time, abundance)) +
geom_line() +
xlab("time (min)") +
facet_wrap(~sample_ID, scales = "free_y")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment