Skip to content

Instantly share code, notes, and snippets.

@rmflight
Last active July 5, 2016 13:54
Show Gist options
  • Select an option

  • Save rmflight/5a3d23aaef168b54770d6505e39c0b10 to your computer and use it in GitHub Desktop.

Select an option

Save rmflight/5a3d23aaef168b54770d6505e39c0b10 to your computer and use it in GitHub Desktop.
{
"mzML": {
"schemaLocation": "http://psi.hupo.org/ms/mzml http://psidev.info/files/ms/mzML/xsd/mzML1.1.0.xsd",
"id": "exampleData",
"version": "1.1.0"
},
"cvList": {
"cv": [
{
"id": "MS",
"fullName": "Proteomics Standards Initiative Mass Spectrometry Ontology",
"version": "3.79.0",
"URI": "http://psidev.cvs.sourceforge.net/*checkout*/psidev/psi/psi-ms/mzML/controlledVocabulary/psi-ms.obo"
}
],
"cv.1": [
{
"id": "UO",
"fullName": "Unit Ontology",
"version": "12:10:2011",
"URI": "http://obo.cvs.sourceforge.net/*checkout*/obo/obo/ontology/phenotype/unit.obo"
}
],
"count": "2"
},
"fileDescription": {
"fileContent": {
"cvParam": [
{
"cvRef": "MS",
"accession": "MS:1000579",
"name": "MS1 spectrum",
"value": ""
}
],
"cvParam.1": [
{
"cvRef": "MS",
"accession": "MS:1000580",
"name": "MSn spectrum",
"value": ""
}
]
},
"sourceFileList": {
"sourceFile": {
"cvParam": [
{
"cvRef": "MS",
"accession": "MS:1000768",
"name": "Thermo nativeID format",
"value": ""
}
],
"cvParam.1": [
{
"cvRef": "MS",
"accession": "MS:1000563",
"name": "Thermo RAW format",
"value": ""
}
],
"cvParam.2": [
{
"cvRef": "MS",
"accession": "MS:1000569",
"name": "SHA-1",
"value": "6679ba84f57e8f25a3b8ebecc806ecafc79492ec"
}
],
"id": "RAW1",
"name": "UK001N1exoposb.raw",
"location": "file:///"
},
"count": "1"
}
},
"referenceableParamGroupList": {
"referenceableParamGroup": {
"cvParam": [
{
"cvRef": "MS",
"accession": "MS:1002416",
"name": "Orbitrap Fusion",
"value": ""
}
],
"cvParam.1": [
{
"cvRef": "MS",
"accession": "MS:1000529",
"name": "instrument serial number",
"value": "FSN10352"
}
],
"id": "CommonInstrumentParams"
},
"count": "1"
},
"softwareList": {
"software": {
"cvParam": [
{
"cvRef": "MS",
"accession": "MS:1000532",
"name": "Xcalibur",
"value": ""
}
],
"id": "Xcalibur",
"version": "1.1.982"
},
"software.1": {
"cvParam": [
{
"cvRef": "MS",
"accession": "MS:1000615",
"name": "ProteoWizard software",
"value": ""
}
],
"id": "pwiz",
"version": "3.0.9205"
},
"count": "2"
},
"instrumentConfigurationList": {
"instrumentConfiguration": {
"referenceableParamGroupRef": [
{
"ref": "CommonInstrumentParams"
}
],
"componentList": {
"source": {
"cvParam": [
{
"cvRef": "MS",
"accession": "MS:1000398",
"name": "nanoelectrospray",
"value": ""
}
],
"cvParam.1": [
{
"cvRef": "MS",
"accession": "MS:1000485",
"name": "nanospray inlet",
"value": ""
}
],
"order": "1"
},
"analyzer": {
"cvParam": [
{
"cvRef": "MS",
"accession": "MS:1000081",
"name": "quadrupole",
"value": ""
}
],
"order": "2"
},
"analyzer.1": {
"cvParam": [
{
"cvRef": "MS",
"accession": "MS:1000484",
"name": "orbitrap",
"value": ""
}
],
"order": "3"
},
"detector": {
"cvParam": [
{
"cvRef": "MS",
"accession": "MS:1000624",
"name": "inductive detector",
"value": ""
}
],
"order": "4"
},
"count": "4"
},
"softwareRef": [
{
"ref": "Xcalibur"
}
],
"id": "IC1"
},
"instrumentConfiguration.1": {
"referenceableParamGroupRef": [
{
"ref": "CommonInstrumentParams"
}
],
"componentList": {
"source": {
"cvParam": [
{
"cvRef": "MS",
"accession": "MS:1000398",
"name": "nanoelectrospray",
"value": ""
}
],
"cvParam.1": [
{
"cvRef": "MS",
"accession": "MS:1000485",
"name": "nanospray inlet",
"value": ""
}
],
"order": "1"
},
"analyzer": {
"cvParam": [
{
"cvRef": "MS",
"accession": "MS:1000081",
"name": "quadrupole",
"value": ""
}
],
"order": "2"
},
"analyzer.1": {
"cvParam": [
{
"cvRef": "MS",
"accession": "MS:1000083",
"name": "radial ejection linear ion trap",
"value": ""
}
],
"order": "3"
},
"detector": {
"cvParam": [
{
"cvRef": "MS",
"accession": "MS:1000253",
"name": "electron multiplier",
"value": ""
}
],
"order": "4"
},
"count": "4"
},
"softwareRef": [
{
"ref": "Xcalibur"
}
],
"id": "IC2"
},
"count": "2"
},
"dataProcessingList": {
"dataProcessing": {
"processingMethod": {
"cvParam": [
{
"cvRef": "MS",
"accession": "MS:1000544",
"name": "Conversion to mzML",
"value": ""
}
],
"order": "0",
"softwareRef": "pwiz"
},
"id": "pwiz_Reader_Thermo_conversion"
},
"count": "1"
},
"run": {
"id": "exampleData",
"defaultInstrumentConfigurationRef": "IC1",
"startTimeStamp": "2015-07-29 12:49:35Z",
"defaultSourceFileRef": "RAW1",
"scanPolarity": "positive"
}
}
<?xml version="1.0" encoding="utf-8"?>
<indexedmzML xmlns="http://psi.hupo.org/ms/mzml" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://psi.hupo.org/ms/mzml http://psidev.info/files/ms/mzML/xsd/mzML1.1.2_idx.xsd">
<mzML xmlns="http://psi.hupo.org/ms/mzml" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://psi.hupo.org/ms/mzml http://psidev.info/files/ms/mzML/xsd/mzML1.1.0.xsd" id="exampleData" version="1.1.0">
<cvList count="2">
<cv id="MS" fullName="Proteomics Standards Initiative Mass Spectrometry Ontology" version="3.79.0" URI="http://psidev.cvs.sourceforge.net/*checkout*/psidev/psi/psi-ms/mzML/controlledVocabulary/psi-ms.obo"/>
<cv id="UO" fullName="Unit Ontology" version="12:10:2011" URI="http://obo.cvs.sourceforge.net/*checkout*/obo/obo/ontology/phenotype/unit.obo"/>
</cvList>
<fileDescription>
<fileContent>
<cvParam cvRef="MS" accession="MS:1000579" name="MS1 spectrum" value=""/>
<cvParam cvRef="MS" accession="MS:1000580" name="MSn spectrum" value=""/>
</fileContent>
<sourceFileList count="1">
<sourceFile id="RAW1" name="UK001N1exoposb.raw" location="file:///">
<cvParam cvRef="MS" accession="MS:1000768" name="Thermo nativeID format" value=""/>
<cvParam cvRef="MS" accession="MS:1000563" name="Thermo RAW format" value=""/>
<cvParam cvRef="MS" accession="MS:1000569" name="SHA-1" value="6679ba84f57e8f25a3b8ebecc806ecafc79492ec"/>
</sourceFile>
</sourceFileList>
</fileDescription>
<referenceableParamGroupList count="1">
<referenceableParamGroup id="CommonInstrumentParams">
<cvParam cvRef="MS" accession="MS:1002416" name="Orbitrap Fusion" value=""/>
<cvParam cvRef="MS" accession="MS:1000529" name="instrument serial number" value="FSN10352"/>
</referenceableParamGroup>
</referenceableParamGroupList>
<softwareList count="2">
<software id="Xcalibur" version="1.1.982">
<cvParam cvRef="MS" accession="MS:1000532" name="Xcalibur" value=""/>
</software>
<software id="pwiz" version="3.0.9205">
<cvParam cvRef="MS" accession="MS:1000615" name="ProteoWizard software" value=""/>
</software>
</softwareList>
<instrumentConfigurationList count="2">
<instrumentConfiguration id="IC1">
<referenceableParamGroupRef ref="CommonInstrumentParams"/>
<componentList count="4">
<source order="1">
<cvParam cvRef="MS" accession="MS:1000398" name="nanoelectrospray" value=""/>
<cvParam cvRef="MS" accession="MS:1000485" name="nanospray inlet" value=""/>
</source>
<analyzer order="2">
<cvParam cvRef="MS" accession="MS:1000081" name="quadrupole" value=""/>
</analyzer>
<analyzer order="3">
<cvParam cvRef="MS" accession="MS:1000484" name="orbitrap" value=""/>
</analyzer>
<detector order="4">
<cvParam cvRef="MS" accession="MS:1000624" name="inductive detector" value=""/>
</detector>
</componentList>
<softwareRef ref="Xcalibur"/>
</instrumentConfiguration>
<instrumentConfiguration id="IC2">
<referenceableParamGroupRef ref="CommonInstrumentParams"/>
<componentList count="4">
<source order="1">
<cvParam cvRef="MS" accession="MS:1000398" name="nanoelectrospray" value=""/>
<cvParam cvRef="MS" accession="MS:1000485" name="nanospray inlet" value=""/>
</source>
<analyzer order="2">
<cvParam cvRef="MS" accession="MS:1000081" name="quadrupole" value=""/>
</analyzer>
<analyzer order="3">
<cvParam cvRef="MS" accession="MS:1000083" name="radial ejection linear ion trap" value=""/>
</analyzer>
<detector order="4">
<cvParam cvRef="MS" accession="MS:1000253" name="electron multiplier" value=""/>
</detector>
</componentList>
<softwareRef ref="Xcalibur"/>
</instrumentConfiguration>
</instrumentConfigurationList>
<dataProcessingList count="1">
<dataProcessing id="pwiz_Reader_Thermo_conversion">
<processingMethod order="0" softwareRef="pwiz">
<cvParam cvRef="MS" accession="MS:1000544" name="Conversion to mzML" value=""/>
</processingMethod>
</dataProcessing>
</dataProcessingList>
<run id="exampleData" defaultInstrumentConfigurationRef="IC1" startTimeStamp="2015-07-29T12:49:35Z" defaultSourceFileRef="RAW1">
<spectrumList count="36" defaultDataProcessingRef="pwiz_Reader_Thermo_conversion">
<spectrum index="0" id="controllerType=0 controllerNumber=1 scan=3" defaultArrayLength="34573">
<cvParam cvRef="MS" accession="MS:1000579" name="MS1 spectrum" value=""/>
<cvParam cvRef="MS" accession="MS:1000511" name="ms level" value="1"/>
<cvParam cvRef="MS" accession="MS:1000130" name="positive scan" value=""/>
<cvParam cvRef="MS" accession="MS:1000128" name="profile spectrum" value=""/>
<cvParam cvRef="MS" accession="MS:1000504" name="base peak m/z" value="432.239837646484" unitCvRef="MS" unitAccession="MS:1000040" unitName="m/z"/>
<cvParam cvRef="MS" accession="MS:1000505" name="base peak intensity" value="3.8174825e06" unitCvRef="MS" unitAccession="MS:1000131" unitName="number of detector counts"/>
<cvParam cvRef="MS" accession="MS:1000285" name="total ion current" value="2.0373532e07"/>
<cvParam cvRef="MS" accession="MS:1000528" name="lowest observed m/z" value="148.509396632102" unitCvRef="MS" unitAccession="MS:1000040" unitName="m/z"/>
<cvParam cvRef="MS" accession="MS:1000527" name="highest observed m/z" value="1616.112961912861" unitCvRef="MS" unitAccession="MS:1000040" unitName="m/z"/>
<scanList count="1">
<cvParam cvRef="MS" accession="MS:1000795" name="no combination" value=""/>
</scanList>
</spectrum>
</spectrumList>
</run>
</mzML>
</indexedmzML>
library(XML)
library(jsonlite)
# Driver script: extract the metadata of one mzML file and save it as JSON.
# Load the helper definitions; presumably this file provides
# get_mzml_metadata() and meta_export_json() used below -- TODO confirm.
source("R/file_metadata.R")
# Path of the mzML file to read (relative to the working directory).
input_file <- "input.mzML"
# Collect the metadata of the file.
input_list <- get_mzml_metadata(input_file)
# Convert the collected metadata to JSON.
input_json <- meta_export_json(input_list)
# Write the JSON to disk; cat() does not append a trailing newline.
cat(input_json, file = "input.json")
#' get mzML metadata
#'
#' Parses an mzML file and gathers its file-level metadata: the attributes of
#' the `mzML` element, the descriptive sections listed below (those that are
#' present in the file), and the attributes of the `run` element together with
#' an overall scan polarity. Both indexed documents (`mzML` wrapped in
#' `indexedmzML`) and plain documents (`mzML` as the root) are accepted.
#'
#' @param mzml_file the mzML file to get metadata from
#'
#' @return a nested list with an `mzML` entry, one entry per metadata section
#'   found in the file, and a `run` entry that additionally carries
#'   `scanPolarity`
#'
#' @import XML
#' @export
get_mzml_metadata <- function(mzml_file){
  xml_doc <- xmlTreeParse(mzml_file, useInternalNodes = TRUE)
  ns <- xmlNamespaceDefinitions(xmlRoot(xml_doc), recursive = TRUE, simplify = TRUE)
  # give the first namespace definition a prefix that can be used in XPath
  names(ns)[1] <- "d1"

  # the mzML element is either nested in indexedmzML or is the root itself
  mz_metanodes <- getNodeSet(xml_doc, "/d1:indexedmzML/d1:mzML | /d1:mzML", ns)
  if (length(mz_metanodes) == 0) {
    stop("no mzML element found in: ", mzml_file, call. = FALSE)
  }
  mzml_node <- mz_metanodes[[1]]

  # attributes of the mzML element, without the namespace bookkeeping
  tmp_attr <- unclass(xmlAttrs(mzml_node))
  attr(tmp_attr, "namespaces") <- NULL
  mz_meta <- list(mzML = list(.attrs = tmp_attr))

  other_nodes_2_get <- c("cvList", "fileDescription",
                         "referenceableParamGroupList",
                         "softwareList",
                         "instrumentConfigurationList",
                         "dataProcessingList")
  other_nodes <- xmlChildren(mzml_node)
  other_list <- lapply(other_nodes, xmlToList)

  # keep only the sections that exist in this file: indexing a list by a
  # missing name would add a NULL entry with an NA name
  present_nodes <- intersect(other_nodes_2_get, names(other_list))
  mz_meta <- c(mz_meta, other_list[present_nodes])

  if (!("run" %in% names(other_nodes))) {
    stop("no run element found in: ", mzml_file, call. = FALSE)
  }
  mz_meta[["run"]] <- list(.attrs = xmlAttrs(mzml_node[["run"]]))

  mz_meta <- .remove_attrs(mz_meta)
  mz_meta_frame <- .to_data_frame(mz_meta)

  # exact-name extraction ([[ ]]) avoids the partial matching done by $
  spectrum_list <- other_list[["run"]][["spectrumList"]]
  mz_meta_frame[["run"]][["scanPolarity"]] <- .get_scan_polarity(spectrum_list)

  # startTimeStamp may be absent; only reformat it when it is there, replacing
  # the date/time separator "T" with a space
  start_time <- mz_meta_frame[["run"]][["startTimeStamp"]]
  if (!is.null(start_time)) {
    mz_meta_frame[["run"]][["startTimeStamp"]] <- sub("T", " ", start_time, fixed = TRUE)
  }

  mz_meta_frame
}
#' serialise metadata as JSON
#'
#' Turns a metadata list into pretty-printed JSON text. Length-one vectors are
#' written as scalars instead of one-element arrays (`auto_unbox`).
#'
#' @param meta_list a list of metadata
#'
#' @return the JSON produced by `toJSON`
#'
#' @importFrom jsonlite toJSON
#' @export
meta_export_json <- function(meta_list){
  json_text <- toJSON(meta_list, auto_unbox = TRUE, pretty = TRUE)
  json_text
}
#' transform to data frame
#'
#' Recursively walks a nested list and replaces every *named* character vector
#' by a one-row data frame with one column per name. Unnamed character vectors
#' and any other value (for example `NULL` or numbers) are returned unchanged.
#'
#' @param in_list the list of xml nodes to work on
#'
#' @return the same structure, with named character vectors converted to
#'   one-row data frames
#'
.to_data_frame <- function(in_list){
  # inherits() gives a single TRUE/FALSE even for objects with several classes
  if (inherits(in_list, "list")) {
    lapply(in_list, .to_data_frame)
  } else if (is.character(in_list) && !is.null(names(in_list))) {
    # transpose so that each name becomes a column; keep the values as strings
    as.data.frame(t(as.matrix(in_list)), stringsAsFactors = FALSE)
  } else {
    # nothing to convert: hand the value back as it is
    in_list
  }
}
#' remove attributes
#'
#' Recursively looks for list entries called ".attrs" and promotes their named
#' values to first level partners of the other entries, dropping the ".attrs"
#' entry afterwards. If any attribute name is already used by an entry of the
#' same list, that ".attrs" entry is left untouched. Anything that is not a
#' list is returned unchanged.
#'
#' @param in_list the list to work on
#'
#' @return the list with ".attrs" entries promoted wherever that is possible
#'
.remove_attrs <- function(in_list){
  if (!inherits(in_list, "list")) {
    return(in_list)
  }

  out_list <- in_list
  list_names <- names(out_list)
  if (".attrs" %in% list_names) {
    tmp_attrs <- out_list[[".attrs"]]
    name_attrs <- names(tmp_attrs)
    # only promote when no attribute would overwrite an existing entry
    if (!any(name_attrs %in% list_names)) {
      for (i_name in name_attrs) {
        out_list[[i_name]] <- tmp_attrs[[i_name]]
      }
      out_list[[".attrs"]] <- NULL
    }
  }

  # descend exactly once into every entry; processing the children a second
  # time gives the same result but doubles the work at each nesting level
  lapply(out_list, .remove_attrs)
}
#' get scan polarity
#'
#' Summarises the polarity of a run from its spectra. For every spectrum the
#' "cvParam" values containing the text "scan" are collected. If all spectra
#' give the same result and it mentions "positive" or "negative", that polarity
#' is reported; in every other case the run is called "mixed".
#'
#' @param spectrum_list the list of spectra
#'
#' @return one of "positive", "negative" or "mixed"
#'
.get_scan_polarity <- function(spectrum_list){
  # the attributes of the list itself are not a spectrum
  spectrum_list[[".attrs"]] <- NULL

  per_spectrum <- lapply(spectrum_list, function(in_spectrum){
    is_cv <- names(in_spectrum) %in% "cvParam"
    grep("scan", unlist(in_spectrum[is_cv]), value = TRUE)
  })
  distinct_polarity <- as.character(unique(per_spectrum))

  # anything that is not one single, recognisable polarity counts as mixed
  polarity <- "mixed"
  if (length(distinct_polarity) == 1) {
    if (grepl("positive", distinct_polarity)) {
      polarity <- "positive"
    } else if (grepl("negative", distinct_polarity)) {
      polarity <- "negative"
    }
  }
  polarity
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment