Skip to content

Instantly share code, notes, and snippets.

@ktym
Created January 24, 2012 19:39
Show Gist options
  • Save ktym/1672112 to your computer and use it in GitHub Desktop.
Save ktym/1672112 to your computer and use it in GitHub Desktop.
MIRIAM XML to CSV
#!/usr/bin/env ruby
#
# Convert MIRIAM Registry XML file (http://www.ebi.ac.uk/miriam/main/export/) to CSV
# for browsing at TogoDB (http://semantic.togodb.dbcls.jp/togodb/view/miriam)
#
# Copyright (C) 2012 Toshiaki Katayama <[email protected]>
#
# Prerequisites and usage:
# % curl http://www.ebi.ac.uk/miriam/main/export/xml/ > resources_all.xml
# % gem install nokogiri
# % miriam-xml2csv.rb resources_all.xml > resources_all.csv
#
require 'rubygems'
require 'nokogiri'
require 'csv'
# Parse the XML read from ARGF, i.e. the files named on the command line
# or standard input when no file is given.
xml = Nokogiri::XML(ARGF)
# Namespace prefix => URI mapping reported by the parsed document. It is passed
# to every XPath query below so the "xmlns:" prefix in the paths can be resolved.
ns = xml.namespaces
# Column titles of the CSV output, in output order. The first twelve titles
# label values taken from a <datatype> element and the remaining nine label
# values taken from one of its <resource> elements; the order has to match
# the row array assembled for each resource further down.
header = %w[
  MIRIAM Namespace Pattern Name Synonyms Definition
  URNs URIs Tags SBML PMIDs URLs
  Resource State Reliability URL Link Example Info Institution Location
]

# Emit the header row before any data rows.
header_line = CSV.generate_line(header)
puts header_line
# Returns the text content of the first element under +node+ matched by the
# selector +name+, or nil when no such element exists. CSV.generate_line
# writes nil as an empty field, so a missing element yields an empty cell
# instead of aborting the export with NoMethodError.
# NOTE(review): whether the MIRIAM export ever omits these elements is not
# visible here -- the guard is defensive.
child_text = lambda do |node, name|
  element = node.at(name)
  element && element.content
end

# Returns the text of every node under +node+ matched by the XPath +path+,
# resolved against the document's namespace map.
texts = lambda do |node, path|
  node.xpath(path, ns).map(&:text)
end

# Writes one CSV row per <resource> of every non-obsolete <datatype>.
xml.xpath('//xmlns:datatype', ns).each do |datatype|
  next if datatype["obsolete"] == "true"

  # Everything below depends only on the datatype, so it is looked up once
  # here rather than once per resource.
  deprecated = texts.call(datatype, './/xmlns:uris/xmlns:uri[@deprecated="true"]')
  # URN/URL identifiers, minus any text that is also listed as deprecated.
  urns = texts.call(datatype, './/xmlns:uris/xmlns:uri[@type="URN"]') - deprecated
  uris = texts.call(datatype, './/xmlns:uris/xmlns:uri[@type="URL"]') - deprecated
  synonyms = texts.call(datatype, './/xmlns:synonyms/xmlns:synonym')
  tags = texts.call(datatype, './/xmlns:tags/xmlns:tag')
  sbml = texts.call(
    datatype,
    './/xmlns:annotation/xmlns:format[@name="SBML"]/xmlns:elements/xmlns:element'
  )
  # Keep only the first run of digits of each PMID documentation entry
  # (nil, rendered as an empty string by join, when there are no digits).
  pmids = texts.call(datatype, './/xmlns:documentations/xmlns:documentation[@type="PMID"]').
    map { |text| text[/\d+/] }
  urls = texts.call(datatype, './/xmlns:documentations/xmlns:documentation[@type="URL"]')

  # The twelve leading columns, shared by all rows of this datatype.
  datatype_columns = [
    datatype["id"],
    child_text.call(datatype, 'namespace'),
    datatype["pattern"],
    child_text.call(datatype, 'name'),
    synonyms.join("; "),
    child_text.call(datatype, 'definition'),
    urns.join("\n"),
    uris.join("\n"),
    tags.join("; "),
    sbml.join("; "),
    pmids.join("\n"),
    urls.join("\n"),
  ]

  datatype.xpath('.//xmlns:resources/xmlns:resource', ns).each do |res|
    # The nine trailing columns, specific to this resource.
    resource_columns = [
      res["id"],
      res["state"],
      res["reliability"],
      child_text.call(res, 'dataResource'),
      child_text.call(res, 'dataEntry'),
      child_text.call(res, 'dataEntityExample'),
      child_text.call(res, 'dataInfo'),
      child_text.call(res, 'dataInstitution'),
      child_text.call(res, 'dataLocation'),
    ]
    puts CSV.generate_line(datatype_columns + resource_columns)
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment