Skip to content

Instantly share code, notes, and snippets.

@DASpringate
Created June 14, 2012 09:52
Show Gist options
  • Save DASpringate/2929366 to your computer and use it in GitHub Desktop.
Save DASpringate/2929366 to your computer and use it in GitHub Desktop.
Extracts metadata from scraped GOLD genome database files into a single flatfile
#!/usr/bin/Rscript
# Extracts a variety of organism, environmental, sequencing and project metadata
# from saved GOLDstamp files (genomesonline.org)
# and saves it as a flatfile with one taxon per line
# See gist.github.com/2929217 to scrape the files from a treebase taxa list
# library() stops with an error if XML is not installed; require() would only
# return FALSE and let the script fail later with a less helpful message.
library(XML)

# NOTE: the original script also downloaded the TreeBASE taxa page into an
# unused `taxa.table`; that dead network request has been dropped.

# Where each output column lives in a saved GOLD page.
# The position in this list is the index of the HTML table on the page
# (as returned by readHTMLTable); each element maps an output column name to
# the row of that table holding the value. Values are always read from the
# third column of the table.
field.rows <- list(
  c(organism.name = 4, common.name = 5, phylum = 3, genus = 7, species = 9,
    strain = 11, collection = 13),
  c(GOLDstamp = 2, seq.date = 11),
  c(ncbi.id = 2, gcat.id = 6, straininfo.id = 9, greengenes.id = 10,
    img.object.id = 11, genome.data = 13),
  c(size.kb = 12, orfs = 13, chromosomes = 14, GC.content = 16),
  c(isolation.site = 2, collection.date = 5),
  c(host.name = 2),
  c(cell.shape = 2, motility = 3, temp.range = 6, salinity = 7, pH = 8,
    cell.diameter = 9, gram.stain = 12, habit = 13, symbiotic.inter = 14,
    symbiosis = 15, symbiont.name = 16, symbiont.id = 17,
    cell.arrangemnt = 18, disease = 19, habitat = 20, temperature = 21,
    metabolism = 22, phenotype = 23, energy.source = 24)
)
# Output column names, in output order (41 fields).
field.names <- unlist(lapply(field.rows, names))

# Extract the metadata fields from one saved GOLD HTML file.
#   path: path to the saved HTML file.
# Returns an unnamed character vector with one entry per element of
# `field.names`; cells holding a single space become NA.
# Stops with an error naming `path` if the file cannot be parsed or does not
# contain the expected tables.
extract.taxon <- function(path) {
  tryCatch({
    tables <- readHTMLTable(path)
    values <- unlist(lapply(seq_along(field.rows), function(i) {
      # as.character() so factor columns yield their labels, not level codes
      as.character(tables[[i]][field.rows[[i]], 3])
    }))
    # %in% never returns NA, so already-missing cells are left untouched
    values[values %in% " "] <- NA
    values
  }, error = function(e) {
    stop("Failed to extract metadata from ", path, ": ",
         conditionMessage(e), call. = FALSE)
  })
}

# Only files whose name ends in ".html" (anchored, literal dot).
taxa <- list.files(pattern = "\\.html$")

# One row per file, one column per field. vapply() returns a
# fields x files matrix (fields x 0 when there are no files), hence t().
taxa.matrix <- t(vapply(taxa, extract.taxon,
                        character(length(field.names)),
                        USE.NAMES = FALSE))
colnames(taxa.matrix) <- field.names

write.table(taxa.matrix, file = "treebase_GOLD_metadata.csv",
            quote = TRUE, row.names = FALSE, sep = ";")
print("Done.")
# to read back in - don't forget it's semicolon delimited:
#df <- read.table(file = "treebase_GOLD_metadata.csv",
# header = TRUE, sep = ";")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment