Created
November 23, 2010 23:05
-
-
Save lindenb/712722 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import gov.nih.nlm.ncbi.snp.geno.Individual; | |
import gov.nih.nlm.ncbi.snp.geno.SnpInfo; | |
import java.net.URL; | |
import java.util.ArrayList; | |
import java.util.HashMap; | |
import java.util.List; | |
import java.util.zip.GZIPInputStream; | |
import javax.xml.bind.JAXBContext; | |
import javax.xml.bind.Unmarshaller; | |
import javax.xml.stream.XMLEventReader; | |
import javax.xml.stream.XMLInputFactory; | |
import javax.xml.stream.events.StartElement; | |
import javax.xml.stream.events.XMLEvent; | |
//xjc -d . ftp://ftp.ncbi.nlm.nih.gov/snp/specs/genoex_1_5.xsd | |
public class Biostar3872 | |
{ | |
private void run() throws Exception | |
{ | |
XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance(); | |
xmlInputFactory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, Boolean.TRUE); | |
xmlInputFactory.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE); | |
xmlInputFactory.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, Boolean.TRUE); | |
String chroms[]={ | |
"1","2","3","4","5","6","7","8","9", | |
"10","11","12","13","14","15","16","17","18","19", | |
"AltOnly","MT","Multi","NotOn","Un","X","Y"}; | |
JAXBContext jaxbCtxt=JAXBContext.newInstance(Individual.class,SnpInfo.class); | |
Unmarshaller unmarshaller=jaxbCtxt.createUnmarshaller(); | |
for(String chrom:chroms) | |
{ | |
List<String> headers=new ArrayList<String>(); | |
HashMap<Integer, Integer> column2id=new HashMap<Integer, Integer>(); | |
URL url=new URL("ftp://ftp.ncbi.nih.gov/snp/organisms/mouse_10090/genotype/gt_chr"+chrom+".xml.gz"); | |
System.err.println("Parsing "+url); | |
XMLEventReader reader= xmlInputFactory.createXMLEventReader(new GZIPInputStream(url.openStream())); | |
boolean headerPrinted=false; | |
while(reader.hasNext()) | |
{ | |
XMLEvent evt=reader.peek(); | |
if(!evt.isStartElement()) | |
{ | |
reader.nextEvent(); | |
continue; | |
} | |
StartElement start=evt.asStartElement(); | |
String localName=start.getName().getLocalPart(); | |
if(localName.equals("Individual")) | |
{ | |
Individual indi=unmarshaller.unmarshal(reader, Individual.class).getValue(); | |
for(Individual.SourceInfo sourceInfo:indi.getSourceInfo()) | |
{ | |
column2id.put(headers.size(), indi.getIndId()); | |
headers.add(sourceInfo.getIndId()); | |
break; | |
} | |
} | |
else if(localName.equals("SnpInfo")) | |
{ | |
SnpInfo snpInfo=unmarshaller.unmarshal(reader, SnpInfo.class).getValue(); | |
HashMap<Integer, String> indId2genotypes=new HashMap<Integer, String>(); | |
for(SnpInfo.SsInfo ssInfo:snpInfo.getSsInfo()) | |
{ | |
for(SnpInfo.SsInfo.ByPop byPop:ssInfo.getByPop()) | |
{ | |
for(SnpInfo.SsInfo.ByPop.GTypeByInd gtype:byPop.getGTypeByInd()) | |
{ | |
indId2genotypes.put(gtype.getIndId(), gtype.getGtype()); | |
} | |
} | |
} | |
if(!headerPrinted) | |
{ | |
headerPrinted=true; | |
System.out.print("rs##\tPosition"); | |
for(String s:headers) | |
{ | |
System.out.print("\t"+s); | |
} | |
System.out.println(); | |
} | |
System.out.print("rs"+snpInfo.getRsId()); | |
System.out.print("\t"); | |
for(SnpInfo.SnpLoc loc:snpInfo.getSnpLoc()) | |
{ | |
System.out.print("chrom"+loc.getChrom()+":"+loc.getStart()+"("+loc.getGenomicAssembly()+");"); | |
} | |
for(int i=0;i< headers.size();++i) | |
{ | |
Integer indiId=column2id.get(i); | |
String gtype=indId2genotypes.get(indiId); | |
if(gtype==null) gtype="#NA"; | |
System.out.print("\t"+gtype); | |
} | |
System.out.println(); | |
} | |
else | |
{ | |
//flush event | |
reader.nextEvent(); | |
} | |
} | |
reader.close(); | |
} | |
} | |
public static void main(String[] args) | |
{ | |
try | |
{ | |
new Biostar3872().run(); | |
} | |
catch (Exception e) | |
{ | |
e.printStackTrace(); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment