Skip to content

Instantly share code, notes, and snippets.

@lindenb
Created November 23, 2010 23:05
Show Gist options
  • Save lindenb/712722 to your computer and use it in GitHub Desktop.
Save lindenb/712722 to your computer and use it in GitHub Desktop.
import gov.nih.nlm.ncbi.snp.geno.Individual;
import gov.nih.nlm.ncbi.snp.geno.SnpInfo;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.zip.GZIPInputStream;
import javax.xml.bind.JAXBContext;
import javax.xml.bind.Unmarshaller;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;
//xjc -d . ftp://ftp.ncbi.nlm.nih.gov/snp/specs/genoex_1_5.xsd
public class Biostar3872
{
private void run() throws Exception
{
XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance();
xmlInputFactory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, Boolean.TRUE);
xmlInputFactory.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);
xmlInputFactory.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, Boolean.TRUE);
String chroms[]={
"1","2","3","4","5","6","7","8","9",
"10","11","12","13","14","15","16","17","18","19",
"AltOnly","MT","Multi","NotOn","Un","X","Y"};
JAXBContext jaxbCtxt=JAXBContext.newInstance(Individual.class,SnpInfo.class);
Unmarshaller unmarshaller=jaxbCtxt.createUnmarshaller();
for(String chrom:chroms)
{
List<String> headers=new ArrayList<String>();
HashMap<Integer, Integer> column2id=new HashMap<Integer, Integer>();
URL url=new URL("ftp://ftp.ncbi.nih.gov/snp/organisms/mouse_10090/genotype/gt_chr"+chrom+".xml.gz");
System.err.println("Parsing "+url);
XMLEventReader reader= xmlInputFactory.createXMLEventReader(new GZIPInputStream(url.openStream()));
boolean headerPrinted=false;
while(reader.hasNext())
{
XMLEvent evt=reader.peek();
if(!evt.isStartElement())
{
reader.nextEvent();
continue;
}
StartElement start=evt.asStartElement();
String localName=start.getName().getLocalPart();
if(localName.equals("Individual"))
{
Individual indi=unmarshaller.unmarshal(reader, Individual.class).getValue();
for(Individual.SourceInfo sourceInfo:indi.getSourceInfo())
{
column2id.put(headers.size(), indi.getIndId());
headers.add(sourceInfo.getIndId());
break;
}
}
else if(localName.equals("SnpInfo"))
{
SnpInfo snpInfo=unmarshaller.unmarshal(reader, SnpInfo.class).getValue();
HashMap<Integer, String> indId2genotypes=new HashMap<Integer, String>();
for(SnpInfo.SsInfo ssInfo:snpInfo.getSsInfo())
{
for(SnpInfo.SsInfo.ByPop byPop:ssInfo.getByPop())
{
for(SnpInfo.SsInfo.ByPop.GTypeByInd gtype:byPop.getGTypeByInd())
{
indId2genotypes.put(gtype.getIndId(), gtype.getGtype());
}
}
}
if(!headerPrinted)
{
headerPrinted=true;
System.out.print("rs##\tPosition");
for(String s:headers)
{
System.out.print("\t"+s);
}
System.out.println();
}
System.out.print("rs"+snpInfo.getRsId());
System.out.print("\t");
for(SnpInfo.SnpLoc loc:snpInfo.getSnpLoc())
{
System.out.print("chrom"+loc.getChrom()+":"+loc.getStart()+"("+loc.getGenomicAssembly()+");");
}
for(int i=0;i< headers.size();++i)
{
Integer indiId=column2id.get(i);
String gtype=indId2genotypes.get(indiId);
if(gtype==null) gtype="#NA";
System.out.print("\t"+gtype);
}
System.out.println();
}
else
{
//flush event
reader.nextEvent();
}
}
reader.close();
}
}
public static void main(String[] args)
{
try
{
new Biostar3872().run();
}
catch (Exception e)
{
e.printStackTrace();
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment