Last active
June 17, 2021 09:05
-
-
Save CholoTook/60968e3ab6d90cb8fd19be55a25592f1 to your computer and use it in GitHub Desktop.
For some reason the people at GEO couldn't answer basic questions about data formats... GEO provides data in various format, but it isn't clear which formats are used by which tools ... </bitch> The following code snippet shows how to convert GEOs tabular data format for SNP-Chip genotype data into Plink .ped/.bed formats (with associated .bim/.…
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import math | |
import GEOparse | |
import pandas_plink | |
import xarray | |
# The genotypes are defined as 'dosages' by pandas-plink, so we are | |
# required to use floating point numbers in the encoding here. | |
genotype_encoding = { | |
'AA': 0.0, | |
'AB': 1.0, | |
'BB': 2.0, | |
'NC': math.nan | |
} | |
def get_gpl(gpl_accession='GPL17481'): | |
gpl = GEOparse.get_GEO(gpl_accession, | |
destdir='./Data/GEO/') | |
# Add explicit alleles to the dataframe using the SNP string | |
gpl.table['a0'] = [ x[1] for x in gpl.table.SNP ] | |
gpl.table['a1'] = [ x[3] for x in gpl.table.SNP ] | |
return gpl | |
def geo_gsm_to_plink(gsm_accession, gpl): | |
gsm = GEOparse.get_GEO(gsm_accession, | |
destdir='./Data/GEO/Samples/') | |
# Merge the sample and the platform data table for consistency | |
m = gsm.table.merge(gpl.table, left_on='ID_REF', right_on='ID') | |
genotype_column = 'VALUE' | |
# Some series in GEO use different column names for genotype data. | |
# TODO: I should use gsm.description to find it robustly. | |
if "GSE55134" in gsm.metadata['series_id']: | |
genotype_column = 'GType' | |
# Recode genotypes into Plink format | |
m["plink_genotype"] = [ | |
genotype_encoding[x] for x in m[genotype_column] | |
] | |
# Carefully build a DataArray object to pass into pandas-plink for writing | |
genotype_data = xarray.DataArray( | |
data = [m["plink_genotype"]], | |
dims = ["sample", "variant"], | |
coords = dict( | |
iid = ("sample", [gsm_accession]), | |
snp = ("variant", m.ID), | |
chrom = ("variant", m.CHROMOSOME), | |
pos = ("variant", m.Position), | |
a0 = ("variant", m.a0), | |
a1 = ("variant", m.a1) | |
) | |
) | |
# Write out Plink format files | |
pandas_plink.write_plink1_bin( | |
genotype_data, f"Data/PlinkFormat/Samples/{gsm_accession}.bed" | |
) | |
# On the command line, use plink --recode to get PED or VCF files! | |
if __name__ == '__main__': | |
geo_gsm_to_plink('GSM1330165', get_gpl()) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment