Created
September 11, 2018 15:45
-
-
Save isaacovercast/01a3c76ccdcd197bee0bac6524ab0081 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def dadi_to_momi(infile, outfile=None, verbose=False): | |
if outfile == None: | |
outfile = infile.split(".sfs")[0] + "_momi.sfs" | |
dat = open(infile).readlines() | |
## Get rid of comment lines | |
dat = [x.strip() for x in dat if not x.startswith("#")] | |
if not len(dat) == 3: | |
raise Exception("Malformed dadi sfs {}.\n Must have 3 lines, yours has {}".format(infile, len(dat))) | |
## Parse the info line into nsamps per pop (list of ints), folding flag, and pop names list (if they are given) | |
info = dat[0].split() | |
nsamps = [] | |
## Keep carving off values as long as they cast successfully as int | |
for i in info: | |
try: | |
nsamps.append(int(i)) | |
except: | |
pass | |
nsamps = np.array(nsamps) | |
pops = [x.replace('"', '') for x in info[len(nsamps)+1:]] | |
folded = info[len(nsamps)] | |
folded = False if "un" in folded else True | |
if not len(pops) == len(nsamps): | |
print("Number of populations doesn't agree with number of pop names, using generic names.") | |
pops = ["pop"+x for x in range(len(nsamps))] | |
if verbose: print("Info nsamps={} folded={} pops={}".format(nsamps, folded, pops)) | |
## Get mask | |
mask = list(map(int, dat[2].split())) | |
if verbose: print(mask) | |
## Get sfs, and reshape based on sample sizes | |
sfs = list(map(float, dat[1].split())) | |
if verbose: print(sfs) | |
length = np.ma.array(sfs, mask=mask).sum() | |
sfs = np.array(sfs).reshape(nsamps) | |
if verbose: print("length {}".format(length)) | |
if verbose: print(sfs) | |
## Get counts per sfs bin | |
counts = Counter() | |
for sfsbin in product(*[range(y) for y in [x for x in nsamps]]): | |
## Ignore monomorphic snps | |
## nsamps minus 1 here because of the off by one diff between number | |
## of bins in the sfs and actual number of samples | |
if sfsbin == tuple(nsamps-1) or sfsbin == tuple([0] * len(nsamps)): | |
continue | |
## ignore zero bin counts | |
if sfs[sfsbin] == 0: | |
continue | |
if verbose: print(sfsbin, sfs[sfsbin]), | |
counts.update({sfsbin:sfs[sfsbin]}) | |
if verbose: print("nbins {}".format(len(counts))) | |
## Convert SFS bin style into momi config style | |
configs = pd.DataFrame(index=range(len(counts)), columns=pops) | |
locus_info = [] | |
for i, c in enumerate(counts): | |
## (n-1) here because nbins in dadi sfs is n+1 | |
cfg = np.array([[(n-1)-x, x] for x,n in zip(c, nsamps)]) | |
configs.iloc[i] = [list(map(int, list(x))) for x in cfg] | |
locus_info.append([0, i, counts[c]]) | |
if verbose: print("n_snps {}".format(np.sum([x[2] for x in locus_info]))) | |
## Finally build the momi style sfs dictionary and write it out | |
momi_sfs = {"sampled_pops":pops, | |
"folded":folded, | |
"length":int(length), | |
"configs":configs.values.tolist(), | |
"(locus,config_id,count)":locus_info} | |
with open(outfile, 'w') as out: | |
out.write("{}".format(json.dumps(momi_sfs))) | |
## make it pretty | |
sfs = momi.Sfs.load(outfile) | |
## Fold if unfolded | |
if folded: sfs = sfs.fold() | |
sfs.dump(outfile) | |
#dadi_to_momi(infile, verbose=True) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi Isaac, that is great! I could not track product() as it did not exist in python 2 (unlike Counter()). Thank you for Open Science and great to know it is in Momi. Ludo