Created
February 5, 2020 13:44
-
-
Save endrebak/4437fde5713f20f2b3849f45addd1c49 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
#cython: language_level=3, boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True | |
import sys, math, gzip | |
import numpy as np | |
# cimport numpy as cnp | |
import pandas as pd | |
from time import time | |
from libc.math cimport exp, fabs | |
from libc.stdint cimport int8_t | |
# # calculate Wen/Stephens shrinkage LD estimate | |
# gmapfile = sys.argv[1] # genetic map | |
# indfile = sys.argv[2] #list of individuals | |
# # NE = 11418.0 | |
# # CUTOFF = 1e-7 | |
# outfile = sys.argv[5] # outfile file | |
# NE = float(sys.argv[3]) | |
# CUTOFF = float(sys.argv[4]) | |
cimport cython | |
@cython.boundscheck(False) | |
@cython.wraparound(False) | |
@cython.initializedcheck(False) | |
@cython.cdivision(True) | |
cpdef calc_covar(haplos, pos2gpos, indfile, double NE, double CUTOFF): | |
inds = pd.read_table(indfile, header=None, squeeze=True).to_list() | |
haps= list() | |
nind_int = len(inds) | |
s = 0 | |
for _i in range(1, 2*nind_int): | |
s = s+ 1.0/float(_i) | |
s = 1/s | |
#print "s", s | |
cdef: | |
int i, j, k, pos1, pos2, len_allpos, toofar, n11, n10, n01, len_g1_int | |
double gpos1, gpos2, ee_const, len_g1, ee, df, theta, f11, f1, f2, b, Ds, Ds2, D, nind, thetas, thetas2 | |
nind = float(nind_int) | |
# pos2gpos = pd.read_table(gmapfile, usecols=[1, 2], index_col=0, header=None, sep=" ", squeeze=True) | |
# pos2gpos = pos2gpos | |
allpos = haplos.pop(0) #.tolist() | |
allrs = haplos.pop(1) #.tolist() | |
haps = haplos.values | |
pos2gpos = pos2gpos[allpos.values] | |
records = [] | |
len_g1_int = haps.shape[1] | |
len_g1 = float(haps.shape[1]) | |
ee_const = NE*4.0 / (2.0*nind) | |
len_allpos = len(allpos) | |
theta = s/(2.0*float(nind)+s) | |
thetas = (1-theta)*(1-theta) | |
thetas2 = (theta/2.0)*(1-theta/2.0) | |
cdef double[:] pos2gpos_view | |
cdef long[:] allpos_view | |
cdef long[:] g1_view, g2_view | |
cdef long[:, :] haps_view | |
pos2gpos_view = pos2gpos.values | |
allpos_view = allpos.values | |
haps_view = haps | |
for i in range(len_allpos): | |
pos1 = allpos_view[i] | |
gpos1 = pos2gpos_view[i] | |
toofar = 0 | |
j = i | |
while j < len_allpos and not toofar: | |
pos2 = allpos_view[j] | |
gpos2 = pos2gpos_view[j] | |
df = gpos2 - gpos1 | |
ee = exp( - df * ee_const) | |
if ee < CUTOFF: | |
toofar = 1 | |
j = j+1 | |
continue | |
n11, n01, n10 = 0, 0, 0 | |
for k in range(len_g1_int): | |
if haps_view[i][k] == 1 and haps_view[j][k] == 1: | |
n11 += 1 | |
elif haps_view[i][k] == 0 and haps_view[j][k] == 1: | |
n01 += 1 | |
elif haps_view[i][k] == 1 and haps_view[j][k] == 0: | |
n10 += 1 | |
f11 = n11/len_g1 | |
f1 = (n11+n10)/len_g1 | |
f2 = (n11+n01)/len_g1 | |
D = f11 - f1*f2 | |
Ds = D*ee | |
Ds2 = thetas*Ds | |
if fabs(Ds2) < CUTOFF: | |
j = j+1 | |
continue | |
if i == j: | |
Ds2 = Ds2 + thetas2 | |
# result = (allrs[i], allrs[j], pos1, pos2, gpos1, gpos2, D, Ds2) | |
print(pos1, pos2, Ds2) | |
# records.append(result) | |
j = j+1 | |
# outdf = pd.DataFrame.from_records(records) | |
# outdf.to_parquet(outfile) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment