Skip to content

Instantly share code, notes, and snippets.

@endrebak
Created February 5, 2020 13:44
Show Gist options
  • Save endrebak/4437fde5713f20f2b3849f45addd1c49 to your computer and use it in GitHub Desktop.
Save endrebak/4437fde5713f20f2b3849f45addd1c49 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
#cython: language_level=3, boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True
import sys, math, gzip
import numpy as np
# cimport numpy as cnp
import pandas as pd
from time import time
from libc.math cimport exp, fabs
from libc.stdint cimport int8_t
# # calculate Wen/Stephens shrinkage LD estimate
# gmapfile = sys.argv[1] # genetic map
# indfile = sys.argv[2] #list of individuals
# # NE = 11418.0
# # CUTOFF = 1e-7
# outfile = sys.argv[5] # outfile file
# NE = float(sys.argv[3])
# CUTOFF = float(sys.argv[4])
cimport cython
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.initializedcheck(False)
@cython.cdivision(True)
cpdef calc_covar(haplos, pos2gpos, indfile, double NE, double CUTOFF):
inds = pd.read_table(indfile, header=None, squeeze=True).to_list()
haps= list()
nind_int = len(inds)
s = 0
for _i in range(1, 2*nind_int):
s = s+ 1.0/float(_i)
s = 1/s
#print "s", s
cdef:
int i, j, k, pos1, pos2, len_allpos, toofar, n11, n10, n01, len_g1_int
double gpos1, gpos2, ee_const, len_g1, ee, df, theta, f11, f1, f2, b, Ds, Ds2, D, nind, thetas, thetas2
nind = float(nind_int)
# pos2gpos = pd.read_table(gmapfile, usecols=[1, 2], index_col=0, header=None, sep=" ", squeeze=True)
# pos2gpos = pos2gpos
allpos = haplos.pop(0) #.tolist()
allrs = haplos.pop(1) #.tolist()
haps = haplos.values
pos2gpos = pos2gpos[allpos.values]
records = []
len_g1_int = haps.shape[1]
len_g1 = float(haps.shape[1])
ee_const = NE*4.0 / (2.0*nind)
len_allpos = len(allpos)
theta = s/(2.0*float(nind)+s)
thetas = (1-theta)*(1-theta)
thetas2 = (theta/2.0)*(1-theta/2.0)
cdef double[:] pos2gpos_view
cdef long[:] allpos_view
cdef long[:] g1_view, g2_view
cdef long[:, :] haps_view
pos2gpos_view = pos2gpos.values
allpos_view = allpos.values
haps_view = haps
for i in range(len_allpos):
pos1 = allpos_view[i]
gpos1 = pos2gpos_view[i]
toofar = 0
j = i
while j < len_allpos and not toofar:
pos2 = allpos_view[j]
gpos2 = pos2gpos_view[j]
df = gpos2 - gpos1
ee = exp( - df * ee_const)
if ee < CUTOFF:
toofar = 1
j = j+1
continue
n11, n01, n10 = 0, 0, 0
for k in range(len_g1_int):
if haps_view[i][k] == 1 and haps_view[j][k] == 1:
n11 += 1
elif haps_view[i][k] == 0 and haps_view[j][k] == 1:
n01 += 1
elif haps_view[i][k] == 1 and haps_view[j][k] == 0:
n10 += 1
f11 = n11/len_g1
f1 = (n11+n10)/len_g1
f2 = (n11+n01)/len_g1
D = f11 - f1*f2
Ds = D*ee
Ds2 = thetas*Ds
if fabs(Ds2) < CUTOFF:
j = j+1
continue
if i == j:
Ds2 = Ds2 + thetas2
# result = (allrs[i], allrs[j], pos1, pos2, gpos1, gpos2, D, Ds2)
print(pos1, pos2, Ds2)
# records.append(result)
j = j+1
# outdf = pd.DataFrame.from_records(records)
# outdf.to_parquet(outfile)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment