Skip to content

Instantly share code, notes, and snippets.

@armanbilge
Last active August 29, 2015 14:21
Show Gist options
  • Save armanbilge/553dfc426ccb585f3a6f to your computer and use it in GitHub Desktop.
Save armanbilge/553dfc426ccb585f3a6f to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
# The MIT License (MIT)
#
# Copyright (c) 2015 Arman Bilge
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import sys
from collections import Counter
import itertools as it
from Bio import SeqIO
from numpy import random
BASES = ('A', 'C', 'G', 'T')
points = Counter()
t = {a: float(b) for a, b in map(str.split, it.filterfalse(str.isspace, open(sys.argv[1])))}
constant = int(sys.argv[2])
files = ((open(f), print(f))[0] for f in sys.argv[3:])
for x, y in it.combinations(t.values(), 2):
points[(abs(x - y), 0)] += constant
class SequenceRecord:
pass
def parse(f):
for l in f:
r = SequenceRecord()
r.id, r.seq = l.split()
yield r
pairs = it.chain.from_iterable(it.combinations(parse(f), 2) for f in files)
data = ((abs(t[i.id] - t[j.id]), int(x != y)) for i, j in pairs for x, y in zip(i.seq, j.seq) if x in BASES and y in BASES)
for p in data:
points[p] += 1
print(points)
def linreg(points):
N = sum(points.values())
X, Y, XX, XY = 0, 0, 0, 0
for p, n in points.items():
x, y = p
X += x * n
Y += y * n
XX += x * x * n
XY += x * y * n
mu = (XY - X * Y / N) / (XX - X * X / N)
theta = (Y - mu * X) / N
return mu, theta
mu, theta = linreg(points)
mus, thetas = [], []
N = sum(points.values())
items = list(points.items())
prob = [p / N for _, p in items]
items = [p for p, _ in items]
for _ in range(1000):
print('.', end='')
bootstrap = Counter()
total = 0
for p, n in zip(items, random.multinomial(N, prob)):
bootstrap[p] += n
theta, mu = linreg(bootstrap)
thetas.append(theta)
mus.append(mu)
print()
print('theta: {} [{}, {}]'.format(theta, min(thetas), max(thetas)))
print('mu: {} [{}, {}]'.format(mu, min(mus), max(mus)))
#!/usr/bin/env python3
# The MIT License (MIT)
#
# Copyright (c) 2015 Arman Bilge
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import sys
from collections import Counter
import itertools as it
from Bio import SeqIO
from numpy import random
BASES = ('A', 'C', 'G', 'T')
points = Counter()
t = {a: float(b) for a, b in map(str.split, it.filterfalse(str.isspace, open(sys.argv[1])))}
constant = sum(sum(map(int, l.split(',')[2:6])) for l in it.islice(open(sys.argv[2]), 0, None, 92))
for x, y in it.combinations(t.values(), 2):
points[(abs(x - y), 0)] += constant
class SequenceRecord:
pass
def consume(iter, n):
try:
while True:
yield tuple(it.islice(iter, 0, n))
except:
pass
def parse(f):
for l in f:
r = SequenceRecord()
r.id, r.seq = l.split(',')[1], l.split(',')[6]
yield r
pairs = it.chain.from_iterable(it.combinations(parse(g), 2) for g in consume(it.filterfalse(str.isspace, open(sys.argv[2])), 92))
data = ((abs(t[i.id] - t[j.id]), int(x != y)) for i, j in pairs for x, y in zip(i.seq, j.seq) if x in BASES and y in BASES)
for p in data:
points[p] += 1
print(points)
def linreg(points):
N = sum(points.values())
X, Y, XX, XY = 0, 0, 0, 0
for p, n in points.items():
x, y = p
X += x * n
Y += y * n
XX += x * x * n
XY += x * y * n
mu = (XY - X * Y / N) / (XX - X * X / N)
theta = (Y - mu * X) / N
return mu, theta
mu, theta = linreg(points)
mus, thetas = [], []
N = sum(points.values())
items = list(points.items())
prob = [p / N for _, p in items]
items = [p for p, _ in items]
for _ in range(1000):
print('.', end='')
bootstrap = Counter()
total = 0
for p, n in zip(items, random.multinomial(N, prob)):
bootstrap[p] += n
theta, mu = linreg(bootstrap)
thetas.append(theta)
mus.append(mu)
print()
print('theta: {} [{}, {}]'.format(theta, min(thetas), max(thetas)))
print('mu: {} [{}, {}]'.format(mu, min(mus), max(mus)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment