Created
February 16, 2018 16:17
-
-
Save danstowell/71943917a5ae9ae9743dcd8e608e8486 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from numpy import log | |
# we generate two independent "parent" processes, then for each one we independently create thinnings. | |
# we aim to have a distance measure that correctly clusters the two sets according to their parent. | |
# by Dan Stowell, copyright Feb 2018 | |
parents = [np.cumsum(np.random.exponential(size=100)) for whichp in range(2)] | |
thindens = 0.5 | |
children = [] | |
for parent in parents: | |
children.append([parent[np.nonzero(np.random.rand(len(parent)) < thindens)] for _ in range(10)]) | |
print("parent lengths:") | |
print([len(par) for par in parents]) | |
print("child lengths:") | |
print([[len(chi) for chi in chilist] for chilist in children]) | |
childrenconcat = np.concatenate(children) | |
def thinningdistance(s1, s2): | |
"Defined as the negative joint log-likelihood of the two sequences, assuming they were both generated by thinning from the same Poisson process realisation. Neglects a constant offset due to the overall likelihood of the union (which means that distances cannot be compared across datasets, but can be optimised or combined triangle-wise, with some care). Also uses a simplifying assumption that the generating process rates are equal to the empirical rates of the observations." | |
s1 = set(s1) | |
s2 = set(s2) | |
if s1 == s2: | |
return 0 # shortcut | |
su = s1 | s2 | |
n1 = float(len(s1)) | |
n2 = float(len(s2)) | |
nu = float(len(su)) | |
ll = n1 * log(n1/nu) + n2 * log(n2/nu) + (nu-n1) * log(1 - n1/nu) + (nu-n2) * log(1 - n2/nu) | |
return -ll | |
childrenconcat = np.concatenate(children) | |
dists = [[thinningdistance(a, b) for b in childrenconcat] for a in childrenconcat] | |
print("Distances:") | |
for row in dists: print map(int, row) | |
print("Distances between distinct children of same process:") | |
somedists = [] | |
for chilist in children: | |
for whichc, childa in enumerate(chilist): | |
for childb in chilist[:whichc]: | |
somedists.append(thinningdistance(childa, childb)) | |
print("Min %.1f, Mean %.1f, Median %.1f, Max %.1f" % (np.min(somedists), np.mean(somedists), np.median(somedists), np.max(somedists), )) | |
print("Distances between children of different processes:") | |
somedists = [] | |
for whichc, chilista in enumerate(children): | |
for chilistb in children[:whichc]: | |
for childa in chilista: | |
for childb in chilistb: | |
somedists.append(thinningdistance(childa, childb)) | |
print("Min %.1f, Mean %.1f, Median %.1f, Max %.1f" % (np.min(somedists), np.mean(somedists), np.median(somedists), np.max(somedists), )) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment