danstowell · February 16, 2018 16:17
diff --git a/thinningdistance.py b/thinningdistance.py

 import numpy as np
 from numpy import log

 # we generate two independent "parent" processes, then for each one we independently create thinnings.
 # we aim to have a distance measure that correctly clusters the two sets according to their parent.
 # by Dan Stowell, copyright Feb 2018

 parents = [np.cumsum(np.random.exponential(size=100)) for whichp in range(2)]

 thindens = 0.5

 children = []
 for parent in parents:
 	children.append([parent[np.nonzero(np.random.rand(len(parent)) < thindens)] for _ in range(10)])

 print("parent lengths:")
 print([len(par) for par in parents])
 print("child lengths:")
 print([[len(chi) for chi in chilist] for chilist in children])

 childrenconcat = np.concatenate(children)


 def thinningdistance(s1, s2):
 	"Defined as the negative joint log-likelihood of the two sequences, assuming they were both generated by thinning from the same Poisson process realisation. Neglects a constant offset due to the overall likelihood of the union (which means that distances cannot be compared across datasets, but can be optimised or combined triangle-wise, with some care). Also uses a simplifying assumption that the generating process rates are equal to the empirical rates of the observations."
 	s1 = set(s1)
 	s2 = set(s2)
 	if s1 == s2:
 		return 0 # shortcut
 	su = s1 | s2
 	n1 = float(len(s1))
 	n2 = float(len(s2))
 	nu = float(len(su))

 	ll = n1 * log(n1/nu)  +  n2 * log(n2/nu)  +  (nu-n1) * log(1 - n1/nu)  +  (nu-n2) * log(1 - n2/nu)

 	return -ll


 childrenconcat = np.concatenate(children)

 dists = [[thinningdistance(a, b) for b in childrenconcat] for a in childrenconcat]

 print("Distances:")
 for row in dists: print map(int, row)

 print("Distances between distinct children of same process:")
 somedists = []
 for chilist in children:
 	for whichc, childa in enumerate(chilist):
 		for childb in chilist[:whichc]:
 			somedists.append(thinningdistance(childa, childb))

 print("Min %.1f, Mean %.1f, Median %.1f, Max %.1f" % (np.min(somedists), np.mean(somedists), np.median(somedists), np.max(somedists), ))

 print("Distances between children of different processes:")
 somedists = []
 for whichc, chilista in enumerate(children):
 	for chilistb in children[:whichc]:
 		for childa in chilista:
 			for childb in chilistb:
 				somedists.append(thinningdistance(childa, childb))

 print("Min %.1f, Mean %.1f, Median %.1f, Max %.1f" % (np.min(somedists), np.mean(somedists), np.median(somedists), np.max(somedists), ))

	import numpy as np
	from numpy import log

	# we generate two independent "parent" processes, then for each one we independently create thinnings.
	# we aim to have a distance measure that correctly clusters the two sets according to their parent.
	# by Dan Stowell, copyright Feb 2018

	parents = [np.cumsum(np.random.exponential(size=100)) for whichp in range(2)]

	thindens = 0.5

	children = []
	for parent in parents:
	children.append([parent[np.nonzero(np.random.rand(len(parent)) < thindens)] for _ in range(10)])

	print("parent lengths:")
	print([len(par) for par in parents])
	print("child lengths:")
	print([[len(chi) for chi in chilist] for chilist in children])

	childrenconcat = np.concatenate(children)


	def thinningdistance(s1, s2):
	"Defined as the negative joint log-likelihood of the two sequences, assuming they were both generated by thinning from the same Poisson process realisation. Neglects a constant offset due to the overall likelihood of the union (which means that distances cannot be compared across datasets, but can be optimised or combined triangle-wise, with some care). Also uses a simplifying assumption that the generating process rates are equal to the empirical rates of the observations."
	s1 = set(s1)
	s2 = set(s2)
	if s1 == s2:
	return 0 # shortcut
	su = s1 \| s2
	n1 = float(len(s1))
	n2 = float(len(s2))
	nu = float(len(su))

	ll = n1 * log(n1/nu) + n2 * log(n2/nu) + (nu-n1) * log(1 - n1/nu) + (nu-n2) * log(1 - n2/nu)

	return -ll


	childrenconcat = np.concatenate(children)

	dists = [[thinningdistance(a, b) for b in childrenconcat] for a in childrenconcat]

	print("Distances:")
	for row in dists: print map(int, row)

	print("Distances between distinct children of same process:")
	somedists = []
	for chilist in children:
	for whichc, childa in enumerate(chilist):
	for childb in chilist[:whichc]:
	somedists.append(thinningdistance(childa, childb))

	print("Min %.1f, Mean %.1f, Median %.1f, Max %.1f" % (np.min(somedists), np.mean(somedists), np.median(somedists), np.max(somedists), ))

	print("Distances between children of different processes:")
	somedists = []
	for whichc, chilista in enumerate(children):
	for chilistb in children[:whichc]:
	for childa in chilista:
	for childb in chilistb:
	somedists.append(thinningdistance(childa, childb))

	print("Min %.1f, Mean %.1f, Median %.1f, Max %.1f" % (np.min(somedists), np.mean(somedists), np.median(somedists), np.max(somedists), ))