Created
January 1, 2014 22:21
-
-
Save dobrokot/8212139 to your computer and use it in GitHub Desktop.
substring probability, n successes in a row
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| import random | |
| def id_mat(n): | |
| return [[int(i == j) for j in xrange(n)] for i in xrange(n)] | |
| def my_pow(x, n, mul_fun, one): | |
| r = one | |
| assert n >= 0 and int(n) == n | |
| while n != 0: | |
| if n % 2 == 1: | |
| r = mul_fun(r, x) | |
| x = mul_fun(x, x) | |
| n = n // 2 | |
| return r | |
| def mul(a, b): | |
| sz = len(a) | |
| assert sz == len(b) | |
| return [ | |
| [ sum(a[i][k]*b[k][j] for k in xrange(sz)) for j in xrange(sz) ] | |
| for i in xrange(sz) | |
| ] | |
| def count_matrix(m, p, n): | |
| res = my_pow(m, n, mul, id_mat(p)) | |
| return sum(row[0] for row in res) | |
| def mul_num(m, b): | |
| return [[b*x for x in row] for row in m] | |
| def count_same(p, al, n): | |
| m = [[al-1] * p] + [[int(j == i) for j in xrange(p)] for i in xrange(p-1)] | |
| m = mul_num(m, 1.0 / al) | |
| return count_matrix(m, p, n) | |
| def count_dif(p, al, n): | |
| m = [[al-2] * p] + [[1] * p] + [[int(j == i+1) for j in xrange(p)] for i in xrange(p-2)] | |
| m[0][0] += 1 | |
| m = mul_num(m, 1.0 / al) | |
| return count_matrix(m, p, n) | |
| def make_word_matrix(w, al): | |
| p = len(w) | |
| m = [[0]*p for _i in xrange(p)] | |
| #for every pair of prefixes (Pi, Pj), for every char 'c', | |
| #write to matrix[i][j] number of transitions (Pj + C -> Pi), | |
| #when full prefix w[0:len(w)] is forbidden | |
| for j in xrange(p): | |
| st = {} | |
| end = 0 | |
| for i in xrange(1, j+2): | |
| c = w[i-1] | |
| end = (j == p-1) and (c == w[p-1]) | |
| if not end: | |
| if (w[0:i-1] == w[j-i+1:j]): | |
| st[c] = max(st.get(c, -1), i) | |
| m[0][j] = al - (len(st) + end) | |
| for c, i in st.iteritems(): | |
| m[i][j] = 1 | |
| return m | |
| def count_word(w, al, n): | |
| return count_matrix(make_word_matrix(w, al), len(w), n) | |
| def count_word_prob(w, al, n): | |
| return count_matrix(mul_num(make_word_matrix(w, al), 1.0/al), len(w), n) | |
| def count_word_simple(w, al, n): | |
| import itertools | |
| def gen_alph(k): | |
| return ''.join(chr(i) for i in xrange(ord('a'), ord('a') + k)) | |
| return sum((w not in ''.join(x)) for x in itertools.product(gen_alph(al), repeat=n)) | |
| def main(): | |
| #from math import log as ln | |
| import os, sys | |
| try: | |
| _, word, alphabet_size, string_len = sys.argv | |
| alphabet_size = int(alphabet_size) | |
| string_len = int(string_len) | |
| assert alphabet_size > 0 | |
| assert string_len >= 0 | |
| assert len(set(word)) <= alphabet_size, "there are more characters in word than in alphabet" | |
| except: | |
| print >> sys.stderr, ( | |
| 'count number of word occurences in random strings, and probability\n' | |
| '\n' | |
| 'usage: python %s word, alphabet_size, string_len\n' | |
| 'alphabet assumed to be "a, b, c, ..." of alphabet_size size\n' | |
| 'example: python %s "abab" 26 100\n' ) % tuple([os.path.basename(sys.argv[0])] * 2) | |
| print >> sys.stderr | |
| print >> sys.stderr | |
| raise | |
| print 1.0 - count_word_prob(word, alphabet_size, string_len) | |
| #all = alphabet_size ** string_len | |
| #print '%s/%s' % (all - count_word(word, alphabet_size, string_len), all) | |
| #print 1-count_word_prob('abcd', 256, 2**32) | |
| #print 1-count_word_prob('abab', 256, 2**32) | |
| #print 1-count_word_prob('aaaa', 256, 2**32) | |
| #test by trivial count_word_simple: | |
| def gen_alph(k): | |
| return ''.join(chr(i) for i in xrange(ord('a'), ord('a') + k)) | |
| import itertools | |
| for wl in xrange(5): | |
| for x in itertools.product(gen_alph(wl)): | |
| w = ''.join(x) | |
| assert count_word_simple(w, 4, 6) == count_word(w, 4, 6) | |
| main() | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment