#!/usr/bin/python

from pprint import pprint

stream = [
    ["A", "B", "C", "D", "F", "G", "X", "Y", "T"],
    ["X", "Y", "A", "F", "C", "D", "G", "Z", "P"],
    ["A", "F", "G", "B", "C", "D", "B", "C", "U"],
    ["G", "X", "A", "B", "C", "D", "F", "D", "C"],
    ["A", "F", "G", "B", "F", "D", "B", "F", "D"],
]
# stream = [i.strip().upper().split() for i in open("test.txt").readlines()]

MIN_PATTERN_SIZE = 3
MAX_PATTERN_SIZE = 10
MIN_PATTERN_OCCURRENCES = 1

_sequences = {}

def build_sequences(nodes):
    global _sequences
    # iterate over the sequence nodes
    for index, event in enumerate(nodes):
        # iterate over each accepted pattern size
        for pattern_size in range(MIN_PATTERN_SIZE, MAX_PATTERN_SIZE + 1):
            # sequence = nodes[index:index + 3], then [index:index + 4], and so on
            sequence = nodes[index:index + pattern_size]
            if len(sequence) < pattern_size:
                # discard the residual (truncated) slices:
                # e.g. [A, B, C][2:5] returns just [C], so it doesn't apply
                break
            # build a sequence identifier
            sequence_id = "".join(map(str, sequence))
            if sequence_id in _sequences:
                # if we already "know" this sequence, simply increase the counters
                _sequences[sequence_id]['count'] += 1
                _sequences[sequence_id]['points'] += 1
            else:
                # first time we see this sequence: build its dictionary.
                # Points are used to qualify the sequence as a pattern;
                # points will be discounted for each occurrence of the sequence in
                # longer patterns (super-patterns).
                # Recurrence is used to normalize points when the sequence is found
                # in more than one super-pattern of the same length (same segment).
                _sequences[sequence_id] = {'id': sequence_id, 'count': 1,
                                           'points': 1, 'recurrence': None,
                                           'sequence': sequence}
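# For example, with MIN_PATTERN_SIZE = 3 and MAX_PATTERN_SIZE = 10,
# build_sequences(["A", "B", "C", "D"]) registers the windows
# ["A", "B", "C"], ["A", "B", "C", "D"] and ["B", "C", "D"]; the shorter
# residual slices are discarded, and each window starts with
# count = 1 and points = 1 the first time it is seen.
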
def is_subpattern(sub, pattern):
    sub = "".join(map(str, sub['sequence']))
    pattern = "".join(map(str, pattern['sequence']))
    # return how many times sub occurs in pattern
    return pattern.count(sub)
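# Despite its name, is_subpattern() returns a count rather than a boolean:
# e.g. a sub with sequence ["B", "C", "D"] occurs twice inside a pattern with
# sequence ["A", "B", "C", "D", "B", "C", "D"], so it would return 2.
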
for nodes_index, nodes in enumerate(stream):
    # load the patterns for each tuple in the stream
    build_sequences(nodes)

# set of nodes (the items of the stream tuples)
_nodes = set()

# tree of filtered patterns:
#   segment (sequence size)
#    \
#     sequence id
#      \
#       pattern
_filtered_tree = {}

# iterate over the potential patterns
for key, value in _sequences.items():
    # use its count as the filter criterion
    if value['count'] > MIN_PATTERN_OCCURRENCES:
        # take advantage of this iteration to collect the nodes for later use
        map(_nodes.add, value['sequence'])
        size = len(value['sequence'])
        # segment the patterns according to their sequence size
        if size not in _filtered_tree:
            _filtered_tree[size] = {}
        # fill the tree
        _filtered_tree[size][value['id']] = value

del _sequences  # memory clean-up
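# The tree now has a shape like this (the keys shown are only illustrative):
#   {3: {'ABC': {'count': 2, 'points': 2, ...}, 'BCD': {...}},
#    4: {'ABCD': {...}}}
# i.e. the surviving pattern dictionaries grouped by sequence length
# and keyed by their sequence id.
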
# At this point we have all the patterns; now we need to discard those whose
# count doesn't exceed their super-pattern's count, to avoid redundant data.
# We need to iterate the segments starting with the biggest key (largest
# sequences), so sort the keys in descending order.
segments_keys = sorted(_filtered_tree.keys(), reverse=True)

# iterate the segments
for current_segment_index, current_segment_key in enumerate(segments_keys):
    current_segment = _filtered_tree[current_segment_key]
    # iterate the segment patterns
    for current_pattern_key, current_pattern in current_segment.items():
        # iterate the following (smaller) segments
        for next_segment_key in segments_keys[current_segment_index + 1:]:
            next_segment = _filtered_tree[next_segment_key]
            # iterate the next segment patterns
            for next_pattern_key, next_pattern in next_segment.items():
                # check whether the next pattern is a sub-pattern of the current
                # one, getting a count of sub-pattern occurrences
                sub_occurrences = is_subpattern(next_pattern, current_pattern)
                if sub_occurrences:
                    # subtract from points the current pattern's count for each sub-pattern occurrence
                    _filtered_tree[next_segment_key][next_pattern_key]['points'] -= current_pattern['count'] * sub_occurrences
                    # the recurrence is taken into account starting from the second occurrence
                    if _filtered_tree[next_segment_key][next_pattern_key]['recurrence'] is None:
                        # recurrence is occurrences minus one
                        _filtered_tree[next_segment_key][next_pattern_key]['recurrence'] = sub_occurrences - 1
                    else:
                        # from now on, add the occurrences to the recurrence
                        _filtered_tree[next_segment_key][next_pattern_key]['recurrence'] += sub_occurrences
            # Note: for a better understanding, notice that in order for a sub-pattern
            # to "survive", the sum of its points and recurrence must be greater than 0
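            # For example, a sub-pattern seen 3 times whose only super-pattern is
            # seen 2 times ends up with points = 3 - 2 = 1 and recurrence = 0, so it
            # survives; if both counts were equal its points would drop to 0 and it
            # would be discarded below as redundant.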
            # Now all the next segment patterns have been processed against the
            # current pattern; iterate the next segment patterns (yeah, again)
            for next_pattern_key, next_pattern in next_segment.items():
                if (next_pattern['points'] + (next_pattern['recurrence'] or 0)) < 1:
                    # discard those which don't have any points left to apply
                    del _filtered_tree[next_segment_key][next_pattern_key]

# Done!
# From now on, we can play with our patterns =)

print "\nPatterns:\n"

for segment_id, segment in _filtered_tree.items():
    for pattern in segment.values():
        if pattern:
            print "%d: %s (%d)" % (segment_id, pattern['sequence'], pattern['count'])

print "\n%s\n" % ("*" * 100)

print "Sankey graph data:\n"

links_aggregate = {}
_nodes = list(_nodes)

for segment_id, segment in _filtered_tree.items():
    for pattern in segment.values():
        if pattern:
            # walk the pattern pairwise, building links between consecutive nodes
            for index, node in enumerate(pattern['sequence']):
                pair = pattern['sequence'][index:index + 2]
                if len(pair) < 2:
                    break
                pair_id = "".join(map(str, pair))
                if pair_id in links_aggregate:
                    # known link: accumulate the pattern count as its weight
                    links_aggregate[pair_id]['value'] += pattern['count']
                else:
                    # new link: reference the nodes by their index in _nodes
                    links_aggregate[pair_id] = {
                        'source': _nodes.index(pair[0]),
                        'target': _nodes.index(pair[1]),
                        'value': pattern['count']
                    }

pprint([{'name': name} for name in _nodes])
pprint(links_aggregate.values())
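
# The printed output has the shape:
#   [{'name': 'A'}, {'name': 'B'}, ...]                  # nodes
#   [{'source': 0, 'target': 1, 'value': <count>}, ...]  # links
# A minimal sketch for persisting the same data as JSON for a Sankey renderer
# (the "sankey.json" filename is only an assumption, not part of this script):
#
#   import json
#   with open("sankey.json", "w") as fh:
#       json.dump({'nodes': [{'name': name} for name in _nodes],
#                  'links': links_aggregate.values()}, fh, indent=2)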