#!/usr/bin/python
from pprint import pprint

stream = [
    ["A", "B", "C", "D", "F", "G", "X", "Y", "T"],
    ["X", "Y", "A", "F", "C", "D", "G", "Z", "P"],
    ["A", "F", "G", "B", "C", "D", "B", "C", "U"],
    ["G", "X", "A", "B", "C", "D", "F", "D", "C"],
    ["A", "F", "G", "B", "F", "D", "B", "F", "D"],
]

# stream = [i.strip().upper().split() for i in open("test.txt").readlines()]

MIN_PATTERN_SIZE = 3
MAX_PATTERN_SIZE = 10
MIN_PATTERN_OCCURRENCES = 1
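# Note: the filter further down compares with a strict ">", so with
# MIN_PATTERN_OCCURRENCES = 1 a sequence must appear at least twice in the
# stream to qualify as a pattern.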
_sequences = {}

def build_sequences(nodes):
    global _sequences
    # iterate the sequence nodes
    for index, event in enumerate(nodes):
        # iterate over each accepted pattern size
        for pattern_size in range(MIN_PATTERN_SIZE, MAX_PATTERN_SIZE + 1):
            # sequence = [x, y, z], then [x, y, z, w], and so on
            sequence = nodes[index:index + pattern_size]
            if len(sequence) < pattern_size:
                # discard the residual (truncated) sequences:
                # [A, B, C][2:5] returns just [C], so it doesn't apply
                break
            # build a sequence identifier
            sequence_id = "".join(map(str, sequence))
            if sequence_id in _sequences:
                # if we already "know" this sequence, simply increase the counters
                _sequences[sequence_id]['count'] += 1
                _sequences[sequence_id]['points'] += 1
            else:
                # first time we see this sequence: build its dictionary.
                # Points are used to qualify the sequence as a pattern;
                # points will be discounted for each occurrence of the
                # sequence in longer patterns (super-patterns).
                # Recurrence is used to normalize points when the sequence
                # is found in more than one super-pattern of the same
                # length (same segment).
                _sequences[sequence_id] = {'id': sequence_id, 'count': 1,
                                           'points': 1, 'recurrence': None,
                                           'sequence': sequence}
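
# A quick illustration (not part of the algorithm): with MIN_PATTERN_SIZE = 3,
# build_sequences(["A", "B", "C", "D"]) would register exactly three entries
# in _sequences ("ABC", "BCD" and "ABCD"), each with count = 1 and points = 1;
# residual slices shorter than pattern_size are skipped by the break above.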

def is_subpattern(sub, pattern):
    sub = "".join(map(str, sub['sequence']))
    pattern = "".join(map(str, pattern['sequence']))
    # return how many times sub occurs in pattern
    return pattern.count(sub)
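
# For example, is_subpattern({'sequence': ["A", "B", "C"]},
#                            {'sequence': ["A", "B", "C", "D", "A", "B", "C"]})
# returns 2 ("ABC" occurs twice in "ABCDABC"). One caveat: because labels are
# joined with no separator, multi-character node names could collide
# (["A", "BC"] and ["AB", "C"] both become "ABC"); single-character events
# like the demo stream above are safe.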

for nodes_index, nodes in enumerate(stream):
    # load the candidate patterns for each tuple in the stream
    build_sequences(nodes)
# set of nodes (tuple items)
_nodes = set()

# tree of filtered patterns:
#   segment (sequence size)
#    \
#     sequence id
#      \
#       pattern
_filtered_tree = {}
# iterate over the potential patterns
for key, value in _sequences.items():
    # use its count as the filter criterion
    if value['count'] > MIN_PATTERN_OCCURRENCES:
        # take advantage of this iteration to collect the nodes for later use
        _nodes.update(value['sequence'])
        size = len(value['sequence'])
        # segment the patterns by sequence size
        if size not in _filtered_tree:
            _filtered_tree[size] = {}
        # fill the tree
        _filtered_tree[size][value['id']] = value

del _sequences  # memory clean
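
# At this point _filtered_tree looks roughly like (values abbreviated):
# {3: {'BCD': {'id': 'BCD', 'count': ..., 'points': ..., ...}, ...},
#  4: {...}, ...}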

# At this point we have all the candidate patterns; now we need to discard
# those whose count doesn't exceed their super-pattern's count. Doing this
# we avoid redundant data.
# We need to iterate the segments starting with the biggest key (longest
# sequences); dict keys aren't guaranteed to be ordered, so sort them.
segments_keys = sorted(_filtered_tree.keys(), reverse=True)
# iterate the segments
for current_segment_index, current_segment_key in enumerate(segments_keys):
    current_segment = _filtered_tree[current_segment_key]
    # iterate the segment patterns
    for current_pattern_key, current_pattern in current_segment.items():
        # iterate the next (shorter) segments
        for next_segment_key in segments_keys[current_segment_index + 1:]:
            next_segment = _filtered_tree[next_segment_key]
            # iterate the next segment patterns
            for next_pattern_key, next_pattern in next_segment.items():
                # check which of the next patterns are sub-patterns of the
                # current one, getting a count of sub-pattern occurrences
                sub_occurrences = is_subpattern(next_pattern, current_pattern)
                if sub_occurrences:
                    # subtract from points the current pattern's count for
                    # each sub-pattern occurrence
                    next_pattern['points'] -= current_pattern['count'] * sub_occurrences
                    # recurrence is taken into account starting from the
                    # second occurrence
                    if next_pattern['recurrence'] is None:
                        # recurrence is occurrences minus one
                        next_pattern['recurrence'] = sub_occurrences - 1
                    else:
                        # from now on, add the occurrences to recurrence
                        next_pattern['recurrence'] += sub_occurrences
            # Note: for a better understanding of this, notice that in order
            # for a sub-pattern to "survive", the sum of its points and
            # recurrence must be greater than 0.
            # Now all the next segment patterns have been processed against
            # the current pattern; iterate the next segment patterns (yeah, again)
            for next_pattern_key, next_pattern in next_segment.items():
                if (next_pattern['points'] + (next_pattern['recurrence'] or 0)) < 1:
                    # discard those which don't have any more points to apply
                    del next_segment[next_pattern_key]
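
# Worked example of the scoring above (illustrative numbers): a sub-pattern
# seen 3 times (points = 3) that occurs twice inside a super-pattern seen
# twice loses 2 * 2 = 4 points (points = -1) and gets recurrence = 2 - 1 = 1;
# points + recurrence = 0 < 1, so it is discarded as redundant.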

# Done!
# From now on, we can play with our patterns =)
print "\nPatterns:\n"
for segment_id, segment in _filtered_tree.items():
    for pattern in segment.values():
        if pattern:
            print "%d: %s (%d)" % (segment_id, pattern['sequence'], pattern['count'])

print "\n%s\n" % ("*" * 100)
print "Sankey graph data:\n"
links_aggregate = {}
_nodes = list(_nodes)
for segment_id, segment in _filtered_tree.items():
for pattern in segment.values():
if pattern:
for index, node in enumerate(pattern['sequence']):
pair = pattern['sequence'][index:index + 2]
if len(pair) < 2:
break
pair_id = "".join(map(str, pair))
if pair_id in links_aggregate:
links_aggregate[pair_id]['value'] += pattern['count']
else:
links_aggregate[pair_id] = {
'source': _nodes.index(pair[0]),
'target': _nodes.index(pair[1]),
'value': pattern['count']
}
pprint([{'name': name} for name in _nodes])
pprint(links_aggregate.values())
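
# The two pprint() calls above print the node list ({'name': ...} dicts) and
# the links ({'source': i, 'target': j, 'value': n}, with indices into the
# node list), presumably meant to be fed to a d3-style Sankey chart, which
# expects that shape.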