#!/usr/bin/python

from pprint import pprint

stream = [
    ["A", "B", "C", "D", "F", "G", "X", "Y", "T"],
    ["X", "Y", "A", "F", "C", "D", "G", "Z", "P"],
    ["A", "F", "G", "B", "C", "D", "B", "C", "U"],
    ["G", "X", "A", "B", "C", "D", "F", "D", "C"],
    ["A", "F", "G", "B", "F", "D", "B", "F", "D"],
]
# stream = [i.strip().upper().split() for i in open("test.txt").readlines()]

MIN_PATTERN_SIZE = 3
MAX_PATTERN_SIZE = 10
MIN_PATTERN_OCCURRENCES = 1

_sequences = {}

def build_sequences(nodes):
    global _sequences
    # iterate over the sequence nodes
    for index, event in enumerate(nodes):
        # iterate over each accepted pattern size
        for pattern_size in range(MIN_PATTERN_SIZE, MAX_PATTERN_SIZE + 1):
            # sequence = nodes[index:index + 3], then [index:index + 4], and so on
            sequence = nodes[index:index + pattern_size]
            if len(sequence) < pattern_size:
                # discard the residual (truncated) slices:
                # e.g. [A, B, C][2:5] returns just [C], so it doesn't apply
                break
            # build a sequence identifier
            sequence_id = "".join(map(str, sequence))
            if sequence_id in _sequences:
                # if we already "know" this sequence, simply increase the counters
                _sequences[sequence_id]['count'] += 1
                _sequences[sequence_id]['points'] += 1
            else:
                # first time we see this sequence: build its dictionary.
                # Points are used to qualify the sequence as a pattern;
                # points will be discounted for each occurrence of the sequence in
                # longer patterns (super-patterns).
                # Recurrence is used to normalize points when the sequence is found
                # in more than one super-pattern of the same length (same segment).
                _sequences[sequence_id] = {'id': sequence_id, 'count': 1,
                                           'points': 1, 'recurrence': None,
                                           'sequence': sequence}
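# For example, with MIN_PATTERN_SIZE = 3 and MAX_PATTERN_SIZE = 10,
# build_sequences(["A", "B", "C", "D"]) registers the windows
# ["A", "B", "C"], ["A", "B", "C", "D"] and ["B", "C", "D"]; the shorter
# residual slices are discarded, and each window starts with
# count = 1 and points = 1 the first time it is seen.
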
def is_subpattern(sub, pattern):
    sub = "".join(map(str, sub['sequence']))
    pattern = "".join(map(str, pattern['sequence']))
    # return how many times sub occurs in pattern
    return pattern.count(sub)
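# Despite its name, is_subpattern() returns a count rather than a boolean:
# e.g. a sub with sequence ["B", "C", "D"] occurs twice inside a pattern with
# sequence ["A", "B", "C", "D", "B", "C", "D"], so it would return 2.
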
for nodes_index, nodes in enumerate(stream):
    # load the patterns for each tuple in the stream
    build_sequences(nodes)

# set of nodes (the items of the stream tuples)
_nodes = set()

# tree of filtered patterns:
#   segment (sequence size)
#    \
#     sequence id
#      \
#       pattern
_filtered_tree = {}

# iterate over the potential patterns
for key, value in _sequences.items():
    # use its count as the filter criterion
    if value['count'] > MIN_PATTERN_OCCURRENCES:
        # take advantage of this iteration to collect the nodes for later use
        map(_nodes.add, value['sequence'])
        size = len(value['sequence'])
        # segment the patterns according to their sequence size
        if size not in _filtered_tree:
            _filtered_tree[size] = {}
        # fill the tree
        _filtered_tree[size][value['id']] = value

del _sequences  # memory clean-up
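# The tree now has a shape like this (the keys shown are only illustrative):
#   {3: {'ABC': {'count': 2, 'points': 2, ...}, 'BCD': {...}},
#    4: {'ABCD': {...}}}
# i.e. the surviving pattern dictionaries grouped by sequence length
# and keyed by their sequence id.
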
# At this point we have all the patterns; now we need to discard those whose
# count doesn't exceed their super-pattern's count, to avoid redundant data.
# We need to iterate the segments starting with the biggest key (largest
# sequences), so sort the keys in descending order.
segments_keys = sorted(_filtered_tree.keys(), reverse=True)

# iterate the segments
for current_segment_index, current_segment_key in enumerate(segments_keys):
    current_segment = _filtered_tree[current_segment_key]
    # iterate the segment patterns
    for current_pattern_key, current_pattern in current_segment.items():
        # iterate the following (smaller) segments
        for next_segment_key in segments_keys[current_segment_index + 1:]:
            next_segment = _filtered_tree[next_segment_key]
            # iterate the next segment patterns
            for next_pattern_key, next_pattern in next_segment.items():
                # check whether the next pattern is a sub-pattern of the current
                # one, getting a count of sub-pattern occurrences
                sub_occurrences = is_subpattern(next_pattern, current_pattern)
                if sub_occurrences:
                    # subtract from points the current pattern's count for each sub-pattern occurrence
                    _filtered_tree[next_segment_key][next_pattern_key]['points'] -= current_pattern['count'] * sub_occurrences
                    # the recurrence is taken into account starting from the second occurrence
                    if _filtered_tree[next_segment_key][next_pattern_key]['recurrence'] is None:
                        # recurrence is occurrences minus one
                        _filtered_tree[next_segment_key][next_pattern_key]['recurrence'] = sub_occurrences - 1
                    else:
                        # from now on, add the occurrences to the recurrence
                        _filtered_tree[next_segment_key][next_pattern_key]['recurrence'] += sub_occurrences
            # Note: for a better understanding, notice that in order for a sub-pattern
            # to "survive", the sum of its points and recurrence must be greater than 0
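            # For example, a sub-pattern seen 3 times whose only super-pattern is
            # seen 2 times ends up with points = 3 - 2 = 1 and recurrence = 0, so it
            # survives; if both counts were equal its points would drop to 0 and it
            # would be discarded below as redundant.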
            # Now all the next segment patterns have been processed against the
            # current pattern; iterate the next segment patterns (yeah, again)
            for next_pattern_key, next_pattern in next_segment.items():
                if (next_pattern['points'] + (next_pattern['recurrence'] or 0)) < 1:
                    # discard those which don't have any points left to apply
                    del _filtered_tree[next_segment_key][next_pattern_key]

# Done!
# From now on, we can play with our patterns =)

print "\nPatterns:\n"

for segment_id, segment in _filtered_tree.items():
    for pattern in segment.values():
        if pattern:
            print "%d: %s (%d)" % (segment_id, pattern['sequence'], pattern['count'])

print "\n%s\n" % ("*" * 100)

print "Sankey graph data:\n"

links_aggregate = {}
_nodes = list(_nodes)

for segment_id, segment in _filtered_tree.items():
    for pattern in segment.values():
        if pattern:
            # walk the pattern pairwise, building links between consecutive nodes
            for index, node in enumerate(pattern['sequence']):
                pair = pattern['sequence'][index:index + 2]
                if len(pair) < 2:
                    break
                pair_id = "".join(map(str, pair))
                if pair_id in links_aggregate:
                    # known link: accumulate the pattern count as its weight
                    links_aggregate[pair_id]['value'] += pattern['count']
                else:
                    # new link: reference the nodes by their index in _nodes
                    links_aggregate[pair_id] = {
                        'source': _nodes.index(pair[0]),
                        'target': _nodes.index(pair[1]),
                        'value': pattern['count']
                    }

pprint([{'name': name} for name in _nodes])
pprint(links_aggregate.values())
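
# The printed output has the shape:
#   [{'name': 'A'}, {'name': 'B'}, ...]                  # nodes
#   [{'source': 0, 'target': 1, 'value': <count>}, ...]  # links
# A minimal sketch for persisting the same data as JSON for a Sankey renderer
# (the "sankey.json" filename is only an assumption, not part of this script):
#
#   import json
#   with open("sankey.json", "w") as fh:
#       json.dump({'nodes': [{'name': name} for name in _nodes],
#                  'links': links_aggregate.values()}, fh, indent=2)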