Created
April 19, 2013 18:01
-
-
Save peterk87/5422042 to your computer and use it in GitHub Desktop.
Python: Parse binary patterns from file and get the unique binary patterns
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Parse a delimited table of binary patterns and report the unique
# fingerprints it contains.
#
# Expected layout: the first line holds the column headers; every following
# line holds a name/index in the first column and '0'/'1' values in the
# remaining columns.

# assuming that the file is tab-delimited ('\t')
delim = '\t'
# binary file path:
binary_patterns_filename = "binary_patterns.txt"


def read_lines(filename):
    """Return the non-blank lines of *filename*, trailing whitespace removed.

    Blank lines are dropped so they are not counted as (empty) entries.
    The file is closed as soon as it has been read.
    """
    with open(filename, "r") as handle:
        stripped = (line.rstrip() for line in handle)
        return [line for line in stripped if line]


def group_fingerprints(rows, delimiter=delim):
    """Group row IDs by fingerprint.

    Each row is split on *delimiter*; the first field is the row ID and the
    remaining fields are concatenated to form the fingerprint.

    Returns a dict mapping fingerprint -> list of IDs, in input order.
    """
    unique_fings = {}
    for row in rows:
        fields = row.split(delimiter)
        fing = "".join(fields[1:])
        # start a new ID list for an unseen fingerprint, else extend it
        unique_fings.setdefault(fing, []).append(fields[0])
    return unique_fings


def fingerprint_counts(unique_fings):
    """Return (fingerprint, count) tuples sorted by descending count."""
    fing_counts = [(fing, len(ids)) for fing, ids in unique_fings.items()]
    # sort by the 2nd element of each tuple, the frequency count
    fing_counts.sort(key=lambda pair: pair[1], reverse=True)
    return fing_counts


def main(filename=binary_patterns_filename, delimiter=delim):
    """Read *filename* and print a summary of its unique fingerprints.

    Raises SystemExit if the file contains no header line.
    """
    lines = read_lines(filename)
    if not lines:
        raise SystemExit("No header line found in %s" % filename)

    # 1st line should contain the headers
    headers = lines[0].split(delimiter)
    # Single-argument print(...) gives the same output on Python 2 and 3.
    print("All headers: %s" % (headers,))
    # 1st column should contain the name/index of pattern/isolate/etc
    print("First header: %s" % headers[0])

    # every line after the header line is one entry
    unique_fings = group_fingerprints(lines[1:], delimiter)

    # number of entries in the file vs how many unique fingerprints there are
    # (the header line is not an entry, hence the "- 1")
    print("Number of entries: %d" % (len(lines) - 1))
    print("Number of unique fingerprints: %d" % len(unique_fings))

    # how frequent each fingerprint is, most frequent first
    for fing, count in fingerprint_counts(unique_fings):
        print("Fingerprint %s appears %d times" % (fing, count))


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment