Created
April 3, 2012 12:39
-
-
Save davidwtbuxton/2291658 to your computer and use it in GitHub Desktop.
De-duplicate UKOOA data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Python 2.7
from collections import OrderedDict

in_name = 'F87.p190'
out_name = 'my_results.txt'

# Maps each (number, x, y) key to the first line seen with that key. An
# OrderedDict is used so the lines come back out in first-seen order.
results = OrderedDict()

# You should keep your input and output files separate, makes life much easier.
# The 'U' in 'rU' enables universal new-lines, which means it doesn't matter if
# your data uses Mac, Unix or Windows new-line conventions.
# NOTE: the 'U' flag is a Python 2 idiom. Python 3 already applies universal
# new-lines with plain 'r', and Python 3.11 rejects 'U' outright.
with open(in_name, 'rU') as in_file:
    for line in in_file:
        number = line[1:12]   # The range of the line number
        x = line[25:35]       # Range for the x co-ord
        y = line[35:45]       # Range for the y co-ord
        key = (number, x, y)  # Creates a tuple to use as a unique key
        if key not in results:  # Ignore line if we already recorded it
            results[key] = line

# Now we have accumulated all the unique lines.
with open(out_name, 'w') as out_file:
    # The stored values are the whole original lines (each still carrying its
    # own new-line), so they can be written out in one go.
    out_file.writelines(results.values())
# An alternate strategy, which ought to be a little faster and use less memory.
# Might be important if you have really big data. In this version we use a set
# to record the unique keys and open the input and output files at the same time.
# If the key is not in the set already then write the line to the out_file and
# record the key in the set.
#
# NOTE: this opens out_name with 'w' again, so if both strategies are left in
# the same script this pass overwrites whatever the first one wrote. Keep only
# the strategy you want.
with open(in_name, 'rU') as in_file, open(out_name, 'w') as out_file:
    # Records unique keys as we find them
    all_keys = set()
    for line in in_file:
        # Same unique key as with the other way, just shorter to type.
        key = (line[1:12], line[25:35], line[35:45])
        # Check if we had this key already. If not, write the line and
        # remember the key so later duplicates are skipped.
        if key not in all_keys:
            out_file.write(line)
            all_keys.add(key)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment