Created
November 4, 2018 18:24
-
-
Save grischard/aaf01f2547d42e4dce0df9efacfdb313 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""
Process diffs between Luxembourg address dumps
TODO
-- notice if an address has changed street, postcode or house number
DONE
-- throw out duplicate id_caclr_bat
-- mark as deleted if id_caclr_bat is gone in new dataset
-- mark as new if id_caclr_bat wasn't in old dataset
-- if data has changed, show as modified (throw out the ones that are the same!)
-- filter address moves of less than 1m
"""
import sys | |
import argparse | |
import pandas as pd | |
import datacompy | |
import math | |
def main(f1, f2, delfile, min_distance=1.0):
    """Print the addresses that moved the most between two address dumps.

    Both CSV dumps are loaded and cleaned, joined on ``id_geoportail`` with
    datacompy, and for every address present in both dumps the straight-line
    distance between the old and the new coordinates is computed.  The five
    largest moves above ``min_distance`` are printed to stdout.

    Args:
        f1: Old dump; anything ``pandas.read_csv`` accepts (the script entry
            point passes an open text file).
        f2: New dump, same format as ``f1``.
        delfile: Output target intended for deleted addresses.  Accepted for
            interface compatibility; nothing is written to it yet.
        min_distance: Moves of this distance or less are ignored.  Expressed
            in the unit of the ``coord_*_luref`` columns (metres, according
            to the comments in this script).
    """
    compare = datacompy.Compare(
        _load_dump(f1),
        _load_dump(f2),
        # NOTE(review): rows are de-duplicated on id_caclr_bat but joined on
        # id_geoportail -- confirm that the two keys are interchangeable.
        join_columns="id_geoportail",
        df1_name="Old",
        df2_name="New",
    )
    # Return value is not used at the moment.
    compare.matches(ignore_extra_columns=False)

    # TODO: rows that exist in only one dump are available as
    # compare.df1_unq_rows (deleted) and compare.df2_unq_rows (new); they are
    # not reported anywhere yet.

    # Compare only positions for now.
    positions = compare.intersect_rows[
        [
            "id_geoportail",
            "coord_nord_luref_df1",
            "coord_nord_luref_df2",
            "coord_nord_luref_match",
            "coord_est_luref_df1",
            "coord_est_luref_df2",
            "coord_est_luref_match",
        ]
    ]

    # An address is unchanged only if BOTH coordinates match; checking the
    # east coordinate alone would hide pure north/south moves.
    unchanged = positions["coord_nord_luref_match"].astype(bool) & positions[
        "coord_est_luref_match"
    ].astype(bool)
    # Copy so that adding the distance column does not write into a view of
    # compare.intersect_rows.
    moved = positions.loc[~unchanged].copy()

    # Vectorised Euclidean distance; also well-defined for an empty frame.
    delta_north = moved["coord_nord_luref_df2"] - moved["coord_nord_luref_df1"]
    delta_east = moved["coord_est_luref_df2"] - moved["coord_est_luref_df1"]
    moved["distance"] = (delta_north ** 2 + delta_east ** 2) ** 0.5

    # Ignore negligible moves.  The threshold applies to the computed
    # distance, not to the boolean match flag.
    moved = moved.loc[moved["distance"] > min_distance]
    print(moved.sort_values(by="distance", ascending=False).head())


def _load_dump(source):
    """Load one CSV dump, dropping incomplete rows and ambiguous ids.

    ``keep=False`` discards every row whose ``id_caclr_bat`` occurs more than
    once, not just the surplus copies.
    """
    return (
        pd.read_csv(source, engine="c")
        .dropna()
        .drop_duplicates(subset=["id_caclr_bat"], keep=False)
    )
if __name__ == "__main__":
    # Command line: exactly two readable input files, followed by up to two
    # optional writable output files that both default to stdout.
    cli = argparse.ArgumentParser()
    cli.add_argument("infile", nargs=2, type=argparse.FileType("r"))
    for output_name in ("delfile", "newfile"):
        cli.add_argument(
            output_name,
            nargs="?",
            type=argparse.FileType("w"),
            default=sys.stdout,
        )
    options = cli.parse_args()

    first_dump, second_dump = options.infile
    # NOTE: "newfile" is parsed but not forwarded; main() only takes delfile.
    main(first_dump, second_dump, options.delfile)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment