Skip to content

Instantly share code, notes, and snippets.

@grischard
Created November 4, 2018 18:24
Show Gist options
  • Save grischard/aaf01f2547d42e4dce0df9efacfdb313 to your computer and use it in GitHub Desktop.
Save grischard/aaf01f2547d42e4dce0df9efacfdb313 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""
Process diffs between Luxembourg address dumps
TODO
-- notice if an address has changed street, postcode or house number
DONE
-- throw out duplicate id_caclr_bat
-- mark as deleted if id_caclr_bat is gone in new dataset
-- mark as new if id_caclr_bat wasn't in old dataset
-- if data has changed, show as modified (throw out the ones that are the same!)
-- filter address moves of less than 1m
"""
import sys
import argparse
import pandas as pd
import datacompy
import math
def main(f1, f2, delfile, min_distance=1.0):
    """Print the addresses that moved the most between two address dumps.

    Both CSV dumps are loaded, rows with missing values are dropped, and
    every row whose ``id_caclr_bat`` occurs more than once is discarded.
    The remaining rows are paired on ``id_geoportail`` with datacompy, the
    planar distance between the old and new coordinates is computed, and
    the five largest moves above ``min_distance`` are printed.

    Args:
        f1: Path or open file object of the old CSV dump.
        f2: Path or open file object of the new CSV dump.
        delfile: Writable file for deleted addresses. Accepted for
            interface compatibility; nothing is written to it yet.
        min_distance: Moves of this size or smaller are ignored. The
            default of 1.0 implements the "less than 1m" filter.
    """

    def load(source):
        # Drop incomplete rows, then drop *all* rows sharing an
        # id_caclr_bat (keep=False keeps none of the duplicates).
        frame = pd.read_csv(source, engine="c").dropna()
        return frame.drop_duplicates(subset=["id_caclr_bat"], keep=False)

    # NOTE(review): duplicates are removed on id_caclr_bat but rows are
    # paired on id_geoportail -- confirm that this is intentional.
    compare = datacompy.Compare(
        load(f1),
        load(f2),
        join_columns="id_geoportail",
        df1_name="Old",
        df2_name="New",
    )

    # TODO: compare.df1_unq_rows (rows only in the old dump) and
    # compare.df2_unq_rows (rows only in the new dump) are the deleted and
    # new addresses; they are not written anywhere yet.

    # Compare only positions for now.
    # NOTE(review): the _df1/_df2 column suffixes are what this script
    # expects from datacompy -- confirm against the installed version.
    positions = compare.intersect_rows[
        [
            "id_geoportail",
            "coord_nord_luref_df1",
            "coord_nord_luref_df2",
            "coord_est_luref_df1",
            "coord_est_luref_df2",
        ]
    ]

    # Vectorised Euclidean distance between old and new position.
    # assign() returns a new frame, so no column is set on a slice.
    delta_north = positions["coord_nord_luref_df2"] - positions["coord_nord_luref_df1"]
    delta_east = positions["coord_est_luref_df2"] - positions["coord_est_luref_df1"]
    positions = positions.assign(
        distance=(delta_north ** 2 + delta_east ** 2) ** 0.5
    )

    # Threshold on the distance itself. Unchanged positions have distance 0
    # and fall out here too, so no separate "both sides match" filter is
    # needed, and moves along either axis are kept.
    moved = positions.loc[positions["distance"] > min_distance]
    print(moved.sort_values(by="distance", ascending=False).head())
if __name__ == "__main__":
    # Command line: two input dumps (old first, then new), followed by two
    # optional output files that each default to standard output.
    cli = argparse.ArgumentParser()
    cli.add_argument("infile", nargs=2, type=argparse.FileType("r"))
    for optional_output in ("delfile", "newfile"):
        cli.add_argument(
            optional_output,
            nargs="?",
            type=argparse.FileType("w"),
            default=sys.stdout,
        )
    options = cli.parse_args()

    old_dump, new_dump = options.infile
    # newfile is parsed but not passed on to main().
    main(old_dump, new_dump, options.delfile)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment