Created
September 15, 2020 00:44
-
-
Save shiumachi/cbb25843f3302ca18027d0928aa1d217 to your computer and use it in GitHub Desktop.
Simple validation function for a list of pandas.DataFrame objects
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pathlib
import typing

import pandas as pd
# Data definition: small in-memory frames standing in for parsed CSV files.

# Valid data: integer "c1" column and string "c2" column.
df1 = pd.DataFrame(
    [
        {"c1": 100, "c2": "a100"},
        {"c1": 101, "c2": "a101"},
    ]
)

# Valid data: same column names and value types as df1.
df2 = pd.DataFrame(
    [
        {"c1": 200, "c2": "a200"},
        {"c1": 202, "c2": "a202"},
    ]
)

# Invalid data: the second column is named "c3" instead of "c2", so the
# column list does not match df1/df2.
df3 = pd.DataFrame(
    [
        {"c1": 300, "c3": "a300"},
        {"c1": 301, "c3": "a301"},
    ]
)

# Invalid data: "c1" mixes a string ("400") with an integer (401), so its
# dtype differs from the all-integer "c1" of df1/df2.
df4 = pd.DataFrame(
    [
        {"c1": "400", "c2": "a400"},
        {"c1": 401, "c2": "a401"},
    ]
)

# Dataset test case 1: the last frame has mismatched column names.
data1 = [
    (pathlib.Path("file1.csv"), df1),
    (pathlib.Path("file2.csv"), df2),
    (pathlib.Path("file3.csv"), df3),
]

# Dataset test case 2: the last frame has an inconsistent dtype in "c1".
data2 = [
    (pathlib.Path("file1.csv"), df1),
    (pathlib.Path("file2.csv"), df2),
    (pathlib.Path("file4.csv"), df4),
]
def validate(
    data: typing.Sequence[typing.Tuple[pathlib.Path, pd.DataFrame]]
) -> typing.List[pd.DataFrame]:
    """Check that consecutive DataFrames share the same columns and dtypes.

    Each entry is compared with the entry that follows it. Because both
    checks are equality checks, passing every adjacent comparison means all
    frames agree with one another. Empty and single-element inputs have no
    pairs to compare and therefore always pass.

    :param data: sequence of ``(path, df)`` pairs; the path is only used to
        identify the offending inputs in error messages
    :return: a new list holding the DataFrames of ``data`` in input order
    :raises ValueError: if two adjacent frames have different column lists
        (names or order), or the same columns but differing dtypes
    """
    for prev, curr in zip(data, data[1:]):
        prev_path, prev_df = prev[0], prev[1]
        curr_path, curr_df = curr[0], curr[1]

        if prev_df.columns.tolist() != curr_df.columns.tolist():
            raise ValueError(f"ambiguous columns: {prev_path}, {curr_path}")

        # The column lists are equal at this point, so the two dtype series
        # have the same length and line up position by position.
        for prev_dtype, curr_dtype in zip(prev_df.dtypes, curr_df.dtypes):
            if prev_dtype != curr_dtype:
                raise ValueError(
                    f"inconsistent dtypes: {prev_dtype}, {curr_dtype} "
                    f"in {prev_path}, {curr_path}"
                )

    return [entry[1] for entry in data]
# Demo 1: data1 includes a frame whose column names differ, so validate()
# is expected to raise before pd.concat runs; the error message is printed.
print("### validate ambiguous columns demo ###")
try:
    pd.concat(validate(data1))
except ValueError as ve:
    print(ve)

# Demo 2: data2 includes a frame whose "c1" dtype differs, so validate()
# is expected to raise here as well.
print("### validate inconsistent dtypes demo ###")
try:
    pd.concat(validate(data2))
except ValueError as ve:
    print(ve)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment