Skip to content

Instantly share code, notes, and snippets.

@shiumachi
Created September 15, 2020 00:44
Show Gist options
  • Save shiumachi/cbb25843f3302ca18027d0928aa1d217 to your computer and use it in GitHub Desktop.
Save shiumachi/cbb25843f3302ca18027d0928aa1d217 to your computer and use it in GitHub Desktop.
Simple validation function for a list of pandas DataFrames
import pathlib
import typing
import pandas as pd
# Fixture data for the validation demo.
# Every frame has two rows; df1 and df2 share the same columns and dtypes.

# valid: integer "c1", string "c2"
df1 = pd.DataFrame({"c1": [100, 101], "c2": ["a100", "a101"]})

# valid: same layout as df1
df2 = pd.DataFrame({"c1": [200, 202], "c2": ["a200", "a202"]})

# invalid: second column is named "c3" instead of "c2"
# (validate reports this as "ambiguous columns")
df3 = pd.DataFrame({"c1": [300, 301], "c3": ["a300", "a301"]})

# invalid: "c1" mixes a str and an int, so its dtype differs from df1/df2
df4 = pd.DataFrame({"c1": ["400", 401], "c2": ["a400", "a401"]})

# test case 1: the last frame has mismatched column names
data1 = [
    (pathlib.Path(file_name), frame)
    for file_name, frame in (
        ("file1.csv", df1),
        ("file2.csv", df2),
        ("file3.csv", df3),
    )
]

# test case 2: the last frame has an inconsistent dtype
data2 = [
    (pathlib.Path(file_name), frame)
    for file_name, frame in (
        ("file1.csv", df1),
        ("file2.csv", df2),
        ("file4.csv", df4),
    )
]
def validate(
    data: typing.Sequence[typing.Tuple[pathlib.Path, pd.DataFrame]]
) -> typing.List[pd.DataFrame]:
    """Check that neighbouring DataFrames agree on columns and dtypes.

    Each entry is compared with the entry that follows it in ``data``.
    An empty or single-element ``data`` has no pairs and passes as-is.

    :param data: ``[(path, df), ...]``; ``path`` is only used to label
        the frames in error messages
    :return: the DataFrames from ``data``, in input order
    :raises ValueError: if two neighbouring frames differ in their column
        list (names or order), or in the dtype of a column at the same
        position
    """
    for left, right in zip(data, data[1:]):
        left_path, left_df = left[0], left[1]
        right_path, right_df = right[0], right[1]

        # Column lists must match exactly, including order.
        if left_df.columns.tolist() != right_df.columns.tolist():
            raise ValueError(f"ambiguous columns: {left_path}, {right_path}")

        # Columns are known to line up here, so dtypes can be compared
        # position by position.
        for left_dtype, right_dtype in zip(left_df.dtypes, right_df.dtypes):
            if left_dtype != right_dtype:
                raise ValueError(
                    f"inconsistent dtypes: {left_dtype}, {right_dtype} "
                    f"in {left_path}, {right_path}"
                )

    return [entry[1] for entry in data]
# Demo driver: run each failing dataset through validate() before
# concatenating, and print the ValueError that validation raises.
for banner, dataset in (
    ("### validate ambiguous columns demo ###", data1),
    ("### validate inconsistent dtypes demo ###", data2),
):
    print(banner)
    try:
        pd.concat(validate(dataset))
    except ValueError as ve:
        print(ve)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment