Last active
May 26, 2023 15:49
-
-
Save bede/c2cd27a12add680648fde39c427ae752 to your computer and use it in GitHub Desktop.
Pandera MWE – I want a single failure case when `region_is_valid` fails, indicating the `sample_name` of the row that failed (cDNA-VOC-1-v4-1).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from io import StringIO | |
import pandas as pd | |
import pandera as pa | |
import pandera.extensions as extensions | |
from pandera.typing import Index, Series | |
# Two-row sample input; the first row pairs a French region with the USA.
csv_string = """
sample_name,country,region
cDNA-VOC-1-v4-1,USA,Bretagne
cDNA-VOC-1-v4-2,USA,Texas
"""

# Bretagne is not in the USA, so the first row should fail the check.
# Maps each country to the list of regions accepted for it.
countries_subdivisions = {
    "USA": ["Texas"],
    "France": ["Bretagne"],
}
# Every country that appears as a key of the lookup table.
COUNTRIES_ALPHA_3 = set(countries_subdivisions)
# Every region of every country, flattened into a single set.
REGIONS = {
    region
    for regions in countries_subdivisions.values()
    for region in regions
}
@extensions.register_check_method()
def region_is_valid(df):
    """
    Check, row by row, that ``region`` belongs to the row's ``country``.

    The lookup table is the module-level ``countries_subdivisions`` mapping
    (country -> list of regions).

    A row passes when its region is missing (NaN/None) or is listed under
    its country. A row fails when its region is set but is not listed under
    its country, including when the country itself is absent from the
    lookup table.

    Returns the result of ``df.apply(..., axis=1)``: one boolean per row,
    aligned on the DataFrame index.
    """

    def validate_region(row):
        region = row["region"]
        # A missing region is always acceptable.
        if pd.isna(region):
            return True
        # Default to an empty tuple so a country missing from the table
        # yields a failed row instead of ``TypeError`` from ``x in None``.
        return region in countries_subdivisions.get(row["country"], ())

    return df.apply(validate_region, axis=1)
# @extensions.register_check_method() | |
# def is_texas(df): | |
# return df['region'] == 'Texas' | |
class BaseSchema(pa.SchemaModel):
    """
    Schema for generic GPAS upload CSVs.

    The index holds the sample name; ``country`` and ``region`` are string
    columns restricted to the values in ``COUNTRIES_ALPHA_3`` and
    ``REGIONS``. Only ``region`` may be null.
    """

    # Index: unique, non-null, limited to letters, digits, '.', '_' and '-'.
    sample_name: Index[str] = pa.Field(
        nullable=False,
        coerce=True,
        unique=True,
        str_matches=r"^[A-Za-z0-9._-]+$",
    )
    # Must be one of the countries in the lookup table.
    country: Series[str] = pa.Field(
        nullable=False,
        coerce=True,
        isin=COUNTRIES_ALPHA_3,
    )
    # Optional; when present must be a region of some country in the table.
    region: Series[str] = pa.Field(
        coerce=True,
        isin=REGIONS,
        nullable=True,
    )

    class Config:
        # Enables the registered dataframe-level check of the same name;
        # the empty tuple presumably means "no extra arguments" -- confirm
        # against the pandera docs.
        region_is_valid = ()
        # is_texas = ()
def main():
    """Validate the sample CSV against BaseSchema and print any failure cases."""
    # The first CSV column (sample_name) becomes the DataFrame index.
    frame = pd.read_csv(StringIO(csv_string), index_col=0)
    try:
        # lazy=True is passed so that failures are raised together as
        # SchemaErrors, which carries the ``failure_cases`` table.
        BaseSchema.validate(frame, lazy=True)
    except pa.errors.SchemaErrors as schema_errors:
        print(schema_errors.failure_cases)


if __name__ == '__main__':
    main()
Clearly I could do `e.failure_cases.groupby('index')['check'].unique()`, but perhaps there is a Right Way to do this :) @cosmicBboy
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This script outputs the following:
I'd like to have a single failure case per failing row – is this possible?