Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save datavudeja/5cd1ba7cc3b2a540a0090cb09099b5d3 to your computer and use it in GitHub Desktop.

Select an option

Save datavudeja/5cd1ba7cc3b2a540a0090cb09099b5d3 to your computer and use it in GitHub Desktop.
A class to check and validate a pandas DataFrame based on custom and default functions.
import string
import re
from typing import Any, List, Dict
class GracefulKeyFormatter(string.Formatter):
    """
    A ``string.Formatter`` that tolerates missing fields and tidies underscores.

    Fields that cannot be resolved are rendered as an empty string instead of
    raising, and the formatted result has every run of underscores collapsed
    into one and leading/trailing underscores removed.
    """

    # Compiled once at class creation; matches any run of one or more underscores.
    _UNDERSCORE_RUN = re.compile(r"_+")

    def get_value(self, key: Any, args: List[Any], kwargs: Dict[str, Any]) -> Any:
        """
        Retrieve the value for a replacement field, defaulting to an empty string.

        Parameters:
            key (int | str): The field to look up. ``string.Formatter`` passes an
                int for positional / auto-numbered fields (``{0}``, ``{}``) and a
                str for named fields (``{name}``).
            args (List[Any]): Positional arguments, consulted for int keys.
            kwargs (Dict[str, Any]): Keyword arguments, consulted for str keys.

        Returns:
            Any: The matching value, or an empty string if the positional index is
            out of range or the keyword is not present.
        """
        if isinstance(key, int):
            # Positional field: use the matching argument when there is one,
            # otherwise degrade gracefully like a missing keyword does.
            return args[key] if key < len(args) else ""
        return kwargs.get(key, "")

    def vformat(self, format_string: str, args: List[Any], kwargs: Dict[str, Any]) -> str:
        """
        Format the given format string and post-process the result.

        Parameters:
            format_string (str): The format string containing placeholders.
            args (List[Any]): Positional arguments for formatting.
            kwargs (Dict[str, Any]): Keyword arguments for formatting.

        Returns:
            str: The formatted string with any sequence of multiple underscores
            replaced by a single underscore and stripped of leading and trailing
            underscores. The clean-up applies to the whole result, including
            underscores that came from substituted values.
        """
        result = super().vformat(format_string, args, kwargs)
        # Missing fields leave behind doubled separators such as "a__b"; collapse them.
        result = self._UNDERSCORE_RUN.sub("_", result)
        # Strip separators left dangling at either end.
        return result.strip("_")
from abc import ABC, abstractmethod
from typing import Any, Callable, Dict
class IDataFrameValidator(ABC):
    """
    Abstract base class for DataFrame validators.

    The pandas types in the signatures are written as string (forward-reference)
    annotations so that defining this interface does not require ``pd`` to be
    bound at class-definition time.
    """

    @abstractmethod
    def __init__(
        self,
        df: "pd.DataFrame",
        custom_functions: Dict[str, Callable[..., Any]] = None,
    ):
        """
        Initialize the validator with a DataFrame.

        Parameters:
            df (pd.DataFrame): The DataFrame to validate.
            custom_functions (Dict[str, Callable[..., Any]], optional): Extra
                validation functions keyed by name. Defaults to None.
        """
        pass

    @abstractmethod
    def validate_rules(self, rules: Dict[str, Any]) -> bool:
        """
        Validate the provided rules against a defined schema.

        Parameters:
            rules (Dict): The rules to validate.

        Returns:
            bool: True if the rules are valid, False otherwise.
        """
        pass

    @abstractmethod
    def validate(self, rules: Dict[str, Any]) -> "pd.DataFrame":
        """
        Apply the defined rules to the DataFrame and validate its values.

        Parameters:
            rules (Dict): A dictionary containing the rules to apply.

        Returns:
            pd.DataFrame: The DataFrame with additional columns indicating the validation results.
        """
        pass
from typing import Any, Callable, Dict
import pandas as pd
from jsonschema import Draft7Validator, exceptions
class PandasDataFrameValidator(IDataFrameValidator):
    """
    A class to check and validate pandas DataFrame based on custom and default functions.

    Each rule names a column, a registered function and an expected result
    (``fact``). Applying a rule adds one result column to the DataFrame holding
    1 (function result equals ``fact``), 0 (it does not) or -1 (the function
    raised) for every row.
    """

    @staticmethod
    def is_number(value: Any) -> bool:
        """
        Check if the given value is a number.

        Parameters:
            value (Any): The value to check.

        Returns:
            bool: True if the value is an ``int`` or ``float`` instance, False otherwise.
        """
        return isinstance(value, (int, float))

    @staticmethod
    def is_string(value: Any) -> bool:
        """
        Check if the given value is a string.

        Parameters:
            value (Any): The value to check.

        Returns:
            bool: True if the value is a string, False otherwise.
        """
        return isinstance(value, str)

    @classmethod
    def default_functions(cls) -> Dict[str, Callable[..., Any]]:
        """
        Return a dictionary of default functions.

        Returns:
            Dict[str, Callable[..., Any]]: A dictionary of function names mapped to their corresponding static methods.
        """
        return {
            "is_number": cls.is_number,
            "is_string": cls.is_string,
            # Add more default functions as needed
        }

    # Inside the class body the names above are still raw ``staticmethod``
    # objects, which are not callable before Python 3.10. Store the underlying
    # plain functions so the defaults work on every supported interpreter.
    DEFAULT_FUNCTIONS = {
        "is_number": is_number.__func__,
        "is_string": is_string.__func__,
        # Add more default functions as needed
    }

    DEFAULT_RULES_SCHEMA = {
        "type": "object",
        "properties": {
            "rules": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "name": {"type": "string"},
                        "column": {"type": "string"},
                        "function": {"type": "string"},
                        "fact": {"type": "boolean"},
                        "format": {"type": "string"},
                        "format_kwargs": {
                            "type": "object",
                            "additionalProperties": {"type": "string"},
                        },
                    },
                    "required": ["name", "column", "function", "fact"],
                    "additionalProperties": False,  # This ensures that additional properties cause a validation error
                },
            }
        },
        "required": ["rules"],
    }

    def __init__(
        self,
        df: pd.DataFrame,
        custom_functions: Dict[str, Callable[..., Any]] = None,
        custom_rules_schema: Dict[str, Any] = None,
        formatter: string.Formatter = None,
    ):
        """
        Initialize the PandasDataFrameValidator with a DataFrame, custom functions, and a formatter.

        Parameters:
            df (pd.DataFrame): The DataFrame to validate. It is stored by reference and
                result columns are added to it in place by ``validate``.
            custom_functions (Dict[str, Callable[..., Any]], optional): A dictionary of custom
                functions; entries override default functions of the same name. Defaults to None.
            custom_rules_schema (Dict[str, Any], optional): A JSON schema used instead of
                ``DEFAULT_RULES_SCHEMA`` to validate rules. Defaults to None.
            formatter (string.Formatter, optional): A custom formatter for column names. Defaults to GracefulKeyFormatter.
        """
        self.df = df
        self.formatter = formatter or GracefulKeyFormatter()
        self.rules_schema = custom_rules_schema or self.DEFAULT_RULES_SCHEMA
        # Merge default functions with custom functions (custom entries win on name clashes)
        self.functions = {**self.DEFAULT_FUNCTIONS, **(custom_functions or {})}

    def validate_rules(self, rules: Dict) -> bool:
        """
        Validate the provided rules against the defined self.rules_schema.

        Parameters:
            rules (Dict): The rules to validate.

        Returns:
            bool: True if the rules are valid. Invalid rules never return; they raise.

        Raises:
            jsonschema.exceptions.ValidationError: If the rules do not match the schema.
                The message lists every schema violation, one per line.
        """
        validator = Draft7Validator(self.rules_schema)
        # Collect all violations so the caller sees every problem at once.
        errors = [error.message for error in validator.iter_errors(rules)]
        if errors:
            error_messages = "\n".join(errors)
            raise exceptions.ValidationError(
                f"Provided rules do not match the expected schema. Errors:\n{error_messages}"
            )
        return True

    def validate(self, rules: Dict) -> pd.DataFrame:
        """
        Check the rules against the schema, then apply them to the DataFrame.

        For each rule one result column is added to ``self.df``. Its name is built by
        formatting the rule's ``format`` string (default ``"{rule_name}_valid"``) with the
        rule's ``format_kwargs`` plus ``rule_name``.

        Parameters:
            rules (Dict): A dictionary containing the rules to apply. It is not modified.

        Returns:
            pd.DataFrame: The DataFrame with additional columns indicating the validation results.

        Raises:
            jsonschema.exceptions.ValidationError: If the rules do not match the schema.
            ValueError: If a rule names a function that is not registered.
        """
        self.validate_rules(rules=rules)
        for rule in rules["rules"]:
            column = rule["column"]
            function_name = rule["function"]
            rule_name = rule["name"]
            fact = rule["fact"]
            format_str = rule.get("format", "{rule_name}_valid")
            # Build a fresh mapping so the caller's rule dictionary is left untouched.
            format_kwargs = {**rule.get("format_kwargs", {}), "rule_name": rule_name}
            function = self.functions.get(function_name)
            if function is None:
                raise ValueError(f"Function '{function_name}' not found.")
            column_name = self.formatter.format(format_str, **format_kwargs)
            self.df[column_name] = self.df[column].apply(
                self._apply_function_wrapper(function, fact)
            )
        return self.df

    @staticmethod
    def _apply_function_wrapper(
        function: Callable[..., Any], fact: Any
    ) -> Callable[..., int]:
        """
        Return a wrapper function to apply the given function and compare its result with the expected fact.

        Parameters:
            function (Callable[..., Any]): The function to apply.
            fact (Any): The expected result.

        Returns:
            Callable[..., int]: The wrapper function. It returns 1 when the result equals
            ``fact``, 0 when it does not, and -1 when the call or comparison raises.
        """

        def wrapper(value: Any) -> int:
            try:
                result = function(value)
                return 1 if result == fact else 0
            except Exception:
                # Best effort by design: a failing check marks the row, it does not abort the run.
                return -1

        return wrapper
# Usage:
if __name__ == "__main__":
    # Define custom function
    def is_uppercase(value: Any) -> bool:
        """
        Check if the given value, when converted to a string, is in uppercase.

        Parameters:
            value (Any): The value to check.

        Returns:
            bool: True if the value is in uppercase, False otherwise.
        """
        return str(value).isupper()

    # Define your rules
    rules = {
        "rules": [
            {
                "name": "age_is_number",
                "column": "age",
                "function": "is_number",
                "fact": True,
                # "{test}" has no matching format_kwargs entry on purpose: the
                # default formatter renders it as empty and tidies the underscores.
                "format": "{rule_name}_{suffix}__{test}",
                "format_kwargs": {
                    "suffix": "validation_result",
                },
            },
            {
                "name": "name_is_string",
                "column": "name",
                "function": "is_string",
                "fact": True,
            },
            {
                "name": "name_is_uppercase",
                "column": "name",
                "function": "is_uppercase",
                "fact": True,
            },
        ],
    }
    data = pd.DataFrame(
        [
            {"name": "John", "age": 30},
            {"name": "Alice", "age": "thirty"},  # Not a number: the age rule yields 0 for this row
            {"name": "Bob", "age": 40},
            {"name": "bob lowwer", "age": "250"},
        ]
    )
    validator = PandasDataFrameValidator(
        data, custom_functions={"is_uppercase": is_uppercase}
    )
    validated_data = validator.validate(rules)
    print(validated_data)

    # Define your invalid rules (the extra keys are rejected by the schema)
    invalid_rules = {
        "rules": [
            {
                "invalid_key": "value1",
                "name": "age_is_number",
                "column": "age",
                "function": "is_number",
                "fact": True,
                "format": "{rule_name}_{suffix}__{test}",
                "format_kwargs": {
                    "suffix": "validation_result",
                },
            },
            {
                "another_invalid_key": "value1",
                "name": "name_is_string",
                "column": "name",
                "function": "is_string",
                "fact": True,
            },
            {
                "name": "name_is_uppercase",
                "column": "name",
                "function": "is_uppercase",
                "fact": True,
            },
        ],
    }
    # The failure is the point of this part of the demo: report it instead of
    # letting the script die with an unhandled traceback.
    try:
        validator.validate_rules(rules=invalid_rules)
    except exceptions.ValidationError as error:
        print(f"Rule validation failed as expected:\n{error.message}")
pandas
jsonschema
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment