-
-
Save datavudeja/5cd1ba7cc3b2a540a0090cb09099b5d3 to your computer and use it in GitHub Desktop.
A class to check and validate a pandas DataFrame using custom and default functions.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import string | |
| import re | |
| from typing import Any, List, Dict | |
class GracefulKeyFormatter(string.Formatter):
    """
    A ``string.Formatter`` that tolerates missing fields and tidies underscores.

    Placeholders that cannot be resolved are rendered as an empty string instead
    of raising, and the formatted result has every run of underscores collapsed
    to a single underscore with leading and trailing underscores removed.
    """

    def get_value(self, key: Any, args: List[Any], kwargs: Dict[str, Any]) -> Any:
        """
        Retrieve the value for a replacement field, falling back to an empty string.

        Parameters:
            key (Any): The field key. ``string.Formatter`` passes an ``int`` for
                positional or auto-numbered fields (``{0}``, ``{}``) and a ``str``
                for named fields (``{name}``).
            args (List[Any]): Positional arguments supplied to ``format``.
            kwargs (Dict[str, Any]): Keyword arguments supplied to ``format``.

        Returns:
            Any: The matching positional or keyword value, or an empty string if
                the index is out of range or the name is not present.
        """
        if isinstance(key, int):
            # Positional fields must be looked up in ``args``; looking them up in
            # ``kwargs`` would always miss and silently drop a supplied argument.
            return args[key] if 0 <= key < len(args) else ""
        return kwargs.get(key, "")

    def vformat(self, format_string: str, args: List[Any], kwargs: Dict[str, Any]) -> str:
        """
        Format the given format string and post-process the result.

        Parameters:
            format_string (str): The format string containing placeholders.
            args (List[Any]): Positional arguments for formatting.
            kwargs (Dict[str, Any]): Keyword arguments for formatting.

        Returns:
            str: The formatted string with any sequence of multiple underscores
                replaced by a single underscore and stripped of leading and
                trailing underscores.
        """
        result = super().vformat(format_string, args, kwargs)
        # Missing fields leave behind doubled separators such as "a__b"; collapse them.
        result = re.sub(r"_+", "_", result)
        # A missing first or last field leaves a dangling separator; drop it.
        return result.strip("_")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from abc import ABC, abstractmethod | |
| from typing import Any, Callable, Dict | |
class IDataFrameValidator(ABC):
    """
    Abstract base class for DataFrame validators.

    Concrete validators must implement ``__init__``, ``validate_rules`` and
    ``validate``. The pandas types in the signatures are written as string
    forward references so that defining this class does not require ``pd`` to
    be bound at class-creation time.
    """

    @abstractmethod
    def __init__(
        self, df: "pd.DataFrame", custom_functions: Dict[str, Callable[..., Any]] = None
    ):
        """
        Initialize the validator with a DataFrame.

        Parameters:
            df (pd.DataFrame): The DataFrame to validate.
            custom_functions (Dict[str, Callable[..., Any]], optional): Extra
                validation functions keyed by name. Defaults to None.
        """

    @abstractmethod
    def validate_rules(self, rules: Dict[str, Any]) -> bool:
        """
        Validate the provided rules against a defined schema.

        Parameters:
            rules (Dict): The rules to validate.

        Returns:
            bool: True if the rules are valid, False otherwise.
        """

    @abstractmethod
    def validate(self, rules: Dict[str, Any]) -> "pd.DataFrame":
        """
        Apply the defined rules to the DataFrame and validate its values.

        Parameters:
            rules (Dict): A dictionary containing the rules to apply.

        Returns:
            pd.DataFrame: The DataFrame with additional columns indicating the
                validation results.
        """
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from typing import Any, Callable, Dict | |
| import pandas as pd | |
| from jsonschema import Draft7Validator, exceptions | |
class PandasDataFrameValidator(IDataFrameValidator):
    """
    A class to check and validate pandas DataFrame based on custom and default functions.

    Each rule names a column, a check function and the expected result
    (``fact``). Applying a rule adds one column to the DataFrame holding, per
    row, 1 (check result equals ``fact``), 0 (it does not) or -1 (the check
    raised an exception).
    """

    @staticmethod
    def is_number(value: Any) -> bool:
        """
        Check if the given value is a number.

        Parameters:
            value (Any): The value to check.

        Returns:
            bool: True if the value is an ``int`` or ``float`` instance, False
                otherwise. Note that ``bool`` is a subclass of ``int`` and is
                therefore reported as a number.
        """
        return isinstance(value, (int, float))

    @staticmethod
    def is_string(value: Any) -> bool:
        """
        Check if the given value is a string.

        Parameters:
            value (Any): The value to check.

        Returns:
            bool: True if the value is a string, False otherwise.
        """
        return isinstance(value, str)

    @classmethod
    def default_functions(cls) -> Dict[str, Callable[..., Any]]:
        """
        Return a dictionary of default functions.

        Returns:
            Dict[str, Callable[..., Any]]: A dictionary of function names mapped
                to their corresponding static methods.
        """
        return {
            "is_number": cls.is_number,
            "is_string": cls.is_string,
            # Add more default functions as needed
        }

    # Inside the class body ``is_number`` / ``is_string`` are still raw
    # ``staticmethod`` objects, which are not callable before Python 3.10.
    # Store the underlying functions so the defaults work on every version
    # (otherwise each call raises TypeError and every row is reported as -1).
    DEFAULT_FUNCTIONS = {
        "is_number": is_number.__func__,
        "is_string": is_string.__func__,
        # Add more default functions as needed
    }

    # JSON Schema (Draft 7) describing the accepted shape of the rules document.
    DEFAULT_RULES_SCHEMA = {
        "type": "object",
        "properties": {
            "rules": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "name": {"type": "string"},
                        "column": {"type": "string"},
                        "function": {"type": "string"},
                        "fact": {"type": "boolean"},
                        "format": {"type": "string"},
                        "format_kwargs": {
                            "type": "object",
                            "additionalProperties": {"type": "string"},
                        },
                    },
                    "required": ["name", "column", "function", "fact"],
                    "additionalProperties": False,  # This ensures that additional properties cause a validation error
                },
            }
        },
        "required": ["rules"],
    }

    def __init__(
        self,
        df: pd.DataFrame,
        custom_functions: Dict[str, Callable[..., Any]] = None,
        custom_rules_schema: Dict[str, Any] = None,
        formatter: string.Formatter = None,
    ):
        """
        Initialize the PandasDataFrameValidator with a DataFrame, custom functions, and a formatter.

        Parameters:
            df (pd.DataFrame): The DataFrame to validate. It is stored by
                reference and modified in place by ``validate``.
            custom_functions (Dict[str, Callable[..., Any]], optional): A dictionary
                of custom functions. Entries override defaults with the same
                name. Defaults to None.
            custom_rules_schema (Dict[str, Any], optional): A JSON schema used
                instead of ``DEFAULT_RULES_SCHEMA``. Defaults to None.
            formatter (string.Formatter, optional): A custom formatter for column
                names. Defaults to GracefulKeyFormatter.
        """
        self.df = df
        self.formatter = formatter or GracefulKeyFormatter()
        self.rules_schema = custom_rules_schema or self.DEFAULT_RULES_SCHEMA
        # Merge default functions with custom functions
        self.functions = {**self.DEFAULT_FUNCTIONS, **(custom_functions or {})}

    def validate_rules(self, rules: Dict) -> bool:
        """
        Validate the provided rules against the defined self.rules_schema.

        Parameters:
            rules (Dict): The rules to validate.

        Returns:
            bool: True if the rules are valid. Invalid rules never produce a
                False return; they raise instead.

        Raises:
            jsonschema.exceptions.ValidationError: If the rules do not match the
                schema. The message lists every schema violation found.
        """
        validator = Draft7Validator(self.rules_schema)
        errors = [error.message for error in validator.iter_errors(rules)]
        if errors:
            error_messages = "\n".join(errors)
            raise exceptions.ValidationError(
                f"Provided rules do not match the expected schema. Errors:\n{error_messages}"
            )
        return True

    def validate(self, rules: Dict) -> pd.DataFrame:
        """
        Check the rules against the schema, then apply them to the DataFrame.

        Parameters:
            rules (Dict): A dictionary containing the rules to apply.

        Returns:
            pd.DataFrame: The DataFrame (the same object held in ``self.df``)
                with additional columns indicating the validation results.

        Raises:
            jsonschema.exceptions.ValidationError: If the rules do not match the schema.
            ValueError: If a rule names a function that is not registered.
        """
        self.validate_rules(rules=rules)
        for rule in rules["rules"]:
            column = rule["column"]
            function_name = rule["function"]
            rule_name = rule["name"]
            fact = rule["fact"]
            format_str = rule.get("format", "{rule_name}_valid")
            # Build a fresh mapping so the caller's rule dictionary is not mutated.
            format_kwargs = {**rule.get("format_kwargs", {}), "rule_name": rule_name}

            function = self.functions.get(function_name)
            if function is None:
                raise ValueError(f"Function '{function_name}' not found.")

            column_name = self.formatter.format(format_str, **format_kwargs)
            self.df[column_name] = self.df[column].apply(
                self._apply_function_wrapper(function, fact)
            )
        return self.df

    @staticmethod
    def _apply_function_wrapper(
        function: Callable[..., Any], fact: Any
    ) -> Callable[..., int]:
        """
        Return a wrapper function to apply the given function and compare its result with the expected fact.

        Parameters:
            function (Callable[..., Any]): The function to apply.
            fact (Any): The expected result.

        Returns:
            Callable[..., int]: The wrapper function. It returns 1 when the
                function's result equals ``fact``, 0 when it does not, and -1
                when applying or comparing raised an exception.
        """

        def wrapper(value: Any) -> int:
            try:
                return 1 if function(value) == fact else 0
            except Exception:
                # Deliberate best effort: a failing check marks the row, it
                # does not abort validation of the whole column.
                return -1

        return wrapper
| # Usage: | |
if __name__ == "__main__":

    def is_uppercase(value: Any) -> bool:
        """
        Report whether the string form of a value is entirely upper case.

        Parameters:
            value (Any): The value to check.

        Returns:
            bool: True if ``str(value)`` is in uppercase, False otherwise.
        """
        text = str(value)
        return text.isupper()

    # Sample records; non-numeric ages are scored 0 by the "is_number" rule.
    records = [
        {"name": "John", "age": 30},
        {"name": "Alice", "age": "thirty"},
        {"name": "Bob", "age": 40},
        {"name": "bob lowwer", "age": "250"},
    ]
    data = pd.DataFrame(records)

    # A well-formed rule set: the first rule customises its result column
    # name, the other two use the default "{rule_name}_valid" pattern.
    rules = {
        "rules": [
            {
                "name": "age_is_number",
                "column": "age",
                "function": "is_number",
                "fact": True,
                "format": "{rule_name}_{suffix}__{test}",
                "format_kwargs": {
                    "suffix": "validation_result",
                },
            },
            {
                "name": "name_is_string",
                "column": "name",
                "function": "is_string",
                "fact": True,
            },
            {
                "name": "name_is_uppercase",
                "column": "name",
                "function": "is_uppercase",
                "fact": True,
            },
        ],
    }

    validator = PandasDataFrameValidator(
        data, custom_functions={"is_uppercase": is_uppercase}
    )
    validated_data = validator.validate(rules)
    print(validated_data)

    # A malformed rule set: two rules carry keys the schema does not allow,
    # so the final call raises a schema validation error.
    invalid_rules = {
        "rules": [
            {
                "invalid_key": "value1",
                "name": "age_is_number",
                "column": "age",
                "function": "is_number",
                "fact": True,
                "format": "{rule_name}_{suffix}__{test}",
                "format_kwargs": {
                    "suffix": "validation_result",
                },
            },
            {
                "another_invalid_key": "value1",
                "name": "name_is_string",
                "column": "name",
                "function": "is_string",
                "fact": True,
            },
            {
                "name": "name_is_uppercase",
                "column": "name",
                "function": "is_uppercase",
                "fact": True,
            },
        ],
    }
    validator.validate_rules(rules=invalid_rules)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| pandas | |
| jsonschema |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment