Created
February 28, 2024 19:25
-
-
Save fsndzomga/7ef9f9f4dabde91f43a4d9aab697db23 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
from dotenv import load_dotenv | |
from pydantic import BaseModel, create_model | |
from typing import List, Any, Type | |
from anonLLM.llm import OpenaiLanguageModel | |
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the API credentials the LLM client
# below needs -- confirm against the anonLLM documentation.
load_dotenv()

# Module-level LLM client shared by generate_dataset and
# generate_parts_of_dataset. It is created once at import time.
llm = OpenaiLanguageModel(anonymize=False, temperature=1)
def generate_dynamic_model(columns: List[str]) -> Type[BaseModel]:
    """
    Dynamically creates a Pydantic model class with one field per column name.

    Parameters:
    columns (List[str]): List of column names to be used as field names.

    Returns:
    Type[BaseModel]: The generated model *class* (not an instance), named
    'DynamicModel'. Every field is declared as ``(Any, ...)``, i.e. it accepts
    any value and is required.
    """
    # The ``...`` (Ellipsis) default marks each field as required.
    fields = {column: (Any, ...) for column in columns}
    return create_model('DynamicModel', **fields)
def generate_dataset(columns: List[str], rows: int) -> pd.DataFrame:
    """
    Generates a synthetic dataset by asking the LLM for one row at a time.

    Each request includes a preview of the rows generated so far (at most the
    first 500) so the model can be told to produce different values.

    Parameters:
    columns (List[str]): Column names of the dataset to generate.
    rows (int): Number of rows to generate; one LLM call is made per row.

    Returns:
    pd.DataFrame: A DataFrame with the requested columns and one row per
    LLM response (empty, but with the columns set, when ``rows`` <= 0).
    """
    # Define pydantic model on the fly
    response_format = generate_dynamic_model(columns)

    preview_limit = 500
    # Rows are accumulated in a plain list and turned into a DataFrame once at
    # the end: growing a DataFrame with pd.concat on every iteration copies
    # all previous rows each time and concatenates against an empty frame on
    # the first pass, which recent pandas versions warn about.
    records = []
    preview = pd.DataFrame(columns=columns)

    for _ in range(rows):
        prompt = (
            "Generate some synthetic data respecting the format provided\n"
            f"Should be different than values in {preview}"
        )
        data = llm.generate(prompt, output_format=response_format)
        records.append(data)

        # Only the first `preview_limit` rows are ever shown to the model, so
        # the preview stops changing once that many rows exist.
        if len(records) <= preview_limit:
            preview = pd.DataFrame(records, columns=columns)

    return pd.DataFrame(records, columns=columns)
def infer_column_type(column_values: pd.Series) -> Type[Any]:
    """
    Infers the data type for a column based on its non-missing values.

    NaN/None entries and empty strings are both ignored, matching the
    definition of a missing cell used when filling in partial datasets.

    Parameters:
    column_values (pd.Series): The column values.

    Returns:
    Type[Any]: ``int`` when every remaining value is numeric and whole,
    ``float`` when every remaining value is numeric but at least one is not
    whole, and ``str`` otherwise (including when nothing remains).
    """
    present = column_values.dropna()
    # Empty strings are placeholders for missing cells, not real values; left
    # in, they would make an otherwise numeric column look like text.
    present = present[present != '']

    # Default to str if the column is empty or only contains missing values
    if present.empty:
        return str

    # Convert once and reuse; values that cannot be parsed become NaN.
    numeric = pd.to_numeric(present, errors='coerce')
    if not numeric.notnull().all():
        return str

    # If all values are whole numbers, prefer int type
    return int if (numeric % 1 == 0).all() else float
def generate_dynamic_model_for_column(column: str, column_type: Type[Any]) -> Type[BaseModel]:  # noqa
    """
    Dynamically creates a Pydantic model class for a specific column.

    Parameters:
    column (str): The column name; used as the single field name and as a
    suffix of the generated model's name.
    column_type (Type[Any]): The data type of the column.

    Returns:
    Type[BaseModel]: The generated model *class* (not an instance), named
    'DynamicModel_<column>', with one required field of type ``column_type``.
    """
    # The ``...`` (Ellipsis) default marks the field as required.
    fields = {column: (column_type, ...)}
    return create_model(f"DynamicModel_{column}", **fields)
def _is_missing(value: Any) -> bool:
    """Return True when a cell counts as missing: NaN/None or an empty string."""
    return bool(pd.isna(value) or value == '')


def generate_parts_of_dataset(input_df: pd.DataFrame) -> pd.DataFrame:
    """
    Fills every missing cell of a DataFrame with an LLM-generated value.

    A cell is missing when it is NaN/None or an empty string. For each such
    cell the LLM receives the other non-missing values of the same row and up
    to three distinct non-missing values from the same column.

    Parameters:
    input_df (pd.DataFrame): The partially filled dataset. It is modified in
    place.

    Returns:
    pd.DataFrame: The same ``input_df`` object, with missing cells filled.
    """
    for row_index in input_df.index:
        for col in input_df.columns:
            if not _is_missing(input_df.at[row_index, col]):
                continue

            column_type = infer_column_type(input_df[col])
            response_format = generate_dynamic_model_for_column(col, column_type)  # noqa

            # Prepare context for generation from the *current* state of the
            # row, so values filled earlier in this same row are included.
            # Empty-string placeholders are excluded just like NaN.
            row_context = {
                key: val
                for key, val in input_df.loc[row_index].items()
                if key != col and not _is_missing(val)
            }

            # Up to 3 unique, non-missing examples from the column
            known_values = input_df[col].dropna()
            known_values = known_values[known_values != '']
            column_examples = known_values.unique().tolist()[:3]

            prompt = f"Given the context {row_context} and examples {column_examples} in column '{col}', generate a synthetic {col} value."  # noqa
            synthetic_data = llm.generate(
                prompt,
                output_format=response_format
            )

            # Directly access the value using the key
            input_df.at[row_index, col] = synthetic_data[col]

    return input_df
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment