Skip to content

Instantly share code, notes, and snippets.

@fsndzomga
Created February 28, 2024 19:25
Show Gist options
  • Save fsndzomga/7ef9f9f4dabde91f43a4d9aab697db23 to your computer and use it in GitHub Desktop.
Save fsndzomga/7ef9f9f4dabde91f43a4d9aab697db23 to your computer and use it in GitHub Desktop.
import pandas as pd
from dotenv import load_dotenv
from pydantic import BaseModel, create_model
from typing import List, Any, Type
from anonLLM.llm import OpenaiLanguageModel
# Load environment variables from a local .env file before the client is built
# (presumably this is where the OpenAI credentials come from -- confirm against
# the anonLLM setup instructions).
load_dotenv()
# Module-level LLM client shared by the generator functions in this file.
# It is created with anonymization disabled and temperature set to 1.
llm = OpenaiLanguageModel(anonymize=False, temperature=1)
def generate_dynamic_model(columns: List[str]) -> Type[BaseModel]:
    """
    Dynamically create a Pydantic model class with one field per column name.

    Parameters:
        columns (List[str]): Column names to be used as field names in the
            model. Duplicate names collapse into a single field.

    Returns:
        Type[BaseModel]: The generated model *class* (not an instance), named
            'DynamicModel', with every column declared as a required field
            of type Any.
    """
    # (Any, ...) declares a required field that accepts any value.
    fields = {column: (Any, ...) for column in columns}
    return create_model('DynamicModel', **fields)
def generate_dataset(columns: List[str], rows: int) -> pd.DataFrame:
    """
    Generate a synthetic dataset by asking the LLM for one row at a time.

    Each request shows the model the rows produced so far (capped at the first
    500) and asks for values that differ from them.

    Parameters:
        columns (List[str]): Column names of the dataset to generate.
        rows (int): Number of rows to generate. Zero or a negative number
            yields an empty DataFrame that still carries the columns.

    Returns:
        pd.DataFrame: One row per LLM response, restricted to ``columns``.
    """
    # Define the pydantic response model on the fly.
    response_format = generate_dynamic_model(columns)

    # Rows are collected in a plain list and turned into a DataFrame once at
    # the end; concatenating onto an (initially empty) DataFrame in every
    # iteration copies the whole frame each time and is deprecated by pandas.
    # NOTE(review): assumes llm.generate returns a dict keyed by column name
    # when output_format is given -- confirm against anonLLM.
    records = []
    for _ in range(rows):
        # Same view the prompt always used: at most the first 500 rows so far.
        seen_so_far = pd.DataFrame(records[:500], columns=columns)
        prompt = (
            "Generate some synthetic data respecting the format provided\n"
            f"Should be different than values in {seen_so_far}"
        )
        records.append(llm.generate(prompt, output_format=response_format))

    return pd.DataFrame(records, columns=columns)
def infer_column_type(column_values: pd.Series) -> Type[Any]:
    """
    Infer the Python type of a column from the values that are actually present.

    NaN/None entries and empty strings are ignored, because
    generate_parts_of_dataset treats both as "missing"; a numeric column that
    uses '' as a placeholder should still be inferred as numeric.

    Parameters:
        column_values (pd.Series): The column values.

    Returns:
        Type[Any]: ``int`` when every present value is numeric and integral
            (this includes integral floats such as 2.0), ``float`` when every
            present value is numeric but at least one is not integral, and
            ``str`` otherwise or when nothing is present.
    """
    present = column_values.dropna()
    # Compare on an object view so the check is a plain per-element `!= ""`
    # regardless of the column's dtype.
    present = present[present.astype(object) != ""]

    # Default to str if the column has no usable values.
    if present.empty:
        return str

    # Convert once; anything that is not parseable as a number becomes NaN.
    numeric = pd.to_numeric(present, errors='coerce')
    if not numeric.notna().all():
        return str

    # Prefer int when every value is a whole number.
    return int if (numeric % 1 == 0).all() else float
def generate_dynamic_model_for_column(column: str, column_type: Type[Any]) -> Type[BaseModel]:  # noqa
    """
    Dynamically create a Pydantic model class for a single column.

    Parameters:
        column (str): The column name; it becomes the model's only field.
        column_type (Type[Any]): The data type required for that field.

    Returns:
        Type[BaseModel]: The generated model *class* (not an instance), named
            ``DynamicModel_<column>``, with ``column`` as a required field.
    """
    # (column_type, ...) declares a required field of the given type.
    fields = {column: (column_type, ...)}
    return create_model(f"DynamicModel_{column}", **fields)
def _is_missing(value: Any) -> bool:
    """Return True when a cell value is NaN/None or an empty string."""
    return bool(pd.isna(value) or value == '')


def generate_parts_of_dataset(input_df: pd.DataFrame) -> pd.DataFrame:
    """
    Fill every missing cell of a DataFrame with an LLM-generated value.

    A cell is missing when it is NaN/None or an empty string. For each such
    cell the LLM receives the other known values of the same row plus up to
    three distinct known values from the same column, and is asked for a
    value of the column's inferred type.

    Parameters:
        input_df (pd.DataFrame): The partially filled dataset. It is modified
            in place.

    Returns:
        pd.DataFrame: The same ``input_df`` object with missing cells filled.
    """
    for row_index, row in input_df.iterrows():
        for col in input_df.columns:
            # Read the live cell so values written earlier in this run count.
            if not _is_missing(input_df.at[row_index, col]):
                continue

            column_type = infer_column_type(input_df[col])
            response_format = generate_dynamic_model_for_column(col, column_type)  # noqa

            # Context for generation: the row's other known values. Empty
            # strings are placeholders for missing data, so they are left out
            # just like NaN instead of being presented to the LLM as real values.
            # NOTE(review): `row` is the copy yielded by iterrows, so values
            # filled in earlier for this same row may not appear here.
            row_context = {
                key: val
                for key, val in row.dropna().to_dict().items()
                if key != col and not _is_missing(val)
            }

            # Up to 3 distinct known examples from the column, blanks excluded.
            known_values = input_df[col].dropna().unique().tolist()
            column_examples = [val for val in known_values if val != ''][:3]

            prompt = f"Given the context {row_context} and examples {column_examples} in column '{col}', generate a synthetic {col} value."  # noqa
            synthetic_data = llm.generate(
                prompt,
                output_format=response_format
            )
            # The response is indexed by the column name.
            input_df.at[row_index, col] = synthetic_data[col]
    return input_df
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment