fsndzomga · February 28, 2024 19:25
diff --git a/synthetic-data-gen.py b/synthetic-data-gen.py
 import pandas as pd
 from dotenv import load_dotenv
 from pydantic import BaseModel, create_model
 from typing import List, Any, Type
 from anonLLM.llm import OpenaiLanguageModel

 # Load environment configuration before the client below is constructed
 # (presumably API credentials read from a .env file -- confirm).
 load_dotenv()

 # Module-level language-model client shared by generate_dataset and
 # generate_parts_of_dataset.
 llm = OpenaiLanguageModel(anonymize=False, temperature=1)


 def generate_dynamic_model(columns: List[str]) -> Type[BaseModel]:
    """
    Dynamically creates a Pydantic model class with one field per column name.

    Every field is declared as ``(Any, ...)``: the value type is left
    unconstrained and the Ellipsis default marks the field as required.

    Parameters:
    columns (List[str]): List of column names to be used as field names.

    Returns:
    Type[BaseModel]: The generated model class (a class, not an instance),
    named 'DynamicModel'.
    """
    # create_model takes each field as a (type, default) tuple.
    fields = {column: (Any, ...) for column in columns}
    return create_model('DynamicModel', **fields)


 def generate_dataset(columns: List[str], rows: int) -> pd.DataFrame:
    """
    Generates a synthetic dataset one row at a time with the module-level llm.

    Each iteration asks the model for one record matching a Pydantic model
    built from ``columns`` and shows it the rows generated so far (up to 500)
    so that it can avoid repeating them.

    Parameters:
    columns (List[str]): Column names of the dataset to generate.
    rows (int): Number of rows to generate; one llm.generate call per row.

    Returns:
    pd.DataFrame: The generated rows in generation order, re-indexed from 0.
    Contains no rows when ``rows`` is not positive.
    """
    # Define pydantic model on the fly
    response_format = generate_dynamic_model(columns)

    # Create an empty DataFrame with specified columns
    df = pd.DataFrame(columns=columns)

    for _ in range(rows):
        # Render the existing rows explicitly. Interpolating the DataFrame
        # directly goes through its repr, which elides the middle rows once
        # pandas' display.max_rows is exceeded, hiding most of the values
        # the model is asked to avoid.
        existing_rows = df.head(500).to_string()
        data = llm.generate(
            f"""Generate some synthetic data respecting the format provided
            Should be different than values in {existing_rows}""",
            output_format=response_format
        )
        df = pd.concat([df, pd.DataFrame([data])], ignore_index=True)
    return df


 def infer_column_type(column_values: pd.Series) -> Type[Any]:
    """
    Infers the Python type for a column based on its populated values.

    NaN/None entries and empty strings are both ignored. Empty strings are
    treated as missing by generate_parts_of_dataset, so a blank cell must not
    turn an otherwise numeric column into str.

    Parameters:
    column_values (pd.Series): The column values.

    Returns:
    Type[Any]: int when every populated value is numeric and whole, float
    when every populated value is numeric, str otherwise (including when the
    column has no populated values).
    """
    populated = column_values.dropna()
    populated = populated[populated != '']

    # Default to str if the column has no populated values at all
    if populated.empty:
        return str

    # errors='coerce' turns anything non-numeric into NaN, so a single NaN in
    # the converted values means the column is not purely numeric.
    numeric = pd.to_numeric(populated, errors='coerce')
    if not numeric.notnull().all():
        return str

    # Whole-valued columns (e.g. 1.0, 2.0 stored as float because the column
    # also holds NaN) are reported as int.
    if (numeric % 1 == 0).all():
        return int
    return float


 def generate_dynamic_model_for_column(
    column: str, column_type: Type[Any]
 ) -> Type[BaseModel]:
    """
    Dynamically creates a Pydantic model class with a single required field.

    Parameters:
    column (str): The column name, used as the field name and as the suffix
    of the generated model's name.
    column_type (Type[Any]): The data type of the column, used as the field
    type.

    Returns:
    Type[BaseModel]: The generated model class (a class, not an instance),
    named 'DynamicModel_<column>'.
    """
    # create_model takes the field as a (type, default) tuple; the Ellipsis
    # default marks it as required.
    fields = {column: (column_type, ...)}
    return create_model(f"DynamicModel_{column}", **fields)


 def generate_parts_of_dataset(input_df: pd.DataFrame) -> pd.DataFrame:
    """
    Fills every missing cell of input_df with a value generated by the llm.

    A cell counts as missing when it is NaN/None or an empty string. For each
    missing cell the model is given the populated values of the same row and
    up to three populated example values from the same column, and is asked
    for a value of the type inferred for that column.

    Parameters:
    input_df (pd.DataFrame): The partially filled dataset. It is modified in
    place.

    Returns:
    pd.DataFrame: The same input_df object, with the missing cells filled.
    """
    for row_index in input_df.index:
        for col in input_df.columns:
            cell = input_df.at[row_index, col]
            if not (pd.isna(cell) or cell == ''):
                continue

            column_type = infer_column_type(input_df[col])
            response_format = generate_dynamic_model_for_column(
                col, column_type
            )

            # Read the row from the live frame rather than from an iterrows()
            # snapshot, so values generated earlier for this same row are
            # part of the context. Empty strings are missing values, not
            # context, so they are left out as well.
            current_row = input_df.loc[row_index]
            row_context = {
                key: val
                for key, val in current_row.dropna().to_dict().items()
                if key != col and val != ''
            }

            # Up to 3 unique populated examples from the column
            populated = input_df[col].dropna()
            column_examples = populated[populated != ''].unique().tolist()[:3]

            prompt = f"Given the context {row_context} and examples {column_examples} in column '{col}', generate a synthetic {col} value." # noqa

            synthetic_data = llm.generate(
                prompt,
                output_format=response_format
            )

            # Directly access the value using the key
            input_df.at[row_index, col] = synthetic_data[col]

    return input_df
	import pandas as pd
	from dotenv import load_dotenv
	from pydantic import BaseModel, create_model
	from typing import List, Any, Type
	from anonLLM.llm import OpenaiLanguageModel

	# Load environment configuration before the client below is constructed
	# (presumably API credentials read from a .env file -- confirm).
	load_dotenv()

	# Module-level language-model client shared by generate_dataset and
	# generate_parts_of_dataset.
	llm = OpenaiLanguageModel(anonymize=False, temperature=1)


	def generate_dynamic_model(columns: List[str]) -> Type[BaseModel]:
	    """
	    Dynamically creates a Pydantic model class with one field per column name.

	    Every field is declared as ``(Any, ...)``: the value type is left
	    unconstrained and the Ellipsis default marks the field as required.

	    Parameters:
	    columns (List[str]): List of column names to be used as field names.

	    Returns:
	    Type[BaseModel]: The generated model class (a class, not an instance),
	    named 'DynamicModel'.
	    """
	    # create_model takes each field as a (type, default) tuple.
	    fields = {column: (Any, ...) for column in columns}
	    return create_model('DynamicModel', **fields)


	def generate_dataset(columns: List[str], rows: int) -> pd.DataFrame:
	    """
	    Generates a synthetic dataset one row at a time with the module-level llm.

	    Each iteration asks the model for one record matching a Pydantic model
	    built from ``columns`` and shows it the rows generated so far (up to 500)
	    so that it can avoid repeating them.

	    Parameters:
	    columns (List[str]): Column names of the dataset to generate.
	    rows (int): Number of rows to generate; one llm.generate call per row.

	    Returns:
	    pd.DataFrame: The generated rows in generation order, re-indexed from 0.
	    Contains no rows when ``rows`` is not positive.
	    """
	    # Define pydantic model on the fly
	    response_format = generate_dynamic_model(columns)

	    # Create an empty DataFrame with specified columns
	    df = pd.DataFrame(columns=columns)

	    for _ in range(rows):
	        # Render the existing rows explicitly. Interpolating the DataFrame
	        # directly goes through its repr, which elides the middle rows once
	        # pandas' display.max_rows is exceeded, hiding most of the values
	        # the model is asked to avoid.
	        existing_rows = df.head(500).to_string()
	        data = llm.generate(
	            f"""Generate some synthetic data respecting the format provided
	Should be different than values in {existing_rows}""",
	            output_format=response_format
	        )
	        df = pd.concat([df, pd.DataFrame([data])], ignore_index=True)
	    return df


	def infer_column_type(column_values: pd.Series) -> Type[Any]:
	    """
	    Infers the Python type for a column based on its populated values.

	    NaN/None entries and empty strings are both ignored. Empty strings are
	    treated as missing by generate_parts_of_dataset, so a blank cell must not
	    turn an otherwise numeric column into str.

	    Parameters:
	    column_values (pd.Series): The column values.

	    Returns:
	    Type[Any]: int when every populated value is numeric and whole, float
	    when every populated value is numeric, str otherwise (including when the
	    column has no populated values).
	    """
	    populated = column_values.dropna()
	    populated = populated[populated != '']

	    # Default to str if the column has no populated values at all
	    if populated.empty:
	        return str

	    # errors='coerce' turns anything non-numeric into NaN, so a single NaN in
	    # the converted values means the column is not purely numeric.
	    numeric = pd.to_numeric(populated, errors='coerce')
	    if not numeric.notnull().all():
	        return str

	    # Whole-valued columns (e.g. 1.0, 2.0 stored as float because the column
	    # also holds NaN) are reported as int.
	    if (numeric % 1 == 0).all():
	        return int
	    return float


	def generate_dynamic_model_for_column(
	    column: str, column_type: Type[Any]
	) -> Type[BaseModel]:
	    """
	    Dynamically creates a Pydantic model class with a single required field.

	    Parameters:
	    column (str): The column name, used as the field name and as the suffix
	    of the generated model's name.
	    column_type (Type[Any]): The data type of the column, used as the field
	    type.

	    Returns:
	    Type[BaseModel]: The generated model class (a class, not an instance),
	    named 'DynamicModel_<column>'.
	    """
	    # create_model takes the field as a (type, default) tuple; the Ellipsis
	    # default marks it as required.
	    fields = {column: (column_type, ...)}
	    return create_model(f"DynamicModel_{column}", **fields)


	def generate_parts_of_dataset(input_df: pd.DataFrame) -> pd.DataFrame:
	    """
	    Fills every missing cell of input_df with a value generated by the llm.

	    A cell counts as missing when it is NaN/None or an empty string. For each
	    missing cell the model is given the populated values of the same row and
	    up to three populated example values from the same column, and is asked
	    for a value of the type inferred for that column.

	    Parameters:
	    input_df (pd.DataFrame): The partially filled dataset. It is modified in
	    place.

	    Returns:
	    pd.DataFrame: The same input_df object, with the missing cells filled.
	    """
	    for row_index in input_df.index:
	        for col in input_df.columns:
	            cell = input_df.at[row_index, col]
	            if not (pd.isna(cell) or cell == ''):
	                continue

	            column_type = infer_column_type(input_df[col])
	            response_format = generate_dynamic_model_for_column(
	                col, column_type
	            )

	            # Read the row from the live frame rather than from an iterrows()
	            # snapshot, so values generated earlier for this same row are
	            # part of the context. Empty strings are missing values, not
	            # context, so they are left out as well.
	            current_row = input_df.loc[row_index]
	            row_context = {
	                key: val
	                for key, val in current_row.dropna().to_dict().items()
	                if key != col and val != ''
	            }

	            # Up to 3 unique populated examples from the column
	            populated = input_df[col].dropna()
	            column_examples = populated[populated != ''].unique().tolist()[:3]

	            prompt = f"Given the context {row_context} and examples {column_examples} in column '{col}', generate a synthetic {col} value." # noqa

	            synthetic_data = llm.generate(
	                prompt,
	                output_format=response_format
	            )

	            # Directly access the value using the key
	            input_df.at[row_index, col] = synthetic_data[col]

	    return input_df