Last active
June 17, 2023 05:37
-
-
Save juanluisrto/ba842d2a0991244adc3f77d0278a9613 to your computer and use it in GitHub Desktop.
Function which generates dataframes with fake data in a controlled manner.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
from itertools import cycle | |
def generate_fake_dataframe(size, cols, col_names = None, intervals = None, seed = None): | |
categories_dict = {'animals': ['cow', 'rabbit', 'duck', 'shrimp', 'pig', 'goat', 'crab', 'deer', 'bee', 'sheep', 'fish', 'turkey', 'dove', 'chicken', 'horse'], | |
'names' : ['James', 'Mary', 'Robert', 'Patricia', 'John', 'Jennifer', 'Michael', 'Linda', 'William', 'Elizabeth', 'Ahmed', 'Barbara', 'Richard', 'Susan', 'Salomon', 'Juan Luis'], | |
'cities' : ['Stockholm', 'Denver', 'Moscow', 'Marseille', 'Palermo', 'Tokyo', 'Lisbon', 'Oslo', 'Nairobi', 'Río de Janeiro', 'Berlin', 'Bogotá', 'Manila', 'Madrid', 'Milwaukee'], | |
'colors' : ['red', 'orange', 'yellow', 'green', 'blue', 'indigo', 'purple', 'pink', 'silver', 'gold', 'beige', 'brown', 'grey', 'black', 'white'] | |
} | |
default_intervals = {"i" : (0,10), "f" : (0,100), "c" : ("names", 5), "d" : ("2020-01-01","2020-12-31")} | |
rng = np.random.default_rng(seed) | |
first_c = default_intervals["c"][0] | |
categories_names = cycle([first_c] + [c for c in categories_dict.keys() if c != first_c]) | |
default_intervals["c"] = (categories_names, default_intervals["c"][1]) | |
if isinstance(col_names,list): | |
assert len(col_names) == len(cols), f"The fake DataFrame should have {len(cols)} columns but col_names is a list with {len(col_names)} elements" | |
elif col_names is None: | |
suffix = {"c" : "cat", "i" : "int", "f" : "float", "d" : "date"} | |
col_names = [f"column_{str(i)}_{suffix.get(col)}" for i, col in enumerate(cols)] | |
if isinstance(intervals,list): | |
assert len(intervals) == len(cols), f"The fake DataFrame should have {len(cols)} columns but intervals is a list with {len(intervals)} elements" | |
else: | |
if isinstance(intervals,dict): | |
assert len(set(intervals.keys()) - set(default_intervals.keys())) == 0, f"The intervals parameter has invalid keys" | |
default_intervals.update(intervals) | |
intervals = [default_intervals[col] for col in cols] | |
df = pd.DataFrame() | |
for col, col_name, interval in zip(cols, col_names, intervals): | |
if interval is None: | |
interval = default_intervals[col] | |
assert (len(interval) == 2 and isinstance(interval, tuple)) or isinstance(interval, list), f"This interval {interval} is neither a tuple of two elements nor a list of strings." | |
if col in ("i","f","d"): | |
start, end = interval | |
if col == "i": | |
df[col_name] = rng.integers(start, end, size) | |
elif col == "f": | |
df[col_name] = rng.uniform(start, end, size) | |
elif col == "c": | |
if isinstance(interval, list): | |
categories = np.array(interval) | |
else: | |
cat_family, length = interval | |
if isinstance(cat_family, cycle): | |
cat_family = next(cat_family) | |
assert cat_family in categories_dict.keys(), f"There are no samples for category '{cat_family}'. Consider passing a list of samples or use one of the available categories: {categories_dict.keys()}" | |
categories = rng.choice(categories_dict[cat_family], length, replace = False, shuffle = True) | |
df[col_name] = rng.choice(categories, size, shuffle = True) | |
elif col == "d": | |
df[col_name] = rng.choice(pd.date_range(start, end), size) | |
return df |
Thanks @dqmota for developing this further!
I have been testing your code.
So k
columns are generating an ordered index in order to number each row? This is the expected behaviour right? It's what I am getting.
Hi @juanluisrto, you are right.
I had a requirement to send fake data with unique values to an API, so an ordered numbered index was a straightforward way I found to accomplish it.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi Juan, I made a small increment on your great work, please consider merging it to your code.
https://gist.github.com/dqmota/7ee172e67f506d2f0dc4f08561952312
Best regards
DM