Skip to content

Instantly share code, notes, and snippets.

@Voyz
Last active June 4, 2024 07:02
Show Gist options
  • Save Voyz/0bb6774e859255ff0ae05a73adb6d329 to your computer and use it in GitHub Desktop.
Save Voyz/0bb6774e859255ff0ae05a73adb6d329 to your computer and use it in GitHub Desktop.
infer_dtypes infers data types from a list of values. This file also includes an example that round-trips a pd.DataFrame.
import string
import warnings
from typing import List, Dict
import numpy as np
import pandas as pd
def infer_dtypes(values: List, sample_size: int = 300, stop_after: int = 300):
    """
    Infer the data type of a list by randomly sampling its values.

    Every sampled value is converted to a string before it is checked, so
    ``1`` and ``'1'`` are both counted as integers. Empty strings are skipped
    and never influence the result.

    Args:
        values (list): A list to infer data types from.
        sample_size (int, optional): The number of values to draw from the
            list (drawn with ``np.random.choice``, i.e. with replacement).
            ``len(values)`` draws are made if set to None. Defaults to 300.
        stop_after (int, optional): The maximum number of non-empty values
            needed for the test. Equal to sample_size if set to None.
            Defaults to 300.

    Returns:
        str: The inferred data type
        ('int', 'float', 'bool', 'str', 'mixed', 'empty').
    """
    # np.random.choice raises ValueError when asked to sample from an empty
    # input; with nothing to inspect the answer is simply 'empty'. len() is
    # used instead of truthiness so that numpy arrays are accepted too.
    if len(values) == 0:
        return 'empty'

    sample_size = sample_size if sample_size is not None else len(values)
    stop_after = stop_after if stop_after is not None else sample_size

    # Bit flags recording which kinds of value have been seen so far:
    # 1 = int, 2 = float, 4 = bool, 8 = str.
    found = 0
    non_empty_count = 0
    for v in np.random.choice(values, sample_size):
        v = str(v)
        if v == '':
            continue
        non_empty_count += 1
        if non_empty_count > stop_after:
            break
        try:
            int(v)
            found |= 1
        except ValueError:
            try:
                float(v)
                found |= 2
            except ValueError:
                if v.lower() in ('true', 'false'):
                    found |= 4
                else:
                    found |= 8
        # More than one flag set means the data is mixed; no further sample
        # can change that, so stop early.
        if found & (found - 1):
            return 'mixed'

    # At this point at most one flag is set.
    return {0: 'empty', 1: 'int', 2: 'float', 4: 'bool', 8: 'str'}[found]
def serialise_df(df: pd.DataFrame):
    """
    Serialise a DataFrame into a dictionary of column name -> array of values.

    The frame is passed through ``DataFrame.convert_dtypes`` first. Null
    values in columns whose converted dtype is not ``object`` are replaced
    with empty strings; nulls in ``object`` columns are left as they are.
    ``df`` itself is not modified.

    Args:
        df (pd.DataFrame): The frame to serialise.

    Returns:
        dict: Maps each column label of ``df`` to a numpy array holding that
        column's values.
    """
    # convert_dtypes returns a new frame, so convert once and reuse the result.
    converted_df = df.convert_dtypes()

    # After transposing, each row of `values` holds one original column.
    values = converted_df.T.values

    # Per-column flag (True for non-object columns), broadcast across every
    # cell of that column so only nulls in typed columns are blanked out.
    typed_columns = (converted_df.dtypes != 'object').values
    null_in_typed_column = (pd.isnull(values).T * typed_columns).T
    values = np.where(null_in_typed_column, '', values)

    return dict(zip(df.columns, values))
_mapping = {
'mixed': 'O',
'str': 'string',
'bool': 'bool',
'float': 'Float64',
'int': 'Int64',
'empty': 'O',
}
def deserialise_df(values: List | Dict, dtypes: Dict[str, str]):
df = pd.DataFrame(values)
for col, dtype in dtypes.items():
parsed_dtype = _mapping[dtype]
if dtype in ['int', 'float']:
df[col].replace('', np.nan, inplace=True)
if dtype == 'bool' and df[col].replace('', np.nan).isnull().values.any():
parsed_dtype = 'O'
df[col] = df[col].astype(parsed_dtype)
return df
if __name__ == '__main__':
    # Demo: build a frame with one column per supported kind of data, push it
    # through serialise_df -> infer_dtypes -> deserialise_df and print how the
    # restored frame compares with the source frame.
    warnings.simplefilter(action='ignore', category=FutureWarning)

    df = pd.DataFrame(np.arange(10), columns=['Int64'], dtype='Int64')
    df['Float64'] = df['Int64'] / 100
    df['int32'] = df['Int64'].astype(int)
    df['float64'] = df['int32'] / 100
    df['int_float'] = df['int32']
    df['string'] = [string.ascii_lowercase[n % 26] * 4 for n in df['int32'].values]
    df['empty'] = [''] * len(df)
    half = int(len(df) / 2)
    df['bool'] = [True] * half + [False] * half

    # The 'mixed' column takes its value from a different source column on
    # each row, cycling through the five kinds in turn.
    source_cycle = ('Int64', 'Float64', 'bool', 'string', 'empty')
    df['mixed'] = [
        df.iloc[pos][source_cycle[pos % 5]] for pos in range(df.shape[0])
    ]

    for _ in range(10):  # make it huge
        df = pd.concat([df, df.copy()])
    df.reset_index(drop=True, inplace=True)

    # corrupt the data with nans and empty strings
    corrupted_row = {
        'Int64': np.nan,
        'Float64': np.nan,
        'int32': np.nan,
        'float64': np.nan,
        'int_float': 1.00001,
        'string': '',
        'empty': '',
        'bool': '',
    }
    df.iloc[-3] = corrupted_row

    serialised = serialise_df(df)

    # Infer one type name per serialised column, reporting each as we go.
    dtypes = {}
    for col, column_values in serialised.items():
        dtype = infer_dtypes(column_values)
        dtypes[col] = dtype
        print(f'{col}: {dtype}')

    deserialised = deserialise_df(serialised, dtypes)
    print(deserialised.compare(df))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment