Voyz · June 4, 2024 07:02
diff --git a/infer_dtypes.py b/infer_dtypes.py
 import string
 import warnings
 from typing import List, Dict

 import numpy as np
 import pandas as pd


 def infer_dtypes(values: List, sample_size: int = 300, stop_after: int = 300) -> str:
    """
    Infers the data type by randomly sampling from a list. Values are explicitly converted to string before checking.

    Empty strings are skipped. Every other sampled value is put into the first category that fits:
    'int' (``int()`` parses it), 'float' (``float()`` parses it), 'bool' (case-insensitive
    'true'/'false') or 'str'.

    Sampling is done without replacement, so no value is inspected twice and asking for the
    entire list really inspects every value.

    Args:
        values (list): A list to infer data types from.
        sample_size (int, optional): The number of values to sample from the list, capped at len(values). Entire list will be sampled if set to None. Defaults to 300.
        stop_after (int, optional): The maximum number of non-empty values needed for the test. Equal to sample_size if set to None. Defaults to 300.

    Returns:
        str: The inferred data type ('int', 'float', 'bool', 'str', 'mixed', 'empty'). 'mixed' means more
            than one category was seen; 'empty' means no non-empty value was inspected.
    """
    population = len(values)
    if population == 0:
        # np.random.choice raises ValueError on an empty population.
        return 'empty'

    sample_size = population if sample_size is None else min(sample_size, population)
    stop_after = sample_size if stop_after is None else stop_after

    found = 0  # bit set of seen categories: 1 = int, 2 = float, 4 = bool, 8 = str
    non_empty_count = 0

    for v in np.random.choice(values, sample_size, replace=False):
        v = str(v)
        if v == '':
            continue
        non_empty_count += 1
        if non_empty_count > stop_after:
            break
        try:
            int(v)
            found |= 1
        except ValueError:
            try:
                float(v)
                found |= 2
            except ValueError:
                if v.lower() in ('true', 'false'):
                    found |= 4
                else:
                    found |= 8

    # More than one bit set means values from different categories were seen.
    if bin(found).count('1') > 1:
        return 'mixed'

    return {0: 'empty', 1: 'int', 2: 'float', 4: 'bool', 8: 'str'}[found]


 def serialise_df(df: pd.DataFrame) -> Dict:
    """
    Turns a DataFrame into a dict that maps each column name to an array of that column's values.

    The frame is first passed through ``convert_dtypes()``. Missing values in columns whose converted
    dtype is not 'object' are replaced with an empty string; object columns keep their values as they are.
    The input frame is not modified.

    Args:
        df (pd.DataFrame): The frame to serialise.

    Returns:
        dict: Column name -> 1-D object array holding one entry per row.
    """
    # Convert once; convert_dtypes() returns a new frame and never changes df itself.
    converted = df.convert_dtypes()

    # Explicit object dtype keeps the cells as Python/pandas objects whatever the column dtypes are,
    # so np.where below cannot coerce numbers to strings.
    cells = converted.to_numpy(dtype=object)

    # One flag per column, broadcast over the rows of the (n_rows, n_columns) null mask.
    typed_columns = (converted.dtypes != 'object').to_numpy(dtype=bool)
    blank = pd.isnull(cells) & typed_columns

    cells = np.where(blank, '', cells)
    return dict(zip(df.columns, cells.T))


 _mapping = {
    'mixed': 'O',
    'str': 'string',
    'bool': 'bool',
    'float': 'Float64',
    'int': 'Int64',
    'empty': 'O',
 }


 def deserialise_df(values: List | Dict, dtypes: Dict[str, str]):
    df = pd.DataFrame(values)
    for col, dtype in dtypes.items():
        parsed_dtype = _mapping[dtype]
        if dtype in ['int', 'float']:
            df[col].replace('', np.nan, inplace=True)
        if dtype == 'bool' and df[col].replace('', np.nan).isnull().values.any():
            parsed_dtype = 'O'

        df[col] = df[col].astype(parsed_dtype)
    return df


 if __name__ == '__main__':
    # Demo: build a frame with one column per category, serialise it, infer the
    # dtypes back from the raw values and print how the round trip differs.
    warnings.simplefilter(action='ignore', category=FutureWarning)

    df = pd.DataFrame(np.arange(10), columns=['Int64'], dtype='Int64')
    df['Float64'] = df['Int64'] / 100
    df['int32'] = df['Int64'].astype(int)
    df['float64'] = df['int32'] / 100
    df['int_float'] = df['int32']
    df['string'] = [string.ascii_lowercase[i % 26] * 4 for i in df['int32'].values]
    df['empty'] = [''] * len(df)
    half = int(len(df) / 2)
    df['bool'] = [True] * half + [False] * half

    # Row i takes its 'mixed' value from the column at position i % 5.
    mixed_sources = ('Int64', 'Float64', 'bool', 'string', 'empty')
    df['mixed'] = [
        df.iloc[position][mixed_sources[position % 5]]
        for position in range(df.shape[0])
    ]

    # make it huge: ten doublings of the ten starting rows
    for _ in range(10):
        df = pd.concat([df, df.copy()])

    df = df.reset_index(drop=True)

    # corrupt the data with nans and empty strings
    corrupted_row = {'Int64': np.nan, 'Float64': np.nan, 'int32': np.nan, 'float64': np.nan, 'int_float': 1.00001, 'string': '', 'empty': '', 'bool': ''}
    df.iloc[-3] = corrupted_row

    serialised = serialise_df(df)
    dtypes = {}
    for name, column_values in serialised.items():
        dtypes[name] = infer_dtypes(column_values)
        print(f'{name}: {dtypes[name]}')

    deserialised = deserialise_df(serialised, dtypes)
    print(deserialised.compare(df))
	import string
	import warnings
	from typing import List, Dict

	import numpy as np
	import pandas as pd


	def infer_dtypes(values: List, sample_size: int = 300, stop_after: int = 300) -> str:
	    """
	    Infers the data type by randomly sampling from a list. Values are explicitly converted to string before checking.

	    Empty strings are skipped. Every other sampled value is put into the first category that fits:
	    'int' (``int()`` parses it), 'float' (``float()`` parses it), 'bool' (case-insensitive
	    'true'/'false') or 'str'.

	    Sampling is done without replacement, so no value is inspected twice and asking for the
	    entire list really inspects every value.

	    Args:
	        values (list): A list to infer data types from.
	        sample_size (int, optional): The number of values to sample from the list, capped at len(values). Entire list will be sampled if set to None. Defaults to 300.
	        stop_after (int, optional): The maximum number of non-empty values needed for the test. Equal to sample_size if set to None. Defaults to 300.

	    Returns:
	        str: The inferred data type ('int', 'float', 'bool', 'str', 'mixed', 'empty'). 'mixed' means more
	            than one category was seen; 'empty' means no non-empty value was inspected.
	    """
	    population = len(values)
	    if population == 0:
	        # np.random.choice raises ValueError on an empty population.
	        return 'empty'

	    sample_size = population if sample_size is None else min(sample_size, population)
	    stop_after = sample_size if stop_after is None else stop_after

	    found = 0  # bit set of seen categories: 1 = int, 2 = float, 4 = bool, 8 = str
	    non_empty_count = 0

	    for v in np.random.choice(values, sample_size, replace=False):
	        v = str(v)
	        if v == '':
	            continue
	        non_empty_count += 1
	        if non_empty_count > stop_after:
	            break
	        try:
	            int(v)
	            found |= 1
	        except ValueError:
	            try:
	                float(v)
	                found |= 2
	            except ValueError:
	                if v.lower() in ('true', 'false'):
	                    found |= 4
	                else:
	                    found |= 8

	    # More than one bit set means values from different categories were seen.
	    if bin(found).count('1') > 1:
	        return 'mixed'

	    return {0: 'empty', 1: 'int', 2: 'float', 4: 'bool', 8: 'str'}[found]


	def serialise_df(df: pd.DataFrame) -> Dict:
	    """
	    Turns a DataFrame into a dict that maps each column name to an array of that column's values.

	    The frame is first passed through ``convert_dtypes()``. Missing values in columns whose converted
	    dtype is not 'object' are replaced with an empty string; object columns keep their values as they are.
	    The input frame is not modified.

	    Args:
	        df (pd.DataFrame): The frame to serialise.

	    Returns:
	        dict: Column name -> 1-D object array holding one entry per row.
	    """
	    # Convert once; convert_dtypes() returns a new frame and never changes df itself.
	    converted = df.convert_dtypes()

	    # Explicit object dtype keeps the cells as Python/pandas objects whatever the column dtypes are,
	    # so np.where below cannot coerce numbers to strings.
	    cells = converted.to_numpy(dtype=object)

	    # One flag per column, broadcast over the rows of the (n_rows, n_columns) null mask.
	    typed_columns = (converted.dtypes != 'object').to_numpy(dtype=bool)
	    blank = pd.isnull(cells) & typed_columns

	    cells = np.where(blank, '', cells)
	    return dict(zip(df.columns, cells.T))


	_mapping = {
	'mixed': 'O',
	'str': 'string',
	'bool': 'bool',
	'float': 'Float64',
	'int': 'Int64',
	'empty': 'O',
	}


	def deserialise_df(values: List \| Dict, dtypes: Dict[str, str]):
	df = pd.DataFrame(values)
	for col, dtype in dtypes.items():
	parsed_dtype = _mapping[dtype]
	if dtype in ['int', 'float']:
	df[col].replace('', np.nan, inplace=True)
	if dtype == 'bool' and df[col].replace('', np.nan).isnull().values.any():
	parsed_dtype = 'O'

	df[col] = df[col].astype(parsed_dtype)
	return df


	if __name__ == '__main__':
	    # Demo: build a frame with one column per category, serialise it, infer the
	    # dtypes back from the raw values and print how the round trip differs.
	    warnings.simplefilter(action='ignore', category=FutureWarning)

	    df = pd.DataFrame(np.arange(10), columns=['Int64'], dtype='Int64')
	    df['Float64'] = df['Int64'] / 100
	    df['int32'] = df['Int64'].astype(int)
	    df['float64'] = df['int32'] / 100
	    df['int_float'] = df['int32']
	    df['string'] = [string.ascii_lowercase[i % 26] * 4 for i in df['int32'].values]
	    df['empty'] = [''] * len(df)
	    half = int(len(df) / 2)
	    df['bool'] = [True] * half + [False] * half

	    # Row i takes its 'mixed' value from the column at position i % 5.
	    mixed_sources = ('Int64', 'Float64', 'bool', 'string', 'empty')
	    df['mixed'] = [
	        df.iloc[position][mixed_sources[position % 5]]
	        for position in range(df.shape[0])
	    ]

	    # make it huge: ten doublings of the ten starting rows
	    for _ in range(10):
	        df = pd.concat([df, df.copy()])

	    df = df.reset_index(drop=True)

	    # corrupt the data with nans and empty strings
	    corrupted_row = {'Int64': np.nan, 'Float64': np.nan, 'int32': np.nan, 'float64': np.nan, 'int_float': 1.00001, 'string': '', 'empty': '', 'bool': ''}
	    df.iloc[-3] = corrupted_row

	    serialised = serialise_df(df)
	    dtypes = {}
	    for name, column_values in serialised.items():
	        dtypes[name] = infer_dtypes(column_values)
	        print(f'{name}: {dtypes[name]}')

	    deserialised = deserialise_df(serialised, dtypes)
	    print(deserialised.compare(df))