
@ZaxR
Created September 12, 2019 20:25
Identify the categorical and numeric columns in a pd.DataFrame
"""
Models often have both numeric and categorical features.
These feature types follow different EDA flows and need separate transformations (e.g. one-hot encoding for categorical features).
I want to see how much of this pipeline can reliably be automated,
starting with identifying which columns we want to use in our model and what type each one is.
Pandas offers `df.select_dtypes`, which lets you include or exclude columns by dtype.
You can then use the following to split numeric and categorical columns:
"""
# splits the df itself; append .columns to each result to get just the column names
df_numeric = df.select_dtypes(include=[np.number])
df_categorical = df.select_dtypes(exclude=[np.number])
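"""
For illustration, a minimal sketch of pulling just the column name lists out of that split
(plain pandas; nothing here is specific to this gist's approach):
"""
numeric_cols = df_numeric.columns.tolist()
categorical_cols = df_categorical.columns.tolist()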
"""
The problem with this approach is that it doesn't do a good job of flagging columns we don't want.
For example, we usually don't want to include date columns at all,
and boolean columns should be picked up as categorical, not numeric.
The pandas_profiling library is useful not only for detecting types,
but also for generating HTML reports that assess the values/distributions of a DataFrame.
See an example report here: https://pandas-profiling.github.io/pandas-profiling/examples/census/census_report.html
Its base type detection can be imported and used separately:
"""
from collections import defaultdict
import numpy as np
import pandas as pd
from pandas_profiling.model.base import get_var_type
df = pd.DataFrame({
    "a": [1, 2, 3, 4, 5, 6],
    "b": [1, 2, 3, 4, 5, 6],
    "c": ["test", "text", "for", "me", "to", np.nan],
    "d": [1, 1, 1, 0, 1, 0],
})
# Get the type pandas_profiling detects for each column
d = {col: get_var_type(df[col])['type'].value for col in df.columns}

# Invert the mapping: group column names by detected type
fd = defaultdict(list)
for k, v in d.items():
    fd[v].append(k)
cols_by_base_type = dict(fd)
# Group the pandas_profiling types to match typical modeling needs
cat_num_cols = defaultdict(list)
for k, v in cols_by_base_type.items():
    # Treat boolean and unique columns as categorical
    k = 'CAT' if k in ['BOOL', 'UNIQUE'] else k
    cat_num_cols[k].extend(v)
cat_num_cols = dict(cat_num_cols)
print(cat_num_cols)
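"""
From here, a minimal sketch of feeding these column groups into a preprocessing pipeline,
assuming scikit-learn is available and that numeric columns land under the 'NUM' key
(the 'NUM'/'CAT' labels and the imputation/encoding choices below are illustrative,
not part of the gist):
"""
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

preprocessor = ColumnTransformer(
    transformers=[
        # Scale the numeric columns
        ("num", StandardScaler(), cat_num_cols.get("NUM", [])),
        # Impute missing values, then one-hot encode the categorical columns
        ("cat",
         Pipeline([
             ("impute", SimpleImputer(strategy="most_frequent")),
             ("ohe", OneHotEncoder(handle_unknown="ignore")),
         ]),
         cat_num_cols.get("CAT", [])),
    ]
)
X = preprocessor.fit_transform(df)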