Created
November 18, 2014 22:58
-
-
Save spitz-dan-l/f494674b855eeea9f998 to your computer and use it in GitHub Desktop.
One-hot encoding demo
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas
from sklearn.feature_extraction import DictVectorizer
def demo1(input_csv, new_csv):
    """Demonstration of one-hot encoding through Scikit-Learn's DictVectorizer object.

    Reads ``input_csv`` into a DataFrame, fits a ``DictVectorizer`` on its
    rows and prints the encoded array. Then reads ``new_csv`` and prints the
    encoding produced for it by the same, already-fitted vectorizer.

    Args:
        input_csv: CSV source (anything ``pandas.read_csv`` accepts) holding
            the data the encoder is fitted on.
        new_csv: CSV source holding the later-arriving data; it is only
            transformed, never used for fitting.

    Returns:
        None. Both encoded arrays are written to stdout with ``print``.
    """
    df = pandas.read_csv(input_csv)  # read csv into a DataFrame object

    # create the one hot encoder
    one_hot_encoder = DictVectorizer(sparse=False)

    # one hot encoder builds its internal mapping from string -> column index
    # using the data in df
    one_hot_encoder.fit(df_to_dicts(df))  # takes a long time on large datasets

    # note- the output of transform() is a numpy array, NOT a DataFrame
    # df_to_dicts() is called again because the generator passed to fit()
    # has already been consumed.
    one_hot_encoded_array = one_hot_encoder.transform(df_to_dicts(df))
    print(one_hot_encoded_array)

    # ...later, we have new data never seen by our model before...
    df2 = pandas.read_csv(new_csv)

    # we are reusing the same one_hot_encoder object as before, so all input
    # values are mapped to the same columns that were learned during fit().
    # never-before-seen categorical values do not raise: per the scikit-learn
    # documentation, features not encountered during fit are silently ignored.
    new_one_hot_encoded_array = one_hot_encoder.transform(df_to_dicts(df2))
    print(new_one_hot_encoded_array)
def df_to_dicts(df):
    """Helper function for feeding the data from a DataFrame into a DictVectorizer.

    Lazily walks the rows of ``df`` and yields each one as a plain dict,
    obtained by calling ``to_dict()`` on the row object that
    ``df.iterrows()`` produces.

    Args:
        df: The DataFrame whose rows should be converted.

    Yields:
        One dict per row, in row order. Nothing is yielded for an empty frame.
    """
    # iterrows() yields (index_label, row) pairs; the label is not needed here.
    for _label, row in df.iterrows():
        yield row.to_dict()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment