Created
February 3, 2015 19:47
-
-
Save yankov/a86d44bf0e6009944c21 to your computer and use it in GitHub Desktop.
Partial OneHotEncoder
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from scipy import sparse | |
class OneHotEncoder():
    """
    OneHotEncoder takes data matrix with categorical columns and
    converts it to a sparse binary matrix doing one-of-k encoding.

    Each input column gets its own block of output columns, one per
    distinct category seen while fitting. Indices are handed out in
    first-seen order and never change once assigned, so matrices produced
    between successive ``partial_fit`` calls stay column-compatible (newly
    learned categories only add columns at the end of their block).

    Parts of code borrowed from Paul Duan (www.paulduan.com)
    Licence: MIT (https://github.com/pyduan/amazonaccess/blob/master/MIT-LICENSE)
    """

    def __init__(self):
        # One dict per input column, mapping category value -> index inside
        # that column's one-hot block. None until fit/partial_fit is called.
        self.keymap = None

    @staticmethod
    def _extend(mapping, values):
        """Assign the next free index to every value missing from *mapping*.

        The mapping is updated in place and also returned. Existing entries
        are left untouched so previously assigned indices remain valid.
        """
        for value in values:
            if value not in mapping:
                mapping[value] = len(mapping)
        return mapping

    def fit(self, x):
        """Learn the categories of every column of *x* from scratch.

        *x* must expose ``.T`` yielding its columns when iterated (e.g. a
        2-D numpy array). Any previously learned mapping is discarded.
        """
        self.keymap = [self._extend({}, col) for col in x.T]

    def partial_fit(self, x):
        """
        This method can be used for doing one hot encoding in mini-batch mode.

        The first call behaves like ``fit``. Later calls only add categories
        that were not seen before; indices of known categories are kept.
        """
        if self.keymap is None:
            self.fit(x)
        else:
            for i, col in enumerate(x.T):
                self._extend(self.keymap[i], col)

    def transform(self, x):
        """Return the one-hot encoding of *x* as a CSR sparse matrix.

        If the encoder has not been fitted yet it is fitted on *x* first.
        Values absent from the learned mapping produce an all-zero block
        for that row rather than an error.
        """
        if self.keymap is None:
            self.fit(x)
        n_rows = x.shape[0]
        blocks = []
        for i, col in enumerate(x.T):
            mapping = self.keymap[i]
            block = sparse.lil_matrix((n_rows, len(mapping)))
            for row, value in enumerate(col):
                offset = mapping.get(value)
                if offset is not None:
                    block[row, offset] = 1
            blocks.append(block)
        if not blocks:
            # sparse.hstack rejects an empty list; a zero-column input maps
            # to a zero-column output.
            return sparse.csr_matrix((n_rows, 0))
        return sparse.hstack(blocks).tocsr()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment