Last active
February 17, 2016 15:34
-
-
Save raghavrv/8ba99007ebb0c96d81cc to your computer and use it in GitHub Desktop.
Snippet to load and vectorize the adult dataset with missing values - https://archive.ics.uci.edu/ml/datasets/Adult
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load categories of categorical features from descr.
#
# ``descr`` holds one ``@attribute`` declaration per line.  Numeric attributes
# look like ``@attribute <name> real [lo, hi]``; categorical attributes look
# like ``@attribute <name> {val1, val2, ...}``.
descr = """@attribute Age real [17.0, 90.0]
@attribute Workclass {Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked}
@attribute Fnlwgt real [12285.0, 1490400.0]
@attribute Education {Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool}
@attribute Education-num real [1.0, 16.0]
@attribute Marital-status {Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse}
@attribute Occupation {Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces}
@attribute Relationship {Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried}
@attribute Race {White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black}
@attribute Sex {Female, Male}
@attribute Capital-gain real [0.0, 99999.0]
@attribute Capital-loss real [0.0, 4356.0]
@attribute Hours-per-week real [1.0, 99.0]
@attribute Native-country {United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands}"""

ATTRIBUTE_PREFIX = '@attribute '


def parse_attribute(line):
    """Split one ``@attribute`` declaration into ``(name, categories)``.

    Parameters
    ----------
    line : str
        A single declaration line, with or without the leading
        ``@attribute `` marker.

    Returns
    -------
    (str, tuple)
        The attribute name and a sorted tuple of its category labels.
        Attributes declared without a ``{...}`` value list get an empty
        tuple.
    """
    line = line.strip()
    # Remove the marker as a *prefix*.  ``str.strip('@attribute ')`` treats
    # its argument as a set of characters and would also eat the start of any
    # attribute name beginning with one of ``@ a t r i b u e`` or a space
    # (e.g. "area" or "type").
    if line.startswith(ATTRIBUTE_PREFIX):
        line = line[len(ATTRIBUTE_PREFIX):]

    if '{' not in line:
        # Numeric attribute: the name is everything before " real ".
        return line.split(' real ')[0], ()

    name, _, values = line.partition('{')
    # Categories are sorted so that a label's position in the tuple is a
    # deterministic integer code, independent of declaration order.
    categories = tuple(sorted(values.strip(' }').split(', ')))
    return name.strip(), categories


# Parallel lists, one entry per attribute and in declaration order:
# feat_names[i] is the attribute name and cat_feats[i] its sorted category
# tuple (empty for numeric attributes).
cat_feats = []
feat_names = []
for feat in descr.splitlines():
    f_i_name, f_i_vals = parse_attribute(feat)
    feat_names.append(f_i_name)
    cat_feats.append(f_i_vals)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load data from adult dataset.
#
# Reads the comma-separated file ``adult.dat`` and builds two parallel lists:
#   data   -- one list of floats per row (np.nan for a missing categorical)
#   target -- 1 when the last field of the row is '<=50K', otherwise 0
#
# Precondition: ``cat_feats`` and ``feat_names`` must already exist and be
# indexed by the *adjusted* column index computed below.  The original author
# notes that the entry at index 3 was deleted from both lists before this
# code runs -- that deletion is not part of this snippet; verify it happens.
import numpy as np

# Adjusted column indices (see the ``j -= 1`` shift below).
NUMERIC_COLS = frozenset((0, 2, 9, 10, 11))
CATEGORICAL_COLS = frozenset((1, 4, 5, 6, 7, 8, 12))

data = []
target = []
with open('adult.dat') as adult:
    # Iterate the file lazily instead of materialising it with readlines().
    for line in adult:
        line = line.strip('\r\n ')
        if not line:
            # A blank (e.g. trailing) line carries no record; without this
            # guard it would crash on float('').
            continue
        line = line.split(',')
        data_i = []
        for j, d in enumerate(line):
            if j > 3:
                # We have deleted the 3rd col from feat_names
                # and cat_feats, so shift later columns down by one.
                j -= 1
            # NOTE(review): raw column 3 keeps index 3 and raw column 4 is
            # shifted to index 3 as well; 3 is in neither set, so BOTH raw
            # columns are dropped.  If raw column 4 (Education-num in descr)
            # was meant to be kept as the numeric stand-in for column 3,
            # this mapping loses it -- confirm the intent before changing,
            # since fixing it alters the number of output features.
            if j in NUMERIC_COLS:
                data_i.append(float(d))
            elif j in CATEGORICAL_COLS:
                if d == "?":
                    # "?" marks a missing value.
                    data_i.append(np.nan)
                    continue
                try:
                    # Encode the label as its position in the sorted
                    # category tuple for this column.
                    data_i.append(float(cat_feats[j].index(d)))
                except (ValueError, IndexError):
                    # Report the offending value and column, then re-raise.
                    # This print form emits the same text on Python 2 and 3.
                    print("%s %s %s" % (d, j, feat_names[j]))
                    raise
            # Any other adjusted index (including the last field, which is
            # read below as the class label) contributes no feature.
        data.append(data_i)
        target.append(int(line[-1] == '<=50K'))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment