Skip to content

Instantly share code, notes, and snippets.

@raghavrv
Created February 22, 2016 14:38
Show Gist options
  • Save raghavrv/4f6c8ee25d62d3867774 to your computer and use it in GitHub Desktop.
Save raghavrv/4f6c8ee25d62d3867774 to your computer and use it in GitHub Desktop.
Loading the Census Income Dataset in Python
# The census income dataset with 0.2 million datapoints
# http://sci2s.ugr.es/keel/dataset.php?cod=195
#
# ``descr`` holds the attribute header of the data set: one "@attribute" line
# per feature.  A feature is either numeric ("integer[lo,hi]" / "real[lo,hi]")
# or categorical ("{value1,value2,...}").  The categories of the categorical
# features are parsed from this text below.
descr = """@attribute Atr-0 integer[0,90]
@attribute Atr-1 {Self-employed-not_incorporated,Not_in_universe,Private,Local_government,Federal_government,Self-employed-incorporated,State_government,Never_worked,Without_pay}
@attribute Atr-2 integer[0,51]
@attribute Atr-3 integer[0,46]
@attribute Atr-4 {Some_college_but_no_degree,10th_grade,Children,Bachelors_degree(BA_AB_BS),High_school_graduate,Masters_degree(MA_MS_MEng_MEd_MSW_MBA),Less_than_1st_grade,Associates_degree-academic_program,7th_and_8th_grade,12th_grade_no_diploma,Associates_degree-occup_/vocational,Prof_school_degree_(MD_DDS_DVM_LLB_JD),5th_or_6th_grade,11th_grade,Doctorate_degree(PhD_EdD),9th_grade,1st_2nd_3rd_or_4th_grade}
@attribute Atr-5 integer[0,9999]
@attribute Atr-6 {Not_in_universe,High_school,College_or_university}
@attribute Atr-7 {Divorced,Never_married,Married-civilian_spouse_present,Widowed,Separated,Married-spouse_absent,Married-A_F_spouse_present}
@attribute Atr-8 {Construction,Not_in_universe_or_children,Entertainment,Finance_insurance_and_real_estate,Education,Business_and_repair_services,Manufacturing-nondurable_goods,Personal_services_except_private_HH,Manufacturing-durable_goods,Other_professional_services,Mining,Transportation,Wholesale_trade,Public_administration,Retail_trade,Social_services,Private_household_services,Utilities_and_sanitary_services,Communications,Hospital_services,Medical_except_hospital,Agriculture,Forestry_and_fisheries,Armed_Forces}
@attribute Atr-9 {Precision_production_craft_&_repair,Not_in_universe,Professional_specialty,Executive_admin_and_managerial,Handlers_equip_cleaners_etc,Adm_support_including_clerical,Machine_operators_assmblrs_&_inspctrs,Other_service,Sales,Private_household_services,Technicians_and_related_support,Transportation_and_material_moving,Farming_forestry_and_fishing,Protective_services,Armed_Forces}
@attribute Atr-10 {White,Asian_or_Pacific_Islander,Amer_Indian_Aleut_or_Eskimo,Black,Other}
@attribute Atr-11 {All_other,Do_not_know,Central_or_South_American,Mexican_(Mexicano),Mexican-American,Other_Spanish,Puerto_Rican,Cuban,Chicano,NA}
@attribute Atr-12 {Male,Female}
@attribute Atr-13 {Not_in_universe,No,Yes}
@attribute Atr-14 {Not_in_universe,Job_loser_-_on_layoff,Other_job_loser,New_entrant,Re-entrant,Job_leaver}
@attribute Atr-15 {Children_or_Armed_Forces,Not_in_labor_force,Full-time_schedules,Unemployed_full-time,Unemployed_part-_time,PT_for_non-econ_reasons_usually_FT,PT_for_econ_reasons_usually_PT,PT_for_econ_reasons_usually_FT}
@attribute Atr-16 integer[0,99999]
@attribute Atr-17 integer[0,4608]
@attribute Atr-18 integer[0,99999]
@attribute Atr-19 {Head_of_household,Nonfiler,Joint_both_under_65,Single,Joint_both_65+,Joint_one_under_65_&_one_65+}
@attribute Atr-20 {South,Not_in_universe,Northeast,Midwest,West,Abroad}
@attribute Atr-21 {Arkansas,Not_in_universe,Utah,Michigan,Minnesota,Alaska,Kansas,Indiana,Massachusetts,New_Mexico,Nevada,Tennessee,Colorado,Abroad,Kentucky,California,Arizona,North_Carolina,Connecticut,Florida,Vermont,Maryland,Oklahoma,Oregon,Ohio,South_Carolina,Texas,Montana,Wyoming,Georgia,Pennsylvania,Iowa,New_Hampshire,Missouri,Alabama,North_Dakota,New_Jersey,Louisiana,West_Virginia,Delaware,Illinois,Maine,Wisconsin,New_York,Idaho,District_of_Columbia,South_Dakota,Nebraska,Virginia,Mississippi}
@attribute Atr-22 {Householder,Child_18+_never_marr_Not_in_a_subfamily,Child_<18_never_marr_not_in_subfamily,Spouse_of_householder,Secondary_individual,Other_Rel_18+_never_marr_not_in_subfamily,Nonfamily_householder,Grandchild_<18_never_marr_not_in_subfamily,Grandchild_<18_never_marr_child_of_subfamily_RP,Child_18+_ever_marr_Not_in_a_subfamily,Child_18+_never_marr_RP_of_subfamily,Child_18+_spouse_of_subfamily_RP,Other_Rel_<18_never_marr_child_of_subfamily_RP,Child_under_18_of_RP_of_unrel_subfamily,Grandchild_18+_never_marr_not_in_subfamily,Child_18+_ever_marr_RP_of_subfamily,Other_Rel_18+_ever_marr_RP_of_subfamily,Other_Rel_18+_ever_marr_not_in_subfamily,RP_of_unrelated_subfamily,Other_Rel_18+_spouse_of_subfamily_RP,Other_Rel_<18_never_marr_not_in_subfamily,Other_Rel_<18_spouse_of_subfamily_RP,In_group_quarters,Grandchild_18+_spouse_of_subfamily_RP,Other_Rel_18+_never_marr_RP_of_subfamily,Child_<18_never_marr_RP_of_subfamily,Child_<18_ever_marr_not_in_subfamily,Other_Rel_<18_ever_marr_RP_of_subfamily,Grandchild_18+_ever_marr_not_in_subfamily,Child_<18_spouse_of_subfamily_RP,Spouse_of_RP_of_unrelated_subfamily,Other_Rel_<18_never_married_RP_of_subfamily,Grandchild_18+_never_marr_RP_of_subfamily,Grandchild_18+_ever_marr_RP_of_subfamily,Child_<18_ever_marr_RP_of_subfamily,Other_Rel_<18_ever_marr_not_in_subfamily,Grandchild_<18_never_marr_RP_of_subfamily,Grandchild_<18_ever_marr_not_in_subfamily}
@attribute Atr-23 {Householder,Child_18_or_older,Child_under_18_never_married,Spouse_of_householder,Nonrelative_of_householder,Other_relative_of_householder,Group_Quarters-_Secondary_individual,Child_under_18_ever_married}
@attribute Atr-24 real[37.87,18656.3]
@attribute Atr-25 {MSA_to_MSA,Nonmover,NonMSA_to_nonMSA,Not_in_universe,Not_identifiable,Abroad_to_MSA,MSA_to_nonMSA,Abroad_to_nonMSA,NonMSA_to_MSA}
@attribute Atr-26 {Same_county,Nonmover,Different_region,Different_county_same_state,Not_in_universe,Different_division_same_region,Abroad,Different_state_same_division}
@attribute Atr-27 {Same_county,Nonmover,Different_state_in_South,Different_county_same_state,Not_in_universe,Different_state_in_Northeast,Abroad,Different_state_in_Midwest,Different_state_in_West}
@attribute Atr-28 {No,Not_in_universe_under_1_year_old,Yes}
@attribute Atr-29 {Yes,Not_in_universe,No}
@attribute Atr-30 integer[0,6]
@attribute Atr-31 {Not_in_universe,Both_parents_present,Mother_only_present,Neither_parent_present,Father_only_present}
@attribute Atr-32 {United-States,Vietnam,Philippines,Columbia,Germany,Mexico,Japan,Peru,Dominican-Republic,South_Korea,Cuba,El-Salvador,Canada,Scotland,Outlying-U_S_(Guam_USVI_etc),Italy,Guatemala,Ecuador,Puerto-Rico,Cambodia,China,Poland,Nicaragua,Taiwan,England,Ireland,Hungary,Yugoslavia,Trinadad&Tobago,Jamaica,Honduras,Portugal,Iran,France,India,Hong_Kong,Haiti,Greece,Holand-Netherlands,Thailand,Laos,Panama}
@attribute Atr-33 {United-States,Vietnam,Columbia,Mexico,El-Salvador,Peru,Puerto-Rico,Cuba,Philippines,Dominican-Republic,Germany,England,Guatemala,Scotland,Portugal,Italy,Ecuador,Yugoslavia,China,Poland,Hungary,Nicaragua,Taiwan,Ireland,Canada,South_Korea,Trinadad&Tobago,Jamaica,Honduras,Iran,France,Cambodia,India,Hong_Kong,Haiti,Japan,Greece,Holand-Netherlands,Thailand,Panama,Laos,Outlying-U_S_(Guam_USVI_etc)}
@attribute Atr-34 {United-States,Vietnam,Columbia,Mexico,Peru,Cuba,Philippines,Dominican-Republic,El-Salvador,Canada,Scotland,Portugal,Guatemala,Ecuador,Germany,Outlying-U_S_(Guam_USVI_etc),Puerto-Rico,Italy,China,Poland,Nicaragua,Taiwan,England,Ireland,South_Korea,Trinadad&Tobago,Jamaica,Honduras,Iran,Hungary,France,Cambodia,India,Hong_Kong,Japan,Haiti,Holand-Netherlands,Greece,Thailand,Panama,Yugoslavia,Laos}
@attribute Atr-35 {Native-_Born_in_the_United_States,Foreign_born-_Not_a_citizen_of_U_S,Foreign_born-_U_S_citizen_by_naturalization,Native-_Born_abroad_of_American_Parent(s),Native-_Born_in_Puerto_Rico_or_U_S_Outlying}
@attribute Atr-36 integer[0,2]
@attribute Atr-37 {Not_in_universe,No,Yes}
@attribute Atr-38 integer[0,2]
@attribute Atr-39 integer[0,52]
@attribute Atr-40 integer[94,95]"""


def _parse_attribute(line):
    """Split one ``@attribute`` header line into its parts.

    Returns a ``(name, categories, is_categorical)`` tuple.  For a
    categorical attribute ``categories`` is the sorted tuple of its values and
    ``is_categorical`` is 1; for a numeric attribute they are ``()`` and 0.
    """
    prefix = '@attribute '
    line = line.strip()
    # Remove the keyword as a prefix.  str.strip('@attribute ') would instead
    # strip any of those *characters* from both ends, eating the start of a
    # name such as "age" or "rate".
    if line.startswith(prefix):
        line = line[len(prefix):]
    if '{' not in line:
        # Numeric attribute: "<name> integer[lo,hi]" or "<name> real[lo,hi]".
        return line.split(' ')[0], (), 0
    name, _, values = line.partition('{')
    values = values.rstrip()
    if values.endswith('}'):
        values = values[:-1]
    categories = tuple(sorted(v.strip() for v in values.split(',')))
    return name.strip(), categories, 1


# Parallel lists with one entry per attribute, in header order:
#   feat_names[i] -- attribute name, e.g. 'Atr-12'
#   cat_feats[i]  -- sorted tuple of category names, () for numeric attributes
#   is_cat[i]     -- 1 if the attribute is categorical, else 0
cat_feats = []
feat_names = []
is_cat = []
for _attr_line in descr.splitlines():
    if not _attr_line.strip():
        continue  # tolerate blank lines in the header text
    _name, _categories, _flag = _parse_attribute(_attr_line)
    feat_names.append(_name)
    cat_feats.append(_categories)
    is_cat.append(_flag)

# The two class labels found in the last column of each data row.
output_labels = ['-_50000.', '50000+.']
# Load data from census income dataset.
#
# Every row of 'census.dat' is a comma-separated record whose last field is
# the class label.  Header lines (anything starting with '@', such as the
# attribute descriptions) and blank lines are skipped, so the file no longer
# has to be edited by hand before running this script.
# NOTE(review): assumes all header lines of the data file start with '@' --
# confirm against the downloaded file.
import numpy as np  # imported here: the script has no import section

# One {category name: code} table per column.  The code is the position of
# the name in the column's sorted category tuple, stored as a float; numeric
# columns get an empty table.
_cat_codes = [{name: float(code) for code, name in enumerate(names)}
              for names in cat_feats]

data = []
target = []
with open('census.dat') as census:
    for line in census:
        line = line.strip()
        if not line or line.startswith('@'):
            continue
        fields = [field.strip() for field in line.split(',')]
        row = []
        for j, d in enumerate(fields[:-1]):
            if not is_cat[j]:
                # A "?" in a numeric column raises ValueError from float().
                row.append(float(d))
            elif d == "?":
                # Missing categorical value.
                row.append(np.nan)
            else:
                try:
                    row.append(_cat_codes[j][d])
                except KeyError:
                    raise ValueError(
                        'unknown category %r in column %d (%s)'
                        % (d, j, feat_names[j]))
        data.append(row)
        # 1 when the label equals output_labels[0], otherwise 0.
        target.append(int(fields[-1] == output_labels[0]))
data = np.array(data)
target = np.array(target)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment