glamp · August 29, 2015 14:04 · rdempsey · Feb 23, 2015
diff --git a/buildingtheclassifier.py b/buildingtheclassifier.py
 # building the classifier
 from sklearn.ensemble import RandomForestClassifier

 # Column names of the model inputs, collected in one list so that training and
 # prediction below select exactly the same columns. These are the fuzzy-match
 # scores produced earlier with fuzzywuzzy.
 features = [
     'name_ratio',
     'name_token_sort_ratio',
     'name_partial_ratio',
     'street_ratio',
     'street_token_sort_ratio',
     'street_partial_ratio',
 ]

 # Train a RandomForest on those scores. The target is the `match` column, a 0/1
 # flag saying whether a given pair of names/addresses is really the same record.
 # fit() hands back the fitted estimator, so construction and training are chained.
 clf = RandomForestClassifier().fit(df[features], df['match'])

 # Quick sanity check only: predictions are made on the very rows the model was
 # trained on, so this is not a cross-validated estimate. Rows of the table are
 # the predicted labels, columns are the actual `match` values.
 pd.crosstab(index=clf.predict(df[features]), columns=df['match'])
	# building the classifier
	from sklearn.ensemble import RandomForestClassifier

	# Column names of the model inputs, collected in one list so that training and
	# prediction below select exactly the same columns. These are the fuzzy-match
	# scores produced earlier with fuzzywuzzy.
	features = [
		'name_ratio',
		'name_token_sort_ratio',
		'name_partial_ratio',
		'street_ratio',
		'street_token_sort_ratio',
		'street_partial_ratio',
	]

	# Train a RandomForest on those scores. The target is the `match` column, a 0/1
	# flag saying whether a given pair of names/addresses is really the same record.
	# fit() hands back the fitted estimator, so construction and training are chained.
	clf = RandomForestClassifier().fit(df[features], df['match'])

	# Quick sanity check only: predictions are made on the very rows the model was
	# trained on, so this is not a cross-validated estimate. Rows of the table are
	# the predicted labels, columns are the actual `match` values.
	pd.crosstab(index=clf.predict(df[features]), columns=df['match'])