-
-
Save tcvieira/fd17b62419faf365e157f0fc63c481bd to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Formatting data
# Expects `data` (a pandas DataFrame with "state", "time", "height" and
# "Person" columns) and `pd` (pandas) to already be in scope.

# Upper-case every state string so the spelling variants below match
# regardless of the original casing.
data['state'] = data['state'].str.upper()
# Collapse the known spellings of California into one canonical value.
data['state'] = data['state'].replace(
    to_replace=["CA", "C.A", "CALI"],
    value="CALIFORNIA")

# Dates and times are quite common in large datasets.
# Converting all strings to datetime objects is good standardisation practice.
# Here, the data["time"] strings will look like "2019-01-15", which is exactly
# what the "format" argument below describes.
# errors='coerce' turns unparseable entries into NaT instead of silently
# handing back the whole column unchanged (which is what the deprecated
# errors='ignore' did), so the column is always datetime-typed afterwards.
data["time"] = pd.to_datetime(data["time"], format='%Y-%m-%d', errors='coerce')

# Discretising continuous variables.
# Height is in inches and we make a few bins with labels.
# pd.cut intervals are right-closed by default, i.e. (0, 48], (48, 60], ...;
# a height of exactly 0 or anything above 100 falls outside every bin and
# becomes NaN.
data["height"] = pd.cut(
    data['height'],
    bins=[0, 48, 60, 66, 72, 78, 100],
    labels=["Super Short", "Short", "Average", "Above Average", "Tall", "Super Tall"])

# Replace string values with integer codes that our
# ML models will be able to handle.
# Values not present in the mapping are left untouched by replace().
mapping = {'Adult Woman': 1, 'Adult Man': 2, "Child": 3}
data["Person"] = data["Person"].replace(mapping)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment