Last active
December 20, 2020 13:23
-
-
Save sjtalkar/11b3ca21c440b2508f7d5fa53f2f112a to your computer and use it in GitHub Desktop.
Cleaning the rental dataframe and identifying property types
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def createPropertyTypeCol(rental_df):
    """Add a ``property_type_class`` column holding a condensed property type.

    The mapping applied to ``property_type`` is, in order:

    1. Anything containing "Private room" or "Room in" becomes "Private Room".
    2. Anything containing "Shared room" becomes "Shared Room".
    3. Values containing "Entire " lose that text and are capitalized
       (e.g. "Entire guest suite" becomes "Guest suite").
    4. Anything containing "Tiny house", "Bungalow" or "Cabin" becomes
       "Cottage".

    Missing ``property_type`` values never match a rule and stay missing.

    Args:
        rental_df: Dataframe with a string column called ``property_type``.

    Returns:
        The same dataframe object, with the ``property_type_class`` column
        added (the input is modified in place). The ``property_type`` column
        itself is left untouched.
    """
    # Work on a copy so the original property_type column is not altered.
    property_type = rental_df["property_type"].copy()

    # na=False: a missing value must count as "no match"; otherwise the
    # boolean mask contains NaN and cannot be used for indexing.
    is_private = property_type.str.contains("Private room|Room in", na=False)
    property_type.loc[is_private] = "Private Room"

    is_shared = property_type.str.contains("Shared room", na=False)
    property_type.loc[is_shared] = "Shared Room"

    # Keep only the second half of all "Entire ..." property types to get the
    # actual type, such as house. "Entire " is a literal, not a pattern.
    is_entire = property_type.str.contains("Entire ", na=False)
    property_type.loc[is_entire] = (
        property_type.loc[is_entire]
        .str.replace("Entire ", "", regex=False)
        .str.capitalize()
    )

    # Bungalows, tiny houses and cabins are all small houses, so they are
    # grouped together under "Cottage".
    is_cottage = property_type.str.contains("Tiny house|Bungalow|Cabin", na=False)
    property_type.loc[is_cottage] = "Cottage"

    rental_df["property_type_class"] = property_type
    return rental_df
##############################################################################################################
# Clean up the dataframe
##############################################################################################################
def _hostFilledPercent(rental_df, rate_col):
    """Return a "97%"-style string column as floats, with gaps filled per host.

    Missing rates are replaced by the mean rate of the same ``host_id``; rates
    that are still missing afterwards (nothing known for that host) become 0.
    """
    percent = rental_df[rate_col].str.replace("%", "", regex=False).astype("float64")
    # Broadcast each host's mean back onto its rows, then use it only where the
    # listing has no rate of its own, so existing values are never overwritten.
    host_mean = percent.groupby(rental_df["host_id"]).transform("mean")
    return percent.fillna(host_mean).fillna(0)


def cleanRentalDF(filename):
    """Read a listings CSV and return a cleaned dataframe of selected columns.

    Cleaning steps:

    * ``price`` is converted from text such as "$1,250.00" to float.
    * ``host_response_rate`` and ``host_acceptance_rate`` are replaced by the
      float columns ``host_response_rate_percent`` and
      ``host_acceptance_rate_percent``; missing values are filled with the
      host's mean, or 0 when nothing is known for the host.
    * ``host_response_time`` is replaced by ``host_reponse_time_rank``
      (1 = "within an hour" ... 4 = "a few days or more", 0 = unknown).
    * ``instant_bookable`` and ``has_availability`` are mapped from
      "t"/"f" to 1/0.
    * ``first_review`` and ``last_review`` are parsed as datetimes.
    * ``property_type_class`` is added by ``createPropertyTypeCol``.

    Args:
        filename: Path (or anything else ``pd.read_csv`` accepts) of the
            listings CSV file.

    Returns:
        The cleaned dataframe.
    """
    # Read the data into a dataframe
    full_df = pd.read_csv(filename)

    # Select out the columns we are interested in
    rental_df = full_df[
        [
            "id",
            "price",
            "listing_url",
            "host_id",
            "host_response_rate",
            "host_response_time",
            "host_acceptance_rate",
            "review_scores_communication",
            "review_scores_location",
            "review_scores_value",
            "review_scores_checkin",
            "reviews_per_month",
            "review_scores_cleanliness",
            "license",
            "instant_bookable",
            "number_of_reviews",
            "first_review",
            "last_review",
            "neighbourhood_cleansed",
            "neighbourhood_group_cleansed",
            "latitude",
            "longitude",
            "accommodates",
            "bathrooms_text",
            "property_type",
            "has_availability",
            "availability_30",
            "availability_60",
            "availability_90",
            "availability_365",
        ]
    ].copy()

    # Make price a float column. regex=False is required: "$" is a regex
    # anchor, and the default of ``regex`` differs between pandas versions.
    rental_df["price"] = (
        rental_df["price"]
        .str.replace("$", "", regex=False)
        .str.replace(",", "", regex=False)
        .astype("float64")
    )

    # Change host response rate from string to float so that it is a
    # continuous value.
    rental_df["host_response_rate_percent"] = _hostFilledPercent(
        rental_df, "host_response_rate"
    )

    # Rank the response time categories; unknown response times get rank 0.
    # NOTE: the column name's spelling ("reponse") is kept on purpose, since
    # code using this dataframe may already depend on it.
    rank_response_time = {
        "within an hour": 1,
        "within a few hours": 2,
        "within a day": 3,
        "a few days or more": 4,
    }
    rental_df["host_reponse_time_rank"] = (
        rental_df["host_response_time"].map(rank_response_time).fillna(0)
    )

    # Use the same logic as host_response_rate for host_acceptance_rate
    rental_df["host_acceptance_rate_percent"] = _hostFilledPercent(
        rental_df, "host_acceptance_rate"
    )

    # The raw text columns are superseded by the numeric ones added above.
    rental_df = rental_df.drop(
        ["host_response_rate", "host_response_time", "host_acceptance_rate"],
        axis="columns",
    )

    # Convert the flags: "t" (true) = 1, "f" (false) = 0
    availability_code_dict = {
        "t": 1,
        "f": 0,
    }
    for flag_col in ("instant_bookable", "has_availability"):
        rental_df[flag_col] = rental_df[flag_col].map(availability_code_dict)

    for date_col in ("first_review", "last_review"):
        rental_df[date_col] = pd.to_datetime(rental_df[date_col])

    # Add a column for a smaller list of property types
    rental_df = createPropertyTypeCol(rental_df)
    return rental_df
################################################################################################################
# Call the cleanup function and setup a global dataframe
################################################################################################################
# A forward slash keeps this relative path portable (Windows accepts it too)
# and avoids the invalid "\l" escape sequence a backslash would introduce.
full_df = cleanRentalDF("data/listings_1.csv")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment