Skip to content

Instantly share code, notes, and snippets.

@sjtalkar
Last active December 20, 2020 13:23
Show Gist options
  • Save sjtalkar/11b3ca21c440b2508f7d5fa53f2f112a to your computer and use it in GitHub Desktop.
Save sjtalkar/11b3ca21c440b2508f7d5fa53f2f112a to your computer and use it in GitHub Desktop.
Cleaning the rental dataframe and identifying property types
def createPropertyTypeCol(rental_df):
    """Add a ``property_type_class`` column with a condensed list of property types.

    The new column is derived from ``property_type`` as follows, in order:

    * values containing "Private room" or "Room in" become "Private Room";
    * values containing "Shared room" become "Shared Room";
    * values containing "Entire " have that text removed and the remainder
      capitalized (e.g. "Entire condominium" -> "Condominium");
    * values containing "Tiny house", "Bungalow" or "Cabin" become "Cottage".

    Anything else, including missing values, is carried over unchanged.

    Args:
        rental_df: DataFrame with a string column called ``property_type``.

    Returns:
        The same DataFrame object, modified in place, with the extra
        ``property_type_class`` column.
    """
    property_type = rental_df["property_type"].copy()

    # na=False on every mask: a missing property_type would otherwise put NaN
    # into the mask, and .loc refuses to index with a mask that contains NA.
    is_private = property_type.str.contains("Private room|Room in", na=False)
    property_type.loc[is_private] = "Private Room"

    is_shared = property_type.str.contains("Shared room", na=False)
    property_type.loc[is_shared] = "Shared Room"

    # Keep only the part after "Entire " to get the actual type, such as house.
    is_entire = property_type.str.contains("Entire ", na=False)
    property_type.loc[is_entire] = (
        property_type.loc[is_entire]
        .str.replace("Entire ", "", regex=False)
        .str.capitalize()
    )

    # Bungalows, tiny houses and cabins are all grouped as cottages:
    # traditionally a cabin is a small house built with simple tools, and a
    # bungalow is a style of house or cottage.
    is_cottage = property_type.str.contains("Tiny house|Bungalow|Cabin", na=False)
    property_type.loc[is_cottage] = "Cottage"

    rental_df["property_type_class"] = property_type
    return rental_df
##############################################################################################################
# Clean up the dataframe
##############################################################################################################
# Columns of the listings file that the analysis keeps.
_RENTAL_COLUMNS = [
    "id",
    "price",
    "listing_url",
    "host_id",
    "host_response_rate",
    "host_response_time",
    "host_acceptance_rate",
    "review_scores_communication",
    "review_scores_location",
    "review_scores_value",
    "review_scores_checkin",
    "reviews_per_month",
    "review_scores_cleanliness",
    "license",
    "instant_bookable",
    "number_of_reviews",
    "first_review",
    "last_review",
    "neighbourhood_cleansed",
    "neighbourhood_group_cleansed",
    "latitude",
    "longitude",
    "accommodates",
    "bathrooms_text",
    "property_type",
    "has_availability",
    "availability_30",
    "availability_60",
    "availability_90",
    "availability_365",
]

# Ordinal rank for host_response_time; 1 is the fastest response.
_RESPONSE_TIME_RANK = {
    "within an hour": 1,
    "within a few hours": 2,
    "within a day": 3,
    "a few days or more": 4,
}

# The file encodes booleans as "t" (true) and "f" (false).
_FLAG_CODES = {
    "t": 1,
    "f": 0,
}


def _percent_filled_by_host(rental_df, column):
    """Parse a "NN%" string column to float, filling gaps per host, then with 0."""
    percent = rental_df[column].str.replace("%", "", regex=False).astype("float64")
    # A missing rate takes the mean of the same host's other listings.
    host_mean = percent.groupby(rental_df["host_id"]).transform("mean")
    # Anything still missing belongs to a host with no known rate at all;
    # there is no information to go on, so fill with zero.
    return percent.fillna(host_mean).fillna(0)


def cleanRentalDF(filename):
    """Read a listings CSV and return a cleaned DataFrame for analysis.

    Steps applied:

    * keep only the columns listed in ``_RENTAL_COLUMNS``;
    * convert ``price`` from text such as "$1,250.00" to float;
    * replace ``host_response_rate`` and ``host_acceptance_rate`` with float
      columns ``host_response_rate_percent`` and
      ``host_acceptance_rate_percent``; missing values take the mean of the
      same host's listings, or 0 when the host has none;
    * replace ``host_response_time`` with the numeric column
      ``host_reponse_time_rank`` (1-4, 0 when missing or unrecognized);
    * map ``instant_bookable`` and ``has_availability`` from "t"/"f" to 1/0
      (any other value becomes NaN);
    * parse ``first_review`` and ``last_review`` as datetimes;
    * add ``property_type_class`` via ``createPropertyTypeCol``.

    Args:
        filename: Path of the CSV file, passed to ``pd.read_csv``.

    Returns:
        The cleaned DataFrame.
    """
    listings_df = pd.read_csv(filename)
    rental_df = listings_df[_RENTAL_COLUMNS].copy()

    # regex=False: "$" is a regex anchor, so the replacement must be literal
    # rather than depend on the pandas-version-specific default of `regex`.
    rental_df["price"] = (
        rental_df["price"]
        .str.replace("$", "", regex=False)
        .str.replace(",", "", regex=False)
        .astype("float64")
    )

    # Response rate as a continuous value instead of a string.
    rental_df["host_response_rate_percent"] = _percent_filled_by_host(
        rental_df, "host_response_rate"
    )
    rental_df = rental_df.drop("host_response_rate", axis="columns")

    # NOTE: the column name "host_reponse_time_rank" (sic) is kept as is,
    # because code that uses the returned frame may refer to it by that name.
    rental_df["host_reponse_time_rank"] = (
        rental_df["host_response_time"].map(_RESPONSE_TIME_RANK).fillna(0)
    )
    rental_df = rental_df.drop("host_response_time", axis="columns")

    # Same treatment as the response rate.
    rental_df["host_acceptance_rate_percent"] = _percent_filled_by_host(
        rental_df, "host_acceptance_rate"
    )
    rental_df = rental_df.drop("host_acceptance_rate", axis="columns")

    for flag_column in ("instant_bookable", "has_availability"):
        rental_df[flag_column] = rental_df[flag_column].map(_FLAG_CODES)

    for date_column in ("first_review", "last_review"):
        rental_df[date_column] = pd.to_datetime(rental_df[date_column])

    # Add a column for a smaller list of property types.
    rental_df = createPropertyTypeCol(rental_df)
    return rental_df
################################################################################################################
# Call the cleanup function and setup a global dataframe
################################################################################################################
# Forward slash instead of a backslash in the path: "\l" is an invalid escape
# sequence (a warning on current Python versions), and "/" resolves on both
# Windows and POSIX systems.
full_df = cleanRentalDF("data/listings_1.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment