Skip to content

Instantly share code, notes, and snippets.

@ravishchawla
Created February 25, 2019 20:10
Show Gist options
  • Save ravishchawla/2c7137db660408eaf7c6646593c651e3 to your computer and use it in GitHub Desktop.
Save ravishchawla/2c7137db660408eaf7c6646593c651e3 to your computer and use it in GitHub Desktop.
AirBnB post: Data exploration
def df_stats(df) -> None:
    """Print the shape of ``df`` and the share of missing values per column.

    Output, in order:
      1. ``Shape:`` followed by ``df.shape``.
      2. A list of ``(column, fraction_missing)`` pairs for every column that
         has at least one null entry, sorted from most to least missing.
      3. An empty line, so consecutive reports are visually separated.

    Args:
        df: A pandas DataFrame to summarise.

    Returns:
        None. The summary is written to standard output only.
    """
    print('Shape: ', df.shape)

    # Mean of the boolean null-mask is the fraction of missing rows per column.
    # For a frame with zero rows this is NaN, which the `> 0` filter drops.
    missing_frac = df.isnull().mean()
    missing_frac = missing_frac[missing_frac > 0]

    # sorted() is stable, so columns with equal fractions keep their original
    # column order. Iterating the Series directly (instead of going through a
    # dict) also keeps duplicate column names instead of collapsing them.
    ranked = sorted(missing_frac.items(), key=lambda item: item[1], reverse=True)

    # Convert to built-in floats so the printed list reads `0.25` regardless
    # of how the installed numpy version formats its scalar types.
    print('# Columns with any missing elements : ',
          [(column, float(fraction)) for column, fraction in ranked])
    print()
# Print the shape / missing-value summary for each of the three data sets.
# NOTE: the frame variable is spelled `calandar` where it is defined (outside
# this snippet), so that name is kept; only the printed label is corrected.
for label, frame in (
    ('Listings: ', listings),
    ('Reviews: ', reviews),
    ('Calendar: ', calandar),
):
    print(label)
    df_stats(frame)
# Let's look at how the values of the different listing attributes are
# distributed, using seaborn's pair plot over the whole listings frame.
sns.set(style='ticks')
sns.pairplot(data=listings)
# The price attribute looks very skewed, and it is important, so let's look
# at it in detail: a 500-bin histogram drawn on a logarithmic x-axis.
fig, ax = plt.subplots()
listings['price'].hist(bins=500, ax=ax)
ax.set_xscale('log')
# Let's look at the correlation between the attributes in Listings.
plt.figure(figsize=(14, 10))
# Correlate only numeric/boolean columns. listings also carries categorical
# columns (e.g. 'neighbourhood_group', 'room_type' — presumably strings), and
# since pandas 2.0 DataFrame.corr() no longer drops such columns silently but
# raises instead. select_dtypes() works on old and new pandas alike.
corrs = listings.select_dtypes(include=['number', 'bool']).corr()
# Trailing semicolon kept on purpose: when run as the last statement of a
# notebook cell it stops the returned Axes object from being echoed.
sns.heatmap(corrs);
# Side-by-side bar charts of how many listings fall into each
# neighbourhood group (left) and each room type (right).
listings_cats = listings[['neighbourhood_group', 'neighbourhood', 'room_type']]
plt.figure(figsize=(12, 4))

# (column to count, rotation in degrees for its x tick labels)
panels = (('neighbourhood_group', 90), ('room_type', 45))
for position, (column, angle) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    g = sns.countplot(x=column, data=listings_cats)
    # Rotate the category labels so they do not overlap.
    g.set_xticklabels(g.get_xticklabels(), rotation=angle)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment