Skip to content

Instantly share code, notes, and snippets.

View dipanjanS's full-sized avatar
:octocat:

Dipanjan (DJ) Sarkar dipanjanS

:octocat:
View GitHub Profile
# Reduce every log row to its host plus the day-of-month derived from the
# 'time' column (aliased as 'day'), then preview five rows without
# truncating long values.
host_day_df = logs_df.select(
    logs_df['host'],
    F.dayofmonth('time').alias('day'),
)
host_day_df.show(5, truncate=False)
# Number of distinct values found in the 'host' column of logs_df.
unique_host_count = logs_df.select('host').distinct().count()
unique_host_count
# Rows whose 'status' is anything other than 200.
not200_df = logs_df.where(logs_df['status'] != 200)

# The ten endpoints that occur most often among those non-200 rows,
# highest count first.
error_endpoints_freq_df = (
    not200_df
    .groupBy('endpoint')
    .count()
    .orderBy('count', ascending=False)
    .limit(10)
)
# The twenty endpoints with the most log rows, highest count first.
paths_df = (
    logs_df
    .groupBy('endpoint')
    .count()
    .orderBy('count', ascending=False)
    .limit(20)
)

# Convert the small result to pandas and display it.
paths_pd_df = paths_df.toPandas()
paths_pd_df
# The ten hosts with the most log rows, highest count first.
host_sum_df = (
    logs_df
    .groupBy('host')
    .count()
    .orderBy('count', ascending=False)
    .limit(10)
)
host_sum_df.show(truncate=False)
# Convert the log-scaled status frequencies to pandas, sorted so the
# largest 'log(count)' comes first.
log_freq_pd_df = (
    log_freq_df
    .toPandas()
    .sort_values(by=['log(count)'], ascending=False)
)

# Bar chart of log(count) per status. The bar order is taken from this
# frame's own sorted 'status' column, so the sort above actually drives the
# plot and the cell no longer depends on a DataFrame built in another cell.
sns.catplot(
    x='status',
    y='log(count)',
    data=log_freq_pd_df,
    kind='bar',
    order=log_freq_pd_df['status'],
)
# Extend the per-status frequency table with a 'log(count)' column holding
# F.log applied to the 'count' column, then display the result.
log_freq_df = status_freq_df.withColumn(
    'log(count)',
    F.log(F.col('count')),
)
log_freq_df.show()
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline
# Bar chart of raw counts per status; bars follow the row order of
# status_freq_pd_df['status'].
sns.catplot(
    data=status_freq_pd_df,
    x='status',
    y='count',
    kind='bar',
    order=status_freq_pd_df['status'],
)
# Convert the per-status frequency table to pandas and sort it so the
# largest 'count' comes first; the bare name displays it in the notebook.
status_freq_pd_df = status_freq_df.toPandas().sort_values(
    by=['count'],
    ascending=False,
)
status_freq_pd_df
# Count log rows per 'status' value, ordered by status, and mark the result
# for caching since it is reused after this point.
status_freq_df = (
    logs_df
    .groupBy('status')
    .count()
    .orderBy('status')
    .cache()
)

# One row per status value, so the row count is the number of distinct codes.
print('Total distinct HTTP Status Codes:', status_freq_df.count())