audhiaprilliant · November 11, 2022 16:55
diff --git a/matplotlib-102-dataprep.py b/matplotlib-102-dataprep.py
 # ---------- IMPORT PACKAGES ----------

 # Dataframe manipulation
 import pandas as pd

 # Matrices operation
 import numpy as np

 # Data viz with matplotlib
 import matplotlib
 import matplotlib.pyplot as plt
 from matplotlib import style

 # Check packages' version
 # NOTE: seaborn is never imported by this script, so the former
 # `sns.__version__` line raised a NameError; only imported packages are reported.
 print('pandas     ', pd.__version__)
 print('numpy      ', np.__version__)
 print('matplotlib ', matplotlib.__version__)

 # ---------- LOAD DATA SET ----------

 # Load the data set into Python (the CSV is parsed with ';' as the separator)
 df = pd.read_csv(filepath_or_buffer = 'data/WA_Fn-UseC_-Telco-Customer-Churn.csv', sep = ';')

 # Print top 5 rows
 # (a bare `df.head()` expression is discarded when run as a script, so print it)
 print(df.head(n = 5))

 # Metadata (df.info() writes its summary to stdout by itself)
 df.info()

 # Change column types: cast SeniorCitizen to object dtype
 df = df.astype({'SeniorCitizen': object})

 # ---------- DATA PREPARATION ----------

 # Number of missing values per column of the data frame
 # (printed explicitly: a bare expression produces no output in a script)
 print(df.isnull().sum())

 # Summary statistics
 print(df.describe())

 # Check the unique values of categorical columns
 # (every object-dtype column gets its value frequencies printed)
 for col in df.select_dtypes('object').columns:
    print(df[str(col)].value_counts(), '\n')

 # ---------- AGGREGATE DATA FOR SAMPLES ----------
 # Each df_group_N holds one row per category with the count of customerID values.

 # 1 Number of customer by payment method, sorted from largest to smallest group
 df_group_1 = df.groupby('PaymentMethod')[['customerID']].count().reset_index()
 df_group_1 = df_group_1.sort_values(by = 'customerID', ascending = False)
 # Running share and running total of customers, following the sorted order
 df_group_1['CummulativePerc'] = (df_group_1['customerID'].cumsum() / df_group_1['customerID'].sum())
 df_group_1['CummulativeSum'] = df_group_1['customerID'].cumsum()

 # 2 Number of customer by gender
 df_group_2 = df.groupby('gender')[['customerID']].count().reset_index()

 # 3 Number of customer by contract type
 # NOTE(review): this was labelled "senior citizen status", but the code groups
 # by 'Contract' - confirm which grouping was intended.
 df_group_3 = df.groupby('Contract')[['customerID']].count().reset_index()

 # 4 Number of customer by paperless billing status
 df_group_4 = df.groupby('PaperlessBilling')[['customerID']].count().reset_index()

 # 5 Number of customer by churn status
 df_group_5 = df.groupby('Churn')[['customerID']].count().reset_index()
	# ---------- IMPORT PACKAGES ----------

	# Dataframe manipulation
	import pandas as pd

	# Matrices operation
	import numpy as np

	# Data viz with matplotlib
	import matplotlib
	import matplotlib.pyplot as plt
	from matplotlib import style

	# Check packages' version
	# NOTE: seaborn is never imported by this script, so the former
	# `sns.__version__` line raised a NameError; only imported packages are reported.
	print('pandas ', pd.__version__)
	print('numpy ', np.__version__)
	print('matplotlib ', matplotlib.__version__)

	# ---------- LOAD DATA SET ----------

	# Load the data set into Python (the CSV is parsed with ';' as the separator)
	df = pd.read_csv(filepath_or_buffer = 'data/WA_Fn-UseC_-Telco-Customer-Churn.csv', sep = ';')

	# Print top 5 rows
	# (a bare `df.head()` expression is discarded when run as a script, so print it)
	print(df.head(n = 5))

	# Metadata (df.info() writes its summary to stdout by itself)
	df.info()

	# Change column types: cast SeniorCitizen to object dtype
	df = df.astype({'SeniorCitizen': object})

	# ---------- DATA PREPARATION ----------

	# Number of missing values per column of the data frame
	# (printed explicitly: a bare expression produces no output in a script)
	print(df.isnull().sum())

	# Summary statistics
	print(df.describe())

	# Check the unique values of categorical columns
	# (every object-dtype column gets its value frequencies printed)
	for col in df.select_dtypes('object').columns:
		print(df[str(col)].value_counts(), '\n')

	# ---------- AGGREGATE DATA FOR SAMPLES ----------
	# Each df_group_N holds one row per category with the count of customerID values.

	# 1 Number of customer by payment method, sorted from largest to smallest group
	df_group_1 = df.groupby('PaymentMethod')[['customerID']].count().reset_index()
	df_group_1 = df_group_1.sort_values(by = 'customerID', ascending = False)
	# Running share and running total of customers, following the sorted order
	df_group_1['CummulativePerc'] = (df_group_1['customerID'].cumsum() / df_group_1['customerID'].sum())
	df_group_1['CummulativeSum'] = df_group_1['customerID'].cumsum()

	# 2 Number of customer by gender
	df_group_2 = df.groupby('gender')[['customerID']].count().reset_index()

	# 3 Number of customer by contract type
	# NOTE(review): this was labelled "senior citizen status", but the code groups
	# by 'Contract' - confirm which grouping was intended.
	df_group_3 = df.groupby('Contract')[['customerID']].count().reset_index()

	# 4 Number of customer by paperless billing status
	df_group_4 = df.groupby('PaperlessBilling')[['customerID']].count().reset_index()

	# 5 Number of customer by churn status
	df_group_5 = df.groupby('Churn')[['customerID']].count().reset_index()
No results found