Pandas #pandas
https://www.youtube.com/watch?v=rr-KwIjinpM
(25-57)
score_counts = df1['TrustScore'].value_counts()
print('count of rows by location')
dfa["location"].value_counts()
print('count of rows by Tenure')
dfa["tenure(months)"].value_counts()
dfah1.groupby('address')['out'].count().sort_values(ascending=False).head(10)
dfpr.groupby('gender')['day_of_week'].value_counts()
dfpo.groupby('gender')['department'].value_counts()
Prencc = dfpr.groupby('address')['out'].count().sort_values(ascending=False)
df['gender'].value_counts()
----https://github.com/atodev/People-Analytics/blob/main/RemoteWorkRevised.ipynb
sns.set_style("whitegrid")
order = dfpr['department'].value_counts(ascending=True).index
sns.countplot(x='department', data=dfpr, order=order).set(title = 'Pre Remote working for Departments');
print('Pre-count of rows by department')
dfpr["department"].value_counts()
#means
print ('post: level mean ',dfpo.level.mean())
print ('post: TrustScore mean ',dfpo.ts.mean())
print ('post: Messages out mean ',dfpo.out.mean())
---SUM
dfpr.groupby('address')['out'].sum().sort_values(ascending=False).head(10)
from Epoch to datetime
df['Created'] = pd.to_datetime(df['DateTime'],unit='s')
# canonicalise undirected pairs: sort each row so (A,B) and (B,A) match, then dedupe
cols = ['Local','Remote']
dfu[cols] = np.sort(dfu[cols].values, axis=1)
dfun = dfu.drop_duplicates(subset=cols)
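a worked toy example of the sort-then-dedupe trick (hypothetical pairs; column names as above):
import numpy as np
import pandas as pd
dfu = pd.DataFrame({'Local': ['A', 'B', 'C'], 'Remote': ['B', 'A', 'D']})
cols = ['Local', 'Remote']
dfu[cols] = np.sort(dfu[cols].values, axis=1)   # ('B','A') becomes ('A','B')
dfu.drop_duplicates(subset=cols)                # keeps one row per undirected pair: (A,B), (C,D)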
-- check for emails to self
dfd = df[df['sender'] == df['recipient']]
--- remove them
dfe = df[df['sender'] != df['recipient']]
https://www.youtube.com/watch?v=CObmzgGYtX8
#DRAW HIST
dfe.hist(column='TrustScore')
output the bin counts behind the histogram
pd.cut(dfe['TrustScore'], 10).value_counts().sort_index()
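pd.cut makes equal-width bins; if roughly equal-count bins are wanted instead, pd.qcut is the quantile variant (a sketch, same column):
pd.qcut(dfe['TrustScore'], 10, duplicates='drop').value_counts().sort_index()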
separate counts of TS
dff = pd.Series(dfe.values.ravel()).value_counts()  # pd.value_counts(...) is deprecated in newer pandas
good one
https://www.youtube.com/watch?v=VfRyjSh1v0I&list=PL9ATnizYJ7f-qc0eDIY081kfqvJduiEkv&index=4
df.nunique() # lists all cols and counts the unique items in them
# measure time taken of the next block
%%time
(df1.groupby('Neighborhood').agg(median_price=('SalesPrice','median')))
---converting the column to categorical reduces the time by more than 50%
df2 = df1.copy()
df2['Neighborhood'] = df2['Neighborhood'].astype('category')
%%time
(df2.groupby('Neighborhood').agg(median_price=('SalesPrice','median')))
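the saving shows up in memory too; a quick check (assumes the frames from the timing cells above):
df1['Neighborhood'].memory_usage(deep=True)   # object column: one Python string per row
df2['Neighborhood'].memory_usage(deep=True)   # categorical: small integer codes plus one copy of each label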
https://stackoverflow.com/questions/23917144/python-pandas-dateoffset-using-value-from-another-column
df['new_date'] = df['orig_date'] + df['offset'].astype('timedelta64[D]')
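the astype('timedelta64[D]') cast is the older idiom; pd.to_timedelta is the safer current form. A self-contained sketch with hypothetical columns:
import pandas as pd
df = pd.DataFrame({'orig_date': pd.to_datetime(['2020-03-01', '2020-03-02']),
                   'offset': [3, 10]})
df['new_date'] = df['orig_date'] + pd.to_timedelta(df['offset'], unit='D')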
#import libraries
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import seaborn as sns; sns.set()
import plotly.express as px
%matplotlib inline
#load the data
dfgp=pd.read_csv('../Data/GeneralPre.csv')
#The Merger
df2=pd.merge(dfgpo, df, on='address')
df2=pd.merge(df2, dfn, on='recipient', how='right').fillna('NA')
#Create CSV
df2.to_csv(r'AccCovid-wth-nw-post.csv', index = False)
#Create HTML
!jupyter nbconvert --to html cae-count.ipynb
#Strip white spaces
dfgp=dfgp.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
# New DF from old - select cols
df3 = df2[["sender","recipient"]]
#Query
dfpr.query('address == 2').sort_values(by='datetime')
dfahS=dfah2.query('address == 122')
# Group by
PostNCC = dfpo.groupby('address')['out'].count().sort_values(ascending=False)
#filter by female HiPo and Influencers
femaledf = df.query('gender == "female" and Hipo == "yes" and Influencers =="Y"')
#Split on date
split_date ='3/14/2020'
df_pre = df.loc[df['DateTime'] <= split_date]
df_post = df.loc[df['DateTime'] > split_date]
df_pre
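the string comparison above only behaves if DateTime is already a datetime column; a guarded sketch:
df['DateTime'] = pd.to_datetime(df['DateTime'])
split_date = pd.Timestamp('2020-03-14')
df_pre = df.loc[df['DateTime'] <= split_date]
df_post = df.loc[df['DateTime'] > split_date]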
#Count duplicates (raw length minus deduplicated length)
len(dfpo['datetime'])-len(dfpo['datetime'].drop_duplicates())
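to actually drop them rather than just count them (the subset column is an assumption):
dfpo = dfpo.drop_duplicates(subset='datetime', keep='first')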
#Convert to datetime
dfpr[["datetime"]] = dfpr[["datetime"]].apply(pd.to_datetime)
# Min Max dates
min(df2['Created'])
max(df2['Created'])
# list all days that messages were sent - separate the date
df4 = df2[["Created"]]
df4[["Created"]] = df4[["Created"]].apply(pd.to_datetime)
df4 = df4['Created'].dt.date
df4= df4.sort_values().drop_duplicates(keep='first')
#collect weekdays from datetime
dfpr['day_of_week'] = dfpr['datetime'].dt.day_name()
#collect hours from datetime
dfpr['hour'] = dfpr['datetime'].dt.strftime('%H').astype(int)
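---dt.hour returns the same integers without the string round-trip
dfpr['hour'] = dfpr['datetime'].dt.hour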
#Pre List Department activity by day of the week
dfpr.groupby('department')['day_of_week'].value_counts()
#gender by hour
dfah2.groupby('gender')['hour'].value_counts()
#level by day
dfah1.groupby('level')['day_of_week'].value_counts()
#extract int from address
dfgpo['recipient'] = dfgpo['recipient'].str.extract(r'(\d+)').astype(int)
# truncate user and group IDs to their first 8 characters
dfr['From User'] = dfr['From User'].str.slice(0,8)
dfr['To User'] = dfr['To User'].str.slice(0,8)
dfr['Group ID'] = dfr['Group ID'].str.slice(0,8)
#Conditionals
df_select = g_data[g_data[options] == item]
df_select = df_select.reset_index(drop=True)
dfe20 = dfe[dfe['TrustScore'] > 19]
diversity = div2[div2['ncs'] > 38]
#less than and greater than
dfe2025 = dfe[dfe['TrustScore'] > 20]
dfe2025 = dfe2025[dfe2025['TrustScore'] < 25]
#pivot table
bplt=np.round(df.pivot_table(index='Department', columns='Level'))
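with no values/aggfunc, pivot_table averages every numeric column; the explicit form (TrustScore as an assumed value column):
bplt = np.round(df.pivot_table(index='Department', columns='Level', values='TrustScore', aggfunc='mean'))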
#DataType
df.dtypes
df.describe()
df.head(10)
#change header names
dfr.rename({'From User': 'source', 'To User': 'target'}, axis=1, inplace=True)
dfr
https://www.youtube.com/watch?v=xY54Emo8rQM
from pandasql import sqldf
# example
print(sqldf("select * from df where col_name > 2500 limit 5;", locals()))  # df / col_name are placeholders
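a self-contained sketch (toy frame; sqldf looks up DataFrames by their Python variable name, and column names must be valid SQL identifiers or quoted):
import pandas as pd
from pandasql import sqldf
df = pd.DataFrame({'out': [3000, 1200, 2600]})
print(sqldf("select * from df where out > 2500 limit 5;", locals()))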
# Plot overlaid histograms for continuous features
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
for i in ['age', 'fare']:
    died = list(titanic[titanic['survived'] == 0][i].dropna())
    survived = list(titanic[titanic['survived'] == 1][i].dropna())
    xmin = min(min(died), min(survived))
    xmax = max(max(died), max(survived))
    width = (xmax - xmin) / 40
    # note: sns.distplot is deprecated in newer seaborn; sns.histplot is the replacement
    sns.distplot(died, color='r', kde=False, bins=np.arange(xmin, xmax, width))
    sns.distplot(survived, color='g', kde=False, bins=np.arange(xmin, xmax, width))
    plt.legend(['Did not survive', 'Survived'])
    plt.title('Overlaid histogram for {}'.format(i))
    plt.show()
# Generate categorical plots for ordinal features
for col in ['pclass', 'sibsp', 'parch']:
    sns.catplot(x=col, y='survived', data=titanic, kind='point', aspect=2)
    plt.ylim(0, 1)
name = '33522'
df1 = df.query('sender == @name')
m_data = pd.read_csv("<your_file>", na_values=['?'], names=['col_1','col_2'])
#find the isnulls
titanic.isnull().sum()
#find the unique values in columns
for col in titanic.columns:
    print('{}: {} unique values'.format(col, titanic[col].nunique()))
#Drop Columns
cat_feat = ["sex","embarked","class","who","adult_male","deck","embark_town","alive","alone"]
titanic.drop(cat_feat,axis=1,inplace=True)
titanic.head()
#see Distribution
titanic['survived'].value_counts()
#describe features
from scipy import stats

def describe_cont_feature(feature):
    print('\n*** Results for {} ***'.format(feature))
    print(titanic.groupby('survived')[feature].describe())
    ttest(feature)  # ttest prints its own line; wrapping it in print() would add a stray 'None'

def ttest(feature):
    survived = titanic[titanic['survived'] == 1][feature]
    not_survived = titanic[titanic['survived'] == 0][feature]
    tstat, pval = stats.ttest_ind(survived, not_survived, equal_var=False)
    print('t-statistic: {:.1f}, p-value: {:.3}'.format(tstat, pval))

#look at the distribution of each feature at each level of the target variable
for feature in ['pclass','age','sibsp','parch','fare']:
    describe_cont_feature(feature)
#info
df.info()
for col in df.columns:
    print(col, len(df[col].unique()), df[col].unique())
df.describe()
df.describe(include='object')
https://www.youtube.com/watch?v=TJK-Ct9WnCw
strip @domain from sender & recipient
df['recipient'] = df['recipient'].str.extract(r'(\d+)').astype(int)
df['sender'] = df['sender'].str.extract(r'(\d+)').astype(int)
https://stackoverflow.com/questions/25797245/how-to-properly-add-hours-to-a-pandas-tseries-index-datetimeindex
test[1].index + pd.DateOffset(hours=16)
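a self-contained version of the same shift (toy index):
idx = pd.date_range('2020-03-14', periods=3, freq='D')
idx + pd.DateOffset(hours=16)   # each midnight timestamp moves to 16:00 the same day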
TZ
https://stackoverflow.com/questions/1111056/get-time-zone-information-of-the-system-in-python
https://stackoverflow.com/questions/34789888/convert-datetime-columns-to-a-different-timezone-pandas
--just get the domain
df['domain_to'] = df['To'].apply(lambda x: x.split('@')[1])
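the .str accessor does the same split without a Python-level lambda, and is NaN-safe:
df['domain_to'] = df['To'].str.split('@').str[1]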