Pandas #pandas
https://www.youtube.com/watch?v=rr-KwIjinpM
(25-57)
score_counts = df1['TrustScore'].value_counts()
print('count of rows by location')
dfa["location"].value_counts()
print('count of rows by Tenure')
dfa["tenure(months)"].value_counts()
dfah1.groupby('address')['out'].count().sort_values(ascending=False).head(10)
dfpr.groupby('gender')['day_of_week'].value_counts()
dfpo.groupby('gender')['department'].value_counts()
Prencc = dfpr.groupby('address')['out'].count().sort_values(ascending=False)
df['gender'].value_counts()
----https://github.com/atodev/People-Analytics/blob/main/RemoteWorkRevised.ipynb
sns.set_style("whitegrid")
order = dfpr['department'].value_counts(ascending=True).index
sns.countplot(x='department', data=dfpr, order=order).set(title = 'Pre Remote working for Departments');
print('Pre-count of rows by department')
dfpr["department"].value_counts()
#means
print ('post: level mean ',dfpo.level.mean())
print ('post: TrustScore mean ',dfpo.ts.mean())
print ('post: Messages out mean ',dfpo.out.mean())
---SUM
dfpr.groupby('address')['out'].sum().sort_values(ascending=False).head(10)
from Epoch to datetime
df['Created'] = pd.to_datetime(df['DateTime'],unit='s')
# canonicalise undirected pairs: sort each row so (A,B) and (B,A) match, then dedupe
cols = ['Local','Remote']
dfu[cols] = np.sort(dfu[cols].values, axis=1)
dfun = dfu.drop_duplicates(subset=cols)
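a worked toy example of the sort-then-dedupe trick (hypothetical pairs; column names as above):
import numpy as np
import pandas as pd
dfu = pd.DataFrame({'Local': ['A', 'B', 'C'], 'Remote': ['B', 'A', 'D']})
cols = ['Local', 'Remote']
dfu[cols] = np.sort(dfu[cols].values, axis=1)   # ('B','A') becomes ('A','B')
dfu.drop_duplicates(subset=cols)                # keeps one row per undirected pair: (A,B), (C,D)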
-- check for emails to self
dfd = df[df['sender'] == df['recipient']]
--- remove them
dfe = df[df['sender'] != df['recipient']]
https://www.youtube.com/watch?v=CObmzgGYtX8
#DRAW HIST
dfe.hist(column='TrustScore')
output the bin counts behind the histogram
pd.cut(dfe['TrustScore'], 10).value_counts().sort_index()
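pd.cut makes equal-width bins; if roughly equal-count bins are wanted instead, pd.qcut is the quantile variant (a sketch, same column):
pd.qcut(dfe['TrustScore'], 10, duplicates='drop').value_counts().sort_index()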
separate counts of TS
dff = pd.Series(dfe.values.ravel()).value_counts()  # pd.value_counts(...) is deprecated in newer pandas
good one
https://www.youtube.com/watch?v=VfRyjSh1v0I&list=PL9ATnizYJ7f-qc0eDIY081kfqvJduiEkv&index=4
df.nunique() # lists all cols and counts the unique items in them
# measure time taken of the next block
%%time
(df1.groupby('Neighborhood').agg(median_price=('SalesPrice','median')))
---converting the column to categorical reduces the time by more than 50%
df2 = df1.copy()
df2['Neighborhood'] = df2['Neighborhood'].astype('category')
%%time
(df2.groupby('Neighborhood').agg(median_price=('SalesPrice','median')))
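the saving shows up in memory too; a quick check (assumes the frames from the timing cells above):
df1['Neighborhood'].memory_usage(deep=True)   # object column: one Python string per row
df2['Neighborhood'].memory_usage(deep=True)   # categorical: small integer codes plus one copy of each label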
https://stackoverflow.com/questions/23917144/python-pandas-dateoffset-using-value-from-another-column
df['new_date'] = df['orig_date'] + df['offset'].astype('timedelta64[D]')
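the astype('timedelta64[D]') cast is the older idiom; pd.to_timedelta is the safer current form. A self-contained sketch with hypothetical columns:
import pandas as pd
df = pd.DataFrame({'orig_date': pd.to_datetime(['2020-03-01', '2020-03-02']),
                   'offset': [3, 10]})
df['new_date'] = df['orig_date'] + pd.to_timedelta(df['offset'], unit='D')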
#import libraries
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import seaborn as sns; sns.set()
import plotly.express as px
%matplotlib inline
#load the data
dfgp=pd.read_csv('../Data/GeneralPre.csv')
#The Merger
df2=pd.merge(dfgpo, df, on='address')
df2=pd.merge(df2, dfn, on='recipient', how='right').fillna('NA')
#Create CSV
df2.to_csv(r'AccCovid-wth-nw-post.csv', index = False)
#Create HTML
!jupyter nbconvert --to html cae-count.ipynb
#Strip white spaces
dfgp=dfgp.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
# New DF from old - select cols
df3 = df2[["sender","recipient"]]
#Query
dfpr.query('address == 2').sort_values(by='datetime')
dfahS=dfah2.query('address == 122')
# Group by
PostNCC = dfpo.groupby('address')['out'].count().sort_values(ascending=False)
#filter by female HiPo and Influencers
femaledf = df.query('gender == "female" and Hipo == "yes" and Influencers =="Y"')
#Split on date
split_date ='3/14/2020'
df_pre = df.loc[df['DateTime'] <= split_date]
df_post = df.loc[df['DateTime'] > split_date]
df_pre
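the string comparison above only behaves if DateTime is already a datetime column; a guarded sketch:
df['DateTime'] = pd.to_datetime(df['DateTime'])
split_date = pd.Timestamp('2020-03-14')
df_pre = df.loc[df['DateTime'] <= split_date]
df_post = df.loc[df['DateTime'] > split_date]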
#Count duplicates (raw length minus deduplicated length)
len(dfpo['datetime'])-len(dfpo['datetime'].drop_duplicates())
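to actually drop them rather than just count them (the subset column is an assumption):
dfpo = dfpo.drop_duplicates(subset='datetime', keep='first')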
#Convert to datetime
dfpr[["datetime"]] = dfpr[["datetime"]].apply(pd.to_datetime)
# Min Max dates
min(df2['Created'])
max(df2['Created'])
# list all days that messages were sent - separate the date
df4 = df2[["Created"]]
df4[["Created"]] = df4[["Created"]].apply(pd.to_datetime)
df4 = df4['Created'].dt.date
df4= df4.sort_values().drop_duplicates(keep='first')
#collect weekdays from datetime
dfpr['day_of_week'] = dfpr['datetime'].dt.day_name()
#collect hours from datetime
dfpr['hour'] = dfpr['datetime'].dt.strftime('%H').astype(int)
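---dt.hour returns the same integers without the string round-trip
dfpr['hour'] = dfpr['datetime'].dt.hour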
#Pre List Department activity by day of the week
dfpr.groupby('department')['day_of_week'].value_counts()
#gender by hour
dfah2.groupby('gender')['hour'].value_counts()
#level by day
dfah1.groupby('level')['day_of_week'].value_counts()
#extract int from address
dfgpo['recipient'] = dfgpo['recipient'].str.extract(r'(\d+)').astype(int)
# truncate user and group IDs to their first 8 characters
dfr['From User'] = dfr['From User'].str.slice(0,8)
dfr['To User'] = dfr['To User'].str.slice(0,8)
dfr['Group ID'] = dfr['Group ID'].str.slice(0,8)
#Conditionals
df_select = g_data[g_data[options] == item]
df_select = df_select.reset_index(drop=True)
dfe20 = dfe[dfe['TrustScore'] > 19]
diversity = div2[div2['ncs'] > 38]
#less than and greater than
dfe2025 = dfe[dfe['TrustScore'] > 20]
dfe2025 = dfe2025[dfe2025['TrustScore'] < 25]
#pivot table
bplt=np.round(df.pivot_table(index='Department', columns='Level'))
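with no values/aggfunc, pivot_table averages every numeric column; the explicit form (TrustScore as an assumed value column):
bplt = np.round(df.pivot_table(index='Department', columns='Level', values='TrustScore', aggfunc='mean'))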
#DataType
df.dtypes
df.describe()
df.head(10)
#change header names
dfr.rename({'From User': 'source', 'To User': 'target'}, axis=1, inplace=True)
dfr
https://www.youtube.com/watch?v=xY54Emo8rQM
from pandasql import sqldf
# example
print(sqldf("select * from df where col_name > 2500 limit 5;", locals()))  # df / col_name are placeholders
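a self-contained sketch (toy frame; sqldf looks up DataFrames by their Python variable name, and column names must be valid SQL identifiers or quoted):
import pandas as pd
from pandasql import sqldf
df = pd.DataFrame({'out': [3000, 1200, 2600]})
print(sqldf("select * from df where out > 2500 limit 5;", locals()))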
# Plot overlaid histograms for continuous features
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
for i in ['age', 'fare']:
    died = list(titanic[titanic['survived'] == 0][i].dropna())
    survived = list(titanic[titanic['survived'] == 1][i].dropna())
    xmin = min(min(died), min(survived))
    xmax = max(max(died), max(survived))
    width = (xmax - xmin) / 40
    # note: sns.distplot is deprecated in newer seaborn; sns.histplot is the replacement
    sns.distplot(died, color='r', kde=False, bins=np.arange(xmin, xmax, width))
    sns.distplot(survived, color='g', kde=False, bins=np.arange(xmin, xmax, width))
    plt.legend(['Did not survive', 'Survived'])
    plt.title('Overlaid histogram for {}'.format(i))
    plt.show()
# Generate categorical plots for ordinal features
for col in ['pclass', 'sibsp', 'parch']:
    sns.catplot(x=col, y='survived', data=titanic, kind='point', aspect=2)
    plt.ylim(0, 1)
name = '33522'
df1 = df.query('sender == @name')
m_data = pd.read_csv("<your_file>", na_values=['?'], names=['col_1','col_2'])
#find the isnulls
titanic.isnull().sum()
#find the unique values in columns
for col in titanic.columns:
    print('{}: {} unique values'.format(col, titanic[col].nunique()))
#Drop Columns
cat_feat = ["sex","embarked","class","who","adult_male","deck","embark_town","alive","alone"]
titanic.drop(cat_feat,axis=1,inplace=True)
titanic.head()
#see Distribution
titanic['survived'].value_counts()
#describe features
from scipy import stats

def describe_cont_feature(feature):
    print('\n*** Results for {} ***'.format(feature))
    print(titanic.groupby('survived')[feature].describe())
    ttest(feature)  # ttest prints its own line; wrapping it in print() would add a stray 'None'

def ttest(feature):
    survived = titanic[titanic['survived'] == 1][feature]
    not_survived = titanic[titanic['survived'] == 0][feature]
    tstat, pval = stats.ttest_ind(survived, not_survived, equal_var=False)
    print('t-statistic: {:.1f}, p-value: {:.3}'.format(tstat, pval))

#look at the distribution of each feature at each level of the target variable
for feature in ['pclass','age','sibsp','parch','fare']:
    describe_cont_feature(feature)
#info
df.info()
for col in df.columns:
    print(col, len(df[col].unique()), df[col].unique())
df.describe()
df.describe(include='object')
https://www.youtube.com/watch?v=TJK-Ct9WnCw
strip @domain from sender & recipient
df['recipient'] = df['recipient'].str.extract(r'(\d+)').astype(int)
df['sender'] = df['sender'].str.extract(r'(\d+)').astype(int)
https://stackoverflow.com/questions/25797245/how-to-properly-add-hours-to-a-pandas-tseries-index-datetimeindex
test[1].index + pd.DateOffset(hours=16)
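a self-contained version of the same shift (toy index):
idx = pd.date_range('2020-03-14', periods=3, freq='D')
idx + pd.DateOffset(hours=16)   # each midnight timestamp moves to 16:00 the same day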
TZ
https://stackoverflow.com/questions/1111056/get-time-zone-information-of-the-system-in-python
https://stackoverflow.com/questions/34789888/convert-datetime-columns-to-a-different-timezone-pandas
--just get the domain
df['domain_to'] = df['To'].apply(lambda x: x.split('@')[1])
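the .str accessor does the same split without a Python-level lambda, and is NaN-safe:
df['domain_to'] = df['To'].str.split('@').str[1]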