Created
October 1, 2022 03:29
-
-
Save 152334H/799f0f0aa9bab2172fe3cb3fc2bfca47 to your computer and use it in GitHub Desktop.
hacked out script made in <1hr to look at the tech interview spreadsheet data a bit better for my purposes
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' NOTE: | |
0. install pandas and numpy to run this | |
1. Download the data at https://docs.google.com/spreadsheets/d/1QtC8efWw0mVkGXW4QA9bX4f0nJhGbmFqCfVLSumMZ0I/edit **AS A CSV FILE** | |
2. **REMOVE THE FIRST LINE FROM THE CSV FILE BEFORE RUNNING THIS** | |
''' | |
import pandas as pd | |
import numpy as np | |
import re | |
def lof(it): print(len(it)) | |
def regex(s: str, nocase=True): return re.compile(s, re.IGNORECASE) if nocase else re.compile(s) | |
df = pd.read_csv('./singapore_tech_salaries_responses.csv', names=[ | |
'Timestamp', 'Type', 'Company', 'Role', | |
'Salary', # 'Monthly/Annual Salary (Specify if not SGD)', | |
'Stocks', # 'Total Stocks (Specify if not SGD)', | |
'SignBonus',#'Sign On (Specify if not SGD)', | |
'Comp', # 'Annual Total Compensation (Specify if not SGD)', | |
'TargetBonus', #'Target Annual Bonus', | |
'ExtraInfo',#'Extra Information' | |
], skiprows=[0]) # pyright: ignore | |
assert isinstance(df, pd.DataFrame) # pyright | |
df['Salary'] = pd.to_numeric(df['Salary'], errors='coerce') | |
prev_len = len(df) | |
df = df.dropna(subset=['Salary']) | |
assert isinstance(df, pd.DataFrame) # pyright | |
print(f'dropped {prev_len-len(df)} rows!') | |
print(df) | |
print(df.columns) | |
internships = df[df.Type == 'Internship'].groupby('Company').filter(lambda x: len(x) > 2) | |
ROLES = { | |
"ML": [regex("data"), regex('AI', False), regex("ML", False), regex("machine learning"), regex('vision')], | |
"InfoSec": [regex('cyber'), regex('security'), regex('infosec')], | |
"SWE": [regex('software'), regex('developer'), regex('backend'), regex('programming'), regex('programmer')], | |
"infra": [regex("devops"), regex('cloud')], | |
"dontcare": [regex('trader'), regex('manage'), regex('techops'), regex('design'), regex('business'), regex('consulting'), regex('supply')], | |
"unknown": [regex('')] | |
} | |
role_agg = {k:set() for k in ROLES} | |
for role in internships.Role.unique(): | |
for typ,ls in ROLES.items(): | |
if any(r.search(role) for r in ls): | |
role_agg[typ].add(role) | |
break | |
print("Categorised job roles: ") | |
for rtyp,ls in role_agg.items(): | |
print('-----', rtyp, '-----') | |
for role in ls: | |
print('\t', role) | |
def rolecount(group): | |
d = {'Total': len(group)} | |
for role,rset in role_agg.items(): | |
d[role] = len(group.Role[group.Role.str.contains('|'.join(rset))]) | |
return pd.Series(d) | |
print( | |
internships.groupby('Company')["Salary"].agg(["count", np.mean, np.std, np.var]) | |
) | |
print( | |
internships.groupby('Company').apply(rolecount) | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment