''' NOTE: 0. install pandas and numpy to run this 1. Download the data at https://docs.google.com/spreadsheets/d/1QtC8efWw0mVkGXW4QA9bX4f0nJhGbmFqCfVLSumMZ0I/edit **AS A CSV FILE** 2. **REMOVE THE FIRST LINE FROM THE CSV FILE BEFORE RUNNING THIS** ''' import pandas as pd import numpy as np import re def lof(it): print(len(it)) def regex(s: str, nocase=True): return re.compile(s, re.IGNORECASE) if nocase else re.compile(s) df = pd.read_csv('./singapore_tech_salaries_responses.csv', names=[ 'Timestamp', 'Type', 'Company', 'Role', 'Salary', # 'Monthly/Annual Salary (Specify if not SGD)', 'Stocks', # 'Total Stocks (Specify if not SGD)', 'SignBonus',#'Sign On (Specify if not SGD)', 'Comp', # 'Annual Total Compensation (Specify if not SGD)', 'TargetBonus', #'Target Annual Bonus', 'ExtraInfo',#'Extra Information' ], skiprows=[0]) # pyright: ignore assert isinstance(df, pd.DataFrame) # pyright df['Salary'] = pd.to_numeric(df['Salary'], errors='coerce') prev_len = len(df) df = df.dropna(subset=['Salary']) assert isinstance(df, pd.DataFrame) # pyright print(f'dropped {prev_len-len(df)} rows!') print(df) print(df.columns) internships = df[df.Type == 'Internship'].groupby('Company').filter(lambda x: len(x) > 2) ROLES = { "ML": [regex("data"), regex('AI', False), regex("ML", False), regex("machine learning"), regex('vision')], "InfoSec": [regex('cyber'), regex('security'), regex('infosec')], "SWE": [regex('software'), regex('developer'), regex('backend'), regex('programming'), regex('programmer')], "infra": [regex("devops"), regex('cloud')], "dontcare": [regex('trader'), regex('manage'), regex('techops'), regex('design'), regex('business'), regex('consulting'), regex('supply')], "unknown": [regex('')] } role_agg = {k:set() for k in ROLES} for role in internships.Role.unique(): for typ,ls in ROLES.items(): if any(r.search(role) for r in ls): role_agg[typ].add(role) break print("Categorised job roles: ") for rtyp,ls in role_agg.items(): print('-----', rtyp, '-----') for role in ls: print('\t', role) def rolecount(group): d = {'Total': len(group)} for role,rset in role_agg.items(): d[role] = len(group.Role[group.Role.str.contains('|'.join(rset))]) return pd.Series(d) print( internships.groupby('Company')["Salary"].agg(["count", np.mean, np.std, np.var]) ) print( internships.groupby('Company').apply(rolecount) )