''' NOTE:
0. install pandas and numpy to run this
1. Download the data at https://docs.google.com/spreadsheets/d/1QtC8efWw0mVkGXW4QA9bX4f0nJhGbmFqCfVLSumMZ0I/edit **AS A CSV FILE**
2. **REMOVE THE FIRST LINE FROM THE CSV FILE BEFORE RUNNING THIS**
'''


import pandas as pd
import numpy as np
import re
def lof(it): print(len(it))
def regex(s: str, nocase=True): return re.compile(s, re.IGNORECASE) if nocase else re.compile(s)
df = pd.read_csv('./singapore_tech_salaries_responses.csv', names=[
    'Timestamp', 'Type', 'Company', 'Role',
    'Salary',   # 'Monthly/Annual Salary (Specify if not SGD)',
    'Stocks',   # 'Total Stocks (Specify if not SGD)',
    'SignBonus',#'Sign On (Specify if not SGD)',
    'Comp',     # 'Annual Total Compensation (Specify if not SGD)',
    'TargetBonus', #'Target Annual Bonus',
    'ExtraInfo',#'Extra Information'
], skiprows=[0]) # pyright: ignore
assert isinstance(df, pd.DataFrame) # pyright
df['Salary'] = pd.to_numeric(df['Salary'], errors='coerce')
prev_len = len(df)
df = df.dropna(subset=['Salary'])
assert isinstance(df, pd.DataFrame) # pyright
print(f'dropped {prev_len-len(df)} rows!')
print(df)
print(df.columns)
internships = df[df.Type == 'Internship'].groupby('Company').filter(lambda x: len(x) > 2)

ROLES = {
    "ML": [regex("data"), regex('AI', False), regex("ML", False), regex("machine learning"), regex('vision')],
    "InfoSec": [regex('cyber'), regex('security'), regex('infosec')],
    "SWE": [regex('software'), regex('developer'), regex('backend'), regex('programming'), regex('programmer')],
    "infra": [regex("devops"), regex('cloud')],
    "dontcare": [regex('trader'), regex('manage'), regex('techops'), regex('design'), regex('business'), regex('consulting'), regex('supply')],
    "unknown": [regex('')]
}

role_agg = {k:set() for k in ROLES}

for role in internships.Role.unique():
    for typ,ls in ROLES.items():
        if any(r.search(role) for r in ls):
            role_agg[typ].add(role)
            break
print("Categorised job roles: ")
for rtyp,ls in role_agg.items():
    print('-----', rtyp, '-----')
    for role in ls:
        print('\t', role)

def rolecount(group):
    d = {'Total': len(group)}
    for role,rset in role_agg.items():
        d[role] = len(group.Role[group.Role.str.contains('|'.join(rset))])
    return pd.Series(d)
print(
    internships.groupby('Company')["Salary"].agg(["count", np.mean, np.std, np.var])
)
print(
    internships.groupby('Company').apply(rolecount)
)