Skip to content

Instantly share code, notes, and snippets.

@ShivnarenSrinivasan
Last active May 12, 2022 13:34
Show Gist options
  • Save ShivnarenSrinivasan/4dbcd3ff3bc35072efe03d4833c4ab7c to your computer and use it in GitHub Desktop.
Save ShivnarenSrinivasan/4dbcd3ff3bc35072efe03d4833c4ab7c to your computer and use it in GitHub Desktop.
w7
import string
import re
from typing import (
Tuple,
Collection,
)
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import (
figure,
axes,
)
from sklearn import (
base,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.feature_extraction.text import (
TfidfVectorizer
)
from nltk import corpus
import data_analysis as da
# Name of the label column; used for stratification, grouping and as the
# second value returned by ``transform``.
TARGET = "Category"
# Names of the input column(s) carrying the raw resume text.
FEATURES = ["Resume"]
def _split_data(
    file: str,
    train_path: str = 'train.csv',
    test_path: str = 'test.csv',
    test_size: float = 0.1,
    random_state: int = 0,
) -> None:
    """Split the raw CSV into train and test CSV files, stratified on ``TARGET``.

    Args:
        file: Path of the source CSV; it must contain the ``TARGET`` column.
        train_path: Where the training portion is written.
        test_path: Where the test portion is written.
        test_size: ``test_size`` value handed to ``train_test_split``.
        random_state: Seed handed to ``train_test_split`` so the split is
            repeatable.

    The defaults reproduce the original hard-coded behaviour: a 0.1 test
    split with seed 0, written to ``train.csv`` and ``test.csv``.
    """
    source = pd.read_csv(file)
    train, test = train_test_split(
        source,
        test_size=test_size,
        random_state=random_state,
        # Stratify on the label column so both files see the same class mix.
        stratify=source[TARGET],
    )
    # index=False: the row index is not written, so it does not come back as
    # an extra column when the files are re-read.
    train.to_csv(train_path, index=False)
    test.to_csv(test_path, index=False)
def load_train_dev(file: str = 'train.csv') -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Read ``file`` and split its rows into a train part and a dev part.

    The split uses ``test_size=0.2`` and a fixed ``random_state=0``, so the
    same file always produces the same partition.

    NOTE(review): no ``stratify`` argument is passed here, unlike in
    ``_split_data`` -- confirm that is intentional.
    """
    full_train = pd.read_csv(file)
    train_dev = train_test_split(full_train, test_size=0.2, random_state=0)
    return train_dev
def load_test(file: str = 'test.csv') -> pd.DataFrame:
    """Read the test CSV at ``file`` and return it as a DataFrame."""
    test_frame = pd.read_csv(file)
    return test_frame
def drop_dupes(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` without rows that repeat an earlier row.

    The first occurrence of each row is kept and the original index labels
    are preserved.
    """
    is_repeat = df.duplicated()
    keep_mask = ~is_repeat
    return df.loc[keep_mask]
# Plotting
def plot_category_counts(df: pd.DataFrame) -> Tuple[figure.Figure, axes.Axes]:
    """Draw a bar chart of the number of rows per ``TARGET`` category.

    Bars follow the order of ``df[TARGET].value_counts()``. A horizontal line
    and an annotation mark the median of the per-category ``'Resume'`` counts.

    Returns:
        The ``(figure, axes)`` pair the chart was drawn on.
    """
    label_font = dict(fontsize=15, fontweight='bold')
    heading_font = dict(fontsize=25, fontweight='bold')

    fig, ax = plt.subplots(figsize=(25, 5))
    sns.countplot(
        x=TARGET,
        data=df,
        order=df[TARGET].value_counts().index,
        palette='flare',
        ax=ax,
    )
    # Re-apply the existing tick labels, tilted so long names do not overlap.
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45)

    ax.set_ylabel('Count', **label_font)
    ax.xaxis.get_label().set(**label_font)
    ax.set_title('Counts of Resume Categories', **heading_font)

    per_category = df.groupby(TARGET).count()
    median = per_category.median()['Resume']
    ax.axhline(median, linewidth=2)

    # Anchor the arrow at the horizontal centre of the current x-range.
    x_mid = _mean(ax.get_xlim())
    ax.annotate(
        f'median = {median}',
        (x_mid, median),
        # NOTE(review): `//` is floor division, so the text offset is floored;
        # confirm this is intended rather than a plain `/`.
        xytext=(x_mid // 0.95, median // 0.8),
        arrowprops={'arrowstyle': '->'},
    )
    return fig, ax
def plot_pie(df: pd.DataFrame, figsize=(10, 10), **kwargs):
    """Draw a pie chart of the share of rows in each ``'Category'`` value.

    Args:
        df: Frame containing a ``'Category'`` column.
        figsize: Figure size handed to ``plt.subplots``.
        **kwargs: Extra keyword arguments forwarded to ``Axes.pie``. They
            override the defaults chosen here (``labels``, ``colors``,
            ``explode``, ``startangle``, ``autopct``).

    Returns:
        The ``(figure, axes)`` pair the chart was drawn on.
    """
    counts = df['Category'].value_counts()
    # Take the labels from the counts' own index so each wedge is paired with
    # its own category; ``unique()`` lists categories in order of first
    # appearance, which generally differs from the count-sorted order.
    labels = counts.index.tolist()

    palette = mpl.cm.tab20b
    # Cycle through the palette so that having more categories than colours
    # does not collapse the surplus wedges onto a single colour.
    colors = palette(np.arange(len(labels)) % palette.N)

    pie_options = {
        'labels': labels,
        'colors': colors,
        'explode': [0.1] * len(labels),
        'startangle': -40,
        'autopct': '%.1f%%',
    }
    # Caller-supplied options (e.g. ``wedgeprops`` / ``pctdistance`` from
    # ``plot_donut``) take precedence over the defaults above.
    pie_options.update(kwargs)

    fig, ax = plt.subplots(figsize=figsize)
    ax.pie(counts, **pie_options)
    return fig, ax
def plot_donut(df: pd.DataFrame):
    """Call ``plot_pie`` on ``df`` with ring-style wedge options.

    Passes ``wedgeprops={'width': 0.2}`` and ``pctdistance=0.9`` as extra
    keyword arguments and returns whatever ``plot_pie`` returns.
    """
    ring_options = {'wedgeprops': {'width': 0.2}, 'pctdistance': 0.9}
    return plot_pie(df, **ring_options)
# Parse
# Table for ``str.translate``: maps every ASCII punctuation character to a
# space (code point 32).
_trans = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
# English stop words from the NLTK corpus, frozen into a set; ``clean_resume``
# drops any token found here.
STOPWORDS = frozenset(corpus.stopwords.words('english'))
def clean_resume(txt: str):
    """Normalise raw resume text into a space-separated string of tokens.

    Steps, in order: remove URLs, lower-case, replace ASCII punctuation with
    spaces (via ``_trans``), discard non-ASCII characters, then drop every
    token found in ``STOPWORDS``.
    """
    without_urls = re.sub(r'https?://\S+|www\.\S+', '', txt)
    lowered = without_urls.lower()
    spaced = lowered.translate(_trans)
    ascii_only = spaced.encode('ascii', 'ignore').decode()
    # ``split()`` with no argument never yields whitespace-only tokens, so the
    # stop-word test is the only filter needed.
    kept = [token for token in ascii_only.split() if token not in STOPWORDS]
    return ' '.join(kept)
def transform(df: pd.DataFrame) -> Tuple[pd.Series, pd.Series]:
    """Deduplicate ``df`` and return its cleaned resumes with their labels.

    Args:
        df: Frame with a ``'Resume'`` text column and the ``TARGET`` column.

    Returns:
        ``(resumes, labels)``: the ``'Resume'`` column of the deduplicated
        frame after ``clean_resume`` has been applied to each entry, and the
        matching ``TARGET`` column. Both share the deduplicated frame's index.
    """
    deduped = drop_dupes(df)
    # Clean only the surviving rows: duplicates are not processed just to be
    # thrown away, and no index re-alignment against the original frame is
    # needed (which would fail on a non-unique index).
    resumes = deduped['Resume'].apply(clean_resume)
    return resumes, deduped[TARGET]
# Utilities
def _mean(vals: Collection[float]) -> float:
    """Return the arithmetic mean of ``vals`` (their sum divided by their count).

    An empty collection raises ``ZeroDivisionError``.
    """
    total = sum(vals)
    count = len(vals)
    return total / count
# Model
def make_model(model: base.BaseEstimator) -> Pipeline:
    """Build a pipeline that runs a ``TfidfVectorizer`` and then ``model``.

    The vectorizer is created with its default settings.
    """
    vectorizer = TfidfVectorizer()
    return make_pipeline(vectorizer, model)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment