Last active
May 12, 2022 13:34
-
-
Save ShivnarenSrinivasan/4dbcd3ff3bc35072efe03d4833c4ab7c to your computer and use it in GitHub Desktop.
w7
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import string | |
import re | |
from typing import ( | |
Tuple, | |
Collection, | |
) | |
import numpy as np | |
import pandas as pd | |
import matplotlib as mpl | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
from matplotlib import ( | |
figure, | |
axes, | |
) | |
from sklearn import ( | |
base, | |
) | |
from sklearn.model_selection import train_test_split | |
from sklearn.pipeline import make_pipeline, Pipeline | |
from sklearn.feature_extraction.text import ( | |
TfidfVectorizer | |
) | |
from nltk import corpus | |
import data_analysis as da | |
# Dataset column names: TARGET is the category column used for
# stratification, grouping and as the label; FEATURES lists the text
# column(s) intended as model input.
TARGET = 'Category'
FEATURES = ['Resume']
def _split_data(
    file: str,
    *,
    train_file: str = 'train.csv',
    test_file: str = 'test.csv',
    test_size: float = 0.1,
    random_state: int = 0,
) -> None:
    """Split the raw dataset into stratified train and test CSV files.

    Args:
        file: Path of the source CSV; it must contain the ``TARGET`` column.
        train_file: Destination path for the training partition.
        test_file: Destination path for the held-out test partition.
        test_size: Share of the rows written to ``test_file``.
        random_state: Seed handed to ``train_test_split`` so the split is
            reproducible.
    """
    source = pd.read_csv(file)
    # Stratify on the target so both partitions keep the category balance.
    train, test = train_test_split(
        source,
        test_size=test_size,
        random_state=random_state,
        stratify=source[TARGET],
    )
    train.to_csv(train_file, index=False)
    test.to_csv(test_file, index=False)
def load_train_dev(file: str = 'train.csv') -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Read *file* and split it into training and development frames.

    The split keeps 20% of the rows for the development set, uses a fixed
    seed of 0 and is not stratified.

    Returns:
        The result of ``train_test_split``: the training frame followed by
        the development frame.
    """
    full_frame = pd.read_csv(file)
    return train_test_split(full_frame, test_size=0.2, random_state=0)
def load_test(file: str = 'test.csv') -> pd.DataFrame:
    """Load the test set stored as CSV at *file* into a DataFrame."""
    test_frame = pd.read_csv(file)
    return test_frame
def drop_dupes(df: pd.DataFrame) -> pd.DataFrame:
    """Return the rows of *df* that do not repeat an earlier row.

    The first occurrence of each row is kept and the original index labels
    are preserved.
    """
    is_repeat = df.duplicated()
    return df.loc[~is_repeat]
# Plotting
def plot_category_counts(df: pd.DataFrame) -> Tuple[figure.Figure, axes.Axes]:
    """Draw a bar chart of the number of resumes in each category.

    Bars follow the order of ``value_counts()`` on the target column. A
    horizontal line marks the median of the per-category counts and is
    annotated with its value.

    Args:
        df: Frame containing the ``TARGET`` and ``'Resume'`` columns.

    Returns:
        The figure and the axes the chart was drawn on.
    """
    label_style = dict(fontsize=15, fontweight='bold')
    heading_style = dict(fontsize=25, fontweight='bold')

    ordered_categories = df[TARGET].value_counts().index

    fig, ax = plt.subplots(figsize=(25, 5))
    sns.countplot(
        x=TARGET,
        data=df,
        order=ordered_categories,
        palette='flare',
        ax=ax,
    )
    # Tilt the category names so they do not overlap.
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45)

    ax.set_ylabel('Count', **label_style)
    ax.xaxis.get_label().set(**label_style)
    ax.set_title('Counts of Resume Categories', **heading_style)

    # Median over categories of the non-null 'Resume' count per category.
    per_category = df.groupby(TARGET).count()
    median = per_category.median()['Resume']
    ax.axhline(median, linewidth=2)

    # Anchor the arrow at the horizontal centre of the plot, on the line.
    x_pos = _mean(ax.get_xlim())
    # NOTE(review): `//` floors the quotient; confirm floor division (not
    # plain `/`) is really intended for the text offset.
    text_xy = (x_pos // 0.95, median // 0.8)
    ax.annotate(
        f'median = {median}',
        (x_pos, median),
        xytext=text_xy,
        arrowprops={'arrowstyle': '->'},
    )
    return fig, ax
def plot_pie(df: pd.DataFrame, figsize=(10, 10), **kwargs):
    """Draw a pie chart of the share of each resume category.

    Args:
        df: Frame containing the ``TARGET`` column.
        figsize: Figure size passed to ``plt.subplots``.
        **kwargs: Extra keyword arguments forwarded to ``Axes.pie``. They
            take precedence over the defaults chosen here (for example
            ``wedgeprops`` or ``pctdistance``).

    Returns:
        The figure and the axes the chart was drawn on.
    """
    counts = df[TARGET].value_counts()
    # Take the labels from the counts' own index so every wedge is named
    # after the category it measures; `unique()` follows order of
    # appearance, which need not match the count-sorted wedge order.
    labels = counts.index.tolist()

    # NOTE(review): tab20b is a 20-colour map; with more categories some
    # wedges may share a colour - confirm this is acceptable.
    pie_options = {
        'labels': labels,
        'colors': mpl.cm.tab20b(range(len(labels))),
        'explode': [0.1] * len(labels),
        'startangle': -40,
        'autopct': '%.1f%%',
    }
    # Caller-supplied options win over the defaults above.
    pie_options.update(kwargs)

    fig, ax = plt.subplots(figsize=figsize)
    ax.pie(counts, **pie_options)
    return fig, ax
def plot_donut(df: pd.DataFrame):
    """Request a ring-shaped variant of ``plot_pie`` for *df*.

    Calls ``plot_pie`` with a wedge width of 0.2 and the percentage labels
    placed at 0.9 of the radius, and returns whatever ``plot_pie`` returns.
    """
    ring_options = {
        'wedgeprops': dict(width=0.2),
        'pctdistance': 0.9,
    }
    return plot_pie(df, **ring_options)
# Parsing and text cleaning
# Table for str.translate: each ASCII punctuation character becomes a
# space (code point 32).
_trans = dict.fromkeys(map(ord, string.punctuation), 32)
# English stop words from the NLTK corpus, frozen for membership tests.
STOPWORDS = frozenset(corpus.stopwords.words('english'))
# Matches http(s):// links and bare "www." links up to the next whitespace.
_URL_RE = re.compile(r'https?://\S+|www\.\S+')


def clean_resume(txt: str) -> str:
    """Normalise raw resume text into space-separated lowercase tokens.

    The text is processed in this order: URLs are removed, the text is
    lower-cased, punctuation is replaced by spaces, non-ASCII characters
    are dropped, and finally stop words are filtered out.

    Args:
        txt: Raw resume text.

    Returns:
        The remaining words joined by single spaces.
    """
    without_urls = _URL_RE.sub('', txt)
    ascii_text = (
        without_urls.lower()
        .translate(_trans)
        .encode('ascii', 'ignore')
        .decode()
    )
    # split() already discards all whitespace, so only the stop-word
    # filter is needed here.
    return ' '.join(word for word in ascii_text.split() if word not in STOPWORDS)
def transform(df: pd.DataFrame) -> Tuple[pd.Series, pd.Series]:
    """Deduplicate *df*, clean its resume text and split features from labels.

    Args:
        df: Frame containing the ``'Resume'`` and ``TARGET`` columns.

    Returns:
        The cleaned ``'Resume'`` series and the matching ``TARGET`` series.
    """
    deduped = drop_dupes(df)
    # Clean only the rows that survived deduplication; the cleaned series
    # then shares the frame's index, so no cross-frame alignment is needed.
    cleaned = deduped.assign(Resume=deduped['Resume'].apply(clean_resume))
    return cleaned['Resume'], cleaned[TARGET]
# Utilities
def _mean(vals: Collection[float]) -> float: | |
return sum(vals) / len(vals) | |
# Model
def make_model(model: base.BaseEstimator) -> Pipeline:
    """Build a pipeline that TF-IDF-vectorises its input and then applies *model*.

    Args:
        model: Estimator placed as the final pipeline step.

    Returns:
        The pipeline created by ``make_pipeline``.
    """
    vectorizer = TfidfVectorizer()
    steps = (vectorizer, model)
    return make_pipeline(*steps)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment