ShivnarenSrinivasan · May 12, 2022 13:34
diff --git a/helpers.py b/helpers.py
 import string
 import re

 from typing import (
    Tuple,
    Collection,
 )

 import numpy as np
 import pandas as pd
 import matplotlib as mpl
 import matplotlib.pyplot as plt
 import seaborn as sns

 from matplotlib import (
    figure,
    axes,
 )
 from sklearn import (
    base,
 )
 from sklearn.model_selection import train_test_split
 from sklearn.pipeline import make_pipeline, Pipeline
 from sklearn.feature_extraction.text import (
    TfidfVectorizer
 )
 from nltk import corpus
 import data_analysis as da


 # Feature column name(s). Not referenced elsewhere in this view --
 # presumably the model-input columns; confirm against callers.
 FEATURES = ['Resume']
 # Name of the label column; used below for stratified splitting and plotting.
 TARGET = 'Category'


 def _split_data(file: str) -> None:
    """Split the raw CSV at ``file`` into ``train.csv`` and ``test.csv``.

    10% of the rows are held out as the test partition. The split is
    stratified on the ``TARGET`` column and seeded with ``random_state=0``.
    Both partitions are written to the working directory without the index.
    """
    full = pd.read_csv(file)
    labels = full[TARGET]
    partitions = train_test_split(
        full, test_size=0.1, random_state=0, stratify=labels
    )
    # train_test_split yields the train partition first, then the test one.
    for part, destination in zip(partitions, ('train.csv', 'test.csv')):
        part.to_csv(destination, index=False)


 def load_train_dev(file: str = 'train.csv') -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Load the training CSV and split it into train and dev partitions.

    20% of the rows go to the dev partition; the shuffle is seeded with
    ``random_state=0``. No stratification is requested here.

    Returns the two-element result of ``train_test_split`` (train, then dev).
    """
    labelled = pd.read_csv(file)
    return train_test_split(labelled, test_size=0.2, random_state=0)


 def load_test(file: str = 'test.csv') -> pd.DataFrame:
    """Read the held-out test set from ``file`` and return it as a DataFrame."""
    held_out = pd.read_csv(file)
    return held_out


 def drop_dupes(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` without fully duplicated rows.

    The first occurrence of each row is kept and the surviving rows retain
    their original index labels.
    """
    keep_mask = ~df.duplicated(keep='first')
    return df.loc[keep_mask]

 # Plotting


 def plot_category_counts(df: pd.DataFrame) -> Tuple[figure.Figure, axes.Axes]:
    """Draw a count plot of ``TARGET`` categories with the median count marked.

    Bars follow the order of ``df[TARGET].value_counts()``. A horizontal line
    is drawn at the median of the per-category ``Resume`` counts and labelled
    with an arrow annotation.

    Returns the ``(figure, axes)`` pair holding the plot.
    """
    label_font = dict(fontsize=15, fontweight='bold')
    title_font = dict(fontsize=25, fontweight='bold')

    by_frequency = df[TARGET].value_counts().index
    per_category = df.groupby(TARGET).count()
    median = per_category.median()['Resume']

    fig, ax = plt.subplots(figsize=(25, 5))
    sns.countplot(x=TARGET, data=df, order=by_frequency, palette='flare', ax=ax)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45)

    ax.set_ylabel('Count', **label_font)
    x_label = ax.xaxis.get_label()
    x_label.set(**label_font)
    ax.set_title('Counts of Resume Categories', **title_font)

    ax.axhline(median, linewidth=2)
    # Anchor the annotation at the horizontal midpoint of the drawn axes.
    x_pos = _mean(ax.get_xlim())
    # NOTE(review): `//` is floor division, so the text offset is snapped to
    # whole numbers -- confirm that plain `/` was not intended.
    text_xy = (x_pos // 0.95, median // 0.8)
    ax.annotate(
        f'median = {median}',
        (x_pos, median),
        xytext=text_xy,
        arrowprops={'arrowstyle': '->'},
    )

    return fig, ax


 def plot_pie(df: pd.DataFrame, figsize=(10, 10), **kwargs):
    """Draw a pie chart of the share of rows in each ``TARGET`` category.

    Args:
        df: Frame containing the ``TARGET`` column.
        figsize: Figure size, forwarded to ``plt.subplots``.
        **kwargs: Extra keyword arguments forwarded to ``Axes.pie``. They
            override the defaults chosen here (for example ``wedgeprops``,
            ``pctdistance`` or ``startangle``).

    Returns:
        The ``(figure, axes)`` pair holding the chart.
    """
    # Labels are taken from the index of value_counts() so that each wedge is
    # paired with its own category; unique() lists categories in order of
    # appearance, which does not match the frequency-sorted counts.
    counts = df[TARGET].value_counts()
    labels = counts.index

    pie_options = dict(
        labels=labels,
        colors=mpl.cm.tab20b(range(len(labels))),
        explode=[0.1] * len(labels),
        startangle=-40,
        autopct='%.1f%%',
    )
    # Caller-supplied options win over the defaults instead of being dropped.
    pie_options.update(kwargs)

    fig, ax = plt.subplots(figsize=figsize)
    ax.pie(counts, **pie_options)
    return fig, ax


 # YOUR CODE HERE to display pie chart with color coding (eg. `coolwarm`)


 def plot_donut(df: pd.DataFrame):
    """Call ``plot_pie`` with ring-style options and return its result.

    Passes ``wedgeprops={'width': 0.2}`` and ``pctdistance=0.9`` as keyword
    arguments.
    """
    ring_options = {'wedgeprops': {'width': 0.2}, 'pctdistance': 0.9}
    return plot_pie(df, **ring_options)


 # Parse

 # Table for str.translate: every character in string.punctuation maps to
 # code point 32 (a space).
 _trans = {ord(s): 32 for s in string.punctuation}
 # English stop-word list from nltk.corpus, frozen into a set for membership tests.
 # NOTE(review): presumably needs the NLTK 'stopwords' corpus installed at import time -- confirm.
 STOPWORDS = frozenset(corpus.stopwords.words('english'))

 def clean_resume(txt: str) -> str:
    """Normalise raw resume text into space-separated lowercase ASCII tokens.

    Steps, in order: remove URLs, lowercase, replace punctuation with spaces
    (via ``_trans``), drop non-ASCII characters, then remove ``STOPWORDS``
    and collapse runs of whitespace.

    Args:
        txt: Raw resume text.

    Returns:
        The cleaned text, tokens joined by single spaces.
    """
    # Case-insensitive: URLs are stripped before the text is lowercased, so
    # without the flag `HTTP://...` / `WWW....` links would survive as tokens.
    url = re.compile(r'https?://\S+|www\.\S+', flags=re.IGNORECASE)
    without_urls = url.sub('', txt)
    ascii_text = (
        without_urls.lower()
        .translate(_trans)
        .encode('ascii', 'ignore')
        .decode()
    )
    # str.split() never yields empty or whitespace-only tokens, so only the
    # stop-word filter is needed.
    return ' '.join(word for word in ascii_text.split() if word not in STOPWORDS)


 def transform(df: pd.DataFrame) -> Tuple[pd.Series, pd.Series]:
    """Deduplicate ``df`` and return cleaned resume text alongside its labels.

    Args:
        df: Frame with a ``Resume`` text column and the ``TARGET`` column.

    Returns:
        ``(resumes, labels)``: the cleaned ``Resume`` series and the matching
        ``TARGET`` series, both taken from the deduplicated frame.
    """
    # Clean through a callable so that only rows surviving deduplication are
    # processed and the result never has to be re-aligned against the
    # original frame's index (which fails when index labels repeat).
    trans = df.pipe(drop_dupes).assign(
        Resume=lambda deduped: deduped['Resume'].apply(clean_resume)
    )
    return trans['Resume'], trans[TARGET]

 # Utilities
 def _mean(vals: Collection[float]) -> float:
    """Return the arithmetic mean of ``vals``.

    Raises ``ZeroDivisionError`` when ``vals`` is empty.
    """
    count = len(vals)
    total = sum(vals)
    return total / count


 # Model
 def make_model(model: base.BaseEstimator) -> Pipeline:
    """Build a pipeline that runs ``TfidfVectorizer`` and then ``model``.

    The vectoriser is created with its default arguments and the steps are
    assembled by ``make_pipeline``.
    """
    steps = (TfidfVectorizer(), model)
    return make_pipeline(*steps)
	import string
	import re

	from typing import (
	Tuple,
	Collection,
	)

	import numpy as np
	import pandas as pd
	import matplotlib as mpl
	import matplotlib.pyplot as plt
	import seaborn as sns

	from matplotlib import (
	figure,
	axes,
	)
	from sklearn import (
	base,
	)
	from sklearn.model_selection import train_test_split
	from sklearn.pipeline import make_pipeline, Pipeline
	from sklearn.feature_extraction.text import (
	TfidfVectorizer
	)
	from nltk import corpus
	import data_analysis as da


	# Feature column name(s). Not referenced elsewhere in this view --
	# presumably the model-input columns; confirm against callers.
	FEATURES = ['Resume']
	# Name of the label column; used below for stratified splitting and plotting.
	TARGET = 'Category'


	def _split_data(file: str) -> None:
	    """Split the raw CSV at ``file`` into ``train.csv`` and ``test.csv``.

	    10% of the rows are held out as the test partition. The split is
	    stratified on the ``TARGET`` column and seeded with ``random_state=0``.
	    Both partitions are written to the working directory without the index.
	    """
	    source = pd.read_csv(file)
	    train, test = train_test_split(
	        source, test_size=0.1, random_state=0, stratify=source[TARGET]
	    )
	    train.to_csv('train.csv', index=False)
	    test.to_csv('test.csv', index=False)


	def load_train_dev(file: str = 'train.csv') -> Tuple[pd.DataFrame, pd.DataFrame]:
	    """Load the training CSV and split it into train and dev partitions.

	    20% of the rows go to the dev partition; the shuffle is seeded with
	    ``random_state=0``. No stratification is requested here.

	    Returns the two-element result of ``train_test_split`` (train, then dev).
	    """
	    return train_test_split(pd.read_csv(file), test_size=0.2, random_state=0)


	def load_test(file: str = 'test.csv') -> pd.DataFrame:
	    """Read the held-out test set from ``file`` and return it as a DataFrame."""
	    return pd.read_csv(file)


	def drop_dupes(df: pd.DataFrame) -> pd.DataFrame:
	    """Return ``df`` without fully duplicated rows.

	    The first occurrence of each row is kept and the surviving rows retain
	    their original index labels.
	    """
	    return df.loc[~df.duplicated()]

	# Plotting


	def plot_category_counts(df: pd.DataFrame) -> Tuple[figure.Figure, axes.Axes]:
	    """Draw a count plot of ``TARGET`` categories with the median count marked.

	    Bars follow the order of ``df[TARGET].value_counts()``. A horizontal line
	    is drawn at the median of the per-category ``Resume`` counts and labelled
	    with an arrow annotation.

	    Returns the ``(figure, axes)`` pair holding the plot.
	    """
	    category_order = df[TARGET].value_counts().index

	    fig, ax = plt.subplots(figsize=(25, 5))
	    sns.countplot(x=TARGET, data=df, order=category_order, palette='flare', ax=ax)
	    ax.set_xticklabels(ax.get_xticklabels(), rotation=45)

	    axes_text = {'fontsize': 15, 'fontweight': 'bold'}
	    ax.set_ylabel('Count', **axes_text)
	    ax.xaxis.get_label().set(**axes_text)

	    title_text = {'fontsize': 25, 'fontweight': 'bold'}
	    ax.set_title('Counts of Resume Categories', **title_text)

	    median = df.groupby(TARGET).count().median()['Resume']
	    ax.axhline(median, linewidth=2)
	    # Anchor the annotation at the horizontal midpoint of the drawn axes.
	    x_pos = _mean(ax.get_xlim())
	    # NOTE(review): `//` is floor division, so the text offset is snapped to
	    # whole numbers -- confirm that plain `/` was not intended.
	    ax.annotate(
	        f'median = {median}',
	        (x_pos, median),
	        xytext=(x_pos // 0.95, median // 0.8),
	        arrowprops={'arrowstyle': '->'},
	    )

	    return fig, ax


	def plot_pie(df: pd.DataFrame, figsize=(10, 10), **kwargs):
	    """Draw a pie chart of the share of rows in each ``TARGET`` category.

	    Args:
	        df: Frame containing the ``TARGET`` column.
	        figsize: Figure size, forwarded to ``plt.subplots``.
	        **kwargs: Extra keyword arguments forwarded to ``Axes.pie``. They
	            override the defaults chosen here (for example ``wedgeprops``,
	            ``pctdistance`` or ``startangle``).

	    Returns:
	        The ``(figure, axes)`` pair holding the chart.
	    """
	    # Labels are taken from the index of value_counts() so that each wedge is
	    # paired with its own category; unique() lists categories in order of
	    # appearance, which does not match the frequency-sorted counts.
	    counts = df[TARGET].value_counts()
	    labels = counts.index

	    pie_options = dict(
	        labels=labels,
	        colors=mpl.cm.tab20b(range(len(labels))),
	        explode=[0.1] * len(labels),
	        startangle=-40,
	        autopct='%.1f%%',
	    )
	    # Caller-supplied options win over the defaults instead of being dropped.
	    pie_options.update(kwargs)

	    fig, ax = plt.subplots(figsize=figsize)
	    ax.pie(counts, **pie_options)
	    return fig, ax


	# YOUR CODE HERE to display pie chart with color coding (eg. `coolwarm`)


	def plot_donut(df: pd.DataFrame):
	    """Call ``plot_pie`` with ring-style options and return its result.

	    Passes ``wedgeprops={'width': 0.2}`` and ``pctdistance=0.9`` as keyword
	    arguments.
	    """
	    return plot_pie(df, wedgeprops=dict(width=0.2), pctdistance=0.9)


	# Parse

	# Table for str.translate: every character in string.punctuation maps to
	# code point 32 (a space).
	_trans = {ord(s): 32 for s in string.punctuation}
	# English stop-word list from nltk.corpus, frozen into a set for membership tests.
	# NOTE(review): presumably needs the NLTK 'stopwords' corpus installed at import time -- confirm.
	STOPWORDS = frozenset(corpus.stopwords.words('english'))

	def clean_resume(txt: str) -> str:
	    """Normalise raw resume text into space-separated lowercase ASCII tokens.

	    Steps, in order: remove URLs, lowercase, replace punctuation with spaces
	    (via ``_trans``), drop non-ASCII characters, then remove ``STOPWORDS``
	    and collapse runs of whitespace.

	    Args:
	        txt: Raw resume text.

	    Returns:
	        The cleaned text, tokens joined by single spaces.
	    """
	    # The alternation must be a bare `|`: an escaped `\|` matches a literal
	    # pipe character, so no ordinary URL would ever be removed.
	    # Case-insensitive because stripping happens before lowercasing.
	    url = re.compile(r'https?://\S+|www\.\S+', flags=re.IGNORECASE)
	    clean = url.sub('', txt)
	    ascii_text = clean.lower().translate(_trans).encode('ascii', 'ignore').decode()
	    # str.split() never yields empty or whitespace-only tokens, so only the
	    # stop-word filter is needed.
	    return ' '.join(word for word in ascii_text.split() if word not in STOPWORDS)


	def transform(df: pd.DataFrame) -> Tuple[pd.Series, pd.Series]:
	    """Deduplicate ``df`` and return cleaned resume text alongside its labels.

	    Args:
	        df: Frame with a ``Resume`` text column and the ``TARGET`` column.

	    Returns:
	        ``(resumes, labels)``: the cleaned ``Resume`` series and the matching
	        ``TARGET`` series, both taken from the deduplicated frame.
	    """
	    # Clean through a callable so that only rows surviving deduplication are
	    # processed and the result never has to be re-aligned against the
	    # original frame's index (which fails when index labels repeat).
	    trans = df.pipe(drop_dupes).assign(
	        Resume=lambda deduped: deduped['Resume'].apply(clean_resume)
	    )
	    return trans['Resume'], trans[TARGET]

	# Utilities
	def _mean(vals: Collection[float]) -> float:
	    """Return the arithmetic mean of ``vals``.

	    Raises ``ZeroDivisionError`` when ``vals`` is empty.
	    """
	    return sum(vals) / len(vals)


	# Model
	def make_model(model: base.BaseEstimator) -> Pipeline:
	    """Build a pipeline that runs ``TfidfVectorizer`` and then ``model``.

	    The vectoriser is created with its default arguments and the steps are
	    assembled by ``make_pipeline``.
	    """
	    return make_pipeline(
	        TfidfVectorizer(),
	        model,
	    )