import pandas as pd
import re
from collections import Counter
import spacy
from tqdm import tqdm as tqdm
from urllib.parse import urlparse
import matplotlib.pyplot as plt
# If the en_core_web_sm model is not installed, download it first by running
# this in a shell:  python -m spacy download en_core_web_sm
# Load the scored notes and the note texts, join them on noteId, and derive
# two per-note columns: 'words' (distinct content tokens of the summary) and
# 'urls' (hosts of the links found in the summary).

# `progress_apply` only exists on pandas objects after tqdm has registered it;
# without this call every `progress_apply` below raises AttributeError.
tqdm.pandas()

scores = pd.read_csv('./scoredNotes.tsv', sep='\t')
notes = pd.read_csv('./notes-00000.tsv', sep='\t')
merged = pd.merge(scores, notes, on='noteId')
nlp = spacy.load('en_core_web_sm')

# Compiled once instead of being re-looked-up for every row.
URL_PATTERN = re.compile(r'(https?://\S+)')


def _content_words(text):
    """Return the distinct tokens of *text* that are not stop words,
    punctuation, or whitespace-containing tokens."""
    return {
        token.text
        for token in nlp(text)
        if not token.is_stop and not token.is_punct and ' ' not in token.text
    }


def _url_domain(url):
    """Return the host part of *url* without a leading 'www.'.

    Returns None when *url* cannot be parsed at all.
    """
    try:
        netloc = urlparse(url).netloc
    except ValueError:
        # urlparse rejects some malformed hosts (e.g. an unbalanced '[' is
        # reported as an invalid IPv6 URL); one bad link in free text should
        # not abort the whole run.
        return None
    # Strip only a leading 'www.' label; a plain str.replace would also cut
    # the substring out of the middle of hosts such as 'awww.example.com'.
    if netloc.startswith('www.'):
        return netloc[4:]
    return netloc


def _domains(urls):
    """Map a list of URLs to their hosts, dropping unparseable entries."""
    return [domain for domain in map(_url_domain, urls) if domain is not None]


merged['words'] = merged['summary'].progress_apply(_content_words)
merged['urls'] = merged['summary'].progress_apply(URL_PATTERN.findall)
merged['urls'] = merged['urls'].progress_apply(_domains)
# For each token kind ('words', 'urls') draw one horizontal bar chart per
# rating status showing that status's ten most frequent tokens, and save the
# figure as "<kind>.png".
status = merged['ratingStatus'].unique()
colors = ['cornflowerblue', 'slategrey', 'lightsteelblue']
for kw in ['words', 'urls']:
    # One panel per observed status rather than a hard-coded three, so a
    # different number of statuses neither overruns the axes array nor leaves
    # blank panels. Three statuses still produce the original 12x4 figure.
    n_panels = max(len(status), 1)
    f, ax = plt.subplots(1, n_panels, figsize=(4 * n_panels, 4),
                         facecolor='white', squeeze=False)
    ax = ax[0]  # squeeze=False always yields a 2-D grid; keep its only row
    for idx, rating_status in enumerate(status):
        sub = merged[merged['ratingStatus'] == rating_status]
        # Tally token occurrences across every note with this status.
        token_counts = Counter()
        for tokens in sub[kw]:
            token_counts.update(tokens)
        top = token_counts.most_common(10)
        if not top:
            # No tokens for this status (e.g. an empty subset): unpacking
            # zip(*[]) would raise, so leave the panel blank instead.
            continue
        y, x = zip(*top)
        # barh draws the first entry at the bottom; reverse so the most
        # frequent token ends up at the top of the panel. Colours wrap around
        # if there are more statuses than colours.
        bars = ax[idx].barh(y[::-1], x[::-1], color=colors[idx % len(colors)])
        for loc in ['top', 'right', 'bottom', 'left']:
            # NOTE(review): the body of this loop is missing from the source
            # as received; hiding the four axis spines is the presumed
            # intent given the loop values - confirm against the original.
            ax[idx].spines[loc].set_visible(False)
    # Save once per figure, after all panels have been drawn.
    f.savefig(f"{kw}.png", dpi=300)
