Skip to content

Instantly share code, notes, and snippets.

View mmahbub's full-sized avatar

Maria Mahbub mmahbub

View GitHub Profile
df_test = pd.DataFrame(test_ds, columns=['text', 'author'])

# Distinct author labels taken from df['author'], in sorted order.
auth = sorted(set(df['author']))

# Lookup of possible signatures: each key is a labeled author and each value
# is the list of signature strings that may stand for that author.
# NOTE(review): signatures are attached by position in the sorted label list,
# so this assumes df holds at least five authors whose sorted order lines up
# with the names below -- confirm against the dataset.
auth_dict = {
    auth[0]: ['ben', 'benjamin', 'rogers', 'benjamin rogers', 'ben rogers', 'br'],
    auth[1]: ['chris', 'dorland', 'chris dorland', 'cd'],
    auth[2]: ['drew', 'fossum', 'drew fossum', 'df'],
    auth[3]: ['jeffrey', 'shankman', 'jeffrey shankman', 'js'],
    auth[4]: ['kevin', 'presto', 'kevin presto', 'kp'],
}
# Opening statements of plot_confusion_matrix: derive the overall accuracy and
# misclassification rate from `cm` and pick a default colormap.
# NOTE(review): only the start of the body is visible in this excerpt;
# `target_names`, `title` and `normalize` are not used in the lines shown.
# NOTE(review): the body's indentation has been lost in this copy and must be
# restored before the code will run.
# NOTE(review): presumably `cm` is a square 2-D array of counts -- confirm.
def plot_confusion_matrix(cm,
target_names,
title,
cmap=None,
normalize=True):
# Accuracy = sum of the main diagonal (np.trace) over the sum of all entries.
accuracy = np.trace(cm) / float(np.sum(cm))
# Misclassification rate is the complement of accuracy.
misclass = 1 - accuracy
# Fall back to matplotlib's 'Blues' colormap when the caller supplied none.
if cmap is None:
cmap = plt.get_cmap('Blues')
# Opening statements of tsnescatterplot: seed the vector array, label list and
# color list with the query word.
# NOTE(review): only the start of the body is visible in this excerpt;
# `list_names` is not used in the lines shown.
# NOTE(review): the body's indentation has been lost in this copy and must be
# restored before the code will run.
def tsnescatterplot(model, word, list_names, size):
""" Plot in seaborn the results from the t-SNE dimensionality reduction algorithm of the vectors of a query word,
its list of most similar words, and a list of words.
"""
# Start from an empty array with zero rows and `size` columns (dtype 'f') so
# that vectors can be appended to it one row at a time.
arrays = np.empty((0, size), dtype='f')
# The query word is the first label and is colored red.
# NOTE(review): presumably these two lists are kept in step with the rows of
# `arrays` further down -- confirm.
word_labels = [word]
color_list = ['red']
# adds the vector of the query word
# `size` has to match the length of the vectors returned by model.wv, since
# np.append along axis 0 requires matching column counts.
# NOTE(review): looks like a gensim-style model whose `wv` lookup returns a
# 2-D array for a list of words -- confirm.
arrays = np.append(arrays, model.wv.__getitem__([word]), axis=0)
# pip install PyPDF2 -> read and parse the PDF content
# pip install requests -> fetch the PDF
# pip install beautifulsoup4 -> parse the HTML (imported as bs4)
from PyPDF2 import PdfFileReader
import requests
import io
from bs4 import BeautifulSoup
def getPdfMeta(pdfLink):
ADJ: adjective, e.g. big, old, green, incomprehensible, first
ADP: adposition, e.g. in, to, during
ADV: adverb, e.g. very, tomorrow, down, where, there
AUX: auxiliary, e.g. is, has (done), will (do), should (do)
CONJ: conjunction, e.g. and, or, but
CCONJ: coordinating conjunction, e.g. and, or, but
DET: determiner, e.g. a, an, the
INTJ: interjection, e.g. psst, ouch, bravo, hello
NOUN: noun, e.g. girl, cat, tree, air, beauty
NUM: numeral, e.g. 1, 2017, one, seventy-seven, IV, MMXIV