This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build a DataFrame from test_ds with the columns 'text' and 'author'. | |
df_test = pd.DataFrame(test_ds, columns=['text', 'author']) | |
# Sorted list of the unique author labels; note this reads `df` (defined outside this excerpt), not `df_test`. | |
auth = sorted(set(df['author'])) | |
# Map each author label (key) to a list of possible lowercase signatures (value): name variants and initials. | |
# NOTE(review): the positional indices below assume the sorted labels line up with these people in this order; confirm against the data. Fewer than five labels raises IndexError. | |
auth_dict = {} | |
auth_dict[auth[0]] = ['ben', 'benjamin', 'rogers', 'benjamin rogers','ben rogers','br'] | |
auth_dict[auth[1]] = ['chris', 'dorland','chris dorland','cd'] | |
auth_dict[auth[2]] = ['drew','fossum','drew fossum','df'] | |
auth_dict[auth[3]] = ['jeffrey','shankman','jeffrey shankman','js'] | |
auth_dict[auth[4]] = ['kevin','presto','kevin presto','kp'] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def plot_confusion_matrix(cm, | |
target_names, | |
title, | |
cmap=None, | |
normalize=True): | |
accuracy = np.trace(cm) / float(np.sum(cm)) | |
misclass = 1 - accuracy | |
if cmap is None: | |
cmap = plt.get_cmap('Blues') |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def tsnescatterplot(model, word, list_names, size): | |
""" Plot in seaborn the results from the t-SNE dimensionality reduction algorithm of the vectors of a query word, | |
its list of most similar words, and a list of words. | |
""" | |
arrays = np.empty((0, size), dtype='f') | |
word_labels = [word] | |
color_list = ['red'] | |
# adds the vector of the query word | |
arrays = np.append(arrays, model.wv.__getitem__([word]), axis=0) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# pip install PyPDF2 -> read and parse the PDF content | |
# pip install requests -> download the PDF over HTTP | |
# pip install beautifulsoup4 -> parse the HTML (imported as bs4) | |
from PyPDF2 import PdfFileReader | |
import requests | |
import io | |
from bs4 import BeautifulSoup | |
def getPdfMeta(pdfLink): |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
ADJ: adjective, e.g. big, old, green, incomprehensible, first | |
ADP: adposition, e.g. in, to, during | |
ADV: adverb, e.g. very, tomorrow, down, where, there | |
AUX: auxiliary, e.g. is, has (done), will (do), should (do) | |
CONJ: conjunction, e.g. and, or, but | |
CCONJ: coordinating conjunction, e.g. and, or, but | |
DET: determiner, e.g. a, an, the | |
INTJ: interjection, e.g. psst, ouch, bravo, hello | |
NOUN: noun, e.g. girl, cat, tree, air, beauty | |
NUM: numeral, e.g. 1, 2017, one, seventy-seven, IV, MMXIV |