Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import spacy | |
en_core_web_lg = spacy.load("en_core_web_lg") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name = "Simon" | |
doc=en_core_web_trf(f"My name is {name}") | |
print(f"Name = {name}. Detected entities: {doc.ents}") | |
name = "Katy" | |
doc=en_core_web_trf(f"My name is {name}") | |
print(f"Name = {name}. Detected entities: {doc.ents}") | |
name = "Moses" | |
doc=en_core_web_trf(f"This is what God said to {name}") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pprint | |
import spacy | |
nlp = spacy.load("en_core_web_trf") | |
detailed_results = {} | |
print("Model name: en_core_web_trf") | |
for template, name_set in itertools.product(templates, name_sets.items()): | |
print(f"Name set: {name_set[0]}, Template: \"{template}\"") | |
results = names_recall(nlp, name_set[1], template) | |
detailed_results[template, name_set[0]] = results |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pprint | |
detailed_results = {} | |
nlp = en_core_web_lg | |
print("Model name: en_core_web_lg") | |
for template, name_set in itertools.product(templates, name_sets.items()): | |
print(f"Name set: {name_set[0]}, Template: \"{template}\"") | |
results = names_recall(nlp, name_set[1], template) | |
detailed_results[template, name_set[0]] = results |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def names_recall(nlp: spacy.lang.en.English, names: List[str], template: str): | |
""" | |
Run the spaCy NLP model on the template + name, | |
calculate recall for detecting the "PERSON" entity | |
and return a detailed list of detection | |
:param nlp: spaCy nlp model | |
:param names: list of names to run model on | |
:param template: sentence with placeholder for name (e.g. "He calls himself {}") | |
""" | |
results = {} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
biblical_names = ["David", "Moses", "Abraham", "Samuel", "Jacob", | |
"Isaac", "Jesus", "Matthew", | |
"John", "Judas","Simon", "Mary"] # Random biblical names | |
other_names = ["Beyonce", "Ariana", "Katy", # Singers | |
"Lebron", "Coby", # NBA players | |
"William", "Charles","Robert", "Margaret","Frank", "Helen", # Popular (non biblical) names in 1900 (https://www.ssa.gov/oact/babynames/decades/names1900s.html) | |
"Ronald", "George", "Bill", "Barack", "Donald", "Joe" # Presidents | |
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from time import time | |
class TimeTook(object): | |
""" | |
Calculates the time a block took to run. | |
Example usage: | |
with TimeTook("sample"): | |
s = [x for x in range(10000000)] | |
Modified from: https://blog.usejournal.com/how-to-create-your-own-timing-context-manager-in-python-a0e944b48cf8 | |
""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
from typing import List | |
import spacy | |
from spacy.tokens import Doc | |
from tqdm import tqdm | |
class SpacyPreprocessor: | |
def __init__( |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Databricks notebook source | |
# install sparklyr (we need this every time we start our cluster as it has to install packages on all workers) | |
install.packages("sparklyr") | |
#titanic data | |
install.packages('titanic') | |
library(titanic) | |
# Load sparklyr package. | |
library(sparklyr) |
NewerOlder