Created
January 3, 2018 20:37
-
-
Save wpm/feb9aea744870674d353ffd55344becb to your computer and use it in GitHub Desktop.
Generate a corpus of texts mentioning birthdays that can be used to train a Prodigy named entity recognizer.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import re | |
import time | |
from random import choice, random | |
from typing import TextIO, Callable, Sequence, Tuple, Optional | |
import click | |
# Person names and formatted dates are both represented as plain strings.
NAME = str
DATE = str
# A (start, end) pair of character offsets into a text; used as text[start:end].
SPAN_OFFSET = Tuple[int, int]
def generate(name_factory: Callable[[], NAME], lifespan_factory: Callable[[], Tuple[DATE, DATE]]) \
        -> Tuple[str, Optional[SPAN_OFFSET], Optional[SPAN_OFFSET]]:
    """
    Build one random sentence about a person and locate the dates it mentions.

    A name and a (born, died) pair of date strings are drawn from the factories and substituted into a randomly
    chosen sentence template.

    :param name_factory: zero-argument callable returning a person's name
    :param lifespan_factory: zero-argument callable returning (born, died) date strings
    :return: the sentence, the (start, end) offsets of the birth date in it or None if it is not mentioned, and the
        (start, end) offsets of the death date or None if it is not mentioned
    """
    name = name_factory()
    born, died = lifespan_factory()
    templates = [
        "{name} was born on {born}.",
        "{name} has a birthday on {born}.",
        "{name} was born on {born} and died {died}.",
        "On {born} {name} was born.",
        "On {died} {name} died.",
        "{name} died on {died}.",
        "RIP {name}: {born}-{died}.",
        "A skilled carpenter, {name} lived from {born} until {died}.",
        "{died} was the day {name} died.",
        "{born} was the day {name} was born.",
        "{name} is a skilled juggler.",
        "Where are you, {name}?",
    ]
    template = choice(templates)
    values = {"name": name, "born": born, "died": died}

    def find_span(field: str) -> Optional[SPAN_OFFSET]:
        # Offsets are derived from where the placeholder sits in the template rather than by searching the
        # finished text: a text search returns the wrong position when the born and died strings are identical
        # (both spans would land on the first occurrence) or when the date string also occurs inside the name.
        cut = template.find("{" + field + "}")
        if cut < 0:
            return None  # This template does not mention the field.
        start = len(template[:cut].format(**values))
        return start, start + len(values[field])

    text = template.format(**values)
    return text, find_span("born"), find_span("died")
def name_generator(first_names: Sequence[str], last_names: Sequence[str]) -> Callable[[], NAME]:
    """
    Create a factory that produces random person names.

    Each call to the returned factory yields either "First Last" or just "First", with equal probability, picking
    the parts at random from the supplied sequences.

    :param first_names: candidate first names
    :param last_names: candidate last names
    :return: zero-argument callable returning a name
    """
    def factory() -> str:
        # The coin flip is drawn before the names so the sequence of random draws stays the same.
        with_surname = random() < 0.5
        given = choice(first_names)
        if not with_surname:
            return f"{given}"
        return f"{given} {choice(last_names)}"

    return factory
def lifespan_generator(start="1/1/1900", end="12/31/2010") -> Callable[[], Tuple[DATE, DATE]]:
    """
    Create a factory that produces random (born, died) date strings.

    The birth moment is drawn uniformly between start and end, and the death moment uniformly between the birth
    moment and end. Both dates of a pair are rendered with the same randomly chosen format, with leading zeroes
    removed from the numbers.

    :param start: earliest possible date, as month/day/year
    :param end: latest possible date, as month/day/year
    :return: zero-argument callable returning a (born, died) pair of formatted dates
    """
    # Imported here so this function does not depend on a module-level import being present.
    # datetime arithmetic is used instead of time.mktime/time.localtime, whose supported range is platform
    # dependent (dates before 1970 are rejected on some systems) and whose results vary with the local timezone.
    from datetime import datetime

    earliest = datetime.strptime(start, "%m/%d/%Y")
    latest = datetime.strptime(end, "%m/%d/%Y")
    formats = ["%m/%d/%Y", "%B %d, %Y", "%d %B %Y"]
    leading_zero = re.compile(r'\b0(\d)')

    def factory() -> Tuple[DATE, DATE]:
        born = earliest + (latest - earliest) * random()
        died = born + (latest - born) * random()
        fmt = choice(formats)

        def make_date(moment) -> DATE:
            # Remove leading zeroes from numbers, e.g. "01/05/1990" becomes "1/5/1990".
            return leading_zero.sub(r'\1', moment.strftime(fmt))

        return make_date(born), make_date(died)

    return factory
@click.command()
@click.option("--n", default=10000, help="number of samples to generate")
@click.option("--first-names", type=click.File(), help="list of first names, one per line")
@click.option("--last-names", type=click.File(), help="list of last names, one per line")
def birthday_corpus(n: int, first_names: Optional[TextIO], last_names: Optional[TextIO]):
    """
    Generate a corpus of texts describing birth and death dates for people.

    The texts refer to dates on which a person was born and/or died. The appropriate date spans are annotated with a
    BIRTHDAY label. This is used to create a training file that can be used by Prodigy.

    If the first names or last names file is not specified, a short default list of names is used.

    See https://prodi.gy.
    """
    def load_names(handle, defaults, option):
        # Read one name per line, title-cased and stripped; fall back to the defaults when no file was given.
        if handle is None:
            return defaults
        names = [line.title().strip() for line in handle]
        # Blank lines would otherwise become empty names and produce sentences such as " was born on ...".
        names = [name for name in names if name]
        if not names:
            raise click.BadParameter("file does not contain any names", param_hint=option)
        return names

    def annotation_span(text, span, accept):
        # One span record: the covered text, its offsets, the label and whether the label is accepted.
        start, end = span
        return {"text": text[start:end], "start": start, "end": end, "label": "BIRTHDAY", "accept": accept}

    first_names_list = load_names(first_names, ["Mary", "Sue", "John", "Roger"], "--first-names")
    last_names_list = load_names(last_names, ["Smith", "Jones", "Jackson", "Ruiz"], "--last-names")

    # The factories do not change between samples, so build them once rather than on every iteration.
    make_name = name_generator(first_names_list, last_names_list)
    make_lifespan = lifespan_generator()

    for _ in range(n):
        text, born_span, died_span = generate(make_name, make_lifespan)
        spans = []
        if born_span is not None:
            spans.append(annotation_span(text, born_span, True))
        if died_span is not None:
            # Death dates carry the BIRTHDAY label with accept=False (presumably to serve as negative examples).
            spans.append(annotation_span(text, died_span, False))
        # One JSON object per line.
        click.echo(json.dumps({"text": text, "spans": spans}))
if __name__ == "__main__":
    # Invoke the click command only when the file is executed as a script, not when it is imported.
    birthday_corpus()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment