wpm · January 3, 2018 20:37
diff --git a/birthday_corpus.py b/birthday_corpus.py
 import json
 import re
 import time
 from random import choice, random
 from typing import TextIO, Callable, Sequence, Tuple, Optional

 import click

 # Person names and formatted dates are both carried around as plain strings.
 NAME = str
 DATE = str
 # Pair of (start, end) character offsets into a text, usable directly as text[start:end].
 SPAN_OFFSET = Tuple[int, int]


 def generate(name_factory: Callable[[], NAME], lifespan_factory: Callable[[], Tuple[DATE, DATE]]) \
         -> Tuple[str, Optional[SPAN_OFFSET], Optional[SPAN_OFFSET]]:
     """
     Build one random sentence about a person and locate the dates it mentions.

     A name and a (born, died) pair are drawn from the factories and one of a fixed set of sentence templates is
     chosen at random. Some templates mention the birth date, some the death date, some both and some neither.

     name_factory: zero-argument callable returning the person's name.
     lifespan_factory: zero-argument callable returning the (born, died) date strings.

     Returns the sentence, the (start, end) offsets of the birth date, and the (start, end) offsets of the death
     date. An offset pair is None when the sentence does not mention that date.
     """

     def find_span(date: DATE, search_from: int = 0) -> SPAN_OFFSET:
         # Offsets of the first occurrence of the date at or after search_from.
         start = text.find(date, search_from)
         return start, start + len(date)

     name = name_factory()
     born, died = lifespan_factory()
     # Each entry is (sentence, mentions birth date, mentions death date).
     texts = [
         (f"{name} was born on {born}.", True, False),
         (f"{name} has a birthday on {born}.", True, False),
         (f"{name} was born on {born} and died {died}.", True, True),
         (f"On {born} {name} was born.", True, False),
         (f"On {died} {name} died.", False, True),
         (f"{name} died on {died}.", False, True),
         (f"RIP {name}: {born}-{died}.", True, True),
         (f"A skilled carpenter, {name} lived from {born} until {died}.", True, True),
         (f"{died} was the day {name} died.", False, True),
         (f"{born} was the day {name} was born.", True, False),
         (f"{name} is a skilled juggler.", False, False),
         (f"Where are you, {name}?", False, False),
     ]
     text, contains_born, contains_died = choice(texts)
     born_span = died_span = None
     if contains_born:
         born_span = find_span(born)
     if contains_died:
         # Every template that mentions both dates puts the birth date first, so look for the death date after it.
         # Without this, identical birth and death strings would both resolve to the birth date's position.
         died_span = find_span(died, born_span[1] if born_span is not None else 0)
     return text, born_span, died_span


 def name_generator(first_names: Sequence[str], last_names: Sequence[str]) -> Callable[[], NAME]:
     """
     Create a factory of random person names.

     Each call of the returned factory yields, with equal probability, either a first name followed by a last
     name or a first name on its own. Names are picked at random from the given sequences.
     """

     def factory() -> str:
         with_surname = random() < 0.5
         given = choice(first_names)
         return f"{given} {choice(last_names)}" if with_surname else f"{given}"

     return factory


 def lifespan_generator(start="1/1/1900", end="12/31/2010") -> Callable[[], Tuple[DATE, DATE]]:
     """
     Create a factory of random (born, died) date-string pairs.

     Each call of the returned factory picks a birth moment uniformly between start and end, a death moment
     uniformly between the birth moment and end, and one of three date formats. Both dates of a pair are written
     in the same format, with leading zeroes removed from the numbers.

     start: earliest possible birth date, written as month/day/year.
     end: latest possible date, written as month/day/year.
     """
     # Imported here so the function is self-contained with respect to the module's import block.
     from datetime import datetime

     # Calendar arithmetic is done with datetime rather than time.mktime()/time.localtime(): the time module may
     # not handle dates before the epoch on every platform, and its results depend on the local time zone.
     earliest = datetime.strptime(start, "%m/%d/%Y")
     latest = datetime.strptime(end, "%m/%d/%Y")
     formats = ["%m/%d/%Y", "%B %d, %Y", "%d %B %Y"]
     leading_zero = re.compile(r'\b0(\d)')

     def make_date(moment, fmt):
         # strftime zero-pads days and months; drop the padding so "03/07/1950" becomes "3/7/1950".
         return leading_zero.sub(r'\1', moment.strftime(fmt))

     def factory() -> Tuple[DATE, DATE]:
         born = earliest + (latest - earliest) * random()
         died = born + (latest - born) * random()
         fmt = choice(formats)
         return make_date(born, fmt), make_date(died, fmt)

     return factory


 @click.command()
 @click.option("--n", default=10000, help="number of samples to generate")
 @click.option("--first-names", type=click.File(), help="list of first names, one per line")
 @click.option("--last-names", type=click.File(), help="list of last names, one per line")
 def birthday_corpus(n: int, first_names: Optional[TextIO], last_names: Optional[TextIO]):
     """
     Generate a corpus of texts describing birth and death dates for people.

     The texts refer to dates on which a person was born and/or died. The appropriate date spans are annotated with a
     BIRTHDAY label. This is used to create a training file that can be used by Prodigy.

     If the first names or last names file is not specified, a short default list of names is used.

     See https://prodi.gy.
     """

     def load_names(handle, default):
         # One title-cased name per line; blank lines are skipped so they cannot turn into empty names.
         if handle is None:
             return default
         return [name for name in (line.title().strip() for line in handle) if name]

     def annotation_span(text, span, accept):
         # Describe one date span of the given text; accept says whether it is a true BIRTHDAY example.
         start, end = span
         return {"text": text[start:end], "start": start, "end": end, "label": "BIRTHDAY", "accept": accept}

     first_names_list = load_names(first_names, ["Mary", "Sue", "John", "Roger"])
     last_names_list = load_names(last_names, ["Smith", "Jones", "Jackson", "Ruiz"])
     # The factories do not change between samples, so they are built once rather than on every iteration.
     name_factory = name_generator(first_names_list, last_names_list)
     lifespan_factory = lifespan_generator()
     for _ in range(n):
         text, born_span, died_span = generate(name_factory, lifespan_factory)
         spans = []
         # Birth dates are emitted as accepted BIRTHDAY spans, death dates as rejected ones.
         if born_span is not None:
             spans.append(annotation_span(text, born_span, True))
         if died_span is not None:
             spans.append(annotation_span(text, died_span, False))
         # One JSON object per output line.
         click.echo(json.dumps({"text": text, "spans": spans}))


 # Run the command-line entry point only when this file is executed as a script, not when it is imported.
 if __name__ == "__main__":
    birthday_corpus()
	import json
	import re
	import time
	from random import choice, random
	from typing import TextIO, Callable, Sequence, Tuple, Optional

	import click

	# Person names and formatted dates are both carried around as plain strings.
	NAME = str
	DATE = str
	# Pair of (start, end) character offsets into a text, usable directly as text[start:end].
	SPAN_OFFSET = Tuple[int, int]


	def generate(name_factory: Callable[[], NAME], lifespan_factory: Callable[[], Tuple[DATE, DATE]]) \
	        -> Tuple[str, Optional[SPAN_OFFSET], Optional[SPAN_OFFSET]]:
	    """
	    Build one random sentence about a person and locate the dates it mentions.

	    A name and a (born, died) pair are drawn from the factories and one of a fixed set of sentence templates is
	    chosen at random. Some templates mention the birth date, some the death date, some both and some neither.

	    name_factory: zero-argument callable returning the person's name.
	    lifespan_factory: zero-argument callable returning the (born, died) date strings.

	    Returns the sentence, the (start, end) offsets of the birth date, and the (start, end) offsets of the death
	    date. An offset pair is None when the sentence does not mention that date.
	    """

	    def find_span(date: DATE, search_from: int = 0) -> SPAN_OFFSET:
	        # Offsets of the first occurrence of the date at or after search_from.
	        start = text.find(date, search_from)
	        return start, start + len(date)

	    name = name_factory()
	    born, died = lifespan_factory()
	    # Each entry is (sentence, mentions birth date, mentions death date).
	    texts = [
	        (f"{name} was born on {born}.", True, False),
	        (f"{name} has a birthday on {born}.", True, False),
	        (f"{name} was born on {born} and died {died}.", True, True),
	        (f"On {born} {name} was born.", True, False),
	        (f"On {died} {name} died.", False, True),
	        (f"{name} died on {died}.", False, True),
	        (f"RIP {name}: {born}-{died}.", True, True),
	        (f"A skilled carpenter, {name} lived from {born} until {died}.", True, True),
	        (f"{died} was the day {name} died.", False, True),
	        (f"{born} was the day {name} was born.", True, False),
	        (f"{name} is a skilled juggler.", False, False),
	        (f"Where are you, {name}?", False, False),
	    ]
	    text, contains_born, contains_died = choice(texts)
	    born_span = died_span = None
	    if contains_born:
	        born_span = find_span(born)
	    if contains_died:
	        # Every template that mentions both dates puts the birth date first, so look for the death date after it.
	        # Without this, identical birth and death strings would both resolve to the birth date's position.
	        died_span = find_span(died, born_span[1] if born_span is not None else 0)
	    return text, born_span, died_span


	def name_generator(first_names: Sequence[str], last_names: Sequence[str]) -> Callable[[], NAME]:
	    """
	    Create a factory of random person names.

	    Each call of the returned factory yields, with equal probability, either a first name followed by a last
	    name or a first name on its own. Names are picked at random from the given sequences.
	    """

	    def factory() -> str:
	        if random() < 0.5:
	            return f"{choice(first_names)} {choice(last_names)}"
	        else:
	            return f"{choice(first_names)}"

	    return factory


	def lifespan_generator(start="1/1/1900", end="12/31/2010") -> Callable[[], Tuple[DATE, DATE]]:
	    """
	    Create a factory of random (born, died) date-string pairs.

	    Each call of the returned factory picks a birth moment uniformly between start and end, a death moment
	    uniformly between the birth moment and end, and one of three date formats. Both dates of a pair are written
	    in the same format, with leading zeroes removed from the numbers.

	    start: earliest possible birth date, written as month/day/year.
	    end: latest possible date, written as month/day/year.
	    """
	    # Imported here so the function is self-contained with respect to the module's import block.
	    from datetime import datetime

	    # Calendar arithmetic is done with datetime rather than time.mktime()/time.localtime(): the time module may
	    # not handle dates before the epoch on every platform, and its results depend on the local time zone.
	    earliest = datetime.strptime(start, "%m/%d/%Y")
	    latest = datetime.strptime(end, "%m/%d/%Y")
	    formats = ["%m/%d/%Y", "%B %d, %Y", "%d %B %Y"]
	    leading_zero = re.compile(r'\b0(\d)')

	    def make_date(moment, fmt):
	        # strftime zero-pads days and months; drop the padding so "03/07/1950" becomes "3/7/1950".
	        return leading_zero.sub(r'\1', moment.strftime(fmt))

	    def factory() -> Tuple[DATE, DATE]:
	        born = earliest + (latest - earliest) * random()
	        died = born + (latest - born) * random()
	        fmt = choice(formats)
	        return make_date(born, fmt), make_date(died, fmt)

	    return factory


	@click.command()
	@click.option("--n", default=10000, help="number of samples to generate")
	@click.option("--first-names", type=click.File(), help="list of first names, one per line")
	@click.option("--last-names", type=click.File(), help="list of last names, one per line")
	def birthday_corpus(n: int, first_names: Optional[TextIO], last_names: Optional[TextIO]):
	    """
	    Generate a corpus of texts describing birth and death dates for people.

	    The texts refer to dates on which a person was born and/or died. The appropriate date spans are annotated with a
	    BIRTHDAY label. This is used to create a training file that can be used by Prodigy.

	    If the first names or last names file is not specified, a short default list of names is used.

	    See https://prodi.gy.
	    """

	    def load_names(handle, default):
	        # One title-cased name per line; blank lines are skipped so they cannot turn into empty names.
	        if handle is None:
	            return default
	        return [name for name in (line.title().strip() for line in handle) if name]

	    def annotation_span(text, span, accept):
	        # Describe one date span of the given text; accept says whether it is a true BIRTHDAY example.
	        start, end = span
	        return {"text": text[start:end], "start": start, "end": end, "label": "BIRTHDAY", "accept": accept}

	    first_names_list = load_names(first_names, ["Mary", "Sue", "John", "Roger"])
	    last_names_list = load_names(last_names, ["Smith", "Jones", "Jackson", "Ruiz"])
	    # The factories do not change between samples, so they are built once rather than on every iteration.
	    name_factory = name_generator(first_names_list, last_names_list)
	    lifespan_factory = lifespan_generator()
	    for _ in range(n):
	        text, born_span, died_span = generate(name_factory, lifespan_factory)
	        spans = []
	        # Birth dates are emitted as accepted BIRTHDAY spans, death dates as rejected ones.
	        if born_span is not None:
	            spans.append(annotation_span(text, born_span, True))
	        if died_span is not None:
	            spans.append(annotation_span(text, died_span, False))
	        # One JSON object per output line.
	        click.echo(json.dumps({"text": text, "spans": spans}))


	# Run the command-line entry point only when this file is executed as a script, not when it is imported.
	if __name__ == "__main__":
	    birthday_corpus()