Created
December 22, 2017 19:30
-
-
Save wpm/2a40784364a7398b556fb63124acf32c to your computer and use it in GitHub Desktop.
Utility that matches text patterns in spaCy/Prodigy training data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
from json import JSONDecodeError | |
from typing import Sequence, Iterable, List | |
import click | |
import spacy | |
from spacy.matcher import Matcher | |
def match_patterns(nlp, patterns: Sequence[dict], corpus: Iterable[str]) -> Iterable[str]:
    """Yield each stretch of the corpus that matches one of the patterns.

    A single ``Matcher`` is built on ``nlp.vocab`` and all ``patterns`` are
    registered on it under one key. Every text in ``corpus`` is then run
    through ``nlp.pipe`` and the matcher is applied to the resulting document.

    :param nlp: loaded spaCy language object; its ``vocab`` and ``pipe`` are used.
    :param patterns: token patterns, unpacked into ``Matcher.add``.
    :param corpus: texts to search.
    :return: a generator of ``document[start:end]`` slices, one per match.
    """
    # NOTE(review): the return annotation says ``str``, but the value yielded
    # is a slice of the piped document -- confirm what callers expect.
    pattern_matcher = Matcher(nlp.vocab)
    pattern_matcher.add("Pattern Matcher", None, *patterns)
    for doc in nlp.pipe(corpus):
        # The matcher returns (match_id, start, end) triples; the id is not needed.
        yield from (doc[begin:stop] for _match_id, begin, stop in pattern_matcher(doc))
class Patterns(click.ParamType):
    """Click parameter type that loads match patterns from a file.

    The command-line value is a file path. A ``.jsonl`` file is read as one
    JSON object per line; any other extension is parsed as a single JSON
    document and iterated over. The ``"pattern"`` entry of every object is
    collected into the returned list.
    """

    name = "patterns"

    def convert(self, value: str, _, __) -> List[dict]:
        """Read the patterns file at ``value``.

        :param value: path to the patterns file.
        :param _: the click parameter (unused).
        :param __: the click context (unused).
        :return: the ``"pattern"`` entry of every record in the file.
        :raises click.BadParameter: (via ``self.fail``) when the file cannot
            be opened, is not valid JSON, or does not consist of records
            carrying a ``"pattern"`` key.
        """
        ext = value.split(".")[-1]
        try:
            with open(value) as f:
                if ext == "jsonl":
                    # Skip blank lines so stray empty lines are not reported
                    # as invalid JSON.
                    records = [json.loads(line) for line in f if line.strip()]
                else:
                    records = json.load(f)
        except OSError:
            self.fail(f"Cannot open {value}.")
        except JSONDecodeError as e:
            self.fail(f"Invalid JSON {e}")
        try:
            return [record["pattern"] for record in records]
        except (KeyError, TypeError):
            # Valid JSON with the wrong shape: not a sequence of objects, or
            # an object without a "pattern" key.
            self.fail("Invalid patterns file.")
class Corpus(click.ParamType):
    """Click parameter type that loads the texts to search from a file.

    The command-line value is a file path. A ``.json`` file is parsed as one
    JSON document and iterated over, a ``.jsonl`` file is read as one JSON
    object per line, and in both cases the ``"text"`` entry of every object
    is collected. Any other extension is read as plain text, one item per
    line (line endings are kept).
    """

    name = "corpus"

    def convert(self, value: str, _, __) -> Iterable[str]:
        """Read the corpus file at ``value``.

        :param value: path to the corpus file.
        :param _: the click parameter (unused).
        :param __: the click context (unused).
        :return: the texts contained in the file, as a list.
        :raises click.BadParameter: (via ``self.fail``) when the file cannot
            be opened, is not valid JSON, or its records lack a ``"text"`` key.
        """
        ext = value.split(".")[-1]
        try:
            with open(value) as f:
                if ext == "json":
                    records = json.load(f)
                elif ext == "jsonl":
                    # Decode eagerly, inside the try block: a lazy generator
                    # would defer json.loads until after this method returned,
                    # so decode errors would escape the handler below.
                    records = [json.loads(line) for line in f if line.strip()]
                else:
                    return f.readlines()
        except OSError:
            self.fail(f"Cannot open {value}")
        except JSONDecodeError as e:
            self.fail(f"Invalid JSON {e}")
        try:
            return [record["text"] for record in records]
        except (KeyError, TypeError):
            # Valid JSON with the wrong shape: not a sequence of objects, or
            # an object without a "text" key.
            self.fail("Invalid corpus file.")
# NOTE: click uses the docstring below as the command's --help text, so it is
# written for end users rather than for maintainers.
@click.command()
@click.argument("corpus", type=Corpus())
@click.argument("patterns", type=Patterns())
@click.option("--language-model", default="en", help="spaCy language model (default 'en')")
def pattern_match(corpus: Iterable[str], patterns: List[dict], language_model: str):
    """
    Print all the strings in the CORPUS that match the PATTERNS.

    CORPUS is a .txt, .json, or .jsonl file that can be used as input to Prodigy.

    PATTERNS is a .json or .jsonl file that can be passed to Prodigy's --patterns option.
    """
    # Both file arguments have already been parsed by their parameter types
    # by the time this body runs; only the language model is loaded here.
    nlp = spacy.load(language_model)
    for match in match_patterns(nlp, patterns, corpus):
        click.echo(match)


# Run the command-line interface when the file is executed as a script.
if __name__ == "__main__":
    pattern_match()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment