Created
August 10, 2020 02:47
-
-
Save uliang/229cfd16c2da0b521f7ac0bc050cda9c to your computer and use it in GitHub Desktop.
A simple text filtering pipeline component in the spaCy style.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from typing import List, Dict, Union | |
| from spacy.tokens import Doc, Token | |
| from spacy.matcher import Matcher | |
| from srsly import read_json | |
class FilterTextPreprocessing:
    """Pipeline component that filters matched tokens out of a bag of words.

    Each supplied pattern is registered with a spaCy ``Matcher``. When the
    component is called on a ``Doc``, every token covered by a match has its
    ``token._.keep`` extension set to ``False``; the texts of the remaining
    tokens are stored, in document order, on ``doc._.bow``.
    """

    def __init__(self, nlp,
                 patterns: List[Dict[str, Union[str, List[Dict]]]]):
        """Register the custom extensions and load the match patterns.

        Args:
            nlp: Object exposing a ``vocab`` attribute that is handed to the
                ``Matcher`` (presumably a spaCy ``Language`` pipeline).
            patterns: One dict per rule, each with a ``'string_id'`` key
                (the match key) and a ``'pattern'`` key (a single token
                pattern, i.e. a list of token-attribute dicts).

        Raises:
            KeyError: If a pattern dict lacks ``'string_id'`` or
                ``'pattern'``.
        """
        # set_extension() refuses to re-register an existing name, so guard
        # the calls to allow more than one instance per process.
        if not Doc.has_extension('bow'):
            Doc.set_extension('bow', default=[])
        if not Token.has_extension('keep'):
            Token.set_extension('keep', default=True)

        self.matcher = Matcher(nlp.vocab)
        for patt_obj in patterns:
            # Index rather than .get(): a missing key should fail loudly
            # here instead of passing None on to the matcher.
            string_id = patt_obj['string_id']
            pattern = patt_obj['pattern']
            # Patterns-as-list plus keyword callback is the calling form
            # accepted by both spaCy >= 2.2.2 and spaCy 3.
            self.matcher.add(string_id, [pattern], on_match=self.on_match)

    def on_match(self, matcher, doc, i, matches):
        """Matcher callback: flag every token of match ``i`` as discarded.

        Args:
            matcher: The matcher that produced the match (unused).
            doc: The document being matched.
            i: Index of the current match within ``matches``.
            matches: Sequence of ``(match_id, start, end)`` tuples.
        """
        _, start, end = matches[i]
        for tkn in doc[start:end]:
            tkn._.keep = False

    def __call__(self, doc: "Doc") -> "Doc":
        """Run the matcher on ``doc`` and populate ``doc._.bow``.

        Args:
            doc: The document to process.

        Returns:
            The same ``doc``, with ``doc._.bow`` set to the texts of all
            tokens whose ``keep`` flag is still true.
        """
        # Results are consumed through the on_match callback, which flips
        # token._.keep for matched tokens; the return value is not needed.
        self.matcher(doc)
        doc._.bow = [tkn.text for tkn in doc if tkn._.keep]
        return doc

    @classmethod
    def from_pattern_file(cls, nlp, path):
        """Build the component from a JSON file of pattern dicts.

        Args:
            nlp: Passed through unchanged to the constructor.
            path: Location of the JSON file, read with ``srsly.read_json``.
                Its content must have the structure described for the
                ``patterns`` argument of ``__init__``.

        Returns:
            A new instance of ``cls``.
        """
        patterns = read_json(path)
        return cls(nlp, patterns)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment