Skip to content

Instantly share code, notes, and snippets.

@ines
Last active May 2, 2021 04:32
Show Gist options
  • Save ines/236461dec6f804530a9326e0ba993484 to your computer and use it in GitHub Desktop.
Save ines/236461dec6f804530a9326e0ba993484 to your computer and use it in GitHub Desktop.
spaCy v2.0 example: Get and set token text character spans as custom attribute extensions
# coding: utf8
from __future__ import unicode_literals
from spacy.lang.en import English
from spacy.tokens import Token
# Example of using spaCy v2.0's custom attribute extensions to get and set
# character spans of a token text as token attributes.
# Discussion: https://twitter.com/DavidWell/status/920245066404450304
# The bare English language class is enough here, for simplicity.
nlp = English()
# Internal attribute that stores the list of (start, end) highlights.
# NOTE(review): the list literal below is created once at registration time;
# on spaCy releases that do not copy mutable defaults it is the same object
# for every token -- confirm against the installed version.
Token.set_extension('_hl', default=[])
def get_highlights(token):
    """Return the token's highlights together with the text they cover.

    Reads the (start, end) pairs stored in the internal ``token._._hl``
    attribute and returns a list of ``(start, end, text)`` tuples, where
    ``text`` is ``token.text[start:end]``.
    """
    # get highlights from internal attribute
    return [(start, end, token.text[start:end]) for start, end in token._._hl]
def set_highlight(token, value):
    """Add one ``(start, end)`` highlight to the token.

    ``value`` is appended to the highlights already stored in the internal
    ``token._._hl`` attribute.
    """
    # Build a new list and assign it back instead of appending in place:
    # `_hl` is registered with a single default list object, and mutating
    # that object would leak this highlight to every token that still
    # reads the default.
    token._._hl = list(token._._hl) + [value]
# Public attribute: reading it goes through get_highlights, assigning to it
# goes through set_highlight.
Token.set_extension('highlight', setter=set_highlight, getter=get_highlights)
doc = nlp(u"This is a test.")
token = doc[0]
print('Example using getters and setters')
print(token._.highlight, 'No highlights set yet')
# Each assignment adds one more (start, end) span to the first token.
for span, message in (
    ((1, 3), 'One highlight set'),
    ((3, 4), 'Two highlights set'),
):
    token._.highlight = span
    print(token._.highlight, message)
# However, maybe this logic is actually confusing – it's not exactly intuitive
# that setting the value of ._.highlight *appends* to the list. So alternatively,
# you could also simply create a method:
# Plain attribute holding the (start, end, text) highlights of a token.
# NOTE(review): the list literal is created once at registration time; on
# spaCy releases that do not copy mutable defaults it is the same object for
# every token -- confirm against the installed version.
Token.set_extension('highlights', default=[])
def highlight_method(token, value):
    """Add a highlight to ``token._.highlights``.

    ``value`` is a ``(start, end)`` pair of character offsets into
    ``token.text``. The stored entry is ``(start, end, text)``, where
    ``text`` is ``token.text[start:end]``.
    """
    start, end = value
    entry = (start, end, token.text[start:end])
    # Assign a new list instead of appending in place: `highlights` is
    # registered with a single default list object, and mutating it would
    # leak this entry to every token that still reads the default.
    token._.highlights = list(token._.highlights) + [entry]
# Expose highlight_method as token._.add_highlight(value).
Token.set_extension('add_highlight', method=highlight_method)
print('Alternative approach using a method')
print(token._.highlights, 'No highlights set yet')
token._.add_highlight((1, 3))
print(token._.highlights, 'One highlight set')
token._.add_highlight((3, 4))
# Message typo fixed ("hightlights" -> "highlights") so it matches the
# wording of the getter/setter example above.
print(token._.highlights, 'Two highlights set')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment