Last active
June 11, 2022 14:59
-
-
Save jacquesfize/5086c7c4f6c56e9d3c7cfb1eb0010fe8 to your computer and use it in GitHub Desktop.
A function to delete tokens from a spacy Doc object without losing associated information (PartOfSpeech, Dependency, Lemma, ...)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def remove_tokens(doc, index_to_del,
                  list_attr=(LOWER, POS, ENT_TYPE, IS_ALPHA, DEP, LEMMA,
                             IS_PUNCT, IS_DIGIT, IS_SPACE, IS_STOP)):
    """
    Remove tokens from a spaCy *Doc* object without losing the
    associated information (part of speech, dependency, lemma,
    user extensions, ...).

    Parameters
    ----------
    doc : spacy.tokens.doc.Doc
        spaCy representation of the text.
    index_to_del : list of int
        Positions (token indices) of each token to delete from the document.
    list_attr : sequence, optional
        spaCy attributes to keep (the default is
        (LOWER, POS, ENT_TYPE, IS_ALPHA, DEP, LEMMA, IS_PUNCT,
        IS_DIGIT, IS_SPACE, IS_STOP)).

    Returns
    -------
    spacy.tokens.doc.Doc
        Filtered version of *doc*.
    """
    # Set for O(1) membership tests below (index_to_del may be a list).
    indexes_to_del = set(index_to_del)

    np_array = doc.to_array(list_attr)  # (n_tokens, n_attrs) array of Doc

    # Boolean keep-mask over token positions; rows to delete are set False.
    # NOTE: `np.bool` was removed in NumPy 1.24 — use the builtin `bool`.
    mask_to_del = np.ones(len(np_array), dtype=bool)
    mask_to_del[list(indexes_to_del)] = False

    np_array_2 = np_array[mask_to_del]
    doc2 = Doc(doc.vocab,
               words=[t.text for t in doc if t.i not in indexes_to_del])
    doc2.from_array(list_attr, np_array_2)

    ### Modification made by @yarongon
    ### https://gist.github.com/Jacobe2169/5086c7c4f6c56e9d3c7cfb1eb0010fe8#gistcomment-2941380
    # Handling user extensions.
    # The `doc.user_data` dictionary holds the data backing user-defined
    # attributes, keyed by character offset, so a conversion is needed from
    # the old Doc to the new one.
    # More info here: https://github.com/explosion/spaCy/issues/2532
    new_index_to_old = np.arange(len(doc))[mask_to_del]
    # old token index -> new token index (O(1) lookup instead of np.where
    # scans inside the loop below).
    old_to_new_token = {old: new for new, old in enumerate(new_index_to_old)}
    # Offset <-> token-index maps needed to translate user_data keys.
    doc_offset_2_token = {tok.idx: tok.i for tok in doc}
    doc2_token_2_offset = {tok.i: tok.idx for tok in doc2}

    new_user_data = {}
    for (prefix, ext_name, offset, x), val in doc.user_data.items():
        old_token_index = doc_offset_2_token[offset]
        if old_token_index not in old_to_new_token:  # token was deleted
            continue
        new_char_index = doc2_token_2_offset[old_to_new_token[old_token_index]]
        new_user_data[(prefix, ext_name, new_char_index, x)] = val
    doc2.user_data = new_user_data

    return doc2
You need to import: from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA, DEP, LEMMA, IS_PUNCT, IS_DIGIT, IS_SPACE, IS_STOP — and also: import numpy as np, and: from spacy.tokens import Doc
This function was very helpful for me, and really saved me time.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi, I modified your code a little bit both for supporting user extensions and for performance.