Last active
June 11, 2022 14:59
-
-
Save jacquesfize/5086c7c4f6c56e9d3c7cfb1eb0010fe8 to your computer and use it in GitHub Desktop.
A function to delete tokens from a spacy Doc object without losing associated information (PartOfSpeech, Dependency, Lemma, ...)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def remove_tokens(doc, index_to_del,
                  list_attr=(LOWER, POS, ENT_TYPE, IS_ALPHA, DEP, LEMMA,
                             IS_PUNCT, IS_DIGIT, IS_SPACE, IS_STOP)):
    """
    Remove tokens from a spaCy *Doc* object without losing the
    associated information (part of speech, dependency, lemma,
    user extensions, ...).

    Parameters
    ----------
    doc : spacy.tokens.doc.Doc
        spaCy representation of the text.
    index_to_del : list of int
        Positions (token indices) of each token to delete from the document.
    list_attr : sequence, optional
        spaCy attributes to keep (the default is
        (LOWER, POS, ENT_TYPE, IS_ALPHA, DEP, LEMMA, IS_PUNCT,
        IS_DIGIT, IS_SPACE, IS_STOP)).

    Returns
    -------
    spacy.tokens.doc.Doc
        Filtered version of *doc*.
    """
    # Set for O(1) membership tests below (index_to_del may be a list).
    indexes_to_del = set(index_to_del)

    np_array = doc.to_array(list_attr)  # (n_tokens, n_attrs) array of Doc

    # Boolean keep-mask over token positions; rows to delete are set False.
    # NOTE: `np.bool` was removed in NumPy 1.24 — use the builtin `bool`.
    mask_to_del = np.ones(len(np_array), dtype=bool)
    mask_to_del[list(indexes_to_del)] = False

    np_array_2 = np_array[mask_to_del]
    doc2 = Doc(doc.vocab,
               words=[t.text for t in doc if t.i not in indexes_to_del])
    doc2.from_array(list_attr, np_array_2)

    ### Modification made by @yarongon
    ### https://gist.github.com/Jacobe2169/5086c7c4f6c56e9d3c7cfb1eb0010fe8#gistcomment-2941380
    # Handling user extensions.
    # The `doc.user_data` dictionary holds the data backing user-defined
    # attributes, keyed by character offset, so a conversion is needed from
    # the old Doc to the new one.
    # More info here: https://github.com/explosion/spaCy/issues/2532
    new_index_to_old = np.arange(len(doc))[mask_to_del]
    # old token index -> new token index (O(1) lookup instead of np.where
    # scans inside the loop below).
    old_to_new_token = {old: new for new, old in enumerate(new_index_to_old)}
    # Offset <-> token-index maps needed to translate user_data keys.
    doc_offset_2_token = {tok.idx: tok.i for tok in doc}
    doc2_token_2_offset = {tok.i: tok.idx for tok in doc2}

    new_user_data = {}
    for (prefix, ext_name, offset, x), val in doc.user_data.items():
        old_token_index = doc_offset_2_token[offset]
        if old_token_index not in old_to_new_token:  # token was deleted
            continue
        new_char_index = doc2_token_2_offset[old_to_new_token[old_token_index]]
        new_user_data[(prefix, ext_name, new_char_index, x)] = val
    doc2.user_data = new_user_data

    return doc2
You need to import: from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA, DEP, LEMMA, IS_PUNCT, IS_DIGIT, IS_SPACE, IS_STOP — and also: import numpy as np, and: from spacy.tokens import Doc
This function was very helpful for me, and really saved me time.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi, I modified your code a little bit both for supporting user extensions and for performance.