Created October 10, 2020 09:45
Gist: joshua-taylor/1ec22f976826f707c057b2d2f1deeefb
spaCy tokenize
import spacy
from ftfy import fix_text
from tqdm import tqdm

nlp = spacy.load("en_core_web_sm")

tok_text = []  # output for our tokenised corpus

# Lower-case the text and repair any encoding issues with ftfy
text = df.text.str.lower().values
text = [fix_text(str(i)) for i in text]

# Tokenising using spaCy; the deprecated n_threads argument has been dropped,
# and unused pipeline components are disabled for speed
for doc in tqdm(nlp.pipe(text, disable=["tagger", "parser", "ner"])):
    tok = [t.text for t in doc if (t.is_ascii and not t.is_punct and not t.is_space)]
    tok_text.append(tok)
Hi,
I was using your example to learn a bit more about Gensim and I was wondering about your fix_text function. I assume it is just some pre-processing code - is it just the fix_text function from ftfy, or do you use something more complex?
Regards,
Alex

Hi Alex - yes, that's right: it is just the standard fix_text function from ftfy. It can be removed with no impact on the rest of the code if you do not have any text-encoding issues.