Created
November 24, 2019 13:16
-
-
Save joshua-taylor/3abad7e105e25cfb46f3c0f56ad16c85 to your computer and use it in GitHub Desktop.
BERT vectors and TFIDF
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import IPython
import numpy as np
import spacy
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
# Ask spaCy to use the GPU if it can; the returned flag is kept so torch can
# be configured to match.
is_using_gpu = spacy.prefer_gpu()
if is_using_gpu:
    # Make newly created torch tensors default to CUDA floats so they are
    # allocated on the GPU alongside the model.
    torch.set_default_tensor_type("torch.cuda.FloatTensor")

# Load the spaCy pipeline that produces the per-token vectors (doc.tensor)
# consumed further down the script.
nlp = spacy.load("en_trf_bertbaseuncased_lg")
# Fit a TF-IDF model on the question texts. Terms are lowercased by the
# vectorizer, so lookups against it must use lowercased words as well.
# NOTE(review): `df` is not defined in the visible code; it is expected to
# exist already and expose a `questionText` column -- confirm with the caller.
vectorizer = TfidfVectorizer(min_df=0.0, lowercase=True)
tfidf = vectorizer.fit(df.questionText.values)
tkn = tfidf.build_tokenizer()

print('creating a lookup dictionary')  # this speeds up the script significantly...
# Map each vocabulary term directly to its IDF weight: `vocabulary_` gives the
# term -> index mapping and `idf_` is indexed by that same index.
tfidf_lookup = {
    term: tfidf.idf_[index] for term, index in tfidf.vocabulary_.items()
}
from tqdm import tqdm

# Weight applied to tokens that are missing from the TF-IDF vocabulary.
DEFAULT_TFIDF_WEIGHT = 0.5
# Length of the zero vector stored for documents that cannot be embedded.
# NOTE(review): presumably the hidden size of the loaded BERT model -- confirm.
EMBEDDING_DIM = 768

# Build one vector per question: scale every row of doc.tensor by the IDF
# weight of the corresponding token, then average the rows.
vect = []
for doc in tqdm(nlp.pipe(df.questionText.values, batch_size=5000)):
    try:
        for row_idx, _ in enumerate(doc.tensor):
            word = doc[row_idx].text
            # The vectorizer lowercases its terms, so look up the lowercased
            # token; unknown tokens fall back to the default weight.
            weight = tfidf_lookup.get(word.lower(), DEFAULT_TFIDF_WEIGHT)
            # Rows are scaled in place, exactly as the original script did.
            doc.tensor[row_idx] = doc.tensor[row_idx] * weight
        vect.append(np.mean(doc.tensor, axis=0))
    except Exception:
        # Best-effort fallback (e.g. blank items): keep the output aligned with
        # the input by storing a zero vector. Unlike a bare `except:`, this
        # still lets KeyboardInterrupt / SystemExit propagate.
        vect.append(np.zeros(EMBEDDING_DIM,))

# Stack the per-question vectors into a single 2-D array and persist it.
vect = np.vstack(vect)
# takes 39 sec w/o gpu, 6 with!
np.save('question_vects_tfidf.npy', vect)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment