This Python script extracts embeddings from a large CSV dataset of news articles using a pretrained BERT-based embedding model (jina-embeddings-v2-base-en). It loads the model and tokenizer, reads the input CSV row by row, and concatenates each article's title and body. It batches these texts, feeds each batch through the model to generate embeddings, and writes one output row per article containing its metadata and its embedding.
import csv
import json
import ctypes as ct

import torch
from tqdm import tqdm
from transformers import AutoModel, BertTokenizerFast

# Raise the CSV field size limit to the platform maximum;
# some article bodies exceed the default limit.
csv.field_size_limit(int(ct.c_ulong(-1).value // 2))

model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-en', trust_remote_code=True)
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()
if device.type == 'cuda':
    # Half precision roughly halves GPU memory use and speeds up inference.
    model = model.half()

FIELDNAMES = ['date', 'year', 'month', 'day', 'author', 'section',
              'publication', 'title', 'embedding']
batch_size = 64


def embed_and_write(writer, texts, rows):
    """Embed a batch of texts and write one output row per input row."""
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors='pt').to(device)
    with torch.no_grad():
        # Pass the attention mask so padded positions are ignored.
        outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
    # Use the hidden state of the first ([CLS]) token as the embedding.
    embeddings = outputs.last_hidden_state[:, 0, :].float().cpu().numpy()
    for row, embedding in zip(rows, embeddings):
        row['embedding'] = json.dumps(embedding.tolist())
        writer.writerow(row)


with open('all-the-news-2-1.csv', 'r') as f, open('atn-embd.csv', 'w') as fout:
    reader = csv.DictReader(f)
    writer = csv.DictWriter(fout, fieldnames=FIELDNAMES)
    writer.writeheader()

    texts = []
    rows = []
    for row in tqdm(reader):
        # Embed the title followed by the article body.
        texts.append(row['title'] + '\n' + row['article'])
        rows.append({key: row[key] for key in FIELDNAMES[:-1]})
        if len(texts) == batch_size:
            embed_and_write(writer, texts, rows)
            texts, rows = [], []

    # Flush the final partial batch.
    if texts:
        embed_and_write(writer, texts, rows)
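
Note that the script takes the first ([CLS]) token's hidden state as the embedding, while Jina's embedding models are typically used with mean pooling over all token positions, so the vectors above may differ from what the model card describes. A minimal mean-pooling sketch, reusing the tokenizer, model, and device objects defined above:

def mean_pooled_embeddings(texts):
    """Mean-pool token embeddings, ignoring padding positions."""
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors='pt').to(device)
    with torch.no_grad():
        outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
    hidden = outputs.last_hidden_state                     # (batch, seq_len, dim)
    mask = inputs['attention_mask'].unsqueeze(-1).float()  # (batch, seq_len, 1)
    summed = (hidden * mask).sum(dim=1)                    # zero out padding, then sum
    counts = mask.sum(dim=1).clamp(min=1e-9)               # real tokens per sequence
    return (summed / counts).float().cpu().numpy()

The model's remote code (loaded via trust_remote_code=True) also documents an encode() convenience method that handles tokenization and pooling in one call; see the model card on the Hugging Face Hub.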
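To sanity-check the output, the embeddings can be read back from atn-embd.csv and compared with cosine similarity. A minimal sketch, assuming numpy is installed and the column layout produced by the script above:

import csv
import json
import numpy as np

# Load the first 100 rows of the output CSV produced above.
with open('atn-embd.csv', 'r') as f:
    rows = [row for _, row in zip(range(100), csv.DictReader(f))]

titles = [row['title'] for row in rows]
vectors = np.array([json.loads(row['embedding']) for row in rows])

def cosine_similarity(a, b):
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# Find the loaded article most similar to the first one.
scores = [cosine_similarity(vectors[0], v) for v in vectors[1:]]
best = int(np.argmax(scores))
print(titles[0], '~', titles[best + 1], scores[best])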