Skip to content

Instantly share code, notes, and snippets.

@fsndzomga
Created September 11, 2023 16:23
Show Gist options
  • Save fsndzomga/015616e931d58ca1072b13acd8a1299c to your computer and use it in GitHub Desktop.
Save fsndzomga/015616e931d58ca1072b13acd8a1299c to your computer and use it in GitHub Desktop.
Data enrichment case 1
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import numpy as np
import os
import pdb
from concurrent.futures import ThreadPoolExecutor, as_completed
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
# Pretrained GPT-2 weights and the matching tokenizer; both are shared by
# every call to process_text below.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# The four newsgroups used for this experiment. The list is also
# interpolated verbatim into the prompt built by process_text.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]

# Training split, restricted to the categories above, with headers,
# footers and quoted replies removed from each post.
newsgroups_train = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
    categories=categories,
)
data = newsgroups_train.data

# Filled in later with one enriched string per training post.
newsgroups_train_enriched = []
def process_text(index, text, max_total_tokens=800, min_new_tokens=16):
    """Append a GPT-2 generated category guess to one post.

    The post is wrapped in a fixed instruction prompt, GPT-2 continues the
    prompt, and whatever follows the last ":" in the *generated* part is
    appended to the post.

    Args:
        index: Position of the post in the dataset; used only in log lines.
        text: Raw post text.
        max_total_tokens: Upper bound on prompt tokens plus generated tokens
            passed to ``model.generate`` as ``max_length``.
        min_new_tokens: Number of tokens always kept free for generation;
            the post is truncated at token level to guarantee this room.

    Returns:
        The (word-capped) post followed by a space and the generated
        suffix, or ``None`` if generation raised.

    Raises:
        ValueError: If the token budget leaves no room for any post text.
    """
    # Cheap word-level cap so very long posts are bounded before tokenizing.
    words = text.split(" ")
    if len(words) > 2000:
        text = " ".join(words[:2000])

    prompt_head = "Give one category to the following text: "
    prompt_tail = f". The category should be among this list: {categories}"

    # Tokenize the fixed instruction parts separately so that only the post
    # itself is ever truncated; the trailing instruction and category list
    # always survive.
    head_ids = tokenizer(prompt_head)["input_ids"]
    tail_ids = tokenizer(prompt_tail)["input_ids"]
    text_budget = max_total_tokens - min_new_tokens - len(head_ids) - len(tail_ids)
    if text_budget < 1:
        raise ValueError(
            f"max_total_tokens={max_total_tokens} leaves no room for the text "
            f"after reserving {min_new_tokens} new tokens and the prompt frame"
        )
    text_ids = tokenizer(text, truncation=True, max_length=text_budget)["input_ids"]

    prompt_ids = head_ids + text_ids + tail_ids
    input_ids = torch.tensor([prompt_ids])

    try:
        with torch.no_grad():
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=torch.ones_like(input_ids),
                # The prompt is at most max_total_tokens - min_new_tokens
                # long, so there is always room left to generate.
                max_length=max_total_tokens,
                pad_token_id=tokenizer.eos_token_id,
            )
        # The returned sequence starts with the prompt; decode only the
        # continuation so the prompt's own category list is not appended.
        completion = tokenizer.decode(
            outputs[0][len(prompt_ids):], skip_special_tokens=True
        )
        print(f"Processed index: {index}")
        # Assumes the category, if any, follows the last ":" in the completion.
        return text + ' ' + completion.split(":")[-1]
    except Exception as e:
        # Best effort: report the failure and let the caller decide what to
        # do with a missing result.
        print(f"Exception at index {index}: {e}")
        return None
# Enrich every training post on a small thread pool. Results are stored by
# their original index rather than in completion order, so the enriched list
# stays aligned one-to-one with the dataset's targets.
with ThreadPoolExecutor(max_workers=5) as executor:
    futures = {
        executor.submit(process_text, index, text): index
        for index, text in enumerate(data)
    }
    # Start from the raw posts: a post whose enrichment failed (None result)
    # keeps its original text instead of being dropped, which would change
    # the list length.
    enriched_by_index = list(data)
    for future in as_completed(futures):
        result = future.result()
        if result is not None:
            enriched_by_index[futures[future]] = result

newsgroups_train_enriched[:] = enriched_by_index
# Fit a TF-IDF + multinomial Naive Bayes classifier on the enriched training
# posts, then report macro-averaged F1 on the test split.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(newsgroups_train_enriched)
clf = MultinomialNB(alpha=.01)
clf.fit(vectors, newsgroups_train.target)

newsgroups_test = fetch_20newsgroups(subset='test',
                                     remove=('headers', 'footers', 'quotes'),
                                     categories=categories)
# NOTE(review): the test posts are vectorized as-is, without going through
# process_text, so train and test text are prepared differently -- confirm
# this is intended.
vectors_test = vectorizer.transform(newsgroups_test.data)
pred = clf.predict(vectors_test)
# f1_score takes (y_true, y_pred): ground-truth labels first.
print(metrics.f1_score(newsgroups_test.target, pred, average='macro'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment