fsndzomga · September 11, 2023 16:23
diff --git a/data_enrichment_gpt2.py b/data_enrichment_gpt2.py
 from sklearn.datasets import fetch_20newsgroups
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.naive_bayes import MultinomialNB
 from sklearn import metrics
 import numpy as np
 import os
 import pdb
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from transformers import GPT2LMHeadModel, GPT2Tokenizer
 import torch

 # Load the pretrained "gpt2" checkpoint twice over: once as the tokenizer that
 # turns prompts into token ids, once as the language-model head that generates
 # the continuation.
 tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
 model = GPT2LMHeadModel.from_pretrained("gpt2")

 # The four newsgroups used both to filter the dataset and to build the prompt.
 categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
 ]

 # Training split restricted to the categories above; headers, footers and
 # quotes are requested to be removed from each post.
 newsgroups_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
 )

 # Raw post bodies, plus the (initially empty) list that will receive the
 # GPT-2 enriched versions of those posts.
 data = newsgroups_train.data
 newsgroups_train_enriched = list()

 def process_text(index, text):
    """Return ``text`` followed by GPT-2's category guess, or ``None`` on failure.

    The post is capped at 2000 space-separated words, wrapped in a
    classification prompt that lists the module-level ``categories``, and
    passed through the module-level ``tokenizer`` and ``model``.

    Args:
        index: Position of the post in the dataset; only used in log output.
        text: Raw post body.

    Returns:
        The (word-capped) text, a space, and whatever follows the last ":" in
        the newly generated continuation; ``None`` if generation raised.
    """
    total_token_budget = 800      # prompt + generated tokens
    reserved_output_tokens = 32   # always leave room to generate something

    elt = text.split(" ")
    if len(elt) > 2000:
        elt = elt[:2000]
    text = ' '.join(elt)
    prompt = f"Give one category to the following text: {text}. The category should be among this list: {categories}"

    # Cap the prompt below the total budget. Without an explicit max_length
    # the tokenizer only truncates to its own model limit, which can exceed
    # the 800-token generation budget and leave no room for output.
    # NOTE(review): truncation presumably drops tokens from the end of the
    # prompt, so very long posts lose the trailing instruction - confirm.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True,
                       max_length=total_token_budget - reserved_output_tokens)
    input_token_count = inputs["input_ids"].shape[1]

    # Whatever the prompt did not use is available for the continuation.
    remaining_token_space = total_token_budget - input_token_count

    try:
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=remaining_token_space, pad_token_id=tokenizer.eos_token_id)
        # The generated sequence starts with the prompt tokens. Decode only
        # the new tokens; otherwise the ":" split below would pick up the
        # prompt's own "this list: [...]" tail and append every category name
        # to every document.
        response = tokenizer.decode(outputs[0][input_token_count:], skip_special_tokens=True)
        print(f"Processed index: {index}")
        return text + ' ' + response.split(":")[-1]  # Assuming the response contains the category after ":"
    except Exception as e:
        # Best effort: report the failure and let the caller decide what to do.
        print(f"Exception at index {index}: {e}")
        return None

 # Enrich every training post concurrently. as_completed() yields futures in
 # completion order, not submission order, so each result is written back to
 # the slot of the post it came from. Posts whose enrichment failed keep their
 # raw text. This keeps the documents row-aligned with newsgroups_train.target.
 newsgroups_train_enriched[:] = data
 with ThreadPoolExecutor(max_workers=5) as executor:
    futures = {executor.submit(process_text, index, text): index for index, text in enumerate(data)}

    for future in as_completed(futures):
        result = future.result()
        if result is not None:
            newsgroups_train_enriched[futures[future]] = result

 # Fit TF-IDF features and a multinomial naive Bayes classifier on the
 # enriched training posts.
 vectorizer = TfidfVectorizer()
 vectors = vectorizer.fit_transform(newsgroups_train_enriched)
 clf = MultinomialNB(alpha=.01)
 clf.fit(vectors, newsgroups_train.target)

 newsgroups_test = fetch_20newsgroups(subset='test',
                                     remove=('headers', 'footers', 'quotes'),
                                     categories=categories)

 # NOTE(review): the test posts are vectorized as-is, without the GPT-2
 # enrichment applied to the training posts - confirm this is intended.
 vectors_test = vectorizer.transform(newsgroups_test.data)
 pred = clf.predict(vectors_test)

 # f1_score expects the ground truth first, then the predictions.
 print(metrics.f1_score(newsgroups_test.target, pred, average='macro'))
	from sklearn.datasets import fetch_20newsgroups
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.naive_bayes import MultinomialNB
	from sklearn import metrics
	import numpy as np
	import os
	import pdb
	from concurrent.futures import ThreadPoolExecutor, as_completed
	from transformers import GPT2LMHeadModel, GPT2Tokenizer
	import torch

	# Load the pretrained "gpt2" checkpoint twice over: once as the tokenizer that
	# turns prompts into token ids, once as the language-model head that generates
	# the continuation.
	tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
	model = GPT2LMHeadModel.from_pretrained("gpt2")

	# The four newsgroups used both to filter the dataset and to build the prompt.
	categories = [
	    'alt.atheism',
	    'talk.religion.misc',
	    'comp.graphics',
	    'sci.space',
	]

	# Training split restricted to the categories above; headers, footers and
	# quotes are requested to be removed from each post.
	newsgroups_train = fetch_20newsgroups(
	    subset='train',
	    categories=categories,
	    remove=('headers', 'footers', 'quotes'),
	)

	# Raw post bodies, plus the (initially empty) list that will receive the
	# GPT-2 enriched versions of those posts.
	data = newsgroups_train.data
	newsgroups_train_enriched = list()

	def process_text(index, text):
	    """Return ``text`` followed by GPT-2's category guess, or ``None`` on failure.

	    The post is capped at 2000 space-separated words, wrapped in a
	    classification prompt that lists the module-level ``categories``, and
	    passed through the module-level ``tokenizer`` and ``model``.

	    Args:
	        index: Position of the post in the dataset; only used in log output.
	        text: Raw post body.

	    Returns:
	        The (word-capped) text, a space, and whatever follows the last ":" in
	        the newly generated continuation; ``None`` if generation raised.
	    """
	    total_token_budget = 800      # prompt + generated tokens
	    reserved_output_tokens = 32   # always leave room to generate something

	    elt = text.split(" ")
	    if len(elt) > 2000:
	        elt = elt[:2000]
	    text = ' '.join(elt)
	    prompt = f"Give one category to the following text: {text}. The category should be among this list: {categories}"

	    # Cap the prompt below the total budget. Without an explicit max_length
	    # the tokenizer only truncates to its own model limit, which can exceed
	    # the 800-token generation budget and leave no room for output.
	    # NOTE(review): truncation presumably drops tokens from the end of the
	    # prompt, so very long posts lose the trailing instruction - confirm.
	    inputs = tokenizer(prompt, return_tensors="pt", truncation=True,
	                       max_length=total_token_budget - reserved_output_tokens)
	    input_token_count = inputs["input_ids"].shape[1]

	    # Whatever the prompt did not use is available for the continuation.
	    remaining_token_space = total_token_budget - input_token_count

	    try:
	        with torch.no_grad():
	            outputs = model.generate(**inputs, max_new_tokens=remaining_token_space, pad_token_id=tokenizer.eos_token_id)
	        # The generated sequence starts with the prompt tokens. Decode only
	        # the new tokens; otherwise the ":" split below would pick up the
	        # prompt's own "this list: [...]" tail and append every category name
	        # to every document.
	        response = tokenizer.decode(outputs[0][input_token_count:], skip_special_tokens=True)
	        print(f"Processed index: {index}")
	        return text + ' ' + response.split(":")[-1] # Assuming the response contains the category after ":"
	    except Exception as e:
	        # Best effort: report the failure and let the caller decide what to do.
	        print(f"Exception at index {index}: {e}")
	        return None

	# Enrich every training post concurrently. as_completed() yields futures in
	# completion order, not submission order, so each result is written back to
	# the slot of the post it came from. Posts whose enrichment failed keep their
	# raw text. This keeps the documents row-aligned with newsgroups_train.target.
	newsgroups_train_enriched[:] = data
	with ThreadPoolExecutor(max_workers=5) as executor:
	    futures = {executor.submit(process_text, index, text): index for index, text in enumerate(data)}

	    for future in as_completed(futures):
	        result = future.result()
	        if result is not None:
	            newsgroups_train_enriched[futures[future]] = result

	# Fit TF-IDF features and a multinomial naive Bayes classifier on the
	# enriched training posts.
	vectorizer = TfidfVectorizer()
	vectors = vectorizer.fit_transform(newsgroups_train_enriched)
	clf = MultinomialNB(alpha=.01)
	clf.fit(vectors, newsgroups_train.target)

	newsgroups_test = fetch_20newsgroups(subset='test',
	                                     remove=('headers', 'footers', 'quotes'),
	                                     categories=categories)

	# NOTE(review): the test posts are vectorized as-is, without the GPT-2
	# enrichment applied to the training posts - confirm this is intended.
	vectors_test = vectorizer.transform(newsgroups_test.data)
	pred = clf.predict(vectors_test)

	# f1_score expects the ground truth first, then the predictions.
	print(metrics.f1_score(newsgroups_test.target, pred, average='macro'))