Skip to content

Instantly share code, notes, and snippets.

@fsndzomga
Created September 11, 2023 16:24
Show Gist options
  • Save fsndzomga/ce897681a517b56732dae0a468024f65 to your computer and use it in GitHub Desktop.
Save fsndzomga/ce897681a517b56732dae0a468024f65 to your computer and use it in GitHub Desktop.
Data enrichment using GPT-3 via anonLLM
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import numpy as np
import time
from anonLLM.llm import OpenaiLanguageModel
from keys import OPENAI_API_KEY
import os
import pdb
# Experiment: append an LLM-suggested category to each training document
# ("data enrichment"), then train a TF-IDF + multinomial Naive Bayes
# classifier on the enriched text and report macro F1 on the test split.

# Publish the key through the environment before the model wrapper is built.
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
llm = OpenaiLanguageModel()

categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]

# Documents are cut to this many space-separated words to keep prompts short.
MAX_WORDS = 2000
# Pause between LLM requests, in seconds.
REQUEST_DELAY_SECONDS = 5

newsgroups_train = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
    categories=categories,
)

newsgroups_train_enriched = []
data = newsgroups_train.data
for index, document in enumerate(data):
    # Slicing is a no-op for documents that are already short enough.
    text = ' '.join(document.split(" ")[:MAX_WORDS])
    prompt = f"Give one category to the following text: {text}. The category should be among this list: {categories}"
    try:
        response = llm.generate(prompt)
    except Exception as exc:
        # Keep exactly one entry per document so the enriched list stays
        # aligned with newsgroups_train.target. Previously a failed request
        # dropped into the debugger and then appended the *previous*
        # document's response (or raised NameError on the first document).
        print(f"LLM request failed for document {index}: {exc!r}")
        newsgroups_train_enriched.append(text)
    else:
        print(index)  # progress indicator
        # NOTE(review): response[0] assumes generate() returns a sequence of
        # completions -- confirm against the anonLLM version in use.
        # The space keeps the category from fusing with the document's last
        # word when the text is tokenized.
        newsgroups_train_enriched.append(text + " " + response[0])
    time.sleep(REQUEST_DELAY_SECONDS)

vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(newsgroups_train_enriched)

clf = MultinomialNB(alpha=.01)
clf.fit(vectors, newsgroups_train.target)

newsgroups_test = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes'),
    categories=categories,
)
# NOTE(review): the test documents are vectorized as-is, without the LLM
# category that was appended to the training documents. Confirm that this
# train/test asymmetry is intended before interpreting the score.
vectors_test = vectorizer.transform(newsgroups_test.data)
pred = clf.predict(vectors_test)

# f1_score expects (y_true, y_pred), in that order.
print(metrics.f1_score(newsgroups_test.target, pred, average='macro'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment