Skip to content

Instantly share code, notes, and snippets.

@fsndzomga
Created September 11, 2023 16:25
Show Gist options
  • Save fsndzomga/7c8d70702140760f3a792b9872513d57 to your computer and use it in GitHub Desktop.
Save fsndzomga/7c8d70702140760f3a792b9872513d57 to your computer and use it in GitHub Desktop.
Data enrichment using GPT-3 and LangChain
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import numpy as np
from langchain.chat_models import ChatOpenAI
from keys import OPENAI_API_KEY
import os
from langchain.schema import (
HumanMessage,
SystemMessage
)
import pdb
# Publish the key imported from keys.py as an environment variable before the
# chat model is built (presumably ChatOpenAI reads it from there -- confirm).
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# Chat model whose replies are limited to 200 tokens.
llm = ChatOpenAI(max_tokens=200)

# The four newsgroups used both as classification targets and as the label
# list shown to the LLM.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]

# Training split restricted to the categories above, loaded with the
# 'headers', 'footers' and 'quotes' parts removed.
newsgroups_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)
# Enrich every training document with a category suggested by the LLM.
#
# For each document the (possibly truncated) text is sent to the chat model
# together with the list of allowed categories; the model's reply is appended
# to the text. The resulting list has exactly one entry per training document,
# in the original order, so it stays aligned with newsgroups_train.target.

# Upper bound on the number of space-separated words sent to the model
# (presumably chosen to stay inside the model's context window -- confirm).
MAX_WORDS = 2000

newsgroups_train_enriched = []
data = newsgroups_train.data
for index, document in enumerate(data):
    # Slicing is a no-op for documents that are already short enough.
    text = ' '.join(document.split(" ")[:MAX_WORDS])
    messages = [
        SystemMessage(content="You are a helpful assistant that classifies texts."),
        HumanMessage(content=f"Give one category to the following text: {text}. The category should be among this list: {categories}"),
    ]
    try:
        response = llm(messages)
    except Exception as exc:
        # Catch Exception rather than everything so Ctrl-C / SystemExit still
        # stop the script. On failure keep the plain text: reusing `response`
        # here would either raise NameError (first document) or silently
        # attach the previous document's category to this one.
        print(f"LLM call failed for document {index}: {exc!r}; keeping text unenriched")
        suggested_category = ""
    else:
        suggested_category = response.content
    newsgroups_train_enriched.append(text + suggested_category)
# Learn the TF-IDF vocabulary from the enriched training texts and convert
# them into the feature matrix used for training.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(raw_documents=newsgroups_train_enriched)

# Fit a multinomial naive Bayes classifier (smoothing alpha=0.01) on those
# features against the newsgroup labels; fit() returns the estimator itself.
clf = MultinomialNB(alpha=0.01).fit(X=vectors, y=newsgroups_train.target)
# Evaluate the classifier on the held-out test split of the same categories,
# loaded with the same header/footer/quote removal as the training split.
newsgroups_test = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes'),
    categories=categories,
)

# NOTE(review): the test texts are vectorized as-is, without the LLM category
# suffix that was appended to the training texts -- confirm this asymmetry is
# intended.
vectors_test = vectorizer.transform(newsgroups_test.data)
pred = clf.predict(vectors_test)

# f1_score expects (y_true, y_pred). Macro F1 is symmetric in the two, so the
# printed value is unchanged, but the correct order keeps the call valid if
# the averaging mode or metric is ever changed.
print(metrics.f1_score(newsgroups_test.target, pred, average='macro'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment