Sys.setenv( # get an API key here: https://platform.openai.com/account/api-keys
  OPENAI_API_KEY = 'YOUR_API_KEY_HERE'
)
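# Quick sanity check that the key is visible to this R session (should print TRUE):
nchar(Sys.getenv("OPENAI_API_KEY")) > 0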
### Make a text "database" to search:
library(tm)
library(dplyr)
library(corpus)
library(rjson)
library(stringi)
library(reticulate)
library(openai)
# Check if Python is available; some help setting this up is here: https://rpubs.com/eR_ic/transfoRmers
reticulate::py_available()
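# If the check above returns FALSE, one option (a sketch, assuming you are happy to let
# reticulate manage a Miniconda environment) is to install Python and the needed packages from R:
if (!reticulate::py_available(initialize = TRUE)) {
  reticulate::install_miniconda()
  reticulate::py_install(c("transformers", "torch"), pip = TRUE)
}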
# Importing 🤗 transformers into the R session (source: https://rpubs.com/eR_ic/transfoRmers)
transformers <- reticulate::import("transformers")
# read the news articles, source of news: https://www.kaggle.com/competitions/learn-ai-bbc/data
BBC.News.Train <- read.csv("BBC_News_Train.csv")
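# Quick look at the training data; the Kaggle file is expected to have three columns
# (roughly: an article id, the article text, and a category label), which the code
# below indexes by position:
str(BBC.News.Train)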
# Search based on this example: http://www.dataperspective.info/2017/11/information-retrieval-document-search-using-vector-space-model-in-r.html
### Function to pick documents related to your query:
doc.picker <- function(query, docs){
  length <- nrow(docs)
  ds <- DataframeSource(docs)
  x <- Corpus(ds)
  my.corpus <- x
  # remove punctuation
  my.corpus <- tm_map(my.corpus, removePunctuation)
  # remove numbers, uppercase, additional spaces
  my.corpus <- tm_map(my.corpus, removeNumbers)
  my.corpus <- tm_map(my.corpus, content_transformer(tolower))
  my.corpus <- tm_map(my.corpus, stripWhitespace)
  # create a term-document matrix in an efficient (sparse) format
  term.doc.matrix.stm <- TermDocumentMatrix(my.corpus)
  # constructing the Vector Space Model
  get.tf.idf.weights <- function(tf.vec) {
    # Compute tf-idf weights from a term frequency vector
    n.docs <- length(tf.vec)
    doc.frequency <- length(tf.vec[tf.vec > 0])
    weights <- rep(0, length(tf.vec))
    weights[tf.vec > 0] <- (1 + log2(tf.vec[tf.vec > 0])) * log2(n.docs/doc.frequency)
    return(weights)
  }
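  # Worked example of the weighting above (not run): for a term's frequency row
  # tf.vec = c(2, 0, 1) across 3 documents, doc.frequency = 2, so the non-zero
  # weights are (1 + log2(c(2, 1))) * log2(3/2), i.e. roughly c(1.17, 0.58).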
  # tf-idf weights of the words in each document, normalized per document:
  tfidf.matrix <- t(apply(term.doc.matrix.stm, 1,
                          FUN = function(row) {get.tf.idf.weights(row)}))
  colnames(tfidf.matrix) <- colnames(term.doc.matrix.stm)
  tfidf.matrix <- scale(tfidf.matrix, center = FALSE,
                        scale = sqrt(colSums(tfidf.matrix^2)))
  # split the query (appended as the last document) from the rest of the documents:
  query.vector <- tfidf.matrix[, length]
  tfidf.matrix <- tfidf.matrix[, 1:(length - 1)]
  # score the documents against the query (cosine similarity):
  doc.scores <- t(query.vector) %*% tfidf.matrix
  # collect results
  results.df <- data.frame(doc = docs[1:(length - 1), ]$doc_id, score = t(doc.scores))
  # rank the docs:
  results.df <- results.df[order(results.df$score, decreasing = TRUE), ]
  # return the docs, ranked by score:
  results.df
}
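# A minimal sketch of calling doc.picker() directly (assumption: the query is passed
# both as `query` and appended as the last row of `docs`, so the final TF-IDF column
# acts as the query vector):
toy.docs <- data.frame(doc_id = c(1, 2, 9999),
                       text = c("arsenal won the league title",
                                "the bank raised interest rates again",
                                "arsenal league title"),  # last row is the query itself
                       stringsAsFactors = FALSE)
doc.picker(query = "arsenal league title", docs = toy.docs)  # doc 1 should rank first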
# Perform a very crude news corpus search
news.search <- function(query, docs = BBC.News.Train){
  # build the document corpus and append the search query as an extra "document":
  docs <- data.frame(doc_id = c(docs[, 1], 9999),
                     text = c(docs[, 2], query),
                     dmeta1 = c(docs[, 3], query),
                     stringsAsFactors = FALSE)
  # rank all documents in the news library against the query:
  ranked.docs <- doc.picker(query = query, docs = docs)
  # return the text of the three highest-ranked articles:
  docs[docs[, 1] %in% ranked.docs[1:3, 1], 2]
}
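# Quick check of the retrieval step on its own: this should return the full text of
# the three BBC articles that best match the query (assumption: BBC_News_Train.csv
# was loaded above).
retrieved <- news.search("premier league football")
substr(retrieved, 1, 80)  # peek at the first 80 characters of each retrieved article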
# ask OpenAI's GPT a question, and provide it documents as context:
news.search.gpt <- function(query){
  question <- query
  context <- news.search(query = query)
  # this is the prompt we will send to GPT-3.5:
  prompt <- paste0("You are a chatbot answering a question about the news.
    You are provided with both a question and, for context, some related news articles from the BBC.
    Use the provided context when formulating an answer.
    The user question is: ", question, ".
    The following information is the context for answering the question:")
  # here we glue all the context together into one prompt:
  full.prompt <- paste(prompt, paste(context[1], context[2], context[3]))
  # Get an answer from OpenAI:
  answer <- create_completion(
    model = "text-davinci-003",
    prompt = full.prompt,
    max_tokens = 500
  )
  list(GPT_response = answer$choices, actual_context_provided = context)
}
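# The list returned above bundles the completion and the retrieved context; the
# generated text itself should sit in GPT_response$text (assumption: the openai
# package parses `choices` into a data frame with a `text` column).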
### Hugging Face implementation; this uses an open-source language model, not OpenAI:
news.search.hf <- function(query){
  # Specify the task: extractive question answering with a small open model
  reader <- transformers$pipeline(task = "question-answering", model = "deepset/minilm-uncased-squad2")
  searched <- news.search(query = query)
  context <- paste0(searched[1], searched[2], searched[3])
  outputs <- reader(question = query, context = context)
  outputs
}
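# The question-answering pipeline returns a named list; the fields of interest are
# typically outputs$answer (the extracted text span) and outputs$score (the model's
# confidence), which reticulate converts from the underlying Python dict.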
# get an answer from a different AI model (the Hugging Face pipeline):
answer <- news.search.hf("Why did Alex Ferguson want Thierry Henry punished in 2004?")
answer # oh wow, not great...
# Get an answer from OpenAI with our context:
news.search.gpt("Why did Alex Ferguson want Thierry Henry punished in 2004?") # context-aware implementation
# Get an answer from vanilla GPT-3.5 without context:
create_completion(model = "text-davinci-003", prompt = "Why did Alex Ferguson want Thierry Henry punished in 2004?", max_tokens = 500)
# get an answer from a different AI model (the Hugging Face pipeline):
answer <- news.search.hf("What kind of celebrity-initiated efforts were made to raise money for the Asian Christmas Day tsunami aid effort?")
answer # oh wow, not great...
# Get an answer from OpenAI with our context:
news.search.gpt("What kind of celebrity-initiated efforts were made to raise money for the Asian Christmas Day tsunami aid effort?") # context-aware implementation
# Get an answer from vanilla GPT-3.5 without context:
create_completion(model = "text-davinci-003", prompt = "What kind of celebrity-initiated efforts were made to raise money for the Asian Christmas Day tsunami aid effort?", max_tokens = 500)