ranking
# === Sentence embeddings: TF SavedModel export and PyTorch inference with msmarco-distilroberta-base-v2 ===
import os

import torch
import tensorflow as tf
from transformers import AutoModel, AutoTokenizer, TFAutoModel

os.environ["TOKENIZERS_PARALLELISM"] = "false"

tf_model = TFAutoModel.from_pretrained('sentence-transformers/msmarco-distilroberta-base-v2', from_pt=True)
model = AutoModel.from_pretrained('sentence-transformers/msmarco-distilroberta-base-v2')
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/msmarco-distilroberta-base-v2')

texts = ['I love to sleep', 'Sleeping is me']

# PyTorch path: pad to the longest sequence in the batch
dataset = tokenizer.batch_encode_plus(texts, padding='longest')
# TensorFlow path: pad to a fixed length of 128 tokens so it matches the export signature below
tf_dataset = tokenizer.batch_encode_plus(texts, padding='max_length', return_tensors='tf', max_length=128)

tf_iter = tf.data.Dataset.from_tensor_slices(dict(tf_dataset)).batch(32)
for batch in tf_iter:
    print(tf_model(batch))
class ModelFn(tf.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model
        self.max_seq_length = 512

    @tf.function(input_signature=[
        tf.TensorSpec(shape=(None, 128), dtype=tf.int64),
        tf.TensorSpec(shape=(None, 128), dtype=tf.int64),
    ])
    def predict_export(self, input_ids, attention_mask):
        results = self.model(
            {'input_ids': input_ids, 'attention_mask': attention_mask},
            training=False).pooler_output
        return results

    @tf.function(input_signature=[tf.TensorSpec(shape=[None], dtype=tf.string)])
    def predict_export_serialised(self, serialized):
        input_features = {
            'input_ids': tf.io.FixedLenFeature([self.max_seq_length], tf.int64),
            'attention_mask': tf.io.FixedLenFeature(
                [self.max_seq_length], tf.int64),
        }
        example = tf.io.parse_example(serialized=serialized, features=input_features)
        # Pass the parsed feature dict straight to the Keras model
        results = self.model(example, training=False).pooler_output
        return results

    def export_save_model(self, export_dir):
        export_dir = "%s/1/" % export_dir
        tf.saved_model.save(self.model, export_dir,
                            signatures={'predict_b64': self.predict_export_serialised,
                                        'predict': self.predict_export})


ModelFn(tf_model).export_save_model('model')
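
# Optional sanity check (a sketch, assuming the export above succeeded): load the
# SavedModel back and call the 'predict' signature. Signature functions take keyword
# arguments and return a dict of named output tensors; the inputs must be int64 and
# padded to length 128 to match the TensorSpecs above.
loaded = tf.saved_model.load('model/1/')
predict_fn = loaded.signatures['predict']
check = tokenizer.batch_encode_plus(texts, padding='max_length', max_length=128, return_tensors='tf')
print(predict_fn(input_ids=tf.cast(check['input_ids'], tf.int64),
                 attention_mask=tf.cast(check['attention_mask'], tf.int64)))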
class TransformerDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])


ds = TransformerDataset(dataset)
trainloader = torch.utils.data.DataLoader(ds, batch_size=32, shuffle=False, num_workers=8)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

for batch in trainloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    outputs = model(**batch)
    print(outputs.pooler_output)
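
# Note (a hedged aside): the loop above prints the raw pooler_output, but
# sentence-transformers checkpoints such as this one typically form the sentence
# embedding by mean pooling the token embeddings under the attention mask.
# A minimal sketch, reusing the last `batch`/`outputs` from the loop above;
# the `mean_pool` helper name is introduced here for illustration.
def mean_pool(last_hidden_state, attention_mask):
    mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
    summed = torch.sum(last_hidden_state * mask, dim=1)
    counts = torch.clamp(mask.sum(dim=1), min=1e-9)
    return summed / counts

sentence_embeddings = mean_pool(outputs.last_hidden_state, batch['attention_mask'])
print(sentence_embeddings.shape)  # (batch_size, hidden_size)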
# === monoT5 re-ranking: score query-passage pairs with castorini/monot5-base-msmarco ===
import torch
from transformers import AutoTokenizer, T5ForConditionalGeneration
passages = [
    ['7744105', 'For Earth-centered it was Geocentric Theory proposed by greeks under the guidance of Ptolemy and Sun-centered was Heliocentric theory proposed by Nicolas Copernicus in 16th century A.D. In short, Your Answers are: 1st blank - Geo-Centric Theory. 2nd blank - Heliocentric Theory.'],
    ['2593796', 'Copernicus proposed a heliocentric model of the solar system – a model where everything orbited around the Sun. Today, with advancements in science and technology, the geocentric model seems preposterous.he geocentric model, also known as the Ptolemaic system, is a theory that was developed by philosophers in Ancient Greece and was named after the philosopher Claudius Ptolemy who lived circa 90 to 168 A.D. It was developed to explain how the planets, the Sun, and even the stars orbit around the Earth.'],
    ['6217200', 'The geocentric model, also known as the Ptolemaic system, is a theory that was developed by philosophers in Ancient Greece and was named after the philosopher Claudius Ptolemy who lived circa 90 to 168 A.D. It was developed to explain how the planets, the Sun, and even the stars orbit around the Earth.opernicus proposed a heliocentric model of the solar system – a model where everything orbited around the Sun. Today, with advancements in science and technology, the geocentric model seems preposterous.'],
    ['3276925', 'Copernicus proposed a heliocentric model of the solar system – a model where everything orbited around the Sun. Today, with advancements in science and technology, the geocentric model seems preposterous.Simple tools, such as the telescope – which helped convince Galileo that the Earth was not the center of the universe – can prove that ancient theory incorrect.ou might want to check out one article on the history of the geocentric model and one regarding the geocentric theory. Here are links to two other articles from Universe Today on what the center of the universe is and Galileo one of the advocates of the heliocentric model.'],
    ['6217208', 'Copernicus proposed a heliocentric model of the solar system – a model where everything orbited around the Sun. Today, with advancements in science and technology, the geocentric model seems preposterous.Simple tools, such as the telescope – which helped convince Galileo that the Earth was not the center of the universe – can prove that ancient theory incorrect.opernicus proposed a heliocentric model of the solar system – a model where everything orbited around the Sun. Today, with advancements in science and technology, the geocentric model seems preposterous.'],
    ['4280557', 'The geocentric model, also known as the Ptolemaic system, is a theory that was developed by philosophers in Ancient Greece and was named after the philosopher Claudius Ptolemy who lived circa 90 to 168 A.D. It was developed to explain how the planets, the Sun, and even the stars orbit around the Earth.imple tools, such as the telescope – which helped convince Galileo that the Earth was not the center of the universe – can prove that ancient theory incorrect. You might want to check out one article on the history of the geocentric model and one regarding the geocentric theory.'],
    ['264181', 'Nicolaus Copernicus (b. 1473–d. 1543) was the first modern author to propose a heliocentric theory of the universe. From the time that Ptolemy of Alexandria (c. 150 CE) constructed a mathematically competent version of geocentric astronomy to Copernicus’s mature heliocentric version (1543), experts knew that the Ptolemaic system diverged from the geocentric concentric-sphere conception of Aristotle.'],
    ['4280558', 'A Geocentric theory is an astronomical theory which describes the universe as a Geocentric system, i.e., a system which puts the Earth in the center of the universe, and describes other objects from the point of view of the Earth. Geocentric theory is an astronomical theory which describes the universe as a Geocentric system, i.e., a system which puts the Earth in the center of the universe, and describes other objects from the point of view of the Earth.'],
    ['3276926', 'The geocentric model, also known as the Ptolemaic system, is a theory that was developed by philosophers in Ancient Greece and was named after the philosopher Claudius Ptolemy who lived circa 91 to 168 A.D. It was developed to explain how the planets, the Sun, and even the stars orbit around the Earth.ou might want to check out one article on the history of the geocentric model and one regarding the geocentric theory. Here are links to two other articles from Universe Today on what the center of the universe is and Galileo one of the advocates of the heliocentric model.'],
    ['5183032', "After 1,400 years, Copernicus was the first to propose a theory which differed from Ptolemy's geocentric system, according to which the earth is at rest in the center with the rest of the planets revolving around it."],
]
query = 'who proposed the geocentric theory'
pattern = "Query: {query} Document: {document} Relevant:"
texts = [p[1] for p in passages]

model_name = 'castorini/monot5-base-msmarco'
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained('t5-base', use_fast=False)
EOS = tokenizer.eos_token
MAX_LENGTH = 512

tokenizer_kwargs = {'return_attention_mask': True,
                    'padding': 'longest',
                    'truncation': True,
                    'max_length': MAX_LENGTH}
dataset = tokenizer.batch_encode_plus(
    [pattern.format(query=query, document=document) for document in texts],
    **tokenizer_kwargs)
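
# Quick check (a sketch): decode the first encoded example to confirm the filled-in
# monoT5 prompt, i.e. "Query: ... Document: ... Relevant:" followed by the tokenizer's EOS token.
print(tokenizer.decode(dataset['input_ids'][0]))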
@torch.no_grad()
def greedy_decode(model,
                  input_ids: torch.Tensor,
                  length: int,
                  attention_mask: torch.Tensor = None,
                  return_last_logits: bool = True):
    decode_ids = torch.full((input_ids.size(0), 1),
                            model.config.decoder_start_token_id,
                            dtype=torch.long).to(input_ids.device)
    encoder_outputs = model.get_encoder()(input_ids, attention_mask=attention_mask)
    next_token_logits = None
    for _ in range(length):
        model_inputs = model.prepare_inputs_for_generation(
            decode_ids,
            encoder_outputs=encoder_outputs,
            past=None,
            attention_mask=attention_mask,
            use_cache=True)
        outputs = model(**model_inputs)  # (batch_size, cur_len, vocab_size)
        next_token_logits = outputs[0][:, -1, :]  # (batch_size, vocab_size)
        decode_ids = torch.cat([decode_ids,
                                next_token_logits.max(1)[1].unsqueeze(-1)],
                               dim=-1)
    if return_last_logits:
        return decode_ids, next_token_logits
    return decode_ids
class MonoT5Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])


ds = MonoT5Dataset(dataset)
trainloader = torch.utils.data.DataLoader(ds, batch_size=32, shuffle=False, num_workers=8)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)
for batch in trainloader:
    input_ids = batch['input_ids'].to(device)
    attn_mask = batch['attention_mask'].to(device)
    _, batch_scores = greedy_decode(model,
                                    input_ids,
                                    length=1,
                                    attention_mask=attn_mask,
                                    return_last_logits=True)
    # 6136 and 1176 are the vocabulary ids of the tokens "false" and "true" in T5.
    batch_scores = batch_scores[:, [6136, 1176]]
    batch_scores = torch.nn.functional.log_softmax(batch_scores, dim=1)
    batch_log_probs = batch_scores[:, 1].tolist()
    for score in batch_log_probs:
        print(score)
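
# To turn the per-batch scores printed above into an actual ranking, collect the
# log-probability of "true" for every passage and sort. A minimal sketch; the
# `all_scores` list and the variable names below are introduced here for illustration.
all_scores = []
for batch in trainloader:
    input_ids = batch['input_ids'].to(device)
    attn_mask = batch['attention_mask'].to(device)
    _, logits = greedy_decode(model, input_ids, length=1,
                              attention_mask=attn_mask, return_last_logits=True)
    scores = torch.nn.functional.log_softmax(logits[:, [6136, 1176]], dim=1)
    all_scores.extend(scores[:, 1].tolist())

# Higher log-probability of "true" means the passage is judged more relevant to the query.
ranked = sorted(zip(passages, all_scores), key=lambda x: x[1], reverse=True)
for (doc_id, _), score in ranked[:3]:
    print(doc_id, round(score, 4))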
# === Retrieve & re-rank: bi-encoder semantic search over Simple English Wikipedia with cross-encoder re-ranking ===
""" | |
This examples demonstrates the setup for Query / Question-Answer-Retrieval. | |
You can input a query or a question. The script then uses semantic search | |
to find relevant passages in Simple English Wikipedia (as it is smaller and fits better in RAM). | |
For semantic search, we use SentenceTransformer('msmarco-distilbert-base-v2') and retrieve | |
100 potentially passages that answer the input query. | |
Next, we use a more powerful CrossEncoder (cross_encoder = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-6')) that | |
scores the query and all retrieved passages for their relevancy. The cross-encoder is neccessary to filter out certain noise | |
that might be retrieved from the semantic search step. | |
Google Colab Example: https://colab.research.google.com/drive/1l6stpYdRMmeDBK_vw0L5NitdiAuhdsAr?usp=sharing | |
""" | |
import json
import gzip
import os
import time
from pathlib import Path

import torch
from sentence_transformers import SentenceTransformer, CrossEncoder, util

# We use the bi-encoder to encode all passages, so that we can use them with semantic search
model_name = 'msmarco-distilbert-base-v2'
bi_encoder = SentenceTransformer(model_name)
top_k = 100  # Number of passages we want to retrieve with the bi-encoder

# The bi-encoder will retrieve 100 documents. We use a cross-encoder to re-rank the result list and improve quality
cross_encoder = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-6')

# As dataset, we use Simple English Wikipedia. Compared to the full English Wikipedia, it has only
# about 170k articles. We split these articles into paragraphs and encode them with the bi-encoder
wikipedia_filepath = 'data/simplewiki-2020-11-01.jsonl.gz'
if not os.path.exists(wikipedia_filepath):
    util.http_get('http://sbert.net/datasets/simplewiki-2020-11-01.jsonl.gz', wikipedia_filepath)
passages = []
with gzip.open(wikipedia_filepath, 'rt', encoding='utf8') as fIn:
    for line in fIn:
        data = json.loads(line.strip())
        passages.extend(data['paragraphs'])

# If you like, you can also limit the number of passages you want to use
print("Passages:", len(passages))

# To speed things up, we load cached embeddings from disk if they exist.
# The cached file contains the passages encoded with the model 'msmarco-distilbert-base-v2'
embeddings_filepath = f'{Path(wikipedia_filepath).stem}-{model_name}.pt'
if os.path.exists(embeddings_filepath):
    corpus_embeddings = torch.load(embeddings_filepath)
    corpus_embeddings = corpus_embeddings.float()  # Convert embeddings to float32
    if torch.cuda.is_available():
        corpus_embeddings = corpus_embeddings.to('cuda')
else:
    # Compute the corpus embeddings from scratch (this can take a while depending on the GPU) and cache them
    corpus_embeddings = bi_encoder.encode(passages, convert_to_tensor=True, show_progress_bar=True)
    torch.save(corpus_embeddings, embeddings_filepath)
while True:
    query = input("Please enter a question: ")

    # Encode the query using the bi-encoder and find potentially relevant passages
    start_time = time.time()
    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)
    hits = hits[0]  # Get the hits for the first query

    # Now, score all retrieved passages with the cross-encoder
    cross_inp = [[query, passages[hit['corpus_id']]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp)

    # Sort results by the cross-encoder scores
    for idx in range(len(cross_scores)):
        hits[idx]['cross-score'] = cross_scores[idx]
    hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
    end_time = time.time()

    # Output the top-5 hits
    print("Input question:", query)
    print("Results (after {:.3f} seconds):".format(end_time - start_time))
    for hit in hits[0:5]:
        print("\t{:.3f}\t{}".format(hit['cross-score'], passages[hit['corpus_id']]))
    print("\n\n========\n")