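"""Evaluate causal language models on the VMLU multiple-choice test set.

Reads test.jsonl from the data folder, prompts the model to answer each
question with one of the letters A-E, and writes the predicted letters to
./logs/<model-name>.csv.
"""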
import pandas as pd
import torch
import json
import logging
import os
import argparse
import time
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    AutoConfig,
)
from string import Template
from pathlib import Path
from tqdm import tqdm
import warnings

warnings.simplefilter("ignore")

# Disable PyTorch's memory-efficient and flash SDP attention kernels.
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)
def main(args):
    llm = args.llm
    device = args.device
    folder = args.folder
    path = llm.replace("/", "-")

    ## create directory
    directory_path = "./logs"
    if not os.path.exists(directory_path):
        os.makedirs(directory_path)

    # Configure logging
    logging.basicConfig(
        filename=f"./logs/{path}.log",
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s: %(message)s",
    )
    logging.info(f"Model name: {llm}")
    # Alternative loading path, kept for reference:
    # config = AutoConfig.from_pretrained(llm, trust_remote_code=True)
    # config.init_device = device
    # # config.attn_config['attn_impl'] = 'triton'  # Enable if "triton" installed!
    # model = AutoModelForCausalLM.from_pretrained(
    #     llm, config=config, torch_dtype=torch.bfloat16, trust_remote_code=True
    # )
    # model.to(device)

    base_model = llm
    model = AutoModelForCausalLM.from_pretrained(
        base_model,
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        device_map="auto",
    )
    model.config.use_cache = False

    # sealion and Qwen ship custom tokenizer code, so they need
    # trust_remote_code; other models load with the defaults.
    if "sealion" in llm or "Qwen" in llm:
        tokenizer = AutoTokenizer.from_pretrained(llm, trust_remote_code=True)
    else:
        tokenizer = AutoTokenizer.from_pretrained(llm)
    # Read the test questions from JSONL files; questions with fewer than
    # five options get "" for the missing letters.
    ids = []
    questions = []
    choices_by_letter = {letter: [] for letter in "ABCDE"}

    data_path = Path(folder)
    jsonl_files = list(data_path.glob("test.jsonl"))
    for file in jsonl_files:
        with open(file, "r", encoding="utf-8") as f:
            for line in f:
                data = json.loads(line)
                ids.append(data["id"])
                questions.append(data["question"])
                # Pad to five options so every row has A..E columns.
                choices = data["choices"] + [""] * (5 - len(data["choices"]))
                for letter, choice in zip("ABCDE", choices):
                    choices_by_letter[letter].append(choice)

    # Create a DataFrame
    df = pd.DataFrame(
        {
            "id": ids,
            "prompt": questions,
            **choices_by_letter,
        }
    )
    logging.info(df.head())
    # The instruction stays in Vietnamese because VMLU is a Vietnamese
    # benchmark. English gloss: "Only give the letter preceding the correct
    # answer (A, B, C, D or E) of the following multiple-choice question:";
    # "Đáp án:" means "Answer:".
    preamble = "Chỉ đưa ra chữ cái đứng trước câu trả lời đúng (A, B, C, D hoặc E) của câu hỏi trắc nghiệm sau: "
    template = Template("$preamble\n\n$prompt\n\n $a\n $b\n $c\n $d\n $e\nĐáp án:")
    def format_input(df, idx):
        prompt = df.loc[idx, "prompt"]
        a = df.loc[idx, "A"]
        b = df.loc[idx, "B"]
        c = df.loc[idx, "C"]
        d = df.loc[idx, "D"]
        e = df.loc[idx, "E"]
        input_text = template.substitute(
            preamble=preamble, prompt=prompt, a=a, b=b, c=c, d=d, e=e
        )
        return input_text
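    # For illustration, format_input produces a prompt shaped like the
    # following (angle-bracket placeholders stand for the row values):
    #
    #   <preamble>
    #
    #   <question>
    #
    #    <option A>
    #    <option B>
    #    <option C>
    #    <option D>
    #    <option E>
    #   Đáp án: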
    def generate_response(question: str):
        messages = [
            {"role": "system", "content": ""},
            {"role": "user", "content": question},
        ]
        prompt = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            add_special_tokens=False,
        )
        # Move tensors to the target device ('.to(device)' rather than
        # '.cuda()', so a CPU run does not crash).
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        outputs = model.generate(
            **inputs,
            max_new_tokens=5,
        )
        # Decode only the newly generated tokens, not the echoed prompt.
        # results = tokenizer.batch_decode(outputs)[0]
        results = tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[1]:])[0]
        return results
    # Run one toy example first to sanity-check prompting and decoding.
    if "ghost" in llm:
        outputs = generate_response(question=format_input(df, 0))
        answer = (
            outputs.replace("*", "").replace("-", "").replace(" ", "").replace(".", "")
        )
        answer = answer[0]
    else:
        if "falcon" in llm:
            inputs = tokenizer(
                format_input(df, 0), return_tensors="pt", return_token_type_ids=False
            ).to(device)
            outputs = model.generate(
                **inputs, pad_token_id=tokenizer.eos_token_id, max_new_tokens=1
            )
        elif "sealion" in llm:
            inputs = tokenizer(format_input(df, 0), return_tensors="pt").to(device)
            outputs = model.generate(inputs["input_ids"], max_new_tokens=1)
        else:
            inputs = tokenizer(format_input(df, 0), return_tensors="pt").to(device)
            outputs = model.generate(**inputs, max_new_tokens=1)
        answer = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    logging.info("Constructed a toy example")
    logging.info("Generated answer: %s", answer)
    answers = []
    start = time.time()
    for idx in tqdm(df.index):
        if "ghost" in llm:
            outputs = generate_response(question=format_input(df, idx))
            # Strip formatting characters; the first remaining character is
            # taken as the answer letter.
            answer = (
                outputs.replace("*", "")
                .replace("-", "")
                .replace(" ", "")
                .replace(".", "")
            )
            answer = answer[0]
            answers.append(answer)
            if idx == 0 or idx == 10 or idx % 100 == 0:
                print("\n", outputs, [answer])
        else:
            if "falcon" in llm:
                inputs = tokenizer(
                    format_input(df, idx),
                    return_tensors="pt",
                    return_token_type_ids=False,
                ).to(device)
                outputs = model.generate(
                    **inputs, pad_token_id=tokenizer.eos_token_id, max_new_tokens=1
                )
            elif "sealion" in llm:
                inputs = tokenizer(format_input(df, idx), return_tensors="pt").to(
                    device
                )
                outputs = model.generate(inputs["input_ids"], max_new_tokens=1)
            else:
                inputs = tokenizer(format_input(df, idx), return_tensors="pt").to(
                    device
                )
                outputs = model.generate(**inputs, max_new_tokens=1)
            # The decoded output ends with the generated token; take the last
            # whitespace-separated token as the answer letter.
            answer_decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            last_element = answer_decoded[-1]
            answer = last_element.split()[-1]
            answers.append(answer)
    end = time.time()
    duration = end - start
    print("Time taken for running inference: ", duration)

    df["answer"] = answers
    logging.info(df.head())
    # Save the predicted answers as a CSV next to the log file.
    df[["id", "answer"]].to_csv(f"./logs/{path}.csv", index=False)
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Evaluate a causal LLM on the VMLU multiple-choice test set"
    )
    # Add command-line arguments
    parser.add_argument(
        "--llm",
        type=str,
        default="bigscience/bloom-1b7",
        help="Specify the llm value (default: bigscience/bloom-1b7)",
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cuda" if torch.cuda.is_available() else "cpu",
        help="Specify the device",
    )
    parser.add_argument(
        "--folder", type=str, default="./vmlu_v1.5/", help="Specify the data folder"
    )
    # Parse the command-line arguments
    args = parser.parse_args()
    # Call the main function with the parsed arguments
    main(args)
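# Example invocation (the filename "run_vmlu.py" is illustrative, not part of
# the original gist; the model and folder shown are the script defaults):
#   python run_vmlu.py --llm bigscience/bloom-1b7 --folder ./vmlu_v1.5/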