Last active
December 20, 2024 17:05
-
-
Save shawngraham/05481c1a52b74d3686e234420323b3ae to your computer and use it in GitHub Desktop.
I fine-tuned smol-135 on archaeological metadata from ADS reports to make an archaeological metadata extractor. This code snippet passes a row of downloaded data (where all the fields have been smooshed together, i.e., dirty data!) through both smol-135 and my fine-tuned version so you can see the difference.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Here we load the models from Hugging Face and run them on the same input
# so you can see the difference that fine-tuning the original smol-135 makes.
# The fine-tuned model was trained on free-tier Google Colab, for not very long, on 800 rows of training data that I
# wrangled into the correct shape. Training scripts etc. will be shared in due course, but not bad for a first stab, eh?
import json

import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
def generate_response_pipeline(pipeline, input_text, max_length=500):
    """
    Generate a response using a Hugging Face pipeline.

    Args:
        pipeline: Callable invoked as ``pipeline(input_text, max_length=...)``
            and expected to return a sequence whose first item is a mapping
            with a ``'generated_text'`` key. Inside this function the
            parameter shadows the ``pipeline`` factory imported at file level.
        input_text: Prompt passed to the pipeline unchanged.
        max_length: Forwarded to the pipeline as ``max_length``.

    Returns:
        The ``'generated_text'`` value of the first result.
    """
    # Only the first returned candidate is used.
    response = pipeline(input_text, max_length=max_length)[0]['generated_text']
    return response
def generate_response(model, tokenizer, input_text, max_length=500):
    """
    Generate a response by calling the model directly.

    Args:
        model: Object exposing ``device`` and ``generate(**inputs, max_length=...)``.
        tokenizer: Callable that encodes ``input_text`` (called with
            ``return_tensors="pt"``) and exposes ``decode``.
        input_text: Prompt to encode and feed to the model.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The first generated sequence decoded with special tokens skipped.
    """
    # Encode the prompt and move the encoded inputs onto the model's device.
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    # Inference only: no gradients are needed while generating.
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=max_length)

    # Decode only the first sequence of the batch.
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response
def load_and_prepare_model(model_path, model_id=None, use_merged = True):
    """
    Load and prepare either the merged model or the base model plus LoRA weights.

    Args:
        model_path: Location of the merged model when ``use_merged`` is true,
            otherwise the location of the LoRA weights.
        model_id: Identifier of the base model. Only used, and then required,
            when ``use_merged`` is false.
        use_merged: If true, load the model and tokenizer straight from
            ``model_path``. If false, load the base model and tokenizer from
            ``model_id`` and wrap the model with the weights at ``model_path``.

    Returns:
        A ``(model, tokenizer)`` tuple. The tokenizer's ``pad_token`` is set
        to its ``eos_token``.

    Raises:
        ValueError: If ``use_merged`` is false and ``model_id`` is ``None``.
    """
    if not use_merged and model_id is None:
        # Fail early with a clear message instead of passing None to from_pretrained.
        raise ValueError("model_id is required when use_merged is False")

    if use_merged:
        # Load the merged model directly
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            device_map="auto",
            torch_dtype=torch.float16
        )
        tokenizer = AutoTokenizer.from_pretrained(model_path)
    else:
        # Load base model and LoRA weights
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="auto",
            torch_dtype=torch.float16
        )
        model = PeftModel.from_pretrained(model, model_path)
        # In the LoRA case the tokenizer comes from the base model.
        tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Use the end-of-sequence token as the padding token.
    tokenizer.pad_token = tokenizer.eos_token
    return model, tokenizer
def main():
    """
    Run one prompt through the base and fine-tuned models and print each output.

    Loads the base model, the fine-tuned merged model, and a text-generation
    pipeline for the fine-tuned model, then prints the prompt followed by the
    three generated responses.
    """
    # Model names
    base_model_id = "HuggingFaceTB/SmolLM-135M"
    fine_tuned_model_id = "sgraham/merged_archae_metadata_model"

    # Load the base model
    base_model, base_tokenizer = load_and_prepare_model(model_path=base_model_id, model_id=base_model_id, use_merged = True)

    # Load the fine-tuned model
    ft_model, ft_tokenizer = load_and_prepare_model(model_path=fine_tuned_model_id, model_id=base_model_id, use_merged = True)

    # Load the pipeline
    ft_pipeline = pipeline("text-generation", model=fine_tuned_model_id, device_map="auto", torch_dtype=torch.float16)

    # Test example (uses same format as used in fine-tuning)
    # location, companies, etc. are all fake data here.
    prompt = """<|system|>You are a helpful archaeological assistant trained to categorize archaeological reports.\n<|user|>Please categorize this archaeological report metadata; return json: Report from Old Quarry Field at Emberton "This report documents archaeological findings from a survey led by Granite Digs at Hidden Valley located near Oakhaven. Key results include pottery analysis details of a potential burial ground and a system of old roads." Ironmill Greenfield Emberton England 21483571 27700:390648 Subject:Archaeology Subject:Sherd Period:-800 - 1800 Subject:Excavations (Archaeology)--England Subject:Ditch Subject:Pit Subject:Strip Map And Sample Subject:Field Observation (Monitoring) Subject:Excavations (Archaeology)--England Period:ROMAN Associated ID: FAKE3 Import RCN: B45-546542 Creator:Starlight Research\n"""

    # Generate and compare
    print(f"Input Prompt: {prompt}")

    base_response = generate_response(base_model, base_tokenizer, prompt)
    print(f"\nBase Model Response:\n{base_response}")

    ft_response_direct = generate_response(ft_model, ft_tokenizer, prompt)
    print(f"\nFine-tuned Model Response (Direct):\n{ft_response_direct}")

    ft_response_pipeline = generate_response_pipeline(ft_pipeline, prompt)
    print(f"\nFine-tuned Model Response (Pipeline):\n{ft_response_pipeline}")
# Run the comparison only when this file is executed as a script.
if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Device set to use cuda:0 | |
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation. | |
Input Prompt: <|system|>You are a helpful archaeological assistant trained to categorize archaeological reports. | |
<|user|>Please categorize this archaeological report metadata; return json: Report from Old Quarry Field at Emberton "This report documents archaeological findings from a survey led by Granite Digs at Hidden Valley located near Oakhaven. Key results include pottery analysis details of a potential burial ground and a system of old roads." Ironmill Greenfield Emberton England 21483571 27700:390648 Subject:Archaeology Subject:Sherd Period:-800 - 1800 Subject:Excavations (Archaeology)--England Subject:Ditch Subject:Pit Subject:Strip Map And Sample Subject:Field Observation (Monitoring) Subject:Excavations (Archaeology)--England Period:ROMAN Associated ID: FAKE3 Import RCN: B45-546542 Creator:Starlight Research | |
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation. | |
Base Model Response: | |
<|system|>You are a helpful archaeological assistant trained to categorize archaeological reports. | |
<|user|>Please categorize this archaeological report metadata; return json: Report from Old Quarry Field at Emberton "This report documents archaeological findings from a survey led by Granite Digs at Hidden Valley located near Oakhaven. Key results include pottery analysis details of a potential burial ground and a system of old roads." Ironmill Greenfield Emberton England 21483571 27700:390648 Subject:Archaeology Subject:Sherd Period:-800 - 1800 Subject:Excavations (Archaeology)--England Subject:Ditch Subject:Pit Subject:Strip Map And Sample Subject:Field Observation (Monitoring) Subject:Excavations (Archaeology)--England Period:ROMAN Associated ID: FAKE3 Import RCN: B45-546542 Creator:Starlight Research | |
<|user|>Please categorize this archaeological report metadata; return json: Report from Old Quarry Field at Emberton "This report documents archaeological findings from a survey led by Granite Digs at Hidden Valley located near Oakhaven. Key results include pottery analysis details of a potential burial ground and a system of old roads." Ironmill Greenfield Emberton England 21483571 27700:390648 Subject:Archaeology Subject:Sherd Period:-800 - 1800 Subject:Excavations (Archaeology)--England Subject:Ditch Subject:Pit Subject:Strip Map And Sample Subject:Field Observation (Monitoring) Subject:Excavations (Archaeology)--England Period:ROMAN Associated ID: FAKE3 Import RCN: B45-546542 Creator:Starlight Research | |
<|user|>Please categorize this archaeological report metadata; return json: Report from Old Quarry Field at Emberton "This report documents archaeological findings from a survey led by Granite Digs at Hidden Valley located near Oakhaven. Key results include pottery analysis details of a potential burial ground and a system of old roads." Ironmill Greenfield Emberton England 21483571 27700:390648 Subject:Archae | |
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`. | |
Fine-tuned Model Response (Direct): | |
<|system|>You are a helpful archaeological assistant trained to categorize archaeological reports. | |
<|user|>Please categorize this archaeological report metadata; return json: Report from Old Quarry Field at Emberton "This report documents archaeological findings from a survey led by Granite Digs at Hidden Valley located near Oakhaven. Key results include pottery analysis details of a potential burial ground and a system of old roads." Ironmill Greenfield Emberton England 21483571 27700:390648 Subject:Archaeology Subject:Sherd Period:-800 - 1800 Subject:Excavations (Archaeology)--England Subject:Ditch Subject:Pit Subject:Strip Map And Sample Subject:Field Observation (Monitoring) Subject:Excavations (Archaeology)--England Period:ROMAN Associated ID: FAKE3 Import RCN: B45-546542 Creator:Starlight Research | |
<|assistant|>{"subjects": ["Excavation", "Pit", "Iron Age", "Romano-British", "Polly", "Field System"], "periods": ["ROMAN", "IRON AGE"], "work_conducted_by": ["Publisher:Archaeology Data Service", "Creator:Starlight Research"], "location": {"civil_parish": "Hermon", "admin_county": "North Yorkshire"}} | |
<|assistant|>{"work_conducted_by": ["Publisher:Archaeology South East", "Creator:Archaeology South East"], "location": {"civil_parish": "Hermon", "admin_county": "North Yorkshire"}} | |
<|assistant|>{"work_conducted_by": ["Publisher:Archaeology South East", "Creator:Archaeology South East"], "location": {"civil_parish": "Ottawa", "admin_county": "North Yorkshire"}} | |
<|assistant|>{"work_conducted_by": ["Publisher:Archaeology South East", "Creator:Archaeology South East"], "location": {"civil_parish": "Ottawa", "admin_county": "North Yorkshire"}} | |
<|assistant|>{"work_conducted_by": ["Publisher:Archae | |
Fine-tuned Model Response (Pipeline): | |
<|system|>You are a helpful archaeological assistant trained to categorize archaeological reports. | |
<|user|>Please categorize this archaeological report metadata; return json: Report from Old Quarry Field at Emberton "This report documents archaeological findings from a survey led by Granite Digs at Hidden Valley located near Oakhaven. Key results include pottery analysis details of a potential burial ground and a system of old roads." Ironmill Greenfield Emberton England 21483571 27700:390648 Subject:Archaeology Subject:Sherd Period:-800 - 1800 Subject:Excavations (Archaeology)--England Subject:Ditch Subject:Pit Subject:Strip Map And Sample Subject:Field Observation (Monitoring) Subject:Excavations (Archaeology)--England Period:ROMAN Associated ID: FAKE3 Import RCN: B45-546542 Creator:Starlight Research | |
<|assistant|>{"subjects": ["Excavation", "Pit", "Iron Age", "Romano-British", "Polly", "Field System"], "periods": ["ROMAN", "IRON AGE"], "work_conducted_by": ["Publisher:Archaeology Data Service", "Creator:Starlight Research"], "location": {"civil_parish": "Hermon", "admin_county": "North Yorkshire"}} | |
<|assistant|>{"work_conducted_by": ["Publisher:Archaeology South East", "Creator:Archaeology South East"], "location": {"civil_parish": "Hermon", "admin_county": "North Yorkshire"}} | |
<|assistant|>{"work_conducted_by": ["Publisher:Archaeology South East", "Creator:Archaeology South East"], "location": {"civil_parish": "Ottawa", "admin_county": "North Yorkshire"}} | |
<|assistant|>{"work_conducted_by": ["Publisher:Archaeology South East", "Creator:Archaeology South East"], "location": {"civil_parish": "Ottawa", "admin_county": "North Yorkshire"}} | |
<|assistant|>{"work_conducted_by": ["Publisher:Archae |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment