Skip to content

Instantly share code, notes, and snippets.

@shawngraham
Last active December 20, 2024 17:05
Show Gist options
  • Save shawngraham/05481c1a52b74d3686e234420323b3ae to your computer and use it in GitHub Desktop.
Save shawngraham/05481c1a52b74d3686e234420323b3ae to your computer and use it in GitHub Desktop.
I fine-tuned SmolLM-135M on archaeological metadata from ADS reports to make an archaeological metadata extractor. This code snippet passes a row of downloaded data (where all the fields have been smooshed together, i.e., dirty data!) through both the original SmolLM-135M and my fine-tuned version so you can see the difference.
# Here we load the models from Hugging Face and run them on the same prompt,
# so you can see the difference that fine-tuning the original SmolLM-135M makes.
# The model was trained on free-tier Google Colab, for not very long, on 800 rows of training data that I
# wrangled into the correct shape. Training scripts etc. will be shared in due course, but not bad for a first stab, eh?
import torch
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import json
def generate_response_pipeline(pipeline, input_text, max_length=500):
    """Run a prompt through a text-generation pipeline and return its text.

    Args:
        pipeline: Callable invoked as ``pipeline(input_text, max_length=...)``.
            Its result is indexed like a sequence of dicts that carry a
            ``'generated_text'`` key.
        input_text: Prompt handed to the pipeline.
        max_length: Forwarded unchanged as the ``max_length`` keyword.

    Returns:
        The ``'generated_text'`` value of the first result.
    """
    results = pipeline(input_text, max_length=max_length)
    # Only the first candidate is used; any further results are ignored.
    first_result = results[0]
    return first_result['generated_text']
def generate_response(model, tokenizer, input_text, max_length=500):
    """Generate text from ``model`` for a single prompt.

    Args:
        model: Object exposing a ``device`` attribute and a
            ``generate(**inputs, max_length=...)`` method.
        tokenizer: Callable that encodes ``input_text`` (called with
            ``return_tensors="pt"``) and offers ``decode``.
        input_text: Prompt to encode and feed to the model.
        max_length: Forwarded unchanged to ``model.generate``.

    Returns:
        The first generated sequence, decoded with
        ``skip_special_tokens=True``.
    """
    encoded = tokenizer(input_text, return_tensors="pt")
    # Move the encoded prompt to wherever the model lives before generating.
    encoded = encoded.to(model.device)
    # Generation here is inference only, so gradient tracking is switched off.
    with torch.no_grad():
        generated = model.generate(**encoded, max_length=max_length)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)
def load_and_prepare_model(model_path, model_id=None, use_merged=True):
    """Load a causal language model and its tokenizer.

    Two layouts are supported:

    * merged (``use_merged=True``): ``model_path`` is loaded directly as a
      complete model, and the tokenizer is loaded from ``model_path`` too.
    * LoRA (``use_merged=False``): the base model and tokenizer are loaded
      from ``model_id``, then the adapter weights at ``model_path`` are
      applied on top with ``PeftModel.from_pretrained``.

    Args:
        model_path: Location of the merged model, or of the LoRA adapter
            weights when ``use_merged`` is false.
        model_id: Base model identifier. Required when ``use_merged`` is
            false; unused otherwise.
        use_merged: Selects between the two layouts described above.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        ValueError: If ``use_merged`` is false and ``model_id`` is ``None``.
    """
    if not use_merged and model_id is None:
        # Fail early with a clear message instead of handing None to
        # from_pretrained and surfacing an unrelated-looking loader error.
        raise ValueError(
            "model_id is required when use_merged is False: it names the "
            "base model the LoRA weights are applied to"
        )

    # The model weights and the tokenizer always come from the same
    # checkpoint: the merged model itself, or the LoRA base model.
    checkpoint = model_path if use_merged else model_id

    model = AutoModelForCausalLM.from_pretrained(
        checkpoint,
        device_map="auto",
        torch_dtype=torch.float16
    )
    if not use_merged:
        # Apply the LoRA adapter weights on top of the freshly loaded base.
        model = PeftModel.from_pretrained(model, model_path)

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    # Reuse the end-of-sequence token for padding (always overwrites any
    # pad token the tokenizer already defines).
    tokenizer.pad_token = tokenizer.eos_token
    return model, tokenizer
def main():
    """Compare the base and the fine-tuned model on one sample prompt.

    Loads the base checkpoint, the merged fine-tuned checkpoint, and a
    text-generation pipeline built from the fine-tuned checkpoint, then
    prints the prompt followed by each of the three responses.
    """
    # Checkpoint identifiers
    base_checkpoint = "HuggingFaceTB/SmolLM-135M"
    finetuned_checkpoint = "sgraham/merged_archae_metadata_model"

    # Both checkpoints are loaded as complete (merged) models.
    base_pair = load_and_prepare_model(
        model_path=base_checkpoint,
        model_id=base_checkpoint,
        use_merged=True,
    )
    finetuned_pair = load_and_prepare_model(
        model_path=finetuned_checkpoint,
        model_id=base_checkpoint,
        use_merged=True,
    )

    # Second route to the fine-tuned model: a ready-made generation pipeline.
    finetuned_pipe = pipeline(
        "text-generation",
        model=finetuned_checkpoint,
        device_map="auto",
        torch_dtype=torch.float16,
    )

    # Test example (uses same format as used in fine-tuning).
    # Location, companies, etc. are all fake data here.
    prompt = """<|system|>You are a helpful archaeological assistant trained to categorize archaeological reports.\n<|user|>Please categorize this archaeological report metadata; return json: Report from Old Quarry Field at Emberton "This report documents archaeological findings from a survey led by Granite Digs at Hidden Valley located near Oakhaven. Key results include pottery analysis details of a potential burial ground and a system of old roads." Ironmill Greenfield Emberton England 21483571 27700:390648 Subject:Archaeology Subject:Sherd Period:-800 - 1800 Subject:Excavations (Archaeology)--England Subject:Ditch Subject:Pit Subject:Strip Map And Sample Subject:Field Observation (Monitoring) Subject:Excavations (Archaeology)--England Period:ROMAN Associated ID: FAKE3 Import RCN: B45-546542 Creator:Starlight Research\n"""

    # Generate and compare
    print(f"Input Prompt: {prompt}")

    base_model, base_tokenizer = base_pair
    base_text = generate_response(base_model, base_tokenizer, prompt)
    print(f"\nBase Model Response:\n{base_text}")

    ft_model, ft_tokenizer = finetuned_pair
    direct_text = generate_response(ft_model, ft_tokenizer, prompt)
    print(f"\nFine-tuned Model Response (Direct):\n{direct_text}")

    pipeline_text = generate_response_pipeline(finetuned_pipe, prompt)
    print(f"\nFine-tuned Model Response (Pipeline):\n{pipeline_text}")


if __name__ == "__main__":
    main()
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Input Prompt: <|system|>You are a helpful archaeological assistant trained to categorize archaeological reports.
<|user|>Please categorize this archaeological report metadata; return json: Report from Old Quarry Field at Emberton "This report documents archaeological findings from a survey led by Granite Digs at Hidden Valley located near Oakhaven. Key results include pottery analysis details of a potential burial ground and a system of old roads." Ironmill Greenfield Emberton England 21483571 27700:390648 Subject:Archaeology Subject:Sherd Period:-800 - 1800 Subject:Excavations (Archaeology)--England Subject:Ditch Subject:Pit Subject:Strip Map And Sample Subject:Field Observation (Monitoring) Subject:Excavations (Archaeology)--England Period:ROMAN Associated ID: FAKE3 Import RCN: B45-546542 Creator:Starlight Research
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Base Model Response:
<|system|>You are a helpful archaeological assistant trained to categorize archaeological reports.
<|user|>Please categorize this archaeological report metadata; return json: Report from Old Quarry Field at Emberton "This report documents archaeological findings from a survey led by Granite Digs at Hidden Valley located near Oakhaven. Key results include pottery analysis details of a potential burial ground and a system of old roads." Ironmill Greenfield Emberton England 21483571 27700:390648 Subject:Archaeology Subject:Sherd Period:-800 - 1800 Subject:Excavations (Archaeology)--England Subject:Ditch Subject:Pit Subject:Strip Map And Sample Subject:Field Observation (Monitoring) Subject:Excavations (Archaeology)--England Period:ROMAN Associated ID: FAKE3 Import RCN: B45-546542 Creator:Starlight Research
<|user|>Please categorize this archaeological report metadata; return json: Report from Old Quarry Field at Emberton "This report documents archaeological findings from a survey led by Granite Digs at Hidden Valley located near Oakhaven. Key results include pottery analysis details of a potential burial ground and a system of old roads." Ironmill Greenfield Emberton England 21483571 27700:390648 Subject:Archaeology Subject:Sherd Period:-800 - 1800 Subject:Excavations (Archaeology)--England Subject:Ditch Subject:Pit Subject:Strip Map And Sample Subject:Field Observation (Monitoring) Subject:Excavations (Archaeology)--England Period:ROMAN Associated ID: FAKE3 Import RCN: B45-546542 Creator:Starlight Research
<|user|>Please categorize this archaeological report metadata; return json: Report from Old Quarry Field at Emberton "This report documents archaeological findings from a survey led by Granite Digs at Hidden Valley located near Oakhaven. Key results include pottery analysis details of a potential burial ground and a system of old roads." Ironmill Greenfield Emberton England 21483571 27700:390648 Subject:Archae
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Fine-tuned Model Response (Direct):
<|system|>You are a helpful archaeological assistant trained to categorize archaeological reports.
<|user|>Please categorize this archaeological report metadata; return json: Report from Old Quarry Field at Emberton "This report documents archaeological findings from a survey led by Granite Digs at Hidden Valley located near Oakhaven. Key results include pottery analysis details of a potential burial ground and a system of old roads." Ironmill Greenfield Emberton England 21483571 27700:390648 Subject:Archaeology Subject:Sherd Period:-800 - 1800 Subject:Excavations (Archaeology)--England Subject:Ditch Subject:Pit Subject:Strip Map And Sample Subject:Field Observation (Monitoring) Subject:Excavations (Archaeology)--England Period:ROMAN Associated ID: FAKE3 Import RCN: B45-546542 Creator:Starlight Research
<|assistant|>{"subjects": ["Excavation", "Pit", "Iron Age", "Romano-British", "Polly", "Field System"], "periods": ["ROMAN", "IRON AGE"], "work_conducted_by": ["Publisher:Archaeology Data Service", "Creator:Starlight Research"], "location": {"civil_parish": "Hermon", "admin_county": "North Yorkshire"}}
<|assistant|>{"work_conducted_by": ["Publisher:Archaeology South East", "Creator:Archaeology South East"], "location": {"civil_parish": "Hermon", "admin_county": "North Yorkshire"}}
<|assistant|>{"work_conducted_by": ["Publisher:Archaeology South East", "Creator:Archaeology South East"], "location": {"civil_parish": "Ottawa", "admin_county": "North Yorkshire"}}
<|assistant|>{"work_conducted_by": ["Publisher:Archaeology South East", "Creator:Archaeology South East"], "location": {"civil_parish": "Ottawa", "admin_county": "North Yorkshire"}}
<|assistant|>{"work_conducted_by": ["Publisher:Archae
Fine-tuned Model Response (Pipeline):
<|system|>You are a helpful archaeological assistant trained to categorize archaeological reports.
<|user|>Please categorize this archaeological report metadata; return json: Report from Old Quarry Field at Emberton "This report documents archaeological findings from a survey led by Granite Digs at Hidden Valley located near Oakhaven. Key results include pottery analysis details of a potential burial ground and a system of old roads." Ironmill Greenfield Emberton England 21483571 27700:390648 Subject:Archaeology Subject:Sherd Period:-800 - 1800 Subject:Excavations (Archaeology)--England Subject:Ditch Subject:Pit Subject:Strip Map And Sample Subject:Field Observation (Monitoring) Subject:Excavations (Archaeology)--England Period:ROMAN Associated ID: FAKE3 Import RCN: B45-546542 Creator:Starlight Research
<|assistant|>{"subjects": ["Excavation", "Pit", "Iron Age", "Romano-British", "Polly", "Field System"], "periods": ["ROMAN", "IRON AGE"], "work_conducted_by": ["Publisher:Archaeology Data Service", "Creator:Starlight Research"], "location": {"civil_parish": "Hermon", "admin_county": "North Yorkshire"}}
<|assistant|>{"work_conducted_by": ["Publisher:Archaeology South East", "Creator:Archaeology South East"], "location": {"civil_parish": "Hermon", "admin_county": "North Yorkshire"}}
<|assistant|>{"work_conducted_by": ["Publisher:Archaeology South East", "Creator:Archaeology South East"], "location": {"civil_parish": "Ottawa", "admin_county": "North Yorkshire"}}
<|assistant|>{"work_conducted_by": ["Publisher:Archaeology South East", "Creator:Archaeology South East"], "location": {"civil_parish": "Ottawa", "admin_county": "North Yorkshire"}}
<|assistant|>{"work_conducted_by": ["Publisher:Archae
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment