@thekaranacharya
Last active July 4, 2024 06:48
Implementation: Using Adapter layers to fine-tune LLMs as described in [Parameter-Efficient Transfer Learning for NLP](https://arxiv.org/pdf/1902.00751)
# Imports
import torch
from functools import partial
from transformers import AutoModelForSequenceClassification
# Classes
# Define the Adapter layer
class Adapter(torch.nn.Module):
    """
    Implements the Adapter layer as described in the paper.
    Architecture:
    Linear (down-projection) -> GELU (non-linearity) -> Linear (up-projection)
    """

    def __init__(self, linear_out_dim: int, bottleneck_dim: int):
        """
        Args:
            linear_out_dim: int
                Output dimension of the preceding linear layer
            bottleneck_dim: int
                Dimension of the bottleneck layer
        """
        super().__init__()
        self.linear1 = torch.nn.Linear(
            linear_out_dim, bottleneck_dim
        )  # Feedforward down-project
        self.gelu = torch.nn.GELU()  # Non-linearity
        self.linear2 = torch.nn.Linear(
            bottleneck_dim, linear_out_dim
        )  # Feedforward up-project

    def forward(self, x):
        """
        Forward propagation of the Adapter layer
        """
        residual = x  # Skip connection around the adapter
        x = self.gelu(self.linear1(x))
        x = self.linear2(x)
        return x + residual
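
# Illustrative sanity check (not part of the original gist): because of the
# residual connection, an Adapter maps inputs back to the same feature
# dimension. Here we assume DistilBERT's hidden size (768) and a bottleneck of 16.
_adapter = Adapter(linear_out_dim=768, bottleneck_dim=16)
assert _adapter(torch.rand(2, 128, 768)).shape == (2, 128, 768)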
# Define the AdaptedLinear layer
class AdaptedLinear(torch.nn.Module):
    """
    Wraps a linear layer and appends an Adapter layer to its output
    """

    def __init__(self, linear, bottleneck_dim):
        """
        Args:
            linear: torch.nn.Linear
                Linear layer to which the Adapter layer is to be added
            bottleneck_dim: int
                Dimension of the bottleneck layer
        """
        super().__init__()
        self.linear = linear
        self.adapter = Adapter(linear.out_features, bottleneck_dim)

    def forward(self, x):
        """
        Forward propagation of the AdaptedLinear layer
        """
        x = self.linear(x)  # Original (frozen) linear layer
        return self.adapter(x)  # Adapter layer
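
# Illustrative usage (not part of the original gist): wrapping a plain
# torch.nn.Linear keeps its output shape, with the adapter applied on top.
_wrapped = AdaptedLinear(torch.nn.Linear(768, 768), bottleneck_dim=16)
assert _wrapped(torch.rand(2, 128, 768)).shape == (2, 128, 768)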
###################
model_uri = "distilbert/distilbert-base-uncased"
num_classes = 2
bottleneck_dim = 16
# Initialise the model
model = AutoModelForSequenceClassification.from_pretrained(
    model_uri, num_labels=num_classes
)

# Freeze all the layers
for param in model.parameters():
    param.requires_grad = False
# Factory that wraps a given Linear layer using a fixed bottleneck dimension
adapted_linear = partial(AdaptedLinear, bottleneck_dim=bottleneck_dim)
# Replace some Linear layers within the TransformerBlock with AdaptedLinear
# As specified in the paper
for block in model.distilbert.transformer.layer:
    ## Transformer Block: Multi-head Self-Attention block
    block.attention.out_lin = adapted_linear(block.attention.out_lin)
    ## Transformer Block: Feed-forward block
    block.ffn.lin2 = adapted_linear(block.ffn.lin2)
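
# Illustrative check (not part of the original gist): every TransformerBlock
# should now expose AdaptedLinear modules in place of the original Linears.
for block in model.distilbert.transformer.layer:
    assert isinstance(block.attention.out_lin, AdaptedLinear)
    assert isinstance(block.ffn.lin2, AdaptedLinear)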
# Unfreeze (sa_layer_norm) and (output_layer_norm) in each TransformerBlock
# (requires_grad_ sets requires_grad on all of the module's parameters)
for block in model.distilbert.transformer.layer:
    block.sa_layer_norm.requires_grad_(True)
    block.output_layer_norm.requires_grad_(True)
# Unfreeze the final classification layer
for param in model.classifier.parameters():
    param.requires_grad = True
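
# Illustrative summary (not part of the original gist): only the adapters,
# the per-block LayerNorms and the classification head remain trainable,
# a small fraction of the full model's parameters.
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())
print(f"Trainable parameters: {trainable_params:,} / {total_params:,} "
      f"({100 * trainable_params / total_params:.2f}%)")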