Implementation: Using Adapter layers to fine-tune LLMs as described in [Parameter-Efficient Transfer Learning for NLP](https://arxiv.org/pdf/1902.00751)
# Imports
import torch
from functools import partial
from transformers import AutoModelForSequenceClassification


# Classes
# Define the Adapter layer
class Adapter(torch.nn.Module):
    """
    Implements the Adapter layer as described in the paper.
    Architecture:
        Linear (down-projection) -> GELU (non-linearity) -> Linear (up-projection)
    """

    def __init__(self, linear_out_dim: int, bottleneck_dim: int):
        """
        Args:
            linear_out_dim: int
                Output dimension of the preceding linear layer
            bottleneck_dim: int
                Dimension of the bottleneck layer
        """
        super().__init__()
        self.linear1 = torch.nn.Linear(
            linear_out_dim, bottleneck_dim
        )  # Feedforward down-project
        self.gelu = torch.nn.GELU()  # Non-linearity
        self.linear2 = torch.nn.Linear(
            bottleneck_dim, linear_out_dim
        )  # Feedforward up-project

    def forward(self, x):
        """
        Forward propagation of the Adapter layer
        """
        residual = x
        x = self.gelu(self.linear1(x))
        x = self.linear2(x)
        return x + residual  # Skip connection around the adapter
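
# Optional sanity check (a minimal sketch): the adapter maps a hidden-state tensor
# back to its input dimension, so it can be inserted into a transformer block without
# changing tensor shapes. The 768 hidden size is assumed to match distilbert-base-uncased.
_adapter = Adapter(linear_out_dim=768, bottleneck_dim=16)
_hidden = torch.randn(2, 8, 768)  # (batch, seq_len, hidden)
assert _adapter(_hidden).shape == _hidden.shape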
# Define the AdaptedLinear layer
class AdaptedLinear(torch.nn.Module):
    """
    Accepts a linear layer and adds an Adapter layer after it
    """

    def __init__(self, linear, bottleneck_dim):
        """
        Args:
            linear: torch.nn.Linear
                Linear layer to which the Adapter layer is to be added
            bottleneck_dim: int
                Dimension of the bottleneck layer
        """
        super().__init__()
        self.linear = linear
        self.adapter = Adapter(linear.out_features, bottleneck_dim)

    def forward(self, x):
        """
        Forward propagation of the AdaptedLinear layer
        """
        x = self.linear(x)  # Normal linear layer propagation
        return self.adapter(x)  # Adapter layer propagation
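
# Optional usage sketch: wrap a standalone Linear layer. Only the adapter's two small
# projections add new parameters; the wrapped Linear itself is untouched. The dimensions
# here are illustrative assumptions.
_wrapped = AdaptedLinear(torch.nn.Linear(768, 768), bottleneck_dim=16)
assert _wrapped(torch.randn(2, 8, 768)).shape == (2, 8, 768)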
###################
model_uri = "distilbert/distilbert-base-uncased"
num_classes = 2
bottleneck_dim = 16

# Initialise the model
model = AutoModelForSequenceClassification.from_pretrained(
    model_uri, num_labels=num_classes
)

# Freeze all the layers
for param in model.parameters():
    param.requires_grad = False

adapted_linear = partial(AdaptedLinear, bottleneck_dim=bottleneck_dim)

# Replace some Linear layers within each TransformerBlock with AdaptedLinear,
# as specified in the paper
for block in model.distilbert.transformer.layer:
    ## Transformer Block: Multi-head Self-Attention block
    block.attention.out_lin = adapted_linear(block.attention.out_lin)
    ## Transformer Block: Feed-forward block
    block.ffn.lin2 = adapted_linear(block.ffn.lin2)
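
# Optional inspection sketch: printing one modified block should now show an
# AdaptedLinear module at attention.out_lin (and similarly at ffn.lin2).
print(model.distilbert.transformer.layer[0].attention.out_lin)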
# Unfreeze (sa_layer_norm) and (output_layer_norm) in each TransformerBlock;
# requires_grad_() flips requires_grad on all of the module's parameters
for block in model.distilbert.transformer.layer:
    block.sa_layer_norm.requires_grad_(True)
    block.output_layer_norm.requires_grad_(True)

# Unfreeze the final classification layer
for param in model.classifier.parameters():
    param.requires_grad = True
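
# Parameter-efficiency check (a minimal sketch, assuming the replacements above):
# only the adapters, the two layer norms per block, and the classifier head should
# be trainable, which is only a small fraction of the total parameter count.
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"Trainable params: {trainable:,} / {total:,} ({100 * trainable / total:.2f}%)")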