Implementation: Using Adapter layers to fine-tune LLMs as described in [Parameter-Efficient Transfer Learning for NLP](https://arxiv.org/pdf/1902.00751)
# Imports
import torch
from functools import partial
from transformers import AutoModelForSequenceClassification


# Classes
# Define the Adapter layer
class Adapter(torch.nn.Module):
    """
    Implements the Adapter layer as described in the paper.
    Architecture:
        Linear (down-projection) -> GELU (non-linearity) -> Linear (up-projection)
    """

    def __init__(self, linear_out_dim: int, bottleneck_dim: int):
        """
        Args:
            linear_out_dim: int
                Output dimension of the preceding linear layer
            bottleneck_dim: int
                Dimension of the bottleneck layer
        """
        super().__init__()
        self.linear1 = torch.nn.Linear(
            linear_out_dim, bottleneck_dim
        )  # Feedforward down-project
        self.gelu = torch.nn.GELU()  # Non-linearity
        self.linear2 = torch.nn.Linear(
            bottleneck_dim, linear_out_dim
        )  # Feedforward up-project

    def forward(self, x):
        """
        Forward propagation of the Adapter layer
        """
        residual = x
        x = self.gelu(self.linear1(x))
        x = self.linear2(x)
        return x + residual  # Skip connection around the adapter
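
# Optional sanity check (a minimal sketch): the adapter maps a hidden-state tensor
# back to its input dimension, so it can be inserted into a transformer block without
# changing tensor shapes. The 768 hidden size is assumed to match distilbert-base-uncased.
_adapter = Adapter(linear_out_dim=768, bottleneck_dim=16)
_hidden = torch.randn(2, 8, 768)  # (batch, seq_len, hidden)
assert _adapter(_hidden).shape == _hidden.shape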
# Define the AdaptedLinear layer
class AdaptedLinear(torch.nn.Module):
    """
    Accepts a linear layer and adds an Adapter layer after it
    """

    def __init__(self, linear, bottleneck_dim):
        """
        Args:
            linear: torch.nn.Linear
                Linear layer to which the Adapter layer is to be added
            bottleneck_dim: int
                Dimension of the bottleneck layer
        """
        super().__init__()
        self.linear = linear
        self.adapter = Adapter(linear.out_features, bottleneck_dim)

    def forward(self, x):
        """
        Forward propagation of the AdaptedLinear layer
        """
        x = self.linear(x)  # Normal linear layer propagation
        return self.adapter(x)  # Adapter layer propagation
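
# Optional usage sketch: wrap a standalone Linear layer. Only the adapter's two small
# projections add new parameters; the wrapped Linear itself is untouched. The dimensions
# here are illustrative assumptions.
_wrapped = AdaptedLinear(torch.nn.Linear(768, 768), bottleneck_dim=16)
assert _wrapped(torch.randn(2, 8, 768)).shape == (2, 8, 768)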
###################
model_uri = "distilbert/distilbert-base-uncased"
num_classes = 2
bottleneck_dim = 16

# Initialise the model
model = AutoModelForSequenceClassification.from_pretrained(
    model_uri, num_labels=num_classes
)

# Freeze all the layers
for param in model.parameters():
    param.requires_grad = False

adapted_linear = partial(AdaptedLinear, bottleneck_dim=bottleneck_dim)

# Replace some Linear layers within each TransformerBlock with AdaptedLinear,
# as specified in the paper
for block in model.distilbert.transformer.layer:
    ## Transformer Block: Multi-head Self-Attention block
    block.attention.out_lin = adapted_linear(block.attention.out_lin)
    ## Transformer Block: Feed-forward block
    block.ffn.lin2 = adapted_linear(block.ffn.lin2)
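
# Optional inspection sketch: printing one modified block should now show an
# AdaptedLinear module at attention.out_lin (and similarly at ffn.lin2).
print(model.distilbert.transformer.layer[0].attention.out_lin)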
# Unfreeze (sa_layer_norm) and (output_layer_norm) in each TransformerBlock;
# requires_grad_() flips requires_grad on all of the module's parameters
for block in model.distilbert.transformer.layer:
    block.sa_layer_norm.requires_grad_(True)
    block.output_layer_norm.requires_grad_(True)

# Unfreeze the final classification layer
for param in model.classifier.parameters():
    param.requires_grad = True
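
# Parameter-efficiency check (a minimal sketch, assuming the replacements above):
# only the adapters, the two layer norms per block, and the classifier head should
# be trainable, which is only a small fraction of the total parameter count.
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"Trainable params: {trainable:,} / {total:,} ({100 * trainable / total:.2f}%)")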