wolfecameron · March 2, 2024 21:43
diff --git a/decoder_only_block.py b/decoder_only_block.py
 """
 Source: https://github.com/karpathy/nanoGPT/blob/master/model.py
 """

 from torch import nn

 class Block(nn.Module):
    """
    Pre-norm residual block: a causal self-attention sub-layer followed by a
    feed-forward sub-layer, each preceded by its own LayerNorm and wrapped in
    a skip connection.
    """

    def __init__(self, d, H, T, bias=False, dropout=0.2):
        """
        Arguments:
        d: size of embedding dimension
        H: number of attention heads
        T: maximum length of input sequences (in tokens)
        bias: whether or not to use bias in linear layers; it is handed to the
            attention and feed-forward sub-layers only (the LayerNorms are
            built with their default settings)
        dropout: probability of dropout, handed to both sub-layers
        """
        super().__init__()

        # Sub-modules are registered in the order forward() applies them.
        self.ln_1 = nn.LayerNorm(d)
        self.attn = CausalSelfAttention(d, H, T, bias, dropout)
        self.ln_2 = nn.LayerNorm(d)
        self.ffnn = FFNN(d, bias, dropout)

    def forward(self, x):
        """Run both residual sub-layers on `x` and return the result."""
        # Attention branch: normalize, attend, then add back onto the input.
        attn_out = self.attn(self.ln_1(x))
        hidden = x + attn_out

        # Feed-forward branch, applied to the attention-updated activations.
        ffnn_out = self.ffnn(self.ln_2(hidden))
        return hidden + ffnn_out
	"""
	Source: https://github.com/karpathy/nanoGPT/blob/master/model.py
	"""

	from torch import nn

	class Block(nn.Module):
	    """
	    Residual block made of two sub-layers: causal self-attention, then a
	    feed-forward network. LayerNorm is applied to the input of each
	    sub-layer and the sub-layer output is added back to that input.
	    """

	    def __init__(
	        self,
	        d,
	        H,
	        T,
	        bias=False,
	        dropout=0.2,
	    ):
	        """
	        Arguments:
	        d: size of embedding dimension
	        H: number of attention heads
	        T: maximum length of input sequences (in tokens)
	        bias: whether or not to use bias in linear layers
	        dropout: probability of dropout
	        """
	        super().__init__()
	        # Normalization + attention pair used by the first residual branch.
	        self.ln_1 = nn.LayerNorm(d)
	        self.attn = CausalSelfAttention(d, H, T, bias, dropout)
	        # Normalization + feed-forward pair used by the second residual branch.
	        self.ln_2 = nn.LayerNorm(d)
	        self.ffnn = FFNN(d, bias, dropout)

	    def forward(self, x):
	        """Apply the attention branch, then the feed-forward branch, to `x`."""
	        # Out-of-place adds: each branch's result is summed with its own input.
	        x = x + self.attn(self.ln_1(x))
	        x = x + self.ffnn(self.ln_2(x))
	        return x