Skip to content

Commit cf6ba87

Browse files
committed
added notes on positional embeddings
1 parent bd2af3c commit cf6ba87

2 files changed

Lines changed: 59 additions & 2 deletions

File tree

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
import torch
2+
import torch.nn as nn
3+
import math
4+
5+
class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding (Vaswani et al., 2017), batch-first.

    Precomputes a (1, max_len, d_model) table of sine/cosine encodings and
    adds the first seq_len rows to the input on every forward pass.
    """

    def __init__(self, d_model, max_len=5000):
        """
        Args:
            d_model (int): embedding dimension.
            max_len (int): maximum sequence length supported by the table.
        """
        super(PositionalEncoding, self).__init__()

        # (max_len, d_model) table: even columns get sin, odd columns get cos.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        # Frequencies 1 / 10000^(2i/d_model), computed in log space for
        # numerical stability.
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)
        # Slice div_term so an odd d_model does not raise a shape mismatch
        # (the cosine half has floor(d_model / 2) columns).
        pe[:, 1::2] = torch.cos(position * div_term[:d_model // 2])

        # Shape (1, max_len, d_model): batch-first, matching the
        # (batch, seq_len, d_model) tensors produced by nn.Embedding.
        # Bug fix: the previous unsqueeze(0).transpose(0, 1) built a
        # seq-first (max_len, 1, d_model) buffer, so with batch-first input
        # every token was given the position-0 encoding via broadcasting.
        pe = pe.unsqueeze(0)
        # Buffer (not a parameter): moves with .to()/.cuda(), not trained.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add positional encodings to x.

        Args:
            x (Tensor): (batch, seq_len, d_model) embeddings; seq_len must
                not exceed max_len.

        Returns:
            Tensor: x plus the first seq_len positional encodings.
        """
        x = x + self.pe[:, :x.size(1), :]
        return x
23+
24+
class TransformerEmbedding(nn.Module):
    """Token embedding followed by additive positional encoding."""

    def __init__(self, vocab_size, d_model, max_len):
        """
        Args:
            vocab_size (int): number of distinct token ids.
            d_model (int): embedding dimension.
            max_len (int): maximum sequence length for the positional table.
        """
        super(TransformerEmbedding, self).__init__()
        # Learned per-token vectors, plus a fixed sinusoidal position signal.
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        self.position_encoding = PositionalEncoding(d_model, max_len)

    def forward(self, x):
        """Embed token ids and inject positional information.

        Args:
            x (LongTensor): token ids.

        Returns:
            Tensor: position-aware embeddings for `x`.
        """
        return self.position_encoding(self.token_embedding(x))
34+
35+
# Example usage
vocab_size = 30522  # vocabulary size (30522 is BERT's WordPiece vocab size)
d_model = 512  # Embedding size
max_len = 100  # Maximum sequence length

embedding_layer = TransformerEmbedding(vocab_size, d_model, max_len)
# One sequence of 10 token ids -> shape (batch_size=1, sequence_length=10).
input_ids = torch.tensor([[101, 19204, 2135, 1567, 2003, 2019, 2590, 3350, 1012, 102]]) # Example input
embeddings = embedding_layer(input_ids)

# nn.Embedding is batch-first, so the output is
# (batch_size, sequence_length, d_model) = (1, 10, 512) —
# not (sequence_length, batch_size, d_model) as previously noted.
print(embeddings.shape)

ml-concepts/transformers/transformers.md

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,5 +37,18 @@
3737
attention_mask = [1 if id != 0 else 0 for id in padded_token_ids]
3838
# Output: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]
3939
```
40-
41-
-
40+
- **positional encodings**

    <aside>
    💡 the purpose of positional encodings is to enable the model to distinguish between identical tokens in different positions — for example, in the sentence “the cat sat on the mat”, the 2 instances of “the” need to be treated differently based on their positions
    </aside>

    - these positional encodings are really important as they provide information about the position of tokens in the input sequence — remember, transformers are position-agnostic
        - this means they don’t have a built-in notion of the order of the tokens — positional encodings allow the model to leverage the order of the sequence
49+
- **computing positional encodings**
    - positional encodings can be added to the input embeddings using fixed functions — a common approach is to use sine and cosine functions of different frequencies
    - $PE_{(pos, 2i)} = \sin\left(\frac{pos}{10000^{2i/d_{model}}}\right)$
    - $PE_{(pos, 2i+1)} = \cos\left(\frac{pos}{10000^{2i/d_{model}}}\right)$
    - in these equations, *pos* is the position, *i* is the dimension index, and $d_{model}$ is the dimension of the model (embedding size)
    - check `positional_encoding.py` for an implementation of this

0 commit comments

Comments
 (0)