 import numpy as np
 import math
 
+import sys
+
 from tqdm import tqdm
 
 class customNNModule(nn.Module):
@@ -34,7 +36,7 @@ def train(self, param_dict: dict):
         counter = 0
 
         optimizer = optim.AdamW(self.parameters(), lr=learning_rate, weight_decay=0.01)
-        lamb_reg = 0.1
+        lamb_reg = 0.01
         for epoch in tqdm(range(num_epochs)):
             train_loss = 0
             train_correct = 0
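Note on the hunk above: lamb_reg is a regularization coefficient, now an order of magnitude smaller (0.01 instead of 0.1). Its use site falls outside this diff, but a coefficient like this typically scales an extra penalty added to the task loss inside the epoch loop. A minimal sketch of that pattern, assuming a cross-entropy task loss and an L2 penalty over all parameters (the actual penalty in this code may differ, and AdamW's decoupled weight_decay=0.01 already applies plain L2-style shrinkage separately):

    # Hypothetical sketch, not this repo's actual loss code: how a coefficient
    # like lamb_reg typically enters the training objective.
    import torch

    def regularized_loss(task_loss: torch.Tensor, model: torch.nn.Module,
                         lamb_reg: float) -> torch.Tensor:
        # L2 penalty: sum of squared parameter entries. Lowering lamb_reg from
        # 0.1 to 0.01 weakens this term relative to the data-fitting loss.
        penalty = sum(p.pow(2).sum() for p in model.parameters())
        return task_loss + lamb_reg * penalty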
@@ -86,6 +88,7 @@ def train(self, param_dict: dict):
 
             if (epoch + 1) % 50 == 0 and verbose:
                 print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss / len(train_dataloader):.4f}, Train Acc: {train_correct / train_total:.4f}, Test Loss: {test_loss / len(test_dataloader):.4f}, Test Acc: {test_correct / test_total:.4f}")
+                sys.stdout.flush()
 
             train_losses.append(train_loss / len(train_dataloader))
             test_losses.append(test_loss / len(test_dataloader))
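The explicit flush matters when stdout is block-buffered, e.g. when the training log is redirected to a file under nohup or a cluster scheduler; without it, the once-per-50-epochs progress line can sit in the buffer long after it was produced. An equivalent alternative that avoids the sys import is print's flush keyword (available since Python 3.3):

    # Drop-in replacement for the print + sys.stdout.flush() pair above:
    print(f"Epoch {epoch + 1}/{num_epochs}, ...", flush=True)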
@@ -252,16 +255,21 @@ def pred_logit(self, x):
 
 # 2-Layer Transformer Model with Explicit Residual Connections
 class ToyTransformer(customNNModule):
-    def __init__(self, vocab_size, d_model, nhead, num_layers, seq_len = 16, use_dist_layer = False):
+    def __init__(self, vocab_size, d_model, nhead, num_layers, seq_len = 16, use_dist_layer = False, seed=0):
         super(ToyTransformer, self).__init__()
+
+        torch.manual_seed(seed)
+        np.random.seed(seed)
+
+
         self.embedding = nn.Embedding(vocab_size, d_model)
         nn.init.normal_(self.embedding.weight, mean=0, std=1/np.sqrt(d_model))
         self.positional_encoding = nn.Parameter(torch.randn(seq_len, d_model))
 
         # Define transformer encoder layers
         self.layers = nn.ModuleList([
             nn.TransformerEncoderLayer(
-                d_model=d_model, nhead=nhead, dim_feedforward=64, batch_first=True
+                d_model=d_model, nhead=nhead, dim_feedforward=d_model*4, batch_first=True
             ) for _ in range(num_layers)
         ])
         self.use_dist_layer = use_dist_layer
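Two notes on the hunk above. dim_feedforward=d_model*4 replaces the fixed hidden width of 64 with the conventional 4x ratio between the feed-forward hidden size and the model width from the original Transformer. The new seed argument makes initialization reproducible, at the cost of a side effect: torch.manual_seed and np.random.seed reset the global RNGs, so constructing a model also reseeds any later global random draws (e.g. data shuffling). A minimal sketch of the reproducibility this buys, with placeholder constructor values (vocab_size=59 etc. are made up, and the import path is hypothetical):

    import torch
    # from toy_transformer import ToyTransformer  # hypothetical import path

    # Same seed => identical random initializations of all submodules.
    m1 = ToyTransformer(vocab_size=59, d_model=128, nhead=4, num_layers=2, seed=0)
    m2 = ToyTransformer(vocab_size=59, d_model=128, nhead=4, num_layers=2, seed=0)
    assert torch.allclose(m1.embedding.weight, m2.embedding.weight)
    assert torch.allclose(m1.positional_encoding, m2.positional_encoding)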