Skip to content

Commit 34e8c14

Browse files
committed
Added code for parameter sweeping and descendant dataset
1 parent 340839a commit 34e8c14

File tree

12 files changed

+1155
-2
lines changed

12 files changed

+1155
-2
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,4 @@
11
__pycache__
2+
results
23

4+
scratch.ipynb

dataset.py

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,4 +115,34 @@ def __len__(self):
115115
return len(self.inputs)
116116

117117
def __getitem__(self, idx):
    """Return the (input, target) pair stored at index *idx*.

    Enables integer indexing and use with torch DataLoader; length is
    given by ``__len__`` (``len(self.inputs)``).
    """
    return self.inputs[idx], self.targets[idx]
119+
120+
def descendant_dataset(p, num, seed=0, device='cpu'):
    """Build a binary-tree ancestor/descendant classification dataset.

    Nodes are labelled 1..p-1 in a complete binary tree where node x has
    children 2x and 2x+1.  Each sample is a pair (a, b); its label is
    p+1 when b lies on the path from a down to a leaf (i.e. b is a
    descendant of a, or b == a), and p otherwise.

    Args:
        p: exclusive upper bound on node ids; also fixes vocab_size = p + 2
           (p and p+1 serve as the two class tokens).
        num: number of (a, b) pairs to draw.
        seed: seed for both torch and numpy RNGs, for reproducibility.
        device: device the returned tensors are moved to.

    Returns:
        dict with keys 'data_id' (num x 2 LongTensor of pairs),
        'label' (length-num tensor of p / p+1), and 'vocab_size' (int).
    """
    # Seed both RNGs so repeated calls with the same seed are identical.
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Draw `num` node pairs uniformly from [1, p).  (Call kept identical
    # to preserve the RNG stream for a given seed.)
    pairs = np.random.choice(range(1, p), num * 2).reshape(num, 2)

    def is_descendant(ancestor, node):
        # Walk from `node` up toward the root (node 1); its parent is
        # node // 2.  The final comparison covers ancestor == 1 (root).
        while node > 1:
            if node == ancestor:
                return True
            node //= 2
        return node == ancestor

    labels_np = np.array(
        [p + 1 if is_descendant(row[0], row[1]) else p for row in pairs]
    )

    return {
        'data_id': torch.from_numpy(pairs).to(device),
        'label': torch.from_numpy(labels_np).to(device),
        'vocab_size': p + 2,
    }

model.py

Lines changed: 65 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@
77

88
from tqdm import tqdm
99

10+
from itertools import combinations
11+
from sklearn.decomposition import PCA
12+
1013
class MLP(nn.Module):
1114
def __init__(self, shp, vocab_size, embd_dim, input_token=2, init_scale=1., unembd=False, weight_tied=False, seed=0):
1215
super(MLP, self).__init__()
@@ -169,17 +172,21 @@ def forward(self, x):
169172
else:
170173
logits = self.fc(x[:, -1]) # Only predict the last token
171174
return logits
175+
172176
def train(self, param_dict: dict):
173177

174178
num_epochs = param_dict['num_epochs']
175179
learning_rate = param_dict['learning_rate']
176180
dataloader = param_dict['dataloader']
181+
device = param_dict['device']
177182
criterion = nn.CrossEntropyLoss()
178183

179184
optimizer = optim.AdamW(self.parameters(), lr=learning_rate)
180185
for epoch in tqdm(range(num_epochs)):
181186
total_loss = 0
182187
for batch_inputs, batch_targets in dataloader:
188+
batch_inputs = batch_inputs.to(device)
189+
batch_targets = batch_targets.to(device)
183190
optimizer.zero_grad()
184191
logits = self.forward(batch_inputs)
185192

@@ -189,4 +196,61 @@ def train(self, param_dict: dict):
189196
total_loss += loss.item()
190197

191198
if (epoch + 1) % 50 == 0:
192-
print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(dataloader):.4f}")
199+
print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(dataloader):.4f}")
200+
201+
202+
def eval(self, grid_size=5, n_components=10):
    """Quantify how lattice-like the learned embeddings are.

    Treats embedding indices 0..grid_size**2-1 as a grid_size x grid_size
    lattice (index = grid_size*row + col).  For every triple of lattice
    points whose implied fourth parallelogram vertex stays on the grid,
    it measures how far the corresponding embedding vectors deviate from
    a true parallelogram, and reports the mean deviation together with
    the PCA explained-variance spectrum of the embedding matrix.

    NOTE(review): this overrides nn.Module.eval(), so model.eval() no
    longer switches the module out of training mode — consider renaming
    (kept as-is here to avoid breaking existing callers).

    Args:
        grid_size: side length of the lattice (default 5, the original
            hard-coded value).
        n_components: number of PCA components to report (default 10).

    Returns:
        dict with 'parallelogram_quality' (mean deviation, lower is more
        lattice-like) and 'variances' (explained_variance_ratio_ array).
    """
    points = [(i, j) for i in range(grid_size) for j in range(grid_size)]
    deviation_arr = []

    def side_length_deviation(a, b, c, d):
        # Relative mismatch of the two pairs of opposite sides of the
        # quadrilateral spanned by embedding vectors a, b, c, d,
        # normalized by the RMS side length so it is scale-invariant.
        a, b, c, d = np.array(a), np.array(b), np.array(c), np.array(d)
        length_ab = np.linalg.norm(b - a)
        length_cd = np.linalg.norm(d - c)
        length_ac = np.linalg.norm(c - a)
        length_bd = np.linalg.norm(b - d)
        length_bc = np.linalg.norm(c - b)
        length_ad = np.linalg.norm(d - a)
        return np.sqrt(
            (length_ab - length_cd) ** 2 + (length_ac - length_bd) ** 2
        ) / np.sqrt(
            (length_ab ** 2 + length_bc ** 2 + length_cd ** 2 + length_ad ** 2) / 2
        )

    # Hoisted out of the loop: one device transfer instead of one per vertex.
    emb = self.embedding.weight.cpu().detach().numpy()

    for pa, pb, pc in combinations(points, 3):
        # Fourth parallelogram vertex implied by the other three.
        pd = (pc[0] + pb[0] - pa[0], pc[1] + pb[1] - pa[1])
        if pd[0] < 0 or pd[0] >= grid_size or pd[1] < 0 or pd[1] >= grid_size:
            continue
        # Skip triples that are collinear along a grid row or column
        # (degenerate "parallelograms").
        if pa[0] == pb[0] == pc[0]:
            continue
        if pa[1] == pb[1] == pc[1]:
            continue

        # Map (row, col) lattice coordinates to embedding-table rows.
        va = emb[grid_size * pa[0] + pa[1]]
        vb = emb[grid_size * pb[0] + pb[1]]
        vc = emb[grid_size * pc[0] + pc[1]]
        vd = emb[grid_size * pd[0] + pd[1]]
        deviation_arr.append(side_length_deviation(va, vb, vc, vd))

    # Bug fix: the original called pca.fit_transform a second time on the
    # already-reduced data (discarding the result) before reading
    # explained_variance_ratio_, which renormalized the ratios to the
    # retained subspace instead of the full embedding variance.
    # Fit once, on the raw embedding matrix.
    pca = PCA(n_components=n_components)
    pca.fit(emb)
    variances = pca.explained_variance_ratio_

    return {
        'parallelogram_quality': np.mean(deviation_arr),
        'variances': variances,
    }

notebooks/plot_sweep.ipynb

Lines changed: 144 additions & 0 deletions
Large diffs are not rendered by default.

notebooks/transformer_lattice.ipynb

Lines changed: 170 additions & 0 deletions
Large diffs are not rendered by default.

scripts/data_size_sweep.sh

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
#!/bin/bash
# SLURM job: sweep sweep_transformers.py over log-spaced training-set sizes.
#SBATCH -t 2:00:00
#SBATCH --gres=gpu:1
#SBATCH -n 32

# Abort on the first failure (or unset variable) instead of silently
# continuing the sweep with missing runs.
set -euo pipefail

# 10 integer dataset sizes, log-spaced between 10^1 and 10^4.
sizes=$(python3 -c "import numpy as np; print(' '.join(map(str, np.logspace(1, 4, num=10, dtype=int))))")

# $sizes is deliberately left unquoted so the shell word-splits it into
# one loop argument per size.
for size in $sizes
do
    python3 ../sweep_transformers.py --data_size "$size" --use_harmonic 0
done

0 commit comments

Comments
 (0)