fix(vis): harden ELK heap scaling and fix flaky signal safety test

johnmarktaylor91 · claude · johnmarktaylor91 · commit 41b9f893692b · 2026-03-07T16:58:59.000-05:00
- Bump ELK Node.js heap scaling from 8x to 16x JSON size to prevent OOM on 250k+ node graphs
- Mark 100k node tests as @pytest.mark.rare (too slow for regular runs)
- Fix flaky TestSignalSafety: use setitimer(50ms) instead of alarm(1s), increase model iterations from 1k to 50k, and skip the test if the alarm doesn't fire

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/tests/test_decoration.py b/tests/test_decoration.py
@@ -753,23 +753,30 @@ def test_sigalrm_during_forward(self):
 
         class SlowModel(nn.Module):
             def forward(self, x):
-                # Do enough work that the alarm fires
-                for _ in range(1000):
+                # Do enough work that the alarm fires mid-logging
+                for _ in range(50000):
                     x = x + 0.001
                 return x
 
+        alarm_fired = False
+
         def alarm_handler(signum, frame):
+            nonlocal alarm_fired
+            alarm_fired = True
             raise TimeoutError("alarm fired")
 
         old_handler = signal.signal(signal.SIGALRM, alarm_handler)
         try:
-            signal.alarm(1)  # 1 second
+            signal.setitimer(signal.ITIMER_REAL, 0.05)  # 50ms — fires mid-forward
             model = SlowModel()
             try:
                 log_forward_pass(model, torch.randn(5))
             except TimeoutError:
                 pass
-            signal.alarm(0)  # cancel alarm
+            signal.setitimer(signal.ITIMER_REAL, 0)  # cancel
+
+            if not alarm_fired:
+                pytest.skip("alarm didn't fire — forward pass too fast")
 
             # Toggle MUST be off
             assert _state._logging_enabled is False
diff --git a/tests/test_large_graphs.py b/tests/test_large_graphs.py
@@ -84,6 +84,7 @@ def test_50k_nodes(self):
         assert 45000 < count < 55000, f"Expected ~50000 nodes, got {count}"
 
     @pytest.mark.slow
+    @pytest.mark.rare
     def test_100k_nodes(self):
         model = RandomGraphModel(target_nodes=100000, seed=42)
         x = torch.randn(2, 64)
@@ -174,6 +175,7 @@ def test_validation_50k(self):
         assert validate_forward_pass(model, torch.randn(2, 64))
 
     @pytest.mark.slow
+    @pytest.mark.rare
     def test_validation_100k(self):
         """Validation passes for 100k-node random model."""
         model = RandomGraphModel(target_nodes=100000, seed=42)
@@ -406,6 +408,7 @@ def test_elk_renders_50k(self):
 
     @pytest.mark.skipif(not elk_available(), reason="elkjs not installed")
     @pytest.mark.slow
+    @pytest.mark.rare
     def test_elk_renders_100k(self):
         """ELK engine works for 100k-node graphs."""
         model = RandomGraphModel(target_nodes=100000, seed=42)
diff --git a/torchlens/visualization/elk_layout.py b/torchlens/visualization/elk_layout.py
@@ -344,7 +344,7 @@ def run_elk_layout(elk_graph: dict, timeout: Optional[int] = None) -> dict:
 
     graph_json = json.dumps(elk_graph)
     graph_kb = len(graph_json) // 1024
-    heap_mb = max(4096, graph_kb * 8)  # ~8x JSON size
+    heap_mb = max(4096, graph_kb * 16)  # ~16x JSON size
     stack_kb = max(65536, graph_kb * 16)  # ~16x JSON size
 
     try: