Skip to content

Commit 41b9f89

Browse files
fix(vis): harden ELK heap scaling and fix flaky signal safety test
- Bump ELK Node.js heap scaling from 8x to 16x JSON size to prevent OOM on 250k+ node graphs
- Mark 100k node tests as @rare (too slow for regular runs)
- Fix flaky TestSignalSafety: use setitimer(50ms) instead of alarm(1s), increase model iterations to 50k, skip if alarm doesn't fire

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent e2d0ae4 commit 41b9f89

File tree

3 files changed

+15
-5
lines changed

3 files changed

+15
-5
lines changed

tests/test_decoration.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -753,23 +753,30 @@ def test_sigalrm_during_forward(self):
753753

754754
class SlowModel(nn.Module):
755755
def forward(self, x):
756-
# Do enough work that the alarm fires
757-
for _ in range(1000):
756+
# Do enough work that the alarm fires mid-logging
757+
for _ in range(50000):
758758
x = x + 0.001
759759
return x
760760

761+
alarm_fired = False
762+
761763
def alarm_handler(signum, frame):
764+
nonlocal alarm_fired
765+
alarm_fired = True
762766
raise TimeoutError("alarm fired")
763767

764768
old_handler = signal.signal(signal.SIGALRM, alarm_handler)
765769
try:
766-
signal.alarm(1) # 1 second
770+
signal.setitimer(signal.ITIMER_REAL, 0.05) # 50ms — fires mid-forward
767771
model = SlowModel()
768772
try:
769773
log_forward_pass(model, torch.randn(5))
770774
except TimeoutError:
771775
pass
772-
signal.alarm(0) # cancel alarm
776+
signal.setitimer(signal.ITIMER_REAL, 0) # cancel
777+
778+
if not alarm_fired:
779+
pytest.skip("alarm didn't fire — forward pass too fast")
773780

774781
# Toggle MUST be off
775782
assert _state._logging_enabled is False

tests/test_large_graphs.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ def test_50k_nodes(self):
8484
assert 45000 < count < 55000, f"Expected ~50000 nodes, got {count}"
8585

8686
@pytest.mark.slow
87+
@pytest.mark.rare
8788
def test_100k_nodes(self):
8889
model = RandomGraphModel(target_nodes=100000, seed=42)
8990
x = torch.randn(2, 64)
@@ -174,6 +175,7 @@ def test_validation_50k(self):
174175
assert validate_forward_pass(model, torch.randn(2, 64))
175176

176177
@pytest.mark.slow
178+
@pytest.mark.rare
177179
def test_validation_100k(self):
178180
"""Validation passes for 100k-node random model."""
179181
model = RandomGraphModel(target_nodes=100000, seed=42)
@@ -406,6 +408,7 @@ def test_elk_renders_50k(self):
406408

407409
@pytest.mark.skipif(not elk_available(), reason="elkjs not installed")
408410
@pytest.mark.slow
411+
@pytest.mark.rare
409412
def test_elk_renders_100k(self):
410413
"""ELK engine works for 100k-node graphs."""
411414
model = RandomGraphModel(target_nodes=100000, seed=42)

torchlens/visualization/elk_layout.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -344,7 +344,7 @@ def run_elk_layout(elk_graph: dict, timeout: Optional[int] = None) -> dict:
344344

345345
graph_json = json.dumps(elk_graph)
346346
graph_kb = len(graph_json) // 1024
347-
heap_mb = max(4096, graph_kb * 8) # ~8x JSON size
347+
heap_mb = max(4096, graph_kb * 16) # ~16x JSON size
348348
stack_kb = max(65536, graph_kb * 16) # ~16x JSON size
349349

350350
try:

0 commit comments

Comments
 (0)