Skip to content

Commit d9a1525

Browse files
fix(vis): prevent OOM kill on 1M-node ELK render (#128)
The 1M-node render was OOM-killed at ~74GB RSS because:

1. Model params (~8-10GB) stayed alive during the ELK subprocess
2. preexec_fn forced fork+exec, COW-doubling the 74GB process
3. Heap/stack formulas produced absurd values (5.6TB heap, 15GB stack)
4. No memory cleanup before subprocess launch

Changes:

- render_large_graph.py: separate log_forward_pass from render_graph; free model/autograd before the ELK render
- elk_layout.py: cap heap at 64GB; stack floor 4096MB / cap 8192MB; write JSON to a temp file (free the string before the subprocess); gc.collect before the subprocess; set RLIMIT_STACK at module level (removes preexec_fn and the forced fork+exec)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 43a2539 commit d9a1525

File tree

2 files changed: +85 additions, -44 deletions

scripts/render_large_graph.py

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
"""
1111

1212
import argparse
13+
import gc
1314
import os
1415
import sys
1516
import time
@@ -42,14 +43,21 @@ def main():
4243
x = torch.randn(2, 64)
4344
print(f"Model constructed ({time.time() - t0:.1f}s)", flush=True)
4445

45-
ml = log_forward_pass(
46-
model,
47-
x,
48-
layers_to_save=None,
49-
verbose=True,
46+
# Log forward pass WITHOUT rendering — collect metadata only.
47+
ml = log_forward_pass(model, x, layers_to_save=None, verbose=True)
48+
print(f"Forward pass logged ({time.time() - t0:.1f}s)", flush=True)
49+
50+
# Free model parameters and autograd graphs before the memory-heavy ELK render.
51+
del model, x
52+
gc.collect()
53+
if torch.cuda.is_available():
54+
torch.cuda.empty_cache()
55+
56+
# Render from ModelLog metadata (model tensors no longer in memory).
57+
ml.render_graph(
5058
vis_opt="rolled",
5159
vis_outpath=os.path.join(args.outdir, label),
52-
vis_save_only=True,
60+
save_only=True,
5361
vis_fileformat=args.format,
5462
vis_node_placement="elk",
5563
)

torchlens/visualization/elk_layout.py

Lines changed: 71 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -12,24 +12,27 @@
1212
"""
1313

1414
import functools
15+
import gc
1516
import json
17+
import os
1618
import re
1719
import resource
1820
import subprocess
21+
import tempfile
1922
import warnings
2023
from typing import Optional
2124

22-
23-
def _unlimit_stack():
24-
"""Remove OS stack size limit so Node.js --stack-size flag works.
25-
26-
V8's --stack-size requests a JS stack allocation, but the OS enforces
27-
its own limit via RLIMIT_STACK. If the OS soft limit (ulimit -s) is
28-
smaller than what --stack-size asks for, Node.js segfaults instead of
29-
raising a clean JS RangeError. Called as preexec_fn in subprocess.run
30-
so only the child process is affected.
31-
"""
32-
resource.setrlimit(resource.RLIMIT_STACK, (resource.RLIM_INFINITY, resource.RLIM_INFINITY))
25+
# Set the soft stack limit to match the hard limit once at import time.
26+
# Child processes (Node.js) inherit this, removing the need for preexec_fn
27+
# which forces fork+exec and COW-doubles virtual memory of large parent processes.
28+
try:
29+
_soft, _hard = resource.getrlimit(resource.RLIMIT_STACK)
30+
if _hard == resource.RLIM_INFINITY:
31+
resource.setrlimit(resource.RLIMIT_STACK, (resource.RLIM_INFINITY, _hard))
32+
elif _soft < _hard:
33+
resource.setrlimit(resource.RLIMIT_STACK, (_hard, _hard))
34+
except (ValueError, resource.error):
35+
pass
3336

3437

3538
_ELK_NODE_THRESHOLD = 3500
@@ -38,9 +41,11 @@ def _unlimit_stack():
3841
_DEFAULT_NODE_WIDTH = 200 # points — fallback when label isn't available
3942
_DEFAULT_NODE_HEIGHT = 60 # points — fallback when label isn't available
4043

41-
# Inline Node.js script that reads ELK JSON from stdin, runs layout, writes to stdout.
44+
# Inline Node.js script that reads ELK JSON from a temp file (path in
45+
# _TL_JSON_PATH env var) or stdin, runs layout, writes to stdout.
4246
_ELK_LAYOUT_SCRIPT = r"""
4347
const { Worker } = require('worker_threads');
48+
const fs = require('fs');
4449
4550
// Run ELK layout in a worker thread with a large stack via resourceLimits.
4651
// resourceLimits.stackSizeMb is far more reliable than the --stack-size V8
@@ -58,10 +63,7 @@ def _unlimit_stack():
5863
}).catch((err) => { throw err; });
5964
`;
6065
61-
let input = '';
62-
process.stdin.setEncoding('utf8');
63-
process.stdin.on('data', (chunk) => { input += chunk; });
64-
process.stdin.on('end', () => {
66+
function runLayout(input) {
6567
const worker = new Worker(workerCode, {
6668
eval: true,
6769
workerData: input,
@@ -74,7 +76,17 @@ def _unlimit_stack():
7476
process.stderr.write(err.toString());
7577
process.exit(1);
7678
});
77-
});
79+
}
80+
81+
const jsonPath = process.env._TL_JSON_PATH;
82+
if (jsonPath) {
83+
runLayout(fs.readFileSync(jsonPath, 'utf8'));
84+
} else {
85+
let input = '';
86+
process.stdin.setEncoding('utf8');
87+
process.stdin.on('data', (chunk) => { input += chunk; });
88+
process.stdin.on('end', () => { runLayout(input); });
89+
}
7890
"""
7991

8092

@@ -405,34 +417,55 @@ def run_elk_layout(elk_graph: dict, timeout: Optional[int] = None) -> dict:
405417
timeout = _ELK_TIMEOUT
406418

407419
graph_json = json.dumps(elk_graph)
420+
# Free the Python dict — we only need the JSON string from here.
421+
elk_graph.clear()
422+
408423
graph_kb = len(graph_json) // 1024
409-
heap_mb = max(16384, graph_kb * 48) # ~48x JSON size, 16GB floor
424+
# Cap heap at 64GB — V8 only allocates what it actually needs, and
425+
# unbounded values (e.g. 5.6TB for 1M nodes) are nonsensical.
426+
heap_mb = min(65536, max(16384, graph_kb * 48))
410427
# Worker thread stack via resourceLimits.stackSizeMb (MB).
411-
# Much more reliable than --stack-size for deeply recursive ELK layout.
412-
stack_mb = max(64, graph_kb // 8) # ~128 bytes/KB of JSON, 64MB floor
428+
# Floor of 4096 MB (matches CHANGELOG), cap at 8192 MB.
429+
stack_mb = min(8192, max(4096, graph_kb // 8))
413430

414431
env = _node_env()
415432
env["_TL_STACK_MB"] = str(stack_mb)
416433

434+
# Write JSON to a temp file so Node.js reads from disk instead of stdin.
435+
# This lets us free the graph_json string before the subprocess runs,
436+
# avoiding holding ~120MB+ in Python memory during ELK layout.
437+
json_fd, json_path = tempfile.mkstemp(suffix=".json", prefix="tl_elk_")
417438
try:
418-
result = subprocess.run(
419-
[
420-
"node",
421-
f"--max-old-space-size={heap_mb}",
422-
"-e",
423-
_ELK_LAYOUT_SCRIPT,
424-
],
425-
input=graph_json,
426-
capture_output=True,
427-
text=True,
428-
timeout=timeout,
429-
env=env,
430-
preexec_fn=_unlimit_stack,
431-
)
432-
except FileNotFoundError:
433-
raise RuntimeError("Node.js not found. Install from https://nodejs.org/")
434-
except subprocess.TimeoutExpired:
435-
raise RuntimeError(f"ELK layout timed out after {timeout}s")
439+
with os.fdopen(json_fd, "w") as f:
440+
f.write(graph_json)
441+
del graph_json
442+
env["_TL_JSON_PATH"] = json_path
443+
444+
# Reclaim garbage before the memory-heavy subprocess.
445+
gc.collect()
446+
447+
try:
448+
result = subprocess.run(
449+
[
450+
"node",
451+
f"--max-old-space-size={heap_mb}",
452+
"-e",
453+
_ELK_LAYOUT_SCRIPT,
454+
],
455+
capture_output=True,
456+
text=True,
457+
timeout=timeout,
458+
env=env,
459+
)
460+
except FileNotFoundError:
461+
raise RuntimeError("Node.js not found. Install from https://nodejs.org/")
462+
except subprocess.TimeoutExpired:
463+
raise RuntimeError(f"ELK layout timed out after {timeout}s")
464+
finally:
465+
try:
466+
os.unlink(json_path)
467+
except OSError:
468+
pass
436469

437470
if result.returncode != 0:
438471
detail = (

0 commit comments

Comments (0)