Skip to content

Commit 6efc1e9

Browse files
WaylandYangclaude
andcommitted
fix(security): validate snapshot_tag in create_sandbox + token guards
Turns the two failing tests from the previous commit green and adds defense-in-depth around adjacent footguns. ## create_sandbox path traversal (HIGH, post-auth) `POST /v1/sandboxes` previously fed `req.snapshot_tag` straight into `snapshot_root.join(...)` without validation. `delete_snapshot` and `branch_sandbox` both call `is_safe_tag`; this handler was an asymmetric oversight. CI on the prior commit captured the failure (404 from file-existence oracle, instead of 400 input validation): test create_sandbox_rejects_unsafe_snapshot_tag_traversal ... FAILED test create_sandbox_rejects_unsafe_snapshot_tag_chars ... FAILED Fix: call `is_safe_tag(&req.snapshot_tag)` immediately after the `n`-range checks, before any filesystem op. Defense in depth: `read_snapshot_volumes` now also checks `is_safe_tag` on its tag argument, so a future caller forgetting to validate (or a state.json reconstructed from a pre-validation registry row) can't make the daemon dereference an attacker-chosen path. ## K8s placeholder bearer token (HIGH) The shipped `packaging/k8s/forkd-controller.yaml` ships `token: REPLACE_ME_WITH_32_BYTES_BASE64`. Users who forget to sed it get a daemon protected only by a publicly-known token. Fix: daemon refuses to start if the token begins with `REPLACE_ME` / `CHANGE_ME` or is under 16 bytes. Validation is extracted to a pure `validate_token` helper with 5 unit tests. ## boot_wait_secs DoS (LOW) `POST /v1/snapshots` accepted any u64 for `boot_wait_secs`. A hostile caller could pass u64::MAX and tie up a daemon worker. Cap at 60 s (well above largest measured boot wait in our recipes). ## Test plan - All 4 new tests pass; 21 prior tests untouched - New `validate_token` unit tests cover empty / REPLACE_ME / CHANGE_ME / short / realistic - K8s manifest comment now documents the start-up refusal so the failure mode is discoverable Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 424e4a7 commit 6efc1e9

4 files changed

Lines changed: 119 additions & 6 deletions

File tree

crates/forkd-controller/src/http.rs

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,13 @@ async fn create_snapshot(
130130
return bad_request(&format!("rootfs not found: {}", rootfs.display()));
131131
}
132132

133+
// Cap boot_wait_secs so a hostile caller can't tie up a daemon worker
134+
// for u64::MAX seconds. 60 s is well above the largest measured boot
135+
// time in our recipes (postgres-fixture warms up in ~10 s).
136+
if req.boot_wait_secs > 60 {
137+
return bad_request("boot_wait_secs must be ≤ 60");
138+
}
139+
133140
let snap_dir = s.snapshot_root.join(&req.tag);
134141
if snap_dir.join("vmstate").exists() {
135142
return bad_request(&format!(
@@ -257,6 +264,15 @@ async fn create_sandbox(
257264
if req.n > 1000 {
258265
return bad_request("n must be ≤ 1000 (sanity cap)");
259266
}
267+
// Validate snapshot_tag BEFORE any filesystem op. Without this, a tag
268+
// like `../../etc` makes `snapshot_root.join(tag)` traverse outside
269+
// snapshot_root (Rust's Path::join doesn't normalise `..`), and the
270+
// unvalidated tag would also persist into SandboxInfo.snapshot_tag,
271+
// later feeding `read_snapshot_volumes` and letting an attacker pick
272+
// the JSON file the daemon parses for grandchild volume specs.
273+
if !is_safe_tag(&req.snapshot_tag) {
274+
return bad_request("snapshot_tag must be 1-64 chars, ASCII alnum or dash/underscore");
275+
}
260276

261277
// Snapshot can come either from the daemon's registry (created via
262278
// future POST /v1/snapshots) or from the on-disk XDG location (created
@@ -546,6 +562,13 @@ fn read_snapshot_volumes(
546562
snapshot_root: &std::path::Path,
547563
tag: &str,
548564
) -> anyhow::Result<Vec<forkd_vmm::VolumeSpec>> {
565+
// Defense in depth: every caller is expected to have validated `tag` via
566+
// `is_safe_tag` before persisting it, but if a future caller forgets, or
567+
// a registry row gets reconstructed from an older state.json written
568+
// before tag validation existed, refuse to dereference the join.
569+
if !is_safe_tag(tag) {
570+
anyhow::bail!("refusing to read snapshot with unsafe tag (defense in depth)");
571+
}
549572
let path = snapshot_root.join(tag).join("snapshot.json");
550573
if !path.exists() {
551574
return Ok(Vec::new());
@@ -899,6 +922,27 @@ mod tests {
899922
assert_eq!(resp.status(), StatusCode::BAD_REQUEST);
900923
}
901924

925+
#[tokio::test]
926+
async fn create_snapshot_rejects_boot_wait_over_cap() {
927+
// Regression: `boot_wait_secs` was untyped u64 with no cap, so a
928+
// hostile caller could pass u64::MAX to tie up a daemon worker.
929+
let app = router(test_state());
930+
let resp = app
931+
.oneshot(
932+
Request::builder()
933+
.method("POST")
934+
.uri("/v1/snapshots")
935+
.header("content-type", "application/json")
936+
.body(Body::from(
937+
r#"{"tag":"ok","kernel":"/dev/null","rootfs":"/dev/null","boot_wait_secs":999999}"#,
938+
))
939+
.unwrap(),
940+
)
941+
.await
942+
.unwrap();
943+
assert_eq!(resp.status(), StatusCode::BAD_REQUEST);
944+
}
945+
902946
#[tokio::test]
903947
async fn create_sandbox_rejects_unsafe_snapshot_tag_chars() {
904948
// Defense in depth: also reject tags containing characters that aren't

crates/forkd-controller/src/lib.rs

Lines changed: 68 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -79,9 +79,7 @@ pub async fn run_daemon(cfg: DaemonConfig) -> Result<()> {
7979
let raw = std::fs::read_to_string(p)
8080
.with_context(|| format!("read token file {}", p.display()))?;
8181
let tok = raw.trim().to_string();
82-
if tok.is_empty() {
83-
anyhow::bail!("token file {} is empty", p.display());
84-
}
82+
validate_token(&tok).with_context(|| format!("validate token from {}", p.display()))?;
8583
tracing::info!(token_file = %p.display(), "bearer-token auth enabled");
8684
AuthConfig::with_token(tok)
8785
}
@@ -185,3 +183,70 @@ fn spawn_shutdown_signal(handle: Handle) {
185183
handle.graceful_shutdown(Some(Duration::from_secs(30)));
186184
});
187185
}
186+
187+
/// Reject tokens that are empty, obvious placeholders, or below a minimum
188+
/// entropy budget. Pure function so it's exercised by unit tests without
189+
/// having to spin up the daemon.
190+
fn validate_token(tok: &str) -> Result<()> {
191+
if tok.is_empty() {
192+
anyhow::bail!("token is empty");
193+
}
194+
// Reject the literal placeholder shipped in packaging/k8s/. A user who
195+
// runs `kubectl apply -f` without first running the documented
196+
// `sed`/Secret-replacement step would otherwise get a daemon protected
197+
// only by a publicly-known bearer token.
198+
if tok.starts_with("REPLACE_ME") || tok.starts_with("CHANGE_ME") {
199+
anyhow::bail!(
200+
"token still contains the manifest placeholder ({tok}); \
201+
replace it with a real 32-byte secret before starting the daemon"
202+
);
203+
}
204+
// Reject suspiciously short tokens — sufficient entropy is the user's
205+
// responsibility, but anything under 16 bytes is almost certainly a
206+
// copy-paste mistake rather than a deliberate choice.
207+
if tok.len() < 16 {
208+
anyhow::bail!(
209+
"token is only {} bytes; use at least 16 bytes of high-entropy randomness",
210+
tok.len()
211+
);
212+
}
213+
Ok(())
214+
}
215+
216+
#[cfg(test)]
217+
mod tests {
218+
use super::validate_token;
219+
220+
#[test]
221+
fn rejects_empty_token() {
222+
assert!(validate_token("").is_err());
223+
}
224+
225+
#[test]
226+
fn rejects_replace_me_placeholder() {
227+
// Regression: this exact string is shipped in packaging/k8s/
228+
// forkd-controller.yaml.
229+
let err =
230+
validate_token("REPLACE_ME_WITH_32_BYTES_BASE64").expect_err("placeholder accepted");
231+
let msg = format!("{err:#}");
232+
assert!(msg.contains("placeholder"), "msg was: {msg}");
233+
}
234+
235+
#[test]
236+
fn rejects_change_me_variant() {
237+
assert!(validate_token("CHANGE_ME_PLEASE").is_err());
238+
}
239+
240+
#[test]
241+
fn rejects_too_short_token() {
242+
assert!(validate_token("short").is_err());
243+
// 15 bytes is one under the cap.
244+
assert!(validate_token("123456789012345").is_err());
245+
}
246+
247+
#[test]
248+
fn accepts_realistic_token() {
249+
// 32 hex chars = 16 bytes of entropy if random.
250+
assert!(validate_token("a1b2c3d4e5f60718293a4b5c6d7e8f90").is_ok());
251+
}
252+
}

crates/forkd-controller/tests/http_integration.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,7 @@ async fn end_to_end_version_round_trips() {
138138

139139
#[tokio::test]
140140
async fn end_to_end_auth_rejects_missing_token() {
141-
let d = TestDaemon::start_with_token("s3cret").await;
141+
let d = TestDaemon::start_with_token("s3cret-test-token-1234567890").await;
142142
// /healthz is intentionally exempt — load balancers must probe.
143143
let h = reqwest::get(format!("{}/healthz", d.base)).await.unwrap();
144144
assert_eq!(h.status(), 200);
@@ -149,11 +149,11 @@ async fn end_to_end_auth_rejects_missing_token() {
149149

150150
#[tokio::test]
151151
async fn end_to_end_auth_accepts_valid_token() {
152-
let d = TestDaemon::start_with_token("s3cret").await;
152+
let d = TestDaemon::start_with_token("s3cret-test-token-1234567890").await;
153153
let client = reqwest::Client::new();
154154
let resp = client
155155
.get(format!("{}/version", d.base))
156-
.bearer_auth("s3cret")
156+
.bearer_auth("s3cret-test-token-1234567890")
157157
.send()
158158
.await
159159
.unwrap();

packaging/k8s/forkd-controller.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,10 @@ type: Opaque
4444
stringData:
4545
# Replace with `head -c 32 /dev/urandom | base64`. The controller
4646
# reads /etc/forkd/token; clients pass it as `Authorization: Bearer`.
47+
# The daemon REFUSES to start if this value is left at the literal
48+
# placeholder (any string beginning with "REPLACE_ME" or "CHANGE_ME"
49+
# is rejected at startup) — so forgetting this step is a noisy fail,
50+
# not a silent compromise.
4751
token: REPLACE_ME_WITH_32_BYTES_BASE64
4852
---
4953
apiVersion: apps/v1

0 commit comments

Comments
 (0)