From 03a2209aacf368560e0cd0d42e62cd2b15cf8167 Mon Sep 17 00:00:00 2001 From: James Devine Date: Sat, 13 Jun 2026 13:19:31 +0100 Subject: [PATCH] fix(ir): reject cross-pipeline duplicate StepIds; strict de-indent in lower_raw_yaml; document EnvValue::Literal contract Three follow-ups from the post-merge review of the native IR PR. 1. Bug: src/compile/ir/graph.rs index_job_steps only rejected duplicate StepIds within the SAME job. Two jobs each carrying a step with the same StepId (e.g. a future extension author accidentally reuses "synthPr" or "marker") would silently overwrite step_locations, and every OutputRef pointing to that StepId would resolve to whichever producer was indexed last. The IR's OutputRef carries only StepId (no job qualifier), so the implicit contract is that StepIds must be pipeline-wide unique for correct producer resolution - but that constraint was neither enforced nor documented. - index_job_steps now rejects any duplicate StepId regardless of job / stage, with a typed error citing both previous and current locations. - Uniqueness contract is now documented on the StepId module-level doc (src/compile/ir/ids.rs) and on the OutputRef type-level doc (src/compile/ir/output.rs). - Two new tests: cross_job_duplicate_step_id_is_rejected and cross_stage_duplicate_step_id_is_rejected. 2. Suggestion: src/compile/ir/lower.rs lower_raw_yaml's 2-space de-indent for `- ` prefixed bodies used .unwrap_or(line) as fallback. For inputs from step_to_raw_yaml_string the fallback was dead code, but for any future caller passing a string with different indentation the fallback would silently produce misaligned YAML (deferred parse failure several stack frames later in serde_yaml). Replaced with strict ok_or_else returning a typed error citing the offending line. Empty / whitespace-only continuation lines are kept verbatim (legal between mapping keys, between block scalars, etc.). Two new tests: raw_yaml_dash_prefix_rejects_unindented_continuation and raw_yaml_dash_prefix_accepts_blank_continuation_lines. 3. Suggestion: src/compile/ir/env.rs EnvValue::Literal had no doc-comment marking the compiler-internal contract. lower_env_value emits Literal(s) verbatim with no sanitisation; all current construction sites use hardcoded strings, but the type accepts any impl Into. A future EnvValue::literal(user_string) would silently embed raw user input in the env: block. Doc-comment on the variant now says "**Compiler-internal use only.**" and lists the safe input categories (hardcoded strings, constants, values pre-routed through reject_pipeline_injection) with pointers to PipelineVar / Secret / AdoMacro as the right choices for runtime-resolved values. Constructor doc updated to reference the contract. Validation: cargo build, cargo test (1833 passed, +4 new), cargo clippy --all-targets --all-features, cargo test --test bash_lint_tests, cargo run -- compile --force across all 33 fixtures produces zero lock-file drift - the new validation rejects only contract-violating input that the production compile path never emits today. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/compile/ir/env.rs | 18 +++++++- src/compile/ir/graph.rs | 88 ++++++++++++++++++++++++++++++++++------ src/compile/ir/ids.rs | 14 +++++++ src/compile/ir/lower.rs | 67 ++++++++++++++++++++++++++++-- src/compile/ir/output.rs | 9 +++- 5 files changed, 176 insertions(+), 20 deletions(-) diff --git a/src/compile/ir/env.rs b/src/compile/ir/env.rs index ce5e450d..c80522e2 100644 --- a/src/compile/ir/env.rs +++ b/src/compile/ir/env.rs @@ -43,7 +43,18 @@ use super::output::OutputRef; /// `env:` mapping entry. #[derive(Debug, Clone, PartialEq, Eq)] pub enum EnvValue { - /// Plain string literal. + /// Plain string literal — emitted **verbatim** into the YAML + /// `env:` value position with no escaping or sanitisation. + /// + /// **Compiler-internal use only.** Construction sites must pass + /// a hardcoded string, a constant, or a value derived from + /// front-matter that has already been routed through + /// `crate::validate::reject_pipeline_injection` (or a stronger + /// equivalent). Never construct from raw user-supplied input — + /// use [`EnvValue::PipelineVar`] (`$(NAME)` macro form) or + /// [`EnvValue::Secret`] when the value should come from an ADO + /// variable, or [`EnvValue::AdoMacro`] for predefined variables + /// on the allowlist. Literal(String), /// ADO predefined-variable macro. Must be a member of /// [`ALLOWED_ADO_MACROS`]. @@ -124,7 +135,10 @@ pub const ALLOWED_ADO_MACROS: &[&str] = &[ ]; impl EnvValue { - /// Construct an [`EnvValue::Literal`]. + /// Construct an [`EnvValue::Literal`]. **Compiler-internal use + /// only** — see the variant's doc-comment for the contract on + /// safe inputs (hardcoded strings, constants, or values + /// pre-validated against `reject_pipeline_injection`). pub fn literal(s: impl Into) -> Self { EnvValue::Literal(s.into()) } diff --git a/src/compile/ir/graph.rs b/src/compile/ir/graph.rs index 954726f6..ebb1a236 100644 --- a/src/compile/ir/graph.rs +++ b/src/compile/ir/graph.rs @@ -209,21 +209,34 @@ fn add_manual_job_edges(job: &super::job::Job, g: &mut Graph) { fn index_job_steps(stage: Option, job: &super::job::Job, g: &mut Graph) -> Result<()> { for step in &job.steps { if let Some(id) = step.id() { - // Step ids are pipeline-wide identifiers in ADO when - // referenced via `dependencies..outputs['.X']`, - // so duplicate ids across jobs are technically allowed if - // both jobs are referenced through the qualifying job - // name. We still reject true duplicates inside the SAME - // job, which would silently shadow. - if let Some(prev) = g.step_locations.get(id) - && prev.stage == stage - && prev.job == job.id - { - bail!( - "ir::graph: duplicate StepId '{}' inside job '{}'", - id, + // [`super::output::OutputRef`] carries only a [`StepId`] — + // no qualifying job name — so the IR's producer-resolution + // is keyed on `StepId` alone. Duplicate ids across jobs + // (or stages) would silently overwrite `step_locations` + // and every `OutputRef` to that id would resolve to + // whichever producer was indexed last. ADO YAML allows + // qualifying duplicates at the wire level (`dependencies..outputs[...]`) + // but the IR does not model the job-qualified form, so + // duplicates are rejected pipeline-wide. + if let Some(prev) = g.step_locations.get(id) { + let prev_loc = format!( + "{}.{}", + prev.stage + .as_ref() + .map(|s| s.as_str()) + .unwrap_or(""), + prev.job + ); + let this_loc = format!( + "{}.{}", + stage.as_ref().map(|s| s.as_str()).unwrap_or(""), job.id ); + bail!( + "ir::graph: duplicate StepId '{id}' — previously declared in {prev_loc}, \ + now also declared in {this_loc}. StepIds must be pipeline-wide unique \ + because OutputRef carries only the StepId (no job qualifier)." + ); } let outputs: BTreeSet = collect_step_outputs(step); g.step_locations.insert( @@ -899,6 +912,55 @@ mod tests { assert!(msg.contains("B")); } + #[test] + fn cross_job_duplicate_step_id_is_rejected() { + // OutputRef carries only StepId (no job qualifier), so a + // duplicate StepId across two jobs would silently overwrite + // step_locations and every consumer would resolve to the + // last-indexed producer. Reject loudly at graph-build time. + let dup = StepId::new("synthPr").unwrap(); + let mut job_a = Job::new(JobId::new("A").unwrap(), "A", pool()); + job_a.push_step(Step::Bash( + BashStep::new("s", "echo a") + .with_id(dup.clone()) + .with_output(OutputDecl::new("X")), + )); + let mut job_b = Job::new(JobId::new("B").unwrap(), "B", pool()); + job_b.push_step(Step::Bash( + BashStep::new("s", "echo b") + .with_id(dup) + .with_output(OutputDecl::new("X")), + )); + let p = pipe(PipelineBody::Jobs(vec![job_a, job_b])); + let err = build_graph(&p).unwrap_err(); + let msg = format!("{err:#}"); + assert!(msg.contains("duplicate StepId 'synthPr'"), "got: {msg}"); + assert!(msg.contains("previously declared in"), "got: {msg}"); + assert!(msg.contains("pipeline-wide unique"), "got: {msg}"); + } + + #[test] + fn cross_stage_duplicate_step_id_is_rejected() { + // Same contract across stages. + let dup = StepId::new("marker").unwrap(); + let mut job_a = Job::new(JobId::new("A").unwrap(), "A", pool()); + job_a.push_step(Step::Bash( + BashStep::new("m", "echo a").with_id(dup.clone()), + )); + let mut stage_a = Stage::new(StageId::new("StageA").unwrap(), "StageA"); + stage_a.push_job(job_a); + + let mut job_b = Job::new(JobId::new("B").unwrap(), "B", pool()); + job_b.push_step(Step::Bash(BashStep::new("m", "echo b").with_id(dup))); + let mut stage_b = Stage::new(StageId::new("StageB").unwrap(), "StageB"); + stage_b.push_job(job_b); + + let p = pipe(PipelineBody::Stages(vec![stage_a, stage_b])); + let err = build_graph(&p).unwrap_err(); + let msg = format!("{err:#}"); + assert!(msg.contains("duplicate StepId 'marker'"), "got: {msg}"); + } + #[test] fn stage_condition_rejects_same_stage_step_output() { let producer_id = StepId::new("producer").unwrap(); diff --git a/src/compile/ir/ids.rs b/src/compile/ir/ids.rs index ee953cc9..683edff3 100644 --- a/src/compile/ir/ids.rs +++ b/src/compile/ir/ids.rs @@ -16,6 +16,20 @@ //! [`Result`] so call sites can surface a meaningful error rather //! than panic. //! +//! ## Uniqueness contract +//! +//! - [`StageId`] is unique within a [`super::Pipeline`]. +//! - [`JobId`] is unique within a stage (or pipeline-wide for +//! stage-less pipelines). The graph builder rejects duplicates +//! with a typed error. +//! - **[`StepId`] is unique pipeline-wide** — not just within a job. +//! [`super::output::OutputRef`] carries only a `StepId` (no +//! qualifying job name), so the IR's producer-resolution is keyed +//! on `StepId` alone. The graph builder rejects cross-job +//! duplicates with a typed error. (ADO YAML technically allows +//! `dependencies..outputs[...]` to disambiguate, but the IR +//! does not model the job-qualified form.) +//! //! `Display` round-trips to the original string. `AsRef` is //! provided so ids slot into format strings cheaply. diff --git a/src/compile/ir/lower.rs b/src/compile/ir/lower.rs index 66549a0d..705fb676 100644 --- a/src/compile/ir/lower.rs +++ b/src/compile/ir/lower.rs @@ -824,8 +824,18 @@ pub(crate) fn lower_step(step: &Step, ctx: &LoweringContext<'_>) -> Result Result { let trimmed = raw.trim_start(); let body = if let Some(rest) = trimmed.strip_prefix("- ") { @@ -836,9 +846,24 @@ fn lower_raw_yaml(raw: &str) -> Result { for (i, line) in rest.split_inclusive('\n').enumerate() { if i == 0 { out.push_str(line); - } else { - out.push_str(line.strip_prefix(" ").unwrap_or(line)); + continue; } + // Empty / whitespace-only continuation lines are fine + // verbatim (a literal block-scalar terminator, a blank + // line between keys, etc.). + if line.trim().is_empty() { + out.push_str(line); + continue; + } + let dedented = line.strip_prefix(" ").ok_or_else(|| { + anyhow::anyhow!( + "ir::lower: Step::RawYaml continuation line is not indented by at \ + least two spaces (the leading `- ` prefix on the first line \ + requires every subsequent non-blank line to start with at least \ + two spaces so the mapping parses as a top-level document). Line: {line:?}" + ) + })?; + out.push_str(dedented); } out } else { @@ -1581,6 +1606,40 @@ mod tests { assert!(format!("{err:#}").contains("Step::RawYaml")); } + #[test] + fn raw_yaml_dash_prefix_rejects_unindented_continuation() { + // A `- bash: ...` first line obligates every non-blank + // continuation line to start with at least two spaces (the + // standard YAML sequence-item indent that the dedent strips). + // The previous `unwrap_or(line)` fallback silently emitted + // misaligned YAML; the strict form bails with a clear error + // citing the offending line. + let body = "- bash: echo hi\n displayName: greet\nnot-indented: oops\n"; + let err = super::lower_raw_yaml(body).unwrap_err(); + let msg = format!("{err:#}"); + assert!( + msg.contains("not indented by at least two spaces"), + "got: {msg}" + ); + assert!(msg.contains("not-indented: oops"), "got: {msg}"); + } + + #[test] + fn raw_yaml_dash_prefix_accepts_blank_continuation_lines() { + // Blank/whitespace-only continuation lines are legal between + // mapping keys; the strict dedent must not bail on them. + let body = "- bash: echo hi\n\n displayName: greet\n"; + let v = super::lower_raw_yaml(body).unwrap(); + let m = match v { + Value::Mapping(m) => m, + _ => panic!("expected mapping"), + }; + assert_eq!( + m.get(Value::String("displayName".into())).and_then(|v| v.as_str()), + Some("greet") + ); + } + // ── Phase 0: top-level pipeline lowering tests ───────────────── /// `parameters:` with a Boolean default round-trips through emit diff --git a/src/compile/ir/output.rs b/src/compile/ir/output.rs index 5fce7560..b41fe5a4 100644 --- a/src/compile/ir/output.rs +++ b/src/compile/ir/output.rs @@ -93,9 +93,16 @@ impl OutputDecl { /// output it wants; at lower time the IR picks the correct ADO /// reference syntax based on whether the consumer lives in the same /// job / a sibling job in the same stage / a different stage. +/// +/// **Uniqueness:** the [`StepId`] is the *only* key used to resolve +/// the producer's location — there is no job-qualified form here. +/// Step IDs are therefore required to be pipeline-wide unique; see +/// [`crate::compile::ir::ids`] for the contract and +/// [`crate::compile::ir::graph::build_graph`] for the enforcement. #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct OutputRef { - /// The producer step's id. + /// The producer step's id. **Must be pipeline-wide unique** — + /// see type-level doc on uniqueness. pub step: StepId, /// The output variable name (must match an [`OutputDecl::name`] /// on the producer).