Distinguish the two kinds of token range.

When collecting tokens there are two kinds of range: - a range relative to the parser's full token stream (which we get when we are parsing); - a range relative to a single AST node's token stream (which we use within `LazyAttrTokenStreamImpl` when replacing tokens). These are currently both represented with `Range<u32>` and it's easy to mix them up -- until now I hadn't properly understood the difference. This commit introduces `ParserRange` and `NodeRange` to distinguish them. This also requires splitting `ReplaceRange` in two, giving the new types `ParserReplacement` and `NodeReplacement`. (These latter two names reduce the overloading of the word "range".) The commit also rewrites some comments to be clearer. The end result is a little more verbose, but much clearer.
rust-lang · jieyouxu · Jul 10, 2024 · Jul 10, 2024 · Jul 11, 2024 · Jul 28, 2024
commit d1f05fd1848fc68ed89d17f7937e358dacd8aed4
diff --git a/compiler/rustc_builtin_macros/src/cfg_eval.rs b/compiler/rustc_builtin_macros/src/cfg_eval.rs
@@ -202,7 +202,7 @@ impl CfgEval<'_> {
         }
 
         // Now that we have our re-parsed `AttrTokenStream`, recursively configuring
-        // our attribute target will correctly the tokens as well.
+        // our attribute target will correctly configure the tokens as well.
         flat_map_annotatable(self, annotatable)
     }
 }

diff --git a/compiler/rustc_parse/src/parser/attr.rs b/compiler/rustc_parse/src/parser/attr.rs
@@ -8,7 +8,7 @@ use rustc_span::{sym, BytePos, Span};
 use thin_vec::ThinVec;
 use tracing::debug;
 
-use super::{AttrWrapper, Capturing, FnParseMode, ForceCollect, Parser, PathStyle};
+use super::{AttrWrapper, Capturing, FnParseMode, ForceCollect, Parser, ParserRange, PathStyle};
 use crate::{errors, fluent_generated as fluent, maybe_whole};
 
 // Public for rustfmt usage
@@ -307,8 +307,8 @@ impl<'a> Parser<'a> {
                 // inner attribute, for possible later processing in a `LazyAttrTokenStream`.
                 if let Capturing::Yes = self.capture_state.capturing {
                     let end_pos = self.num_bump_calls;
-                    let range = start_pos..end_pos;
-                    self.capture_state.inner_attr_ranges.insert(attr.id, range);
+                    let parser_range = ParserRange(start_pos..end_pos);
+                    self.capture_state.inner_attr_parser_ranges.insert(attr.id, parser_range);
                 }
                 attrs.push(attr);
             } else {

diff --git a/compiler/rustc_parse/src/parser/attr_wrapper.rs b/compiler/rustc_parse/src/parser/attr_wrapper.rs
@@ -10,7 +10,10 @@ use rustc_errors::PResult;
 use rustc_session::parse::ParseSess;
 use rustc_span::{sym, Span, DUMMY_SP};
 
-use super::{Capturing, FlatToken, ForceCollect, Parser, ReplaceRange, TokenCursor};
+use super::{
+    Capturing, FlatToken, ForceCollect, NodeRange, NodeReplacement, Parser, ParserRange,
+    TokenCursor,
+};
 
 /// A wrapper type to ensure that the parser handles outer attributes correctly.
 /// When we parse outer attributes, we need to ensure that we capture tokens
@@ -28,8 +31,8 @@ use super::{Capturing, FlatToken, ForceCollect, Parser, ReplaceRange, TokenCurso
 #[derive(Debug, Clone)]
 pub struct AttrWrapper {
     attrs: AttrVec,
-    // The start of the outer attributes in the token cursor.
-    // This allows us to create a `ReplaceRange` for the entire attribute
+    // The start of the outer attributes in the parser's token stream.
+    // This lets us create a `NodeReplacement` for the entire attribute
     // target, including outer attributes.
     start_pos: u32,
 }
@@ -88,7 +91,7 @@ struct LazyAttrTokenStreamImpl {
     cursor_snapshot: TokenCursor,
     num_calls: u32,
     break_last_token: bool,
-    replace_ranges: Box<[ReplaceRange]>,
+    node_replacements: Box<[NodeReplacement]>,
 }
 
 impl ToAttrTokenStream for LazyAttrTokenStreamImpl {
@@ -103,21 +106,24 @@ impl ToAttrTokenStream for LazyAttrTokenStreamImpl {
             .chain(iter::repeat_with(|| FlatToken::Token(cursor_snapshot.next())))
             .take(self.num_calls as usize);
 
-        if self.replace_ranges.is_empty() {
+        if self.node_replacements.is_empty() {
             make_attr_token_stream(tokens, self.break_last_token)
         } else {
             let mut tokens: Vec<_> = tokens.collect();
-            let mut replace_ranges = self.replace_ranges.to_vec();
-            replace_ranges.sort_by_key(|(range, _)| range.start);
+            let mut node_replacements = self.node_replacements.to_vec();
+            node_replacements.sort_by_key(|(range, _)| range.0.start);
 
             #[cfg(debug_assertions)]
-            for [(range, tokens), (next_range, next_tokens)] in replace_ranges.array_windows() {
+            for [(node_range, tokens), (next_node_range, next_tokens)] in
+                node_replacements.array_windows()
+            {
                 assert!(
-                    range.end <= next_range.start || range.end >= next_range.end,
-                    "Replace ranges should either be disjoint or nested: ({:?}, {:?}) ({:?}, {:?})",
-                    range,
+                    node_range.0.end <= next_node_range.0.start
+                        || node_range.0.end >= next_node_range.0.end,
+                    "Node ranges should be disjoint or nested: ({:?}, {:?}) ({:?}, {:?})",
+                    node_range,
                     tokens,
-                    next_range,
+                    next_node_range,
                     next_tokens,
                 );
             }
@@ -135,20 +141,23 @@ impl ToAttrTokenStream for LazyAttrTokenStreamImpl {
             // start position, we ensure that any (outer) replace range which
             // encloses another (inner) replace range will fully overwrite the
             // inner range's replacement.
-            for (range, target) in replace_ranges.into_iter().rev() {
-                assert!(!range.is_empty(), "Cannot replace an empty range: {range:?}");
+            for (node_range, target) in node_replacements.into_iter().rev() {
+                assert!(
+                    !node_range.0.is_empty(),
+                    "Cannot replace an empty node range: {:?}",
+                    node_range.0
+                );
 
                 // Replace the tokens in range with zero or one `FlatToken::AttrsTarget`s, plus
                 // enough `FlatToken::Empty`s to fill up the rest of the range. This keeps the
                 // total length of `tokens` constant throughout the replacement process, allowing
-                // us to use all of the `ReplaceRanges` entries without adjusting indices.
+                // us to do all replacements without adjusting indices.
                 let target_len = target.is_some() as usize;
                 tokens.splice(
-                    (range.start as usize)..(range.end as usize),
-                    target
-                        .into_iter()
-                        .map(|target| FlatToken::AttrsTarget(target))
-                        .chain(iter::repeat(FlatToken::Empty).take(range.len() - target_len)),
+                    (node_range.0.start as usize)..(node_range.0.end as usize),
+                    target.into_iter().map(|target| FlatToken::AttrsTarget(target)).chain(
+                        iter::repeat(FlatToken::Empty).take(node_range.0.len() - target_len),
+                    ),
                 );
             }
             make_attr_token_stream(tokens.into_iter(), self.break_last_token)
@@ -215,7 +224,7 @@ impl<'a> Parser<'a> {
         let cursor_snapshot = self.token_cursor.clone();
         let start_pos = self.num_bump_calls;
         let has_outer_attrs = !attrs.attrs.is_empty();
-        let replace_ranges_start = self.capture_state.replace_ranges.len();
+        let parser_replacements_start = self.capture_state.parser_replacements.len();
 
         // We set and restore `Capturing::Yes` on either side of the call to
         // `f`, so we can distinguish the outermost call to
@@ -270,7 +279,7 @@ impl<'a> Parser<'a> {
             return Ok(ret);
         }
 
-        let replace_ranges_end = self.capture_state.replace_ranges.len();
+        let parser_replacements_end = self.capture_state.parser_replacements.len();
 
         assert!(
             !(self.break_last_token && capture_trailing),
@@ -287,15 +296,16 @@ impl<'a> Parser<'a> {
 
         let num_calls = end_pos - start_pos;
 
-        // Take the captured ranges for any inner attributes that we parsed in
-        // `Parser::parse_inner_attributes`, and pair them in a `ReplaceRange`
-        // with `None`, which means the relevant tokens will be removed. (More
-        // details below.)
-        let mut inner_attr_replace_ranges = Vec::new();
+        // Take the captured `ParserRange`s for any inner attributes that we parsed in
+        // `Parser::parse_inner_attributes`, and pair them in a `ParserReplacement` with `None`,
+        // which means the relevant tokens will be removed. (More details below.)
+        let mut inner_attr_parser_replacements = Vec::new();
         for attr in ret.attrs() {
             if attr.style == ast::AttrStyle::Inner {
-                if let Some(attr_range) = self.capture_state.inner_attr_ranges.remove(&attr.id) {
-                    inner_attr_replace_ranges.push((attr_range, None));
+                if let Some(inner_attr_parser_range) =
+                    self.capture_state.inner_attr_parser_ranges.remove(&attr.id)
+                {
+                    inner_attr_parser_replacements.push((inner_attr_parser_range, None));
                 } else {
                     self.dcx().span_delayed_bug(attr.span, "Missing token range for attribute");
                 }
@@ -304,37 +314,41 @@ impl<'a> Parser<'a> {
 
         // This is hot enough for `deep-vector` that checking the conditions for an empty iterator
         // is measurably faster than actually executing the iterator.
-        let replace_ranges: Box<[ReplaceRange]> =
-            if replace_ranges_start == replace_ranges_end && inner_attr_replace_ranges.is_empty() {
-                Box::new([])
-            } else {
-                // Grab any replace ranges that occur *inside* the current AST node. We will
-                // perform the actual replacement only when we convert the `LazyAttrTokenStream` to
-                // an `AttrTokenStream`.
-                self.capture_state.replace_ranges[replace_ranges_start..replace_ranges_end]
-                    .iter()
-                    .cloned()
-                    .chain(inner_attr_replace_ranges.iter().cloned())
-                    .map(|(range, data)| ((range.start - start_pos)..(range.end - start_pos), data))
-                    .collect()
-            };
+        let node_replacements: Box<[_]> = if parser_replacements_start == parser_replacements_end
+            && inner_attr_parser_replacements.is_empty()
+        {
+            Box::new([])
+        } else {
+            // Grab any replace ranges that occur *inside* the current AST node. Convert them
+            // from `ParserRange` form to `NodeRange` form. We will perform the actual
+            // replacement only when we convert the `LazyAttrTokenStream` to an
+            // `AttrTokenStream`.
+            self.capture_state.parser_replacements
+                [parser_replacements_start..parser_replacements_end]
+                .iter()
+                .cloned()
+                .chain(inner_attr_parser_replacements.iter().cloned())
+                .map(|(parser_range, data)| (NodeRange::new(parser_range, start_pos), data))
+                .collect()
+        };
 
         // What is the status here when parsing the example code at the top of this method?
         //
         // When parsing `g`:
         // - `start_pos..end_pos` is `12..33` (`fn g { ... }`, excluding the outer attr).
-        // - `inner_attr_replace_ranges` has one entry (`5..15`, when counting from `fn`), to
+        // - `inner_attr_parser_replacements` has one entry (`ParserRange(17..27)`), to
         //   delete the inner attr's tokens.
-        //   - This entry is put into the lazy tokens for `g`, i.e. deleting the inner attr from
-        //     those tokens (if they get evaluated).
+        //   - This entry is converted to `NodeRange(5..15)` (relative to the `fn`) and put into
+        //     the lazy tokens for `g`, i.e. deleting the inner attr from those tokens (if they get
+        //     evaluated).
         //   - Those lazy tokens are also put into an `AttrsTarget` that is appended to `self`'s
         //     replace ranges at the bottom of this function, for processing when parsing `m`.
-        // - `replace_ranges_start..replace_ranges_end` is empty.
+        // - `parser_replacements_start..parser_replacements_end` is empty.
         //
         // When parsing `m`:
         // - `start_pos..end_pos` is `0..34` (`mod m`, excluding the `#[cfg_eval]` attribute).
-        // - `inner_attr_replace_ranges` is empty.
-        // - `replace_range_start..replace_ranges_end` has one entry.
+        // - `inner_attr_parser_replacements` is empty.
+        // - `parser_replacements_start..parser_replacements_end` has one entry.
         //   - One `AttrsTarget` (added below when parsing `g`) to replace all of `g` (`3..33`,
         //     including its outer attribute), with:
         //     - `attrs`: includes the outer and the inner attr.
@@ -345,7 +359,7 @@ impl<'a> Parser<'a> {
             num_calls,
             cursor_snapshot,
             break_last_token: self.break_last_token,
-            replace_ranges,
+            node_replacements,
         });
 
         // If we support tokens and don't already have them, store the newly captured tokens.
@@ -366,7 +380,7 @@ impl<'a> Parser<'a> {
             // What is the status here when parsing the example code at the top of this method?
             //
             // When parsing `g`, we add one entry:
-            // - The `start_pos..end_pos` (`3..33`) entry has a new `AttrsTarget` with:
+            // - The pushed entry (`ParserRange(3..33)`) has a new `AttrsTarget` with:
             //   - `attrs`: includes the outer and the inner attr.
             //   - `tokens`: lazy tokens for `g` (with its inner attr deleted).
             //
@@ -377,12 +391,14 @@ impl<'a> Parser<'a> {
             // cfg-expand this AST node.
             let start_pos = if has_outer_attrs { attrs.start_pos } else { start_pos };
             let target = AttrsTarget { attrs: ret.attrs().iter().cloned().collect(), tokens };
-            self.capture_state.replace_ranges.push((start_pos..end_pos, Some(target)));
+            self.capture_state
+                .parser_replacements
+                .push((ParserRange(start_pos..end_pos), Some(target)));
         } else if matches!(self.capture_state.capturing, Capturing::No) {
             // Only clear the ranges once we've finished capturing entirely, i.e. we've finished
             // the outermost call to this method.
-            self.capture_state.replace_ranges.clear();
-            self.capture_state.inner_attr_ranges.clear();
+            self.capture_state.parser_replacements.clear();
+            self.capture_state.inner_attr_parser_ranges.clear();
         }
         Ok(ret)
     }

diff --git a/compiler/rustc_parse/src/parser/mod.rs b/compiler/rustc_parse/src/parser/mod.rs
@@ -192,24 +192,54 @@ struct ClosureSpans {
     body: Span,
 }
 
-/// Indicates a range of tokens that should be replaced by
-/// the tokens in the provided `AttrsTarget`. This is used in two
-/// places during token collection:
+/// A token range within a `Parser`'s full token stream.
+#[derive(Clone, Debug)]
+struct ParserRange(Range<u32>);
+
+/// A token range within an individual AST node's (lazy) token stream, i.e.
+/// relative to that node's first token. Distinct from `ParserRange` so the two
+/// kinds of range can't be mixed up.
+#[derive(Clone, Debug)]
+struct NodeRange(Range<u32>);
+
+/// Indicates a range of tokens that should be replaced by an `AttrsTarget`
+/// (replacement) or be replaced by nothing (deletion). This is used in two
+/// places during token collection.
+///
+/// 1. Replacement. During the parsing of an AST node that may have a
+///    `#[derive]` attribute, when we parse a nested AST node that has `#[cfg]`
+///    or `#[cfg_attr]`, we replace the entire inner AST node with
+///    `FlatToken::AttrsTarget`. This lets us perform eager cfg-expansion on an
+///    `AttrTokenStream`.
 ///
-/// 1. During the parsing of an AST node that may have a `#[derive]`
-/// attribute, we parse a nested AST node that has `#[cfg]` or `#[cfg_attr]`
-/// In this case, we use a `ReplaceRange` to replace the entire inner AST node
-/// with `FlatToken::AttrsTarget`, allowing us to perform eager cfg-expansion
-/// on an `AttrTokenStream`.
+/// 2. Deletion. We delete inner attributes from all collected token streams,
+///    and instead track them through the `attrs` field on the AST node. This
+///    lets us manipulate them similarly to outer attributes. When we create a
+///    `TokenStream`, the inner attributes are inserted into the proper place
+///    in the token stream.
 ///
-/// 2. When we parse an inner attribute while collecting tokens. We
-/// remove inner attributes from the token stream entirely, and
-/// instead track them through the `attrs` field on the AST node.
-/// This allows us to easily manipulate them (for example, removing
-/// the first macro inner attribute to invoke a proc-macro).
-/// When create a `TokenStream`, the inner attributes get inserted
-/// into the proper place in the token stream.
-type ReplaceRange = (Range<u32>, Option<AttrsTarget>);
+/// Each replacement starts off in `ParserReplacement` form but is converted to
+/// `NodeReplacement` form when it is attached to a single AST node, via
+/// `LazyAttrTokenStreamImpl`.
+type ParserReplacement = (ParserRange, Option<AttrsTarget>);
+
+/// See the comment on `ParserReplacement`.
+type NodeReplacement = (NodeRange, Option<AttrsTarget>);
+
+impl NodeRange {
+    // Converts a range within a parser's tokens to a range within a
+    // node's tokens beginning at `start_pos`.
+    //
+    // For example, imagine a parser with 50 tokens in its token stream, a
+    // function that spans `ParserRange(20..40)` and an inner attribute within
+    // that function that spans `ParserRange(30..35)`. We would find the inner
+    // attribute's range within the function's tokens by subtracting 20, which
+    // is the position of the function's start token. This gives
+    // `NodeRange(10..15)`.
+    fn new(ParserRange(parser_range): ParserRange, start_pos: u32) -> NodeRange {
+        NodeRange((parser_range.start - start_pos)..(parser_range.end - start_pos))
+    }
+}
 
 /// Controls how we capture tokens. Capturing can be expensive,
 /// so we try to avoid performing capturing in cases where
@@ -226,8 +256,8 @@ enum Capturing {
 #[derive(Clone, Debug)]
 struct CaptureState {
     capturing: Capturing,
-    replace_ranges: Vec<ReplaceRange>,
-    inner_attr_ranges: FxHashMap<AttrId, Range<u32>>,
+    parser_replacements: Vec<ParserReplacement>,
+    inner_attr_parser_ranges: FxHashMap<AttrId, ParserRange>,
 }
 
 /// Iterator over a `TokenStream` that produces `Token`s. It's a bit odd that
@@ -417,8 +447,8 @@ impl<'a> Parser<'a> {
             subparser_name,
             capture_state: CaptureState {
                 capturing: Capturing::No,
-                replace_ranges: Vec::new(),
-                inner_attr_ranges: Default::default(),
+                parser_replacements: Vec::new(),
+                inner_attr_parser_ranges: Default::default(),
             },
             current_closure: None,
             recovery: Recovery::Allowed,