Skip to content

Commit cd96dd8

Browse files
committed
Rework the description of the top-level nonterminals
Change the name "token-kind nonterminal" to "tokenisation nonterminal". The term "token nonterminal" now refers to a tokenisation nonterminal which isn't a reserved form, rather than to the TOKEN_yyy nonterminals. Define these nonterminals explicitly using the fine-grained tokens table and a list of reserved forms, rather than saying they are the nonterminals which appear in the top-level choice expressions.
1 parent b96cbc0 commit cd96dd8

File tree

13 files changed

+132
-110
lines changed

13 files changed

+132
-110
lines changed

src/framework/simple_reports.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ use crate::datatypes::trees::Forest;
2020
use crate::reimplementation::cleaning::{self, CleaningOutcome};
2121
use crate::reimplementation::doc_lowering::lower_doc_comments;
2222
use crate::reimplementation::fine_tokens::FineToken;
23-
use crate::reimplementation::tokenisation::{self, TokenKindMatch};
23+
use crate::reimplementation::tokenisation::{self, TokenisationMatch};
2424
use crate::rustc_harness::lex_via_rustc;
2525
use crate::tokens_common::Origin;
2626
use crate::{CleaningMode, Edition, Lowering};
@@ -128,7 +128,7 @@ pub enum DetailsMode {
128128
Always,
129129
}
130130

131-
fn describe_match(match_data: &TokenKindMatch) -> impl Iterator<Item = String> + use<'_> {
131+
fn describe_match(match_data: &TokenisationMatch) -> impl Iterator<Item = String> + use<'_> {
132132
once(format!(
133133
"{:?}, {:?}",
134134
match_data.matched_nonterminal, match_data.consumed
@@ -249,7 +249,7 @@ fn show_inspect(input: &str, edition: Edition, cleaning: CleaningMode, lowering:
249249
println!(" error: {message}");
250250
}
251251
}
252-
println!(" -- token-kind nonterminal matches --");
252+
println!(" -- tokenisation nonterminal matches --");
253253
for match_data in matches {
254254
for s in describe_match(&match_data) {
255255
println!(" {s}",);
@@ -274,7 +274,7 @@ fn show_inspect(input: &str, edition: Edition, cleaning: CleaningMode, lowering:
274274
}
275275
tokenisation::Reason::Processing(message, rejected, matches, tokens) => {
276276
println!(
277-
"lex_via_peg: {failure_label} when processing a match of a token-kind nonterminal"
277+
"lex_via_peg: {failure_label} when processing a match of a tokenisation nonterminal"
278278
);
279279
println!(" error: {message}");
280280
println!(" -- when considering match --");
@@ -284,7 +284,7 @@ fn show_inspect(input: &str, edition: Edition, cleaning: CleaningMode, lowering:
284284
(matches, tokens)
285285
}
286286
};
287-
println!(" -- previous token-kind nonterminal matches --");
287+
println!(" -- previous tokenisation nonterminal matches --");
288288
for match_data in matches {
289289
for s in describe_match(&match_data) {
290290
println!(" {s}");

src/reimplementation/tokenisation.rs

Lines changed: 23 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ use super::fine_tokens::FineToken;
88
mod processing;
99
mod tokens_matching;
1010

11-
pub use tokens_matching::TokenKindMatch;
11+
pub use tokens_matching::TokenisationMatch;
1212
use tokens_matching::TokensMatchData;
1313

1414
const MAX_INPUT_LENGTH: usize = 0x100_0000;
@@ -39,7 +39,7 @@ pub fn analyse(input: &Charseq, edition: Edition) -> Analysis {
3939
}
4040

4141
let TokensMatchData {
42-
token_kind_matches,
42+
tokenisation_matches,
4343
consumed_entire_input: matched_entire_input,
4444
} = match tokens_matching::match_tokens(edition, input.chars()) {
4545
Ok(tokens_match_data) => tokens_match_data,
@@ -48,11 +48,11 @@ pub fn analyse(input: &Charseq, edition: Edition) -> Analysis {
4848
}
4949
};
5050

51-
// Note that if there's a processing error we only report the token-kind matches up to the match
52-
// that failed processing.
51+
// Note that if there's a processing error we only report the tokenisation matches up to the
52+
// match that failed processing.
5353
let mut tokens = Vec::new();
5454
let mut reported_matches = Vec::new();
55-
for match_data in token_kind_matches {
55+
for match_data in tokenisation_matches {
5656
match processing::process(&match_data) {
5757
Ok(token) => {
5858
reported_matches.push(match_data);
@@ -91,7 +91,7 @@ pub fn analyse(input: &Charseq, edition: Edition) -> Analysis {
9191
/// Result of running lexical analysis on a string.
9292
pub enum Analysis {
9393
/// Lexical analysis accepted the input.
94-
Accepts(Vec<TokenKindMatch>, Vec<FineToken>),
94+
Accepts(Vec<TokenisationMatch>, Vec<FineToken>),
9595

9696
/// Lexical analysis rejected the input.
9797
Rejects(Reason),
@@ -102,23 +102,28 @@ pub enum Analysis {
102102

103103
/// Explanation of why and where input was rejected.
104104
pub enum Reason {
105-
/// Rejected when trying to match the edition's token nonterminal.
105+
/// Rejected when trying to match the edition's tokens nonterminal.
106106
///
107107
/// The string describes the reason for rejection (or a model error).
108108
///
109-
/// The lists of matches and tokens represent what was lexed successfully before the token
109+
/// The lists of matches and tokens represent what was lexed successfully before the tokens
110110
/// nonterminal ceased to match.
111-
Matching(String, Vec<TokenKindMatch>, Vec<FineToken>),
111+
Matching(String, Vec<TokenisationMatch>, Vec<FineToken>),
112112

113-
/// Rejected when processing a match of a token-kind nonterminal.
113+
/// Rejected when processing a match of a tokenisation nonterminal.
114114
///
115115
/// The string describes the reason for rejection (or a model error).
116116
///
117117
/// The single MatchData describes the match which was rejected (or which was being processed
118118
/// when we encountered a problem with the model).
119119
///
120120
/// The lists of matches and tokens represent what was lexed successfully first.
121-
Processing(String, TokenKindMatch, Vec<TokenKindMatch>, Vec<FineToken>),
121+
Processing(
122+
String,
123+
TokenisationMatch,
124+
Vec<TokenisationMatch>,
125+
Vec<FineToken>,
126+
),
122127
}
123128

124129
impl Reason {
@@ -159,13 +164,13 @@ impl Reason {
159164
/// Otherwise returns None.
160165
pub fn lex_as_single_token(input: &[char], edition: Edition) -> Option<FineToken> {
161166
let Ok(TokensMatchData {
162-
token_kind_matches,
167+
tokenisation_matches,
163168
consumed_entire_input: true,
164169
}) = tokens_matching::match_tokens(edition, input)
165170
else {
166171
return None;
167172
};
168-
let [match_data] = &token_kind_matches[..] else {
173+
let [match_data] = &tokenisation_matches[..] else {
169174
return None;
170175
};
171176
processing::process(match_data).ok()
@@ -186,14 +191,15 @@ pub fn first_nonwhitespace_token(input: &[char], edition: Edition) -> Option<Fin
186191

187192
use crate::reimplementation::fine_tokens::{CommentStyle, FineTokenData::*};
188193

189-
let token_kind_matches = match tokens_matching::match_tokens(edition, input) {
194+
let tokenisation_matches = match tokens_matching::match_tokens(edition, input) {
190195
Ok(TokensMatchData {
191-
token_kind_matches, ..
192-
}) => token_kind_matches,
196+
tokenisation_matches,
197+
..
198+
}) => tokenisation_matches,
193199
Err(_) => return None,
194200
};
195201

196-
for match_data in token_kind_matches {
202+
for match_data in tokenisation_matches {
197203
let Ok(token) = processing::process(&match_data) else {
198204
return None;
199205
};

src/reimplementation/tokenisation/processing.rs

Lines changed: 24 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -8,18 +8,18 @@ use crate::reimplementation::tokenisation::processing::escape_processing::{
88
};
99
use crate::tokens_common::{NumericBase, Origin};
1010

11-
use super::tokens_matching::{Nonterminal, TokenKindMatch};
11+
use super::tokens_matching::{Nonterminal, TokenisationMatch};
1212

1313
mod escape_processing;
1414

1515
/// Converts a match to a fine-grained token, or rejects the match.
1616
///
17-
/// This is the "Processing a match" stage of extracting a fine-grained token.
17+
/// This implements "Processing a tokenisation nonterminal match".
1818
///
1919
/// If the match is accepted, returns a fine-grained token.
2020
///
21-
/// If the match is rejected, distinguishes rejection from "model error".
22-
pub fn process(match_data: &TokenKindMatch) -> Result<FineToken, Error> {
21+
/// If the match is not accepted, distinguishes rejection from "model error".
22+
pub fn process(match_data: &TokenisationMatch) -> Result<FineToken, Error> {
2323
let token_data = match match_data.matched_nonterminal {
2424
Nonterminal::Whitespace => process_whitespace(match_data)?,
2525
Nonterminal::Line_comment => process_line_comment(match_data)?,
@@ -54,7 +54,7 @@ pub fn process(match_data: &TokenKindMatch) -> Result<FineToken, Error> {
5454
match_data.matched_nonterminal
5555
)));
5656
}
57-
_ => return model_error("unhandled token-kind nonterminal"),
57+
_ => return model_error("unhandled tokenisation nonterminal"),
5858
};
5959
Ok(FineToken {
6060
data: token_data,
@@ -97,7 +97,7 @@ impl From<escape_processing::Error> for Error {
9797
}
9898
}
9999

100-
impl TokenKindMatch {
100+
impl TokenisationMatch {
101101
/// Returns the characters consumed by the specified subsidiary nonterminal, or None if that
102102
/// nonterminal did not participate in the match.
103103
///
@@ -138,11 +138,11 @@ impl TokenKindMatch {
138138
}
139139
}
140140

141-
fn process_whitespace(_m: &TokenKindMatch) -> Result<FineTokenData, Error> {
141+
fn process_whitespace(_m: &TokenisationMatch) -> Result<FineTokenData, Error> {
142142
Ok(FineTokenData::Whitespace)
143143
}
144144

145-
fn process_line_comment(m: &TokenKindMatch) -> Result<FineTokenData, Error> {
145+
fn process_line_comment(m: &TokenisationMatch) -> Result<FineTokenData, Error> {
146146
let comment_content = m.consumed(Nonterminal::LINE_COMMENT_CONTENT)?;
147147
let (style, body) = match comment_content.chars() {
148148
['/', '/', ..] => (CommentStyle::NonDoc, &[] as &[char]),
@@ -159,7 +159,7 @@ fn process_line_comment(m: &TokenKindMatch) -> Result<FineTokenData, Error> {
159159
})
160160
}
161161

162-
fn process_block_comment(m: &TokenKindMatch) -> Result<FineTokenData, Error> {
162+
fn process_block_comment(m: &TokenisationMatch) -> Result<FineTokenData, Error> {
163163
let comment_content = m.consumed_by_first_participating(Nonterminal::BLOCK_COMMENT_CONTENT)?;
164164
let (style, body) = match comment_content.chars() {
165165
['*', '*', ..] => (CommentStyle::NonDoc, &[] as &[char]),
@@ -176,7 +176,7 @@ fn process_block_comment(m: &TokenKindMatch) -> Result<FineTokenData, Error> {
176176
})
177177
}
178178

179-
fn process_character_literal(m: &TokenKindMatch) -> Result<FineTokenData, Error> {
179+
fn process_character_literal(m: &TokenisationMatch) -> Result<FineTokenData, Error> {
180180
use LiteralComponent::*;
181181
let single_quoted_content = m.consumed(Nonterminal::SINGLE_QUOTED_CONTENT)?;
182182
let single_escape_interpretation =
@@ -209,7 +209,7 @@ fn process_character_literal(m: &TokenKindMatch) -> Result<FineTokenData, Error>
209209
})
210210
}
211211

212-
fn process_byte_literal(m: &TokenKindMatch) -> Result<FineTokenData, Error> {
212+
fn process_byte_literal(m: &TokenisationMatch) -> Result<FineTokenData, Error> {
213213
use LiteralComponent::*;
214214
let single_quoted_content = m.consumed(Nonterminal::SINGLE_QUOTED_CONTENT)?;
215215
let single_escape_interpretation =
@@ -246,7 +246,7 @@ fn process_byte_literal(m: &TokenKindMatch) -> Result<FineTokenData, Error> {
246246
})
247247
}
248248

249-
fn process_string_literal(m: &TokenKindMatch) -> Result<FineTokenData, Error> {
249+
fn process_string_literal(m: &TokenisationMatch) -> Result<FineTokenData, Error> {
250250
use LiteralComponent::*;
251251
let double_quoted_content = m.consumed(Nonterminal::DOUBLE_QUOTED_CONTENT)?;
252252
let escape_interpretation = match try_escape_interpretation(double_quoted_content)? {
@@ -282,7 +282,7 @@ fn process_string_literal(m: &TokenKindMatch) -> Result<FineTokenData, Error> {
282282
})
283283
}
284284

285-
fn process_byte_string_literal(m: &TokenKindMatch) -> Result<FineTokenData, Error> {
285+
fn process_byte_string_literal(m: &TokenisationMatch) -> Result<FineTokenData, Error> {
286286
use LiteralComponent::*;
287287
let double_quoted_content = m.consumed(Nonterminal::DOUBLE_QUOTED_CONTENT)?;
288288
let escape_interpretation = match try_escape_interpretation(double_quoted_content)? {
@@ -321,7 +321,7 @@ fn process_byte_string_literal(m: &TokenKindMatch) -> Result<FineTokenData, Erro
321321
})
322322
}
323323

324-
fn process_c_string_literal(m: &TokenKindMatch) -> Result<FineTokenData, Error> {
324+
fn process_c_string_literal(m: &TokenisationMatch) -> Result<FineTokenData, Error> {
325325
use LiteralComponent::*;
326326
let double_quoted_content = m.consumed(Nonterminal::DOUBLE_QUOTED_CONTENT)?;
327327
let escape_interpretation = match try_escape_interpretation(double_quoted_content)? {
@@ -377,7 +377,7 @@ fn process_c_string_literal(m: &TokenKindMatch) -> Result<FineTokenData, Error>
377377
})
378378
}
379379

380-
fn process_raw_string_literal(m: &TokenKindMatch) -> Result<FineTokenData, Error> {
380+
fn process_raw_string_literal(m: &TokenisationMatch) -> Result<FineTokenData, Error> {
381381
let raw_double_quoted_content = m.consumed(Nonterminal::RAW_DOUBLE_QUOTED_CONTENT)?.clone();
382382
if raw_double_quoted_content.contains('\u{000d}') {
383383
return rejected("CR non-escape");
@@ -392,7 +392,7 @@ fn process_raw_string_literal(m: &TokenKindMatch) -> Result<FineTokenData, Error
392392
})
393393
}
394394

395-
fn process_raw_byte_string_literal(m: &TokenKindMatch) -> Result<FineTokenData, Error> {
395+
fn process_raw_byte_string_literal(m: &TokenisationMatch) -> Result<FineTokenData, Error> {
396396
let raw_double_quoted_content = m.consumed(Nonterminal::RAW_DOUBLE_QUOTED_CONTENT)?;
397397
if raw_double_quoted_content.scalar_values().any(|n| n > 127) {
398398
return rejected("non-ASCII character");
@@ -414,7 +414,7 @@ fn process_raw_byte_string_literal(m: &TokenKindMatch) -> Result<FineTokenData,
414414
})
415415
}
416416

417-
fn process_raw_c_string_literal(m: &TokenKindMatch) -> Result<FineTokenData, Error> {
417+
fn process_raw_c_string_literal(m: &TokenisationMatch) -> Result<FineTokenData, Error> {
418418
let raw_double_quoted_content = m.consumed(Nonterminal::RAW_DOUBLE_QUOTED_CONTENT)?;
419419
if raw_double_quoted_content.contains('\u{000d}') {
420420
return rejected("CR in raw content");
@@ -433,7 +433,7 @@ fn process_raw_c_string_literal(m: &TokenKindMatch) -> Result<FineTokenData, Err
433433
})
434434
}
435435

436-
fn process_float_literal(m: &TokenKindMatch) -> Result<FineTokenData, Error> {
436+
fn process_float_literal(m: &TokenisationMatch) -> Result<FineTokenData, Error> {
437437
let body = match (
438438
m.maybe_consumed(Nonterminal::FLOAT_BODY_WITH_EXPONENT)?,
439439
m.maybe_consumed(Nonterminal::FLOAT_BODY_WITHOUT_EXPONENT)?,
@@ -453,7 +453,7 @@ fn process_float_literal(m: &TokenKindMatch) -> Result<FineTokenData, Error> {
453453
})
454454
}
455455

456-
fn process_integer_literal(m: &TokenKindMatch) -> Result<FineTokenData, Error> {
456+
fn process_integer_literal(m: &TokenisationMatch) -> Result<FineTokenData, Error> {
457457
let base = match (
458458
m.maybe_consumed(Nonterminal::INTEGER_BINARY_LITERAL)?,
459459
m.maybe_consumed(Nonterminal::INTEGER_OCTAL_LITERAL)?,
@@ -504,7 +504,7 @@ fn process_integer_literal(m: &TokenKindMatch) -> Result<FineTokenData, Error> {
504504
})
505505
}
506506

507-
fn process_raw_lifetime_or_label(m: &TokenKindMatch) -> Result<FineTokenData, Error> {
507+
fn process_raw_lifetime_or_label(m: &TokenisationMatch) -> Result<FineTokenData, Error> {
508508
let name = m.consumed(Nonterminal::IDENT)?.clone();
509509
let s = name.to_string();
510510
if s == "_" || s == "crate" || s == "self" || s == "super" || s == "Self" {
@@ -513,12 +513,12 @@ fn process_raw_lifetime_or_label(m: &TokenKindMatch) -> Result<FineTokenData, Er
513513
Ok(FineTokenData::RawLifetimeOrLabel { name })
514514
}
515515

516-
fn process_lifetime_or_label(m: &TokenKindMatch) -> Result<FineTokenData, Error> {
516+
fn process_lifetime_or_label(m: &TokenisationMatch) -> Result<FineTokenData, Error> {
517517
let name = m.consumed(Nonterminal::IDENT)?.clone();
518518
Ok(FineTokenData::LifetimeOrLabel { name })
519519
}
520520

521-
fn process_raw_ident(m: &TokenKindMatch) -> Result<FineTokenData, Error> {
521+
fn process_raw_ident(m: &TokenisationMatch) -> Result<FineTokenData, Error> {
522522
let represented_ident = m.consumed(Nonterminal::IDENT)?.nfc();
523523
let s = represented_ident.to_string();
524524
if s == "_" || s == "crate" || s == "self" || s == "super" || s == "Self" {
@@ -527,13 +527,13 @@ fn process_raw_ident(m: &TokenKindMatch) -> Result<FineTokenData, Error> {
527527
Ok(FineTokenData::RawIdent { represented_ident })
528528
}
529529

530-
fn process_ident(m: &TokenKindMatch) -> Result<FineTokenData, Error> {
530+
fn process_ident(m: &TokenisationMatch) -> Result<FineTokenData, Error> {
531531
Ok(FineTokenData::Ident {
532532
represented_ident: m.consumed(Nonterminal::IDENT)?.nfc(),
533533
})
534534
}
535535

536-
fn process_punctuation(m: &TokenKindMatch) -> Result<FineTokenData, Error> {
536+
fn process_punctuation(m: &TokenisationMatch) -> Result<FineTokenData, Error> {
537537
let mark = match m.consumed.chars() {
538538
[c] => *c,
539539
_ => return rejected("impossible Punctuation match"),

0 commit comments

Comments
 (0)