Skip to content

Commit 621f2b5

Browse files
GTimothy authored and RenjiSann committed
checksum/cksum: rewrite lineformat parsing without regex
Removes the dependency on the regex crate for LineFormat detection and parsing, resulting in a faster and lighter cksum binary.
1 parent 07cce02 commit 621f2b5

File tree

2 files changed

+175
-79
lines changed

2 files changed

+175
-79
lines changed

src/uucore/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -89,7 +89,7 @@ default = []
8989
# * non-default features
9090
backup-control = []
9191
colors = []
92-
checksum = ["data-encoding", "thiserror", "regex", "sum"]
92+
checksum = ["data-encoding", "thiserror", "sum"]
9393
encoding = ["data-encoding", "data-encoding-macro", "z85"]
9494
entries = ["libc"]
9595
fs = ["dunce", "libc", "winapi-util", "windows-sys"]

src/uucore/src/lib/features/checksum.rs

Lines changed: 174 additions & 78 deletions
Original file line number | Diff line number | Diff line change
@@ -2,11 +2,10 @@
22
//
33
// For the full copyright and license information, please view the LICENSE
44
// file that was distributed with this source code.
5-
// spell-checker:ignore anotherfile invalidchecksum regexes JWZG FFFD xffname prefixfilename bytelen bitlen hexdigit
5+
// spell-checker:ignore anotherfile invalidchecksum JWZG FFFD xffname prefixfilename bytelen bitlen hexdigit
66

77
use data_encoding::BASE64;
88
use os_display::Quotable;
9-
use regex::bytes::{Match, Regex};
109
use std::{
1110
borrow::Cow,
1211
ffi::OsStr,
@@ -15,7 +14,6 @@ use std::{
1514
io::{self, BufReader, Read, Write, stdin},
1615
path::Path,
1716
str,
18-
sync::LazyLock,
1917
};
2018

2119
use crate::{
@@ -466,36 +464,157 @@ pub fn detect_algo(algo: &str, length: Option<usize>) -> UResult<HashAlgorithm>
466464
}
467465
}
468466

469-
// Regexp to handle the three input formats:
470-
// 1. <algo>[-<bits>] (<filename>) = <checksum>
471-
// algo must be uppercase or b (for blake2b)
472-
// 2. <checksum> [* ]<filename>
473-
// 3. <checksum> [*]<filename> (only one space)
474-
const ALGO_BASED_REGEX: &str = r"^\s*\\?(?P<algo>(?:[A-Z0-9]+|BLAKE2b))(?:-(?P<bits>\d+))?\s?\((?P<filename>(?-u:.*))\)\s*=\s*(?P<checksum>[A-Za-z0-9+/]+={0,2})$";
475-
476-
const DOUBLE_SPACE_REGEX: &str = r"^(?P<checksum>[a-fA-F0-9]+)\s{2}(?P<filename>(?-u:.*))$";
477-
478-
// In this case, we ignore the *
479-
const SINGLE_SPACE_REGEX: &str = r"^(?P<checksum>[a-fA-F0-9]+)\s(?P<filename>\*?(?-u:.*))$";
480-
481-
static R_ALGO_BASED: LazyLock<Regex> = LazyLock::new(|| Regex::new(ALGO_BASED_REGEX).unwrap());
482-
static R_DOUBLE_SPACE: LazyLock<Regex> = LazyLock::new(|| Regex::new(DOUBLE_SPACE_REGEX).unwrap());
483-
static R_SINGLE_SPACE: LazyLock<Regex> = LazyLock::new(|| Regex::new(SINGLE_SPACE_REGEX).unwrap());
484-
485467
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
486468
enum LineFormat {
487469
AlgoBased,
488470
SingleSpace,
489-
DoubleSpace,
471+
Untagged,
490472
}
491473

492474
impl LineFormat {
493-
fn to_regex(self) -> &'static Regex {
494-
match self {
495-
LineFormat::AlgoBased => &R_ALGO_BASED,
496-
LineFormat::SingleSpace => &R_SINGLE_SPACE,
497-
LineFormat::DoubleSpace => &R_DOUBLE_SPACE,
475+
/// parse [tagged output format]
476+
/// Normally the algorithm name and "(filename)" are separated by a space, but
477+
/// openssl omits that space and so does not respect the gnu definition.
478+
///
479+
/// [tagged output format]: https://www.gnu.org/software/coreutils/manual/html_node/cksum-output-modes.html#cksum-output-modes-1
480+
fn parse_algo_based(line: &[u8]) -> Option<LineInfo> {
481+
// r"\MD5 (a\\ b) = abc123",
482+
// BLAKE2b(44)= a45a4c4883cce4b50d844fab460414cc2080ca83690e74d850a9253e757384366382625b218c8585daee80f34dc9eb2f2fde5fb959db81cd48837f9216e7b0fa
483+
let trimmed = line.trim_ascii_start();
484+
let algo_start = if trimmed.starts_with(b"\\") { 1 } else { 0 };
485+
let rest = &trimmed[algo_start..];
486+
487+
// find the next parenthesis using byte search (not next whitespace) because openssl's
488+
// tagged format does not put a space before (filename)
489+
let par_idx = rest.iter().position(|&b| b == b'(')?;
490+
let algo_substring = &rest[..par_idx].trim_ascii();
491+
let mut algo_parts = algo_substring.splitn(2, |&b| b == b'-');
492+
let algo = algo_parts.next()?;
493+
494+
// Parse algo_bits if present
495+
let algo_bits = algo_parts
496+
.next()
497+
.and_then(|s| std::str::from_utf8(s).ok()?.parse::<usize>().ok());
498+
499+
// Check algo format: uppercase ASCII or digits or "BLAKE2b"
500+
let is_valid_algo = algo == b"BLAKE2b"
501+
|| algo
502+
.iter()
503+
.all(|&b| b.is_ascii_uppercase() || b.is_ascii_digit());
504+
if !is_valid_algo {
505+
return None;
506+
}
507+
// SAFETY: we just validated the contents of algo, we can unsafely make a
508+
// String from it
509+
let algo_utf8 = unsafe { String::from_utf8_unchecked(algo.to_vec()) };
510+
// stripping '(' not ' (' since we matched on ( not whitespace because of openssl.
511+
let after_paren = rest.get(par_idx + 1..)?;
512+
let (filename, checksum) = ByteSliceExt::split_once(after_paren, b") = ")
513+
.or_else(|| ByteSliceExt::split_once(after_paren, b")= "))?;
514+
515+
fn is_valid_checksum(checksum: &[u8]) -> bool {
516+
if checksum.is_empty() {
517+
return false;
518+
}
519+
520+
let mut parts = checksum.splitn(2, |&b| b == b'=');
521+
let main = parts.next().unwrap(); // Always exists since checksum isn't empty
522+
let padding = parts.next().unwrap_or(&b""[..]); // Empty if no '='
523+
524+
main.iter()
525+
.all(|&b| b.is_ascii_alphanumeric() || b == b'+' || b == b'/')
526+
&& !main.is_empty()
527+
&& padding.len() <= 2
528+
&& padding.iter().all(|&b| b == b'=')
529+
}
530+
if !is_valid_checksum(checksum) {
531+
return None;
498532
}
533+
// SAFETY: we just validated the contents of checksum, we can unsafely make a
534+
// String from it
535+
let checksum_utf8 = unsafe { String::from_utf8_unchecked(checksum.to_vec()) };
536+
537+
Some(LineInfo {
538+
algo_name: Some(algo_utf8),
539+
algo_bit_len: algo_bits,
540+
checksum: checksum_utf8,
541+
filename: filename.to_vec(),
542+
format: LineFormat::AlgoBased,
543+
})
544+
}
545+
546+
#[allow(rustdoc::invalid_html_tags)]
547+
/// parse [untagged output format]
548+
/// The format is simple, either "<checksum>  <filename>" (two spaces) or
549+
/// "<checksum> *<filename>"
550+
///
551+
/// [untagged output format]: https://www.gnu.org/software/coreutils/manual/html_node/cksum-output-modes.html#cksum-output-modes-1
552+
fn parse_untagged(line: &[u8]) -> Option<LineInfo> {
553+
let space_idx = line.iter().position(|&b| b == b' ')?;
554+
let checksum = &line[..space_idx];
555+
if !checksum.iter().all(|&b| b.is_ascii_hexdigit()) || checksum.is_empty() {
556+
return None;
557+
}
558+
// SAFETY: we just validated the contents of checksum, we can unsafely make a
559+
// String from it
560+
let checksum_utf8 = unsafe { String::from_utf8_unchecked(checksum.to_vec()) };
561+
562+
let rest = &line[space_idx..];
563+
let filename = rest
564+
.strip_prefix(b" ")
565+
.or_else(|| rest.strip_prefix(b" *"))?;
566+
567+
Some(LineInfo {
568+
algo_name: None,
569+
algo_bit_len: None,
570+
checksum: checksum_utf8,
571+
filename: filename.to_vec(),
572+
format: LineFormat::Untagged,
573+
})
574+
}
575+
576+
#[allow(rustdoc::invalid_html_tags)]
577+
/// parse [untagged output format]
578+
/// Normally the format is simple, either "<checksum>  <filename>" (two spaces) or
579+
/// "<checksum> *<filename>"
580+
/// But the bsd tests expect special single space behavior where
581+
/// checksum and filename are separated only by a space, meaning the second
582+
/// space or asterisk is part of the file name.
583+
/// This parser accounts for this variation
584+
///
585+
/// [untagged output format]: https://www.gnu.org/software/coreutils/manual/html_node/cksum-output-modes.html#cksum-output-modes-1
586+
fn parse_single_space(line: &[u8]) -> Option<LineInfo> {
587+
// Find first space
588+
let space_idx = line.iter().position(|&b| b == b' ')?;
589+
let checksum = &line[..space_idx];
590+
if !checksum.iter().all(|&b| b.is_ascii_hexdigit()) || checksum.is_empty() {
591+
return None;
592+
}
593+
// SAFETY: we just validated the contents of checksum, we can unsafely make a
594+
// String from it
595+
let checksum_utf8 = unsafe { String::from_utf8_unchecked(checksum.to_vec()) };
596+
597+
let filename = line.get(space_idx + 1..)?; // Skip single space
598+
599+
Some(LineInfo {
600+
algo_name: None,
601+
algo_bit_len: None,
602+
checksum: checksum_utf8,
603+
filename: filename.to_vec(),
604+
format: LineFormat::SingleSpace,
605+
})
606+
}
607+
}
608+
609+
// Helper trait for byte slice operations
610+
trait ByteSliceExt {
611+
fn split_once(&self, pattern: &[u8]) -> Option<(&Self, &Self)>;
612+
}
613+
614+
impl ByteSliceExt for [u8] {
615+
fn split_once(&self, pattern: &[u8]) -> Option<(&Self, &Self)> {
616+
let pos = self.windows(pattern.len()).position(|w| w == pattern)?;
617+
Some((&self[..pos], &self[pos + pattern.len()..]))
499618
}
500619
}
501620

@@ -505,62 +624,39 @@ struct LineInfo {
505624
algo_bit_len: Option<usize>,
506625
checksum: String,
507626
filename: Vec<u8>,
508-
509627
format: LineFormat,
510628
}
511629

512630
impl LineInfo {
513631
/// Returns a `LineInfo` parsed from a checksum line.
514-
/// The function will run 3 regexes against the line and select the first one that matches
632+
/// The function will run 3 parsers against the line and select the first one that matches
515633
/// to populate the fields of the struct.
516-
/// However, there is a catch to handle regarding the handling of `cached_regex`.
517-
/// In case of non-algo-based regex, if `cached_regex` is Some, it must take the priority
518-
/// over the detected regex. Otherwise, we must set it the the detected regex.
634+
/// However, there is a catch regarding the handling of `cached_line_format`.
635+
/// In case of non-algo-based format, if `cached_line_format` is Some, it must take the priority
636+
/// over the detected format. Otherwise, we must set it to the detected format.
519637
/// This specific behavior is emphasized by the test
520638
/// `test_hashsum::test_check_md5sum_only_one_space`.
521-
fn parse(s: impl AsRef<OsStr>, cached_regex: &mut Option<LineFormat>) -> Option<Self> {
522-
let regexes: &[(&'static Regex, LineFormat)] = &[
523-
(&R_ALGO_BASED, LineFormat::AlgoBased),
524-
(&R_DOUBLE_SPACE, LineFormat::DoubleSpace),
525-
(&R_SINGLE_SPACE, LineFormat::SingleSpace),
526-
];
527-
528-
let line_bytes = os_str_as_bytes(s.as_ref()).expect("UTF-8 decoding failed");
529-
530-
for (regex, format) in regexes {
531-
if !regex.is_match(line_bytes) {
532-
continue;
533-
}
534-
535-
let mut r = *regex;
536-
if *format != LineFormat::AlgoBased {
537-
// The cached regex ensures that when processing non-algo based regexes,
538-
// it cannot be changed (can't have single and double space regexes
539-
// used in the same file).
540-
if cached_regex.is_some() {
541-
r = cached_regex.unwrap().to_regex();
542-
} else {
543-
*cached_regex = Some(*format);
544-
}
545-
}
639+
fn parse(s: impl AsRef<OsStr>, cached_line_format: &mut Option<LineFormat>) -> Option<Self> {
640+
let line_bytes = os_str_as_bytes(s.as_ref()).ok()?;
546641

547-
if let Some(caps) = r.captures(line_bytes) {
548-
// These unwraps are safe thanks to the regex
549-
let match_to_string = |m: Match| String::from_utf8(m.as_bytes().into()).unwrap();
550-
551-
return Some(Self {
552-
algo_name: caps.name("algo").map(match_to_string),
553-
algo_bit_len: caps
554-
.name("bits")
555-
.map(|m| match_to_string(m).parse::<usize>().unwrap()),
556-
checksum: caps.name("checksum").map(match_to_string).unwrap(),
557-
filename: caps.name("filename").map(|m| m.as_bytes().into()).unwrap(),
558-
format: *format,
559-
});
642+
if let Some(info) = LineFormat::parse_algo_based(line_bytes) {
643+
return Some(info);
644+
}
645+
if let Some(cached_format) = cached_line_format {
646+
match cached_format {
647+
LineFormat::Untagged => LineFormat::parse_untagged(line_bytes),
648+
LineFormat::SingleSpace => LineFormat::parse_single_space(line_bytes),
649+
_ => unreachable!("we never catch the algo based format"),
560650
}
651+
} else if let Some(info) = LineFormat::parse_untagged(line_bytes) {
652+
*cached_line_format = Some(LineFormat::Untagged);
653+
Some(info)
654+
} else if let Some(info) = LineFormat::parse_single_space(line_bytes) {
655+
*cached_line_format = Some(LineFormat::SingleSpace);
656+
Some(info)
657+
} else {
658+
None
561659
}
562-
563-
None
564660
}
565661
}
566662

@@ -835,7 +931,7 @@ fn process_checksum_line(
835931
cli_algo_name: Option<&str>,
836932
cli_algo_length: Option<usize>,
837933
opts: ChecksumOptions,
838-
cached_regex: &mut Option<LineFormat>,
934+
cached_line_format: &mut Option<LineFormat>,
839935
last_algo: &mut Option<String>,
840936
) -> Result<(), LineCheckError> {
841937
let line_bytes = os_str_as_bytes(line)?;
@@ -847,14 +943,14 @@ fn process_checksum_line(
847943

848944
// Use `LineInfo` to extract the data of a line.
849945
// Then, depending on its format, apply a different pre-treatment.
850-
let Some(line_info) = LineInfo::parse(line, cached_regex) else {
946+
let Some(line_info) = LineInfo::parse(line, cached_line_format) else {
851947
return Err(LineCheckError::ImproperlyFormatted);
852948
};
853949

854950
if line_info.format == LineFormat::AlgoBased {
855951
process_algo_based_line(&line_info, cli_algo_name, opts, last_algo)
856952
} else if let Some(cli_algo) = cli_algo_name {
857-
// If we match a non-algo based regex, we expect a cli argument
953+
// If we match a non-algo based parser, we expect a cli argument
858954
// to give us the algorithm to use
859955
process_non_algo_based_line(i, &line_info, cli_algo, cli_algo_length, opts)
860956
} else {
@@ -890,9 +986,9 @@ fn process_checksum_file(
890986
let reader = BufReader::new(file);
891987
let lines = read_os_string_lines(reader).collect::<Vec<_>>();
892988

893-
// cached_regex is used to ensure that several non algo-based checksum line
894-
// will use the same regex.
895-
let mut cached_regex = None;
989+
// cached_line_format is used to ensure that several non algo-based checksum lines
990+
// will use the same parser.
991+
let mut cached_line_format = None;
896992
// last_algo caches the algorithm used in the last line to print a warning
897993
// message for the current line if improperly formatted.
898994
// Behavior tested in gnu_cksum_c::test_warn
@@ -905,7 +1001,7 @@ fn process_checksum_file(
9051001
cli_algo_name,
9061002
cli_algo_length,
9071003
opts,
908-
&mut cached_regex,
1004+
&mut cached_line_format,
9091005
&mut last_algo,
9101006
);
9111007

@@ -1381,7 +1477,7 @@ mod tests {
13811477
assert!(line_info.algo_bit_len.is_none());
13821478
assert_eq!(line_info.filename, b"example.txt");
13831479
assert_eq!(line_info.checksum, "d41d8cd98f00b204e9800998ecf8427e");
1384-
assert_eq!(line_info.format, LineFormat::DoubleSpace);
1480+
assert_eq!(line_info.format, LineFormat::Untagged);
13851481
assert!(cached_regex.is_some());
13861482

13871483
cached_regex = None;

0 commit comments

Comments
 (0)