22//
33// For the full copyright and license information, please view the LICENSE
44// file that was distributed with this source code.
5- // spell-checker:ignore anotherfile invalidchecksum regexes JWZG FFFD xffname prefixfilename bytelen bitlen hexdigit
5+ // spell-checker:ignore anotherfile invalidchecksum JWZG FFFD xffname prefixfilename bytelen bitlen hexdigit
66
77use data_encoding:: BASE64 ;
88use os_display:: Quotable ;
9- use regex:: bytes:: { Match , Regex } ;
109use std:: {
1110 borrow:: Cow ,
1211 ffi:: OsStr ,
@@ -15,7 +14,6 @@ use std::{
1514 io:: { self , BufReader , Read , Write , stdin} ,
1615 path:: Path ,
1716 str,
18- sync:: LazyLock ,
1917} ;
2018
2119use crate :: {
@@ -466,36 +464,157 @@ pub fn detect_algo(algo: &str, length: Option<usize>) -> UResult<HashAlgorithm>
466464 }
467465}
468466
469- // Regexp to handle the three input formats:
470- // 1. <algo>[-<bits>] (<filename>) = <checksum>
471- // algo must be uppercase or b (for blake2b)
472- // 2. <checksum> [* ]<filename>
473- // 3. <checksum> [*]<filename> (only one space)
474- const ALGO_BASED_REGEX : & str = r"^\s*\\?(?P<algo>(?:[A-Z0-9]+|BLAKE2b))(?:-(?P<bits>\d+))?\s?\((?P<filename>(?-u:.*))\)\s*=\s*(?P<checksum>[A-Za-z0-9+/]+={0,2})$" ;
475-
476- const DOUBLE_SPACE_REGEX : & str = r"^(?P<checksum>[a-fA-F0-9]+)\s{2}(?P<filename>(?-u:.*))$" ;
477-
478- // In this case, we ignore the *
479- const SINGLE_SPACE_REGEX : & str = r"^(?P<checksum>[a-fA-F0-9]+)\s(?P<filename>\*?(?-u:.*))$" ;
480-
481- static R_ALGO_BASED : LazyLock < Regex > = LazyLock :: new ( || Regex :: new ( ALGO_BASED_REGEX ) . unwrap ( ) ) ;
482- static R_DOUBLE_SPACE : LazyLock < Regex > = LazyLock :: new ( || Regex :: new ( DOUBLE_SPACE_REGEX ) . unwrap ( ) ) ;
483- static R_SINGLE_SPACE : LazyLock < Regex > = LazyLock :: new ( || Regex :: new ( SINGLE_SPACE_REGEX ) . unwrap ( ) ) ;
484-
/// The syntactic shape of one line in a checksum file.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
enum LineFormat {
    /// Tagged line: `<ALGO>[-<bits>] (<filename>) = <checksum>`.
    AlgoBased,
    /// Untagged line where exactly one space separates checksum and file
    /// name; anything after that space belongs to the file name.
    SingleSpace,
    /// Untagged line: the checksum, then two spaces or a space and an
    /// asterisk, then the file name.
    Untagged,
}
491473
492474impl LineFormat {
493- fn to_regex ( self ) -> & ' static Regex {
494- match self {
495- LineFormat :: AlgoBased => & R_ALGO_BASED ,
496- LineFormat :: SingleSpace => & R_SINGLE_SPACE ,
497- LineFormat :: DoubleSpace => & R_DOUBLE_SPACE ,
475+ /// parse [tagged output format]
476+ /// Normally the format is simply space separated but openssl does not
477+ /// respect the gnu definition.
478+ ///
479+ /// [tagged output format]: https://www.gnu.org/software/coreutils/manual/html_node/cksum-output-modes.html#cksum-output-modes-1
480+ fn parse_algo_based ( line : & [ u8 ] ) -> Option < LineInfo > {
481+ // r"\MD5 (a\\ b) = abc123",
482+ // BLAKE2b(44)= a45a4c4883cce4b50d844fab460414cc2080ca83690e74d850a9253e757384366382625b218c8585daee80f34dc9eb2f2fde5fb959db81cd48837f9216e7b0fa
483+ let trimmed = line. trim_ascii_start ( ) ;
484+ let algo_start = if trimmed. starts_with ( b"\\ " ) { 1 } else { 0 } ;
485+ let rest = & trimmed[ algo_start..] ;
486+
487+ // find the next parenthesis using byte search (not next whitespace) because openssl's
488+ // tagged format does not put a space before (filename)
489+ let par_idx = rest. iter ( ) . position ( |& b| b == b'(' ) ?;
490+ let algo_substring = & rest[ ..par_idx] . trim_ascii ( ) ;
491+ let mut algo_parts = algo_substring. splitn ( 2 , |& b| b == b'-' ) ;
492+ let algo = algo_parts. next ( ) ?;
493+
494+ // Parse algo_bits if present
495+ let algo_bits = algo_parts
496+ . next ( )
497+ . and_then ( |s| std:: str:: from_utf8 ( s) . ok ( ) ?. parse :: < usize > ( ) . ok ( ) ) ;
498+
499+ // Check algo format: uppercase ASCII or digits or "BLAKE2b"
500+ let is_valid_algo = algo == b"BLAKE2b"
501+ || algo
502+ . iter ( )
503+ . all ( |& b| b. is_ascii_uppercase ( ) || b. is_ascii_digit ( ) ) ;
504+ if !is_valid_algo {
505+ return None ;
506+ }
507+ // SAFETY: we just validated the contents of algo, we can unsafely make a
508+ // String from it
509+ let algo_utf8 = unsafe { String :: from_utf8_unchecked ( algo. to_vec ( ) ) } ;
510+ // stripping '(' not ' (' since we matched on ( not whitespace because of openssl.
511+ let after_paren = rest. get ( par_idx + 1 ..) ?;
512+ let ( filename, checksum) = ByteSliceExt :: split_once ( after_paren, b") = " )
513+ . or_else ( || ByteSliceExt :: split_once ( after_paren, b")= " ) ) ?;
514+
515+ fn is_valid_checksum ( checksum : & [ u8 ] ) -> bool {
516+ if checksum. is_empty ( ) {
517+ return false ;
518+ }
519+
520+ let mut parts = checksum. splitn ( 2 , |& b| b == b'=' ) ;
521+ let main = parts. next ( ) . unwrap ( ) ; // Always exists since checksum isn't empty
522+ let padding = parts. next ( ) . unwrap_or ( & b"" [ ..] ) ; // Empty if no '='
523+
524+ main. iter ( )
525+ . all ( |& b| b. is_ascii_alphanumeric ( ) || b == b'+' || b == b'/' )
526+ && !main. is_empty ( )
527+ && padding. len ( ) <= 2
528+ && padding. iter ( ) . all ( |& b| b == b'=' )
529+ }
530+ if !is_valid_checksum ( checksum) {
531+ return None ;
498532 }
533+ // SAFETY: we just validated the contents of checksum, we can unsafely make a
534+ // String from it
535+ let checksum_utf8 = unsafe { String :: from_utf8_unchecked ( checksum. to_vec ( ) ) } ;
536+
537+ Some ( LineInfo {
538+ algo_name : Some ( algo_utf8) ,
539+ algo_bit_len : algo_bits,
540+ checksum : checksum_utf8,
541+ filename : filename. to_vec ( ) ,
542+ format : LineFormat :: AlgoBased ,
543+ } )
544+ }
545+
546+ #[ allow( rustdoc:: invalid_html_tags) ]
547+ /// parse [untagged output format]
548+ /// The format is simple, either "<checksum> <filename>" or
549+ /// "<checksum> *<filename>"
550+ ///
551+ /// [untagged output format]: https://www.gnu.org/software/coreutils/manual/html_node/cksum-output-modes.html#cksum-output-modes-1
552+ fn parse_untagged ( line : & [ u8 ] ) -> Option < LineInfo > {
553+ let space_idx = line. iter ( ) . position ( |& b| b == b' ' ) ?;
554+ let checksum = & line[ ..space_idx] ;
555+ if !checksum. iter ( ) . all ( |& b| b. is_ascii_hexdigit ( ) ) || checksum. is_empty ( ) {
556+ return None ;
557+ }
558+ // SAFETY: we just validated the contents of checksum, we can unsafely make a
559+ // String from it
560+ let checksum_utf8 = unsafe { String :: from_utf8_unchecked ( checksum. to_vec ( ) ) } ;
561+
562+ let rest = & line[ space_idx..] ;
563+ let filename = rest
564+ . strip_prefix ( b" " )
565+ . or_else ( || rest. strip_prefix ( b" *" ) ) ?;
566+
567+ Some ( LineInfo {
568+ algo_name : None ,
569+ algo_bit_len : None ,
570+ checksum : checksum_utf8,
571+ filename : filename. to_vec ( ) ,
572+ format : LineFormat :: Untagged ,
573+ } )
574+ }
575+
576+ #[ allow( rustdoc:: invalid_html_tags) ]
577+ /// parse [untagged output format]
578+ /// Normally the format is simple, either "<checksum> <filename>" or
579+ /// "<checksum> *<filename>"
580+ /// But the bsd tests expect special single space behavior where
581+ /// checksum and filename are separated only by a space, meaning the second
582+ /// space or asterisk is part of the file name.
583+ /// This parser accounts for this variation
584+ ///
585+ /// [untagged output format]: https://www.gnu.org/software/coreutils/manual/html_node/cksum-output-modes.html#cksum-output-modes-1
586+ fn parse_single_space ( line : & [ u8 ] ) -> Option < LineInfo > {
587+ // Find first space
588+ let space_idx = line. iter ( ) . position ( |& b| b == b' ' ) ?;
589+ let checksum = & line[ ..space_idx] ;
590+ if !checksum. iter ( ) . all ( |& b| b. is_ascii_hexdigit ( ) ) || checksum. is_empty ( ) {
591+ return None ;
592+ }
593+ // SAFETY: we just validated the contents of checksum, we can unsafely make a
594+ // String from it
595+ let checksum_utf8 = unsafe { String :: from_utf8_unchecked ( checksum. to_vec ( ) ) } ;
596+
597+ let filename = line. get ( space_idx + 1 ..) ?; // Skip single space
598+
599+ Some ( LineInfo {
600+ algo_name : None ,
601+ algo_bit_len : None ,
602+ checksum : checksum_utf8,
603+ filename : filename. to_vec ( ) ,
604+ format : LineFormat :: SingleSpace ,
605+ } )
606+ }
607+ }
608+
/// Helper trait adding substring-style splitting to byte slices.
trait ByteSliceExt {
    /// Splits the slice around the first occurrence of `pattern`.
    ///
    /// Returns the bytes before and the bytes after the match (the match
    /// itself is dropped), or `None` when `pattern` does not occur. An empty
    /// `pattern` matches at offset 0, so the result is an empty head and the
    /// whole slice as tail.
    fn split_once(&self, pattern: &[u8]) -> Option<(&Self, &Self)>;
}

impl ByteSliceExt for [u8] {
    fn split_once(&self, pattern: &[u8]) -> Option<(&Self, &Self)> {
        // `windows(0)` panics, so the empty pattern needs its own path.
        if pattern.is_empty() {
            return Some((&self[..0], self));
        }
        let pos = self.windows(pattern.len()).position(|w| w == pattern)?;
        Some((&self[..pos], &self[pos + pattern.len()..]))
    }
}
501620
@@ -505,62 +624,39 @@ struct LineInfo {
505624 algo_bit_len : Option < usize > ,
506625 checksum : String ,
507626 filename : Vec < u8 > ,
508-
509627 format : LineFormat ,
510628}
511629
512630impl LineInfo {
513631 /// Returns a `LineInfo` parsed from a checksum line.
514- /// The function will run 3 regexes against the line and select the first one that matches
632+ /// The function will run 3 parsers against the line and select the first one that matches
515633 /// to populate the fields of the struct.
516- /// However, there is a catch to handle regarding the handling of `cached_regex `.
517- /// In case of non-algo-based regex , if `cached_regex ` is Some, it must take the priority
518- /// over the detected regex . Otherwise, we must set it the the detected regex .
634+ /// However, there is a catch to handle regarding the handling of `cached_line_format `.
635+ /// In case of non-algo-based format , if `cached_line_format ` is Some, it must take the priority
636+ /// over the detected format . Otherwise, we must set it the the detected format .
519637 /// This specific behavior is emphasized by the test
520638 /// `test_hashsum::test_check_md5sum_only_one_space`.
521- fn parse ( s : impl AsRef < OsStr > , cached_regex : & mut Option < LineFormat > ) -> Option < Self > {
522- let regexes: & [ ( & ' static Regex , LineFormat ) ] = & [
523- ( & R_ALGO_BASED , LineFormat :: AlgoBased ) ,
524- ( & R_DOUBLE_SPACE , LineFormat :: DoubleSpace ) ,
525- ( & R_SINGLE_SPACE , LineFormat :: SingleSpace ) ,
526- ] ;
527-
528- let line_bytes = os_str_as_bytes ( s. as_ref ( ) ) . expect ( "UTF-8 decoding failed" ) ;
529-
530- for ( regex, format) in regexes {
531- if !regex. is_match ( line_bytes) {
532- continue ;
533- }
534-
535- let mut r = * regex;
536- if * format != LineFormat :: AlgoBased {
537- // The cached regex ensures that when processing non-algo based regexes,
538- // it cannot be changed (can't have single and double space regexes
539- // used in the same file).
540- if cached_regex. is_some ( ) {
541- r = cached_regex. unwrap ( ) . to_regex ( ) ;
542- } else {
543- * cached_regex = Some ( * format) ;
544- }
545- }
639+ fn parse ( s : impl AsRef < OsStr > , cached_line_format : & mut Option < LineFormat > ) -> Option < Self > {
640+ let line_bytes = os_str_as_bytes ( s. as_ref ( ) ) . ok ( ) ?;
546641
547- if let Some ( caps) = r. captures ( line_bytes) {
548- // These unwraps are safe thanks to the regex
549- let match_to_string = |m : Match | String :: from_utf8 ( m. as_bytes ( ) . into ( ) ) . unwrap ( ) ;
550-
551- return Some ( Self {
552- algo_name : caps. name ( "algo" ) . map ( match_to_string) ,
553- algo_bit_len : caps
554- . name ( "bits" )
555- . map ( |m| match_to_string ( m) . parse :: < usize > ( ) . unwrap ( ) ) ,
556- checksum : caps. name ( "checksum" ) . map ( match_to_string) . unwrap ( ) ,
557- filename : caps. name ( "filename" ) . map ( |m| m. as_bytes ( ) . into ( ) ) . unwrap ( ) ,
558- format : * format,
559- } ) ;
642+ if let Some ( info) = LineFormat :: parse_algo_based ( line_bytes) {
643+ return Some ( info) ;
644+ }
645+ if let Some ( cached_format) = cached_line_format {
646+ match cached_format {
647+ LineFormat :: Untagged => LineFormat :: parse_untagged ( line_bytes) ,
648+ LineFormat :: SingleSpace => LineFormat :: parse_single_space ( line_bytes) ,
649+ _ => unreachable ! ( "we never catch the algo based format" ) ,
560650 }
651+ } else if let Some ( info) = LineFormat :: parse_untagged ( line_bytes) {
652+ * cached_line_format = Some ( LineFormat :: Untagged ) ;
653+ Some ( info)
654+ } else if let Some ( info) = LineFormat :: parse_single_space ( line_bytes) {
655+ * cached_line_format = Some ( LineFormat :: SingleSpace ) ;
656+ Some ( info)
657+ } else {
658+ None
561659 }
562-
563- None
564660 }
565661}
566662
@@ -835,7 +931,7 @@ fn process_checksum_line(
835931 cli_algo_name : Option < & str > ,
836932 cli_algo_length : Option < usize > ,
837933 opts : ChecksumOptions ,
838- cached_regex : & mut Option < LineFormat > ,
934+ cached_line_format : & mut Option < LineFormat > ,
839935 last_algo : & mut Option < String > ,
840936) -> Result < ( ) , LineCheckError > {
841937 let line_bytes = os_str_as_bytes ( line) ?;
@@ -847,14 +943,14 @@ fn process_checksum_line(
847943
848944 // Use `LineInfo` to extract the data of a line.
849945 // Then, depending on its format, apply a different pre-treatment.
850- let Some ( line_info) = LineInfo :: parse ( line, cached_regex ) else {
946+ let Some ( line_info) = LineInfo :: parse ( line, cached_line_format ) else {
851947 return Err ( LineCheckError :: ImproperlyFormatted ) ;
852948 } ;
853949
854950 if line_info. format == LineFormat :: AlgoBased {
855951 process_algo_based_line ( & line_info, cli_algo_name, opts, last_algo)
856952 } else if let Some ( cli_algo) = cli_algo_name {
857- // If we match a non-algo based regex , we expect a cli argument
953+ // If we match a non-algo based parser , we expect a cli argument
858954 // to give us the algorithm to use
859955 process_non_algo_based_line ( i, & line_info, cli_algo, cli_algo_length, opts)
860956 } else {
@@ -890,9 +986,9 @@ fn process_checksum_file(
890986 let reader = BufReader :: new ( file) ;
891987 let lines = read_os_string_lines ( reader) . collect :: < Vec < _ > > ( ) ;
892988
893- // cached_regex is used to ensure that several non algo-based checksum line
894- // will use the same regex .
895- let mut cached_regex = None ;
989+ // cached_line_format is used to ensure that several non algo-based checksum line
990+ // will use the same parser .
991+ let mut cached_line_format = None ;
896992 // last_algo caches the algorithm used in the last line to print a warning
897993 // message for the current line if improperly formatted.
898994 // Behavior tested in gnu_cksum_c::test_warn
@@ -905,7 +1001,7 @@ fn process_checksum_file(
9051001 cli_algo_name,
9061002 cli_algo_length,
9071003 opts,
908- & mut cached_regex ,
1004+ & mut cached_line_format ,
9091005 & mut last_algo,
9101006 ) ;
9111007
@@ -1381,7 +1477,7 @@ mod tests {
13811477 assert ! ( line_info. algo_bit_len. is_none( ) ) ;
13821478 assert_eq ! ( line_info. filename, b"example.txt" ) ;
13831479 assert_eq ! ( line_info. checksum, "d41d8cd98f00b204e9800998ecf8427e" ) ;
1384- assert_eq ! ( line_info. format, LineFormat :: DoubleSpace ) ;
1480+ assert_eq ! ( line_info. format, LineFormat :: Untagged ) ;
13851481 assert ! ( cached_regex. is_some( ) ) ;
13861482
13871483 cached_regex = None ;
0 commit comments