From c20f4a7f872f884c51242e10de7ac49ef81c24a0 Mon Sep 17 00:00:00 2001 From: jinroq Date: Thu, 5 Mar 2026 12:36:33 +0900 Subject: [PATCH 01/34] Add StringScanner#integer_at to convert capture group to Integer Add a method that returns a captured substring as an Integer, following String#to_i(base) semantics. Accepts an optional base argument (default 10), Symbol/String for named capture groups, and returns 0 for non-numeric or empty captures. Extract resolve_capture_index helper to share index resolution logic between StringScanner#[] and StringScanner#integer_at. --- ext/strscan/strscan.c | 108 +++++++++++++---- test/strscan/test_stringscanner.rb | 180 +++++++++++++++++++++++++++++ 2 files changed, 268 insertions(+), 20 deletions(-) diff --git a/ext/strscan/strscan.c b/ext/strscan/strscan.c index d4135e4baf..2c2c8e06f6 100644 --- a/ext/strscan/strscan.c +++ b/ext/strscan/strscan.c @@ -1621,6 +1621,37 @@ name_to_backref_number(struct re_registers *regs, VALUE regexp, const char* name rb_long2int(name_end - name), name); } +/* Resolve capture group index from Integer, Symbol, or String. + * Returns the resolved register index, or -1 if unmatched/out of range. */ +static long +resolve_capture_index(struct strscanner *p, VALUE idx) +{ + const char *name; + long i; + + if (! 
MATCHED_P(p)) return -1; + + switch (TYPE(idx)) { + case T_SYMBOL: + idx = rb_sym2str(idx); + /* fall through */ + case T_STRING: + RSTRING_GETMEM(idx, name, i); + i = name_to_backref_number(&(p->regs), p->regex, name, name + i, rb_enc_get(idx)); + break; + default: + i = NUM2LONG(idx); + } + + if (i < 0) + i += p->regs.num_regs; + if (i < 0) return -1; + if (i >= p->regs.num_regs) return -1; + if (p->regs.beg[i] == -1) return -1; + + return i; +} + /* * * :markup: markdown @@ -1695,30 +1726,12 @@ name_to_backref_number(struct re_registers *regs, VALUE regexp, const char* name static VALUE strscan_aref(VALUE self, VALUE idx) { - const char *name; struct strscanner *p; long i; GET_SCANNER(self, p); - if (! MATCHED_P(p)) return Qnil; - - switch (TYPE(idx)) { - case T_SYMBOL: - idx = rb_sym2str(idx); - /* fall through */ - case T_STRING: - RSTRING_GETMEM(idx, name, i); - i = name_to_backref_number(&(p->regs), p->regex, name, name + i, rb_enc_get(idx)); - break; - default: - i = NUM2LONG(idx); - } - - if (i < 0) - i += p->regs.num_regs; - if (i < 0) return Qnil; - if (i >= p->regs.num_regs) return Qnil; - if (p->regs.beg[i] == -1) return Qnil; + i = resolve_capture_index(p, idx); + if (i < 0) return Qnil; return extract_range(p, adjust_register_position(p, p->regs.beg[i]), @@ -1852,6 +1865,60 @@ strscan_values_at(int argc, VALUE *argv, VALUE self) return new_ary; } +/* + * call-seq: + * integer_at(index, base = 10) -> integer or nil + * + * Returns the captured substring at the given +index+ as an Integer, + * following the behavior of String#to_i(base). + * + * +index+ can be an Integer (positive, negative, or zero), a Symbol, + * or a String for named capture groups. 
+ * + * Returns +nil+ if: + * - No match has been performed or the last match failed + * - The +index+ is out of range + * - The group at +index+ did not participate in the match + * + * This is semantically equivalent to self[index].to_i(base) + * but avoids the allocation of a temporary String when possible. + * + * scanner = StringScanner.new("2024-06-15") + * scanner.scan(/(\d{4})-(\d{2})-(\d{2})/) + * scanner.integer_at(1) # => 2024 + * scanner.integer_at(1, 16) # => 8228 + * + */ +static VALUE +strscan_integer_at(int argc, VALUE *argv, VALUE self) +{ + struct strscanner *p; + long i; + long beg, end, len; + const char *ptr; + VALUE idx, vbase; + int base = 10; + + rb_scan_args(argc, argv, "11", &idx, &vbase); + if (!NIL_P(vbase)) base = NUM2INT(vbase); + + GET_SCANNER(self, p); + i = resolve_capture_index(p, idx); + if (i < 0) return Qnil; + + beg = adjust_register_position(p, p->regs.beg[i]); + end = adjust_register_position(p, p->regs.end[i]); + len = end - beg; + + if (len <= 0) return INT2FIX(0); + + ptr = S_PBEG(p) + beg; + + /* Follow String#to_i(base) semantics via rb_str_to_inum. + * badcheck=0 returns 0 for non-numeric input instead of raising. 
*/ + return rb_str_to_inum(rb_str_new(ptr, len), base, 0); +} + /* * :markup: markdown * :include: strscan/link_refs.txt @@ -2290,6 +2357,7 @@ Init_strscan(void) rb_define_method(StringScanner, "size", strscan_size, 0); rb_define_method(StringScanner, "captures", strscan_captures, 0); rb_define_method(StringScanner, "values_at", strscan_values_at, -1); + rb_define_method(StringScanner, "integer_at", strscan_integer_at, -1); rb_define_method(StringScanner, "rest", strscan_rest, 0); rb_define_method(StringScanner, "rest_size", strscan_rest_size, 0); diff --git a/test/strscan/test_stringscanner.rb b/test/strscan/test_stringscanner.rb index dd3663ea6a..0c3c587b9d 100644 --- a/test/strscan/test_stringscanner.rb +++ b/test/strscan/test_stringscanner.rb @@ -968,6 +968,186 @@ def test_named_captures_same_name_union assert_equal({"number" => "1"}, scan.named_captures) end + def test_integer_at + s = create_string_scanner("2024-06-15") + s.scan(/(\d{4})-(\d{2})-(\d{2})/) + assert_equal(2024, s.integer_at(1)) + assert_equal(6, s.integer_at(2)) + assert_equal(15, s.integer_at(3)) + end + + def test_integer_at_index_zero + s = create_string_scanner("42 abc") + s.scan(/(\d+)/) + assert_equal(42, s.integer_at(0)) + end + + def test_integer_at_negative_index + s = create_string_scanner("2024-06-15") + s.scan(/(\d{4})-(\d{2})-(\d{2})/) + assert_equal(15, s.integer_at(-1)) + assert_equal(6, s.integer_at(-2)) + assert_equal(2024, s.integer_at(-3)) + end + + def test_integer_at_no_match + s = create_string_scanner("abc") + s.scan(/\d+/) + assert_nil(s.integer_at(0)) + end + + def test_integer_at_before_match + s = create_string_scanner("abc") + assert_nil(s.integer_at(0)) + end + + def test_integer_at_index_out_of_range + s = create_string_scanner("42") + s.scan(/(\d+)/) + assert_nil(s.integer_at(2)) + assert_nil(s.integer_at(100)) + assert_nil(s.integer_at(-3)) + end + + def test_integer_at_optional_group_not_matched + s = create_string_scanner("2024-06") + 
s.scan(/(\d{4})-(\d{2})(-(\d{2}))?/) + assert_equal(2024, s.integer_at(1)) + assert_equal(6, s.integer_at(2)) + assert_nil(s.integer_at(4)) + end + + def test_integer_at_large_number + huge = '9' * 100 + s = create_string_scanner(huge) + s.scan(/(\d+)/) + assert_equal(huge.to_i, s.integer_at(1)) + end + + def test_integer_at_fixnum_bignum_boundary + # 18 digits: fits in long on 64-bit + s = create_string_scanner("999999999999999999") + s.scan(/(\d+)/) + assert_equal(999999999999999999, s.integer_at(1)) + + # 19 digits: exceeds long on 64-bit, becomes bignum + s = create_string_scanner("9999999999999999999") + s.scan(/(\d+)/) + assert_equal(9999999999999999999, s.integer_at(1)) + + # negative 18 digits + s = create_string_scanner("-999999999999999999") + s.scan(/([+\-]?\d+)/) + assert_equal(-999999999999999999, s.integer_at(1)) + + # negative 19 digits + s = create_string_scanner("-9999999999999999999") + s.scan(/([+\-]?\d+)/) + assert_equal(-9999999999999999999, s.integer_at(1)) + end + + def test_integer_at_non_digit + # follows String#to_i: stops at non-digit, returns parsed portion + s = create_string_scanner("1.5") + s.scan(/([\d.]+)/) + assert_equal(1, s.integer_at(1)) + end + + def test_integer_at_non_digit_alpha + # follows String#to_i: no leading digits, returns 0 + s = create_string_scanner("foo bar") + s.scan(/(\w+)/) + assert_equal(0, s.integer_at(1)) + end + + def test_integer_at_empty_capture + # follows String#to_i: empty string returns 0 + s = create_string_scanner("abc") + s.scan(/()abc/) + assert_equal(0, s.integer_at(1)) + end + + def test_integer_at_sign_only + # follows String#to_i: sign only returns 0 + s = create_string_scanner("+") + s.scan(/([+\-])/) + assert_equal(0, s.integer_at(1)) + + s = create_string_scanner("-") + s.scan(/([+\-])/) + assert_equal(0, s.integer_at(1)) + end + + def test_integer_at_signed_number + s = create_string_scanner("-42") + s.scan(/([+\-]?\d+)/) + assert_equal(-42, s.integer_at(1)) + + s = 
create_string_scanner("+42") + s.scan(/([+\-]?\d+)/) + assert_equal(42, s.integer_at(1)) + end + + def test_integer_at_leading_zeros + # "010" is 8 in octal (Integer("010")), but 10 in base 10 + s = create_string_scanner("010") + s.scan(/(\d+)/) + assert_equal(10, s.integer_at(1)) + end + + def test_integer_at_full_match_with_non_digits + # follows String#to_i: "2024-06-15".to_i => 2024 + s = create_string_scanner("2024-06-15") + s.scan(/(\d{4})-(\d{2})-(\d{2})/) + assert_equal(2024, s.integer_at(0)) + end + + def test_integer_at_named_capture_symbol + s = create_string_scanner("2024-06-15") + s.scan(/(?<year>\d{4})-(?<month>\d{2})-(?<day>\d{2})/) + assert_equal(2024, s.integer_at(:year)) + assert_equal(6, s.integer_at(:month)) + assert_equal(15, s.integer_at(:day)) + end + + def test_integer_at_named_capture_string + s = create_string_scanner("2024-06-15") + s.scan(/(?<year>\d{4})-(?<month>\d{2})-(?<day>\d{2})/) + assert_equal(2024, s.integer_at("year")) + assert_equal(6, s.integer_at("month")) + assert_equal(15, s.integer_at("day")) + end + + def test_integer_at_named_capture_undefined + s = create_string_scanner("2024-06-15") + s.scan(/(?<year>\d{4})-(?<month>\d{2})-(?<day>\d{2})/) + assert_raise(IndexError) { s.integer_at(:unknown) } + assert_raise(IndexError) { s.integer_at("unknown") } + end + + def test_integer_at_base + s = create_string_scanner("2024") + s.scan(/(\d+)/) + assert_equal(2024, s.integer_at(1)) # default base 10 + assert_equal(1044, s.integer_at(1, 8)) # base 8 + assert_equal(8228, s.integer_at(1, 16)) # base 16 + end + + def test_integer_at_base_zero + # base 0 respects prefixes like 0x + s = create_string_scanner("0xF") + s.scan(/(...)/) + assert_equal(0, s.integer_at(1)) # base 10: "0xF".to_i => 0 + assert_equal(15, s.integer_at(1, 0)) # base 0: "0xF".to_i(0) => 15 + end + + def test_integer_at_underscore + # follows String#to_i: underscores are accepted + s = create_string_scanner("1_0_0") + s.scan(/(\d+(?:_\d+)*)/) + assert_equal(100, s.integer_at(1)) + end + def test_scan_integer s = 
create_string_scanner('abc') assert_equal(3, s.match?(/(?<a>abc)/)) # set named_captures From efe8d0ff40f85d00cbf987c72516ce2b73321b10 Mon Sep 17 00:00:00 2001 From: jinroq Date: Wed, 18 Mar 2026 22:31:10 +0900 Subject: [PATCH 02/34] Add fast path for base-10 pure digit captures to avoid String allocation When base is 10 and the capture contains only digits (with optional sign) that fit in long, parse directly and return via LONG2NUM. This covers the Date._strptime use case without temporary String creation. All other cases fall through to rb_str_to_inum. --- ext/strscan/strscan.c | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/ext/strscan/strscan.c b/ext/strscan/strscan.c index 2c2c8e06f6..bcd28d7bae 100644 --- a/ext/strscan/strscan.c +++ b/ext/strscan/strscan.c @@ -1914,7 +1914,42 @@ strscan_integer_at(int argc, VALUE *argv, VALUE self) ptr = S_PBEG(p) + beg; - /* Follow String#to_i(base) semantics via rb_str_to_inum. + /* Fast path for base 10 with pure digits: parse directly from + * source bytes without temporary String allocation. + * This covers the Date._strptime use case. */ + if (base == 10) { + long j = 0; + int negative = 0; + long digit_count; + + if (ptr[0] == '-') { negative = 1; j = 1; } + else if (ptr[0] == '+') { j = 1; } + + digit_count = len - j; + if (digit_count > 0) { + long k; + int all_digits = 1; + for (k = j; k < len; k++) { + if (ptr[k] < '0' || ptr[k] > '9') { + all_digits = 0; + break; + } + } + if (all_digits) { + if (digit_count <= (sizeof(long) >= 8 ? 18 : 9)) { + long result = 0; + for (; j < len; j++) { + result = result * 10 + (ptr[j] - '0'); + } + if (negative) result = -result; + return LONG2NUM(result); + } + /* Bignum: fall through to rb_str_to_inum */ + } + } + } + + /* General path: follow String#to_i(base) semantics via rb_str_to_inum. * badcheck=0 returns 0 for non-numeric input instead of raising. 
*/ return rb_str_to_inum(rb_str_new(ptr, len), base, 0); } From d74afd10b36c512223f072eae7ac6945f974f50d Mon Sep 17 00:00:00 2001 From: jinroq Date: Mon, 23 Mar 2026 00:35:16 +0900 Subject: [PATCH 03/34] Add Ruby fallback for integer_at on platforms without C extension Provide a pure Ruby implementation using self[index].to_i(base) for JRuby and other non-CRuby platforms. The C extension version takes precedence when available. --- lib/strscan/strscan.rb | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/lib/strscan/strscan.rb b/lib/strscan/strscan.rb index 46acc7ea82..817d275851 100644 --- a/lib/strscan/strscan.rb +++ b/lib/strscan/strscan.rb @@ -12,6 +12,16 @@ class StringScanner # # The scanned string must be encoded with an ASCII compatible encoding, otherwise # Encoding::CompatibilityError will be raised. + unless method_defined?(:integer_at) + # Fallback implementation for platforms without C extension (e.g. JRuby). + # Equivalent to self[index].to_i(base). + def integer_at(index, base = 10) + str = self[index] + return nil if str.nil? + str.to_i(base) + end + end + def scan_integer(base: 10) case base when 10 From 485f27cf777cfc6ac814563e5485df886125fb1c Mon Sep 17 00:00:00 2001 From: jinroq Date: Mon, 23 Mar 2026 20:48:09 +0900 Subject: [PATCH 04/34] Use bool instead of int for boolean variables in integer_at fast path --- ext/strscan/strscan.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ext/strscan/strscan.c b/ext/strscan/strscan.c index bcd28d7bae..8f8132ce22 100644 --- a/ext/strscan/strscan.c +++ b/ext/strscan/strscan.c @@ -1919,19 +1919,19 @@ strscan_integer_at(int argc, VALUE *argv, VALUE self) * This covers the Date._strptime use case. 
*/ if (base == 10) { long j = 0; - int negative = 0; + bool negative = false; long digit_count; - if (ptr[0] == '-') { negative = 1; j = 1; } + if (ptr[0] == '-') { negative = true; j = 1; } else if (ptr[0] == '+') { j = 1; } digit_count = len - j; if (digit_count > 0) { long k; - int all_digits = 1; + bool all_digits = true; for (k = j; k < len; k++) { if (ptr[k] < '0' || ptr[k] > '9') { - all_digits = 0; + all_digits = false; break; } } From ef6281d5353a70266b778424e327ca1198cc2d09 Mon Sep 17 00:00:00 2001 From: jinroq Date: Mon, 23 Mar 2026 21:10:54 +0900 Subject: [PATCH 05/34] Move integer_at fallback after scan_integer to keep docs together --- lib/strscan/strscan.rb | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/lib/strscan/strscan.rb b/lib/strscan/strscan.rb index 817d275851..cc6e6bc433 100644 --- a/lib/strscan/strscan.rb +++ b/lib/strscan/strscan.rb @@ -12,16 +12,6 @@ class StringScanner # # The scanned string must be encoded with an ASCII compatible encoding, otherwise # Encoding::CompatibilityError will be raised. - unless method_defined?(:integer_at) - # Fallback implementation for platforms without C extension (e.g. JRuby). - # Equivalent to self[index].to_i(base). - def integer_at(index, base = 10) - str = self[index] - return nil if str.nil? - str.to_i(base) - end - end - def scan_integer(base: 10) case base when 10 @@ -32,4 +22,14 @@ def scan_integer(base: 10) raise ArgumentError, "Unsupported integer base: #{base.inspect}, expected 10 or 16" end end + + unless method_defined?(:integer_at) + # Fallback implementation for platforms without C extension (e.g. JRuby). + # Equivalent to self[index].to_i(base). + def integer_at(index, base = 10) + str = self[index] + return nil if str.nil? 
+ str.to_i(base) + end + end end From 6052143e91277a61eee94fa5d0f00324d656f375 Mon Sep 17 00:00:00 2001 From: jinroq Date: Mon, 23 Mar 2026 21:23:47 +0900 Subject: [PATCH 06/34] Optimize fast path to handle leading zeros and near-LONG_MAX values Skip leading zeros to compute effective digit count, allowing values like "00000000000000000001" to use the fast path. Add overflow-checked parsing for 19-digit values so LONG_MAX fits in the fast path while LONG_MAX+1 correctly falls through to rb_str_to_inum. --- ext/strscan/strscan.c | 30 +++++++++++++++++++++++++++++- test/strscan/test_stringscanner.rb | 22 ++++++++++++++++++++++ 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/ext/strscan/strscan.c b/ext/strscan/strscan.c index 8f8132ce22..05eaf352bb 100644 --- a/ext/strscan/strscan.c +++ b/ext/strscan/strscan.c @@ -1936,7 +1936,14 @@ strscan_integer_at(int argc, VALUE *argv, VALUE self) } } if (all_digits) { - if (digit_count <= (sizeof(long) >= 8 ? 18 : 9)) { + /* Skip leading zeros to get effective digit count */ + long first_nonzero = j; + long effective_digits; + while (first_nonzero < len && ptr[first_nonzero] == '0') + first_nonzero++; + effective_digits = len - first_nonzero; + + if (effective_digits <= (sizeof(long) >= 8 ? 18 : 9)) { long result = 0; for (; j < len; j++) { result = result * 10 + (ptr[j] - '0'); @@ -1944,6 +1951,27 @@ strscan_integer_at(int argc, VALUE *argv, VALUE self) if (negative) result = -result; return LONG2NUM(result); } + /* 19 digits on 64-bit (or 10 on 32-bit): may fit in long */ + if (effective_digits <= (sizeof(long) >= 8 ? 19 : 10)) { + unsigned long result = 0; + unsigned long limit = negative + ? 
(unsigned long)LONG_MAX + 1 + : (unsigned long)LONG_MAX; + bool overflow = false; + for (; j < len; j++) { + unsigned long d = ptr[j] - '0'; + if (result > (limit - d) / 10) { + overflow = true; + break; + } + result = result * 10 + d; + } + if (!overflow) { + if (negative) + return LONG2NUM(-(long)result); + return LONG2NUM((long)result); + } + } /* Bignum: fall through to rb_str_to_inum */ } } diff --git a/test/strscan/test_stringscanner.rb b/test/strscan/test_stringscanner.rb index 0c3c587b9d..79314a256c 100644 --- a/test/strscan/test_stringscanner.rb +++ b/test/strscan/test_stringscanner.rb @@ -1044,6 +1044,28 @@ def test_integer_at_fixnum_bignum_boundary s = create_string_scanner("-9999999999999999999") s.scan(/([+\-]?\d+)/) assert_equal(-9999999999999999999, s.integer_at(1)) + + # LONG_MAX (19 digits, fits in long) + long_max = 2 ** (0.size * 8 - 1) - 1 + s = create_string_scanner(long_max.to_s) + s.scan(/(\d+)/) + assert_equal(long_max, s.integer_at(1)) + + # LONG_MIN (19 digits + sign, fits in long) + long_min = -(2 ** (0.size * 8 - 1)) + s = create_string_scanner(long_min.to_s) + s.scan(/([+\-]?\d+)/) + assert_equal(long_min, s.integer_at(1)) + + # LONG_MAX + 1 (bignum) + s = create_string_scanner((long_max + 1).to_s) + s.scan(/(\d+)/) + assert_equal(long_max + 1, s.integer_at(1)) + + # leading zeros with many digits + s = create_string_scanner("00000000000000000001") + s.scan(/(\d+)/) + assert_equal(1, s.integer_at(1)) end def test_integer_at_non_digit From f64fdd8f29aa940f312f84a2cbed54987c124fa6 Mon Sep 17 00:00:00 2001 From: jinroq Date: Mon, 23 Mar 2026 21:30:42 +0900 Subject: [PATCH 07/34] Remove unnecessary capture group in test_integer_at_index_zero --- test/strscan/test_stringscanner.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/strscan/test_stringscanner.rb b/test/strscan/test_stringscanner.rb index 79314a256c..0ba1872f3f 100644 --- a/test/strscan/test_stringscanner.rb +++ b/test/strscan/test_stringscanner.rb @@ -978,7 
+978,7 @@ def test_integer_at def test_integer_at_index_zero s = create_string_scanner("42 abc") - s.scan(/(\d+)/) + s.scan(/\d+/) assert_equal(42, s.integer_at(0)) end From 6a557812ca53e45802b0c6a3d2c758894ed7384b Mon Sep 17 00:00:00 2001 From: jinroq Date: Mon, 23 Mar 2026 21:34:28 +0900 Subject: [PATCH 08/34] Rename test_integer_at to test_integer_at_date_parts for consistency --- test/strscan/test_stringscanner.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/strscan/test_stringscanner.rb b/test/strscan/test_stringscanner.rb index 0ba1872f3f..5e3b315024 100644 --- a/test/strscan/test_stringscanner.rb +++ b/test/strscan/test_stringscanner.rb @@ -968,7 +968,7 @@ def test_named_captures_same_name_union assert_equal({"number" => "1"}, scan.named_captures) end - def test_integer_at + def test_integer_at_date_parts s = create_string_scanner("2024-06-15") s.scan(/(\d{4})-(\d{2})-(\d{2})/) assert_equal(2024, s.integer_at(1)) From 67a10ae0b759697876ebcc848c37baebe12021b8 Mon Sep 17 00:00:00 2001 From: jinroq Date: Mon, 23 Mar 2026 21:43:45 +0900 Subject: [PATCH 09/34] Simplify regex in test_integer_at_optional_group_not_matched Remove nested capture group and check group 3 directly for nil. 
--- test/strscan/test_stringscanner.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/strscan/test_stringscanner.rb b/test/strscan/test_stringscanner.rb index 5e3b315024..95310151bb 100644 --- a/test/strscan/test_stringscanner.rb +++ b/test/strscan/test_stringscanner.rb @@ -1011,10 +1011,10 @@ def test_integer_at_index_out_of_range def test_integer_at_optional_group_not_matched s = create_string_scanner("2024-06") - s.scan(/(\d{4})-(\d{2})(-(\d{2}))?/) + s.scan(/(\d{4})-(\d{2})-?(\d{2})?/) assert_equal(2024, s.integer_at(1)) assert_equal(6, s.integer_at(2)) - assert_nil(s.integer_at(4)) + assert_nil(s.integer_at(3)) end def test_integer_at_large_number From 9ab11722116bb81e3cdbb601b91cf4aec89af507 Mon Sep 17 00:00:00 2001 From: jinroq Date: Mon, 23 Mar 2026 21:55:12 +0900 Subject: [PATCH 10/34] Rename variable huge to large in test_integer_at_large_number --- test/strscan/test_stringscanner.rb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/strscan/test_stringscanner.rb b/test/strscan/test_stringscanner.rb index 95310151bb..64d377b966 100644 --- a/test/strscan/test_stringscanner.rb +++ b/test/strscan/test_stringscanner.rb @@ -1018,10 +1018,10 @@ def test_integer_at_optional_group_not_matched end def test_integer_at_large_number - huge = '9' * 100 - s = create_string_scanner(huge) + large = '9' * 100 + s = create_string_scanner(large) s.scan(/(\d+)/) - assert_equal(huge.to_i, s.integer_at(1)) + assert_equal(large.to_i, s.integer_at(1)) end def test_integer_at_fixnum_bignum_boundary From 960130b5012155714ac88cf483baf9a82b67c263 Mon Sep 17 00:00:00 2001 From: jinroq Date: Mon, 23 Mar 2026 22:05:27 +0900 Subject: [PATCH 11/34] Replace magic numbers with INT64/INT32_DECIMAL_SAFE_DIGITS constants --- ext/strscan/strscan.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/ext/strscan/strscan.c b/ext/strscan/strscan.c index 05eaf352bb..2bc31bf6ae 100644 --- a/ext/strscan/strscan.c +++ 
b/ext/strscan/strscan.c @@ -1914,6 +1914,11 @@ strscan_integer_at(int argc, VALUE *argv, VALUE self) ptr = S_PBEG(p) + beg; + /* Max decimal digits guaranteed to fit in long without overflow check. + * floor(log10(INT64_MAX)) = 18, floor(log10(INT32_MAX)) = 9 */ +#define INT64_DECIMAL_SAFE_DIGITS 18 +#define INT32_DECIMAL_SAFE_DIGITS 9 + /* Fast path for base 10 with pure digits: parse directly from * source bytes without temporary String allocation. * This covers the Date._strptime use case. */ @@ -1943,7 +1948,7 @@ strscan_integer_at(int argc, VALUE *argv, VALUE self) first_nonzero++; effective_digits = len - first_nonzero; - if (effective_digits <= (sizeof(long) >= 8 ? 18 : 9)) { + if (effective_digits <= (sizeof(long) >= 8 ? INT64_DECIMAL_SAFE_DIGITS : INT32_DECIMAL_SAFE_DIGITS)) { long result = 0; for (; j < len; j++) { result = result * 10 + (ptr[j] - '0'); @@ -1951,8 +1956,8 @@ strscan_integer_at(int argc, VALUE *argv, VALUE self) if (negative) result = -result; return LONG2NUM(result); } - /* 19 digits on 64-bit (or 10 on 32-bit): may fit in long */ - if (effective_digits <= (sizeof(long) >= 8 ? 19 : 10)) { + /* One more digit than safe: may still fit in long with overflow check */ + if (effective_digits <= (sizeof(long) >= 8 ? INT64_DECIMAL_SAFE_DIGITS + 1 : INT32_DECIMAL_SAFE_DIGITS + 1)) { unsigned long result = 0; unsigned long limit = negative ? 
(unsigned long)LONG_MAX + 1 From 0065ecfd8d677c0e2160a83ceed74d2ade3b7965 Mon Sep 17 00:00:00 2001 From: jinroq Date: Mon, 23 Mar 2026 22:10:56 +0900 Subject: [PATCH 12/34] Remove unnecessary capture groups in test_integer_at_full_match_with_non_digits --- test/strscan/test_stringscanner.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/strscan/test_stringscanner.rb b/test/strscan/test_stringscanner.rb index 64d377b966..e9b0f1de00 100644 --- a/test/strscan/test_stringscanner.rb +++ b/test/strscan/test_stringscanner.rb @@ -1120,7 +1120,7 @@ def test_integer_at_leading_zeros def test_integer_at_full_match_with_non_digits # follows String#to_i: "2024-06-15".to_i => 2024 s = create_string_scanner("2024-06-15") - s.scan(/(\d{4})-(\d{2})-(\d{2})/) + s.scan(/\d{4}-\d{2}-\d{2}/) assert_equal(2024, s.integer_at(0)) end From d89e54b96b3b1160d34b373e45dbadd88b149c05 Mon Sep 17 00:00:00 2001 From: jinroq Date: Mon, 23 Mar 2026 22:28:44 +0900 Subject: [PATCH 13/34] Remove redundant test_integer_at_full_match_with_non_digits Non-digit behavior is already covered by test_integer_at_non_digit and index 0 is covered by test_integer_at_index_zero. 
--- test/strscan/test_stringscanner.rb | 7 ------- 1 file changed, 7 deletions(-) diff --git a/test/strscan/test_stringscanner.rb b/test/strscan/test_stringscanner.rb index e9b0f1de00..deece5f801 100644 --- a/test/strscan/test_stringscanner.rb +++ b/test/strscan/test_stringscanner.rb @@ -1117,13 +1117,6 @@ def test_integer_at_leading_zeros assert_equal(10, s.integer_at(1)) end - def test_integer_at_full_match_with_non_digits - # follows String#to_i: "2024-06-15".to_i => 2024 - s = create_string_scanner("2024-06-15") - s.scan(/\d{4}-\d{2}-\d{2}/) - assert_equal(2024, s.integer_at(0)) - end - def test_integer_at_named_capture_symbol s = create_string_scanner("2024-06-15") s.scan(/(?<year>\d{4})-(?<month>\d{2})-(?<day>\d{2})/) From 4d3583f6cb85bf82cc8aa62ebcddc6fe5d5c7233 Mon Sep 17 00:00:00 2001 From: jinroq Date: Mon, 23 Mar 2026 22:33:02 +0900 Subject: [PATCH 14/34] Rename test_integer_at_named_capture_undefined to use "unknown" consistently --- test/strscan/test_stringscanner.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/strscan/test_stringscanner.rb b/test/strscan/test_stringscanner.rb index deece5f801..9b633b946a 100644 --- a/test/strscan/test_stringscanner.rb +++ b/test/strscan/test_stringscanner.rb @@ -1133,7 +1133,7 @@ def test_integer_at_named_capture_string assert_equal(15, s.integer_at("day")) end - def test_integer_at_named_capture_undefined + def test_integer_at_named_capture_unknown s = create_string_scanner("2024-06-15") s.scan(/(?<year>\d{4})-(?<month>\d{2})-(?<day>\d{2})/) assert_raise(IndexError) { s.integer_at(:unknown) } From e686711247edcc8c208635cd762dc57b62e93fc5 Mon Sep 17 00:00:00 2001 From: jinroq Date: Tue, 24 Mar 2026 20:04:27 +0900 Subject: [PATCH 15/34] Optimize fast path to handle underscored digit strings Extend base-10 fast path to parse underscore-separated digits (e.g. "1_000_000") without temporary String allocation, following String#to_i underscore rules. 
--- ext/strscan/strscan.c | 91 ++++++++++++++++++------------ test/strscan/test_stringscanner.rb | 10 ++++ 2 files changed, 65 insertions(+), 36 deletions(-) diff --git a/ext/strscan/strscan.c b/ext/strscan/strscan.c index 2bc31bf6ae..53c0f13b48 100644 --- a/ext/strscan/strscan.c +++ b/ext/strscan/strscan.c @@ -1919,51 +1919,70 @@ strscan_integer_at(int argc, VALUE *argv, VALUE self) #define INT64_DECIMAL_SAFE_DIGITS 18 #define INT32_DECIMAL_SAFE_DIGITS 9 - /* Fast path for base 10 with pure digits: parse directly from - * source bytes without temporary String allocation. + /* Fast path for base 10 with digits and optional underscores: + * parse directly from source bytes without temporary String allocation. * This covers the Date._strptime use case. */ if (base == 10) { long j = 0; bool negative = false; - long digit_count; + long digit_count = 0; + bool valid = true; if (ptr[0] == '-') { negative = true; j = 1; } else if (ptr[0] == '+') { j = 1; } - digit_count = len - j; - if (digit_count > 0) { + /* Validate: only digits and underscores (not leading/trailing/consecutive) */ + { long k; - bool all_digits = true; + bool prev_underscore = true; /* treat start as underscore to reject leading _ */ for (k = j; k < len; k++) { - if (ptr[k] < '0' || ptr[k] > '9') { - all_digits = false; + if (ptr[k] >= '0' && ptr[k] <= '9') { + digit_count++; + prev_underscore = false; + } + else if (ptr[k] == '_' && !prev_underscore) { + prev_underscore = true; + } + else { + valid = false; break; } } - if (all_digits) { - /* Skip leading zeros to get effective digit count */ - long first_nonzero = j; - long effective_digits; - while (first_nonzero < len && ptr[first_nonzero] == '0') - first_nonzero++; - effective_digits = len - first_nonzero; - - if (effective_digits <= (sizeof(long) >= 8 ? 
INT64_DECIMAL_SAFE_DIGITS : INT32_DECIMAL_SAFE_DIGITS)) { - long result = 0; - for (; j < len; j++) { + if (prev_underscore && digit_count > 0) valid = false; /* trailing _ */ + } + + if (valid && digit_count > 0) { + /* Skip leading zeros to get effective digit count */ + long first_nonzero = j; + long effective_digits; + while (first_nonzero < len && (ptr[first_nonzero] == '0' || ptr[first_nonzero] == '_')) + first_nonzero++; + effective_digits = 0; + { + long k; + for (k = first_nonzero; k < len; k++) { + if (ptr[k] != '_') effective_digits++; + } + } + + if (effective_digits <= (sizeof(long) >= 8 ? INT64_DECIMAL_SAFE_DIGITS : INT32_DECIMAL_SAFE_DIGITS)) { + long result = 0; + for (; j < len; j++) { + if (ptr[j] != '_') result = result * 10 + (ptr[j] - '0'); - } - if (negative) result = -result; - return LONG2NUM(result); } - /* One more digit than safe: may still fit in long with overflow check */ - if (effective_digits <= (sizeof(long) >= 8 ? INT64_DECIMAL_SAFE_DIGITS + 1 : INT32_DECIMAL_SAFE_DIGITS + 1)) { - unsigned long result = 0; - unsigned long limit = negative - ? (unsigned long)LONG_MAX + 1 - : (unsigned long)LONG_MAX; - bool overflow = false; - for (; j < len; j++) { + if (negative) result = -result; + return LONG2NUM(result); + } + /* One more digit than safe: may still fit in long with overflow check */ + if (effective_digits <= (sizeof(long) >= 8 ? INT64_DECIMAL_SAFE_DIGITS + 1 : INT32_DECIMAL_SAFE_DIGITS + 1)) { + unsigned long result = 0; + unsigned long limit = negative + ? 
(unsigned long)LONG_MAX + 1 + : (unsigned long)LONG_MAX; + bool overflow = false; + for (; j < len; j++) { + if (ptr[j] != '_') { unsigned long d = ptr[j] - '0'; if (result > (limit - d) / 10) { overflow = true; @@ -1971,14 +1990,14 @@ strscan_integer_at(int argc, VALUE *argv, VALUE self) } result = result * 10 + d; } - if (!overflow) { - if (negative) - return LONG2NUM(-(long)result); - return LONG2NUM((long)result); - } } - /* Bignum: fall through to rb_str_to_inum */ + if (!overflow) { + if (negative) + return LONG2NUM(-(long)(result - 1) - 1); /* result may be LONG_MAX + 1 (LONG_MIN input); negating (long)result would be signed overflow */ + return LONG2NUM((long)result); + } } + /* Bignum: fall through to rb_str_to_inum */ } } diff --git a/test/strscan/test_stringscanner.rb b/test/strscan/test_stringscanner.rb index 9b633b946a..ab5196ea8a 100644 --- a/test/strscan/test_stringscanner.rb +++ b/test/strscan/test_stringscanner.rb @@ -1161,6 +1161,16 @@ def test_integer_at_underscore s = create_string_scanner("1_0_0") s.scan(/(\d+(?:_\d+)*)/) assert_equal(100, s.integer_at(1)) + + # large number with underscores + s = create_string_scanner("1_000_000_000") + s.scan(/(\d+(?:_\d+)*)/) + assert_equal(1_000_000_000, s.integer_at(1)) + + # signed with underscores + s = create_string_scanner("-1_000") + s.scan(/([+\-]?\d+(?:_\d+)*)/) + assert_equal(-1000, s.integer_at(1)) end def test_scan_integer From 0f2ad2abdd7248c9cd9bde3935965925396586a8 Mon Sep 17 00:00:00 2001 From: jinroq Date: Tue, 24 Mar 2026 20:20:21 +0900 Subject: [PATCH 16/34] Use proper boundary value pairs in fixnum_bignum_boundary test Replace "9" * 19 with "1" * 19 as the correct next value after "9" * 18, and add LONG_MIN - 1 test to pair with LONG_MIN. 
--- test/strscan/test_stringscanner.rb | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/test/strscan/test_stringscanner.rb b/test/strscan/test_stringscanner.rb index ab5196ea8a..ea0b79980d 100644 --- a/test/strscan/test_stringscanner.rb +++ b/test/strscan/test_stringscanner.rb @@ -1025,25 +1025,25 @@ def test_integer_at_large_number end def test_integer_at_fixnum_bignum_boundary - # 18 digits: fits in long on 64-bit - s = create_string_scanner("999999999999999999") + # 18 digits max ("9" * 18): largest value without overflow check + s = create_string_scanner("9" * 18) s.scan(/(\d+)/) - assert_equal(999999999999999999, s.integer_at(1)) + assert_equal(("9" * 18).to_i, s.integer_at(1)) - # 19 digits: exceeds long on 64-bit, becomes bignum - s = create_string_scanner("9999999999999999999") + # 19 digits min ("1" * 19): smallest value with overflow check + s = create_string_scanner("1" * 19) s.scan(/(\d+)/) - assert_equal(9999999999999999999, s.integer_at(1)) + assert_equal(("1" * 19).to_i, s.integer_at(1)) - # negative 18 digits - s = create_string_scanner("-999999999999999999") + # negative 18 digits max + s = create_string_scanner("-" + "9" * 18) s.scan(/([+\-]?\d+)/) - assert_equal(-999999999999999999, s.integer_at(1)) + assert_equal(-("9" * 18).to_i, s.integer_at(1)) - # negative 19 digits - s = create_string_scanner("-9999999999999999999") + # negative 19 digits min + s = create_string_scanner("-" + "1" * 19) s.scan(/([+\-]?\d+)/) - assert_equal(-9999999999999999999, s.integer_at(1)) + assert_equal(-("1" * 19).to_i, s.integer_at(1)) # LONG_MAX (19 digits, fits in long) long_max = 2 ** (0.size * 8 - 1) - 1 @@ -1062,6 +1062,11 @@ def test_integer_at_fixnum_bignum_boundary s.scan(/(\d+)/) assert_equal(long_max + 1, s.integer_at(1)) + # LONG_MIN - 1 (negative bignum) + s = create_string_scanner((long_min - 1).to_s) + s.scan(/([+\-]?\d+)/) + assert_equal(long_min - 1, s.integer_at(1)) + # leading zeros with many digits s 
= create_string_scanner("00000000000000000001") s.scan(/(\d+)/) From 433fd87aa5329dce9b752c52c020c50a57a42fbf Mon Sep 17 00:00:00 2001 From: jinroq Date: Tue, 24 Mar 2026 20:27:45 +0900 Subject: [PATCH 17/34] Use specifier instead of index for integer_at parameter name Rename the parameter in RDoc, C implementation, and Ruby fallback to match the naming convention used in StringScanner#[]. --- ext/strscan/strscan.c | 18 +++++++++--------- lib/strscan/strscan.rb | 6 +++--- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/ext/strscan/strscan.c b/ext/strscan/strscan.c index 53c0f13b48..1bb7d9dbe3 100644 --- a/ext/strscan/strscan.c +++ b/ext/strscan/strscan.c @@ -1867,20 +1867,20 @@ strscan_values_at(int argc, VALUE *argv, VALUE self) /* * call-seq: - * integer_at(index, base = 10) -> integer or nil + * integer_at(specifier, base = 10) -> integer or nil * - * Returns the captured substring at the given +index+ as an Integer, + * Returns the captured substring at the given +specifier+ as an Integer, * following the behavior of String#to_i(base). * - * +index+ can be an Integer (positive, negative, or zero), a Symbol, + * +specifier+ can be an Integer (positive, negative, or zero), a Symbol, * or a String for named capture groups. * * Returns +nil+ if: * - No match has been performed or the last match failed - * - The +index+ is out of range - * - The group at +index+ did not participate in the match + * - The +specifier+ is out of range + * - The group at +specifier+ did not participate in the match * - * This is semantically equivalent to self[index].to_i(base) + * This is semantically equivalent to self[specifier].to_i(base) * but avoids the allocation of a temporary String when possible. 
* * scanner = StringScanner.new("2024-06-15") @@ -1896,14 +1896,14 @@ strscan_integer_at(int argc, VALUE *argv, VALUE self) long i; long beg, end, len; const char *ptr; - VALUE idx, vbase; + VALUE specifier, vbase; int base = 10; - rb_scan_args(argc, argv, "11", &idx, &vbase); + rb_scan_args(argc, argv, "11", &specifier, &vbase); if (!NIL_P(vbase)) base = NUM2INT(vbase); GET_SCANNER(self, p); - i = resolve_capture_index(p, idx); + i = resolve_capture_index(p, specifier); if (i < 0) return Qnil; beg = adjust_register_position(p, p->regs.beg[i]); diff --git a/lib/strscan/strscan.rb b/lib/strscan/strscan.rb index cc6e6bc433..433bbf90fc 100644 --- a/lib/strscan/strscan.rb +++ b/lib/strscan/strscan.rb @@ -25,9 +25,9 @@ def scan_integer(base: 10) unless method_defined?(:integer_at) # Fallback implementation for platforms without C extension (e.g. JRuby). - # Equivalent to self[index].to_i(base). - def integer_at(index, base = 10) - str = self[index] + # Equivalent to self[specifier].to_i(base). + def integer_at(specifier, base = 10) + str = self[specifier] return nil if str.nil? 
str.to_i(base) end From 72c042630975141484f6a807e56581ee06e68e27 Mon Sep 17 00:00:00 2001 From: jinroq Date: Wed, 25 Mar 2026 21:59:45 +0900 Subject: [PATCH 18/34] Document IndexError for undefined named capture in integer_at RDoc --- ext/strscan/strscan.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ext/strscan/strscan.c b/ext/strscan/strscan.c index 1bb7d9dbe3..064384c062 100644 --- a/ext/strscan/strscan.c +++ b/ext/strscan/strscan.c @@ -1877,9 +1877,13 @@ strscan_values_at(int argc, VALUE *argv, VALUE self) * * Returns +nil+ if: * - No match has been performed or the last match failed - * - The +specifier+ is out of range + * - The +specifier+ is an Integer and is out of range * - The group at +specifier+ did not participate in the match * + * Raises IndexError if +specifier+ is a Symbol or String that does not + * correspond to a named capture group, consistent with + * StringScanner#[]. + * * This is semantically equivalent to self[specifier].to_i(base) * but avoids the allocation of a temporary String when possible. 
* From 0c5c88e05befb49094d39ee8eb350bc5101d1908 Mon Sep 17 00:00:00 2001 From: jinroq Date: Wed, 25 Mar 2026 22:28:11 +0900 Subject: [PATCH 19/34] Clarify that 18/19-digit boundary tests apply to 64-bit longs --- test/strscan/test_stringscanner.rb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/strscan/test_stringscanner.rb b/test/strscan/test_stringscanner.rb index ea0b79980d..f7befe4692 100644 --- a/test/strscan/test_stringscanner.rb +++ b/test/strscan/test_stringscanner.rb @@ -1025,22 +1025,22 @@ def test_integer_at_large_number end def test_integer_at_fixnum_bignum_boundary - # 18 digits max ("9" * 18): largest value without overflow check + # 18 digits max on 64-bit ("9" * 18): largest value without overflow check s = create_string_scanner("9" * 18) s.scan(/(\d+)/) assert_equal(("9" * 18).to_i, s.integer_at(1)) - # 19 digits min ("1" * 19): smallest value with overflow check + # 19 digits min on 64-bit ("1" * 19): smallest value with overflow check s = create_string_scanner("1" * 19) s.scan(/(\d+)/) assert_equal(("1" * 19).to_i, s.integer_at(1)) - # negative 18 digits max + # negative 18 digits max on 64-bit s = create_string_scanner("-" + "9" * 18) s.scan(/([+\-]?\d+)/) assert_equal(-("9" * 18).to_i, s.integer_at(1)) - # negative 19 digits min + # negative 19 digits min on 64-bit s = create_string_scanner("-" + "1" * 19) s.scan(/([+\-]?\d+)/) assert_equal(-("1" * 19).to_i, s.integer_at(1)) From 674e32cc7d0820a4009f53b3bd78a8edf4471337 Mon Sep 17 00:00:00 2001 From: jinroq Date: Wed, 25 Mar 2026 22:43:15 +0900 Subject: [PATCH 20/34] Fix undefined behavior when negating LONG_MIN in overflow-checked path Special-case result == LONG_MAX + 1 to return LONG_MIN directly, avoiding signed integer overflow from -(long)result. 
--- ext/strscan/strscan.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ext/strscan/strscan.c b/ext/strscan/strscan.c index 064384c062..075cdb0a09 100644 --- a/ext/strscan/strscan.c +++ b/ext/strscan/strscan.c @@ -1996,8 +1996,11 @@ strscan_integer_at(int argc, VALUE *argv, VALUE self) } } if (!overflow) { - if (negative) + if (negative) { + if (result == (unsigned long)LONG_MAX + 1) + return LONG2NUM(LONG_MIN); return LONG2NUM(-(long)result); + } return LONG2NUM((long)result); } } From 6b44dbd909d8d1ac5040b77a1fd4a03af2450931 Mon Sep 17 00:00:00 2001 From: jinroq Date: Wed, 25 Mar 2026 22:56:03 +0900 Subject: [PATCH 21/34] Add test for integer_at when scanner position is not at the beginning --- test/strscan/test_stringscanner.rb | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/strscan/test_stringscanner.rb b/test/strscan/test_stringscanner.rb index f7befe4692..d2eb944cdb 100644 --- a/test/strscan/test_stringscanner.rb +++ b/test/strscan/test_stringscanner.rb @@ -980,6 +980,12 @@ def test_integer_at_index_zero s = create_string_scanner("42 abc") s.scan(/\d+/) assert_equal(42, s.integer_at(0)) + + # when current position is not at the beginning + s = create_string_scanner("a 10") + s.scan(/a /) + s.scan(/\d+/) + assert_equal(10, s.integer_at(0)) end def test_integer_at_negative_index From b6fe693818c6a2872fab774f56e0f1d7df66e42f Mon Sep 17 00:00:00 2001 From: jinroq Date: Tue, 31 Mar 2026 20:21:22 +0900 Subject: [PATCH 22/34] Rename parameter idx to specifier in resolve_capture_index --- ext/strscan/strscan.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ext/strscan/strscan.c b/ext/strscan/strscan.c index 075cdb0a09..229ae97775 100644 --- a/ext/strscan/strscan.c +++ b/ext/strscan/strscan.c @@ -1624,23 +1624,23 @@ name_to_backref_number(struct re_registers *regs, VALUE regexp, const char* name /* Resolve capture group index from Integer, Symbol, or String. 
* Returns the resolved register index, or -1 if unmatched/out of range. */ static long -resolve_capture_index(struct strscanner *p, VALUE idx) +resolve_capture_index(struct strscanner *p, VALUE specifier) { const char *name; long i; if (! MATCHED_P(p)) return -1; - switch (TYPE(idx)) { + switch (TYPE(specifier)) { case T_SYMBOL: - idx = rb_sym2str(idx); + specifier = rb_sym2str(specifier); /* fall through */ case T_STRING: - RSTRING_GETMEM(idx, name, i); - i = name_to_backref_number(&(p->regs), p->regex, name, name + i, rb_enc_get(idx)); + RSTRING_GETMEM(specifier, name, i); + i = name_to_backref_number(&(p->regs), p->regex, name, name + i, rb_enc_get(specifier)); break; default: - i = NUM2LONG(idx); + i = NUM2LONG(specifier); } if (i < 0) From a078c0dc4ccfffbfdd47ed2bf70db906053a02f0 Mon Sep 17 00:00:00 2001 From: jinroq Date: Tue, 31 Mar 2026 20:42:44 +0900 Subject: [PATCH 23/34] Extract base-10 fast path into parse_decimal_fast static inline function --- ext/strscan/strscan.c | 185 ++++++++++++++++++++++-------------------- 1 file changed, 98 insertions(+), 87 deletions(-) diff --git a/ext/strscan/strscan.c b/ext/strscan/strscan.c index 229ae97775..f21fdc874a 100644 --- a/ext/strscan/strscan.c +++ b/ext/strscan/strscan.c @@ -1893,6 +1893,100 @@ strscan_values_at(int argc, VALUE *argv, VALUE self) * scanner.integer_at(1, 16) # => 8228 * */ +/* Max decimal digits guaranteed to fit in long without overflow check. + * floor(log10(INT64_MAX)) = 18, floor(log10(INT32_MAX)) = 9 */ +#define INT64_DECIMAL_SAFE_DIGITS 18 +#define INT32_DECIMAL_SAFE_DIGITS 9 + +/* Fast path for base-10 integer parsing without temporary String allocation. + * Accepts digits and optional underscores (Ruby String#to_i semantics). + * Returns a Fixnum/Integer VALUE on success, or Qundef to signal fall-through + * to the general path (non-decimal, bignum, or non-numeric input). 
*/ +static inline VALUE +parse_decimal_fast(const char *ptr, long len) +{ + long j = 0; + bool negative = false; + long digit_count = 0; + bool valid = true; + + if (ptr[0] == '-') { negative = true; j = 1; } + else if (ptr[0] == '+') { j = 1; } + + /* Validate: only digits and underscores (not leading/trailing/consecutive) */ + { + long k; + bool prev_underscore = true; /* treat start as underscore to reject leading _ */ + for (k = j; k < len; k++) { + if (ptr[k] >= '0' && ptr[k] <= '9') { + digit_count++; + prev_underscore = false; + } + else if (ptr[k] == '_' && !prev_underscore) { + prev_underscore = true; + } + else { + valid = false; + break; + } + } + if (prev_underscore && digit_count > 0) valid = false; /* trailing _ */ + } + + if (!valid || digit_count == 0) return Qundef; + + /* Skip leading zeros to get effective digit count */ + { + long first_nonzero = j; + long effective_digits; + long k; + while (first_nonzero < len && (ptr[first_nonzero] == '0' || ptr[first_nonzero] == '_')) + first_nonzero++; + effective_digits = 0; + for (k = first_nonzero; k < len; k++) { + if (ptr[k] != '_') effective_digits++; + } + + if (effective_digits <= (sizeof(long) >= 8 ? INT64_DECIMAL_SAFE_DIGITS : INT32_DECIMAL_SAFE_DIGITS)) { + long result = 0; + for (; j < len; j++) { + if (ptr[j] != '_') + result = result * 10 + (ptr[j] - '0'); + } + if (negative) result = -result; + return LONG2NUM(result); + } + /* One more digit than safe: may still fit in long with overflow check */ + if (effective_digits <= (sizeof(long) >= 8 ? INT64_DECIMAL_SAFE_DIGITS + 1 : INT32_DECIMAL_SAFE_DIGITS + 1)) { + unsigned long result = 0; + unsigned long limit = negative + ? 
(unsigned long)LONG_MAX + 1 + : (unsigned long)LONG_MAX; + bool overflow = false; + for (; j < len; j++) { + if (ptr[j] != '_') { + unsigned long d = ptr[j] - '0'; + if (result > (limit - d) / 10) { + overflow = true; + break; + } + result = result * 10 + d; + } + } + if (!overflow) { + if (negative) { + if (result == (unsigned long)LONG_MAX + 1) + return LONG2NUM(LONG_MIN); + return LONG2NUM(-(long)result); + } + return LONG2NUM((long)result); + } + } + } + /* Bignum: signal fall-through to rb_str_to_inum */ + return Qundef; +} + static VALUE strscan_integer_at(int argc, VALUE *argv, VALUE self) { @@ -1918,94 +2012,11 @@ strscan_integer_at(int argc, VALUE *argv, VALUE self) ptr = S_PBEG(p) + beg; - /* Max decimal digits guaranteed to fit in long without overflow check. - * floor(log10(INT64_MAX)) = 18, floor(log10(INT32_MAX)) = 9 */ -#define INT64_DECIMAL_SAFE_DIGITS 18 -#define INT32_DECIMAL_SAFE_DIGITS 9 - - /* Fast path for base 10 with digits and optional underscores: - * parse directly from source bytes without temporary String allocation. - * This covers the Date._strptime use case. */ + /* Fast path for base 10: parse directly from source bytes without + * temporary String allocation. This covers the Date._strptime use case. 
*/ if (base == 10) { - long j = 0; - bool negative = false; - long digit_count = 0; - bool valid = true; - - if (ptr[0] == '-') { negative = true; j = 1; } - else if (ptr[0] == '+') { j = 1; } - - /* Validate: only digits and underscores (not leading/trailing/consecutive) */ - { - long k; - bool prev_underscore = true; /* treat start as underscore to reject leading _ */ - for (k = j; k < len; k++) { - if (ptr[k] >= '0' && ptr[k] <= '9') { - digit_count++; - prev_underscore = false; - } - else if (ptr[k] == '_' && !prev_underscore) { - prev_underscore = true; - } - else { - valid = false; - break; - } - } - if (prev_underscore && digit_count > 0) valid = false; /* trailing _ */ - } - - if (valid && digit_count > 0) { - /* Skip leading zeros to get effective digit count */ - long first_nonzero = j; - long effective_digits; - while (first_nonzero < len && (ptr[first_nonzero] == '0' || ptr[first_nonzero] == '_')) - first_nonzero++; - effective_digits = 0; - { - long k; - for (k = first_nonzero; k < len; k++) { - if (ptr[k] != '_') effective_digits++; - } - } - - if (effective_digits <= (sizeof(long) >= 8 ? INT64_DECIMAL_SAFE_DIGITS : INT32_DECIMAL_SAFE_DIGITS)) { - long result = 0; - for (; j < len; j++) { - if (ptr[j] != '_') - result = result * 10 + (ptr[j] - '0'); - } - if (negative) result = -result; - return LONG2NUM(result); - } - /* One more digit than safe: may still fit in long with overflow check */ - if (effective_digits <= (sizeof(long) >= 8 ? INT64_DECIMAL_SAFE_DIGITS + 1 : INT32_DECIMAL_SAFE_DIGITS + 1)) { - unsigned long result = 0; - unsigned long limit = negative - ? 
(unsigned long)LONG_MAX + 1 - : (unsigned long)LONG_MAX; - bool overflow = false; - for (; j < len; j++) { - if (ptr[j] != '_') { - unsigned long d = ptr[j] - '0'; - if (result > (limit - d) / 10) { - overflow = true; - break; - } - result = result * 10 + d; - } - } - if (!overflow) { - if (negative) { - if (result == (unsigned long)LONG_MAX + 1) - return LONG2NUM(LONG_MIN); - return LONG2NUM(-(long)result); - } - return LONG2NUM((long)result); - } - } - /* Bignum: fall through to rb_str_to_inum */ - } + VALUE result = parse_decimal_fast(ptr, len); + if (result != Qundef) return result; } /* General path: follow String#to_i(base) semantics via rb_str_to_inum. From b1b135ab96c87321ba295218fc9ef8dfe0ef9946 Mon Sep 17 00:00:00 2001 From: jinroq Date: Tue, 31 Mar 2026 21:09:23 +0900 Subject: [PATCH 24/34] Add else for readability in overflow-checked path of parse_decimal_fast --- ext/strscan/strscan.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ext/strscan/strscan.c b/ext/strscan/strscan.c index f21fdc874a..9eaffe84c6 100644 --- a/ext/strscan/strscan.c +++ b/ext/strscan/strscan.c @@ -1979,7 +1979,8 @@ parse_decimal_fast(const char *ptr, long len) return LONG2NUM(LONG_MIN); return LONG2NUM(-(long)result); } - return LONG2NUM((long)result); + else + return LONG2NUM((long)result); } } } From db383e9f92735b6abc06a7b8f30487c11a3e0364 Mon Sep 17 00:00:00 2001 From: jinroq Date: Tue, 31 Mar 2026 21:14:52 +0900 Subject: [PATCH 25/34] Simplify overflow check in parse_decimal_fast by computing before comparing --- ext/strscan/strscan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ext/strscan/strscan.c b/ext/strscan/strscan.c index 9eaffe84c6..dc00b10f9c 100644 --- a/ext/strscan/strscan.c +++ b/ext/strscan/strscan.c @@ -1966,11 +1966,11 @@ parse_decimal_fast(const char *ptr, long len) for (; j < len; j++) { if (ptr[j] != '_') { unsigned long d = ptr[j] - '0'; - if (result > (limit - d) / 10) { + result = result * 10 + d; + if 
(result > limit) { overflow = true; break; } - result = result * 10 + d; } } if (!overflow) { From a421b1228ed79b9c9b5391d08b1866032b1b93ed Mon Sep 17 00:00:00 2001 From: jinroq Date: Tue, 31 Mar 2026 21:47:58 +0900 Subject: [PATCH 26/34] Use else if and == for boundary check in parse_decimal_fast overflow path --- ext/strscan/strscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/strscan/strscan.c b/ext/strscan/strscan.c index dc00b10f9c..77ba4d2b14 100644 --- a/ext/strscan/strscan.c +++ b/ext/strscan/strscan.c @@ -1957,7 +1957,7 @@ parse_decimal_fast(const char *ptr, long len) return LONG2NUM(result); } /* One more digit than safe: may still fit in long with overflow check */ - if (effective_digits <= (sizeof(long) >= 8 ? INT64_DECIMAL_SAFE_DIGITS + 1 : INT32_DECIMAL_SAFE_DIGITS + 1)) { + else if (effective_digits == (sizeof(long) >= 8 ? INT64_DECIMAL_SAFE_DIGITS + 1 : INT32_DECIMAL_SAFE_DIGITS + 1)) { unsigned long result = 0; unsigned long limit = negative ? 
(unsigned long)LONG_MAX + 1 From a81aa3e9ad432e814da7e9a879348f675219bb41 Mon Sep 17 00:00:00 2001 From: jinroq Date: Tue, 31 Mar 2026 21:48:37 +0900 Subject: [PATCH 27/34] Rename test_integer_at_date_parts to test_integer_at_positive_index --- test/strscan/test_stringscanner.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/strscan/test_stringscanner.rb b/test/strscan/test_stringscanner.rb index d2eb944cdb..4fa6368770 100644 --- a/test/strscan/test_stringscanner.rb +++ b/test/strscan/test_stringscanner.rb @@ -968,7 +968,7 @@ def test_named_captures_same_name_union assert_equal({"number" => "1"}, scan.named_captures) end - def test_integer_at_date_parts + def test_integer_at_positive_index s = create_string_scanner("2024-06-15") s.scan(/(\d{4})-(\d{2})-(\d{2})/) assert_equal(2024, s.integer_at(1)) From d07b0867adea0f674e3e8cedb9c368fefcf4a22c Mon Sep 17 00:00:00 2001 From: jinroq Date: Tue, 31 Mar 2026 21:59:44 +0900 Subject: [PATCH 28/34] Fix boundary test to use smallest 19-digit number in test_integer_at_fixnum_bignum_boundary --- test/strscan/test_stringscanner.rb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/strscan/test_stringscanner.rb b/test/strscan/test_stringscanner.rb index 4fa6368770..e092b6a151 100644 --- a/test/strscan/test_stringscanner.rb +++ b/test/strscan/test_stringscanner.rb @@ -1036,10 +1036,10 @@ def test_integer_at_fixnum_bignum_boundary s.scan(/(\d+)/) assert_equal(("9" * 18).to_i, s.integer_at(1)) - # 19 digits min on 64-bit ("1" * 19): smallest value with overflow check - s = create_string_scanner("1" * 19) + # 19 digits min on 64-bit ("1" + "0" * 18): smallest value with overflow check + s = create_string_scanner("1" + "0" * 18) s.scan(/(\d+)/) - assert_equal(("1" * 19).to_i, s.integer_at(1)) + assert_equal(("1" + "0" * 18).to_i, s.integer_at(1)) # negative 18 digits max on 64-bit s = create_string_scanner("-" + "9" * 18) From 1ae57726c300ed5749372ebffc1febb08d65178d Mon Sep 17 
00:00:00 2001 From: jinroq Date: Tue, 31 Mar 2026 22:02:29 +0900 Subject: [PATCH 29/34] Use "0" * 19 + "1" for readability in leading zeros test --- test/strscan/test_stringscanner.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/strscan/test_stringscanner.rb b/test/strscan/test_stringscanner.rb index e092b6a151..6c2f8464f5 100644 --- a/test/strscan/test_stringscanner.rb +++ b/test/strscan/test_stringscanner.rb @@ -1074,7 +1074,7 @@ def test_integer_at_fixnum_bignum_boundary assert_equal(long_min - 1, s.integer_at(1)) # leading zeros with many digits - s = create_string_scanner("00000000000000000001") + s = create_string_scanner("0" * 19 + "1") s.scan(/(\d+)/) assert_equal(1, s.integer_at(1)) end From 04a7f847f4359308cbe93ab91bba8b1954bd3b0e Mon Sep 17 00:00:00 2001 From: jinroq Date: Tue, 31 Mar 2026 22:19:40 +0900 Subject: [PATCH 30/34] Fix negative boundary test to use smallest 19-digit absolute value in test_integer_at_fixnum_bignum_boundary --- test/strscan/test_stringscanner.rb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/strscan/test_stringscanner.rb b/test/strscan/test_stringscanner.rb index 6c2f8464f5..36041f1357 100644 --- a/test/strscan/test_stringscanner.rb +++ b/test/strscan/test_stringscanner.rb @@ -1046,10 +1046,10 @@ def test_integer_at_fixnum_bignum_boundary s.scan(/([+\-]?\d+)/) assert_equal(-("9" * 18).to_i, s.integer_at(1)) - # negative 19 digits min on 64-bit - s = create_string_scanner("-" + "1" * 19) + # negative 19 digits min on 64-bit ("-" + "1" + "0" * 18): smallest absolute value with overflow check + s = create_string_scanner("-" + "1" + "0" * 18) s.scan(/([+\-]?\d+)/) - assert_equal(-("1" * 19).to_i, s.integer_at(1)) + assert_equal(-("1" + "0" * 18).to_i, s.integer_at(1)) # LONG_MAX (19 digits, fits in long) long_max = 2 ** (0.size * 8 - 1) - 1 From 21c6be8e69324bb897d5ec1bdf59df53b6aac54a Mon Sep 17 00:00:00 2001 From: jinroq Date: Tue, 31 Mar 2026 22:25:50 +0900 Subject: [PATCH 
31/34] Split test_integer_at_fixnum_bignum_boundary into digit_count and long boundary tests --- test/strscan/test_stringscanner.rb | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/test/strscan/test_stringscanner.rb b/test/strscan/test_stringscanner.rb index 36041f1357..d472e4741a 100644 --- a/test/strscan/test_stringscanner.rb +++ b/test/strscan/test_stringscanner.rb @@ -1030,7 +1030,7 @@ def test_integer_at_large_number assert_equal(large.to_i, s.integer_at(1)) end - def test_integer_at_fixnum_bignum_boundary + def test_integer_at_digit_count_boundary # 18 digits max on 64-bit ("9" * 18): largest value without overflow check s = create_string_scanner("9" * 18) s.scan(/(\d+)/) @@ -1050,15 +1050,18 @@ def test_integer_at_fixnum_bignum_boundary s = create_string_scanner("-" + "1" + "0" * 18) s.scan(/([+\-]?\d+)/) assert_equal(-("1" + "0" * 18).to_i, s.integer_at(1)) + end - # LONG_MAX (19 digits, fits in long) + def test_integer_at_long_boundary long_max = 2 ** (0.size * 8 - 1) - 1 + long_min = -(2 ** (0.size * 8 - 1)) + + # LONG_MAX (19 digits, fits in long) s = create_string_scanner(long_max.to_s) s.scan(/(\d+)/) assert_equal(long_max, s.integer_at(1)) # LONG_MIN (19 digits + sign, fits in long) - long_min = -(2 ** (0.size * 8 - 1)) s = create_string_scanner(long_min.to_s) s.scan(/([+\-]?\d+)/) assert_equal(long_min, s.integer_at(1)) @@ -1072,11 +1075,6 @@ def test_integer_at_fixnum_bignum_boundary s = create_string_scanner((long_min - 1).to_s) s.scan(/([+\-]?\d+)/) assert_equal(long_min - 1, s.integer_at(1)) - - # leading zeros with many digits - s = create_string_scanner("0" * 19 + "1") - s.scan(/(\d+)/) - assert_equal(1, s.integer_at(1)) end def test_integer_at_non_digit @@ -1126,6 +1124,11 @@ def test_integer_at_leading_zeros s = create_string_scanner("010") s.scan(/(\d+)/) assert_equal(10, s.integer_at(1)) + + # leading zeros with many digits: effective digit count is 1, goes through safe path + s = 
create_string_scanner("0" * 19 + "1") + s.scan(/(\d+)/) + assert_equal(1, s.integer_at(1)) end def test_integer_at_named_capture_symbol From e6e5c27fa127e51950c44895e9cb2640453834ed Mon Sep 17 00:00:00 2001 From: jinroq Date: Wed, 1 Apr 2026 21:13:22 +0900 Subject: [PATCH 32/34] Revert "Simplify overflow check in parse_decimal_fast by computing before comparing" This reverts commit db383e9f92735b6abc06a7b8f30487c11a3e0364. --- ext/strscan/strscan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ext/strscan/strscan.c b/ext/strscan/strscan.c index 77ba4d2b14..930257b672 100644 --- a/ext/strscan/strscan.c +++ b/ext/strscan/strscan.c @@ -1966,11 +1966,11 @@ parse_decimal_fast(const char *ptr, long len) for (; j < len; j++) { if (ptr[j] != '_') { unsigned long d = ptr[j] - '0'; - result = result * 10 + d; - if (result > limit) { + if (result > (limit - d) / 10) { overflow = true; break; } + result = result * 10 + d; } } if (!overflow) { From d5b3651766bfca36894490a8c7795208fa7e63cd Mon Sep 17 00:00:00 2001 From: jinroq Date: Wed, 1 Apr 2026 21:22:18 +0900 Subject: [PATCH 33/34] Add comment explaining pre-check guards against 32-bit unsigned long wraparound --- ext/strscan/strscan.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ext/strscan/strscan.c b/ext/strscan/strscan.c index 930257b672..e44c6565d5 100644 --- a/ext/strscan/strscan.c +++ b/ext/strscan/strscan.c @@ -1966,6 +1966,8 @@ parse_decimal_fast(const char *ptr, long len) for (; j < len; j++) { if (ptr[j] != '_') { unsigned long d = ptr[j] - '0'; + /* Pre-check before multiply to avoid unsigned long wraparound on + * 32-bit platforms, where 10-digit values can exceed ULONG_MAX. 
*/ if (result > (limit - d) / 10) { overflow = true; break; From ba15508f1cb7d99fb1d1d2e28dc40be43049f276 Mon Sep 17 00:00:00 2001 From: jinroq Date: Wed, 1 Apr 2026 21:42:24 +0900 Subject: [PATCH 34/34] Raise TypeError for explicit nil base argument in integer_at --- ext/strscan/strscan.c | 7 ++++--- test/strscan/test_stringscanner.rb | 3 +++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/ext/strscan/strscan.c b/ext/strscan/strscan.c index e44c6565d5..597c96d232 100644 --- a/ext/strscan/strscan.c +++ b/ext/strscan/strscan.c @@ -1997,11 +1997,12 @@ strscan_integer_at(int argc, VALUE *argv, VALUE self) long i; long beg, end, len; const char *ptr; - VALUE specifier, vbase; + VALUE specifier; int base = 10; - rb_scan_args(argc, argv, "11", &specifier, &vbase); - if (!NIL_P(vbase)) base = NUM2INT(vbase); + rb_check_arity(argc, 1, 2); + specifier = argv[0]; + if (argc > 1) base = NUM2INT(argv[1]); GET_SCANNER(self, p); i = resolve_capture_index(p, specifier); diff --git a/test/strscan/test_stringscanner.rb b/test/strscan/test_stringscanner.rb index d472e4741a..12ebbc8a35 100644 --- a/test/strscan/test_stringscanner.rb +++ b/test/strscan/test_stringscanner.rb @@ -1160,6 +1160,9 @@ def test_integer_at_base assert_equal(2024, s.integer_at(1)) # default base 10 assert_equal(1044, s.integer_at(1, 8)) # base 8 assert_equal(8228, s.integer_at(1, 16)) # base 16 + + # explicit nil raises TypeError (consistent with String#to_i and MatchData#integer_at) + assert_raise(TypeError) { s.integer_at(1, nil) } end def test_integer_at_base_zero