From 7fc5a40cf71f6caf0d698d429a7eebbfd9980053 Mon Sep 17 00:00:00 2001
From: Scolliq <146259639+Scolliq@users.noreply.github.com>
Date: Sat, 25 Apr 2026 00:59:11 +0000
Subject: [PATCH 1/2] perf(spark): use 256-entry byte-pair table in hex
 encoding

The bytes path looked up two nibbles and pushed two bytes per input
byte. Replace it with a precomputed `[[u8; 2]; 256]` table built at
compile time, so each input byte becomes one indexed load and one
two-byte extend_from_slice. The int64 path now consumes two nibbles
per loop iteration via the same table, then emits a leftover single
high nibble, if any, from the 16-entry nibble table.

Existing benchmarks in `datafusion/spark/benches/hex.rs` cover the
hot paths (Int64, Utf8, Utf8View, LargeUtf8, Binary, LargeBinary,
plus dictionary variants).

Adds tests covering all 256 byte values against `format!("{byte:02X}")`
and `format!("{byte:02x}")`, plus i64 edge cases (`0`, `i64::MAX`,
`i64::MIN`, `-1`).

Refs #15986
---
 datafusion/spark/src/function/math/hex.rs | 117 +++++++++++++++++++---
 1 file changed, 101 insertions(+), 16 deletions(-)

diff --git a/datafusion/spark/src/function/math/hex.rs b/datafusion/spark/src/function/math/hex.rs
index 90444ba9d552a..f3cc2516311a6 100644
--- a/datafusion/spark/src/function/math/hex.rs
+++ b/datafusion/spark/src/function/math/hex.rs
@@ -108,9 +108,27 @@ impl ScalarUDFImpl for SparkHex {
     }
 }
 
-/// Hex encoding lookup tables for fast byte-to-hex conversion
-const HEX_CHARS_LOWER: &[u8; 16] = b"0123456789abcdef";
-const HEX_CHARS_UPPER: &[u8; 16] = b"0123456789ABCDEF";
+/// Hex encoding lookup tables for fast byte-to-hex conversion.
+///
+/// The 16-entry nibble tables seed the 256-entry byte-pair tables, which map
+/// a full byte to its two-character hex encoding: one load + one two-byte
+/// extend per input byte instead of two nibble lookups and two pushes.
+const HEX_CHARS_UPPER_NIBBLES: &[u8; 16] = b"0123456789ABCDEF";
+const HEX_CHARS_LOWER_NIBBLES: &[u8; 16] = b"0123456789abcdef";
+
+const HEX_LOOKUP_UPPER: [[u8; 2]; 256] = build_hex_lookup(HEX_CHARS_UPPER_NIBBLES);
+const HEX_LOOKUP_LOWER: [[u8; 2]; 256] = build_hex_lookup(HEX_CHARS_LOWER_NIBBLES);
+
+const fn build_hex_lookup(nibbles: &[u8; 16]) -> [[u8; 2]; 256] {
+    let mut table = [[0u8; 2]; 256];
+    let mut i = 0;
+    while i < 256 {
+        table[i][0] = nibbles[(i >> 4) & 0xF];
+        table[i][1] = nibbles[i & 0xF];
+        i += 1;
+    }
+    table
+}
 
 #[inline]
 fn hex_int64(num: i64, buffer: &mut [u8; 16]) -> &[u8] {
@@ -118,12 +136,22 @@ fn hex_int64(num: i64, buffer: &mut [u8; 16]) -> &[u8] {
         return b"0";
     }
 
+    // Walk the value two nibbles (one full byte) at a time, filling the
+    // buffer from the right so the high-order digits end up first. Only
+    // significant digits are written, so the slice has no leading zeros.
     let mut n = num as u64;
     let mut i = 16;
-    while n != 0 {
+    while n >= 0x10 {
+        i -= 2;
+        let pair = HEX_LOOKUP_UPPER[(n & 0xFF) as usize];
+        buffer[i] = pair[0];
+        buffer[i + 1] = pair[1];
+        n >>= 8;
+    }
+    if n > 0 {
+        // Single remaining high nibble (value 0x1..=0xF).
         i -= 1;
-        buffer[i] = HEX_CHARS_UPPER[(n & 0xF) as usize];
-        n >>= 4;
+        buffer[i] = HEX_CHARS_UPPER_NIBBLES[n as usize];
     }
     &buffer[i..]
 }
@@ -140,21 +168,21 @@ where
 {
     let mut builder = StringBuilder::with_capacity(len, len * 64);
     let mut buffer = Vec::with_capacity(64);
-    let hex_chars = if lowercase {
-        HEX_CHARS_LOWER
+    let lookup = if lowercase {
+        &HEX_LOOKUP_LOWER
     } else {
-        HEX_CHARS_UPPER
+        &HEX_LOOKUP_UPPER
     };
 
     for v in iter {
         if let Some(b) = v {
-            buffer.clear();
             let bytes = b.as_ref();
+            buffer.clear();
+            buffer.reserve(bytes.len() * 2);
             for &byte in bytes {
-                buffer.push(hex_chars[(byte >> 4) as usize]);
-                buffer.push(hex_chars[(byte & 0x0f) as usize]);
+                buffer.extend_from_slice(&lookup[byte as usize]);
             }
-            // SAFETY: buffer contains only ASCII hex digests, which are valid UTF-8
+            // SAFETY: buffer contains only ASCII hex digits, which are valid UTF-8.
             unsafe {
                 builder.append_value(from_utf8_unchecked(&buffer));
             }
@@ -327,7 +355,9 @@ mod test {
     use std::str::from_utf8_unchecked;
     use std::sync::Arc;
 
-    use arrow::array::{DictionaryArray, Int32Array, Int64Array, StringArray};
+    use arrow::array::{
+        BinaryArray, DictionaryArray, Int32Array, Int64Array, StringArray,
+    };
     use arrow::{
         array::{
             BinaryDictionaryBuilder, PrimitiveDictionaryBuilder, StringDictionaryBuilder,
@@ -427,7 +457,18 @@ mod test {
 
     #[test]
     fn test_hex_int64() {
-        let test_cases = vec![(1234, "4D2"), (-1, "FFFFFFFFFFFFFFFF")];
+        let test_cases = vec![
+            (0_i64, "0"),
+            (1, "1"),
+            (15, "F"),
+            (16, "10"),
+            (255, "FF"),
+            (256, "100"),
+            (1234, "4D2"),
+            (i64::MAX, "7FFFFFFFFFFFFFFF"),
+            (i64::MIN, "8000000000000000"),
+            (-1, "FFFFFFFFFFFFFFFF"),
+        ];
 
         for (num, expected) in test_cases {
             let mut cache = [0u8; 16];
@@ -435,11 +476,55 @@ mod test {
 
             unsafe {
                 let result = from_utf8_unchecked(slice);
-                assert_eq!(expected, result);
+                assert_eq!(expected, result, "hex_int64({num}) mismatch");
             }
         }
     }
 
+    #[test]
+    fn test_hex_lookup_table_covers_all_bytes() {
+        // Cross-check the precomputed table against an independent encoder
+        // for every possible byte value and both casings.
+        for byte in 0u8..=255 {
+            let upper = format!("{byte:02X}");
+            let lower = format!("{byte:02x}");
+            let upper_pair = super::HEX_LOOKUP_UPPER[byte as usize];
+            let lower_pair = super::HEX_LOOKUP_LOWER[byte as usize];
+            assert_eq!(
+                upper.as_bytes(),
+                &upper_pair,
+                "upper encoding mismatch for byte 0x{byte:02X}"
+            );
+            assert_eq!(
+                lower.as_bytes(),
+                &lower_pair,
+                "lower encoding mismatch for byte 0x{byte:02X}"
+            );
+        }
+    }
+
+    #[test]
+    fn test_spark_hex_binary_round_trip_all_bytes() {
+        // Single-row binary input containing every byte value, encoded in
+        // a single column. Catches per-byte regressions in the bytes path.
+        let payload: Vec<u8> = (0u8..=255).collect();
+        let bin_array = BinaryArray::from(vec![Some(payload.as_slice())]);
+
+        let result = super::spark_hex(&[ColumnarValue::Array(Arc::new(bin_array))])
+            .unwrap();
+        let array = match result {
+            ColumnarValue::Array(array) => array,
+            _ => panic!("Expected array"),
+        };
+        let strings = as_string_array(&array);
+        let mut expected = String::with_capacity(512);
+        for byte in 0u8..=255 {
+            use std::fmt::Write;
+            write!(expected, "{byte:02X}").unwrap();
+        }
+        assert_eq!(strings.value(0), expected);
+    }
+
     #[test]
     fn test_spark_hex_int64() {
         let int_array = Int64Array::from(vec![Some(1), Some(2), None, Some(3)]);

From 0c3eb176a146953c9e0e9d33c68aa95227325fba Mon Sep 17 00:00:00 2001
From: Scolliq <146259639+Scolliq@users.noreply.github.com>
Date: Sat, 25 Apr 2026 09:53:04 +0200
Subject: [PATCH 2/2] style: apply cargo fmt to hex.rs test

---
 datafusion/spark/src/function/math/hex.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/datafusion/spark/src/function/math/hex.rs b/datafusion/spark/src/function/math/hex.rs
index f3cc2516311a6..22e0b5b0786ea 100644
--- a/datafusion/spark/src/function/math/hex.rs
+++ b/datafusion/spark/src/function/math/hex.rs
@@ -510,8 +510,8 @@ mod test {
         let payload: Vec<u8> = (0u8..=255).collect();
         let bin_array = BinaryArray::from(vec![Some(payload.as_slice())]);
 
-        let result = super::spark_hex(&[ColumnarValue::Array(Arc::new(bin_array))])
-            .unwrap();
+        let result =
+            super::spark_hex(&[ColumnarValue::Array(Arc::new(bin_array))]).unwrap();
         let array = match result {
             ColumnarValue::Array(array) => array,
             _ => panic!("Expected array"),