From 5e898d7f778e2eccb610d4b467372a630efd1a47 Mon Sep 17 00:00:00 2001
From: David Garske <david@wolfssl.com>
Date: Thu, 18 Jun 2026 09:15:10 -0700
Subject: [PATCH 1/4] wolfcrypt: support CHAR_BIT != 8 (WOLFSSL_WIDE_BYTE)
 targets - infra, hashes, DRBG

Enables wolfCrypt on toolchains where a C byte/char is wider than 8 bits (e.g.
TI C2000 C28x, CHAR_BIT == 16), all gated on WOLFSSL_WIDE_BYTE and a no-op on
8-bit-byte targets (the default fast paths are left exactly as-is):
 - types.h: auto-set WOLFSSL_WIDE_BYTE for CHAR_BIT != 8 / known TI C2000
   toolchains (and define CHAR_BIT = 16 when <limits.h> is absent); wc_port.h/.c
   widen the atomic init-state bitfield + CHAR_BIT static assert for 16-bit int.
 - settings.h + sp_int.h: allow SP math on a 16-bit-int CPU via
   WOLFSSL_SP_ALLOW_16BIT_CPU, and detect a 16-bit char in the SP smallest-type
   selection.
 - misc.c/misc.h: shared big-endian octet<->word helpers
   (WordsFromBytesBE32/64, BytesFromWordsBE32/64) for WOLFSSL_WIDE_BYTE, where a
   word cannot be aliased as an octet stream.  They are CHAR_BIT-generic,
   cl2000-safe (loads accumulate with <<= 8, since (word)octet << 24 is
   miscompiled as a 16-bit shift), in-place safe for the SHA schedule, and store
   by octet count for partial digests.  misc.c rotate width uses CHAR_BIT.
 - coding.c: mask the constant-time base64 result to an octet.
 - sha256.c/sha512.c: use the shared helpers for the schedule load and digest
   store, plus a CHAR_BIT*sizeof length carry; sha3.c: octet-wise Keccak squeeze.
 - random.c: Hash-DRBG length + reseed-counter serialization via the shared
   helpers (and an octet-masked carry) under WOLFSSL_WIDE_BYTE; default builds
   keep the word-aliasing path unchanged.

WOLFSSL_WIDE_BYTE replaces the earlier WOLFSSL_NO_OCTET_BYTE working name.
---
 .wolfssl_known_macro_extras  |  7 ++++
 wolfcrypt/src/coding.c       |  2 +-
 wolfcrypt/src/misc.c         | 75 +++++++++++++++++++++++++++++++++---
 wolfcrypt/src/random.c       | 56 ++++++++++++++++++++++++---
 wolfcrypt/src/sha256.c       | 54 +++++++++++++++++++++++++-
 wolfcrypt/src/sha3.c         | 49 ++++++++++++++++++++---
 wolfcrypt/src/sha512.c       | 68 ++++++++++++++++++++++++++++++--
 wolfcrypt/src/wc_port.c      |  2 +-
 wolfssl/wolfcrypt/misc.h     | 13 +++++++
 wolfssl/wolfcrypt/settings.h |  8 +++-
 wolfssl/wolfcrypt/sp_int.h   | 11 +++++-
 wolfssl/wolfcrypt/types.h    | 52 ++++++++++++++++++++++++-
 wolfssl/wolfcrypt/wc_port.h  |  9 ++++-
 13 files changed, 376 insertions(+), 30 deletions(-)

diff --git a/.wolfssl_known_macro_extras b/.wolfssl_known_macro_extras
index 101ebf2fa88..31da154be16 100644
--- a/.wolfssl_known_macro_extras
+++ b/.wolfssl_known_macro_extras
@@ -747,6 +747,7 @@ WOLFSSL_CLANG_TIDY
 WOLFSSL_CLIENT_EXAMPLE
 WOLFSSL_CONTIKI
 WOLFSSL_CRL_ALLOW_MISSING_CDP
+WOLFSSL_DILITHIUM_VERIFY_SMALLEST_MEM
 WOLFSSL_DISABLE_EARLY_SANITY_CHECKS
 WOLFSSL_DRBG_SHA256
 WOLFSSL_DTLS13_ECHO_LEGACY_SESSION_ID
@@ -1115,6 +1116,12 @@ __SUNPRO_CC
 __SVR4
 __TASKING__
 __TI_COMPILER_VERSION__
+__TMS320C2000__
+__TMS320C2800__
+__TMS320C28XX__
+__TMS320C54X__
+__TMS320C5500__
+__TMS320C55X__
 __TURBOC__
 __UNIX__
 __USE_GNU
diff --git a/wolfcrypt/src/coding.c b/wolfcrypt/src/coding.c
index b3f804fcf02..8d91b2674b8 100644
--- a/wolfcrypt/src/coding.c
+++ b/wolfcrypt/src/coding.c
@@ -72,7 +72,7 @@ static WC_INLINE byte Base64_Char2Val_CT(byte c)
     v |= ((slashStart >> 8) ^ (slashEnd >> 8)) & (slashStart + 63 + 1);
     v |= ((plusStart  >> 8) ^ (plusEnd  >> 8)) & (plusStart  + 62 + 1);
 
-    return (byte)(v - 1);
+    return WC_OCTET(v - 1);
 }
 
 #ifndef BASE64_NO_TABLE
diff --git a/wolfcrypt/src/misc.c b/wolfcrypt/src/misc.c
index 669794702dc..98636d2c5d4 100644
--- a/wolfcrypt/src/misc.c
+++ b/wolfcrypt/src/misc.c
@@ -108,13 +108,13 @@ masking and clearing memory logic.
 
     WC_MISC_STATIC WC_INLINE word32 rotlFixed(word32 x, word32 y)
     {
-        return (x << y) | (x >> (sizeof(x) * 8 - y));
+        return (x << y) | (x >> (sizeof(x) * CHAR_BIT - y));
     }
 
 /* This routine performs a right circular arithmetic shift of <x> by <y> value. */
     WC_MISC_STATIC WC_INLINE word32 rotrFixed(word32 x, word32 y)
     {
-        return (x >> y) | (x << (sizeof(x) * 8 - y));
+        return (x >> y) | (x << (sizeof(x) * CHAR_BIT - y));
     }
 
 #endif
@@ -122,14 +122,14 @@ masking and clearing memory logic.
 /* This routine performs a left circular arithmetic shift of <x> by <y> value */
 WC_MISC_STATIC WC_INLINE word16 rotlFixed16(word16 x, word16 y)
 {
-    return (word16)((x << y) | (x >> (sizeof(x) * 8U - y)));
+    return (word16)((x << y) | (x >> (sizeof(x) * CHAR_BIT - y)));
 }
 
 
 /* This routine performs a right circular arithmetic shift of <x> by <y> value */
 WC_MISC_STATIC WC_INLINE word16 rotrFixed16(word16 x, word16 y)
 {
-    return (word16)((x >> y) | (x << (sizeof(x) * 8U - y)));
+    return (word16)((x >> y) | (x << (sizeof(x) * CHAR_BIT - y)));
 }
 
 /* This routine performs a byte swap of 32-bit word value. */
@@ -329,13 +329,13 @@ WC_MISC_STATIC WC_INLINE void writeUnalignedWords64(byte *out, const word64 *in,
 
 WC_MISC_STATIC WC_INLINE word64 rotlFixed64(word64 x, word64 y)
 {
-    return (x << y) | (x >> (sizeof(y) * 8 - y));
+    return (x << y) | (x >> (sizeof(y) * CHAR_BIT - y));
 }
 
 
 WC_MISC_STATIC WC_INLINE word64 rotrFixed64(word64 x, word64 y)
 {
-    return (x >> y) | (x << (sizeof(y) * 8 - y));
+    return (x >> y) | (x << (sizeof(y) * CHAR_BIT - y));
 }
 
 
@@ -410,6 +410,69 @@ WC_MISC_STATIC WC_INLINE void ByteReverseWords64(word64* out, const word64* in,
 
 #endif /* WORD64_AVAILABLE && !WOLFSSL_NO_WORD64_OPS */
 
+#ifdef WOLFSSL_WIDE_BYTE
+/* Big-endian octet <-> word conversion for WOLFSSL_WIDE_BYTE targets, i.e.
+ * CHAR_BIT != 8 (e.g. the TI C2000 C28x).  There a word holds several octets
+ * per addressable cell, so it cannot be aliased as an octet stream (XMEMCPY /
+ * ByteReverseWords / (byte) casts all assume one octet per cell).  Shared by
+ * SHA-2 (schedule load + digest store) and the Hash-DRBG.
+ *
+ * WordsFromBytesBE32/64(w, b, wordCnt): build wordCnt words from 4 (or 8)
+ * cells of b each, using only the low 8 bits of every cell.  Loads accumulate
+ * with <<= 8 so each shift is unambiguously full-word width: a single
+ * (word32)octet << 24 is miscompiled as a 16-bit shift by the TI cl2000 (the
+ * top octets are dropped).  All octets of a word are read before the word is
+ * written, so a load is safe in place over the destination word buffer.
+ * BytesFromWordsBE32/64(b, w, byteCnt): store byteCnt octets, one per cell; an
+ * octet (not word) count, so a partial trailing word works (SHA-512/224). */
+WC_MISC_STATIC WC_INLINE void WordsFromBytesBE32(word32* w, const byte* b,
+    word32 wordCnt)
+{
+    word32 i;
+    word32 r;
+    for (i = 0; i < wordCnt; i++) {
+        r  = (word32)(b[(i * 4) + 0] & 0xFF); r <<= 8;
+        r |= (word32)(b[(i * 4) + 1] & 0xFF); r <<= 8;
+        r |= (word32)(b[(i * 4) + 2] & 0xFF); r <<= 8;
+        r |= (word32)(b[(i * 4) + 3] & 0xFF);
+        w[i] = r;
+    }
+}
+WC_MISC_STATIC WC_INLINE void BytesFromWordsBE32(byte* b, const word32* w,
+    word32 byteCnt)
+{
+    word32 i;
+    for (i = 0; i < byteCnt; i++) {
+        b[i] = (byte)((w[i >> 2] >> (24 - ((i & 0x3) * 8))) & 0xFF);
+    }
+}
+#ifdef WORD64_AVAILABLE
+WC_MISC_STATIC WC_INLINE void WordsFromBytesBE64(word64* w, const byte* b,
+    word32 wordCnt)
+{
+    word32 i;
+    int    j;
+    word64 r;
+    for (i = 0; i < wordCnt; i++) {
+        r = 0;
+        for (j = 0; j < 8; j++) {
+            r <<= 8;
+            r |= (word64)(b[(i * 8) + j] & 0xFF);
+        }
+        w[i] = r;
+    }
+}
+WC_MISC_STATIC WC_INLINE void BytesFromWordsBE64(byte* b, const word64* w,
+    word32 byteCnt)
+{
+    word32 i;
+    for (i = 0; i < byteCnt; i++) {
+        b[i] = (byte)((w[i >> 3] >> (56 - ((i & 0x7) * 8))) & 0xFF);
+    }
+}
+#endif /* WORD64_AVAILABLE */
+#endif /* WOLFSSL_WIDE_BYTE */
+
 #ifndef WOLFSSL_NO_XOR_OPS
 
 /* Leave no doubt that WOLFSSL_WORD_SIZE is a power of 2. */
diff --git a/wolfcrypt/src/random.c b/wolfcrypt/src/random.c
index a75d3400e70..0a720986086 100644
--- a/wolfcrypt/src/random.c
+++ b/wolfcrypt/src/random.c
@@ -464,7 +464,10 @@ static int Hash_df(DRBG_internal* drbg, byte* out, word32 outSz, byte type,
     byte ctr;
     word32 i;
     word32 len;
-    word32 bits = (outSz * 8); /* reverse byte order */
+    word32 bits = (outSz * 8);
+#ifdef WOLFSSL_WIDE_BYTE
+    byte bitsBuf[4]; /* the 32-bit length as four big-endian octets */
+#endif
 #ifdef WOLFSSL_SMALL_STACK_CACHE
     wc_Sha256* sha = &drbg->sha256;
 #else
@@ -489,7 +492,11 @@ static int Hash_df(DRBG_internal* drbg, byte* out, word32 outSz, byte type,
         return DRBG_FAILURE;
 #endif
 
-#ifdef LITTLE_ENDIAN_ORDER
+#ifdef WOLFSSL_WIDE_BYTE
+    /* A word32 cannot be aliased as an octet stream where a C byte is wider
+     * than 8 bits; emit the length as four big-endian octets (shared helper). */
+    BytesFromWordsBE32(bitsBuf, &bits, 4);
+#elif defined(LITTLE_ENDIAN_ORDER)
     bits = ByteReverseWord32(bits);
 #endif
     len = (outSz / OUTPUT_BLOCK_LEN)
@@ -509,7 +516,11 @@ static int Hash_df(DRBG_internal* drbg, byte* out, word32 outSz, byte type,
         ret = wc_Sha256Update(sha, &ctr, sizeof(ctr));
         if (ret == 0) {
             ctr++;
+#ifdef WOLFSSL_WIDE_BYTE
+            ret = wc_Sha256Update(sha, bitsBuf, sizeof(bitsBuf));
+#else
             ret = wc_Sha256Update(sha, (byte*)&bits, sizeof(bits));
+#endif
         }
 
         if (ret == 0) {
@@ -666,7 +677,7 @@ static WC_INLINE void array_add_one(byte* data, word32 dataSz)
 {
     int i;
     for (i = (int)dataSz - 1; i >= 0; i--) {
-        data[i]++;
+        data[i] = WC_OCTET(data[i] + 1);
         if (data[i] != 0) break;
     }
 }
@@ -789,14 +800,14 @@ static WC_INLINE void array_add(byte* d, word32 dLen, const byte* s, word32 sLen
         dIdx = (int)dLen - 1;
         for (sIdx = (int)sLen - 1; sIdx >= 0; sIdx--) {
             carry = (word16)(carry + d[dIdx] + s[sIdx]);
-            d[dIdx] = (byte)carry;
+            d[dIdx] = WC_OCTET(carry);
             carry >>= 8;
             dIdx--;
         }
 
         for (; dIdx >= 0; dIdx--) {
             carry = (word16)(carry + d[dIdx]);
-            d[dIdx] = (byte)carry;
+            d[dIdx] = WC_OCTET(carry);
             carry >>= 8;
         }
     }
@@ -820,6 +831,9 @@ static int Hash_DRBG_Generate(DRBG_internal* drbg, byte* out, word32 outSz,
 #else
     word32 reseedCtr;
 #endif
+#ifdef WOLFSSL_WIDE_BYTE
+    byte ctrBuf[8]; /* reseed counter as big-endian octets */
+#endif
 
     if (drbg == NULL) {
         return DRBG_FAILURE;
@@ -914,6 +928,17 @@ static int Hash_DRBG_Generate(DRBG_internal* drbg, byte* out, word32 outSz,
             if (ret == 0) {
                 array_add(drbg->V, sizeof(drbg->V), digest, WC_SHA256_DIGEST_SIZE);
                 array_add(drbg->V, sizeof(drbg->V), drbg->C, sizeof(drbg->C));
+#ifdef WOLFSSL_WIDE_BYTE
+                /* A word cannot be aliased as an octet stream where a C byte is
+                 * wider than 8 bits; add the counter as big-endian octets. */
+            #ifdef WORD64_AVAILABLE
+                BytesFromWordsBE64(ctrBuf, &reseedCtr, 8);
+                array_add(drbg->V, sizeof(drbg->V), ctrBuf, 8);
+            #else
+                BytesFromWordsBE32(ctrBuf, &reseedCtr, 4);
+                array_add(drbg->V, sizeof(drbg->V), ctrBuf, 4);
+            #endif
+#else
             #ifdef LITTLE_ENDIAN_ORDER
                 #ifdef WORD64_AVAILABLE
                 reseedCtr = ByteReverseWord64(reseedCtr);
@@ -923,6 +948,7 @@ static int Hash_DRBG_Generate(DRBG_internal* drbg, byte* out, word32 outSz,
             #endif
                 array_add(drbg->V, sizeof(drbg->V),
                                           (byte*)&reseedCtr, sizeof(reseedCtr));
+#endif
                 ret = DRBG_SUCCESS;
             }
             drbg->reseedCtr++;
@@ -1050,6 +1076,9 @@ static int Hash512_df(DRBG_SHA512_internal* drbg, byte* out, word32 outSz,
     word32 i;
     word32 len;
     word32 bits = (outSz * 8);
+#ifdef WOLFSSL_WIDE_BYTE
+    byte bitsBuf[4]; /* the 32-bit length as four big-endian octets */
+#endif
 #ifdef WOLFSSL_SMALL_STACK_CACHE
     wc_Sha512* sha = &drbg->sha512;
 #else
@@ -1082,7 +1111,9 @@ static int Hash512_df(DRBG_SHA512_internal* drbg, byte* out, word32 outSz,
         return DRBG_FAILURE;
 #endif
 
-#ifdef LITTLE_ENDIAN_ORDER
+#ifdef WOLFSSL_WIDE_BYTE
+    BytesFromWordsBE32(bitsBuf, &bits, 4);   /* see Hash_df */
+#elif defined(LITTLE_ENDIAN_ORDER)
     bits = ByteReverseWord32(bits);
 #endif
     len = (outSz / OUTPUT_BLOCK_LEN_SHA512)
@@ -1102,7 +1133,11 @@ static int Hash512_df(DRBG_SHA512_internal* drbg, byte* out, word32 outSz,
         ret = wc_Sha512Update(sha, &ctr, sizeof(ctr));
         if (ret == 0) {
             ctr++;
+#ifdef WOLFSSL_WIDE_BYTE
+            ret = wc_Sha512Update(sha, bitsBuf, sizeof(bitsBuf));
+#else
             ret = wc_Sha512Update(sha, (byte*)&bits, sizeof(bits));
+#endif
         }
 
         if (ret == 0) {
@@ -1300,6 +1335,9 @@ static int Hash512_DRBG_Generate(DRBG_SHA512_internal* drbg, byte* out,
 #endif
     byte type;
     word64 reseedCtr;
+#ifdef WOLFSSL_WIDE_BYTE
+    byte ctrBuf[8]; /* reseed counter as big-endian octets */
+#endif
 
     if (drbg == NULL) {
         return DRBG_FAILURE;
@@ -1377,11 +1415,17 @@ static int Hash512_DRBG_Generate(DRBG_SHA512_internal* drbg, byte* out,
                 array_add(drbg->V, sizeof(drbg->V), digest,
                           WC_SHA512_DIGEST_SIZE);
                 array_add(drbg->V, sizeof(drbg->V), drbg->C, sizeof(drbg->C));
+#ifdef WOLFSSL_WIDE_BYTE
+                /* See Hash_DRBG_Generate. */
+                BytesFromWordsBE64(ctrBuf, &reseedCtr, 8);
+                array_add(drbg->V, sizeof(drbg->V), ctrBuf, 8);
+#else
             #ifdef LITTLE_ENDIAN_ORDER
                 reseedCtr = ByteReverseWord64(reseedCtr);
             #endif
                 array_add(drbg->V, sizeof(drbg->V),
                                           (byte*)&reseedCtr, sizeof(reseedCtr));
+#endif
                 ret = DRBG_SUCCESS;
             }
             drbg->reseedCtr++;
diff --git a/wolfcrypt/src/sha256.c b/wolfcrypt/src/sha256.c
index 047c57dade8..992c178b84d 100644
--- a/wolfcrypt/src/sha256.c
+++ b/wolfcrypt/src/sha256.c
@@ -248,6 +248,9 @@ on the specific device platform.
     #define SHA256_UPDATE_REV_BYTES(ctx)    SHA256_REV_BYTES(ctx)
 #endif
 
+/* WOLFSSL_WIDE_BYTE octet<->word32 conversion uses the shared big-endian
+ * helpers WordsFromBytesBE32 / BytesFromWordsBE32 from misc.c. */
+
 
 #if !defined(WOLFSSL_PIC32MZ_HASH) && !defined(STM32_HASH_SHA2) && \
     (!defined(WOLFSSL_IMX6_CAAM) || defined(NO_IMX6_CAAM_HASH) || \
@@ -1499,10 +1502,16 @@ static WC_INLINE int Transform_Sha256_Len(wc_Sha256* sha256, const byte* data,
                 }
             #endif
 
+        #ifdef WOLFSSL_WIDE_BYTE
+            /* CHAR_BIT != 8: load 16 big-endian schedule words octet-wise. */
+            WordsFromBytesBE32(sha256->buffer, (const byte*)sha256->buffer,
+                WC_SHA256_BLOCK_SIZE / 4);
+        #else
             if (SHA256_UPDATE_REV_BYTES(&sha256->ctx)) {
                 ByteReverseWords(sha256->buffer, sha256->buffer,
                     WC_SHA256_BLOCK_SIZE);
             }
+        #endif
 
             #if defined(WOLFSSL_USE_ESP32_CRYPT_HASH_HW) && \
                !defined(NO_WOLFSSL_ESP32_CRYPT_HASH_SHA256)
@@ -1603,9 +1612,14 @@ static WC_INLINE int Transform_Sha256_Len(wc_Sha256* sha256, const byte* data,
                 }
             #endif
 
+        #ifdef WOLFSSL_WIDE_BYTE
+            WordsFromBytesBE32(local32, (const byte*)local32,
+                WC_SHA256_BLOCK_SIZE / 4);
+        #else
             if (SHA256_UPDATE_REV_BYTES(&sha256->ctx)) {
                 ByteReverseWords(local32, local32, WC_SHA256_BLOCK_SIZE);
             }
+        #endif
 
             #if defined(WOLFSSL_USE_ESP32_CRYPT_HASH_HW) && \
                !defined(NO_WOLFSSL_ESP32_CRYPT_HASH_SHA256)
@@ -1705,10 +1719,15 @@ static WC_INLINE int Transform_Sha256_Len(wc_Sha256* sha256, const byte* data,
             }
         #endif
 
+    #ifdef WOLFSSL_WIDE_BYTE
+        WordsFromBytesBE32(sha256->buffer, (const byte*)sha256->buffer,
+            WC_SHA256_BLOCK_SIZE / 4);
+    #else
         if (SHA256_UPDATE_REV_BYTES(&sha256->ctx)) {
             ByteReverseWords(sha256->buffer, sha256->buffer,
                 WC_SHA256_BLOCK_SIZE);
         }
+    #endif
 
         #if defined(WOLFSSL_USE_ESP32_CRYPT_HASH_HW) && \
            !defined(NO_WOLFSSL_ESP32_CRYPT_HASH_SHA256)
@@ -1733,7 +1752,8 @@ static WC_INLINE int Transform_Sha256_Len(wc_Sha256* sha256, const byte* data,
             WC_SHA256_PAD_SIZE - sha256->buffLen);
 
         /* put 64 bit length in separate 32 bit parts */
-        sha256->hiLen = (sha256->loLen >> (8 * sizeof(sha256->loLen) - 3)) +
+        sha256->hiLen = (sha256->loLen >>
+                            (CHAR_BIT * sizeof(sha256->loLen) - 3)) +
                                                          (sha256->hiLen << 3);
         sha256->loLen = sha256->loLen << 3;
 
@@ -1745,6 +1765,21 @@ static WC_INLINE int Transform_Sha256_Len(wc_Sha256* sha256, const byte* data,
     #endif
 
         /* store lengths */
+#ifdef WOLFSSL_WIDE_BYTE
+        /* CHAR_BIT != 8: 'local' indexes octet cells, not the word32 layout, so
+         * place the 64-bit bit-length as 8 big-endian octets, then load all 16
+         * big-endian schedule words octet-wise (covers W[14]=hiLen,W[15]=loLen). */
+        local[WC_SHA256_PAD_SIZE + 0] = (byte)((sha256->hiLen >> 24) & 0xFF);
+        local[WC_SHA256_PAD_SIZE + 1] = (byte)((sha256->hiLen >> 16) & 0xFF);
+        local[WC_SHA256_PAD_SIZE + 2] = (byte)((sha256->hiLen >>  8) & 0xFF);
+        local[WC_SHA256_PAD_SIZE + 3] = (byte)((sha256->hiLen      ) & 0xFF);
+        local[WC_SHA256_PAD_SIZE + 4] = (byte)((sha256->loLen >> 24) & 0xFF);
+        local[WC_SHA256_PAD_SIZE + 5] = (byte)((sha256->loLen >> 16) & 0xFF);
+        local[WC_SHA256_PAD_SIZE + 6] = (byte)((sha256->loLen >>  8) & 0xFF);
+        local[WC_SHA256_PAD_SIZE + 7] = (byte)((sha256->loLen      ) & 0xFF);
+        WordsFromBytesBE32(sha256->buffer, (const byte*)sha256->buffer,
+            WC_SHA256_BLOCK_SIZE / 4);
+#else
         if (SHA256_UPDATE_REV_BYTES(&sha256->ctx)) {
             ByteReverseWords(sha256->buffer, sha256->buffer,
                 WC_SHA256_PAD_SIZE);
@@ -1753,6 +1788,7 @@ static WC_INLINE int Transform_Sha256_Len(wc_Sha256* sha256, const byte* data,
         XMEMCPY(&local[WC_SHA256_PAD_SIZE], &sha256->hiLen, sizeof(word32));
         XMEMCPY(&local[WC_SHA256_PAD_SIZE + sizeof(word32)], &sha256->loLen,
                 sizeof(word32));
+#endif
 
     /* Only the ESP32-C3 with HW enabled may need pad size byte order reversal
      * depending on HW or SW mode */
@@ -1841,7 +1877,11 @@ static WC_INLINE int Transform_Sha256_Len(wc_Sha256* sha256, const byte* data,
             return BAD_FUNC_ARG;
         }
 
-    #ifdef LITTLE_ENDIAN_ORDER
+    #if defined(WOLFSSL_WIDE_BYTE)
+        /* CHAR_BIT != 8: store digest words as big-endian octets. */
+        BytesFromWordsBE32(hash, sha256->digest, WC_SHA256_DIGEST_SIZE);
+        (void)digest;
+    #elif defined(LITTLE_ENDIAN_ORDER)
         if (SHA256_REV_BYTES(&sha256->ctx)) {
             ByteReverseWords((word32*)digest, (word32*)sha256->digest,
                               WC_SHA256_DIGEST_SIZE);
@@ -1889,6 +1929,10 @@ static WC_INLINE int Transform_Sha256_Len(wc_Sha256* sha256, const byte* data,
             return ret;
         }
 
+    #if defined(WOLFSSL_WIDE_BYTE)
+        /* CHAR_BIT != 8: store digest words as big-endian octets. */
+        BytesFromWordsBE32(hash, sha256->digest, WC_SHA256_DIGEST_SIZE);
+    #else
     #if defined(LITTLE_ENDIAN_ORDER)
         if (SHA256_REV_BYTES(&sha256->ctx)) {
             ByteReverseWords(sha256->digest, sha256->digest,
@@ -1896,6 +1940,7 @@ static WC_INLINE int Transform_Sha256_Len(wc_Sha256* sha256, const byte* data,
         }
     #endif
         XMEMCPY(hash, sha256->digest, WC_SHA256_DIGEST_SIZE);
+    #endif
 
         return InitSha256(sha256);  /* reset state */
     }
@@ -2417,6 +2462,10 @@ static WC_INLINE int Transform_Sha256_Len(wc_Sha256* sha256, const byte* data,
         if (ret != 0)
             return ret;
 
+    #if defined(WOLFSSL_WIDE_BYTE)
+        /* CHAR_BIT != 8: store digest words as big-endian octets. */
+        BytesFromWordsBE32(hash, sha224->digest, WC_SHA224_DIGEST_SIZE);
+    #else
     #if defined(LITTLE_ENDIAN_ORDER)
         if (SHA256_REV_BYTES(&sha224->ctx)) {
             ByteReverseWords(sha224->digest,
@@ -2425,6 +2474,7 @@ static WC_INLINE int Transform_Sha256_Len(wc_Sha256* sha256, const byte* data,
         }
     #endif
         XMEMCPY(hash, sha224->digest, WC_SHA224_DIGEST_SIZE);
+    #endif
 
         return InitSha224(sha224);  /* reset state */
     }
diff --git a/wolfcrypt/src/sha3.c b/wolfcrypt/src/sha3.c
index 33d06928834..a87a1170b9c 100644
--- a/wolfcrypt/src/sha3.c
+++ b/wolfcrypt/src/sha3.c
@@ -94,6 +94,15 @@
     #define WC_SHA3_SW_KECCAK
 #endif
 
+/* On targets where a C 'byte' is not 8 bits (e.g. TI C28x has CHAR_BIT == 16)
+ * the word64 Keccak state cannot be aliased as a contiguous octet stream, so
+ * the little-endian fast paths that xorbuf / memcpy / cast the state as bytes
+ * are wrong.  WC_SHA3_BYTEWISE forces the octet-wise absorb (Load64), pad, and
+ * squeeze paths instead. */
+#if !defined(WC_SHA3_BYTEWISE) && defined(WOLFSSL_WIDE_BYTE)
+    #define WC_SHA3_BYTEWISE
+#endif
+
 #if FIPS_VERSION3_GE(6,0,0)
     const unsigned int wolfCrypt_FIPS_sha3_ro_sanity[2] =
                                                      { 0x1a2b3c4d, 0x00000016 };
@@ -604,7 +613,7 @@ void BlockSha3(word64* s)
 #endif /* !WOLFSSL_ARMASM && !WOLFSSL_RISCV_ASM */
 
 #ifdef WC_SHA3_SW_KECCAK
-#if defined(BIG_ENDIAN_ORDER)
+#if defined(BIG_ENDIAN_ORDER) || defined(WC_SHA3_BYTEWISE)
 static WC_INLINE word64 Load64Unaligned(const unsigned char *a)
 {
     return ((word64)a[0] <<  0) |
@@ -780,7 +789,8 @@ static int Sha3Update(wc_Sha3* sha3, const byte* data, word32 len, byte p)
         sha3->i = (byte)(sha3->i + i);
 
         if (sha3->i == p * 8) {
-    #if !defined(BIG_ENDIAN_ORDER) && !defined(WC_SHA3_FAULT_HARDEN)
+    #if !defined(BIG_ENDIAN_ORDER) && !defined(WC_SHA3_FAULT_HARDEN) && \
+        !defined(WC_SHA3_BYTEWISE)
             xorbuf(sha3->s, sha3->t, (word32)(p * 8));
     #else
             for (i = 0; i < p; i++) {
@@ -817,7 +827,8 @@ static int Sha3Update(wc_Sha3* sha3, const byte* data, word32 len, byte p)
     total_check += blocks * p;
 #endif
     for (; blocks > 0; blocks--) {
-#if !defined(BIG_ENDIAN_ORDER) && !defined(WC_SHA3_FAULT_HARDEN)
+#if !defined(BIG_ENDIAN_ORDER) && !defined(WC_SHA3_FAULT_HARDEN) && \
+    !defined(WC_SHA3_BYTEWISE)
         xorbuf(sha3->s, data, (word32)(p * 8));
 #else
         for (i = 0; i < p; i++) {
@@ -866,18 +877,34 @@ static int Sha3Update(wc_Sha3* sha3, const byte* data, word32 len, byte p)
  * len   Number of bytes in output.
  * returns 0 on success.
  */
+#ifdef WC_SHA3_BYTEWISE
+/* Squeeze: store len octets of the Keccak state s into out, one octet per byte
+ * cell.  Octet k is bits 8*(k%8) .. 8*(k%8)+7 of lane s[k/8], i.e. the lanes
+ * are emitted little-endian.  Used where a C 'byte' is wider than 8 bits
+ * (CHAR_BIT != 8), so the state cannot be copied out as an octet stream. */
+static void Sha3SqueezeBytes(byte* out, const word64* s, word32 len)
+{
+    word32 k;
+    for (k = 0; k < len; k++) {
+        out[k] = (byte)((s[k >> 3] >> (8 * (k & 7))) & 0xFF);
+    }
+}
+#endif
+
 static int Sha3Final(wc_Sha3* sha3, byte padChar, byte* hash, byte p, word32 l)
 {
     word32 rate = p * 8U;
     word32 j;
-#if defined(BIG_ENDIAN_ORDER) || defined(WC_SHA3_FAULT_HARDEN)
+#if defined(BIG_ENDIAN_ORDER) || defined(WC_SHA3_FAULT_HARDEN) || \
+    defined(WC_SHA3_BYTEWISE)
     word32 i;
 #endif
 #ifdef WC_SHA3_FAULT_HARDEN
     int check = 0;
 #endif
 
-#if !defined(BIG_ENDIAN_ORDER) && !defined(WC_SHA3_FAULT_HARDEN)
+#if !defined(BIG_ENDIAN_ORDER) && !defined(WC_SHA3_FAULT_HARDEN) && \
+    !defined(WC_SHA3_BYTEWISE)
     xorbuf(sha3->s, sha3->t, sha3->i);
 #ifdef WOLFSSL_HASH_FLAGS
     if ((p == WC_SHA3_256_COUNT) && (sha3->flags & WC_HASH_SHA3_KECCAK256)) {
@@ -924,6 +951,8 @@ static int Sha3Final(wc_Sha3* sha3, byte padChar, byte* hash, byte p, word32 l)
     #endif
-    #if defined(BIG_ENDIAN_ORDER)
+    #if defined(WC_SHA3_BYTEWISE) /* wide byte: never alias the lanes */
+        Sha3SqueezeBytes(hash + j, sha3->s, rate);
+    #elif defined(BIG_ENDIAN_ORDER)
         ByteReverseWords64((word64*)(hash + j), sha3->s, rate);
     #else
         XMEMCPY(hash + j, sha3->s, rate);
     #endif
@@ -936,8 +965,12 @@ static int Sha3Final(wc_Sha3* sha3, byte padChar, byte* hash, byte p, word32 l)
     #endif
-    #if defined(BIG_ENDIAN_ORDER)
+    #if defined(WC_SHA3_BYTEWISE) /* wide byte: never alias the lanes */
+        Sha3SqueezeBytes(hash + j, sha3->s, l - j);
+    #elif defined(BIG_ENDIAN_ORDER)
         ByteReverseWords64(sha3->s, sha3->s, rate);
-    #endif
         XMEMCPY(hash + j, sha3->s, l - j);
+    #else
+        XMEMCPY(hash + j, sha3->s, l - j);
+    #endif
     }
 #if defined(WOLFSSL_USE_SAVE_VECTOR_REGISTERS) && defined(USE_INTEL_SPEEDUP)
     if (SHA3_BLOCK == sha3_block_avx2) {
@@ -1916,6 +1949,8 @@ int wc_Shake128_SqueezeBlocks(wc_Shake* shake, byte* out, word32 blockCnt)
     #endif
-    #if defined(BIG_ENDIAN_ORDER)
+    #if defined(WC_SHA3_BYTEWISE) /* wide byte: never alias the lanes */
+        Sha3SqueezeBytes(out, shake->s, WC_SHA3_128_COUNT * 8);
+    #elif defined(BIG_ENDIAN_ORDER)
         ByteReverseWords64((word64*)out, shake->s, WC_SHA3_128_COUNT * 8);
     #else
         XMEMCPY(out, shake->s, WC_SHA3_128_COUNT * 8);
     #endif
@@ -2157,6 +2192,8 @@ int wc_Shake256_SqueezeBlocks(wc_Shake* shake, byte* out, word32 blockCnt)
     #endif
-    #if defined(BIG_ENDIAN_ORDER)
+    #if defined(WC_SHA3_BYTEWISE) /* wide byte: never alias the lanes */
+        Sha3SqueezeBytes(out, shake->s, WC_SHA3_256_COUNT * 8);
+    #elif defined(BIG_ENDIAN_ORDER)
         ByteReverseWords64((word64*)out, shake->s, WC_SHA3_256_COUNT * 8);
     #else
         XMEMCPY(out, shake->s, WC_SHA3_256_COUNT * 8);
     #endif
diff --git a/wolfcrypt/src/sha512.c b/wolfcrypt/src/sha512.c
index b2f57b13b86..7312fd8eca9 100644
--- a/wolfcrypt/src/sha512.c
+++ b/wolfcrypt/src/sha512.c
@@ -1751,6 +1751,9 @@ static int _Transform_Sha512(wc_Sha512* sha512)
 #endif
 
 
+/* WOLFSSL_WIDE_BYTE octet<->word64 conversion uses the shared big-endian
+ * helpers WordsFromBytesBE64 / BytesFromWordsBE64 from misc.c. */
+
 static WC_INLINE void AddLength(wc_Sha512* sha512, word32 len)
 {
     word64 tmp = sha512->loLen;
@@ -1798,8 +1801,13 @@ static WC_INLINE int Sha512Update(wc_Sha512* sha512, const byte* data, word32 le
               defined(NO_WOLFSSL_ESP32_CRYPT_HASH) || \
               defined(NO_WOLFSSL_ESP32_CRYPT_HASH_SHA512)) && \
              !defined(WOLFSSL_ARMASM)
+            #ifdef WOLFSSL_WIDE_BYTE
+                WordsFromBytesBE64(sha512->buffer,
+                    (const byte*)sha512->buffer, WC_SHA512_BLOCK_SIZE / 8);
+            #else
                 ByteReverseWords64(sha512->buffer, sha512->buffer,
                                                          WC_SHA512_BLOCK_SIZE);
+            #endif
         #endif
             }
     #endif
@@ -1900,8 +1908,13 @@ static WC_INLINE int Sha512Update(wc_Sha512* sha512, const byte* data, word32 le
     #if !defined(WOLFSSL_ESP32_CRYPT) || \
          defined(NO_WOLFSSL_ESP32_CRYPT_HASH) || \
          defined(NO_WOLFSSL_ESP32_CRYPT_HASH_SHA512)
+        #ifdef WOLFSSL_WIDE_BYTE
+            WordsFromBytesBE64(sha512->buffer, (const byte*)sha512->buffer,
+                WC_SHA512_BLOCK_SIZE / 8);
+        #else
             ByteReverseWords64(sha512->buffer, sha512->buffer,
                                                        WC_SHA512_BLOCK_SIZE);
+        #endif
     #endif
     #if !defined(WOLFSSL_ESP32_CRYPT) || \
          defined(NO_WOLFSSL_ESP32_CRYPT_HASH) || \
@@ -2035,8 +2048,13 @@ static WC_INLINE int Sha512Final(wc_Sha512* sha512)
               defined(NO_WOLFSSL_ESP32_CRYPT_HASH) || \
               defined(NO_WOLFSSL_ESP32_CRYPT_HASH_SHA512)) && \
              !defined(WOLFSSL_ARMASM)
+            #ifdef WOLFSSL_WIDE_BYTE
+            WordsFromBytesBE64(sha512->buffer, (const byte*)sha512->buffer,
+                WC_SHA512_BLOCK_SIZE / 8);
+            #else
             ByteReverseWords64(sha512->buffer,sha512->buffer,
                                                          WC_SHA512_BLOCK_SIZE);
+            #endif
         #endif
         }
 
@@ -2071,11 +2089,37 @@ static WC_INLINE int Sha512Final(wc_Sha512* sha512)
     XMEMSET(&local[sha512->buffLen], 0, WC_SHA512_PAD_SIZE - sha512->buffLen);
 
     /* put lengths in bits */
-    sha512->hiLen = (sha512->loLen >> (8 * sizeof(sha512->loLen) - 3)) +
+    sha512->hiLen = (sha512->loLen >>
+                        (CHAR_BIT * sizeof(sha512->loLen) - 3)) +
                                                          (sha512->hiLen << 3);
     sha512->loLen = sha512->loLen << 3;
 
     /* store lengths */
+#ifdef WOLFSSL_WIDE_BYTE
+    /* CHAR_BIT != 8: 'local' indexes octet cells, not the word64 layout (the
+     * buffer[BLOCK/sizeof(word64) - 2/-1] indices assume 16 words per block,
+     * which is wrong when sizeof(word64) != 8).  Place the 128-bit bit-length
+     * as 16 big-endian octets at the pad offset, then load all 16 big-endian
+     * schedule words octet-wise. */
+    local[WC_SHA512_PAD_SIZE +  0] = (byte)((sha512->hiLen >> 56) & 0xFF);
+    local[WC_SHA512_PAD_SIZE +  1] = (byte)((sha512->hiLen >> 48) & 0xFF);
+    local[WC_SHA512_PAD_SIZE +  2] = (byte)((sha512->hiLen >> 40) & 0xFF);
+    local[WC_SHA512_PAD_SIZE +  3] = (byte)((sha512->hiLen >> 32) & 0xFF);
+    local[WC_SHA512_PAD_SIZE +  4] = (byte)((sha512->hiLen >> 24) & 0xFF);
+    local[WC_SHA512_PAD_SIZE +  5] = (byte)((sha512->hiLen >> 16) & 0xFF);
+    local[WC_SHA512_PAD_SIZE +  6] = (byte)((sha512->hiLen >>  8) & 0xFF);
+    local[WC_SHA512_PAD_SIZE +  7] = (byte)((sha512->hiLen      ) & 0xFF);
+    local[WC_SHA512_PAD_SIZE +  8] = (byte)((sha512->loLen >> 56) & 0xFF);
+    local[WC_SHA512_PAD_SIZE +  9] = (byte)((sha512->loLen >> 48) & 0xFF);
+    local[WC_SHA512_PAD_SIZE + 10] = (byte)((sha512->loLen >> 40) & 0xFF);
+    local[WC_SHA512_PAD_SIZE + 11] = (byte)((sha512->loLen >> 32) & 0xFF);
+    local[WC_SHA512_PAD_SIZE + 12] = (byte)((sha512->loLen >> 24) & 0xFF);
+    local[WC_SHA512_PAD_SIZE + 13] = (byte)((sha512->loLen >> 16) & 0xFF);
+    local[WC_SHA512_PAD_SIZE + 14] = (byte)((sha512->loLen >>  8) & 0xFF);
+    local[WC_SHA512_PAD_SIZE + 15] = (byte)((sha512->loLen      ) & 0xFF);
+    WordsFromBytesBE64(sha512->buffer, (const byte*)sha512->buffer,
+        WC_SHA512_BLOCK_SIZE / 8);
+#else
 #if defined(LITTLE_ENDIAN_ORDER)
     #if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) && \
         (defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2))
@@ -2100,6 +2144,7 @@ static WC_INLINE int Sha512Final(wc_Sha512* sha512)
     sha512->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 2] = sha512->hiLen;
     sha512->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 1] = sha512->loLen;
 #endif
+#endif /* WOLFSSL_WIDE_BYTE */
 
 #if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) && \
     (defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2))
@@ -2151,7 +2196,10 @@ static WC_INLINE int Sha512Final(wc_Sha512* sha512)
         return ret;
 #endif
 
-    #ifdef LITTLE_ENDIAN_ORDER
+    /* CHAR_BIT != 8: leave digest in host word order here; the *Final output
+     * functions convert it to big-endian octets via BytesFromWordsBE64
+     * (a word64 array cannot be aliased as octets / reversed in place). */
+    #if defined(LITTLE_ENDIAN_ORDER) && !defined(WOLFSSL_WIDE_BYTE)
         ByteReverseWords64(sha512->digest, sha512->digest,
             WC_SHA512_DIGEST_SIZE);
     #endif
@@ -2185,7 +2233,9 @@ static int Sha512FinalRaw(wc_Sha512* sha512, byte* hash, word32 digestSz)
         return BAD_FUNC_ARG;
     }
 
-#ifdef LITTLE_ENDIAN_ORDER
+#if defined(WOLFSSL_WIDE_BYTE)
+    BytesFromWordsBE64(hash, sha512->digest, digestSz);
+#elif defined(LITTLE_ENDIAN_ORDER)
     if ((digestSz & 0x7) == 0)
         ByteReverseWords64((word64 *)hash, sha512->digest, digestSz);
     else {
@@ -2238,7 +2288,11 @@ static int Sha512_Family_Final(wc_Sha512* sha512, byte* hash, size_t digestSz,
     if (ret != 0)
         return ret;
 
+#ifdef WOLFSSL_WIDE_BYTE
+    BytesFromWordsBE64(hash, sha512->digest, (word32)digestSz);
+#else
     XMEMCPY(hash, sha512->digest, digestSz);
+#endif
 
     /* initialize Sha512 structure for the next use */
     return initfp(sha512);
@@ -2636,7 +2690,9 @@ int wc_Sha384FinalRaw(wc_Sha384* sha384, byte* hash)
         return BAD_FUNC_ARG;
     }
 
-#ifdef LITTLE_ENDIAN_ORDER
+#if defined(WOLFSSL_WIDE_BYTE)
+    BytesFromWordsBE64(hash, sha384->digest, WC_SHA384_DIGEST_SIZE);
+#elif defined(LITTLE_ENDIAN_ORDER)
     ByteReverseWords64((word64 *)hash, sha384->digest, WC_SHA384_DIGEST_SIZE);
 #else
     XMEMCPY(hash, sha384->digest, WC_SHA384_DIGEST_SIZE);
@@ -2677,7 +2733,11 @@ int wc_Sha384Final(wc_Sha384* sha384, byte* hash)
     if (ret != 0)
         return ret;
 
+#ifdef WOLFSSL_WIDE_BYTE
+    BytesFromWordsBE64(hash, sha384->digest, WC_SHA384_DIGEST_SIZE);
+#else
     XMEMCPY(hash, sha384->digest, WC_SHA384_DIGEST_SIZE);
+#endif
 
     return InitSha384(sha384);  /* reset state */
 }
diff --git a/wolfcrypt/src/wc_port.c b/wolfcrypt/src/wc_port.c
index 63c1bb57f54..efec4b8a362 100644
--- a/wolfcrypt/src/wc_port.c
+++ b/wolfcrypt/src/wc_port.c
@@ -254,7 +254,7 @@ int wc_local_InitUp(wc_init_state_t *s)
      */
     for (;;) {
         wc_static_assert(WC_INIT_STATE_STATE_BITS + WC_INIT_STATE_COUNT_BITS ==
-                         sizeof(WC_ATOMIC_UINT_ARG) * 8);
+                         sizeof(WC_ATOMIC_UINT_ARG) * CHAR_BIT);
         if (exp_wc_init_state.c.count ==
             (((WC_ATOMIC_UINT_ARG)1 << WC_INIT_STATE_COUNT_BITS)
              - (WC_ATOMIC_UINT_ARG)1))
diff --git a/wolfssl/wolfcrypt/misc.h b/wolfssl/wolfcrypt/misc.h
index 0a20345287b..a46c375c579 100644
--- a/wolfssl/wolfcrypt/misc.h
+++ b/wolfssl/wolfcrypt/misc.h
@@ -56,6 +56,13 @@ word32 ByteReverseWord32(word32 value);
 WOLFSSL_LOCAL
 void   ByteReverseWords(word32* out, const word32* in, word32 byteCount);
 
+#ifdef WOLFSSL_WIDE_BYTE
+WOLFSSL_LOCAL
+void WordsFromBytesBE32(word32* w, const byte* b, word32 wordCnt);
+WOLFSSL_LOCAL
+void BytesFromWordsBE32(byte* b, const word32* w, word32 byteCnt);
+#endif
+
 WOLFSSL_LOCAL
 void XorWordsOut(wolfssl_word** r, const wolfssl_word** a,
         const wolfssl_word** b, word32 n);
@@ -99,6 +106,12 @@ WOLFSSL_LOCAL
 word64 ByteReverseWord64(word64 value);
 WOLFSSL_LOCAL
 void   ByteReverseWords64(word64* out, const word64* in, word32 byteCount);
+#ifdef WOLFSSL_WIDE_BYTE
+WOLFSSL_LOCAL
+void WordsFromBytesBE64(word64* w, const byte* b, word32 wordCnt);
+WOLFSSL_LOCAL
+void BytesFromWordsBE64(byte* b, const word64* w, word32 byteCnt);
+#endif
 #endif /* WORD64_AVAILABLE */
 
 #ifndef WOLFSSL_HAVE_MIN
diff --git a/wolfssl/wolfcrypt/settings.h b/wolfssl/wolfcrypt/settings.h
index 9f699145847..45657fabbbd 100644
--- a/wolfssl/wolfcrypt/settings.h
+++ b/wolfssl/wolfcrypt/settings.h
@@ -5157,7 +5157,13 @@ blinding by defining WC_BLINDING_NO_RNG_ACKNOWLEDGE_WEAKNESS."
            " with static memory (WOLFSSL_STATIC_MEMORY)"
 #endif
 #if defined(WC_16BIT_CPU) && \
-    (defined(WOLFSSL_SP_MATH) || defined(WOLFSSL_SP_MATH_ALL))
+    (defined(WOLFSSL_SP_MATH) || defined(WOLFSSL_SP_MATH_ALL)) && \
+    !defined(WOLFSSL_SP_ALLOW_16BIT_CPU)
+    /* SP math on a 16-bit-int CPU needs an explicitly wider digit
+     * (SP_WORD_SIZE >= 32, so sp_int_digit is at least 32-bit) and an octet
+     * byte (or the CHAR_BIT!=8 octet handling).  Targets that have set those up
+     * - e.g. TI C28x with SP_WORD_SIZE 32 - opt in with
+     * WOLFSSL_SP_ALLOW_16BIT_CPU. */
     #error "16-bit build (WC_16BIT_CPU) is not available with SP math"
 #endif
 
diff --git a/wolfssl/wolfcrypt/sp_int.h b/wolfssl/wolfcrypt/sp_int.h
index 31936d40753..435eae92e74 100644
--- a/wolfssl/wolfcrypt/sp_int.h
+++ b/wolfssl/wolfcrypt/sp_int.h
@@ -58,8 +58,17 @@ extern "C" {
 
     typedef unsigned char sp_uint7;
     typedef          char  sp_int7;
+#elif UCHAR_MAX == 65535
+    /* CHAR_BIT == 16 (e.g. TI C28x): the smallest addressable type is 16-bit,
+     * so there is no native 8-bit type.  An "8-bit" SP value is a 16-bit char
+     * cell holding an octet (0..255); byte I/O masks to an octet (the same
+     * CHAR_BIT!=8 handling used elsewhere - see WOLFSSL_WIDE_BYTE). */
+    #define SP_UCHAR_BITS    16
+
+    typedef unsigned char sp_uint8;
+    typedef          char  sp_int8;
 #else
-    #error "Size of unsigned short not detected"
+    #error "Size of unsigned char not detected"
 #endif
 
 #if USHRT_MAX == 65535
diff --git a/wolfssl/wolfcrypt/types.h b/wolfssl/wolfcrypt/types.h
index 3a1efb79556..da4e388ba9f 100644
--- a/wolfssl/wolfcrypt/types.h
+++ b/wolfssl/wolfcrypt/types.h
@@ -264,6 +264,51 @@ typedef const char wcchar[];
         #endif
 #endif
 
+    /* Targets where a C 'byte' (char) is wider than 8 bits - e.g. the TI
+     * C2000 C28x DSP has CHAR_BIT == 16.  On such targets a word cannot be
+     * aliased as an octet stream (XMEMCPY/ByteReverseWords/(byte) casts assume
+     * 8-bit bytes), so byte<->word conversions must be done octet-wise with
+     * shifts.  It stays undefined - and all existing fast paths are unchanged -
+     * on the usual 8-bit-byte targets.
+     *
+     * Detection is two-pronged: first by known 16-bit-char toolchains (works
+     * even without <limits.h>), then generically from CHAR_BIT when <limits.h>
+     * is available (HAVE_LIMITS_H).  TI predefines checked: __TMS320C28XX__ /
+     * __TMS320C2000__ (cl2000, C28x), __TMS320C2800__ (cl2800), and
+     * __TMS320C5500__ / __TMS320C55X__ / __TMS320C54X__ (C55x / C54x DSPs). */
+#if !defined(WOLFSSL_WIDE_BYTE) && \
+    (defined(__TMS320C28XX__) || defined(__TMS320C2000__) || \
+     defined(__TMS320C2800__) || defined(__TMS320C5500__) || \
+     defined(__TMS320C55X__) || defined(__TMS320C54X__))
+    #define WOLFSSL_WIDE_BYTE
+    /* These TI DSP toolchains all have a 16-bit char.  Some octet-wise paths
+     * gated on WOLFSSL_WIDE_BYTE use CHAR_BIT (rotate width, SHA length
+     * carry), so provide it when <limits.h> was not included (NO_LIMITS_H /
+     * no HAVE_LIMITS_H).  These targets are unambiguously CHAR_BIT == 16. */
+    #ifndef CHAR_BIT
+        #define CHAR_BIT 16
+    #endif
+#endif
+#if !defined(WOLFSSL_WIDE_BYTE) && defined(CHAR_BIT) && (CHAR_BIT != 8)
+    #define WOLFSSL_WIDE_BYTE
+#endif
+/* Guarantee CHAR_BIT is defined so it can be used unconditionally for value-bit
+ * widths (rotate complements, SHA length carries).  When <limits.h> was
+ * included (above) or a known wide-char toolchain was detected, CHAR_BIT is
+ * already correct; otherwise the target has the usual 8-bit byte. */
+#ifndef CHAR_BIT
+    #define CHAR_BIT 8
+#endif
+
+/* Reduce a value to a single octet (its low 8 bits) stored in a byte.  On the
+ * usual 8-bit-byte targets a (byte) cast already truncates to 8 bits, so the
+ * mask is a no-op and this is byte-for-byte the historical (byte) cast; on
+ * WOLFSSL_WIDE_BYTE targets (a C byte is wider than an octet, e.g. 16-bit on
+ * the TI C28x) the mask is required so a carry or high bits do not leak into
+ * the stored octet.  Used by the byte-packers in SHA-2/3, the Hash-DRBG, base64
+ * and ML-DSA. */
+#define WC_OCTET(x)  ((byte)((x) & 0xFF))
+
 #if defined(HAVE___UINT128_T) && !defined(NO_INT128)
     #ifndef WOLFSSL_UINT128_T_DEFINED
         #ifdef __SIZEOF_INT128__
@@ -377,7 +422,12 @@ typedef const char wcchar[];
     #endif
 
 #elif defined(WC_16BIT_CPU)
-    #ifndef MICROCHIP_PIC24
+    /* WC_16BIT_CPU selects 16-bit int (word16=unsigned int, word32=unsigned
+     * long) and historically assumes no native 64-bit type.  Targets like the
+     * TI C28x do have a 64-bit long long, so keep WORD64_AVAILABLE when
+     * SIZEOF_LONG_LONG is 8 (octets; sizeof() is 4 where CHAR_BIT == 16). */
+    #if !defined(MICROCHIP_PIC24) && \
+        !(defined(SIZEOF_LONG_LONG) && (SIZEOF_LONG_LONG == 8))
         #undef WORD64_AVAILABLE
     #endif
     typedef word16 wolfssl_word;
diff --git a/wolfssl/wolfcrypt/wc_port.h b/wolfssl/wolfcrypt/wc_port.h
index 0bd5f793a23..de571b146cc 100644
--- a/wolfssl/wolfcrypt/wc_port.h
+++ b/wolfssl/wolfcrypt/wc_port.h
@@ -516,7 +516,14 @@
     typedef wolfSSL_Mutex wolfSSL_RwLock;
 #endif
 
-#ifdef WC_16BIT_CPU
+/* The init-state bit field below needs a 32-bit type.  Use a wider type when
+ * int is too narrow (16-bit int targets such as C28x, detected via UINT_MAX),
+ * not only when WC_16BIT_CPU is set. */
+#ifdef HAVE_LIMITS_H
+    #include <limits.h>
+#endif
+#if defined(WC_16BIT_CPU) || \
+    (defined(UINT_MAX) && (UINT_MAX < 0xFFFFFFFFUL))
     #define WC_ATOMIC_INT_ARG long int
     #define WC_ATOMIC_UINT_ARG long unsigned int
 #else

From 5522b65804739739e3f2c86905fd446517a13b3e Mon Sep 17 00:00:00 2001
From: David Garske <david@wolfssl.com>
Date: Thu, 18 Jun 2026 09:15:10 -0700
Subject: [PATCH 2/4] mldsa: correct ML-DSA on CHAR_BIT!=8 + add
 WOLFSSL_MLDSA_VERIFY_SMALLEST_MEM

ML-DSA-87 keygen/sign/verify on a 16-bit byte/int CPU (TI C28x), gated and a
no-op on normal targets:
 - Encode/decode integer-promotion fixes: a byte/word16 field promotes to
   *unsigned* int where int is 16-bit, so '2 - field' was unsigned and a
   negative coefficient zero-extended into sword32 (e.g. -1 -> 0x0000FFFF);
   cast the unpacked field to sword32 (eta-2/eta-4/t0 decode and the
   sample-in-ball sign).  Bit-packers relied on (byte) truncating to 8 bits;
   mask with WC_OCTET() and cast the <<MLDSA_D shift to sword32
   (eta-2/t0/t1/gamma1 encode).
 - dilithium.h: shift-based Montgomery reduction on WC_16BIT_CPU (cl2000
   miscompiles the multiply form).
 - New WOLFSSL_MLDSA_VERIFY_SMALLEST_MEM: stream the signature z vector one
   polynomial at a time instead of pinning the whole l-vector, cutting the
   ML-DSA-87 verify key by ~6 KB (with WOLFSSL_MLDSA_ASSIGN_KEY, ~10.7 KB total
   verify RAM on the C28x).
---
 wolfcrypt/src/wc_mldsa.c      | 218 ++++++++++++++++++++--------------
 wolfssl/wolfcrypt/dilithium.h |  34 ++++++
 wolfssl/wolfcrypt/wc_mldsa.h  |   4 +
 3 files changed, 169 insertions(+), 87 deletions(-)

diff --git a/wolfcrypt/src/wc_mldsa.c b/wolfcrypt/src/wc_mldsa.c
index 45e247e889b..62538dc0e88 100644
--- a/wolfcrypt/src/wc_mldsa.c
+++ b/wolfcrypt/src/wc_mldsa.c
@@ -1021,6 +1021,7 @@ static int mldsa_squeeze256(wc_Shake* shake256, const byte* in,
  * @param [in]  eta  Range specifier of each value.
  * @param [out] p    Buffer to encode into.
  */
+
 static void mldsa_vec_encode_eta_bits_c(const sword32* s, byte d, byte eta,
     byte* p)
 {
@@ -1047,9 +1048,9 @@ static void mldsa_vec_encode_eta_bits_c(const sword32* s, byte d, byte eta,
                 byte s7 = (byte)(2 - s[j + 7]);
 
                 /* Pack 8 3-bit values into 3 bytes. */
-                p[0] = (byte)((s0 >> 0) | (s1 << 3) | (s2 << 6));
-                p[1] = (byte)((s2 >> 2) | (s3 << 1) | (s4 << 4) | (s5 << 7));
-                p[2] = (byte)((s5 >> 1) | (s6 << 2) | (s7 << 5));
+                p[0] = WC_OCTET((s0 >> 0) | (s1 << 3) | (s2 << 6));
+                p[1] = WC_OCTET((s2 >> 2) | (s3 << 1) | (s4 << 4) | (s5 << 7));
+                p[2] = WC_OCTET((s5 >> 1) | (s6 << 2) | (s7 << 5));
                 /* Move to next place to encode into. */
                 p += MLDSA_ETA_2_BITS;
             }
@@ -1159,15 +1160,18 @@ static void mldsa_decode_eta_2_bits_c(const byte* p, sword32* s)
      * 3 bits to encode each number.
      * 8 numbers from 3 bytes. (8 * 3 bits = 3 * 8 bits) */
     for (j = 0; j < MLDSA_N; j += 8) {
-        /* Get 3 bits and put in range of -2..2. */
-        s[j + 0] = 2 - ((p[0] >> 0) & 0x7                      );
-        s[j + 1] = 2 - ((p[0] >> 3) & 0x7                      );
-        s[j + 2] = 2 - ((p[0] >> 6)       | ((p[1] << 2) & 0x7));
-        s[j + 3] = 2 - ((p[1] >> 1) & 0x7                      );
-        s[j + 4] = 2 - ((p[1] >> 4) & 0x7                      );
-        s[j + 5] = 2 - ((p[1] >> 7)       | ((p[2] << 1) & 0x7));
-        s[j + 6] = 2 - ((p[2] >> 2) & 0x7                      );
-        s[j + 7] = 2 - ((p[2] >> 5) & 0x7                      );
+        /* Get 3 bits and put in range of -2..2.
+         * Cast to signed 32-bit before the subtract: where int is 16-bit a
+         * byte/word16 field promotes to unsigned, so a negative result would
+         * zero-extend instead of sign-extend into sword32. */
+        s[j + 0] = 2 - (sword32)((p[0] >> 0) & 0x7                      );
+        s[j + 1] = 2 - (sword32)((p[0] >> 3) & 0x7                      );
+        s[j + 2] = 2 - (sword32)((p[0] >> 6)       | ((p[1] << 2) & 0x7));
+        s[j + 3] = 2 - (sword32)((p[1] >> 1) & 0x7                      );
+        s[j + 4] = 2 - (sword32)((p[1] >> 4) & 0x7                      );
+        s[j + 5] = 2 - (sword32)((p[1] >> 7)       | ((p[2] << 1) & 0x7));
+        s[j + 6] = 2 - (sword32)((p[2] >> 2) & 0x7                      );
+        s[j + 7] = 2 - (sword32)((p[2] >> 5) & 0x7                      );
         /* Move to next place to decode from. */
         p += MLDSA_ETA_2_BITS;
     }
@@ -1221,24 +1225,24 @@ static void mldsa_decode_eta_4_bits_c(const byte* p, sword32* s)
      * 4 bits to encode each number.
      * 2 numbers from 1 bytes. (2 * 4 bits = 1 * 8 bits) */
     for (j = 0; j < MLDSA_N / 2; j++) {
-        /* Get 4 bits and put in range of -4..4. */
-        s[j * 2 + 0] = 4 - (p[j] & 0xf);
-        s[j * 2 + 1] = 4 - (p[j] >> 4);
+        /* Get 4 bits and put in range of -4..4. (sword32 cast: see eta-2.) */
+        s[j * 2 + 0] = 4 - (sword32)(p[j] & 0xf);
+        s[j * 2 + 1] = 4 - (sword32)(p[j] >> 4);
     }
 #else
     /* Step 6 or 9.
      * 4 bits to encode each number.
      * 8 numbers from 4 bytes. (8 * 4 bits = 4 * 8 bits) */
     for (j = 0; j < MLDSA_N / 2; j += 4) {
-        /* Get 4 bits and put in range of -4..4. */
-        s[j * 2 + 0] = 4 - (p[j + 0] & 0xf);
-        s[j * 2 + 1] = 4 - (p[j + 0] >> 4);
-        s[j * 2 + 2] = 4 - (p[j + 1] & 0xf);
-        s[j * 2 + 3] = 4 - (p[j + 1] >> 4);
-        s[j * 2 + 4] = 4 - (p[j + 2] & 0xf);
-        s[j * 2 + 5] = 4 - (p[j + 2] >> 4);
-        s[j * 2 + 6] = 4 - (p[j + 3] & 0xf);
-        s[j * 2 + 7] = 4 - (p[j + 3] >> 4);
+        /* Get 4 bits and put in range of -4..4. (sword32 cast: see eta-2.) */
+        s[j * 2 + 0] = 4 - (sword32)(p[j + 0] & 0xf);
+        s[j * 2 + 1] = 4 - (sword32)(p[j + 0] >> 4);
+        s[j * 2 + 2] = 4 - (sword32)(p[j + 1] & 0xf);
+        s[j * 2 + 3] = 4 - (sword32)(p[j + 1] >> 4);
+        s[j * 2 + 4] = 4 - (sword32)(p[j + 2] & 0xf);
+        s[j * 2 + 5] = 4 - (sword32)(p[j + 2] >> 4);
+        s[j * 2 + 6] = 4 - (sword32)(p[j + 3] & 0xf);
+        s[j * 2 + 7] = 4 - (sword32)(p[j + 3] >> 4);
     }
 #endif /* WOLFSSL_MLDSA_SMALL */
 }
@@ -1378,21 +1382,21 @@ static void mldsa_vec_encode_t0_t1_c(const sword32* t, byte d, byte* t0,
                 MLDSA_D);
             /* Take 8 values of t and take bottom bits and make positive. */
             word16 n0_0 = (word16)(MLDSA_D_MAX_HALF -
-                                   (t[j + 0] - (n1_0 << MLDSA_D)));
+                                   (t[j + 0] - ((sword32)n1_0 << MLDSA_D)));
             word16 n0_1 = (word16)(MLDSA_D_MAX_HALF -
-                                   (t[j + 1] - (n1_1 << MLDSA_D)));
+                                   (t[j + 1] - ((sword32)n1_1 << MLDSA_D)));
             word16 n0_2 = (word16)(MLDSA_D_MAX_HALF -
-                                   (t[j + 2] - (n1_2 << MLDSA_D)));
+                                   (t[j + 2] - ((sword32)n1_2 << MLDSA_D)));
             word16 n0_3 = (word16)(MLDSA_D_MAX_HALF -
-                                   (t[j + 3] - (n1_3 << MLDSA_D)));
+                                   (t[j + 3] - ((sword32)n1_3 << MLDSA_D)));
             word16 n0_4 = (word16)(MLDSA_D_MAX_HALF -
-                                   (t[j + 4] - (n1_4 << MLDSA_D)));
+                                   (t[j + 4] - ((sword32)n1_4 << MLDSA_D)));
             word16 n0_5 = (word16)(MLDSA_D_MAX_HALF -
-                                   (t[j + 5] - (n1_5 << MLDSA_D)));
+                                   (t[j + 5] - ((sword32)n1_5 << MLDSA_D)));
             word16 n0_6 = (word16)(MLDSA_D_MAX_HALF -
-                                   (t[j + 6] - (n1_6 << MLDSA_D)));
+                                   (t[j + 6] - ((sword32)n1_6 << MLDSA_D)));
             word16 n0_7 = (word16)(MLDSA_D_MAX_HALF -
-                                   (t[j + 7] - (n1_7 << MLDSA_D)));
+                                   (t[j + 7] - ((sword32)n1_7 << MLDSA_D)));
 
             /* 13 bits per number.
              * 8 numbers become 13 bytes. (8 * 13 bits = 13 * 8 bits) */
@@ -1406,20 +1410,20 @@ static void mldsa_vec_encode_t0_t1_c(const sword32* t, byte d, byte* t0,
             tp[2] =  (n0_4 >> 12) | ((word32)n0_5 <<  1) |
                                     ((word32)n0_6 << 14) | ((word32)n0_7 << 27);
         #else
-            t0[ 0] = (byte)(               (n0_0 <<  0));
-            t0[ 1] = (byte)((n0_0 >>  8) | (n0_1 <<  5));
-            t0[ 2] = (byte)((n0_1 >>  3)               );
-            t0[ 3] = (byte)((n0_1 >> 11) | (n0_2 <<  2));
-            t0[ 4] = (byte)((n0_2 >>  6) | (n0_3 <<  7));
-            t0[ 5] = (byte)((n0_3 >>  1)               );
-            t0[ 6] = (byte)((n0_3 >>  9) | (n0_4 <<  4));
-            t0[ 7] = (byte)((n0_4 >>  4)               );
-            t0[ 8] = (byte)((n0_4 >> 12) | (n0_5 <<  1));
-            t0[ 9] = (byte)((n0_5 >>  7) | (n0_6 <<  6));
-            t0[10] = (byte)((n0_6 >>  2)               );
-            t0[11] = (byte)((n0_6 >> 10) | (n0_7 <<  3));
+            t0[ 0] = WC_OCTET(             (n0_0 <<  0));
+            t0[ 1] = WC_OCTET((n0_0 >>  8) | (n0_1 <<  5));
+            t0[ 2] = WC_OCTET((n0_1 >>  3)               );
+            t0[ 3] = WC_OCTET((n0_1 >> 11) | (n0_2 <<  2));
+            t0[ 4] = WC_OCTET((n0_2 >>  6) | (n0_3 <<  7));
+            t0[ 5] = WC_OCTET((n0_3 >>  1)               );
+            t0[ 6] = WC_OCTET((n0_3 >>  9) | (n0_4 <<  4));
+            t0[ 7] = WC_OCTET((n0_4 >>  4)               );
+            t0[ 8] = WC_OCTET((n0_4 >> 12) | (n0_5 <<  1));
+            t0[ 9] = WC_OCTET((n0_5 >>  7) | (n0_6 <<  6));
+            t0[10] = WC_OCTET((n0_6 >>  2)               );
+            t0[11] = WC_OCTET((n0_6 >> 10) | (n0_7 <<  3));
         #endif
-            t0[12] = (byte)((n0_7 >>  5)               );
+            t0[12] = WC_OCTET((n0_7 >>  5)               );
 
             /* 10 bits per number.
              * 8 bytes become 10 bytes. (8 * 10 bits = 10 * 8 bits) */
@@ -1430,17 +1434,17 @@ static void mldsa_vec_encode_t0_t1_c(const sword32* t, byte d, byte* t0,
             tp[1] =  (n1_3 >>  2) | ((word32)n1_4 <<  8) |
                      ((word32)n1_5 << 18) | ((word32)n1_6 << 28);
         #else
-            t1[0] = (byte)(               (n1_0 << 0));
-            t1[1] = (byte)((n1_0 >> 8) |  (n1_1 << 2));
-            t1[2] = (byte)((n1_1 >> 6) |  (n1_2 << 4));
-            t1[3] = (byte)((n1_2 >> 4) |  (n1_3 << 6));
-            t1[4] = (byte)((n1_3 >> 2)               );
-            t1[5] = (byte)(               (n1_4 << 0));
-            t1[6] = (byte)((n1_4 >> 8) |  (n1_5 << 2));
-            t1[7] = (byte)((n1_5 >> 6) |  (n1_6 << 4));
+            t1[0] = WC_OCTET(             (n1_0 << 0));
+            t1[1] = WC_OCTET((n1_0 >> 8) |  (n1_1 << 2));
+            t1[2] = WC_OCTET((n1_1 >> 6) |  (n1_2 << 4));
+            t1[3] = WC_OCTET((n1_2 >> 4) |  (n1_3 << 6));
+            t1[4] = WC_OCTET((n1_3 >> 2)               );
+            t1[5] = WC_OCTET(             (n1_4 << 0));
+            t1[6] = WC_OCTET((n1_4 >> 8) |  (n1_5 << 2));
+            t1[7] = WC_OCTET((n1_5 >> 6) |  (n1_6 << 4));
         #endif
-            t1[8] = (byte)((n1_6 >> 4) |  (n1_7 << 6));
-            t1[9] = (byte)((n1_7 >> 2)               );
+            t1[8] = WC_OCTET((n1_6 >> 4) |  (n1_7 << 6));
+            t1[9] = WC_OCTET((n1_7 >> 2)               );
 
             /* Move to next place to encode bottom bits to. */
             t0 += MLDSA_D;
@@ -1526,25 +1530,27 @@ static void mldsa_decode_t0_c(const byte* t0, sword32* t)
         t[j + 7] = MLDSA_D_MAX_HALF - (sword32)
                    (( t32_2 >> 27          ) | ((word32)t0[12] ) <<  5 );
 #else
-        t[j + 0] = MLDSA_D_MAX_HALF -
+        /* sword32 cast on the unpacked field: see eta-2 decode - the subtract
+         * must be signed/32-bit so a negative t0 sign-extends correctly. */
+        t[j + 0] = MLDSA_D_MAX_HALF - (sword32)
                    ((t0[ 0]     ) | (((word16)(t0[ 1] & 0x1f)) <<  8));
-        t[j + 1] = MLDSA_D_MAX_HALF -
+        t[j + 1] = MLDSA_D_MAX_HALF - (sword32)
                    ((t0[ 1] >> 5) | (((word16)(t0[ 2]       )) <<  3) |
                                     (((word16)(t0[ 3] & 0x03)) << 11));
-        t[j + 2] = MLDSA_D_MAX_HALF -
+        t[j + 2] = MLDSA_D_MAX_HALF - (sword32)
                    ((t0[ 3] >> 2) | (((word16)(t0[ 4] & 0x7f)) <<  6));
-        t[j + 3] = MLDSA_D_MAX_HALF -
+        t[j + 3] = MLDSA_D_MAX_HALF - (sword32)
                    ((t0[ 4] >> 7) | (((word16)(t0[ 5]       )) <<  1) |
                                     (((word16)(t0[ 6] & 0x0f)) <<  9));
-        t[j + 4] = MLDSA_D_MAX_HALF -
+        t[j + 4] = MLDSA_D_MAX_HALF - (sword32)
                    ((t0[ 6] >> 4) | (((word16)(t0[ 7]       )) <<  4) |
                                     (((word16)(t0[ 8] & 0x01)) << 12));
-        t[j + 5] = MLDSA_D_MAX_HALF -
+        t[j + 5] = MLDSA_D_MAX_HALF - (sword32)
                    ((t0[ 8] >> 1) | (((word16)(t0[ 9] & 0x3f)) <<  7));
-        t[j + 6] = MLDSA_D_MAX_HALF -
+        t[j + 6] = MLDSA_D_MAX_HALF - (sword32)
                    ((t0[ 9] >> 6) | (((word16)(t0[10]       )) <<  2) |
                                     (((word16)(t0[11] & 0x07)) << 10));
-        t[j + 7] = MLDSA_D_MAX_HALF -
+        t[j + 7] = MLDSA_D_MAX_HALF - (sword32)
                    ((t0[11] >> 3) | (((word16)(t0[12]       )) <<  5));
 #endif
         /* Move to next place to decode from. */
@@ -1771,16 +1777,16 @@ static void mldsa_encode_gamma1_17_bits_c(const sword32* z, byte* s)
         s32p[1] = (z1 >> 14) | (z2 <<  4) | (z3 << 22);
     #endif
 #else
-        s[0] = (byte)( z0                   );
-        s[1] = (byte)( z0 >>  8             );
-        s[2] = (byte)((z0 >> 16) | (z1 << 2));
-        s[3] = (byte)( z1 >>  6             );
-        s[4] = (byte)((z1 >> 14) | (z2 << 4));
-        s[5] = (byte)( z2 >>  4             );
-        s[6] = (byte)((z2 >> 12) | (z3 << 6));
-        s[7] = (byte)( z3 >>  2             );
-#endif
-        s[8] = (byte)( z3 >> 10             );
+        s[0] = WC_OCTET( z0                   );
+        s[1] = WC_OCTET( z0 >>  8             );
+        s[2] = WC_OCTET((z0 >> 16) | (z1 << 2));
+        s[3] = WC_OCTET( z1 >>  6             );
+        s[4] = WC_OCTET((z1 >> 14) | (z2 << 4));
+        s[5] = WC_OCTET( z2 >>  4             );
+        s[6] = WC_OCTET((z2 >> 12) | (z3 << 6));
+        s[7] = WC_OCTET( z3 >>  2             );
+#endif
+        s[8] = WC_OCTET( z3 >> 10             );
         /* Move to next place to encode to. */
         s += MLDSA_GAMMA1_17_ENC_BITS / 2;
     }
@@ -1842,16 +1848,16 @@ static void mldsa_encode_gamma1_19_bits_c(const sword32* z, byte* s)
     #endif
         s16p[4] = (word16)((z3 >>  4)                          );
 #else
-        s[0] = (byte)  z0                    ;
-        s[1] = (byte) (z0 >>  8)             ;
-        s[2] = (byte)((z0 >> 16) | (z1 << 4));
-        s[3] = (byte) (z1 >>  4)             ;
-        s[4] = (byte) (z1 >> 12)             ;
-        s[5] = (byte)  z2                    ;
-        s[6] = (byte) (z2 >>  8)             ;
-        s[7] = (byte)((z2 >> 16) | (z3 << 4));
-        s[8] = (byte) (z3 >>  4)             ;
-        s[9] = (byte) (z3 >> 12)             ;
+        s[0] = WC_OCTET( z0                    );
+        s[1] = WC_OCTET((z0 >>  8)             );
+        s[2] = WC_OCTET((z0 >> 16) | (z1 << 4));
+        s[3] = WC_OCTET((z1 >>  4)             );
+        s[4] = WC_OCTET((z1 >> 12)             );
+        s[5] = WC_OCTET( z2                    );
+        s[6] = WC_OCTET((z2 >>  8)             );
+        s[7] = WC_OCTET((z2 >> 16) | (z3 << 4));
+        s[8] = WC_OCTET((z3 >>  4)             );
+        s[9] = WC_OCTET((z3 >> 12)             );
 #endif
         /* Move to next place to encode to. */
         s += MLDSA_GAMMA1_19_ENC_BITS / 2;
@@ -2244,6 +2250,9 @@ static void mldsa_decode_gamma1(const byte* s, int bits, sword32* z)
  * @param [in]  bits  Number of bits used in encoding - GAMMA1 bits.
  * @param [out] z  Vector of polynomials.
  */
+#ifndef WOLFSSL_MLDSA_VERIFY_SMALLEST_MEM
+/* The smallest-mem verify streams z one polynomial at a time with
+ * mldsa_decode_gamma1() directly, so the whole-vector wrapper is unused. */
 static void mldsa_vec_decode_gamma1(const byte* x, byte l, int bits,
     sword32* z)
 {
@@ -2258,6 +2267,7 @@ static void mldsa_vec_decode_gamma1(const byte* x, byte l, int bits,
         z += MLDSA_N;
     }
 }
+#endif /* !WOLFSSL_MLDSA_VERIFY_SMALLEST_MEM */
 #endif
 
 #if !defined(WOLFSSL_MLDSA_NO_SIGN) || !defined(WOLFSSL_MLDSA_NO_VERIFY)
@@ -4624,8 +4634,12 @@ static int mldsa_sample_in_ball_ex(int level, wc_Shake* shake256,
 
         /* Step 8: Move value from random index to current index. */
         c[i] = c[j];
-        /* Step 9: Set value at random index to +/- 1. */
-        c[j] = 1 - ((((signs[s >> 3]) >> (s & 0x7)) & 0x1) << 1);
+        /* Step 9: Set value at random index to +/- 1.
+         * Cast to sword32 before the subtract: where a byte is as wide as int,
+         * signs[] promotes to unsigned and -1 would widen as 0x0000ffff.
+         * Matches the USE_INTEL_SPEEDUP path. */
+        c[j] = (sword32)1 -
+               (sword32)((((signs[s >> 3]) >> (s & 0x7)) & 0x1) << 1);
         /* Next sign bit index. */
         s++;
     }
@@ -9771,6 +9785,10 @@ static int mldsa_verify_with_mu(wc_MlDsaKey* key, const byte* mu,
     byte o;
     byte* encW1;
     byte* seed = commit_calc;
+#ifdef WOLFSSL_MLDSA_VERIFY_SMALLEST_MEM
+    /* Bytes of encoded z per polynomial - z is streamed one poly at a time. */
+    word32 zStride = (word32)(MLDSA_N / 8) * (word32)(params->gamma1_bits + 1);
+#endif
 
     /* Ensure the signature is the right size for the parameters. */
     if (sigLen != params->sigSz) {
@@ -9825,15 +9843,33 @@ static int mldsa_verify_with_mu(wc_MlDsaKey* key, const byte* mu,
 #endif
 
     if (ret == 0) {
-        /* Step 2: Decode z from signature. */
-        mldsa_vec_decode_gamma1(ze, params->l, params->gamma1_bits, z);
         /* Step 13: Check z is valid - values are low enough. */
         hi = ((sword32)1 << params->gamma1_bits) - params->beta;
+#ifdef WOLFSSL_MLDSA_VERIFY_SMALLEST_MEM
+        {
+            /* Step 2/13: Stream z one polynomial at a time for the range check;
+             * the per-poly NTT happens inside the matrix loop below. */
+            const byte*  zp = ze;
+            unsigned int zi;
+
+            valid = 1;
+            for (zi = 0; valid && (zi < params->l); zi++) {
+                mldsa_decode_gamma1(zp, params->gamma1_bits, z);
+                valid = mldsa_check_low(z, hi);
+                zp += zStride;
+            }
+        }
+#else
+        /* Step 2: Decode z from signature. */
+        mldsa_vec_decode_gamma1(ze, params->l, params->gamma1_bits, z);
         valid = mldsa_vec_check_low(z, params->l, hi);
+#endif
     }
     if ((ret == 0) && valid) {
+#ifndef WOLFSSL_MLDSA_VERIFY_SMALLEST_MEM
         /* Step 10: NTT(z) */
         mldsa_vec_ntt_full(z, params->l);
+#endif
 
          /* Step 9: Compute c from first 256 bits of commit. */
 #ifdef WOLFSSL_MLDSA_VERIFY_NO_MALLOC
@@ -9907,6 +9943,14 @@ static int mldsa_verify_with_mu(wc_MlDsaKey* key, const byte* mu,
             for (s = 0; (ret == 0) && (s < params->l); s++) {
                 /* Put s into buffer to be hashed. */
                 seed[MLDSA_PUB_SEED_SZ + 0] = (byte)s;
+            #ifdef WOLFSSL_MLDSA_VERIFY_SMALLEST_MEM
+                /* Step 2/10: Decode and NTT this z polynomial on demand (z is
+                 * not kept as a whole vector in this mode). */
+                mldsa_decode_gamma1(ze + (word32)s * zStride,
+                    params->gamma1_bits, z);
+                mldsa_ntt_full(z);
+                zt = z;
+            #endif
                 /* Step 3: Create polynomial from hashing seed. */
             #ifdef WOLFSSL_MLDSA_VERIFY_NO_MALLOC
                 ret = mldsa_rej_ntt_poly_ex(&key->shake, seed, a, key->h);
diff --git a/wolfssl/wolfcrypt/dilithium.h b/wolfssl/wolfcrypt/dilithium.h
index 123625e53f7..a37b806bff6 100644
--- a/wolfssl/wolfcrypt/dilithium.h
+++ b/wolfssl/wolfcrypt/dilithium.h
@@ -145,6 +145,24 @@
         #define WOLFSSL_MLDSA_VERIFY_SMALL_MEM
     #endif
 #endif
+#ifdef WOLFSSL_DILITHIUM_VERIFY_SMALLEST_MEM
+    #ifndef WOLFSSL_MLDSA_VERIFY_SMALLEST_MEM
+        #define WOLFSSL_MLDSA_VERIFY_SMALLEST_MEM
+    #endif
+#endif
+#ifdef WOLFSSL_MLDSA_VERIFY_SMALLEST_MEM
+    /* Smallest verify RAM: in addition to the small-mem, no-malloc verify path,
+     * stream the signature's z vector one polynomial at a time instead of
+     * pinning the whole l-vector in the key.  Trades a per-row z decode+NTT for
+     * roughly (l-1)*256*sizeof(sword32) bytes of state (e.g. ~6 KB for
+     * ML-DSA-87).  Implies the small-mem + no-malloc verify path. */
+    #ifndef WOLFSSL_MLDSA_VERIFY_SMALL_MEM
+        #define WOLFSSL_MLDSA_VERIFY_SMALL_MEM
+    #endif
+    #ifndef WOLFSSL_MLDSA_VERIFY_NO_MALLOC
+        #define WOLFSSL_MLDSA_VERIFY_NO_MALLOC
+    #endif
+#endif
 #ifdef WOLFSSL_DILITHIUM_MAKE_KEY_SMALL_MEM
     #ifndef WOLFSSL_MLDSA_MAKE_KEY_SMALL_MEM
         #define WOLFSSL_MLDSA_MAKE_KEY_SMALL_MEM
@@ -288,6 +306,22 @@
     #endif
 #endif
 
+/* On a 16-bit-int CPU (WC_16BIT_CPU) the toolchain synthesizes 32-/64-bit
+ * multiplies, and some (e.g. TI cl2000 for the C2000 C28x) miscompile the
+ * multiply-based Montgomery reduction in mldsa_mont_red().  The shift-based
+ * reduction is mathematically identical (q = 2^23 - 2^13 + 1, and the q^-1
+ * expansion likewise), avoids the wide multiply, and is also cheaper on such
+ * targets.  Select it by default there; a user can still override either
+ * macro explicitly. */
+#if defined(WC_16BIT_CPU)
+    #ifndef MLDSA_MUL_QINV_SLOW
+        #define MLDSA_MUL_QINV_SLOW
+    #endif
+    #ifndef MLDSA_MUL_Q_SLOW
+        #define MLDSA_MUL_Q_SLOW
+    #endif
+#endif
+
 #endif /* !WOLFSSL_NO_DILITHIUM_LEGACY_GATES */
 
 /* === Derived canonical gates ========================================== */
diff --git a/wolfssl/wolfcrypt/wc_mldsa.h b/wolfssl/wolfcrypt/wc_mldsa.h
index 58a94bd771c..deda3e4ef52 100644
--- a/wolfssl/wolfcrypt/wc_mldsa.h
+++ b/wolfssl/wolfcrypt/wc_mldsa.h
@@ -662,7 +662,11 @@ struct wc_MlDsaKey {
 #endif
 #if defined(WOLFSSL_MLDSA_VERIFY_NO_MALLOC) && \
     defined(WOLFSSL_MLDSA_VERIFY_SMALL_MEM)
+#ifdef WOLFSSL_MLDSA_VERIFY_SMALLEST_MEM
+    sword32 z[MLDSA_N];                  /* one z poly, streamed per use */
+#else
     sword32 z[MLDSA_MAX_L_VECTOR_COUNT];
+#endif
     sword32 c[MLDSA_N];
     sword32 w[MLDSA_N];
     sword32 t1[MLDSA_N];

From 12a3dceeff7eaf4f57a6c477019f6602dfdf357a Mon Sep 17 00:00:00 2001
From: David Garske <david@wolfssl.com>
Date: Thu, 18 Jun 2026 09:15:10 -0700
Subject: [PATCH 3/4] test/bench/ci: CHAR_BIT!=8 test vectors, NO_MALLOC bench,
 TI C2000 compile CI

 - test.c: store the SHA/SHAKE large_digest KAT vectors as brace-init byte
   arrays (clean octets) instead of "\x.." string literals, which a
   signed-16-bit-char toolchain (cl2000) would sign-extend.
 - benchmark.c: WOLFSSL_NO_MALLOC mode uses static plain/cipher buffers and
   skips the key/iv XMALLOC/XFREE (gated; default build unchanged).
 - scripts/ti-c2000/ + .github/workflows/ti-c2000-compile.yml: a hardware-free
   cl2000 compile-only CI guard for the CHAR_BIT!=8 wolfCrypt subset.
---
 .github/workflows/ti-c2000-compile.yml |  78 +++++++++
 scripts/ti-c2000/compile.sh            |  65 ++++++++
 scripts/ti-c2000/user_settings.h       |  84 ++++++++++
 wolfcrypt/benchmark/benchmark.c        |  35 +++-
 wolfcrypt/test/test.c                  | 216 ++++++++++++++-----------
 5 files changed, 382 insertions(+), 96 deletions(-)
 create mode 100644 .github/workflows/ti-c2000-compile.yml
 create mode 100755 scripts/ti-c2000/compile.sh
 create mode 100644 scripts/ti-c2000/user_settings.h

diff --git a/.github/workflows/ti-c2000-compile.yml b/.github/workflows/ti-c2000-compile.yml
new file mode 100644
index 00000000000..39f1e5c60f0
--- /dev/null
+++ b/.github/workflows/ti-c2000-compile.yml
@@ -0,0 +1,78 @@
+name: TI C2000 (C28x) compile-only
+
+# Compile-guard for the TI C2000 C28x port (CHAR_BIT == 16).  It builds the
+# wolfCrypt subset that carries the CHAR_BIT != 8 gated fixes with the TI cl2000
+# code generation tools - no linking, no C2000Ware, no hardware.  Purpose: catch
+# compile regressions in the octet/SP/ML-DSA gated paths.  On-target run-tests
+# live on a hardware-in-the-loop runner (there is no public C28x simulator).
+
+# START OF COMMON SECTION
+on:
+  # Only build when something that can affect the C28x compile changes, so the
+  # job (and the CGT it pulls) does not burn runner minutes on unrelated PRs.
+  push:
+    branches: [ 'master', 'main', 'release/**' ]
+    paths:
+      - 'wolfcrypt/src/**'
+      - 'wolfssl/wolfcrypt/**'
+      - 'scripts/ti-c2000/**'
+      - '.github/workflows/ti-c2000-compile.yml'
+  pull_request:
+    types: [opened, synchronize, reopened, ready_for_review]
+    branches: [ '*' ]
+    paths:
+      - 'wolfcrypt/src/**'
+      - 'wolfssl/wolfcrypt/**'
+      - 'scripts/ti-c2000/**'
+      - '.github/workflows/ti-c2000-compile.yml'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+# END OF COMMON SECTION
+
+jobs:
+  ti_c2000_compile:
+    name: cl2000 compile-only
+    if: ${{ github.event_name != 'pull_request' || github.event.pull_request.draft == false }}
+    runs-on: ubuntu-22.04
+    timeout-minutes: 20
+    env:
+      # TI C2000 code generation tools.  The CGT is a free download from TI.
+      # Update CGT_VER / CGT_URL when bumping the toolchain.  If the TI URL
+      # starts requiring a click-through, host the installer as a repo/org
+      # cache artifact and point CGT_URL at it (the cache step below makes the
+      # download a once-per-version cost).
+      CGT_VER: "22.6.2.LTS"
+      CGT_URL: "https://dr-download.ti.com/software-development/ide-configuration-compiler-or-debugger/MD-LP0nQ4O8eX/22.6.2.LTS/ti_cgt_c2000_22.6.2.LTS_linux-x64_installer.bin"
+      CGT_DIR: "${{ github.workspace }}/ti-cgt-c2000"
+    steps:
+      - uses: actions/checkout@v4
+        name: Checkout wolfSSL
+
+      - name: Cache TI C2000 CGT
+        id: cgt-cache
+        uses: actions/cache@v4
+        with:
+          path: ${{ env.CGT_DIR }}
+          key: ti-cgt-c2000-${{ env.CGT_VER }}
+
+      - name: Download + install TI C2000 CGT
+        if: steps.cgt-cache.outputs.cache-hit != 'true'
+        run: |
+          set -e
+          curl -fsSL "$CGT_URL" -o /tmp/cgt.bin
+          chmod +x /tmp/cgt.bin
+          /tmp/cgt.bin --mode unattended --prefix "$CGT_DIR"
+
+      - name: Locate cl2000
+        id: find-cl
+        run: |
+          CL=$(find "$CGT_DIR" -type f -name cl2000 | head -1)
+          test -n "$CL" || { echo "cl2000 not found under $CGT_DIR"; exit 1; }
+          echo "cgt_root=$(dirname "$(dirname "$CL")")" >> "$GITHUB_OUTPUT"
+
+      - name: Compile-only guard
+        run: |
+          CGT_ROOT="${{ steps.find-cl.outputs.cgt_root }}" \
+            scripts/ti-c2000/compile.sh
diff --git a/scripts/ti-c2000/compile.sh b/scripts/ti-c2000/compile.sh
new file mode 100755
index 00000000000..0731b6b5e90
--- /dev/null
+++ b/scripts/ti-c2000/compile.sh
@@ -0,0 +1,65 @@
+#!/bin/sh
+# compile.sh - compile-only guard for the TI C2000 (C28x, CHAR_BIT==16) port.
+#
+# Builds the wolfCrypt subset that compiles under CHAR_BIT==16 with the TI cl2000
+# code generation tools, using scripts/ti-c2000/user_settings.h.  No linking, no
+# C2000Ware, no hardware: this only catches compile regressions in the
+# CHAR_BIT != 8 gated code paths (SHA-2/3/SHAKE, ML-DSA-87 verify, SP-ECC).
+#
+# Usage:
+#   CGT_ROOT=/path/to/ti-cgt-c2000_xx.y.z scripts/ti-c2000/compile.sh
+#
+# CGT_ROOT must point at a TI C2000 codegen install (the dir containing
+# bin/cl2000).  The CGT is a free download from TI; in CI it is fetched/cached
+# by .github/workflows/ti-c2000-compile.yml.
+
+set -e
+
+: "${CGT_ROOT:?set CGT_ROOT to the ti-cgt-c2000 install (dir with bin/cl2000)}"
+
+# Repo root = two levels up from this script.
+SELF_DIR=$(cd "$(dirname "$0")" && pwd)
+WOLFROOT=$(cd "$SELF_DIR/../.." && pwd)
+CL="$CGT_ROOT/bin/cl2000"
+
+if [ ! -x "$CL" ]; then
+    echo "ERROR: cl2000 not found/executable at $CL" >&2
+    exit 2
+fi
+
+OUT=$(mktemp -d)
+trap 'rm -rf "$OUT"' EXIT
+
+INCS="-I$CGT_ROOT/include -I$WOLFROOT -I$SELF_DIR"
+CFLAGS="-v28 --abi=eabi --float_support=fpu32 --tmu_support=tmu1 -O2 \
+  --define=WOLFSSL_USER_SETTINGS --display_error_number --diag_warning=225"
+
+# wolfCrypt sources to compile-guard under CHAR_BIT==16.  This is the set that
+# carries the CHAR_BIT != 8 gated fixes (plus their direct deps) - the
+# regression surface for this port.  hash.c (an unmodified dispatch wrapper) is
+# intentionally omitted: its wc_OidGetHash() OID switch needs the fuller ASN/OID
+# config of a real build to avoid a 16-bit-int case-label fold, and it is
+# covered by the on-target example build, not by this minimal guard.
+SRCS="error wc_port memory logging misc coding \
+  sha256 sha512 sha3 wc_mldsa random ecc sp_int sp_c32"
+
+rc=0
+for s in $SRCS; do
+    printf 'CC  %s.c ... ' "$s"
+    if "$CL" $CFLAGS $INCS --compile_only --skip_assembler \
+            --asm_directory="$OUT" --obj_directory="$OUT" \
+            "$WOLFROOT/wolfcrypt/src/$s.c" > "$OUT/$s.log" 2>&1; then
+        echo "ok"
+    else
+        echo "FAIL"
+        cat "$OUT/$s.log"
+        rc=1
+    fi
+done
+
+if [ "$rc" -eq 0 ]; then
+    echo "TI C2000 compile-only guard: PASS"
+else
+    echo "TI C2000 compile-only guard: FAIL" >&2
+fi
+exit "$rc"
diff --git a/scripts/ti-c2000/user_settings.h b/scripts/ti-c2000/user_settings.h
new file mode 100644
index 00000000000..9d7c535fff4
--- /dev/null
+++ b/scripts/ti-c2000/user_settings.h
@@ -0,0 +1,84 @@
+/* user_settings.h - minimal wolfCrypt config for the TI C2000 (C28x,
+ * CHAR_BIT==16) compile-only CI guard.
+ *
+ * This is NOT a board config: it has no BSP/device dependencies.  Its only job
+ * is to enable the subset of wolfCrypt that exercises the CHAR_BIT != 8 gated
+ * code paths (SHA-2/3/SHAKE, ML-DSA-87 verify, ECDSA/ECDH P-256 via SP math)
+ * so that scripts/ti-c2000/compile.sh can compile them with cl2000 and catch
+ * regressions.  cl2000 predefines __TMS320C28XX__, so types.h auto-enables
+ * WOLFSSL_WIDE_BYTE; we do not set it here.
+ */
+#ifndef TI_C2000_CI_USER_SETTINGS_H
+#define TI_C2000_CI_USER_SETTINGS_H
+
+#define WOLFCRYPT_ONLY              /* crypto only - no TLS (no MD5/SHA1 dep) */
+#define WOLFSSL_GENERAL_ALIGNMENT 2
+#define HAVE_LIMITS_H
+#define WOLFSSL_NO_ASM
+#define NO_INLINE
+#define SINGLE_THREADED
+#define NO_FILESYSTEM
+#define NO_WOLFSSL_DIR
+#define NO_MAIN_DRIVER
+#define NO_DEV_RANDOM
+#define WOLFSSL_IGNORE_FILE_WARN
+#define BENCH_EMBEDDED
+#define NO_WOLFSSL_MEMORY
+#define WOLFSSL_GENSEED_FORTEST     /* dev-only seed; no TRNG on this part */
+
+/* Hashes */
+#define WOLFSSL_SHA512
+#define WOLFSSL_SHA384
+#define WOLFSSL_SHA3
+#define WOLFSSL_SHAKE128
+#define WOLFSSL_SHAKE256
+
+/* ML-DSA-87 verify (smallest-mem streaming verifier) */
+#define WOLFSSL_HAVE_MLDSA
+#define WOLFSSL_NO_ML_DSA_44
+#define WOLFSSL_NO_ML_DSA_65
+#define WOLFSSL_MLDSA_NO_ASN1
+#define WOLFSSL_MLDSA_VERIFY_ONLY
+#define WOLFSSL_MLDSA_VERIFY_SMALL_MEM
+#define WOLFSSL_MLDSA_VERIFY_NO_MALLOC
+#define WOLFSSL_MLDSA_VERIFY_SMALLEST_MEM
+#undef  WOLFSSL_MLDSA_ALIGNMENT
+#define WOLFSSL_MLDSA_ALIGNMENT 16
+#define WOLFSSL_SMALL_STACK
+
+/* ECDSA / ECDH P-256 via SP single-precision math (sp_c32.c) */
+#define HAVE_ECC
+#define ECC_USER_CURVES
+#define HAVE_ECC256
+#define HAVE_ECC_VERIFY
+#define HAVE_ECC_SIGN
+#define HAVE_ECC_DHE
+#define ECC_TIMING_RESISTANT
+#define WOLFSSL_SP_MATH
+#define WOLFSSL_HAVE_SP_ECC
+#define WOLFSSL_SP_NO_MALLOC
+#define WOLFSSL_SP_SMALL
+#define SP_WORD_SIZE 32
+#define WOLFSSL_SP_ALLOW_16BIT_CPU
+
+/* Off: anything that pulls in big-int/ASN/symmetric not under test here. */
+#define NO_RSA
+#define NO_DH
+#define NO_DSA
+#define NO_ASN
+#define NO_CERTS
+#define NO_PWDBASED
+#define NO_PKCS7
+#define NO_PKCS12
+#define NO_SIG_WRAPPER
+#define NO_AES
+#define NO_DES3
+#define NO_RC4
+#define NO_MD4
+#define NO_MD5
+#define NO_SHA
+#define NO_HMAC
+#define NO_ASN_TIME
+#define WOLFSSL_USER_CURRTIME
+
+#endif /* TI_C2000_CI_USER_SETTINGS_H */
diff --git a/wolfcrypt/benchmark/benchmark.c b/wolfcrypt/benchmark/benchmark.c
index 214d873bc2f..b0f0b439b84 100644
--- a/wolfcrypt/benchmark/benchmark.c
+++ b/wolfcrypt/benchmark/benchmark.c
@@ -2357,6 +2357,15 @@ static const char* bench_result_words2[][6] = {
 
 static int    numBlocks  = NUM_BLOCKS;
 static word32 bench_size = BENCH_SIZE;
+#ifdef WOLFSSL_NO_MALLOC
+    /* No heap: use file-scope static buffers for the core bench buffers.
+     * Sized as BENCH_SIZE + BENCH_CIPHER_ADD, plus the +16 slack used at the
+     * alloc site, plus 64 spare bytes: 64-byte alignment headroom that also
+     * absorbs rounding bench_buf_size up to the 16-byte AES block size. */
+    #define BENCH_MAX_PAD (BENCH_CIPHER_ADD + 16 + 64)
+    static THREAD_LS_T XGEN_ALIGN byte bench_plain_buf[BENCH_SIZE + BENCH_MAX_PAD];
+    static THREAD_LS_T XGEN_ALIGN byte bench_cipher_buf[BENCH_SIZE + BENCH_MAX_PAD];
+#endif
 static int base2 = 1;
 static int digest_stream = 1;
 #ifndef NO_HMAC
@@ -3805,7 +3814,23 @@ static void* benchmarks_do(void* args)
     if (bench_buf_size % 16)
         bench_buf_size += 16 - (bench_buf_size % 16);
 
-#ifdef WOLFSSL_AFALG_XILINX_AES
+#ifdef WOLFSSL_NO_MALLOC
+    /* No heap: point at the file-scope static buffers.  bench_size can be
+     * raised at runtime (benchmark_configure(), the -base16/auth size paths,
+     * or hash-file input), so confirm the requested size still fits the fixed
+     * static capacity before using them - otherwise the later XMEMSET / crypto
+     * writes would run past the end of the buffer. */
+    if ((unsigned long)bench_buf_size + 16UL >
+            (unsigned long)sizeof(bench_plain_buf)) {
+        printf("%sBenchmark size %lu exceeds WOLFSSL_NO_MALLOC static buffer "
+               "(%lu); rebuild with a larger BENCH_SIZE\n", err_prefix,
+               (unsigned long)bench_buf_size,
+               (unsigned long)sizeof(bench_plain_buf));
+        goto exit;
+    }
+    bench_plain = bench_plain_buf;
+    bench_cipher = bench_cipher_buf;
+#elif defined(WOLFSSL_AFALG_XILINX_AES)
     bench_plain = (byte*)aligned_alloc(64, (size_t)bench_buf_size + 16); /* native heap */
     bench_cipher = (byte*)aligned_alloc(64, (size_t)bench_buf_size + 16); /* native heap */
 #else
@@ -3918,7 +3943,8 @@ static void* benchmarks_do(void* args)
     }
 #endif
 
-#if defined(WOLFSSL_ASYNC_CRYPT) || defined(HAVE_INTEL_QA_SYNC)
+#if (defined(WOLFSSL_ASYNC_CRYPT) || defined(HAVE_INTEL_QA_SYNC)) && \
+    !defined(WOLFSSL_NO_MALLOC)
     bench_key = (byte*)XMALLOC(sizeof(bench_key_buf),
                                HEAP_HINT, DYNAMIC_TYPE_WOLF_BIGINT);
     bench_iv = (byte*)XMALLOC(sizeof(bench_iv_buf),
@@ -4744,9 +4770,12 @@ static void* benchmarks_do(void* args)
 
 exit:
     /* free benchmark buffers */
+#ifndef WOLFSSL_NO_MALLOC
+    /* under WOLFSSL_NO_MALLOC these point at file-scope static buffers */
     XFREE(bench_plain, HEAP_HINT, DYNAMIC_TYPE_WOLF_BIGINT);
     XFREE(bench_cipher, HEAP_HINT, DYNAMIC_TYPE_WOLF_BIGINT);
-#ifdef WOLFSSL_ASYNC_CRYPT
+#endif
+#if defined(WOLFSSL_ASYNC_CRYPT) && !defined(WOLFSSL_NO_MALLOC)
     XFREE(bench_key, HEAP_HINT, DYNAMIC_TYPE_WOLF_BIGINT);
     XFREE(bench_iv, HEAP_HINT, DYNAMIC_TYPE_WOLF_BIGINT);
 #endif
diff --git a/wolfcrypt/test/test.c b/wolfcrypt/test/test.c
index e95bba5ae5e..85ceb55a946 100644
--- a/wolfcrypt/test/test.c
+++ b/wolfcrypt/test/test.c
@@ -6432,18 +6432,18 @@ static wc_test_ret_t sha512_224_kat_test(wc_Sha512* sha, wc_Sha512* shaCopy)
 static wc_test_ret_t sha512_224_large_hash_test(wc_Sha512* sha)
 {
 #ifdef HASH_SIZE_LIMIT
-    static const char* large_digest =
-        "\x98\x68\xc3\xd9\xb9\xef\x17\x53"
-        "\x43\x66\x0e\x60\xdf\x29\xf8\xef"
-        "\x96\xe3\x93\x34\x8c\x6f\xc0\xeb"
-        "\x14\x6c\xcf\x6a";
+    static const byte large_digest[] = {
+        0x98, 0x68, 0xc3, 0xd9, 0xb9, 0xef, 0x17, 0x53, 0x43, 0x66, 0x0e, 0x60,
+        0xdf, 0x29, 0xf8, 0xef, 0x96, 0xe3, 0x93, 0x34, 0x8c, 0x6f, 0xc0, 0xeb,
+        0x14, 0x6c, 0xcf, 0x6a, 0x00
+    };
     int times = 20;
 #else
-    static const char* large_digest =
-        "\x26\x5f\x98\xd1\x76\x49\x71\x4e"
-        "\x82\xb7\x9d\x52\x32\x67\x9d\x56"
-        "\x91\xf5\x88\xc3\x05\xbb\x3f\x90"
-        "\xe2\x4e\x85\x05";
+    static const byte large_digest[] = {
+        0x26, 0x5f, 0x98, 0xd1, 0x76, 0x49, 0x71, 0x4e, 0x82, 0xb7, 0x9d, 0x52,
+        0x32, 0x67, 0x9d, 0x56, 0x91, 0xf5, 0x88, 0xc3, 0x05, 0xbb, 0x3f, 0x90,
+        0xe2, 0x4e, 0x85, 0x05, 0x00
+    };
     int times = 100;
 #endif
     byte      hash[WC_SHA512_224_DIGEST_SIZE];
@@ -6624,18 +6624,18 @@ static wc_test_ret_t sha512_256_kat_test(wc_Sha512* sha, wc_Sha512* shaCopy)
 static wc_test_ret_t sha512_256_large_hash_test(wc_Sha512* sha)
 {
 #ifdef HASH_SIZE_LIMIT
-    static const char* large_digest =
-        "\x49\xcc\xbc\x7a\x93\x0b\x02\xb8"
-        "\xad\x9a\x46\x51\x00\x1f\x13\x80"
-        "\x35\x84\x36\xf1\xf2\x3c\xeb\xd8"
-        "\x41\xd4\x06\x8b\x1d\x19\xad\x72";
+    static const byte large_digest[] = {
+        0x49, 0xcc, 0xbc, 0x7a, 0x93, 0x0b, 0x02, 0xb8, 0xad, 0x9a, 0x46, 0x51,
+        0x00, 0x1f, 0x13, 0x80, 0x35, 0x84, 0x36, 0xf1, 0xf2, 0x3c, 0xeb, 0xd8,
+        0x41, 0xd4, 0x06, 0x8b, 0x1d, 0x19, 0xad, 0x72, 0x00
+    };
     int times = 20;
 #else
-    static const char* large_digest =
-        "\x7a\xe3\x84\x05\xcb\x06\x22\x08"
-        "\x7e\x2c\x65\x89\x1f\x26\x45\xfd"
-        "\xad\xbc\x2e\x29\x83\x12\x84\x4b"
-        "\xf2\xa0\xde\xbe\x06\x11\xd7\x44";
+    static const byte large_digest[] = {
+        0x7a, 0xe3, 0x84, 0x05, 0xcb, 0x06, 0x22, 0x08, 0x7e, 0x2c, 0x65, 0x89,
+        0x1f, 0x26, 0x45, 0xfd, 0xad, 0xbc, 0x2e, 0x29, 0x83, 0x12, 0x84, 0x4b,
+        0xf2, 0xa0, 0xde, 0xbe, 0x06, 0x11, 0xd7, 0x44, 0x00
+    };
     int times = 100;
 #endif
     byte      hash[WC_SHA512_256_DIGEST_SIZE];
@@ -7000,9 +7000,11 @@ static wc_test_ret_t sha3_224_kat_test(wc_Sha3* sha, wc_Sha3* shaCopy)
 #ifndef NO_LARGE_HASH_TEST
 static wc_test_ret_t sha3_224_large_hash_test(wc_Sha3* sha)
 {
-    static const char* large_digest =
-        "\x13\xe5\xd3\x98\x7b\x94\xda\x41\x12\xc7\x1e\x92\x3a\x19"
-        "\x21\x20\x86\x6f\x24\xbf\x0a\x31\xbc\xfd\xd6\x70\x36\xf3";
+    static const byte large_digest[] = {
+        0x13, 0xe5, 0xd3, 0x98, 0x7b, 0x94, 0xda, 0x41, 0x12, 0xc7, 0x1e, 0x92,
+        0x3a, 0x19, 0x21, 0x20, 0x86, 0x6f, 0x24, 0xbf, 0x0a, 0x31, 0xbc, 0xfd,
+        0xd6, 0x70, 0x36, 0xf3, 0x00
+    };
     byte hash[WC_SHA3_224_DIGEST_SIZE];
     byte large_input[1024];
     wc_test_ret_t ret;
@@ -7214,9 +7216,11 @@ static wc_test_ret_t sha3_256_kat_test(wc_Sha3* sha, wc_Sha3* shaCopy)
 #ifndef NO_LARGE_HASH_TEST
 static wc_test_ret_t sha3_256_large_hash_test(wc_Sha3* sha)
 {
-    static const char* large_digest =
-        "\xdc\x90\xc0\xb1\x25\xdb\x2c\x34\x81\xa3\xff\xbc\x1e\x2e\x87\xeb"
-        "\x6d\x70\x85\x61\xe0\xe9\x63\x61\xff\xe5\x84\x4b\x1f\x68\x05\x15";
+    static const byte large_digest[] = {
+        0xdc, 0x90, 0xc0, 0xb1, 0x25, 0xdb, 0x2c, 0x34, 0x81, 0xa3, 0xff, 0xbc,
+        0x1e, 0x2e, 0x87, 0xeb, 0x6d, 0x70, 0x85, 0x61, 0xe0, 0xe9, 0x63, 0x61,
+        0xff, 0xe5, 0x84, 0x4b, 0x1f, 0x68, 0x05, 0x15, 0x00
+    };
     byte hash[WC_SHA3_256_DIGEST_SIZE];
     byte large_input[1024];
     wc_test_ret_t ret;
@@ -7431,10 +7435,12 @@ static wc_test_ret_t sha3_384_kat_test(wc_Sha3* sha, wc_Sha3* shaCopy)
 #ifndef NO_LARGE_HASH_TEST
 static wc_test_ret_t sha3_384_large_hash_test(wc_Sha3* sha)
 {
-    static const char* large_digest =
-        "\x30\x44\xec\x17\xef\x47\x9f\x55\x36\x11\xd6\x3f\x8a\x31\x5a\x71"
-        "\x8a\x71\xa7\x1d\x8e\x84\xe8\x6c\x24\x02\x2f\x7a\x08\x4e\xea\xd7"
-        "\x42\x36\x5d\xa8\xc2\xb7\x42\xad\xec\x19\xfb\xca\xc6\x64\xb3\xa4";
+    static const byte large_digest[] = {
+        0x30, 0x44, 0xec, 0x17, 0xef, 0x47, 0x9f, 0x55, 0x36, 0x11, 0xd6, 0x3f,
+        0x8a, 0x31, 0x5a, 0x71, 0x8a, 0x71, 0xa7, 0x1d, 0x8e, 0x84, 0xe8, 0x6c,
+        0x24, 0x02, 0x2f, 0x7a, 0x08, 0x4e, 0xea, 0xd7, 0x42, 0x36, 0x5d, 0xa8,
+        0xc2, 0xb7, 0x42, 0xad, 0xec, 0x19, 0xfb, 0xca, 0xc6, 0x64, 0xb3, 0xa4, 0x00
+    };
     byte hash[WC_SHA3_384_DIGEST_SIZE];
     byte large_input[1024];
     wc_test_ret_t ret;
@@ -7618,11 +7624,14 @@ static wc_test_ret_t sha3_512_kat_test(wc_Sha3* sha, wc_Sha3* shaCopy)
 #ifndef NO_LARGE_HASH_TEST
 static wc_test_ret_t sha3_512_large_hash_test(wc_Sha3* sha)
 {
-    static const char* large_digest =
-        "\x9c\x13\x26\xb6\x26\xb2\x94\x31\xbc\xf4\x34\xe9\x6f\xf2\xd6\x29"
-        "\x9a\xd0\x9b\x32\x63\x2f\x18\xa7\x5f\x23\xc9\x60\xc2\x32\x0c\xbc"
-        "\x57\x77\x33\xf1\x83\x81\x8a\xd3\x15\x7c\x93\xdc\x80\x9f\xed\x61"
-        "\x41\xa7\x5b\xfd\x32\x0e\x38\x15\xb0\x46\x3b\x7a\x4f\xfd\x44\x88";
+    static const byte large_digest[] = {
+        0x9c, 0x13, 0x26, 0xb6, 0x26, 0xb2, 0x94, 0x31, 0xbc, 0xf4, 0x34, 0xe9,
+        0x6f, 0xf2, 0xd6, 0x29, 0x9a, 0xd0, 0x9b, 0x32, 0x63, 0x2f, 0x18, 0xa7,
+        0x5f, 0x23, 0xc9, 0x60, 0xc2, 0x32, 0x0c, 0xbc, 0x57, 0x77, 0x33, 0xf1,
+        0x83, 0x81, 0x8a, 0xd3, 0x15, 0x7c, 0x93, 0xdc, 0x80, 0x9f, 0xed, 0x61,
+        0x41, 0xa7, 0x5b, 0xfd, 0x32, 0x0e, 0x38, 0x15, 0xb0, 0x46, 0x3b, 0x7a,
+        0x4f, 0xfd, 0x44, 0x88, 0x00
+    };
     byte hash[WC_SHA3_512_DIGEST_SIZE];
     byte large_input[1024];
     wc_test_ret_t ret;
@@ -7768,28 +7777,36 @@ static wc_test_ret_t shake128_absorb_test(wc_Shake* sha, byte *large_input_buf,
 {
     wc_test_ret_t ret = 0;
     int i;
-    static const char large_digest[] =
-        "\x2b\xd1\x69\x9f\xb3\x75\x40\x74\xb8\xb2\xd2\x0b\x92\x47\x9b\xfe"
-        "\xc9\x91\x48\xbe\xda\xa4\x09\xd7\x61\x35\x18\x05\x07\x71\xa5\x61"
-        "\x4d\xc4\x94\xad\xbe\x04\x7d\xad\x95\x2f\xeb\x2c\xc0\x10\x67\x43"
-        "\x40\xf1\x4a\x58\x1c\x54\xfa\x24\x1c\x1a\x4e\x8d\x9b\xbc\xea\xa7"
-        "\x32\xf2\x4c\xc7\x86\x05\x36\xdc\xb4\x42\xd8\x35\xd1\xb4\xa2\x79"
-        "\xa2\xe6\xee\x67\x4f\xbf\x2a\x93\x41\x88\x25\x56\x29\x90\x1a\x06"
-        "\xba\xfe\x9f\xa6\x1a\x74\xe8\x7e\x85\x4a\xc8\x58\x60\xb1\x7b\x18"
-        "\xdf\x77\x59\x46\x04\xc1\xff\x4b\x9b\xcb\xad\xfe\x91\x28\xf0\x01"
-        "\xc1\x33\xd0\x99\x99\x2e\x0c\x86\x84\x67\x4d\x37\xa4\x42\x45\x10"
-        "\xdc\x8f\xdb\x6f\xa6\x9b\xee\x8a\x60\xa5\x1f\x95\x3f\x8f\xf5\x31"
-        "\x4b\x1d\x48\x1e\x45\xff\x79\x5c\xbe\x72\xfc\x56\xed\x6d\x1a\x99"
-        "\x7f\x23\x7c\xd1\xa5\x50\x9e\xb0\x4d\x61\x37\xa5\xcb\x24\x71\x3b"
-        "\xa3\x60\x51\x2e\x80\x83\x8b\xe0\x55\x50\xa7\x1e\xcc\x9f\xac\x41"
-        "\x77\x2c\x79\x22\x30\x09\x1b\x1a\x83\x5b\x2c\x48\xdc\x09\x7d\x59"
-        "\x0d\xf0\x54\x17\xfb\x5e\x38\x68\xde\xdb\xc5\x93\xab\x17\x5f\x4b"
-        "\x4d\x6d\xf2\xc7\x4e\x15\x1e\x10\x76\xc4\xcb\x87\xd8\xb7\x9d\xa8"
-        "\xbf\xc5\x2e\x5e\xfc\xd3\x6c\x45\xd4\x5d\x72\x0f\x66\xeb\x67\x86"
-        "\xfa\x6c\xd6\x80\xa4\x23\xcb\x5d\xed\x3c\xde\xdc\x5b\x3d\xca\x95"
-        "\x43\x4b\xdc\xe8\x49\xd3\xe1\x01\xd4\xf1\xe4\x47\xcf\x56\xba\x71"
-        "\xb4\x69\xed\xe7\xdb\x0f\x89\xd6\xbb\xcd\x1a\xff\xb4\xbe\x72\x26"
-        "\xdc\x76\x79\xb3\x1a\x4b\xe6\x8d\x9b\x8e\xd9\xe9\xe6\xf9\xff\xa5";
+    static const byte large_digest[] = {
+        0x2b, 0xd1, 0x69, 0x9f, 0xb3, 0x75, 0x40, 0x74, 0xb8, 0xb2, 0xd2, 0x0b,
+        0x92, 0x47, 0x9b, 0xfe, 0xc9, 0x91, 0x48, 0xbe, 0xda, 0xa4, 0x09, 0xd7,
+        0x61, 0x35, 0x18, 0x05, 0x07, 0x71, 0xa5, 0x61, 0x4d, 0xc4, 0x94, 0xad,
+        0xbe, 0x04, 0x7d, 0xad, 0x95, 0x2f, 0xeb, 0x2c, 0xc0, 0x10, 0x67, 0x43,
+        0x40, 0xf1, 0x4a, 0x58, 0x1c, 0x54, 0xfa, 0x24, 0x1c, 0x1a, 0x4e, 0x8d,
+        0x9b, 0xbc, 0xea, 0xa7, 0x32, 0xf2, 0x4c, 0xc7, 0x86, 0x05, 0x36, 0xdc,
+        0xb4, 0x42, 0xd8, 0x35, 0xd1, 0xb4, 0xa2, 0x79, 0xa2, 0xe6, 0xee, 0x67,
+        0x4f, 0xbf, 0x2a, 0x93, 0x41, 0x88, 0x25, 0x56, 0x29, 0x90, 0x1a, 0x06,
+        0xba, 0xfe, 0x9f, 0xa6, 0x1a, 0x74, 0xe8, 0x7e, 0x85, 0x4a, 0xc8, 0x58,
+        0x60, 0xb1, 0x7b, 0x18, 0xdf, 0x77, 0x59, 0x46, 0x04, 0xc1, 0xff, 0x4b,
+        0x9b, 0xcb, 0xad, 0xfe, 0x91, 0x28, 0xf0, 0x01, 0xc1, 0x33, 0xd0, 0x99,
+        0x99, 0x2e, 0x0c, 0x86, 0x84, 0x67, 0x4d, 0x37, 0xa4, 0x42, 0x45, 0x10,
+        0xdc, 0x8f, 0xdb, 0x6f, 0xa6, 0x9b, 0xee, 0x8a, 0x60, 0xa5, 0x1f, 0x95,
+        0x3f, 0x8f, 0xf5, 0x31, 0x4b, 0x1d, 0x48, 0x1e, 0x45, 0xff, 0x79, 0x5c,
+        0xbe, 0x72, 0xfc, 0x56, 0xed, 0x6d, 0x1a, 0x99, 0x7f, 0x23, 0x7c, 0xd1,
+        0xa5, 0x50, 0x9e, 0xb0, 0x4d, 0x61, 0x37, 0xa5, 0xcb, 0x24, 0x71, 0x3b,
+        0xa3, 0x60, 0x51, 0x2e, 0x80, 0x83, 0x8b, 0xe0, 0x55, 0x50, 0xa7, 0x1e,
+        0xcc, 0x9f, 0xac, 0x41, 0x77, 0x2c, 0x79, 0x22, 0x30, 0x09, 0x1b, 0x1a,
+        0x83, 0x5b, 0x2c, 0x48, 0xdc, 0x09, 0x7d, 0x59, 0x0d, 0xf0, 0x54, 0x17,
+        0xfb, 0x5e, 0x38, 0x68, 0xde, 0xdb, 0xc5, 0x93, 0xab, 0x17, 0x5f, 0x4b,
+        0x4d, 0x6d, 0xf2, 0xc7, 0x4e, 0x15, 0x1e, 0x10, 0x76, 0xc4, 0xcb, 0x87,
+        0xd8, 0xb7, 0x9d, 0xa8, 0xbf, 0xc5, 0x2e, 0x5e, 0xfc, 0xd3, 0x6c, 0x45,
+        0xd4, 0x5d, 0x72, 0x0f, 0x66, 0xeb, 0x67, 0x86, 0xfa, 0x6c, 0xd6, 0x80,
+        0xa4, 0x23, 0xcb, 0x5d, 0xed, 0x3c, 0xde, 0xdc, 0x5b, 0x3d, 0xca, 0x95,
+        0x43, 0x4b, 0xdc, 0xe8, 0x49, 0xd3, 0xe1, 0x01, 0xd4, 0xf1, 0xe4, 0x47,
+        0xcf, 0x56, 0xba, 0x71, 0xb4, 0x69, 0xed, 0xe7, 0xdb, 0x0f, 0x89, 0xd6,
+        0xbb, 0xcd, 0x1a, 0xff, 0xb4, 0xbe, 0x72, 0x26, 0xdc, 0x76, 0x79, 0xb3,
+        0x1a, 0x4b, 0xe6, 0x8d, 0x9b, 0x8e, 0xd9, 0xe9, 0xe6, 0xf9, 0xff, 0xa5, 0x00
+    };
     byte  hash[sizeof(large_digest) - 1];
     testVector a, b, c, d, e;
     testVector test_sha[5];
@@ -7955,15 +7972,18 @@ WOLFSSL_TEST_SUBROUTINE wc_test_ret_t shake128_test(void)
 #else
     byte large_input[SHAKE128_LARGE_INPUT_BUFSIZ];
 #endif
-    static const char large_digest[] =
-        "\x88\xd7\x0e\x86\x46\x72\x6b\x3d\x7d\x22\xe1\xa9\x2d\x02\xdb\x35"
-        "\x92\x4f\x1b\x03\x90\xee\xa3\xce\xd1\x3a\x08\x3a\xd7\x4e\x10\xdf"
-        "\x09\x67\x33\x35\x4f\xdd\x38\x50\x5b\xcb\x75\xc7\xba\x65\xe5\xe8"
-        "\xb8\x76\xde\xc5\xee\xd7\xf1\x65\x93\x4e\x5e\xc4\xb1\xd7\x6b\xee"
-        "\x4b\x57\x48\xf5\x38\x49\x9e\x45\xa0\xf7\x32\xe9\x05\x26\x6a\x10"
-        "\x70\xd4\x7c\x19\x01\x1f\x6d\x37\xba\x7b\x74\xc2\xbc\xb6\xbc\x74"
-        "\xa3\x66\x6c\x9b\x11\x84\x9d\x4a\x36\xbc\x8a\x0d\x4c\xe3\x39\xfa"
-        "\xfa\x1b";
+    static const byte large_digest[] = {
+        0x88, 0xd7, 0x0e, 0x86, 0x46, 0x72, 0x6b, 0x3d, 0x7d, 0x22, 0xe1, 0xa9,
+        0x2d, 0x02, 0xdb, 0x35, 0x92, 0x4f, 0x1b, 0x03, 0x90, 0xee, 0xa3, 0xce,
+        0xd1, 0x3a, 0x08, 0x3a, 0xd7, 0x4e, 0x10, 0xdf, 0x09, 0x67, 0x33, 0x35,
+        0x4f, 0xdd, 0x38, 0x50, 0x5b, 0xcb, 0x75, 0xc7, 0xba, 0x65, 0xe5, 0xe8,
+        0xb8, 0x76, 0xde, 0xc5, 0xee, 0xd7, 0xf1, 0x65, 0x93, 0x4e, 0x5e, 0xc4,
+        0xb1, 0xd7, 0x6b, 0xee, 0x4b, 0x57, 0x48, 0xf5, 0x38, 0x49, 0x9e, 0x45,
+        0xa0, 0xf7, 0x32, 0xe9, 0x05, 0x26, 0x6a, 0x10, 0x70, 0xd4, 0x7c, 0x19,
+        0x01, 0x1f, 0x6d, 0x37, 0xba, 0x7b, 0x74, 0xc2, 0xbc, 0xb6, 0xbc, 0x74,
+        0xa3, 0x66, 0x6c, 0x9b, 0x11, 0x84, 0x9d, 0x4a, 0x36, 0xbc, 0x8a, 0x0d,
+        0x4c, 0xe3, 0x39, 0xfa, 0xfa, 0x1b, 0x00
+    };
     /*
     ** https://csrc.nist.gov/CSRC/media/Projects/Cryptographic-Standards-and-Guidelines/documents/examples/SHAKE128_Msg0.pdf
     ** d/e: NIST CAVP - full rate / multi-block output.
@@ -8149,24 +8169,31 @@ static wc_test_ret_t shake256_absorb_test(wc_Shake* sha, byte *large_input_buf,
 {
     wc_test_ret_t ret = 0;
     int i;
-    static const char large_digest[] =
-        "\x21\x25\x8e\xae\x6e\x4f\xa7\xe1\xb9\x6d\xa7\xc9\x7d\x46\x03\x69"
-        "\x29\x0d\x81\x49\xba\x5d\xaf\x37\xfd\xeb\x25\x52\x1d\xd9\xbd\x65"
-        "\xfa\x99\xb9\xd1\x70\x6b\xeb\xd4\xc1\x2c\xea\x24\x20\x27\xa7\xcd"
-        "\xfa\xe1\x81\xd9\xd5\xc1\x1c\xc7\xe9\x70\xc3\xc7\x21\x6f\x32\x22"
-        "\xe3\x27\xdb\x58\x5e\xea\x18\x2d\x63\x4d\x14\x6c\x94\xcf\x2b\x7e"
-        "\x6e\x2a\x74\xf3\xe0\xac\xb3\xb2\xcc\xef\x38\xe9\xe7\x35\xb3\xc5"
-        "\x77\x9d\xff\xe3\x08\x8e\xf8\x2c\x89\xbb\x45\x22\x16\x99\x91\xc0"
-        "\xe7\x71\x57\x75\xc5\xb1\xc6\xaf\x27\xcb\x64\x8c\xc4\xee\x3d\x5f"
-        "\x4c\x35\xfb\x1c\xf3\xf8\x0e\xfd\x5e\xfc\x07\xd8\x4d\x55\x32\x49"
-        "\x45\x0d\xab\x4a\x49\xc4\x83\xde\xd2\x50\xc9\x33\x8f\x85\xcd\x93"
-        "\x7a\xe6\x6b\xb4\x36\xf3\xb4\x02\x6e\x85\x9f\xda\x1c\xa5\x71\x43"
-        "\x2f\x3b\xfc\x09\xe7\xc0\x3c\xa4\xd1\x83\xb7\x41\x11\x1c\xa0\x48"
-        "\x3d\x0e\xda\xbc\x03\xfe\xb2\x3b\x17\xee\x48\xe8\x44\xba\x24\x08"
-        "\xd9\xdc\xfd\x01\x39\xd2\xe8\xc7\x31\x01\x25\xae\xe8\x01\xc6\x1a"
-        "\xb7\x90\x0d\x1e\xfc\x47\xc0\x78\x28\x17\x66\xf3\x61\xc5\xe6\x11"
-        "\x13\x46\x23\x5e\x1d\xc3\x83\x25\x66\x6c\x68\x1b\x30\xdd\xc4\xe6"
-        "\x83\x8b\x0f\x23\x58\x7e\x06\x5f\x4a\x2b\xed\xc9\x6c\x97\x68\x44";
+    static const byte large_digest[] = {
+        0x21, 0x25, 0x8e, 0xae, 0x6e, 0x4f, 0xa7, 0xe1, 0xb9, 0x6d, 0xa7, 0xc9,
+        0x7d, 0x46, 0x03, 0x69, 0x29, 0x0d, 0x81, 0x49, 0xba, 0x5d, 0xaf, 0x37,
+        0xfd, 0xeb, 0x25, 0x52, 0x1d, 0xd9, 0xbd, 0x65, 0xfa, 0x99, 0xb9, 0xd1,
+        0x70, 0x6b, 0xeb, 0xd4, 0xc1, 0x2c, 0xea, 0x24, 0x20, 0x27, 0xa7, 0xcd,
+        0xfa, 0xe1, 0x81, 0xd9, 0xd5, 0xc1, 0x1c, 0xc7, 0xe9, 0x70, 0xc3, 0xc7,
+        0x21, 0x6f, 0x32, 0x22, 0xe3, 0x27, 0xdb, 0x58, 0x5e, 0xea, 0x18, 0x2d,
+        0x63, 0x4d, 0x14, 0x6c, 0x94, 0xcf, 0x2b, 0x7e, 0x6e, 0x2a, 0x74, 0xf3,
+        0xe0, 0xac, 0xb3, 0xb2, 0xcc, 0xef, 0x38, 0xe9, 0xe7, 0x35, 0xb3, 0xc5,
+        0x77, 0x9d, 0xff, 0xe3, 0x08, 0x8e, 0xf8, 0x2c, 0x89, 0xbb, 0x45, 0x22,
+        0x16, 0x99, 0x91, 0xc0, 0xe7, 0x71, 0x57, 0x75, 0xc5, 0xb1, 0xc6, 0xaf,
+        0x27, 0xcb, 0x64, 0x8c, 0xc4, 0xee, 0x3d, 0x5f, 0x4c, 0x35, 0xfb, 0x1c,
+        0xf3, 0xf8, 0x0e, 0xfd, 0x5e, 0xfc, 0x07, 0xd8, 0x4d, 0x55, 0x32, 0x49,
+        0x45, 0x0d, 0xab, 0x4a, 0x49, 0xc4, 0x83, 0xde, 0xd2, 0x50, 0xc9, 0x33,
+        0x8f, 0x85, 0xcd, 0x93, 0x7a, 0xe6, 0x6b, 0xb4, 0x36, 0xf3, 0xb4, 0x02,
+        0x6e, 0x85, 0x9f, 0xda, 0x1c, 0xa5, 0x71, 0x43, 0x2f, 0x3b, 0xfc, 0x09,
+        0xe7, 0xc0, 0x3c, 0xa4, 0xd1, 0x83, 0xb7, 0x41, 0x11, 0x1c, 0xa0, 0x48,
+        0x3d, 0x0e, 0xda, 0xbc, 0x03, 0xfe, 0xb2, 0x3b, 0x17, 0xee, 0x48, 0xe8,
+        0x44, 0xba, 0x24, 0x08, 0xd9, 0xdc, 0xfd, 0x01, 0x39, 0xd2, 0xe8, 0xc7,
+        0x31, 0x01, 0x25, 0xae, 0xe8, 0x01, 0xc6, 0x1a, 0xb7, 0x90, 0x0d, 0x1e,
+        0xfc, 0x47, 0xc0, 0x78, 0x28, 0x17, 0x66, 0xf3, 0x61, 0xc5, 0xe6, 0x11,
+        0x13, 0x46, 0x23, 0x5e, 0x1d, 0xc3, 0x83, 0x25, 0x66, 0x6c, 0x68, 0x1b,
+        0x30, 0xdd, 0xc4, 0xe6, 0x83, 0x8b, 0x0f, 0x23, 0x58, 0x7e, 0x06, 0x5f,
+        0x4a, 0x2b, 0xed, 0xc9, 0x6c, 0x97, 0x68, 0x44, 0x00
+    };
     byte  hash[sizeof(large_digest) - 1];
     testVector a, b, c, d, e;
     testVector test_sha[5];
@@ -8320,15 +8347,18 @@ WOLFSSL_TEST_SUBROUTINE wc_test_ret_t shake256_test(void)
 #else
     byte large_input[SHAKE256_LARGE_INPUT_BUFSIZ];
 #endif
-    static const char large_digest[] =
-        "\x90\x32\x4a\xcc\xd1\xdf\xb8\x0b\x79\x1f\xb8\xc8\x5b\x54\xc8\xe7"
-        "\x45\xf5\x60\x6b\x38\x26\xb2\x0a\xee\x38\x01\xf3\xd9\xfa\x96\x9f"
-        "\x6a\xd7\x15\xdf\xb6\xc2\xf4\x20\x33\x44\x55\xe8\x2a\x09\x2b\x68"
-        "\x2e\x18\x65\x5e\x65\x93\x28\xbc\xb1\x9e\xe2\xb1\x92\xea\x98\xac"
-        "\x21\xef\x4c\xe1\xb4\xb7\xbe\x81\x5c\x1d\xd3\xb7\x17\xe5\xbb\xc5"
-        "\x8c\x68\xb7\xfb\xac\x55\x8a\x9b\x4d\x91\xe4\x9f\x72\xbb\x6e\x38"
-        "\xaf\x21\x7d\x21\xaa\x98\x4e\x75\xc4\xb4\x1c\x7c\x50\x45\x54\xf9"
-        "\xea\x26";
+    static const byte large_digest[] = {
+        0x90, 0x32, 0x4a, 0xcc, 0xd1, 0xdf, 0xb8, 0x0b, 0x79, 0x1f, 0xb8, 0xc8,
+        0x5b, 0x54, 0xc8, 0xe7, 0x45, 0xf5, 0x60, 0x6b, 0x38, 0x26, 0xb2, 0x0a,
+        0xee, 0x38, 0x01, 0xf3, 0xd9, 0xfa, 0x96, 0x9f, 0x6a, 0xd7, 0x15, 0xdf,
+        0xb6, 0xc2, 0xf4, 0x20, 0x33, 0x44, 0x55, 0xe8, 0x2a, 0x09, 0x2b, 0x68,
+        0x2e, 0x18, 0x65, 0x5e, 0x65, 0x93, 0x28, 0xbc, 0xb1, 0x9e, 0xe2, 0xb1,
+        0x92, 0xea, 0x98, 0xac, 0x21, 0xef, 0x4c, 0xe1, 0xb4, 0xb7, 0xbe, 0x81,
+        0x5c, 0x1d, 0xd3, 0xb7, 0x17, 0xe5, 0xbb, 0xc5, 0x8c, 0x68, 0xb7, 0xfb,
+        0xac, 0x55, 0x8a, 0x9b, 0x4d, 0x91, 0xe4, 0x9f, 0x72, 0xbb, 0x6e, 0x38,
+        0xaf, 0x21, 0x7d, 0x21, 0xaa, 0x98, 0x4e, 0x75, 0xc4, 0xb4, 0x1c, 0x7c,
+        0x50, 0x45, 0x54, 0xf9, 0xea, 0x26, 0x00
+    };
     /*
     ** https://csrc.nist.gov/CSRC/media/Projects/Cryptographic-Standards-and-Guidelines/documents/examples/SHAKE256_Msg0.pdf
     ** d/e: NIST CAVP - full rate / multi-block output.

From 20e4053a9fa00b6e216185f2157ee9512c2ceb17 Mon Sep 17 00:00:00 2001
From: David Garske <david@wolfssl.com>
Date: Thu, 18 Jun 2026 11:44:21 -0700
Subject: [PATCH 4/4] dilithium: faster Montgomery q^-1 via 64-bit-widened
 multiply on 16-bit CPUs

The TI cl2000 (C2000 C28x) compiler miscompiles the 32x32->32 low multiply
used for the q^-1 step of mldsa_mont_red() - verified on a TMS320F28P550SJ,
the ML-DSA-87 verify KAT fails (res=0) - but compiles the 32x64->64 widening
multiply correctly. Compute the q^-1 product through the 64-bit path
(MLDSA_MUL_QINV_WIDE64): correct on any conforming compiler and, on the C28x,
~4% faster than the shift-based reduction (305 vs 317 ms/op for ML-DSA-87
verify). dilithium.h auto-selects it for WC_16BIT_CPU and leaves the q
multiply enabled (it compiles correctly); a user can still force the shift
form with MLDSA_MUL_QINV_SLOW / MLDSA_MUL_Q_SLOW. Validated on hardware for
keygen+sign+verify (round-trip res=1). No effect on builds that define neither
WC_16BIT_CPU nor MLDSA_MUL_QINV_WIDE64 (e.g. ordinary >=32-bit-int targets).
---
 wolfcrypt/src/wc_mldsa.c      |  6 ++++++
 wolfssl/wolfcrypt/dilithium.h | 20 ++++++++------------
 2 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/wolfcrypt/src/wc_mldsa.c b/wolfcrypt/src/wc_mldsa.c
index 62538dc0e88..3464ab49d0e 100644
--- a/wolfcrypt/src/wc_mldsa.c
+++ b/wolfcrypt/src/wc_mldsa.c
@@ -5594,7 +5594,13 @@ static void mldsa_vec_use_hint(sword32* w1, byte k, sword32 gamma2,
 static sword32 mldsa_mont_red(sword64 a)
 {
 #ifndef MLDSA_MUL_QINV_SLOW
+#ifdef MLDSA_MUL_QINV_WIDE64
+    /* Low 32 bits of the q^-1 product via the 64-bit multiply; see
+     * MLDSA_MUL_QINV_WIDE64 in dilithium.h. */
+    sword64 t = (sword32)((sword64)(sword32)a * (sword64)MLDSA_QINV);
+#else
     sword64 t = (sword32)((sword32)a * (sword32)MLDSA_QINV);
+#endif
 #else
     sword64 t = (sword32)((sword32)a + (sword32)((sword32)a << 13) -
         (sword32)((sword32)a << 23) + (sword32)((sword32)a << 26));
diff --git a/wolfssl/wolfcrypt/dilithium.h b/wolfssl/wolfcrypt/dilithium.h
index a37b806bff6..2a344a711a4 100644
--- a/wolfssl/wolfcrypt/dilithium.h
+++ b/wolfssl/wolfcrypt/dilithium.h
@@ -306,19 +306,15 @@
     #endif
 #endif
 
-/* On a 16-bit-int CPU (WC_16BIT_CPU) the toolchain synthesizes 32-/64-bit
- * multiplies, and some (e.g. TI cl2000 for the C2000 C28x) miscompile the
- * multiply-based Montgomery reduction in mldsa_mont_red().  The shift-based
- * reduction is mathematically identical (q = 2^23 - 2^13 + 1, and the q^-1
- * expansion likewise), avoids the wide multiply, and is also cheaper on such
- * targets.  Select it by default there; a user can still override either
- * macro explicitly. */
+/* MLDSA_MUL_QINV_WIDE64: compute the q^-1 step of mldsa_mont_red() through the
+ * 32x64->64 widening multiply instead of a 32x32->32 low multiply.  Some 16-bit
+ * toolchains miscompile the 32x32->32 form (e.g. TI cl2000 on the C28x); the
+ * widening form is correct on any conforming compiler and no slower.  Default
+ * it on for WC_16BIT_CPU; a user can force the shift form with
+ * MLDSA_MUL_QINV_SLOW / MLDSA_MUL_Q_SLOW. */
 #if defined(WC_16BIT_CPU)
-    #ifndef MLDSA_MUL_QINV_SLOW
-        #define MLDSA_MUL_QINV_SLOW
-    #endif
-    #ifndef MLDSA_MUL_Q_SLOW
-        #define MLDSA_MUL_Q_SLOW
+    #if !defined(MLDSA_MUL_QINV_SLOW) && !defined(MLDSA_MUL_QINV_WIDE64)
+        #define MLDSA_MUL_QINV_WIDE64
     #endif
 #endif