From 5e898d7f778e2eccb610d4b467372a630efd1a47 Mon Sep 17 00:00:00 2001 From: David Garske Date: Thu, 18 Jun 2026 09:15:10 -0700 Subject: [PATCH 1/4] wolfcrypt: support CHAR_BIT != 8 (WOLFSSL_WIDE_BYTE) targets - infra, hashes, DRBG Enables wolfCrypt on toolchains where a C byte/char is wider than 8 bits (e.g. TI C2000 C28x, CHAR_BIT == 16), all gated on WOLFSSL_WIDE_BYTE and a no-op on 8-bit-byte targets (the default fast paths are left exactly as-is): - types.h: auto-set WOLFSSL_WIDE_BYTE for CHAR_BIT != 8 / known TI C2000 toolchains (and define CHAR_BIT = 16 when <limits.h> is absent); wc_port.h/.c widen the atomic init-state bitfield + CHAR_BIT static assert for 16-bit int. - settings.h + sp_int.h: allow SP math on a 16-bit-int CPU via WOLFSSL_SP_ALLOW_16BIT_CPU, and detect a 16-bit char in the SP smallest-type selection. - misc.c/misc.h: shared big-endian octet<->word helpers (WordsFromBytesBE32/64, BytesFromWordsBE32/64) for WOLFSSL_WIDE_BYTE, where a word cannot be aliased as an octet stream. They are CHAR_BIT-generic, cl2000-safe (loads accumulate with <<= 8, since (word)octet << 24 is miscompiled as a 16-bit shift), in-place safe for the SHA schedule, and store by octet count for partial digests. misc.c rotate width uses CHAR_BIT. - coding.c: mask the constant-time base64 result to an octet. - sha256.c/sha512.c: use the shared helpers for the schedule load and digest store, plus a CHAR_BIT*sizeof length carry; sha3.c: octet-wise Keccak squeeze. - random.c: Hash-DRBG length + reseed-counter serialization via the shared helpers (and an octet-masked carry) under WOLFSSL_WIDE_BYTE; default builds keep the word-aliasing path unchanged. WOLFSSL_WIDE_BYTE replaces the earlier WOLFSSL_NO_OCTET_BYTE working name. 
--- .wolfssl_known_macro_extras | 7 ++++ wolfcrypt/src/coding.c | 2 +- wolfcrypt/src/misc.c | 75 +++++++++++++++++++++++++++++++++--- wolfcrypt/src/random.c | 56 ++++++++++++++++++++++++--- wolfcrypt/src/sha256.c | 54 +++++++++++++++++++++++++- wolfcrypt/src/sha3.c | 49 ++++++++++++++++++++--- wolfcrypt/src/sha512.c | 68 ++++++++++++++++++++++++++++++-- wolfcrypt/src/wc_port.c | 2 +- wolfssl/wolfcrypt/misc.h | 13 +++++++ wolfssl/wolfcrypt/settings.h | 8 +++- wolfssl/wolfcrypt/sp_int.h | 11 +++++- wolfssl/wolfcrypt/types.h | 52 ++++++++++++++++++++++++- wolfssl/wolfcrypt/wc_port.h | 9 ++++- 13 files changed, 376 insertions(+), 30 deletions(-) diff --git a/.wolfssl_known_macro_extras b/.wolfssl_known_macro_extras index 101ebf2fa88..31da154be16 100644 --- a/.wolfssl_known_macro_extras +++ b/.wolfssl_known_macro_extras @@ -747,6 +747,7 @@ WOLFSSL_CLANG_TIDY WOLFSSL_CLIENT_EXAMPLE WOLFSSL_CONTIKI WOLFSSL_CRL_ALLOW_MISSING_CDP +WOLFSSL_DILITHIUM_VERIFY_SMALLEST_MEM WOLFSSL_DISABLE_EARLY_SANITY_CHECKS WOLFSSL_DRBG_SHA256 WOLFSSL_DTLS13_ECHO_LEGACY_SESSION_ID @@ -1115,6 +1116,12 @@ __SUNPRO_CC __SVR4 __TASKING__ __TI_COMPILER_VERSION__ +__TMS320C2000__ +__TMS320C2800__ +__TMS320C28XX__ +__TMS320C54X__ +__TMS320C5500__ +__TMS320C55X__ __TURBOC__ __UNIX__ __USE_GNU diff --git a/wolfcrypt/src/coding.c b/wolfcrypt/src/coding.c index b3f804fcf02..8d91b2674b8 100644 --- a/wolfcrypt/src/coding.c +++ b/wolfcrypt/src/coding.c @@ -72,7 +72,7 @@ static WC_INLINE byte Base64_Char2Val_CT(byte c) v |= ((slashStart >> 8) ^ (slashEnd >> 8)) & (slashStart + 63 + 1); v |= ((plusStart >> 8) ^ (plusEnd >> 8)) & (plusStart + 62 + 1); - return (byte)(v - 1); + return WC_OCTET(v - 1); } #ifndef BASE64_NO_TABLE diff --git a/wolfcrypt/src/misc.c b/wolfcrypt/src/misc.c index 669794702dc..98636d2c5d4 100644 --- a/wolfcrypt/src/misc.c +++ b/wolfcrypt/src/misc.c @@ -108,13 +108,13 @@ masking and clearing memory logic. 
WC_MISC_STATIC WC_INLINE word32 rotlFixed(word32 x, word32 y) { - return (x << y) | (x >> (sizeof(x) * 8 - y)); + return (x << y) | (x >> (sizeof(x) * CHAR_BIT - y)); } /* This routine performs a right circular arithmetic shift of <x> by <y> value. */ WC_MISC_STATIC WC_INLINE word32 rotrFixed(word32 x, word32 y) { - return (x >> y) | (x << (sizeof(x) * 8 - y)); + return (x >> y) | (x << (sizeof(x) * CHAR_BIT - y)); } #endif @@ -122,14 +122,14 @@ masking and clearing memory logic. /* This routine performs a left circular arithmetic shift of <x> by <y> value */ WC_MISC_STATIC WC_INLINE word16 rotlFixed16(word16 x, word16 y) { - return (word16)((x << y) | (x >> (sizeof(x) * 8U - y))); + return (word16)((x << y) | (x >> (sizeof(x) * CHAR_BIT - y))); } /* This routine performs a right circular arithmetic shift of <x> by <y> value */ WC_MISC_STATIC WC_INLINE word16 rotrFixed16(word16 x, word16 y) { - return (word16)((x >> y) | (x << (sizeof(x) * 8U - y))); + return (word16)((x >> y) | (x << (sizeof(x) * CHAR_BIT - y))); } /* This routine performs a byte swap of 32-bit word value. */ @@ -329,13 +329,13 @@ WC_MISC_STATIC WC_INLINE void writeUnalignedWords64(byte *out, const word64 *in, WC_MISC_STATIC WC_INLINE word64 rotlFixed64(word64 x, word64 y) { - return (x << y) | (x >> (sizeof(y) * 8 - y)); + return (x << y) | (x >> (sizeof(y) * CHAR_BIT - y)); } WC_MISC_STATIC WC_INLINE word64 rotrFixed64(word64 x, word64 y) { - return (x >> y) | (x << (sizeof(y) * 8 - y)); + return (x >> y) | (x << (sizeof(y) * CHAR_BIT - y)); } @@ -410,6 +410,69 @@ WC_MISC_STATIC WC_INLINE void ByteReverseWords64(word64* out, const word64* in, #endif /* WORD64_AVAILABLE && !WOLFSSL_NO_WORD64_OPS */ +#ifdef WOLFSSL_WIDE_BYTE +/* Big-endian octet <-> word conversion for targets where a C 'byte' (char) is + * wider than 8 bits (WOLFSSL_WIDE_BYTE, i.e. CHAR_BIT != 8 - e.g. the TI C2000 + * C28x). 
There a word holds several octets per addressable cell, so it cannot + * be aliased as an octet stream (XMEMCPY / ByteReverseWords / (byte) casts all + * assume one octet per cell). These helpers move between an octet stream (one + * octet per byte cell) and big-endian words with shifts, correct for any + * CHAR_BIT. Shared by SHA-2 (schedule load + digest store) and the Hash-DRBG. + * + * Loads accumulate through a word with <<= 8 so each shift is unambiguously + * full-word width: a single (word32)octet << 24 is miscompiled as a 16-bit + * shift by the TI cl2000 (the top octets are dropped). A load reads all of a + * word's octets before writing the word, so it is safe in place over the + * destination word buffer. Stores take an octet count (not a word count) so a + * partial trailing word works - e.g. SHA-512/224's 28-octet digest. */ +WC_MISC_STATIC WC_INLINE void WordsFromBytesBE32(word32* w, const byte* b, + word32 wordCnt) +{ + word32 i; + word32 r; + for (i = 0; i < wordCnt; i++) { + r = (word32)(b[(i * 4) + 0] & 0xFF); r <<= 8; + r |= (word32)(b[(i * 4) + 1] & 0xFF); r <<= 8; + r |= (word32)(b[(i * 4) + 2] & 0xFF); r <<= 8; + r |= (word32)(b[(i * 4) + 3] & 0xFF); + w[i] = r; + } +} +WC_MISC_STATIC WC_INLINE void BytesFromWordsBE32(byte* b, const word32* w, + word32 byteCnt) +{ + word32 i; + for (i = 0; i < byteCnt; i++) { + b[i] = (byte)((w[i >> 2] >> (24 - ((i & 0x3) * 8))) & 0xFF); + } +} +#ifdef WORD64_AVAILABLE +WC_MISC_STATIC WC_INLINE void WordsFromBytesBE64(word64* w, const byte* b, + word32 wordCnt) +{ + word32 i; + int j; + word64 r; + for (i = 0; i < wordCnt; i++) { + r = 0; + for (j = 0; j < 8; j++) { + r <<= 8; + r |= (word64)(b[(i * 8) + j] & 0xFF); + } + w[i] = r; + } +} +WC_MISC_STATIC WC_INLINE void BytesFromWordsBE64(byte* b, const word64* w, + word32 byteCnt) +{ + word32 i; + for (i = 0; i < byteCnt; i++) { + b[i] = (byte)((w[i >> 3] >> (56 - ((i & 0x7) * 8))) & 0xFF); + } +} +#endif /* WORD64_AVAILABLE */ +#endif /* WOLFSSL_WIDE_BYTE */ 
+ #ifndef WOLFSSL_NO_XOR_OPS /* Leave no doubt that WOLFSSL_WORD_SIZE is a power of 2. */ diff --git a/wolfcrypt/src/random.c b/wolfcrypt/src/random.c index a75d3400e70..0a720986086 100644 --- a/wolfcrypt/src/random.c +++ b/wolfcrypt/src/random.c @@ -464,7 +464,10 @@ static int Hash_df(DRBG_internal* drbg, byte* out, word32 outSz, byte type, byte ctr; word32 i; word32 len; - word32 bits = (outSz * 8); /* reverse byte order */ + word32 bits = (outSz * 8); +#ifdef WOLFSSL_WIDE_BYTE + byte bitsBuf[4]; /* the 32-bit length as four big-endian octets */ +#endif #ifdef WOLFSSL_SMALL_STACK_CACHE wc_Sha256* sha = &drbg->sha256; #else @@ -489,7 +492,11 @@ static int Hash_df(DRBG_internal* drbg, byte* out, word32 outSz, byte type, return DRBG_FAILURE; #endif -#ifdef LITTLE_ENDIAN_ORDER +#ifdef WOLFSSL_WIDE_BYTE + /* A word32 cannot be aliased as an octet stream where a C byte is wider + * than 8 bits; emit the length as four big-endian octets (shared helper). */ + BytesFromWordsBE32(bitsBuf, &bits, 4); +#elif defined(LITTLE_ENDIAN_ORDER) bits = ByteReverseWord32(bits); #endif len = (outSz / OUTPUT_BLOCK_LEN) @@ -509,7 +516,11 @@ static int Hash_df(DRBG_internal* drbg, byte* out, word32 outSz, byte type, ret = wc_Sha256Update(sha, &ctr, sizeof(ctr)); if (ret == 0) { ctr++; +#ifdef WOLFSSL_WIDE_BYTE + ret = wc_Sha256Update(sha, bitsBuf, sizeof(bitsBuf)); +#else ret = wc_Sha256Update(sha, (byte*)&bits, sizeof(bits)); +#endif } if (ret == 0) { @@ -666,7 +677,7 @@ static WC_INLINE void array_add_one(byte* data, word32 dataSz) { int i; for (i = (int)dataSz - 1; i >= 0; i--) { - data[i]++; + data[i] = WC_OCTET(data[i] + 1); if (data[i] != 0) break; } } @@ -789,14 +800,14 @@ static WC_INLINE void array_add(byte* d, word32 dLen, const byte* s, word32 sLen dIdx = (int)dLen - 1; for (sIdx = (int)sLen - 1; sIdx >= 0; sIdx--) { carry = (word16)(carry + d[dIdx] + s[sIdx]); - d[dIdx] = (byte)carry; + d[dIdx] = WC_OCTET(carry); carry >>= 8; dIdx--; } for (; dIdx >= 0; dIdx--) { carry = 
(word16)(carry + d[dIdx]); - d[dIdx] = (byte)carry; + d[dIdx] = WC_OCTET(carry); carry >>= 8; } } @@ -820,6 +831,9 @@ static int Hash_DRBG_Generate(DRBG_internal* drbg, byte* out, word32 outSz, #else word32 reseedCtr; #endif +#ifdef WOLFSSL_WIDE_BYTE + byte ctrBuf[8]; /* reseed counter as big-endian octets */ +#endif if (drbg == NULL) { return DRBG_FAILURE; @@ -914,6 +928,17 @@ static int Hash_DRBG_Generate(DRBG_internal* drbg, byte* out, word32 outSz, if (ret == 0) { array_add(drbg->V, sizeof(drbg->V), digest, WC_SHA256_DIGEST_SIZE); array_add(drbg->V, sizeof(drbg->V), drbg->C, sizeof(drbg->C)); +#ifdef WOLFSSL_WIDE_BYTE + /* A word cannot be aliased as an octet stream where a C byte is + * wider than 8 bits; add the counter as big-endian octets. */ + #ifdef WORD64_AVAILABLE + BytesFromWordsBE64(ctrBuf, &reseedCtr, 8); + array_add(drbg->V, sizeof(drbg->V), ctrBuf, 8); + #else + BytesFromWordsBE32(ctrBuf, &reseedCtr, 4); + array_add(drbg->V, sizeof(drbg->V), ctrBuf, 4); + #endif +#else #ifdef LITTLE_ENDIAN_ORDER #ifdef WORD64_AVAILABLE reseedCtr = ByteReverseWord64(reseedCtr); @@ -923,6 +948,7 @@ static int Hash_DRBG_Generate(DRBG_internal* drbg, byte* out, word32 outSz, #endif array_add(drbg->V, sizeof(drbg->V), (byte*)&reseedCtr, sizeof(reseedCtr)); +#endif ret = DRBG_SUCCESS; } drbg->reseedCtr++; @@ -1050,6 +1076,9 @@ static int Hash512_df(DRBG_SHA512_internal* drbg, byte* out, word32 outSz, word32 i; word32 len; word32 bits = (outSz * 8); +#ifdef WOLFSSL_WIDE_BYTE + byte bitsBuf[4]; /* the 32-bit length as four big-endian octets */ +#endif #ifdef WOLFSSL_SMALL_STACK_CACHE wc_Sha512* sha = &drbg->sha512; #else @@ -1082,7 +1111,9 @@ static int Hash512_df(DRBG_SHA512_internal* drbg, byte* out, word32 outSz, return DRBG_FAILURE; #endif -#ifdef LITTLE_ENDIAN_ORDER +#ifdef WOLFSSL_WIDE_BYTE + BytesFromWordsBE32(bitsBuf, &bits, 4); /* see Hash_df */ +#elif defined(LITTLE_ENDIAN_ORDER) bits = ByteReverseWord32(bits); #endif len = (outSz / OUTPUT_BLOCK_LEN_SHA512) @@ 
-1102,7 +1133,11 @@ static int Hash512_df(DRBG_SHA512_internal* drbg, byte* out, word32 outSz, ret = wc_Sha512Update(sha, &ctr, sizeof(ctr)); if (ret == 0) { ctr++; +#ifdef WOLFSSL_WIDE_BYTE + ret = wc_Sha512Update(sha, bitsBuf, sizeof(bitsBuf)); +#else ret = wc_Sha512Update(sha, (byte*)&bits, sizeof(bits)); +#endif } if (ret == 0) { @@ -1300,6 +1335,9 @@ static int Hash512_DRBG_Generate(DRBG_SHA512_internal* drbg, byte* out, #endif byte type; word64 reseedCtr; +#ifdef WOLFSSL_WIDE_BYTE + byte ctrBuf[8]; /* reseed counter as big-endian octets */ +#endif if (drbg == NULL) { return DRBG_FAILURE; @@ -1377,11 +1415,17 @@ static int Hash512_DRBG_Generate(DRBG_SHA512_internal* drbg, byte* out, array_add(drbg->V, sizeof(drbg->V), digest, WC_SHA512_DIGEST_SIZE); array_add(drbg->V, sizeof(drbg->V), drbg->C, sizeof(drbg->C)); +#ifdef WOLFSSL_WIDE_BYTE + /* See Hash_DRBG_Generate. */ + BytesFromWordsBE64(ctrBuf, &reseedCtr, 8); + array_add(drbg->V, sizeof(drbg->V), ctrBuf, 8); +#else #ifdef LITTLE_ENDIAN_ORDER reseedCtr = ByteReverseWord64(reseedCtr); #endif array_add(drbg->V, sizeof(drbg->V), (byte*)&reseedCtr, sizeof(reseedCtr)); +#endif ret = DRBG_SUCCESS; } drbg->reseedCtr++; diff --git a/wolfcrypt/src/sha256.c b/wolfcrypt/src/sha256.c index 047c57dade8..992c178b84d 100644 --- a/wolfcrypt/src/sha256.c +++ b/wolfcrypt/src/sha256.c @@ -248,6 +248,9 @@ on the specific device platform. #define SHA256_UPDATE_REV_BYTES(ctx) SHA256_REV_BYTES(ctx) #endif +/* WOLFSSL_WIDE_BYTE octet<->word32 conversion uses the shared big-endian + * helpers WordsFromBytesBE32 / BytesFromWordsBE32 from misc.c. */ + #if !defined(WOLFSSL_PIC32MZ_HASH) && !defined(STM32_HASH_SHA2) && \ (!defined(WOLFSSL_IMX6_CAAM) || defined(NO_IMX6_CAAM_HASH) || \ @@ -1499,10 +1502,16 @@ static WC_INLINE int Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, } #endif + #ifdef WOLFSSL_WIDE_BYTE + /* CHAR_BIT != 8: load 16 big-endian schedule words octet-wise. 
*/ + WordsFromBytesBE32(sha256->buffer, (const byte*)sha256->buffer, + WC_SHA256_BLOCK_SIZE / 4); + #else if (SHA256_UPDATE_REV_BYTES(&sha256->ctx)) { ByteReverseWords(sha256->buffer, sha256->buffer, WC_SHA256_BLOCK_SIZE); } + #endif #if defined(WOLFSSL_USE_ESP32_CRYPT_HASH_HW) && \ !defined(NO_WOLFSSL_ESP32_CRYPT_HASH_SHA256) @@ -1603,9 +1612,14 @@ static WC_INLINE int Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, } #endif + #ifdef WOLFSSL_WIDE_BYTE + WordsFromBytesBE32(local32, (const byte*)local32, + WC_SHA256_BLOCK_SIZE / 4); + #else if (SHA256_UPDATE_REV_BYTES(&sha256->ctx)) { ByteReverseWords(local32, local32, WC_SHA256_BLOCK_SIZE); } + #endif #if defined(WOLFSSL_USE_ESP32_CRYPT_HASH_HW) && \ !defined(NO_WOLFSSL_ESP32_CRYPT_HASH_SHA256) @@ -1705,10 +1719,15 @@ static WC_INLINE int Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, } #endif + #ifdef WOLFSSL_WIDE_BYTE + WordsFromBytesBE32(sha256->buffer, (const byte*)sha256->buffer, + WC_SHA256_BLOCK_SIZE / 4); + #else if (SHA256_UPDATE_REV_BYTES(&sha256->ctx)) { ByteReverseWords(sha256->buffer, sha256->buffer, WC_SHA256_BLOCK_SIZE); } + #endif #if defined(WOLFSSL_USE_ESP32_CRYPT_HASH_HW) && \ !defined(NO_WOLFSSL_ESP32_CRYPT_HASH_SHA256) @@ -1733,7 +1752,8 @@ static WC_INLINE int Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, WC_SHA256_PAD_SIZE - sha256->buffLen); /* put 64 bit length in separate 32 bit parts */ - sha256->hiLen = (sha256->loLen >> (8 * sizeof(sha256->loLen) - 3)) + + sha256->hiLen = (sha256->loLen >> + (CHAR_BIT * sizeof(sha256->loLen) - 3)) + (sha256->hiLen << 3); sha256->loLen = sha256->loLen << 3; @@ -1745,6 +1765,21 @@ static WC_INLINE int Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, #endif /* store lengths */ +#ifdef WOLFSSL_WIDE_BYTE + /* CHAR_BIT != 8: 'local' indexes octet cells, not the word32 layout, so + * place the 64-bit bit-length as 8 big-endian octets, then load all 16 + * big-endian schedule words octet-wise (covers 
W[14]=hiLen,W[15]=loLen). */ + local[WC_SHA256_PAD_SIZE + 0] = (byte)((sha256->hiLen >> 24) & 0xFF); + local[WC_SHA256_PAD_SIZE + 1] = (byte)((sha256->hiLen >> 16) & 0xFF); + local[WC_SHA256_PAD_SIZE + 2] = (byte)((sha256->hiLen >> 8) & 0xFF); + local[WC_SHA256_PAD_SIZE + 3] = (byte)((sha256->hiLen ) & 0xFF); + local[WC_SHA256_PAD_SIZE + 4] = (byte)((sha256->loLen >> 24) & 0xFF); + local[WC_SHA256_PAD_SIZE + 5] = (byte)((sha256->loLen >> 16) & 0xFF); + local[WC_SHA256_PAD_SIZE + 6] = (byte)((sha256->loLen >> 8) & 0xFF); + local[WC_SHA256_PAD_SIZE + 7] = (byte)((sha256->loLen ) & 0xFF); + WordsFromBytesBE32(sha256->buffer, (const byte*)sha256->buffer, + WC_SHA256_BLOCK_SIZE / 4); +#else if (SHA256_UPDATE_REV_BYTES(&sha256->ctx)) { ByteReverseWords(sha256->buffer, sha256->buffer, WC_SHA256_PAD_SIZE); @@ -1753,6 +1788,7 @@ static WC_INLINE int Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, XMEMCPY(&local[WC_SHA256_PAD_SIZE], &sha256->hiLen, sizeof(word32)); XMEMCPY(&local[WC_SHA256_PAD_SIZE + sizeof(word32)], &sha256->loLen, sizeof(word32)); +#endif /* Only the ESP32-C3 with HW enabled may need pad size byte order reversal * depending on HW or SW mode */ @@ -1841,7 +1877,11 @@ static WC_INLINE int Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, return BAD_FUNC_ARG; } - #ifdef LITTLE_ENDIAN_ORDER + #if defined(WOLFSSL_WIDE_BYTE) + /* CHAR_BIT != 8: store digest words as big-endian octets. */ + BytesFromWordsBE32(hash, sha256->digest, WC_SHA256_DIGEST_SIZE); + (void)digest; + #elif defined(LITTLE_ENDIAN_ORDER) if (SHA256_REV_BYTES(&sha256->ctx)) { ByteReverseWords((word32*)digest, (word32*)sha256->digest, WC_SHA256_DIGEST_SIZE); @@ -1889,6 +1929,10 @@ static WC_INLINE int Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, return ret; } + #if defined(WOLFSSL_WIDE_BYTE) + /* CHAR_BIT != 8: store digest words as big-endian octets. 
*/ + BytesFromWordsBE32(hash, sha256->digest, WC_SHA256_DIGEST_SIZE); + #else #if defined(LITTLE_ENDIAN_ORDER) if (SHA256_REV_BYTES(&sha256->ctx)) { ByteReverseWords(sha256->digest, sha256->digest, @@ -1896,6 +1940,7 @@ static WC_INLINE int Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, } #endif XMEMCPY(hash, sha256->digest, WC_SHA256_DIGEST_SIZE); + #endif return InitSha256(sha256); /* reset state */ } @@ -2417,6 +2462,10 @@ static WC_INLINE int Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, if (ret != 0) return ret; + #if defined(WOLFSSL_WIDE_BYTE) + /* CHAR_BIT != 8: store digest words as big-endian octets. */ + BytesFromWordsBE32(hash, sha224->digest, WC_SHA224_DIGEST_SIZE); + #else #if defined(LITTLE_ENDIAN_ORDER) if (SHA256_REV_BYTES(&sha224->ctx)) { ByteReverseWords(sha224->digest, @@ -2425,6 +2474,7 @@ static WC_INLINE int Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, } #endif XMEMCPY(hash, sha224->digest, WC_SHA224_DIGEST_SIZE); + #endif return InitSha224(sha224); /* reset state */ } diff --git a/wolfcrypt/src/sha3.c b/wolfcrypt/src/sha3.c index 33d06928834..a87a1170b9c 100644 --- a/wolfcrypt/src/sha3.c +++ b/wolfcrypt/src/sha3.c @@ -94,6 +94,15 @@ #define WC_SHA3_SW_KECCAK #endif +/* On targets where a C 'byte' is not 8 bits (e.g. TI C28x has CHAR_BIT == 16) + * the word64 Keccak state cannot be aliased as a contiguous octet stream, so + * the little-endian fast paths that xorbuf / memcpy / cast the state as bytes + * are wrong. WC_SHA3_BYTEWISE forces the octet-wise absorb (Load64), pad, and + * squeeze paths instead. 
*/ +#if !defined(WC_SHA3_BYTEWISE) && defined(WOLFSSL_WIDE_BYTE) + #define WC_SHA3_BYTEWISE +#endif + #if FIPS_VERSION3_GE(6,0,0) const unsigned int wolfCrypt_FIPS_sha3_ro_sanity[2] = { 0x1a2b3c4d, 0x00000016 }; @@ -604,7 +613,7 @@ void BlockSha3(word64* s) #endif /* !WOLFSSL_ARMASM && !WOLFSSL_RISCV_ASM */ #ifdef WC_SHA3_SW_KECCAK -#if defined(BIG_ENDIAN_ORDER) +#if defined(BIG_ENDIAN_ORDER) || defined(WC_SHA3_BYTEWISE) static WC_INLINE word64 Load64Unaligned(const unsigned char *a) { return ((word64)a[0] << 0) | @@ -780,7 +789,8 @@ static int Sha3Update(wc_Sha3* sha3, const byte* data, word32 len, byte p) sha3->i = (byte)(sha3->i + i); if (sha3->i == p * 8) { - #if !defined(BIG_ENDIAN_ORDER) && !defined(WC_SHA3_FAULT_HARDEN) + #if !defined(BIG_ENDIAN_ORDER) && !defined(WC_SHA3_FAULT_HARDEN) && \ + !defined(WC_SHA3_BYTEWISE) xorbuf(sha3->s, sha3->t, (word32)(p * 8)); #else for (i = 0; i < p; i++) { @@ -817,7 +827,8 @@ static int Sha3Update(wc_Sha3* sha3, const byte* data, word32 len, byte p) total_check += blocks * p; #endif for (; blocks > 0; blocks--) { -#if !defined(BIG_ENDIAN_ORDER) && !defined(WC_SHA3_FAULT_HARDEN) +#if !defined(BIG_ENDIAN_ORDER) && !defined(WC_SHA3_FAULT_HARDEN) && \ + !defined(WC_SHA3_BYTEWISE) xorbuf(sha3->s, data, (word32)(p * 8)); #else for (i = 0; i < p; i++) { @@ -866,18 +877,34 @@ static int Sha3Update(wc_Sha3* sha3, const byte* data, word32 len, byte p) * len Number of bytes in output. * returns 0 on success. */ +#ifdef WC_SHA3_BYTEWISE +/* Squeeze len output bytes from the Keccak state, extracting each octet from + * the 64-bit lanes (little-endian within a lane). Used where a C 'byte' is + * wider than 8 bits (CHAR_BIT != 8) so the state cannot be copied as an octet + * stream. 
*/ +static void Sha3SqueezeBytes(byte* out, const word64* s, word32 len) +{ + word32 k; + for (k = 0; k < len; k++) { + out[k] = (byte)((s[k >> 3] >> (8 * (k & 7))) & 0xFF); + } +} +#endif + static int Sha3Final(wc_Sha3* sha3, byte padChar, byte* hash, byte p, word32 l) { word32 rate = p * 8U; word32 j; -#if defined(BIG_ENDIAN_ORDER) || defined(WC_SHA3_FAULT_HARDEN) +#if defined(BIG_ENDIAN_ORDER) || defined(WC_SHA3_FAULT_HARDEN) || \ + defined(WC_SHA3_BYTEWISE) word32 i; #endif #ifdef WC_SHA3_FAULT_HARDEN int check = 0; #endif -#if !defined(BIG_ENDIAN_ORDER) && !defined(WC_SHA3_FAULT_HARDEN) +#if !defined(BIG_ENDIAN_ORDER) && !defined(WC_SHA3_FAULT_HARDEN) && \ + !defined(WC_SHA3_BYTEWISE) xorbuf(sha3->s, sha3->t, sha3->i); #ifdef WOLFSSL_HASH_FLAGS if ((p == WC_SHA3_256_COUNT) && (sha3->flags & WC_HASH_SHA3_KECCAK256)) { @@ -924,6 +951,8 @@ static int Sha3Final(wc_Sha3* sha3, byte padChar, byte* hash, byte p, word32 l) #endif #if defined(BIG_ENDIAN_ORDER) ByteReverseWords64((word64*)(hash + j), sha3->s, rate); + #elif defined(WC_SHA3_BYTEWISE) + Sha3SqueezeBytes(hash + j, sha3->s, rate); #else XMEMCPY(hash + j, sha3->s, rate); #endif @@ -936,8 +965,12 @@ static int Sha3Final(wc_Sha3* sha3, byte padChar, byte* hash, byte p, word32 l) #endif #if defined(BIG_ENDIAN_ORDER) ByteReverseWords64(sha3->s, sha3->s, rate); - #endif XMEMCPY(hash + j, sha3->s, l - j); + #elif defined(WC_SHA3_BYTEWISE) + Sha3SqueezeBytes(hash + j, sha3->s, l - j); + #else + XMEMCPY(hash + j, sha3->s, l - j); + #endif } #if defined(WOLFSSL_USE_SAVE_VECTOR_REGISTERS) && defined(USE_INTEL_SPEEDUP) if (SHA3_BLOCK == sha3_block_avx2) { @@ -1916,6 +1949,8 @@ int wc_Shake128_SqueezeBlocks(wc_Shake* shake, byte* out, word32 blockCnt) #endif #if defined(BIG_ENDIAN_ORDER) ByteReverseWords64((word64*)out, shake->s, WC_SHA3_128_COUNT * 8); + #elif defined(WC_SHA3_BYTEWISE) + Sha3SqueezeBytes(out, shake->s, WC_SHA3_128_COUNT * 8); #else XMEMCPY(out, shake->s, WC_SHA3_128_COUNT * 8); #endif @@ -2157,6 
+2192,8 @@ int wc_Shake256_SqueezeBlocks(wc_Shake* shake, byte* out, word32 blockCnt) #endif #if defined(BIG_ENDIAN_ORDER) ByteReverseWords64((word64*)out, shake->s, WC_SHA3_256_COUNT * 8); + #elif defined(WC_SHA3_BYTEWISE) + Sha3SqueezeBytes(out, shake->s, WC_SHA3_256_COUNT * 8); #else XMEMCPY(out, shake->s, WC_SHA3_256_COUNT * 8); #endif diff --git a/wolfcrypt/src/sha512.c b/wolfcrypt/src/sha512.c index b2f57b13b86..7312fd8eca9 100644 --- a/wolfcrypt/src/sha512.c +++ b/wolfcrypt/src/sha512.c @@ -1751,6 +1751,9 @@ static int _Transform_Sha512(wc_Sha512* sha512) #endif +/* WOLFSSL_WIDE_BYTE octet<->word64 conversion uses the shared big-endian + * helpers WordsFromBytesBE64 / BytesFromWordsBE64 from misc.c. */ + static WC_INLINE void AddLength(wc_Sha512* sha512, word32 len) { word64 tmp = sha512->loLen; @@ -1798,8 +1801,13 @@ static WC_INLINE int Sha512Update(wc_Sha512* sha512, const byte* data, word32 le defined(NO_WOLFSSL_ESP32_CRYPT_HASH) || \ defined(NO_WOLFSSL_ESP32_CRYPT_HASH_SHA512)) && \ !defined(WOLFSSL_ARMASM) + #ifdef WOLFSSL_WIDE_BYTE + WordsFromBytesBE64(sha512->buffer, + (const byte*)sha512->buffer, WC_SHA512_BLOCK_SIZE / 8); + #else ByteReverseWords64(sha512->buffer, sha512->buffer, WC_SHA512_BLOCK_SIZE); + #endif #endif } #endif @@ -1900,8 +1908,13 @@ static WC_INLINE int Sha512Update(wc_Sha512* sha512, const byte* data, word32 le #if !defined(WOLFSSL_ESP32_CRYPT) || \ defined(NO_WOLFSSL_ESP32_CRYPT_HASH) || \ defined(NO_WOLFSSL_ESP32_CRYPT_HASH_SHA512) + #ifdef WOLFSSL_WIDE_BYTE + WordsFromBytesBE64(sha512->buffer, (const byte*)sha512->buffer, + WC_SHA512_BLOCK_SIZE / 8); + #else ByteReverseWords64(sha512->buffer, sha512->buffer, WC_SHA512_BLOCK_SIZE); + #endif #endif #if !defined(WOLFSSL_ESP32_CRYPT) || \ defined(NO_WOLFSSL_ESP32_CRYPT_HASH) || \ @@ -2035,8 +2048,13 @@ static WC_INLINE int Sha512Final(wc_Sha512* sha512) defined(NO_WOLFSSL_ESP32_CRYPT_HASH) || \ defined(NO_WOLFSSL_ESP32_CRYPT_HASH_SHA512)) && \ !defined(WOLFSSL_ARMASM) + #ifdef 
WOLFSSL_WIDE_BYTE + WordsFromBytesBE64(sha512->buffer, (const byte*)sha512->buffer, + WC_SHA512_BLOCK_SIZE / 8); + #else ByteReverseWords64(sha512->buffer,sha512->buffer, WC_SHA512_BLOCK_SIZE); + #endif #endif } @@ -2071,11 +2089,37 @@ static WC_INLINE int Sha512Final(wc_Sha512* sha512) XMEMSET(&local[sha512->buffLen], 0, WC_SHA512_PAD_SIZE - sha512->buffLen); /* put lengths in bits */ - sha512->hiLen = (sha512->loLen >> (8 * sizeof(sha512->loLen) - 3)) + + sha512->hiLen = (sha512->loLen >> + (CHAR_BIT * sizeof(sha512->loLen) - 3)) + (sha512->hiLen << 3); sha512->loLen = sha512->loLen << 3; /* store lengths */ +#ifdef WOLFSSL_WIDE_BYTE + /* CHAR_BIT != 8: 'local' indexes octet cells, not the word64 layout (the + * buffer[BLOCK/sizeof(word64) - 2/-1] indices assume 16 words per block, + * which is wrong when sizeof(word64) != 8). Place the 128-bit bit-length + * as 16 big-endian octets at the pad offset, then load all 16 big-endian + * schedule words octet-wise. */ + local[WC_SHA512_PAD_SIZE + 0] = (byte)((sha512->hiLen >> 56) & 0xFF); + local[WC_SHA512_PAD_SIZE + 1] = (byte)((sha512->hiLen >> 48) & 0xFF); + local[WC_SHA512_PAD_SIZE + 2] = (byte)((sha512->hiLen >> 40) & 0xFF); + local[WC_SHA512_PAD_SIZE + 3] = (byte)((sha512->hiLen >> 32) & 0xFF); + local[WC_SHA512_PAD_SIZE + 4] = (byte)((sha512->hiLen >> 24) & 0xFF); + local[WC_SHA512_PAD_SIZE + 5] = (byte)((sha512->hiLen >> 16) & 0xFF); + local[WC_SHA512_PAD_SIZE + 6] = (byte)((sha512->hiLen >> 8) & 0xFF); + local[WC_SHA512_PAD_SIZE + 7] = (byte)((sha512->hiLen ) & 0xFF); + local[WC_SHA512_PAD_SIZE + 8] = (byte)((sha512->loLen >> 56) & 0xFF); + local[WC_SHA512_PAD_SIZE + 9] = (byte)((sha512->loLen >> 48) & 0xFF); + local[WC_SHA512_PAD_SIZE + 10] = (byte)((sha512->loLen >> 40) & 0xFF); + local[WC_SHA512_PAD_SIZE + 11] = (byte)((sha512->loLen >> 32) & 0xFF); + local[WC_SHA512_PAD_SIZE + 12] = (byte)((sha512->loLen >> 24) & 0xFF); + local[WC_SHA512_PAD_SIZE + 13] = (byte)((sha512->loLen >> 16) & 0xFF); + 
local[WC_SHA512_PAD_SIZE + 14] = (byte)((sha512->loLen >> 8) & 0xFF); + local[WC_SHA512_PAD_SIZE + 15] = (byte)((sha512->loLen ) & 0xFF); + WordsFromBytesBE64(sha512->buffer, (const byte*)sha512->buffer, + WC_SHA512_BLOCK_SIZE / 8); +#else #if defined(LITTLE_ENDIAN_ORDER) #if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) && \ (defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)) @@ -2100,6 +2144,7 @@ static WC_INLINE int Sha512Final(wc_Sha512* sha512) sha512->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 2] = sha512->hiLen; sha512->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 1] = sha512->loLen; #endif +#endif /* WOLFSSL_WIDE_BYTE */ #if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) && \ (defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)) @@ -2151,7 +2196,10 @@ static WC_INLINE int Sha512Final(wc_Sha512* sha512) return ret; #endif - #ifdef LITTLE_ENDIAN_ORDER + /* CHAR_BIT != 8: leave digest in host word order here; the *Final output + * functions convert it to big-endian octets via BytesFromWordsBE64 + * (a word64 array cannot be aliased as octets / reversed in place). 
*/ + #if defined(LITTLE_ENDIAN_ORDER) && !defined(WOLFSSL_WIDE_BYTE) ByteReverseWords64(sha512->digest, sha512->digest, WC_SHA512_DIGEST_SIZE); #endif @@ -2185,7 +2233,9 @@ static int Sha512FinalRaw(wc_Sha512* sha512, byte* hash, word32 digestSz) return BAD_FUNC_ARG; } -#ifdef LITTLE_ENDIAN_ORDER +#if defined(WOLFSSL_WIDE_BYTE) + BytesFromWordsBE64(hash, sha512->digest, digestSz); +#elif defined(LITTLE_ENDIAN_ORDER) if ((digestSz & 0x7) == 0) ByteReverseWords64((word64 *)hash, sha512->digest, digestSz); else { @@ -2238,7 +2288,11 @@ static int Sha512_Family_Final(wc_Sha512* sha512, byte* hash, size_t digestSz, if (ret != 0) return ret; +#ifdef WOLFSSL_WIDE_BYTE + BytesFromWordsBE64(hash, sha512->digest, (word32)digestSz); +#else XMEMCPY(hash, sha512->digest, digestSz); +#endif /* initialize Sha512 structure for the next use */ return initfp(sha512); @@ -2636,7 +2690,9 @@ int wc_Sha384FinalRaw(wc_Sha384* sha384, byte* hash) return BAD_FUNC_ARG; } -#ifdef LITTLE_ENDIAN_ORDER +#if defined(WOLFSSL_WIDE_BYTE) + BytesFromWordsBE64(hash, sha384->digest, WC_SHA384_DIGEST_SIZE); +#elif defined(LITTLE_ENDIAN_ORDER) ByteReverseWords64((word64 *)hash, sha384->digest, WC_SHA384_DIGEST_SIZE); #else XMEMCPY(hash, sha384->digest, WC_SHA384_DIGEST_SIZE); @@ -2677,7 +2733,11 @@ int wc_Sha384Final(wc_Sha384* sha384, byte* hash) if (ret != 0) return ret; +#ifdef WOLFSSL_WIDE_BYTE + BytesFromWordsBE64(hash, sha384->digest, WC_SHA384_DIGEST_SIZE); +#else XMEMCPY(hash, sha384->digest, WC_SHA384_DIGEST_SIZE); +#endif return InitSha384(sha384); /* reset state */ } diff --git a/wolfcrypt/src/wc_port.c b/wolfcrypt/src/wc_port.c index 63c1bb57f54..efec4b8a362 100644 --- a/wolfcrypt/src/wc_port.c +++ b/wolfcrypt/src/wc_port.c @@ -254,7 +254,7 @@ int wc_local_InitUp(wc_init_state_t *s) */ for (;;) { wc_static_assert(WC_INIT_STATE_STATE_BITS + WC_INIT_STATE_COUNT_BITS == - sizeof(WC_ATOMIC_UINT_ARG) * 8); + sizeof(WC_ATOMIC_UINT_ARG) * CHAR_BIT); if (exp_wc_init_state.c.count == 
(((WC_ATOMIC_UINT_ARG)1 << WC_INIT_STATE_COUNT_BITS) - (WC_ATOMIC_UINT_ARG)1)) diff --git a/wolfssl/wolfcrypt/misc.h b/wolfssl/wolfcrypt/misc.h index 0a20345287b..a46c375c579 100644 --- a/wolfssl/wolfcrypt/misc.h +++ b/wolfssl/wolfcrypt/misc.h @@ -56,6 +56,13 @@ word32 ByteReverseWord32(word32 value); WOLFSSL_LOCAL void ByteReverseWords(word32* out, const word32* in, word32 byteCount); +#ifdef WOLFSSL_WIDE_BYTE +WOLFSSL_LOCAL +void WordsFromBytesBE32(word32* w, const byte* b, word32 wordCnt); +WOLFSSL_LOCAL +void BytesFromWordsBE32(byte* b, const word32* w, word32 byteCnt); +#endif + WOLFSSL_LOCAL void XorWordsOut(wolfssl_word** r, const wolfssl_word** a, const wolfssl_word** b, word32 n); @@ -99,6 +106,12 @@ WOLFSSL_LOCAL word64 ByteReverseWord64(word64 value); WOLFSSL_LOCAL void ByteReverseWords64(word64* out, const word64* in, word32 byteCount); +#ifdef WOLFSSL_WIDE_BYTE +WOLFSSL_LOCAL +void WordsFromBytesBE64(word64* w, const byte* b, word32 wordCnt); +WOLFSSL_LOCAL +void BytesFromWordsBE64(byte* b, const word64* w, word32 byteCnt); +#endif #endif /* WORD64_AVAILABLE */ #ifndef WOLFSSL_HAVE_MIN diff --git a/wolfssl/wolfcrypt/settings.h b/wolfssl/wolfcrypt/settings.h index 9f699145847..45657fabbbd 100644 --- a/wolfssl/wolfcrypt/settings.h +++ b/wolfssl/wolfcrypt/settings.h @@ -5157,7 +5157,13 @@ blinding by defining WC_BLINDING_NO_RNG_ACKNOWLEDGE_WEAKNESS." " with static memory (WOLFSSL_STATIC_MEMORY)" #endif #if defined(WC_16BIT_CPU) && \ - (defined(WOLFSSL_SP_MATH) || defined(WOLFSSL_SP_MATH_ALL)) + (defined(WOLFSSL_SP_MATH) || defined(WOLFSSL_SP_MATH_ALL)) && \ + !defined(WOLFSSL_SP_ALLOW_16BIT_CPU) + /* SP math on a 16-bit-int CPU needs an explicitly wider digit + * (SP_WORD_SIZE >= 32, so sp_int_digit is at least 32-bit) and an octet + * byte (or the CHAR_BIT!=8 octet handling). Targets that have set those up + * - e.g. TI C28x with SP_WORD_SIZE 32 - opt in with + * WOLFSSL_SP_ALLOW_16BIT_CPU. 
*/ #error "16-bit build (WC_16BIT_CPU) is not available with SP math" #endif diff --git a/wolfssl/wolfcrypt/sp_int.h b/wolfssl/wolfcrypt/sp_int.h index 31936d40753..435eae92e74 100644 --- a/wolfssl/wolfcrypt/sp_int.h +++ b/wolfssl/wolfcrypt/sp_int.h @@ -58,8 +58,17 @@ extern "C" { typedef unsigned char sp_uint7; typedef char sp_int7; +#elif UCHAR_MAX == 65535 + /* CHAR_BIT == 16 (e.g. TI C28x): the smallest addressable type is 16-bit, + * so there is no native 8-bit type. An "8-bit" SP value is a 16-bit char + * cell holding an octet (0..255); byte I/O masks to an octet (the same + * CHAR_BIT!=8 handling used elsewhere - see WOLFSSL_WIDE_BYTE). */ + #define SP_UCHAR_BITS 16 + + typedef unsigned char sp_uint8; + typedef char sp_int8; #else - #error "Size of unsigned short not detected" + #error "Size of unsigned char not detected" #endif #if USHRT_MAX == 65535 diff --git a/wolfssl/wolfcrypt/types.h b/wolfssl/wolfcrypt/types.h index 3a1efb79556..da4e388ba9f 100644 --- a/wolfssl/wolfcrypt/types.h +++ b/wolfssl/wolfcrypt/types.h @@ -264,6 +264,51 @@ typedef const char wcchar[]; #endif #endif + /* Targets where a C 'byte' (char) is wider than 8 bits - e.g. the TI + * C2000 C28x DSP has CHAR_BIT == 16. On such targets a word cannot be + * aliased as an octet stream (XMEMCPY/ByteReverseWords/(byte) casts assume + * 8-bit bytes), so byte<->word conversions must be done octet-wise with + * shifts. It stays undefined - and all existing fast paths are unchanged - + * on the usual 8-bit-byte targets. + * + * Detection is two-pronged: first by known 16-bit-char toolchains (so it + * works even without <limits.h>), then generically from CHAR_BIT when + * <limits.h> is available (HAVE_LIMITS_H). The TI cl2000 predefines + * __TMS320C28XX__ / __TMS320C2000__ for the C28x (CHAR_BIT == 16); the + * C2800 (cl2800) predefines __TMS320C2800__. 
*/ +#if !defined(WOLFSSL_WIDE_BYTE) && \ + (defined(__TMS320C28XX__) || defined(__TMS320C2000__) || \ + defined(__TMS320C2800__) || defined(__TMS320C5500__) || \ + defined(__TMS320C55X__) || defined(__TMS320C54X__)) + #define WOLFSSL_WIDE_BYTE + /* These TI DSP toolchains all have a 16-bit char. Some octet-wise paths + * gated on WOLFSSL_WIDE_BYTE use CHAR_BIT (rotate width, SHA length + * carry), so provide it when <limits.h> was not included (NO_LIMITS_H / + * no HAVE_LIMITS_H). These targets are unambiguously CHAR_BIT == 16. */ + #ifndef CHAR_BIT + #define CHAR_BIT 16 + #endif +#endif +#if !defined(WOLFSSL_WIDE_BYTE) && defined(CHAR_BIT) && (CHAR_BIT != 8) + #define WOLFSSL_WIDE_BYTE +#endif +/* Guarantee CHAR_BIT is defined so it can be used unconditionally for value-bit + * widths (rotate complements, SHA length carries). When <limits.h> was + * included (above) or a known wide-char toolchain was detected, CHAR_BIT is + * already correct; otherwise the target has the usual 8-bit byte. */ +#ifndef CHAR_BIT + #define CHAR_BIT 8 +#endif + +/* Reduce a value to a single octet (its low 8 bits) stored in a byte. On the + * usual 8-bit-byte targets a (byte) cast already truncates to 8 bits, so the + * mask is a no-op and this is byte-for-byte the historical (byte) cast; on + * WOLFSSL_WIDE_BYTE targets (a C byte is wider than an octet, e.g. 16-bit on + * the TI C28x) the mask is required so a carry or high bits do not leak into + * the stored octet. Used by the byte-packers in SHA-2/3, the Hash-DRBG, base64 + * and ML-DSA. */ +#define WC_OCTET(x) ((byte)((x) & 0xFF)) + #if defined(HAVE___UINT128_T) && !defined(NO_INT128) #ifndef WOLFSSL_UINT128_T_DEFINED #ifdef __SIZEOF_INT128__ @@ -377,7 +422,12 @@ typedef const char wcchar[]; #endif #elif defined(WC_16BIT_CPU) - #ifndef MICROCHIP_PIC24 + /* WC_16BIT_CPU selects 16-bit int (word16=unsigned int, word32=unsigned + * long).
It historically also assumes no native 64-bit type, but targets + * like the TI C2000 C28x are 16-bit-int yet have a 64-bit long long, so + * keep WORD64_AVAILABLE when one exists. */ + #if !defined(MICROCHIP_PIC24) && \ + !(defined(SIZEOF_LONG_LONG) && (SIZEOF_LONG_LONG == 8)) #undef WORD64_AVAILABLE #endif typedef word16 wolfssl_word; diff --git a/wolfssl/wolfcrypt/wc_port.h b/wolfssl/wolfcrypt/wc_port.h index 0bd5f793a23..de571b146cc 100644 --- a/wolfssl/wolfcrypt/wc_port.h +++ b/wolfssl/wolfcrypt/wc_port.h @@ -516,7 +516,14 @@ typedef wolfSSL_Mutex wolfSSL_RwLock; #endif -#ifdef WC_16BIT_CPU +/* The init-state bit field below needs a 32-bit type. Use a wider type when + * int is too narrow (16-bit int targets such as C28x, detected via UINT_MAX), + * not only when WC_16BIT_CPU is set. */ +#ifdef HAVE_LIMITS_H + #include <limits.h> +#endif +#if defined(WC_16BIT_CPU) || \ + (defined(UINT_MAX) && (UINT_MAX < 0xFFFFFFFFUL)) #define WC_ATOMIC_INT_ARG long int #define WC_ATOMIC_UINT_ARG long unsigned int #else From 5522b65804739739e3f2c86905fd446517a13b3e Mon Sep 17 00:00:00 2001 From: David Garske Date: Thu, 18 Jun 2026 09:15:10 -0700 Subject: [PATCH 2/4] mldsa: correct ML-DSA on CHAR_BIT!=8 + add WOLFSSL_MLDSA_VERIFY_SMALLEST_MEM ML-DSA-87 keygen/sign/verify on a 16-bit byte/int CPU (TI C28x), gated and a no-op on normal targets: - Encode/decode integer-promotion fixes: a byte/word16 field promotes to *unsigned* int where int is 16-bit, so '2 - field' was unsigned and a negative coefficient zero-extended into sword32 (e.g. -1 -> 0x0000FFFF); cast the unpacked field to sword32 (eta-2/eta-4/t0 decode).
Bit-packers relied on (byte) truncating to 8 bits; mask with MLDSA_OCT() and cast the <> 0) | (s1 << 3) | (s2 << 6)); - p[1] = (byte)((s2 >> 2) | (s3 << 1) | (s4 << 4) | (s5 << 7)); - p[2] = (byte)((s5 >> 1) | (s6 << 2) | (s7 << 5)); + p[0] = WC_OCTET((s0 >> 0) | (s1 << 3) | (s2 << 6)); + p[1] = WC_OCTET((s2 >> 2) | (s3 << 1) | (s4 << 4) | (s5 << 7)); + p[2] = WC_OCTET((s5 >> 1) | (s6 << 2) | (s7 << 5)); /* Move to next place to encode into. */ p += MLDSA_ETA_2_BITS; } @@ -1159,15 +1160,18 @@ static void mldsa_decode_eta_2_bits_c(const byte* p, sword32* s) * 3 bits to encode each number. * 8 numbers from 3 bytes. (8 * 3 bits = 3 * 8 bits) */ for (j = 0; j < MLDSA_N; j += 8) { - /* Get 3 bits and put in range of -2..2. */ - s[j + 0] = 2 - ((p[0] >> 0) & 0x7 ); - s[j + 1] = 2 - ((p[0] >> 3) & 0x7 ); - s[j + 2] = 2 - ((p[0] >> 6) | ((p[1] << 2) & 0x7)); - s[j + 3] = 2 - ((p[1] >> 1) & 0x7 ); - s[j + 4] = 2 - ((p[1] >> 4) & 0x7 ); - s[j + 5] = 2 - ((p[1] >> 7) | ((p[2] << 1) & 0x7)); - s[j + 6] = 2 - ((p[2] >> 2) & 0x7 ); - s[j + 7] = 2 - ((p[2] >> 5) & 0x7 ); + /* Get 3 bits and put in range of -2..2. + * Cast to signed 32-bit before the subtract: where int is 16-bit a + * byte/word16 field promotes to unsigned, so a negative result would + * zero-extend instead of sign-extend into sword32. */ + s[j + 0] = 2 - (sword32)((p[0] >> 0) & 0x7 ); + s[j + 1] = 2 - (sword32)((p[0] >> 3) & 0x7 ); + s[j + 2] = 2 - (sword32)((p[0] >> 6) | ((p[1] << 2) & 0x7)); + s[j + 3] = 2 - (sword32)((p[1] >> 1) & 0x7 ); + s[j + 4] = 2 - (sword32)((p[1] >> 4) & 0x7 ); + s[j + 5] = 2 - (sword32)((p[1] >> 7) | ((p[2] << 1) & 0x7)); + s[j + 6] = 2 - (sword32)((p[2] >> 2) & 0x7 ); + s[j + 7] = 2 - (sword32)((p[2] >> 5) & 0x7 ); /* Move to next place to decode from. */ p += MLDSA_ETA_2_BITS; } @@ -1221,24 +1225,24 @@ static void mldsa_decode_eta_4_bits_c(const byte* p, sword32* s) * 4 bits to encode each number. * 2 numbers from 1 bytes. 
(2 * 4 bits = 1 * 8 bits) */ for (j = 0; j < MLDSA_N / 2; j++) { - /* Get 4 bits and put in range of -4..4. */ - s[j * 2 + 0] = 4 - (p[j] & 0xf); - s[j * 2 + 1] = 4 - (p[j] >> 4); + /* Get 4 bits and put in range of -4..4. (sword32 cast: see eta-2.) */ + s[j * 2 + 0] = 4 - (sword32)(p[j] & 0xf); + s[j * 2 + 1] = 4 - (sword32)(p[j] >> 4); } #else /* Step 6 or 9. * 4 bits to encode each number. * 8 numbers from 4 bytes. (8 * 4 bits = 4 * 8 bits) */ for (j = 0; j < MLDSA_N / 2; j += 4) { - /* Get 4 bits and put in range of -4..4. */ - s[j * 2 + 0] = 4 - (p[j + 0] & 0xf); - s[j * 2 + 1] = 4 - (p[j + 0] >> 4); - s[j * 2 + 2] = 4 - (p[j + 1] & 0xf); - s[j * 2 + 3] = 4 - (p[j + 1] >> 4); - s[j * 2 + 4] = 4 - (p[j + 2] & 0xf); - s[j * 2 + 5] = 4 - (p[j + 2] >> 4); - s[j * 2 + 6] = 4 - (p[j + 3] & 0xf); - s[j * 2 + 7] = 4 - (p[j + 3] >> 4); + /* Get 4 bits and put in range of -4..4. (sword32 cast: see eta-2.) */ + s[j * 2 + 0] = 4 - (sword32)(p[j + 0] & 0xf); + s[j * 2 + 1] = 4 - (sword32)(p[j + 0] >> 4); + s[j * 2 + 2] = 4 - (sword32)(p[j + 1] & 0xf); + s[j * 2 + 3] = 4 - (sword32)(p[j + 1] >> 4); + s[j * 2 + 4] = 4 - (sword32)(p[j + 2] & 0xf); + s[j * 2 + 5] = 4 - (sword32)(p[j + 2] >> 4); + s[j * 2 + 6] = 4 - (sword32)(p[j + 3] & 0xf); + s[j * 2 + 7] = 4 - (sword32)(p[j + 3] >> 4); } #endif /* WOLFSSL_MLDSA_SMALL */ } @@ -1378,21 +1382,21 @@ static void mldsa_vec_encode_t0_t1_c(const sword32* t, byte d, byte* t0, MLDSA_D); /* Take 8 values of t and take bottom bits and make positive. 
*/ word16 n0_0 = (word16)(MLDSA_D_MAX_HALF - - (t[j + 0] - (n1_0 << MLDSA_D))); + (t[j + 0] - ((sword32)n1_0 << MLDSA_D))); word16 n0_1 = (word16)(MLDSA_D_MAX_HALF - - (t[j + 1] - (n1_1 << MLDSA_D))); + (t[j + 1] - ((sword32)n1_1 << MLDSA_D))); word16 n0_2 = (word16)(MLDSA_D_MAX_HALF - - (t[j + 2] - (n1_2 << MLDSA_D))); + (t[j + 2] - ((sword32)n1_2 << MLDSA_D))); word16 n0_3 = (word16)(MLDSA_D_MAX_HALF - - (t[j + 3] - (n1_3 << MLDSA_D))); + (t[j + 3] - ((sword32)n1_3 << MLDSA_D))); word16 n0_4 = (word16)(MLDSA_D_MAX_HALF - - (t[j + 4] - (n1_4 << MLDSA_D))); + (t[j + 4] - ((sword32)n1_4 << MLDSA_D))); word16 n0_5 = (word16)(MLDSA_D_MAX_HALF - - (t[j + 5] - (n1_5 << MLDSA_D))); + (t[j + 5] - ((sword32)n1_5 << MLDSA_D))); word16 n0_6 = (word16)(MLDSA_D_MAX_HALF - - (t[j + 6] - (n1_6 << MLDSA_D))); + (t[j + 6] - ((sword32)n1_6 << MLDSA_D))); word16 n0_7 = (word16)(MLDSA_D_MAX_HALF - - (t[j + 7] - (n1_7 << MLDSA_D))); + (t[j + 7] - ((sword32)n1_7 << MLDSA_D))); /* 13 bits per number. * 8 numbers become 13 bytes. 
(8 * 13 bits = 13 * 8 bits) */ @@ -1406,20 +1410,20 @@ static void mldsa_vec_encode_t0_t1_c(const sword32* t, byte d, byte* t0, tp[2] = (n0_4 >> 12) | ((word32)n0_5 << 1) | ((word32)n0_6 << 14) | ((word32)n0_7 << 27); #else - t0[ 0] = (byte)( (n0_0 << 0)); - t0[ 1] = (byte)((n0_0 >> 8) | (n0_1 << 5)); - t0[ 2] = (byte)((n0_1 >> 3) ); - t0[ 3] = (byte)((n0_1 >> 11) | (n0_2 << 2)); - t0[ 4] = (byte)((n0_2 >> 6) | (n0_3 << 7)); - t0[ 5] = (byte)((n0_3 >> 1) ); - t0[ 6] = (byte)((n0_3 >> 9) | (n0_4 << 4)); - t0[ 7] = (byte)((n0_4 >> 4) ); - t0[ 8] = (byte)((n0_4 >> 12) | (n0_5 << 1)); - t0[ 9] = (byte)((n0_5 >> 7) | (n0_6 << 6)); - t0[10] = (byte)((n0_6 >> 2) ); - t0[11] = (byte)((n0_6 >> 10) | (n0_7 << 3)); + t0[ 0] = WC_OCTET( (n0_0 << 0)); + t0[ 1] = WC_OCTET((n0_0 >> 8) | (n0_1 << 5)); + t0[ 2] = WC_OCTET((n0_1 >> 3) ); + t0[ 3] = WC_OCTET((n0_1 >> 11) | (n0_2 << 2)); + t0[ 4] = WC_OCTET((n0_2 >> 6) | (n0_3 << 7)); + t0[ 5] = WC_OCTET((n0_3 >> 1) ); + t0[ 6] = WC_OCTET((n0_3 >> 9) | (n0_4 << 4)); + t0[ 7] = WC_OCTET((n0_4 >> 4) ); + t0[ 8] = WC_OCTET((n0_4 >> 12) | (n0_5 << 1)); + t0[ 9] = WC_OCTET((n0_5 >> 7) | (n0_6 << 6)); + t0[10] = WC_OCTET((n0_6 >> 2) ); + t0[11] = WC_OCTET((n0_6 >> 10) | (n0_7 << 3)); #endif - t0[12] = (byte)((n0_7 >> 5) ); + t0[12] = WC_OCTET((n0_7 >> 5) ); /* 10 bits per number. * 8 bytes become 10 bytes. 
(8 * 10 bits = 10 * 8 bits) */ @@ -1430,17 +1434,17 @@ static void mldsa_vec_encode_t0_t1_c(const sword32* t, byte d, byte* t0, tp[1] = (n1_3 >> 2) | ((word32)n1_4 << 8) | ((word32)n1_5 << 18) | ((word32)n1_6 << 28); #else - t1[0] = (byte)( (n1_0 << 0)); - t1[1] = (byte)((n1_0 >> 8) | (n1_1 << 2)); - t1[2] = (byte)((n1_1 >> 6) | (n1_2 << 4)); - t1[3] = (byte)((n1_2 >> 4) | (n1_3 << 6)); - t1[4] = (byte)((n1_3 >> 2) ); - t1[5] = (byte)( (n1_4 << 0)); - t1[6] = (byte)((n1_4 >> 8) | (n1_5 << 2)); - t1[7] = (byte)((n1_5 >> 6) | (n1_6 << 4)); + t1[0] = WC_OCTET( (n1_0 << 0)); + t1[1] = WC_OCTET((n1_0 >> 8) | (n1_1 << 2)); + t1[2] = WC_OCTET((n1_1 >> 6) | (n1_2 << 4)); + t1[3] = WC_OCTET((n1_2 >> 4) | (n1_3 << 6)); + t1[4] = WC_OCTET((n1_3 >> 2) ); + t1[5] = WC_OCTET( (n1_4 << 0)); + t1[6] = WC_OCTET((n1_4 >> 8) | (n1_5 << 2)); + t1[7] = WC_OCTET((n1_5 >> 6) | (n1_6 << 4)); #endif - t1[8] = (byte)((n1_6 >> 4) | (n1_7 << 6)); - t1[9] = (byte)((n1_7 >> 2) ); + t1[8] = WC_OCTET((n1_6 >> 4) | (n1_7 << 6)); + t1[9] = WC_OCTET((n1_7 >> 2) ); /* Move to next place to encode bottom bits to. */ t0 += MLDSA_D; @@ -1526,25 +1530,27 @@ static void mldsa_decode_t0_c(const byte* t0, sword32* t) t[j + 7] = MLDSA_D_MAX_HALF - (sword32) (( t32_2 >> 27 ) | ((word32)t0[12] ) << 5 ); #else - t[j + 0] = MLDSA_D_MAX_HALF - + /* sword32 cast on the unpacked field: see eta-2 decode - the subtract + * must be signed/32-bit so a negative t0 sign-extends correctly. 
*/ + t[j + 0] = MLDSA_D_MAX_HALF - (sword32) ((t0[ 0] ) | (((word16)(t0[ 1] & 0x1f)) << 8)); - t[j + 1] = MLDSA_D_MAX_HALF - + t[j + 1] = MLDSA_D_MAX_HALF - (sword32) ((t0[ 1] >> 5) | (((word16)(t0[ 2] )) << 3) | (((word16)(t0[ 3] & 0x03)) << 11)); - t[j + 2] = MLDSA_D_MAX_HALF - + t[j + 2] = MLDSA_D_MAX_HALF - (sword32) ((t0[ 3] >> 2) | (((word16)(t0[ 4] & 0x7f)) << 6)); - t[j + 3] = MLDSA_D_MAX_HALF - + t[j + 3] = MLDSA_D_MAX_HALF - (sword32) ((t0[ 4] >> 7) | (((word16)(t0[ 5] )) << 1) | (((word16)(t0[ 6] & 0x0f)) << 9)); - t[j + 4] = MLDSA_D_MAX_HALF - + t[j + 4] = MLDSA_D_MAX_HALF - (sword32) ((t0[ 6] >> 4) | (((word16)(t0[ 7] )) << 4) | (((word16)(t0[ 8] & 0x01)) << 12)); - t[j + 5] = MLDSA_D_MAX_HALF - + t[j + 5] = MLDSA_D_MAX_HALF - (sword32) ((t0[ 8] >> 1) | (((word16)(t0[ 9] & 0x3f)) << 7)); - t[j + 6] = MLDSA_D_MAX_HALF - + t[j + 6] = MLDSA_D_MAX_HALF - (sword32) ((t0[ 9] >> 6) | (((word16)(t0[10] )) << 2) | (((word16)(t0[11] & 0x07)) << 10)); - t[j + 7] = MLDSA_D_MAX_HALF - + t[j + 7] = MLDSA_D_MAX_HALF - (sword32) ((t0[11] >> 3) | (((word16)(t0[12] )) << 5)); #endif /* Move to next place to decode from. */ @@ -1771,16 +1777,16 @@ static void mldsa_encode_gamma1_17_bits_c(const sword32* z, byte* s) s32p[1] = (z1 >> 14) | (z2 << 4) | (z3 << 22); #endif #else - s[0] = (byte)( z0 ); - s[1] = (byte)( z0 >> 8 ); - s[2] = (byte)((z0 >> 16) | (z1 << 2)); - s[3] = (byte)( z1 >> 6 ); - s[4] = (byte)((z1 >> 14) | (z2 << 4)); - s[5] = (byte)( z2 >> 4 ); - s[6] = (byte)((z2 >> 12) | (z3 << 6)); - s[7] = (byte)( z3 >> 2 ); -#endif - s[8] = (byte)( z3 >> 10 ); + s[0] = WC_OCTET( z0 ); + s[1] = WC_OCTET( z0 >> 8 ); + s[2] = WC_OCTET((z0 >> 16) | (z1 << 2)); + s[3] = WC_OCTET( z1 >> 6 ); + s[4] = WC_OCTET((z1 >> 14) | (z2 << 4)); + s[5] = WC_OCTET( z2 >> 4 ); + s[6] = WC_OCTET((z2 >> 12) | (z3 << 6)); + s[7] = WC_OCTET( z3 >> 2 ); +#endif + s[8] = WC_OCTET( z3 >> 10 ); /* Move to next place to encode to. 
*/ s += MLDSA_GAMMA1_17_ENC_BITS / 2; } @@ -1842,16 +1848,16 @@ static void mldsa_encode_gamma1_19_bits_c(const sword32* z, byte* s) #endif s16p[4] = (word16)((z3 >> 4) ); #else - s[0] = (byte) z0 ; - s[1] = (byte) (z0 >> 8) ; - s[2] = (byte)((z0 >> 16) | (z1 << 4)); - s[3] = (byte) (z1 >> 4) ; - s[4] = (byte) (z1 >> 12) ; - s[5] = (byte) z2 ; - s[6] = (byte) (z2 >> 8) ; - s[7] = (byte)((z2 >> 16) | (z3 << 4)); - s[8] = (byte) (z3 >> 4) ; - s[9] = (byte) (z3 >> 12) ; + s[0] = WC_OCTET( z0 ); + s[1] = WC_OCTET((z0 >> 8) ); + s[2] = WC_OCTET((z0 >> 16) | (z1 << 4)); + s[3] = WC_OCTET((z1 >> 4) ); + s[4] = WC_OCTET((z1 >> 12) ); + s[5] = WC_OCTET( z2 ); + s[6] = WC_OCTET((z2 >> 8) ); + s[7] = WC_OCTET((z2 >> 16) | (z3 << 4)); + s[8] = WC_OCTET((z3 >> 4) ); + s[9] = WC_OCTET((z3 >> 12) ); #endif /* Move to next place to encode to. */ s += MLDSA_GAMMA1_19_ENC_BITS / 2; @@ -2244,6 +2250,9 @@ static void mldsa_decode_gamma1(const byte* s, int bits, sword32* z) * @param [in] bits Number of bits used in encoding - GAMMA1 bits. * @param [out] z Vector of polynomials. */ +#ifndef WOLFSSL_MLDSA_VERIFY_SMALLEST_MEM +/* The smallest-mem verify streams z one polynomial at a time with + * mldsa_decode_gamma1() directly, so the whole-vector wrapper is unused. */ static void mldsa_vec_decode_gamma1(const byte* x, byte l, int bits, sword32* z) { @@ -2258,6 +2267,7 @@ static void mldsa_vec_decode_gamma1(const byte* x, byte l, int bits, z += MLDSA_N; } } +#endif /* !WOLFSSL_MLDSA_VERIFY_SMALLEST_MEM */ #endif #if !defined(WOLFSSL_MLDSA_NO_SIGN) || !defined(WOLFSSL_MLDSA_NO_VERIFY) @@ -4624,8 +4634,12 @@ static int mldsa_sample_in_ball_ex(int level, wc_Shake* shake256, /* Step 8: Move value from random index to current index. */ c[i] = c[j]; - /* Step 9: Set value at random index to +/- 1. */ - c[j] = 1 - ((((signs[s >> 3]) >> (s & 0x7)) & 0x1) << 1); + /* Step 9: Set value at random index to +/- 1. 
+ * Cast to sword32 before the subtract: where a byte is as wide as int, + * signs[] promotes to unsigned and -1 would widen as 0x0000ffff. + * Matches the USE_INTEL_SPEEDUP path. */ + c[j] = (sword32)1 - + (sword32)((((signs[s >> 3]) >> (s & 0x7)) & 0x1) << 1); /* Next sign bit index. */ s++; } @@ -9771,6 +9785,10 @@ static int mldsa_verify_with_mu(wc_MlDsaKey* key, const byte* mu, byte o; byte* encW1; byte* seed = commit_calc; +#ifdef WOLFSSL_MLDSA_VERIFY_SMALLEST_MEM + /* Bytes of encoded z per polynomial - z is streamed one poly at a time. */ + word32 zStride = (word32)(MLDSA_N / 8) * (word32)(params->gamma1_bits + 1); +#endif /* Ensure the signature is the right size for the parameters. */ if (sigLen != params->sigSz) { @@ -9825,15 +9843,33 @@ static int mldsa_verify_with_mu(wc_MlDsaKey* key, const byte* mu, #endif if (ret == 0) { - /* Step 2: Decode z from signature. */ - mldsa_vec_decode_gamma1(ze, params->l, params->gamma1_bits, z); /* Step 13: Check z is valid - values are low enough. */ hi = ((sword32)1 << params->gamma1_bits) - params->beta; +#ifdef WOLFSSL_MLDSA_VERIFY_SMALLEST_MEM + { + /* Step 2/13: Stream z one polynomial at a time for the range check; + * the per-poly NTT happens inside the matrix loop below. */ + const byte* zp = ze; + unsigned int zi; + + valid = 1; + for (zi = 0; valid && (zi < params->l); zi++) { + mldsa_decode_gamma1(zp, params->gamma1_bits, z); + valid = mldsa_check_low(z, hi); + zp += zStride; + } + } +#else + /* Step 2: Decode z from signature. */ + mldsa_vec_decode_gamma1(ze, params->l, params->gamma1_bits, z); valid = mldsa_vec_check_low(z, params->l, hi); +#endif } if ((ret == 0) && valid) { +#ifndef WOLFSSL_MLDSA_VERIFY_SMALLEST_MEM /* Step 10: NTT(z) */ mldsa_vec_ntt_full(z, params->l); +#endif /* Step 9: Compute c from first 256 bits of commit. 
*/ #ifdef WOLFSSL_MLDSA_VERIFY_NO_MALLOC @@ -9907,6 +9943,14 @@ static int mldsa_verify_with_mu(wc_MlDsaKey* key, const byte* mu, for (s = 0; (ret == 0) && (s < params->l); s++) { /* Put s into buffer to be hashed. */ seed[MLDSA_PUB_SEED_SZ + 0] = (byte)s; + #ifdef WOLFSSL_MLDSA_VERIFY_SMALLEST_MEM + /* Step 2/10: Decode and NTT this z polynomial on demand (z is + * not kept as a whole vector in this mode). */ + mldsa_decode_gamma1(ze + (word32)s * zStride, + params->gamma1_bits, z); + mldsa_ntt_full(z); + zt = z; + #endif /* Step 3: Create polynomial from hashing seed. */ #ifdef WOLFSSL_MLDSA_VERIFY_NO_MALLOC ret = mldsa_rej_ntt_poly_ex(&key->shake, seed, a, key->h); diff --git a/wolfssl/wolfcrypt/dilithium.h b/wolfssl/wolfcrypt/dilithium.h index 123625e53f7..a37b806bff6 100644 --- a/wolfssl/wolfcrypt/dilithium.h +++ b/wolfssl/wolfcrypt/dilithium.h @@ -145,6 +145,24 @@ #define WOLFSSL_MLDSA_VERIFY_SMALL_MEM #endif #endif +#ifdef WOLFSSL_DILITHIUM_VERIFY_SMALLEST_MEM + #ifndef WOLFSSL_MLDSA_VERIFY_SMALLEST_MEM + #define WOLFSSL_MLDSA_VERIFY_SMALLEST_MEM + #endif +#endif +#ifdef WOLFSSL_MLDSA_VERIFY_SMALLEST_MEM + /* Smallest verify RAM: in addition to the small-mem, no-malloc verify path, + * stream the signature's z vector one polynomial at a time instead of + * pinning the whole l-vector in the key. Trades a per-row z decode+NTT for + * roughly (l-1)*256*sizeof(sword32) bytes of state (e.g. ~6 KB for + * ML-DSA-87). Implies the small-mem + no-malloc verify path. */ + #ifndef WOLFSSL_MLDSA_VERIFY_SMALL_MEM + #define WOLFSSL_MLDSA_VERIFY_SMALL_MEM + #endif + #ifndef WOLFSSL_MLDSA_VERIFY_NO_MALLOC + #define WOLFSSL_MLDSA_VERIFY_NO_MALLOC + #endif +#endif #ifdef WOLFSSL_DILITHIUM_MAKE_KEY_SMALL_MEM #ifndef WOLFSSL_MLDSA_MAKE_KEY_SMALL_MEM #define WOLFSSL_MLDSA_MAKE_KEY_SMALL_MEM @@ -288,6 +306,22 @@ #endif #endif +/* On a 16-bit-int CPU (WC_16BIT_CPU) the toolchain synthesizes 32-/64-bit + * multiplies, and some (e.g. 
TI cl2000 for the C2000 C28x) miscompile the + * multiply-based Montgomery reduction in mldsa_mont_red(). The shift-based + * reduction is mathematically identical (q = 2^23 - 2^13 + 1, and the q^-1 + * expansion likewise), avoids the wide multiply, and is also cheaper on such + * targets. Select it by default there; a user can still override either + * macro explicitly. */ +#if defined(WC_16BIT_CPU) + #ifndef MLDSA_MUL_QINV_SLOW + #define MLDSA_MUL_QINV_SLOW + #endif + #ifndef MLDSA_MUL_Q_SLOW + #define MLDSA_MUL_Q_SLOW + #endif +#endif + #endif /* !WOLFSSL_NO_DILITHIUM_LEGACY_GATES */ /* === Derived canonical gates ========================================== */ diff --git a/wolfssl/wolfcrypt/wc_mldsa.h b/wolfssl/wolfcrypt/wc_mldsa.h index 58a94bd771c..deda3e4ef52 100644 --- a/wolfssl/wolfcrypt/wc_mldsa.h +++ b/wolfssl/wolfcrypt/wc_mldsa.h @@ -662,7 +662,11 @@ struct wc_MlDsaKey { #endif #if defined(WOLFSSL_MLDSA_VERIFY_NO_MALLOC) && \ defined(WOLFSSL_MLDSA_VERIFY_SMALL_MEM) +#ifdef WOLFSSL_MLDSA_VERIFY_SMALLEST_MEM + sword32 z[MLDSA_N]; /* one z poly, streamed per use */ +#else sword32 z[MLDSA_MAX_L_VECTOR_COUNT]; +#endif sword32 c[MLDSA_N]; sword32 w[MLDSA_N]; sword32 t1[MLDSA_N]; From 12a3dceeff7eaf4f57a6c477019f6602dfdf357a Mon Sep 17 00:00:00 2001 From: David Garske Date: Thu, 18 Jun 2026 09:15:10 -0700 Subject: [PATCH 3/4] test/bench/ci: CHAR_BIT!=8 test vectors, NO_MALLOC bench, TI C2000 compile CI - test.c: store the SHA/SHAKE large_digest KAT vectors as brace-init byte arrays (clean octets) instead of "\x.." string literals, which a signed-16-bit-char toolchain (cl2000) would sign-extend. - benchmark.c: WOLFSSL_NO_MALLOC mode uses static plain/cipher buffers and skips the key/iv XMALLOC/XFREE (gated; default build unchanged). - scripts/ti-c2000/ + .github/workflows/ti-c2000-compile.yml: a hardware-free cl2000 compile-only CI guard for the CHAR_BIT!=8 wolfCrypt subset. 
--- .github/workflows/ti-c2000-compile.yml | 78 +++++++++ scripts/ti-c2000/compile.sh | 65 ++++++++ scripts/ti-c2000/user_settings.h | 84 ++++++++++ wolfcrypt/benchmark/benchmark.c | 35 +++- wolfcrypt/test/test.c | 216 ++++++++++++++----------- 5 files changed, 382 insertions(+), 96 deletions(-) create mode 100644 .github/workflows/ti-c2000-compile.yml create mode 100755 scripts/ti-c2000/compile.sh create mode 100644 scripts/ti-c2000/user_settings.h diff --git a/.github/workflows/ti-c2000-compile.yml b/.github/workflows/ti-c2000-compile.yml new file mode 100644 index 00000000000..39f1e5c60f0 --- /dev/null +++ b/.github/workflows/ti-c2000-compile.yml @@ -0,0 +1,78 @@ +name: TI C2000 (C28x) compile-only + +# Compile-guard for the TI C2000 C28x port (CHAR_BIT == 16). It builds the +# wolfCrypt subset that carries the CHAR_BIT != 8 gated fixes with the TI cl2000 +# code generation tools - no linking, no C2000Ware, no hardware. Purpose: catch +# compile regressions in the octet/SP/ML-DSA gated paths. On-target run-tests +# live on a hardware-in-the-loop runner (there is no public C28x simulator). + +# START OF COMMON SECTION +on: + # Only build when something that can affect the C28x compile changes, so the + # job (and the CGT it pulls) does not burn runner minutes on unrelated PRs. 
+ push: + branches: [ 'master', 'main', 'release/**' ] + paths: + - 'wolfcrypt/src/**' + - 'wolfssl/wolfcrypt/**' + - 'scripts/ti-c2000/**' + - '.github/workflows/ti-c2000-compile.yml' + pull_request: + types: [opened, synchronize, reopened, ready_for_review] + branches: [ '*' ] + paths: + - 'wolfcrypt/src/**' + - 'wolfssl/wolfcrypt/**' + - 'scripts/ti-c2000/**' + - '.github/workflows/ti-c2000-compile.yml' + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true +# END OF COMMON SECTION + +jobs: + ti_c2000_compile: + name: cl2000 compile-only + if: ${{ github.event_name != 'pull_request' || github.event.pull_request.draft == false }} + runs-on: ubuntu-22.04 + timeout-minutes: 20 + env: + # TI C2000 code generation tools. The CGT is a free download from TI. + # Update CGT_VER / CGT_URL when bumping the toolchain. If the TI URL + # starts requiring a click-through, host the installer as a repo/org + # cache artifact and point CGT_URL at it (the cache step below makes the + # download a once-per-version cost). 
+ CGT_VER: "22.6.2.LTS" + CGT_URL: "https://dr-download.ti.com/software-development/ide-configuration-compiler-or-debugger/MD-LP0nQ4O8eX/22.6.2.LTS/ti_cgt_c2000_22.6.2.LTS_linux-x64_installer.bin" + CGT_DIR: "${{ github.workspace }}/ti-cgt-c2000" + steps: + - uses: actions/checkout@v4 + name: Checkout wolfSSL + + - name: Cache TI C2000 CGT + id: cgt-cache + uses: actions/cache@v4 + with: + path: ${{ env.CGT_DIR }} + key: ti-cgt-c2000-${{ env.CGT_VER }} + + - name: Download + install TI C2000 CGT + if: steps.cgt-cache.outputs.cache-hit != 'true' + run: | + set -e + curl -fsSL "$CGT_URL" -o /tmp/cgt.bin + chmod +x /tmp/cgt.bin + /tmp/cgt.bin --mode unattended --prefix "$CGT_DIR" + + - name: Locate cl2000 + id: find-cl + run: | + CL=$(find "$CGT_DIR" -type f -name cl2000 | head -1) + test -n "$CL" || { echo "cl2000 not found under $CGT_DIR"; exit 1; } + echo "cgt_root=$(dirname "$(dirname "$CL")")" >> "$GITHUB_OUTPUT" + + - name: Compile-only guard + run: | + CGT_ROOT="${{ steps.find-cl.outputs.cgt_root }}" \ + scripts/ti-c2000/compile.sh diff --git a/scripts/ti-c2000/compile.sh b/scripts/ti-c2000/compile.sh new file mode 100755 index 00000000000..0731b6b5e90 --- /dev/null +++ b/scripts/ti-c2000/compile.sh @@ -0,0 +1,65 @@ +#!/bin/sh +# compile.sh - compile-only guard for the TI C2000 (C28x, CHAR_BIT==16) port. +# +# Builds the wolfCrypt subset that compiles under CHAR_BIT==16 with the TI cl2000 +# code generation tools, using scripts/ti-c2000/user_settings.h. No linking, no +# C2000Ware, no hardware: this only catches compile regressions in the +# CHAR_BIT != 8 gated code paths (SHA-2/3/SHAKE, ML-DSA-87 verify, SP-ECC). +# +# Usage: +# CGT_ROOT=/path/to/ti-cgt-c2000_xx.y.z scripts/ti-c2000/compile.sh +# +# CGT_ROOT must point at a TI C2000 codegen install (the dir containing +# bin/cl2000). The CGT is a free download from TI; in CI it is fetched/cached +# by .github/workflows/ti-c2000-compile.yml. 
+ +set -e + +: "${CGT_ROOT:?set CGT_ROOT to the ti-cgt-c2000 install (dir with bin/cl2000)}" + +# Repo root = two levels up from this script. +SELF_DIR=$(cd "$(dirname "$0")" && pwd) +WOLFROOT=$(cd "$SELF_DIR/../.." && pwd) +CL="$CGT_ROOT/bin/cl2000" + +if [ ! -x "$CL" ]; then + echo "ERROR: cl2000 not found/executable at $CL" >&2 + exit 2 +fi + +OUT=$(mktemp -d) +trap 'rm -rf "$OUT"' EXIT + +INCS="-I$CGT_ROOT/include -I$WOLFROOT -I$SELF_DIR" +CFLAGS="-v28 --abi=eabi --float_support=fpu32 --tmu_support=tmu1 -O2 \ + --define=WOLFSSL_USER_SETTINGS --display_error_number --diag_warning=225" + +# wolfCrypt sources to compile-guard under CHAR_BIT==16. This is the set that +# carries the CHAR_BIT != 8 gated fixes (plus their direct deps) - the +# regression surface for this port. hash.c (an unmodified dispatch wrapper) is +# intentionally omitted: its wc_OidGetHash() OID switch needs the fuller ASN/OID +# config of a real build to avoid a 16-bit-int case-label fold, and it is +# covered by the on-target example build, not by this minimal guard. +SRCS="error wc_port memory logging misc coding \ + sha256 sha512 sha3 wc_mldsa random ecc sp_int sp_c32" + +rc=0 +for s in $SRCS; do + printf 'CC %s.c ... ' "$s" + if "$CL" $CFLAGS $INCS --compile_only --skip_assembler \ + --asm_directory="$OUT" --obj_directory="$OUT" \ + "$WOLFROOT/wolfcrypt/src/$s.c" > "$OUT/$s.log" 2>&1; then + echo "ok" + else + echo "FAIL" + cat "$OUT/$s.log" + rc=1 + fi +done + +if [ "$rc" -eq 0 ]; then + echo "TI C2000 compile-only guard: PASS" +else + echo "TI C2000 compile-only guard: FAIL" >&2 +fi +exit "$rc" diff --git a/scripts/ti-c2000/user_settings.h b/scripts/ti-c2000/user_settings.h new file mode 100644 index 00000000000..9d7c535fff4 --- /dev/null +++ b/scripts/ti-c2000/user_settings.h @@ -0,0 +1,84 @@ +/* user_settings.h - minimal wolfCrypt config for the TI C2000 (C28x, + * CHAR_BIT==16) compile-only CI guard. + * + * This is NOT a board config: it has no BSP/device dependencies. 
Its only job + * is to enable the subset of wolfCrypt that exercises the CHAR_BIT != 8 gated + * code paths (SHA-2/3/SHAKE, ML-DSA-87 verify, ECDSA/ECDH P-256 via SP math) + * so that scripts/ti-c2000/compile.sh can compile them with cl2000 and catch + * regressions. cl2000 predefines __TMS320C28XX__, so types.h auto-enables + * WOLFSSL_WIDE_BYTE; we do not set it here. + */ +#ifndef TI_C2000_CI_USER_SETTINGS_H +#define TI_C2000_CI_USER_SETTINGS_H + +#define WOLFCRYPT_ONLY /* crypto only - no TLS (no MD5/SHA1 dep) */ +#define WOLFSSL_GENERAL_ALIGNMENT 2 +#define HAVE_LIMITS_H +#define WOLFSSL_NO_ASM +#define NO_INLINE +#define SINGLE_THREADED +#define NO_FILESYSTEM +#define NO_WOLFSSL_DIR +#define NO_MAIN_DRIVER +#define NO_DEV_RANDOM +#define WOLFSSL_IGNORE_FILE_WARN +#define BENCH_EMBEDDED +#define NO_WOLFSSL_MEMORY +#define WOLFSSL_GENSEED_FORTEST /* dev-only seed; no TRNG on this part */ + +/* Hashes */ +#define WOLFSSL_SHA512 +#define WOLFSSL_SHA384 +#define WOLFSSL_SHA3 +#define WOLFSSL_SHAKE128 +#define WOLFSSL_SHAKE256 + +/* ML-DSA-87 verify (smallest-mem streaming verifier) */ +#define WOLFSSL_HAVE_MLDSA +#define WOLFSSL_NO_ML_DSA_44 +#define WOLFSSL_NO_ML_DSA_65 +#define WOLFSSL_MLDSA_NO_ASN1 +#define WOLFSSL_MLDSA_VERIFY_ONLY +#define WOLFSSL_MLDSA_VERIFY_SMALL_MEM +#define WOLFSSL_MLDSA_VERIFY_NO_MALLOC +#define WOLFSSL_MLDSA_VERIFY_SMALLEST_MEM +#undef WOLFSSL_MLDSA_ALIGNMENT +#define WOLFSSL_MLDSA_ALIGNMENT 16 +#define WOLFSSL_SMALL_STACK + +/* ECDSA / ECDH P-256 via SP single-precision math (sp_c32.c) */ +#define HAVE_ECC +#define ECC_USER_CURVES +#define HAVE_ECC256 +#define HAVE_ECC_VERIFY +#define HAVE_ECC_SIGN +#define HAVE_ECC_DHE +#define ECC_TIMING_RESISTANT +#define WOLFSSL_SP_MATH +#define WOLFSSL_HAVE_SP_ECC +#define WOLFSSL_SP_NO_MALLOC +#define WOLFSSL_SP_SMALL +#define SP_WORD_SIZE 32 +#define WOLFSSL_SP_ALLOW_16BIT_CPU + +/* Off: anything that pulls in big-int/ASN/symmetric not under test here. 
*/ +#define NO_RSA +#define NO_DH +#define NO_DSA +#define NO_ASN +#define NO_CERTS +#define NO_PWDBASED +#define NO_PKCS7 +#define NO_PKCS12 +#define NO_SIG_WRAPPER +#define NO_AES +#define NO_DES3 +#define NO_RC4 +#define NO_MD4 +#define NO_MD5 +#define NO_SHA +#define NO_HMAC +#define NO_ASN_TIME +#define WOLFSSL_USER_CURRTIME + +#endif /* TI_C2000_CI_USER_SETTINGS_H */ diff --git a/wolfcrypt/benchmark/benchmark.c b/wolfcrypt/benchmark/benchmark.c index 214d873bc2f..b0f0b439b84 100644 --- a/wolfcrypt/benchmark/benchmark.c +++ b/wolfcrypt/benchmark/benchmark.c @@ -2357,6 +2357,15 @@ static const char* bench_result_words2[][6] = { static int numBlocks = NUM_BLOCKS; static word32 bench_size = BENCH_SIZE; +#ifdef WOLFSSL_NO_MALLOC + /* No heap: use file-scope static buffers for the core bench buffers. + * Sized to hold bench_buf_size (BENCH_SIZE + BENCH_CIPHER_ADD rounded up + * to the 16-byte AES block size) plus the +16 slack used at the alloc + * site, plus 64 bytes so the buffer can be 64-byte aligned at runtime. */ + #define BENCH_MAX_PAD (BENCH_CIPHER_ADD + 16 + 64) + static THREAD_LS_T XGEN_ALIGN byte bench_plain_buf[BENCH_SIZE + BENCH_MAX_PAD]; + static THREAD_LS_T XGEN_ALIGN byte bench_cipher_buf[BENCH_SIZE + BENCH_MAX_PAD]; +#endif static int base2 = 1; static int digest_stream = 1; #ifndef NO_HMAC @@ -3805,7 +3814,23 @@ static void* benchmarks_do(void* args) if (bench_buf_size % 16) bench_buf_size += 16 - (bench_buf_size % 16); -#ifdef WOLFSSL_AFALG_XILINX_AES +#ifdef WOLFSSL_NO_MALLOC + /* No heap: point at the file-scope static buffers. bench_size can be + * raised at runtime (benchmark_configure(), the -base16/auth size paths, + * or hash-file input), so confirm the requested size still fits the fixed + * static capacity before using them - otherwise the later XMEMSET / crypto + * writes would run past the end of the buffer. 
*/ + if ((unsigned long)bench_buf_size + 16UL > + (unsigned long)sizeof(bench_plain_buf)) { + printf("%sBenchmark size %lu exceeds WOLFSSL_NO_MALLOC static buffer " + "(%lu); rebuild with a larger BENCH_SIZE\n", err_prefix, + (unsigned long)bench_buf_size, + (unsigned long)sizeof(bench_plain_buf)); + goto exit; + } + bench_plain = bench_plain_buf; + bench_cipher = bench_cipher_buf; +#elif defined(WOLFSSL_AFALG_XILINX_AES) bench_plain = (byte*)aligned_alloc(64, (size_t)bench_buf_size + 16); /* native heap */ bench_cipher = (byte*)aligned_alloc(64, (size_t)bench_buf_size + 16); /* native heap */ #else @@ -3918,7 +3943,8 @@ static void* benchmarks_do(void* args) } #endif -#if defined(WOLFSSL_ASYNC_CRYPT) || defined(HAVE_INTEL_QA_SYNC) +#if (defined(WOLFSSL_ASYNC_CRYPT) || defined(HAVE_INTEL_QA_SYNC)) && \ + !defined(WOLFSSL_NO_MALLOC) bench_key = (byte*)XMALLOC(sizeof(bench_key_buf), HEAP_HINT, DYNAMIC_TYPE_WOLF_BIGINT); bench_iv = (byte*)XMALLOC(sizeof(bench_iv_buf), @@ -4744,9 +4770,12 @@ static void* benchmarks_do(void* args) exit: /* free benchmark buffers */ +#ifndef WOLFSSL_NO_MALLOC + /* under WOLFSSL_NO_MALLOC these point at file-scope static buffers */ XFREE(bench_plain, HEAP_HINT, DYNAMIC_TYPE_WOLF_BIGINT); XFREE(bench_cipher, HEAP_HINT, DYNAMIC_TYPE_WOLF_BIGINT); -#ifdef WOLFSSL_ASYNC_CRYPT +#endif +#if defined(WOLFSSL_ASYNC_CRYPT) && !defined(WOLFSSL_NO_MALLOC) XFREE(bench_key, HEAP_HINT, DYNAMIC_TYPE_WOLF_BIGINT); XFREE(bench_iv, HEAP_HINT, DYNAMIC_TYPE_WOLF_BIGINT); #endif diff --git a/wolfcrypt/test/test.c b/wolfcrypt/test/test.c index e95bba5ae5e..85ceb55a946 100644 --- a/wolfcrypt/test/test.c +++ b/wolfcrypt/test/test.c @@ -6432,18 +6432,18 @@ static wc_test_ret_t sha512_224_kat_test(wc_Sha512* sha, wc_Sha512* shaCopy) static wc_test_ret_t sha512_224_large_hash_test(wc_Sha512* sha) { #ifdef HASH_SIZE_LIMIT - static const char* large_digest = - "\x98\x68\xc3\xd9\xb9\xef\x17\x53" - "\x43\x66\x0e\x60\xdf\x29\xf8\xef" - "\x96\xe3\x93\x34\x8c\x6f\xc0\xeb" 
- "\x14\x6c\xcf\x6a"; + static const byte large_digest[] = { + 0x98, 0x68, 0xc3, 0xd9, 0xb9, 0xef, 0x17, 0x53, 0x43, 0x66, 0x0e, 0x60, + 0xdf, 0x29, 0xf8, 0xef, 0x96, 0xe3, 0x93, 0x34, 0x8c, 0x6f, 0xc0, 0xeb, + 0x14, 0x6c, 0xcf, 0x6a, 0x00 + }; int times = 20; #else - static const char* large_digest = - "\x26\x5f\x98\xd1\x76\x49\x71\x4e" - "\x82\xb7\x9d\x52\x32\x67\x9d\x56" - "\x91\xf5\x88\xc3\x05\xbb\x3f\x90" - "\xe2\x4e\x85\x05"; + static const byte large_digest[] = { + 0x26, 0x5f, 0x98, 0xd1, 0x76, 0x49, 0x71, 0x4e, 0x82, 0xb7, 0x9d, 0x52, + 0x32, 0x67, 0x9d, 0x56, 0x91, 0xf5, 0x88, 0xc3, 0x05, 0xbb, 0x3f, 0x90, + 0xe2, 0x4e, 0x85, 0x05, 0x00 + }; int times = 100; #endif byte hash[WC_SHA512_224_DIGEST_SIZE]; @@ -6624,18 +6624,18 @@ static wc_test_ret_t sha512_256_kat_test(wc_Sha512* sha, wc_Sha512* shaCopy) static wc_test_ret_t sha512_256_large_hash_test(wc_Sha512* sha) { #ifdef HASH_SIZE_LIMIT - static const char* large_digest = - "\x49\xcc\xbc\x7a\x93\x0b\x02\xb8" - "\xad\x9a\x46\x51\x00\x1f\x13\x80" - "\x35\x84\x36\xf1\xf2\x3c\xeb\xd8" - "\x41\xd4\x06\x8b\x1d\x19\xad\x72"; + static const byte large_digest[] = { + 0x49, 0xcc, 0xbc, 0x7a, 0x93, 0x0b, 0x02, 0xb8, 0xad, 0x9a, 0x46, 0x51, + 0x00, 0x1f, 0x13, 0x80, 0x35, 0x84, 0x36, 0xf1, 0xf2, 0x3c, 0xeb, 0xd8, + 0x41, 0xd4, 0x06, 0x8b, 0x1d, 0x19, 0xad, 0x72, 0x00 + }; int times = 20; #else - static const char* large_digest = - "\x7a\xe3\x84\x05\xcb\x06\x22\x08" - "\x7e\x2c\x65\x89\x1f\x26\x45\xfd" - "\xad\xbc\x2e\x29\x83\x12\x84\x4b" - "\xf2\xa0\xde\xbe\x06\x11\xd7\x44"; + static const byte large_digest[] = { + 0x7a, 0xe3, 0x84, 0x05, 0xcb, 0x06, 0x22, 0x08, 0x7e, 0x2c, 0x65, 0x89, + 0x1f, 0x26, 0x45, 0xfd, 0xad, 0xbc, 0x2e, 0x29, 0x83, 0x12, 0x84, 0x4b, + 0xf2, 0xa0, 0xde, 0xbe, 0x06, 0x11, 0xd7, 0x44, 0x00 + }; int times = 100; #endif byte hash[WC_SHA512_256_DIGEST_SIZE]; @@ -7000,9 +7000,11 @@ static wc_test_ret_t sha3_224_kat_test(wc_Sha3* sha, wc_Sha3* shaCopy) #ifndef NO_LARGE_HASH_TEST static 
wc_test_ret_t sha3_224_large_hash_test(wc_Sha3* sha) { - static const char* large_digest = - "\x13\xe5\xd3\x98\x7b\x94\xda\x41\x12\xc7\x1e\x92\x3a\x19" - "\x21\x20\x86\x6f\x24\xbf\x0a\x31\xbc\xfd\xd6\x70\x36\xf3"; + static const byte large_digest[] = { + 0x13, 0xe5, 0xd3, 0x98, 0x7b, 0x94, 0xda, 0x41, 0x12, 0xc7, 0x1e, 0x92, + 0x3a, 0x19, 0x21, 0x20, 0x86, 0x6f, 0x24, 0xbf, 0x0a, 0x31, 0xbc, 0xfd, + 0xd6, 0x70, 0x36, 0xf3, 0x00 + }; byte hash[WC_SHA3_224_DIGEST_SIZE]; byte large_input[1024]; wc_test_ret_t ret; @@ -7214,9 +7216,11 @@ static wc_test_ret_t sha3_256_kat_test(wc_Sha3* sha, wc_Sha3* shaCopy) #ifndef NO_LARGE_HASH_TEST static wc_test_ret_t sha3_256_large_hash_test(wc_Sha3* sha) { - static const char* large_digest = - "\xdc\x90\xc0\xb1\x25\xdb\x2c\x34\x81\xa3\xff\xbc\x1e\x2e\x87\xeb" - "\x6d\x70\x85\x61\xe0\xe9\x63\x61\xff\xe5\x84\x4b\x1f\x68\x05\x15"; + static const byte large_digest[] = { + 0xdc, 0x90, 0xc0, 0xb1, 0x25, 0xdb, 0x2c, 0x34, 0x81, 0xa3, 0xff, 0xbc, + 0x1e, 0x2e, 0x87, 0xeb, 0x6d, 0x70, 0x85, 0x61, 0xe0, 0xe9, 0x63, 0x61, + 0xff, 0xe5, 0x84, 0x4b, 0x1f, 0x68, 0x05, 0x15, 0x00 + }; byte hash[WC_SHA3_256_DIGEST_SIZE]; byte large_input[1024]; wc_test_ret_t ret; @@ -7431,10 +7435,12 @@ static wc_test_ret_t sha3_384_kat_test(wc_Sha3* sha, wc_Sha3* shaCopy) #ifndef NO_LARGE_HASH_TEST static wc_test_ret_t sha3_384_large_hash_test(wc_Sha3* sha) { - static const char* large_digest = - "\x30\x44\xec\x17\xef\x47\x9f\x55\x36\x11\xd6\x3f\x8a\x31\x5a\x71" - "\x8a\x71\xa7\x1d\x8e\x84\xe8\x6c\x24\x02\x2f\x7a\x08\x4e\xea\xd7" - "\x42\x36\x5d\xa8\xc2\xb7\x42\xad\xec\x19\xfb\xca\xc6\x64\xb3\xa4"; + static const byte large_digest[] = { + 0x30, 0x44, 0xec, 0x17, 0xef, 0x47, 0x9f, 0x55, 0x36, 0x11, 0xd6, 0x3f, + 0x8a, 0x31, 0x5a, 0x71, 0x8a, 0x71, 0xa7, 0x1d, 0x8e, 0x84, 0xe8, 0x6c, + 0x24, 0x02, 0x2f, 0x7a, 0x08, 0x4e, 0xea, 0xd7, 0x42, 0x36, 0x5d, 0xa8, + 0xc2, 0xb7, 0x42, 0xad, 0xec, 0x19, 0xfb, 0xca, 0xc6, 0x64, 0xb3, 0xa4, 0x00 + }; byte 
hash[WC_SHA3_384_DIGEST_SIZE]; byte large_input[1024]; wc_test_ret_t ret; @@ -7618,11 +7624,14 @@ static wc_test_ret_t sha3_512_kat_test(wc_Sha3* sha, wc_Sha3* shaCopy) #ifndef NO_LARGE_HASH_TEST static wc_test_ret_t sha3_512_large_hash_test(wc_Sha3* sha) { - static const char* large_digest = - "\x9c\x13\x26\xb6\x26\xb2\x94\x31\xbc\xf4\x34\xe9\x6f\xf2\xd6\x29" - "\x9a\xd0\x9b\x32\x63\x2f\x18\xa7\x5f\x23\xc9\x60\xc2\x32\x0c\xbc" - "\x57\x77\x33\xf1\x83\x81\x8a\xd3\x15\x7c\x93\xdc\x80\x9f\xed\x61" - "\x41\xa7\x5b\xfd\x32\x0e\x38\x15\xb0\x46\x3b\x7a\x4f\xfd\x44\x88"; + static const byte large_digest[] = { + 0x9c, 0x13, 0x26, 0xb6, 0x26, 0xb2, 0x94, 0x31, 0xbc, 0xf4, 0x34, 0xe9, + 0x6f, 0xf2, 0xd6, 0x29, 0x9a, 0xd0, 0x9b, 0x32, 0x63, 0x2f, 0x18, 0xa7, + 0x5f, 0x23, 0xc9, 0x60, 0xc2, 0x32, 0x0c, 0xbc, 0x57, 0x77, 0x33, 0xf1, + 0x83, 0x81, 0x8a, 0xd3, 0x15, 0x7c, 0x93, 0xdc, 0x80, 0x9f, 0xed, 0x61, + 0x41, 0xa7, 0x5b, 0xfd, 0x32, 0x0e, 0x38, 0x15, 0xb0, 0x46, 0x3b, 0x7a, + 0x4f, 0xfd, 0x44, 0x88, 0x00 + }; byte hash[WC_SHA3_512_DIGEST_SIZE]; byte large_input[1024]; wc_test_ret_t ret; @@ -7768,28 +7777,36 @@ static wc_test_ret_t shake128_absorb_test(wc_Shake* sha, byte *large_input_buf, { wc_test_ret_t ret = 0; int i; - static const char large_digest[] = - "\x2b\xd1\x69\x9f\xb3\x75\x40\x74\xb8\xb2\xd2\x0b\x92\x47\x9b\xfe" - "\xc9\x91\x48\xbe\xda\xa4\x09\xd7\x61\x35\x18\x05\x07\x71\xa5\x61" - "\x4d\xc4\x94\xad\xbe\x04\x7d\xad\x95\x2f\xeb\x2c\xc0\x10\x67\x43" - "\x40\xf1\x4a\x58\x1c\x54\xfa\x24\x1c\x1a\x4e\x8d\x9b\xbc\xea\xa7" - "\x32\xf2\x4c\xc7\x86\x05\x36\xdc\xb4\x42\xd8\x35\xd1\xb4\xa2\x79" - "\xa2\xe6\xee\x67\x4f\xbf\x2a\x93\x41\x88\x25\x56\x29\x90\x1a\x06" - "\xba\xfe\x9f\xa6\x1a\x74\xe8\x7e\x85\x4a\xc8\x58\x60\xb1\x7b\x18" - "\xdf\x77\x59\x46\x04\xc1\xff\x4b\x9b\xcb\xad\xfe\x91\x28\xf0\x01" - "\xc1\x33\xd0\x99\x99\x2e\x0c\x86\x84\x67\x4d\x37\xa4\x42\x45\x10" - "\xdc\x8f\xdb\x6f\xa6\x9b\xee\x8a\x60\xa5\x1f\x95\x3f\x8f\xf5\x31" - 
"\x4b\x1d\x48\x1e\x45\xff\x79\x5c\xbe\x72\xfc\x56\xed\x6d\x1a\x99" - "\x7f\x23\x7c\xd1\xa5\x50\x9e\xb0\x4d\x61\x37\xa5\xcb\x24\x71\x3b" - "\xa3\x60\x51\x2e\x80\x83\x8b\xe0\x55\x50\xa7\x1e\xcc\x9f\xac\x41" - "\x77\x2c\x79\x22\x30\x09\x1b\x1a\x83\x5b\x2c\x48\xdc\x09\x7d\x59" - "\x0d\xf0\x54\x17\xfb\x5e\x38\x68\xde\xdb\xc5\x93\xab\x17\x5f\x4b" - "\x4d\x6d\xf2\xc7\x4e\x15\x1e\x10\x76\xc4\xcb\x87\xd8\xb7\x9d\xa8" - "\xbf\xc5\x2e\x5e\xfc\xd3\x6c\x45\xd4\x5d\x72\x0f\x66\xeb\x67\x86" - "\xfa\x6c\xd6\x80\xa4\x23\xcb\x5d\xed\x3c\xde\xdc\x5b\x3d\xca\x95" - "\x43\x4b\xdc\xe8\x49\xd3\xe1\x01\xd4\xf1\xe4\x47\xcf\x56\xba\x71" - "\xb4\x69\xed\xe7\xdb\x0f\x89\xd6\xbb\xcd\x1a\xff\xb4\xbe\x72\x26" - "\xdc\x76\x79\xb3\x1a\x4b\xe6\x8d\x9b\x8e\xd9\xe9\xe6\xf9\xff\xa5"; + static const byte large_digest[] = { + 0x2b, 0xd1, 0x69, 0x9f, 0xb3, 0x75, 0x40, 0x74, 0xb8, 0xb2, 0xd2, 0x0b, + 0x92, 0x47, 0x9b, 0xfe, 0xc9, 0x91, 0x48, 0xbe, 0xda, 0xa4, 0x09, 0xd7, + 0x61, 0x35, 0x18, 0x05, 0x07, 0x71, 0xa5, 0x61, 0x4d, 0xc4, 0x94, 0xad, + 0xbe, 0x04, 0x7d, 0xad, 0x95, 0x2f, 0xeb, 0x2c, 0xc0, 0x10, 0x67, 0x43, + 0x40, 0xf1, 0x4a, 0x58, 0x1c, 0x54, 0xfa, 0x24, 0x1c, 0x1a, 0x4e, 0x8d, + 0x9b, 0xbc, 0xea, 0xa7, 0x32, 0xf2, 0x4c, 0xc7, 0x86, 0x05, 0x36, 0xdc, + 0xb4, 0x42, 0xd8, 0x35, 0xd1, 0xb4, 0xa2, 0x79, 0xa2, 0xe6, 0xee, 0x67, + 0x4f, 0xbf, 0x2a, 0x93, 0x41, 0x88, 0x25, 0x56, 0x29, 0x90, 0x1a, 0x06, + 0xba, 0xfe, 0x9f, 0xa6, 0x1a, 0x74, 0xe8, 0x7e, 0x85, 0x4a, 0xc8, 0x58, + 0x60, 0xb1, 0x7b, 0x18, 0xdf, 0x77, 0x59, 0x46, 0x04, 0xc1, 0xff, 0x4b, + 0x9b, 0xcb, 0xad, 0xfe, 0x91, 0x28, 0xf0, 0x01, 0xc1, 0x33, 0xd0, 0x99, + 0x99, 0x2e, 0x0c, 0x86, 0x84, 0x67, 0x4d, 0x37, 0xa4, 0x42, 0x45, 0x10, + 0xdc, 0x8f, 0xdb, 0x6f, 0xa6, 0x9b, 0xee, 0x8a, 0x60, 0xa5, 0x1f, 0x95, + 0x3f, 0x8f, 0xf5, 0x31, 0x4b, 0x1d, 0x48, 0x1e, 0x45, 0xff, 0x79, 0x5c, + 0xbe, 0x72, 0xfc, 0x56, 0xed, 0x6d, 0x1a, 0x99, 0x7f, 0x23, 0x7c, 0xd1, + 0xa5, 0x50, 0x9e, 0xb0, 0x4d, 0x61, 0x37, 0xa5, 0xcb, 0x24, 0x71, 0x3b, + 0xa3, 0x60, 
0x51, 0x2e, 0x80, 0x83, 0x8b, 0xe0, 0x55, 0x50, 0xa7, 0x1e, + 0xcc, 0x9f, 0xac, 0x41, 0x77, 0x2c, 0x79, 0x22, 0x30, 0x09, 0x1b, 0x1a, + 0x83, 0x5b, 0x2c, 0x48, 0xdc, 0x09, 0x7d, 0x59, 0x0d, 0xf0, 0x54, 0x17, + 0xfb, 0x5e, 0x38, 0x68, 0xde, 0xdb, 0xc5, 0x93, 0xab, 0x17, 0x5f, 0x4b, + 0x4d, 0x6d, 0xf2, 0xc7, 0x4e, 0x15, 0x1e, 0x10, 0x76, 0xc4, 0xcb, 0x87, + 0xd8, 0xb7, 0x9d, 0xa8, 0xbf, 0xc5, 0x2e, 0x5e, 0xfc, 0xd3, 0x6c, 0x45, + 0xd4, 0x5d, 0x72, 0x0f, 0x66, 0xeb, 0x67, 0x86, 0xfa, 0x6c, 0xd6, 0x80, + 0xa4, 0x23, 0xcb, 0x5d, 0xed, 0x3c, 0xde, 0xdc, 0x5b, 0x3d, 0xca, 0x95, + 0x43, 0x4b, 0xdc, 0xe8, 0x49, 0xd3, 0xe1, 0x01, 0xd4, 0xf1, 0xe4, 0x47, + 0xcf, 0x56, 0xba, 0x71, 0xb4, 0x69, 0xed, 0xe7, 0xdb, 0x0f, 0x89, 0xd6, + 0xbb, 0xcd, 0x1a, 0xff, 0xb4, 0xbe, 0x72, 0x26, 0xdc, 0x76, 0x79, 0xb3, + 0x1a, 0x4b, 0xe6, 0x8d, 0x9b, 0x8e, 0xd9, 0xe9, 0xe6, 0xf9, 0xff, 0xa5, 0x00 + }; byte hash[sizeof(large_digest) - 1]; testVector a, b, c, d, e; testVector test_sha[5]; @@ -7955,15 +7972,18 @@ WOLFSSL_TEST_SUBROUTINE wc_test_ret_t shake128_test(void) #else byte large_input[SHAKE128_LARGE_INPUT_BUFSIZ]; #endif - static const char large_digest[] = - "\x88\xd7\x0e\x86\x46\x72\x6b\x3d\x7d\x22\xe1\xa9\x2d\x02\xdb\x35" - "\x92\x4f\x1b\x03\x90\xee\xa3\xce\xd1\x3a\x08\x3a\xd7\x4e\x10\xdf" - "\x09\x67\x33\x35\x4f\xdd\x38\x50\x5b\xcb\x75\xc7\xba\x65\xe5\xe8" - "\xb8\x76\xde\xc5\xee\xd7\xf1\x65\x93\x4e\x5e\xc4\xb1\xd7\x6b\xee" - "\x4b\x57\x48\xf5\x38\x49\x9e\x45\xa0\xf7\x32\xe9\x05\x26\x6a\x10" - "\x70\xd4\x7c\x19\x01\x1f\x6d\x37\xba\x7b\x74\xc2\xbc\xb6\xbc\x74" - "\xa3\x66\x6c\x9b\x11\x84\x9d\x4a\x36\xbc\x8a\x0d\x4c\xe3\x39\xfa" - "\xfa\x1b"; + static const byte large_digest[] = { + 0x88, 0xd7, 0x0e, 0x86, 0x46, 0x72, 0x6b, 0x3d, 0x7d, 0x22, 0xe1, 0xa9, + 0x2d, 0x02, 0xdb, 0x35, 0x92, 0x4f, 0x1b, 0x03, 0x90, 0xee, 0xa3, 0xce, + 0xd1, 0x3a, 0x08, 0x3a, 0xd7, 0x4e, 0x10, 0xdf, 0x09, 0x67, 0x33, 0x35, + 0x4f, 0xdd, 0x38, 0x50, 0x5b, 0xcb, 0x75, 0xc7, 0xba, 0x65, 0xe5, 0xe8, + 0xb8, 0x76, 
0xde, 0xc5, 0xee, 0xd7, 0xf1, 0x65, 0x93, 0x4e, 0x5e, 0xc4, + 0xb1, 0xd7, 0x6b, 0xee, 0x4b, 0x57, 0x48, 0xf5, 0x38, 0x49, 0x9e, 0x45, + 0xa0, 0xf7, 0x32, 0xe9, 0x05, 0x26, 0x6a, 0x10, 0x70, 0xd4, 0x7c, 0x19, + 0x01, 0x1f, 0x6d, 0x37, 0xba, 0x7b, 0x74, 0xc2, 0xbc, 0xb6, 0xbc, 0x74, + 0xa3, 0x66, 0x6c, 0x9b, 0x11, 0x84, 0x9d, 0x4a, 0x36, 0xbc, 0x8a, 0x0d, + 0x4c, 0xe3, 0x39, 0xfa, 0xfa, 0x1b, 0x00 + }; /* ** https://csrc.nist.gov/CSRC/media/Projects/Cryptographic-Standards-and-Guidelines/documents/examples/SHAKE128_Msg0.pdf ** d/e: NIST CAVP - full rate / multi-block output. @@ -8149,24 +8169,31 @@ static wc_test_ret_t shake256_absorb_test(wc_Shake* sha, byte *large_input_buf, { wc_test_ret_t ret = 0; int i; - static const char large_digest[] = - "\x21\x25\x8e\xae\x6e\x4f\xa7\xe1\xb9\x6d\xa7\xc9\x7d\x46\x03\x69" - "\x29\x0d\x81\x49\xba\x5d\xaf\x37\xfd\xeb\x25\x52\x1d\xd9\xbd\x65" - "\xfa\x99\xb9\xd1\x70\x6b\xeb\xd4\xc1\x2c\xea\x24\x20\x27\xa7\xcd" - "\xfa\xe1\x81\xd9\xd5\xc1\x1c\xc7\xe9\x70\xc3\xc7\x21\x6f\x32\x22" - "\xe3\x27\xdb\x58\x5e\xea\x18\x2d\x63\x4d\x14\x6c\x94\xcf\x2b\x7e" - "\x6e\x2a\x74\xf3\xe0\xac\xb3\xb2\xcc\xef\x38\xe9\xe7\x35\xb3\xc5" - "\x77\x9d\xff\xe3\x08\x8e\xf8\x2c\x89\xbb\x45\x22\x16\x99\x91\xc0" - "\xe7\x71\x57\x75\xc5\xb1\xc6\xaf\x27\xcb\x64\x8c\xc4\xee\x3d\x5f" - "\x4c\x35\xfb\x1c\xf3\xf8\x0e\xfd\x5e\xfc\x07\xd8\x4d\x55\x32\x49" - "\x45\x0d\xab\x4a\x49\xc4\x83\xde\xd2\x50\xc9\x33\x8f\x85\xcd\x93" - "\x7a\xe6\x6b\xb4\x36\xf3\xb4\x02\x6e\x85\x9f\xda\x1c\xa5\x71\x43" - "\x2f\x3b\xfc\x09\xe7\xc0\x3c\xa4\xd1\x83\xb7\x41\x11\x1c\xa0\x48" - "\x3d\x0e\xda\xbc\x03\xfe\xb2\x3b\x17\xee\x48\xe8\x44\xba\x24\x08" - "\xd9\xdc\xfd\x01\x39\xd2\xe8\xc7\x31\x01\x25\xae\xe8\x01\xc6\x1a" - "\xb7\x90\x0d\x1e\xfc\x47\xc0\x78\x28\x17\x66\xf3\x61\xc5\xe6\x11" - "\x13\x46\x23\x5e\x1d\xc3\x83\x25\x66\x6c\x68\x1b\x30\xdd\xc4\xe6" - "\x83\x8b\x0f\x23\x58\x7e\x06\x5f\x4a\x2b\xed\xc9\x6c\x97\x68\x44"; + static const byte large_digest[] = { + 0x21, 0x25, 0x8e, 0xae, 0x6e, 
0x4f, 0xa7, 0xe1, 0xb9, 0x6d, 0xa7, 0xc9, + 0x7d, 0x46, 0x03, 0x69, 0x29, 0x0d, 0x81, 0x49, 0xba, 0x5d, 0xaf, 0x37, + 0xfd, 0xeb, 0x25, 0x52, 0x1d, 0xd9, 0xbd, 0x65, 0xfa, 0x99, 0xb9, 0xd1, + 0x70, 0x6b, 0xeb, 0xd4, 0xc1, 0x2c, 0xea, 0x24, 0x20, 0x27, 0xa7, 0xcd, + 0xfa, 0xe1, 0x81, 0xd9, 0xd5, 0xc1, 0x1c, 0xc7, 0xe9, 0x70, 0xc3, 0xc7, + 0x21, 0x6f, 0x32, 0x22, 0xe3, 0x27, 0xdb, 0x58, 0x5e, 0xea, 0x18, 0x2d, + 0x63, 0x4d, 0x14, 0x6c, 0x94, 0xcf, 0x2b, 0x7e, 0x6e, 0x2a, 0x74, 0xf3, + 0xe0, 0xac, 0xb3, 0xb2, 0xcc, 0xef, 0x38, 0xe9, 0xe7, 0x35, 0xb3, 0xc5, + 0x77, 0x9d, 0xff, 0xe3, 0x08, 0x8e, 0xf8, 0x2c, 0x89, 0xbb, 0x45, 0x22, + 0x16, 0x99, 0x91, 0xc0, 0xe7, 0x71, 0x57, 0x75, 0xc5, 0xb1, 0xc6, 0xaf, + 0x27, 0xcb, 0x64, 0x8c, 0xc4, 0xee, 0x3d, 0x5f, 0x4c, 0x35, 0xfb, 0x1c, + 0xf3, 0xf8, 0x0e, 0xfd, 0x5e, 0xfc, 0x07, 0xd8, 0x4d, 0x55, 0x32, 0x49, + 0x45, 0x0d, 0xab, 0x4a, 0x49, 0xc4, 0x83, 0xde, 0xd2, 0x50, 0xc9, 0x33, + 0x8f, 0x85, 0xcd, 0x93, 0x7a, 0xe6, 0x6b, 0xb4, 0x36, 0xf3, 0xb4, 0x02, + 0x6e, 0x85, 0x9f, 0xda, 0x1c, 0xa5, 0x71, 0x43, 0x2f, 0x3b, 0xfc, 0x09, + 0xe7, 0xc0, 0x3c, 0xa4, 0xd1, 0x83, 0xb7, 0x41, 0x11, 0x1c, 0xa0, 0x48, + 0x3d, 0x0e, 0xda, 0xbc, 0x03, 0xfe, 0xb2, 0x3b, 0x17, 0xee, 0x48, 0xe8, + 0x44, 0xba, 0x24, 0x08, 0xd9, 0xdc, 0xfd, 0x01, 0x39, 0xd2, 0xe8, 0xc7, + 0x31, 0x01, 0x25, 0xae, 0xe8, 0x01, 0xc6, 0x1a, 0xb7, 0x90, 0x0d, 0x1e, + 0xfc, 0x47, 0xc0, 0x78, 0x28, 0x17, 0x66, 0xf3, 0x61, 0xc5, 0xe6, 0x11, + 0x13, 0x46, 0x23, 0x5e, 0x1d, 0xc3, 0x83, 0x25, 0x66, 0x6c, 0x68, 0x1b, + 0x30, 0xdd, 0xc4, 0xe6, 0x83, 0x8b, 0x0f, 0x23, 0x58, 0x7e, 0x06, 0x5f, + 0x4a, 0x2b, 0xed, 0xc9, 0x6c, 0x97, 0x68, 0x44, 0x00 + }; byte hash[sizeof(large_digest) - 1]; testVector a, b, c, d, e; testVector test_sha[5]; @@ -8320,15 +8347,18 @@ WOLFSSL_TEST_SUBROUTINE wc_test_ret_t shake256_test(void) #else byte large_input[SHAKE256_LARGE_INPUT_BUFSIZ]; #endif - static const char large_digest[] = - "\x90\x32\x4a\xcc\xd1\xdf\xb8\x0b\x79\x1f\xb8\xc8\x5b\x54\xc8\xe7" - 
"\x45\xf5\x60\x6b\x38\x26\xb2\x0a\xee\x38\x01\xf3\xd9\xfa\x96\x9f" - "\x6a\xd7\x15\xdf\xb6\xc2\xf4\x20\x33\x44\x55\xe8\x2a\x09\x2b\x68" - "\x2e\x18\x65\x5e\x65\x93\x28\xbc\xb1\x9e\xe2\xb1\x92\xea\x98\xac" - "\x21\xef\x4c\xe1\xb4\xb7\xbe\x81\x5c\x1d\xd3\xb7\x17\xe5\xbb\xc5" - "\x8c\x68\xb7\xfb\xac\x55\x8a\x9b\x4d\x91\xe4\x9f\x72\xbb\x6e\x38" - "\xaf\x21\x7d\x21\xaa\x98\x4e\x75\xc4\xb4\x1c\x7c\x50\x45\x54\xf9" - "\xea\x26"; + static const byte large_digest[] = { + 0x90, 0x32, 0x4a, 0xcc, 0xd1, 0xdf, 0xb8, 0x0b, 0x79, 0x1f, 0xb8, 0xc8, + 0x5b, 0x54, 0xc8, 0xe7, 0x45, 0xf5, 0x60, 0x6b, 0x38, 0x26, 0xb2, 0x0a, + 0xee, 0x38, 0x01, 0xf3, 0xd9, 0xfa, 0x96, 0x9f, 0x6a, 0xd7, 0x15, 0xdf, + 0xb6, 0xc2, 0xf4, 0x20, 0x33, 0x44, 0x55, 0xe8, 0x2a, 0x09, 0x2b, 0x68, + 0x2e, 0x18, 0x65, 0x5e, 0x65, 0x93, 0x28, 0xbc, 0xb1, 0x9e, 0xe2, 0xb1, + 0x92, 0xea, 0x98, 0xac, 0x21, 0xef, 0x4c, 0xe1, 0xb4, 0xb7, 0xbe, 0x81, + 0x5c, 0x1d, 0xd3, 0xb7, 0x17, 0xe5, 0xbb, 0xc5, 0x8c, 0x68, 0xb7, 0xfb, + 0xac, 0x55, 0x8a, 0x9b, 0x4d, 0x91, 0xe4, 0x9f, 0x72, 0xbb, 0x6e, 0x38, + 0xaf, 0x21, 0x7d, 0x21, 0xaa, 0x98, 0x4e, 0x75, 0xc4, 0xb4, 0x1c, 0x7c, + 0x50, 0x45, 0x54, 0xf9, 0xea, 0x26, 0x00 + }; /* ** https://csrc.nist.gov/CSRC/media/Projects/Cryptographic-Standards-and-Guidelines/documents/examples/SHAKE256_Msg0.pdf ** d/e: NIST CAVP - full rate / multi-block output. From 20e4053a9fa00b6e216185f2157ee9512c2ceb17 Mon Sep 17 00:00:00 2001 From: David Garske Date: Thu, 18 Jun 2026 11:44:21 -0700 Subject: [PATCH 4/4] dilithium: faster Montgomery q^-1 via 64-bit-widened multiply on 16-bit CPUs The TI cl2000 (C2000 C28x) compiler miscompiles the 32x32->32 low multiply used for the q^-1 step of mldsa_mont_red() - verified on a TMS320F28P550SJ, the ML-DSA-87 verify KAT fails (res=0) - but compiles the 32x64->64 widening multiply correctly. 
Compute the q^-1 product through the 64-bit path (MLDSA_MUL_QINV_WIDE64): correct on any conforming compiler and, on the C28x, ~4% faster than the shift-based reduction (305 vs 317 ms/op for ML-DSA-87 verify). dilithium.h auto-selects it for WC_16BIT_CPU and leaves the q multiply enabled (it compiles correctly); a user can still force the shift form with MLDSA_MUL_QINV_SLOW / MLDSA_MUL_Q_SLOW. Validated on hardware for keygen+sign+verify (round-trip res=1). No effect on 8-bit/>=32-bit-int builds. --- wolfcrypt/src/wc_mldsa.c | 6 ++++++ wolfssl/wolfcrypt/dilithium.h | 20 ++++++++------------ 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/wolfcrypt/src/wc_mldsa.c b/wolfcrypt/src/wc_mldsa.c index 62538dc0e88..3464ab49d0e 100644 --- a/wolfcrypt/src/wc_mldsa.c +++ b/wolfcrypt/src/wc_mldsa.c @@ -5594,7 +5594,13 @@ static void mldsa_vec_use_hint(sword32* w1, byte k, sword32 gamma2, static sword32 mldsa_mont_red(sword64 a) { #ifndef MLDSA_MUL_QINV_SLOW +#ifdef MLDSA_MUL_QINV_WIDE64 + /* Low 32 bits of the q^-1 product via the 64-bit multiply; see + * MLDSA_MUL_QINV_WIDE64 in dilithium.h. */ + sword64 t = (sword32)((sword64)(sword32)a * (sword64)MLDSA_QINV); +#else sword64 t = (sword32)((sword32)a * (sword32)MLDSA_QINV); +#endif #else sword64 t = (sword32)((sword32)a + (sword32)((sword32)a << 13) - (sword32)((sword32)a << 23) + (sword32)((sword32)a << 26)); diff --git a/wolfssl/wolfcrypt/dilithium.h b/wolfssl/wolfcrypt/dilithium.h index a37b806bff6..2a344a711a4 100644 --- a/wolfssl/wolfcrypt/dilithium.h +++ b/wolfssl/wolfcrypt/dilithium.h @@ -306,19 +306,15 @@ #endif #endif -/* On a 16-bit-int CPU (WC_16BIT_CPU) the toolchain synthesizes 32-/64-bit - * multiplies, and some (e.g. TI cl2000 for the C2000 C28x) miscompile the - * multiply-based Montgomery reduction in mldsa_mont_red(). 
The shift-based - * reduction is mathematically identical (q = 2^23 - 2^13 + 1, and the q^-1 - * expansion likewise), avoids the wide multiply, and is also cheaper on such - * targets. Select it by default there; a user can still override either - * macro explicitly. */ +/* MLDSA_MUL_QINV_WIDE64: compute the q^-1 step of mldsa_mont_red() through the + * 32x64->64 widening multiply instead of a 32x32->32 low multiply. Some 16-bit + * toolchains miscompile the 32x32->32 form (e.g. TI cl2000 on the C28x); the + * widening form is correct on any conforming compiler and no slower. Default + * it on for WC_16BIT_CPU; a user can force the shift form with + * MLDSA_MUL_QINV_SLOW / MLDSA_MUL_Q_SLOW. */ #if defined(WC_16BIT_CPU) - #ifndef MLDSA_MUL_QINV_SLOW - #define MLDSA_MUL_QINV_SLOW - #endif - #ifndef MLDSA_MUL_Q_SLOW - #define MLDSA_MUL_Q_SLOW + #if !defined(MLDSA_MUL_QINV_SLOW) && !defined(MLDSA_MUL_QINV_WIDE64) + #define MLDSA_MUL_QINV_WIDE64 #endif #endif