@@ -64,132 +64,6 @@ namespace hud
6464 return true ;
6565 }
6666
67- [[nodiscard]] static constexpr bool is_valid_utf8_generic (const char8 *string, usize byte_count) noexcept
68- {
69- usize pos = 0 ;
70- u32 code_point = 0 ;
71- while (pos < byte_count) {
72- // Optimization step:
73- // If the next 16 bytes are guaranteed to be ASCII (all < 128),
74- // we can skip them all at once instead of checking byte by byte.
75- usize next_pos = pos + 16 ;
76- if (next_pos <= byte_count) { // Make sure we don't read past the buffer
77- u64 v1 = hud::memory::unaligned_load64 (string + pos); // load first 8 bytes
78- u64 v2 = hud::memory::unaligned_load64 (string + pos + sizeof (u64 )); // load next 8 bytes
79- // Bitwise OR combines both 8-byte blocks so we only need a single mask test below.
80- // If any byte in v1 or v2 has its high bit set (>= 0x80, non-ASCII),
81- // the result will also have that bit set. This lets us quickly check
82- // if all 16 bytes are ASCII with one comparison instead of two.
83- u64 v {v1 | v2};
84- if ((v & 0x8080808080808080 ) == 0 ) {
85- pos = next_pos; // all 16 bytes are ASCII → skip them at once
86- continue ;
87- }
88- }
89-
90- // Now process byte by byte
91- unsigned char byte = string[pos];
92-
93- // Consume consecutive ASCII bytes.
94- // This inner loop skips multiple ASCII chars in a row efficiently.
95- while ((byte & 0x80 ) == 0 ) {
96- if (++pos == byte_count) {
97- return true ;
98- }
99- byte = string[pos];
100- }
101-
102- // Case: 2-byte sequence -> 110xxxxx 10xxxxxx
103- // If we catch leading byte 110xxxxx
104- if ((byte & 0b11100000 ) == 0b11000000 ) {
105-
106- // Jump to next supposed code point (after 110xxxxx 10xxxxxx)
107- // If we go too far, then there is no continuous byte 10xxxxxx
108- next_pos = pos + 2 ;
109- if (next_pos > byte_count) {
110- return false ;
111- }
112- // Ensure 1st continuous byte is 10xxxxxx
113- if ((string[pos + 1 ] & 0b11000000 ) != 0b10000000 ) {
114- return false ;
115- }
116- // Read the code point
117- code_point = (byte & 0b00011111 ) << 6 | (string[pos + 1 ] & 0b00111111 );
118- // Ensure code point is [0x80, 0x7FF] aka [U+0080, U+07FF]
119- if ((code_point < 0x80 ) || (0x7ff < code_point)) {
120- return false ;
121- }
122- }
123- // Case: 3-byte sequence -> 1110xxxx 10xxxxxx 10xxxxxx
124- // If we catch leading byte 1110xxxx
125- else if ((byte & 0b11110000 ) == 0b11100000 ) {
126-
127- // Jump to next supposed code point (after 1110xxxx 10xxxxxx 10xxxxxx)
128- // If we go too far, then there is no continuous bytes 10xxxxxx 10xxxxxx
129- next_pos = pos + 3 ;
130- if (next_pos > byte_count) {
131- return false ;
132- }
133- // Ensure 1st continuous byte is 10xxxxxx
134- if ((string[pos + 1 ] & 0b11000000 ) != 0b10000000 ) {
135- return false ;
136- }
137- // Ensure 2nd continuous byte is 10xxxxxx
138- if ((string[pos + 2 ] & 0b11000000 ) != 0b10000000 ) {
139- return false ;
140- }
141- // Read the code point
142- code_point = (byte & 0b00001111 ) << 12 | (string[pos + 1 ] & 0b00111111 ) << 6 | (string[pos + 2 ] & 0b00111111 );
143- // Check code point valid value
144- // - must not be overlong encoding (< 0x800 is invalid)
145- // - must be [0x0800, 0xFFFF] aka [U+0800, U+FFFF]
146- // - must not be in surrogate range [0xD800, 0xDFFF] aka [U+D800, U+DFFF]
147- if ((code_point < 0x800 ) || (0xffff < code_point) || (0xd7ff < code_point && code_point < 0xe000 )) {
148- return false ;
149- }
150- }
151- // Case: 4-byte sequence -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
152- // If we catch leading byte 11110xxx
153- else if ((byte & 0b11111000 ) == 0b11110000 ) {
154- // Jump to next supposed code point (after 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
155- // If we go too far, then there is no continuous bytes 10xxxxxx 10xxxxxx 10xxxxxx
156- next_pos = pos + 4 ;
157- if (next_pos > byte_count) {
158- return false ;
159- }
160- // Ensure 1st continuous byte is 10xxxxxx
161- if ((string[pos + 1 ] & 0b11000000 ) != 0b10000000 ) {
162- return false ;
163- }
164- // Ensure 2nd continuous byte is 10xxxxxx
165- if ((string[pos + 2 ] & 0b11000000 ) != 0b10000000 ) {
166- return false ;
167- }
168- // Ensure 3rd continuous byte is 10xxxxxx
169- if ((string[pos + 3 ] & 0b11000000 ) != 0b10000000 ) {
170- return false ;
171- }
172- // Read the code point
173- code_point = (byte & 0b00000111 ) << 18 | (string[pos + 1 ] & 0b00111111 ) << 12 | (string[pos + 2 ] & 0b00111111 ) << 6 | (string[pos + 3 ] & 0b00111111 );
174- // Check code point valid value
175- // - must be > 0xFFFF (otherwise it's overlong)
176- // - must not exceed Unicode max (0x10FFFF)
177- if (code_point <= 0xffff || 0x10ffff < code_point) {
178- return false ;
179- }
180- }
181- else {
182- // Any other pattern is invalid:
183- // e.g. a continuation byte without a proper leading byte
184- return false ;
185- }
186- // Move to the next character after validating the current one
187- pos = next_pos;
188- }
189- return true ;
190- }
191-
// Declaration only: a static function returning bool, taking no parameters, declared noexcept.
// NOTE(review): presumably the SSE counterpart of is_valid_utf8_generic, yet it takes no string or size arguments — confirm against its definition.
192- [[nodiscard]] static bool is_valid_utf8_sse () noexcept ;
19367 /* *
19468 * Test whether wide null-terminated string contains only pure ansi characters, checking string_size is not bigger than length of the string.
19569 * @param string The null-terminated string
0 commit comments