11#ifndef HD_INC_CORE_STRING_CSTRING_H
22#define HD_INC_CORE_STRING_CSTRING_H
33#include " ../character.h"
4+ #include " ../memory.h"
45#include < stdarg.h> // va_start, va_end
5- // For is_pure_ascii check : https://quick-bench.com/q/P_adhBeQdvHLTBB8EZCtLyrPRsM
6+ // For is_ascii check : https://quick-bench.com/q/P_adhBeQdvHLTBB8EZCtLyrPRsM
67namespace hud
78{
89
@@ -28,14 +29,14 @@ namespace hud
2829 * @param string The null-terminated string
2930 * @return true if every character before the null terminator is ASCII, false otherwise (including when string is nullptr)
3031 */
31- [[nodiscard]] static HD_FORCEINLINE bool is_pure_ascii (const char8 *string) noexcept
32+ [[nodiscard]] static constexpr bool is_ascii (const char8 *string) noexcept
3233 {
3334 if (string == nullptr ) {
3435 return false ;
3536 }
3637
3738 while (!character::is_null (*string)) {
38- if (!character::is_pure_ascii (*string)) {
39+ if (!character::is_ascii (*string)) {
3940 return false ;
4041 }
4142 string++;
@@ -48,31 +49,133 @@ namespace hud
4849 * @param string The null-terminated string
4950 * @return true if the string contains only char8, false otherwise
5051 */
51- [[nodiscard]] static bool is_pure_ascii (const wchar *string) noexcept
52+ [[nodiscard]] static constexpr bool is_ascii (const wchar *string) noexcept
5253 {
5354 if (string == nullptr ) {
5455 return false ;
5556 }
5657
5758 while (!character::is_null (*string)) {
58- if (!character::is_pure_ascii (*string)) {
59+ if (!character::is_ascii (*string)) {
5960 return false ;
6061 }
6162 string++;
6263 }
6364 return true ;
6465 }
6566
67+ [[nodiscard]] static constexpr bool is_valid_utf8 (const char8 *string, usize byte_count) noexcept
68+ {
69+ u64 pos = 0 ;
70+ u32 code_point = 0 ;
71+ while (pos < byte_count) {
72+ // check of the next 16 bytes are ascii.
73+ u64 next_pos = pos + 16 ;
74+ if (next_pos <= byte_count) { // if it is safe to read 16 more bytes, check that they are ascii
75+ u64 v1 = hud::memory::unaligned_load64 (string + pos);
76+ // std::memcpy(&v1, string + pos, sizeof(u64));
77+ u64 v2 = hud::memory::unaligned_load64 (string + pos + sizeof (u64 ));
78+ // std::memcpy(&v2, string + pos + sizeof(u64), sizeof(u64));
79+ u64 v {v1 | v2};
80+ if ((v & 0x8080808080808080 ) == 0 ) {
81+ pos = next_pos;
82+ continue ;
83+ }
84+ }
85+ unsigned char byte = string[pos];
86+
87+ while (byte < 0b10000000 ) {
88+ if (++pos == byte_count) {
89+ return true ;
90+ }
91+ byte = string[pos];
92+ }
93+
94+ if ((byte & 0b11100000 ) == 0b11000000 ) {
95+ next_pos = pos + 2 ;
96+ if (next_pos > byte_count) {
97+ return false ;
98+ }
99+ if ((string[pos + 1 ] & 0b11000000 ) != 0b10000000 ) {
100+ return false ;
101+ }
102+ // range check
103+ code_point = (byte & 0b00011111 ) << 6 | (string[pos + 1 ] & 0b00111111 );
104+ if ((code_point < 0x80 ) || (0x7ff < code_point)) {
105+ return false ;
106+ }
107+ }
108+ else if ((byte & 0b11110000 ) == 0b11100000 ) {
109+ next_pos = pos + 3 ;
110+ if (next_pos > byte_count) {
111+ return false ;
112+ }
113+ if ((string[pos + 1 ] & 0b11000000 ) != 0b10000000 ) {
114+ return false ;
115+ }
116+ if ((string[pos + 2 ] & 0b11000000 ) != 0b10000000 ) {
117+ return false ;
118+ }
119+ // range check
120+ code_point = (byte & 0b00001111 ) << 12 | (string[pos + 1 ] & 0b00111111 ) << 6 | (string[pos + 2 ] & 0b00111111 );
121+ if ((code_point < 0x800 ) || (0xffff < code_point) || (0xd7ff < code_point && code_point < 0xe000 )) {
122+ return false ;
123+ }
124+ }
125+ else if ((byte & 0b11111000 ) == 0b11110000 ) { // 0b11110000
126+ next_pos = pos + 4 ;
127+ if (next_pos > byte_count) {
128+ return false ;
129+ }
130+ if ((string[pos + 1 ] & 0b11000000 ) != 0b10000000 ) {
131+ return false ;
132+ }
133+ if ((string[pos + 2 ] & 0b11000000 ) != 0b10000000 ) {
134+ return false ;
135+ }
136+ if ((string[pos + 3 ] & 0b11000000 ) != 0b10000000 ) {
137+ return false ;
138+ }
139+ // range check
140+ code_point =
141+ (byte & 0b00000111 ) << 18 | (string[pos + 1 ] & 0b00111111 ) << 12 | (string[pos + 2 ] & 0b00111111 ) << 6 | (string[pos + 3 ] & 0b00111111 );
142+ if (code_point <= 0xffff || 0x10ffff < code_point) {
143+ return false ;
144+ }
145+ }
146+ else {
147+ // we may have a continuation
148+ return false ;
149+ }
150+ pos = next_pos;
151+ }
152+ return true ;
153+ }
154+
66155 /* *
67156 * Test whether a null-terminated string contains only ASCII characters, examining at most string_size characters.
68157 * @param string The null-terminated string
69158 * @param string_size Size of the string in characters to test
70159 * @return true if the string contains only char8 and reach null-terminator character or the string_size character.
71160 * false if the string contains non char8 character
72161 */
73- [[nodiscard]] static HD_FORCEINLINE bool is_pure_ascii_safe (const char8 *string, usize string_size) noexcept
162+ [[nodiscard]] static constexpr bool is_ascii_safe (const char8 *string, usize string_size) noexcept
74163 {
75- return string != nullptr ;
164+ if (string == nullptr ) {
165+ return false ;
166+ }
167+
168+ while (string_size-- > 0 ) {
169+ char8 cur = *string;
170+ if (character::is_null (cur)) {
171+ return true ;
172+ }
173+ if (!character::is_ascii (cur)) {
174+ return false ;
175+ }
176+ string++;
177+ }
178+ return true ;
76179 }
77180
78181 /* *
@@ -82,7 +185,7 @@ namespace hud
82185 * @return true if the string contains only char8 and reach null-terminator character or the string_size character.
83186 * false if the string contains non char8 character
84187 */
85- [[nodiscard]] static bool is_pure_ascii_safe (const wchar *string, usize string_size) noexcept
188+ [[nodiscard]] static constexpr bool is_ascii_safe (const wchar *string, usize string_size) noexcept
86189 {
87190 if (string == nullptr ) {
88191 return false ;
@@ -93,7 +196,7 @@ namespace hud
93196 if (character::is_null (cur)) {
94197 return true ;
95198 }
96- if (!character::is_pure_ascii (cur)) {
199+ if (!character::is_ascii (cur)) {
97200 return false ;
98201 }
99202 string++;
0 commit comments