Skip to content

Commit da2fd40

Browse files
author
Julian LALU
committed
add utf8 basic validation
1 parent 52f1194 commit da2fd40

File tree

11 files changed

+1310
-45
lines changed

11 files changed

+1310
-45
lines changed

interface/core/character/character.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,13 @@ namespace hud
1010
static constexpr wchar WIDE_NULL_CHARACTER = L'\0';
1111

1212
/** Check whether the character is a 7-bit ASCII character (high bit 0x80 not set). */
13-
static HD_FORCEINLINE constexpr bool is_pure_ascii(const char8 character) noexcept
13+
static HD_FORCEINLINE constexpr bool is_ascii(const char8 character) noexcept
1414
{
1515
return (character & 0x80) == 0;
1616
}
1717

1818
/** Check whether the wide character is a 7-bit ASCII character (no bit set above 0x7F). */
19-
static HD_FORCEINLINE constexpr bool is_pure_ascii(const wchar character) noexcept
19+
static HD_FORCEINLINE constexpr bool is_ascii(const wchar character) noexcept
2020
{
2121
return (character & ~0x7F) == 0;
2222
}

interface/core/string/cstring.h

Lines changed: 112 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
#ifndef HD_INC_CORE_STRING_CSTRING_H
22
#define HD_INC_CORE_STRING_CSTRING_H
33
#include "../character.h"
4+
#include "../memory.h"
45
#include <stdarg.h> // va_start, va_end
5-
// For is_pure_ascii check : https://quick-bench.com/q/P_adhBeQdvHLTBB8EZCtLyrPRsM
6+
// For is_ascii check : https://quick-bench.com/q/P_adhBeQdvHLTBB8EZCtLyrPRsM
67
namespace hud
78
{
89

@@ -28,14 +29,14 @@ namespace hud
2829
* @param string The null-terminated string
2930
* @return false if string is nullptr or contains a non-ASCII character, true otherwise
3031
*/
31-
[[nodiscard]] static HD_FORCEINLINE bool is_pure_ascii(const char8 *string) noexcept
32+
[[nodiscard]] static constexpr bool is_ascii(const char8 *string) noexcept
3233
{
3334
if (string == nullptr) {
3435
return false;
3536
}
3637

3738
while (!character::is_null(*string)) {
38-
if (!character::is_pure_ascii(*string)) {
39+
if (!character::is_ascii(*string)) {
3940
return false;
4041
}
4142
string++;
@@ -48,31 +49,133 @@ namespace hud
4849
* @param string The null-terminated string
4950
* @return true if the string contains only char8, false otherwise
5051
*/
51-
[[nodiscard]] static bool is_pure_ascii(const wchar *string) noexcept
52+
[[nodiscard]] static constexpr bool is_ascii(const wchar *string) noexcept
5253
{
5354
if (string == nullptr) {
5455
return false;
5556
}
5657

5758
while (!character::is_null(*string)) {
58-
if (!character::is_pure_ascii(*string)) {
59+
if (!character::is_ascii(*string)) {
5960
return false;
6061
}
6162
string++;
6263
}
6364
return true;
6465
}
6566

67+
[[nodiscard]] static constexpr bool is_valid_utf8(const char8 *string, usize byte_count) noexcept
68+
{
69+
u64 pos = 0;
70+
u32 code_point = 0;
71+
while (pos < byte_count) {
72+
// Check if the next 16 bytes are ASCII.
73+
u64 next_pos = pos + 16;
74+
if (next_pos <= byte_count) { // if it is safe to read 16 more bytes, check that they are ascii
75+
u64 v1 = hud::memory::unaligned_load64(string + pos);
76+
// std::memcpy(&v1, string + pos, sizeof(u64));
77+
u64 v2 = hud::memory::unaligned_load64(string + pos + sizeof(u64));
78+
// std::memcpy(&v2, string + pos + sizeof(u64), sizeof(u64));
79+
u64 v {v1 | v2};
80+
if ((v & 0x8080808080808080) == 0) {
81+
pos = next_pos;
82+
continue;
83+
}
84+
}
85+
unsigned char byte = string[pos];
86+
87+
while (byte < 0b10000000) {
88+
if (++pos == byte_count) {
89+
return true;
90+
}
91+
byte = string[pos];
92+
}
93+
94+
if ((byte & 0b11100000) == 0b11000000) {
95+
next_pos = pos + 2;
96+
if (next_pos > byte_count) {
97+
return false;
98+
}
99+
if ((string[pos + 1] & 0b11000000) != 0b10000000) {
100+
return false;
101+
}
102+
// range check
103+
code_point = (byte & 0b00011111) << 6 | (string[pos + 1] & 0b00111111);
104+
if ((code_point < 0x80) || (0x7ff < code_point)) {
105+
return false;
106+
}
107+
}
108+
else if ((byte & 0b11110000) == 0b11100000) {
109+
next_pos = pos + 3;
110+
if (next_pos > byte_count) {
111+
return false;
112+
}
113+
if ((string[pos + 1] & 0b11000000) != 0b10000000) {
114+
return false;
115+
}
116+
if ((string[pos + 2] & 0b11000000) != 0b10000000) {
117+
return false;
118+
}
119+
// range check
120+
code_point = (byte & 0b00001111) << 12 | (string[pos + 1] & 0b00111111) << 6 | (string[pos + 2] & 0b00111111);
121+
if ((code_point < 0x800) || (0xffff < code_point) || (0xd7ff < code_point && code_point < 0xe000)) {
122+
return false;
123+
}
124+
}
125+
else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
126+
next_pos = pos + 4;
127+
if (next_pos > byte_count) {
128+
return false;
129+
}
130+
if ((string[pos + 1] & 0b11000000) != 0b10000000) {
131+
return false;
132+
}
133+
if ((string[pos + 2] & 0b11000000) != 0b10000000) {
134+
return false;
135+
}
136+
if ((string[pos + 3] & 0b11000000) != 0b10000000) {
137+
return false;
138+
}
139+
// range check
140+
code_point =
141+
(byte & 0b00000111) << 18 | (string[pos + 1] & 0b00111111) << 12 | (string[pos + 2] & 0b00111111) << 6 | (string[pos + 3] & 0b00111111);
142+
if (code_point <= 0xffff || 0x10ffff < code_point) {
143+
return false;
144+
}
145+
}
146+
else {
147+
// Not a valid lead byte: either a stray continuation byte (10xxxxxx) or an invalid lead byte (11111xxx)
148+
return false;
149+
}
150+
pos = next_pos;
151+
}
152+
return true;
153+
}
154+
66155
/**
67156
* Test whether a null-terminated string contains only ASCII characters, examining at most string_size characters.
68157
* @param string The null-terminated string
69158
* @param string_size Size of the string in characters to test
70159
* @return true if the string contains only char8 and reach null-terminator character or the string_size character.
71160
* false if the string contains non char8 character
72161
*/
73-
[[nodiscard]] static HD_FORCEINLINE bool is_pure_ascii_safe(const char8 *string, usize string_size) noexcept
162+
[[nodiscard]] static constexpr bool is_ascii_safe(const char8 *string, usize string_size) noexcept
74163
{
75-
return string != nullptr;
164+
if (string == nullptr) {
165+
return false;
166+
}
167+
168+
while (string_size-- > 0) {
169+
char8 cur = *string;
170+
if (character::is_null(cur)) {
171+
return true;
172+
}
173+
if (!character::is_ascii(cur)) {
174+
return false;
175+
}
176+
string++;
177+
}
178+
return true;
76179
}
77180

78181
/**
@@ -82,7 +185,7 @@ namespace hud
82185
* @return true if the string contains only char8 and reach null-terminator character or the string_size character.
83186
* false if the string contains non char8 character
84187
*/
85-
[[nodiscard]] static bool is_pure_ascii_safe(const wchar *string, usize string_size) noexcept
188+
[[nodiscard]] static constexpr bool is_ascii_safe(const wchar *string, usize string_size) noexcept
86189
{
87190
if (string == nullptr) {
88191
return false;
@@ -93,7 +196,7 @@ namespace hud
93196
if (character::is_null(cur)) {
94197
return true;
95198
}
96-
if (!character::is_pure_ascii(cur)) {
199+
if (!character::is_ascii(cur)) {
97200
return false;
98201
}
99202
string++;

interface/core/string/cstring_view.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ namespace hud
8383
// Do the rest 1 byte at a time
8484
const char8 *p = ptr_ + i;
8585
while (*p != '\0') {
86-
if (!character::is_pure_ascii(*p)) {
86+
if (!character::is_ascii(*p)) {
8787
return false;
8888
}
8989
p++;

interface/core/string/wstring_view.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ namespace hud
102102
// Do the rest 1 byte at a time
103103
const wchar *p = ptr_ + i;
104104
while (*p != '\0') {
105-
if (!character::is_pure_ascii(*p)) {
105+
if (!character::is_ascii(*p)) {
106106
return false;
107107
}
108108
p++;

src/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
set( src
44
hash/crc32.cpp
55
debugger.cpp
6+
string/cstring.cpp
67
)
78

89
set( interface

src/string/cstring.cpp

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
#include <core/string/cstring.h>
2+
#include <core/memory.h>
3+
namespace hud
4+
{
5+
// [[nodiscard]] constexpr bool cstring::is_valid_utf8(const char8 *string, usize byte_count) noexcept
6+
// {
7+
// u64 pos = 0;
8+
// u32 code_point = 0;
9+
// while (pos < byte_count) {
10+
// // Check if the next 16 bytes are ASCII.
11+
// u64 next_pos = pos + 16;
12+
// if (next_pos <= byte_count) { // if it is safe to read 16 more bytes, check that they are ascii
13+
// u64 v1 = hud::memory::unaligned_load64(string + pos);
14+
// // std::memcpy(&v1, string + pos, sizeof(u64));
15+
// u64 v2 = hud::memory::unaligned_load64(string + pos + sizeof(u64));
16+
// // std::memcpy(&v2, string + pos + sizeof(u64), sizeof(u64));
17+
// u64 v {v1 | v2};
18+
// if ((v & 0x8080808080808080) == 0) {
19+
// pos = next_pos;
20+
// continue;
21+
// }
22+
// }
23+
// unsigned char byte = string[pos];
24+
25+
// while (byte < 0b10000000) {
26+
// if (++pos == byte_count) {
27+
// return true;
28+
// }
29+
// byte = string[pos];
30+
// }
31+
32+
// if ((byte & 0b11100000) == 0b11000000) {
33+
// next_pos = pos + 2;
34+
// if (next_pos > byte_count) {
35+
// return false;
36+
// }
37+
// if ((string[pos + 1] & 0b11000000) != 0b10000000) {
38+
// return false;
39+
// }
40+
// // range check
41+
// code_point = (byte & 0b00011111) << 6 | (string[pos + 1] & 0b00111111);
42+
// if ((code_point < 0x80) || (0x7ff < code_point)) {
43+
// return false;
44+
// }
45+
// }
46+
// else if ((byte & 0b11110000) == 0b11100000) {
47+
// next_pos = pos + 3;
48+
// if (next_pos > byte_count) {
49+
// return false;
50+
// }
51+
// if ((string[pos + 1] & 0b11000000) != 0b10000000) {
52+
// return false;
53+
// }
54+
// if ((string[pos + 2] & 0b11000000) != 0b10000000) {
55+
// return false;
56+
// }
57+
// // range check
58+
// code_point = (byte & 0b00001111) << 12 | (string[pos + 1] & 0b00111111) << 6 | (string[pos + 2] & 0b00111111);
59+
// if ((code_point < 0x800) || (0xffff < code_point) || (0xd7ff < code_point && code_point < 0xe000)) {
60+
// return false;
61+
// }
62+
// }
63+
// else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
64+
// next_pos = pos + 4;
65+
// if (next_pos > byte_count) {
66+
// return false;
67+
// }
68+
// if ((string[pos + 1] & 0b11000000) != 0b10000000) {
69+
// return false;
70+
// }
71+
// if ((string[pos + 2] & 0b11000000) != 0b10000000) {
72+
// return false;
73+
// }
74+
// if ((string[pos + 3] & 0b11000000) != 0b10000000) {
75+
// return false;
76+
// }
77+
// // range check
78+
// code_point =
79+
// (byte & 0b00000111) << 18 | (string[pos + 1] & 0b00111111) << 12 | (string[pos + 2] & 0b00111111) << 6 | (string[pos + 3] & 0b00111111);
80+
// if (code_point <= 0xffff || 0x10ffff < code_point) {
81+
// return false;
82+
// }
83+
// }
84+
// else {
85+
// // we may have a continuation
86+
// return false;
87+
// }
88+
// pos = next_pos;
89+
// }
90+
// return true;
91+
// }
92+
} // namespace hud

src/string/cstring_view.cpp

Lines changed: 0 additions & 1 deletion
This file was deleted.

test/character.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,21 +15,21 @@ static constexpr wchar wide_control[] = {L'\x0', L'\x1', L'\x2', L'\x3', L'\x4',
1515
static constexpr char8 punc[] = {'!', '\"', '#', '$', '%', '&', '\'', '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~'};
1616
static constexpr char8 wide_punc[] = {L'!', L'\"', L'#', L'$', L'%', L'&', L'\'', L'(', L')', L'*', L'+', L',', L'-', L'.', L'/', L':', L';', L'<', L'=', L'>', L'?', L'@', L'[', L'\\', L']', L'^', L'_', L'`', L'{', L'|', L'}', L'~'};
1717

18-
GTEST_TEST(character, is_pure_ascii)
18+
GTEST_TEST(character, is_ascii)
1919
{
2020
// for (char8 cur = 0; cur < hud::char8_max; cur++)
2121
// {
22-
// hud_assert_true(hud::character::is_pure_ascii(cur));
22+
// hud_assert_true(hud::character::is_ascii(cur));
2323
// }
2424
// for (wchar cur = 0; cur < hud::wchar_max; cur++)
2525
// {
2626
// if (cur <= hud::char8_max)
2727
// {
28-
// hud_assert_true(hud::character::is_pure_ascii(cur));
28+
// hud_assert_true(hud::character::is_ascii(cur));
2929
// }
3030
// else
3131
// {
32-
// hud_assert_false(hud::character::is_pure_ascii(cur));
32+
// hud_assert_false(hud::character::is_ascii(cur));
3333
// }
3434
// }
3535
}

0 commit comments

Comments
 (0)