Skip to content

Commit 7d443ff

Browse files
author
Julian LALU
committed
Improve CMake and unicode
1 parent 3ce5993 commit 7d443ff

File tree

15 files changed

+272
-831
lines changed

15 files changed

+272
-831
lines changed

.clang-format

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ AccessModifierOffset: -4
5959
NamespaceIndentation: All
6060

6161
IndentCaseBlocks: true
62-
IndentPPDirectives: BeforeHash
62+
IndentPPDirectives: None
6363
IndentRequiresClause: false
6464
# InsertNewlineAtEOF: true # clang-format 16
6565
InsertTrailingCommas: Wrapped

interface/core/debugger/debugger_linux.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,7 @@ namespace hud::linux
1616
static HD_FORCEINLINE void break_here() noexcept
1717
{
1818
// LCOV_EXCL_START ( We don't cover the code that breaks into the debugger )
19-
if (is_present())
20-
{
19+
if (is_present()) {
2120
// With clang/gcc we can break the debugger on x86 by invoking int3
2221
#if defined(HD_TARGET_X86_FAMILY)
2322

interface/core/string/cstring.h

Lines changed: 0 additions & 126 deletions
Original file line numberDiff line numberDiff line change
@@ -64,132 +64,6 @@ namespace hud
6464
return true;
6565
}
6666

67-
[[nodiscard]] static constexpr bool is_valid_utf8_generic(const char8 *string, usize byte_count) noexcept
68-
{
69-
usize pos = 0;
70-
u32 code_point = 0;
71-
while (pos < byte_count) {
72-
// Optimization step:
73-
// If the next 16 bytes are guaranteed to be ASCII (all < 128),
74-
// we can skip them all at once instead of checking byte by byte.
75-
usize next_pos = pos + 16;
76-
if (next_pos <= byte_count) { // Make sure we don't read past the buffer
77-
u64 v1 = hud::memory::unaligned_load64(string + pos); // load first 8 bytes
78-
u64 v2 = hud::memory::unaligned_load64(string + pos + sizeof(u64)); // load next 8 bytes
79-
// Bitwise OR combines both 8-byte blocks so we only need a single mask test below.
80-
// If any byte in v1 or v2 has its high bit set (>= 0x80, non-ASCII),
81-
// the result will also have that bit set. This lets us quickly check
82-
// if all 16 bytes are ASCII with one comparison instead of two.
83-
u64 v {v1 | v2};
84-
if ((v & 0x8080808080808080) == 0) {
85-
pos = next_pos; // all 16 bytes are ASCII → skip them at once
86-
continue;
87-
}
88-
}
89-
90-
// Now process byte by byte
91-
unsigned char byte = string[pos];
92-
93-
// Consume consecutive ASCII bytes.
94-
// This inner loop skips multiple ASCII chars in a row efficiently.
95-
while ((byte & 0x80) == 0) {
96-
if (++pos == byte_count) {
97-
return true;
98-
}
99-
byte = string[pos];
100-
}
101-
102-
// Case: 2-byte sequence -> 110xxxxx 10xxxxxx
103-
// If we catch leading byte 110xxxxx
104-
if ((byte & 0b11100000) == 0b11000000) {
105-
106-
// Jump to next supposed code point (after 110xxxxx 10xxxxxx)
107-
// If we go too far, then there is no continuous byte 10xxxxxx
108-
next_pos = pos + 2;
109-
if (next_pos > byte_count) {
110-
return false;
111-
}
112-
// Ensure 1st continuous byte is 10xxxxxx
113-
if ((string[pos + 1] & 0b11000000) != 0b10000000) {
114-
return false;
115-
}
116-
// Read the code point
117-
code_point = (byte & 0b00011111) << 6 | (string[pos + 1] & 0b00111111);
118-
// Ensure code point is [0x80, 0x7FF] aka [U+0080, U+07FF]
119-
if ((code_point < 0x80) || (0x7ff < code_point)) {
120-
return false;
121-
}
122-
}
123-
// Case: 3-byte sequence -> 1110xxxx 10xxxxxx 10xxxxxx
124-
// If we catch leading byte 1110xxxx
125-
else if ((byte & 0b11110000) == 0b11100000) {
126-
127-
// Jump to next supposed code point (after 1110xxxx 10xxxxxx 10xxxxxx)
128-
// If we go too far, then there is no continuous bytes 10xxxxxx 10xxxxxx
129-
next_pos = pos + 3;
130-
if (next_pos > byte_count) {
131-
return false;
132-
}
133-
// Ensure 1st continuous byte is 10xxxxxx
134-
if ((string[pos + 1] & 0b11000000) != 0b10000000) {
135-
return false;
136-
}
137-
// Ensure 2nd continuous byte is 10xxxxxx
138-
if ((string[pos + 2] & 0b11000000) != 0b10000000) {
139-
return false;
140-
}
141-
// Read the code point
142-
code_point = (byte & 0b00001111) << 12 | (string[pos + 1] & 0b00111111) << 6 | (string[pos + 2] & 0b00111111);
143-
// Check code point valid value
144-
// - must not be overlong encoding (< 0x800 is invalid)
145-
// - must be [0x0800, 0xFFFF] aka [U+0800, U+FFFF]
146-
// - must not be in surrogate range [0xD800, 0xDFFF] aka [U+D800, U+DFFF]
147-
if ((code_point < 0x800) || (0xffff < code_point) || (0xd7ff < code_point && code_point < 0xe000)) {
148-
return false;
149-
}
150-
}
151-
// Case: 4-byte sequence -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
152-
// If we catch leading byte 11110xxx
153-
else if ((byte & 0b11111000) == 0b11110000) {
154-
// Jump to next supposed code point (after 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
155-
// If we go too far, then there is no continuous bytes 10xxxxxx 10xxxxxx 10xxxxxx
156-
next_pos = pos + 4;
157-
if (next_pos > byte_count) {
158-
return false;
159-
}
160-
// Ensure 1st continuous byte is 10xxxxxx
161-
if ((string[pos + 1] & 0b11000000) != 0b10000000) {
162-
return false;
163-
}
164-
// Ensure 2nd continuous byte is 10xxxxxx
165-
if ((string[pos + 2] & 0b11000000) != 0b10000000) {
166-
return false;
167-
}
168-
// Ensure 3rd continuous byte is 10xxxxxx
169-
if ((string[pos + 3] & 0b11000000) != 0b10000000) {
170-
return false;
171-
}
172-
// Read the code point
173-
code_point = (byte & 0b00000111) << 18 | (string[pos + 1] & 0b00111111) << 12 | (string[pos + 2] & 0b00111111) << 6 | (string[pos + 3] & 0b00111111);
174-
// Check code point valid value
175-
// - must be > 0xFFFF (otherwise it's overlong)
176-
// - must not exceed Unicode max (0x10FFFF)
177-
if (code_point <= 0xffff || 0x10ffff < code_point) {
178-
return false;
179-
}
180-
}
181-
else {
182-
// Any other pattern is invalid:
183-
// e.g. a continuation byte without a proper leading byte
184-
return false;
185-
}
186-
// Move to the next character after validating the current one
187-
pos = next_pos;
188-
}
189-
return true;
190-
}
191-
192-
[[nodiscard]] static bool is_valid_utf8_sse() noexcept;
19367
/**
19468
* Test whether wide null-terminated string contains only pure ansi characters, checking string_size is not bigger than length of the string.
19569
* @param string The null-terminated string

interface/core/string/string.h

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,28 @@ namespace hud
66
{
77
/**
 * String class whose storage is a `hud::vector<char8>`.
 * Default, copy and move construction as well as copy and move assignment are all defaulted;
 * the accessors below simply forward to the underlying vector.
 */
class string
88
{
9+
public:
910
/** Default constructor (defaulted). */
constexpr string() noexcept = default;
1011
/** Copy constructor (defaulted). */
constexpr string(const string &) noexcept = default;
1112
/** Move constructor (defaulted). */
constexpr string(string &&) noexcept = default;
1213
/** Copy assignment (defaulted). */
constexpr string &operator=(const string &) noexcept = default;
1314
/** Move assignment (defaulted). */
constexpr string &operator=(string &&) noexcept = default;
15+
/**
 * Forward to `data_.count()`.
 * NOTE(review): presumably the number of char8 elements currently stored — confirm against hud::vector.
 */
[[nodiscard]] constexpr usize count() const noexcept
16+
{
17+
return data_.count();
18+
}
19+
/**
 * Forward to `data_.max_count()`.
 * NOTE(review): presumably the number of elements the current allocation can hold — confirm against hud::vector.
 */
[[nodiscard]] constexpr usize max_count() const noexcept
20+
{
21+
return data_.max_count();
22+
}
23+
/**
 * Read-only pointer to the underlying storage, as returned by `data_.data()`.
 */
[[nodiscard]] constexpr const char8 *data() const noexcept
24+
{
25+
return data_.data();
26+
}
1427

1528
private:
1629
// Underlying char8 storage of the string.
hud::vector<char8> data_;
1730
};
1831
} // namespace hud
1932

20-
#endif HD_INC_CORE_STRING_STRING_H
33+
#endif // HD_INC_CORE_STRING_STRING_H
Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
#ifndef HD_INC_CORE_STRING_UNICODE_UTF8_H
2+
#define HD_INC_CORE_STRING_UNICODE_UTF8_H
3+
4+
namespace hud::unicode
5+
{
6+
/**
 * Check whether the `byte_count` bytes starting at `string` form well-formed UTF-8.
 * The input is rejected when it contains:
 * - a multi-byte sequence truncated by the end of the buffer,
 * - a continuation byte that is not of the form 10xxxxxx,
 * - an overlong encoding,
 * - a code point in the surrogate range [U+D800, U+DFFF],
 * - a code point above U+10FFFF,
 * - a byte that is not a valid leading byte (e.g. a stray continuation byte).
 * A 0x00 byte is handled like any other ASCII byte and does not stop the scan.
 * An empty input (`byte_count` == 0) is reported as valid.
 * @param string Pointer to the bytes to validate
 * @param byte_count Number of bytes to validate
 * @return true if all `byte_count` bytes are valid UTF-8, false otherwise
 */
[[nodiscard]] static constexpr bool is_valid_utf8_portable(const char8 *string, usize byte_count) noexcept
7+
{
8+
usize pos = 0;
9+
u32 code_point = 0;
10+
while (pos < byte_count) {
11+
// Optimization step:
12+
// If the next 16 bytes are guaranteed to be ASCII (all < 128),
13+
// we can skip them all at once instead of checking byte by byte.
14+
usize next_pos = pos + 16;
15+
if (next_pos <= byte_count) { // Make sure we don't read past the buffer
16+
u64 v1 = hud::memory::unaligned_load64(string + pos); // load first 8 bytes
17+
u64 v2 = hud::memory::unaligned_load64(string + pos + sizeof(u64)); // load next 8 bytes
18+
// Bitwise OR combines both 8-byte blocks so we only need a single mask test below.
19+
// If any byte in v1 or v2 has its high bit set (>= 0x80, non-ASCII),
20+
// the result will also have that bit set. This lets us quickly check
21+
// if all 16 bytes are ASCII with one comparison instead of two.
22+
u64 v {v1 | v2};
23+
// The mask has 0x80 in each of the 8 byte lanes.
if ((v & 0x8080808080808080) == 0) {
24+
pos = next_pos; // all 16 bytes are ASCII → skip them at once
25+
continue;
26+
}
27+
}
28+
29+
// Fewer than 16 bytes remain, or the 16-byte block holds a non-ASCII byte:
// now process byte by byte
30+
unsigned char byte = string[pos];
31+
32+
// Consume consecutive ASCII bytes.
33+
// This inner loop skips multiple ASCII chars in a row efficiently.
34+
while ((byte & 0x80) == 0) {
35+
if (++pos == byte_count) {
36+
// The buffer ends on an ASCII byte and everything before it was already validated.
return true;
37+
}
38+
byte = string[pos];
39+
}
40+
41+
// Case: 2-byte sequence -> 110xxxxx 10xxxxxx
42+
// If we catch leading byte 110xxxxx
43+
if ((byte & 0b11100000) == 0b11000000) {
44+
45+
// Jump to next supposed code point (after 110xxxxx 10xxxxxx)
46+
// If we go too far, then the continuation byte 10xxxxxx is missing
47+
next_pos = pos + 2;
48+
if (next_pos > byte_count) {
49+
return false;
50+
}
51+
// Ensure 1st continuation byte is 10xxxxxx
52+
if ((string[pos + 1] & 0b11000000) != 0b10000000) {
53+
return false;
54+
}
55+
// Read the code point
56+
code_point = (byte & 0b00011111) << 6 | (string[pos + 1] & 0b00111111);
57+
// Ensure code point is [0x80, 0x7FF] aka [U+0080, U+07FF]
// (the lower bound rejects overlong encodings; the upper bound cannot be exceeded
// by 5 + 6 payload bits and is only a defensive check)
58+
if ((code_point < 0x80) || (0x7ff < code_point)) {
59+
return false;
60+
}
61+
}
62+
// Case: 3-byte sequence -> 1110xxxx 10xxxxxx 10xxxxxx
63+
// If we catch leading byte 1110xxxx
64+
else if ((byte & 0b11110000) == 0b11100000) {
65+
66+
// Jump to next supposed code point (after 1110xxxx 10xxxxxx 10xxxxxx)
67+
// If we go too far, then the continuation bytes 10xxxxxx 10xxxxxx are missing
68+
next_pos = pos + 3;
69+
if (next_pos > byte_count) {
70+
return false;
71+
}
72+
// Ensure 1st continuation byte is 10xxxxxx
73+
if ((string[pos + 1] & 0b11000000) != 0b10000000) {
74+
return false;
75+
}
76+
// Ensure 2nd continuation byte is 10xxxxxx
77+
if ((string[pos + 2] & 0b11000000) != 0b10000000) {
78+
return false;
79+
}
80+
// Read the code point
81+
code_point = (byte & 0b00001111) << 12 | (string[pos + 1] & 0b00111111) << 6 | (string[pos + 2] & 0b00111111);
82+
// Check code point valid value
83+
// - must not be overlong encoding (< 0x800 is invalid)
84+
// - must be [0x0800, 0xFFFF] aka [U+0800, U+FFFF]
//   (4 + 6 + 6 payload bits cannot exceed 0xFFFF, so the upper bound is only a defensive check)
85+
// - must not be in surrogate range [0xD800, 0xDFFF] aka [U+D800, U+DFFF]
86+
if ((code_point < 0x800) || (0xffff < code_point) || (0xd7ff < code_point && code_point < 0xe000)) {
87+
return false;
88+
}
89+
}
90+
// Case: 4-byte sequence -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
91+
// If we catch leading byte 11110xxx
92+
else if ((byte & 0b11111000) == 0b11110000) {
93+
// Jump to next supposed code point (after 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
94+
// If we go too far, then the continuation bytes 10xxxxxx 10xxxxxx 10xxxxxx are missing
95+
next_pos = pos + 4;
96+
if (next_pos > byte_count) {
97+
return false;
98+
}
99+
// Ensure 1st continuation byte is 10xxxxxx
100+
if ((string[pos + 1] & 0b11000000) != 0b10000000) {
101+
return false;
102+
}
103+
// Ensure 2nd continuation byte is 10xxxxxx
104+
if ((string[pos + 2] & 0b11000000) != 0b10000000) {
105+
return false;
106+
}
107+
// Ensure 3rd continuation byte is 10xxxxxx
108+
if ((string[pos + 3] & 0b11000000) != 0b10000000) {
109+
return false;
110+
}
111+
// Read the code point
112+
code_point = (byte & 0b00000111) << 18 | (string[pos + 1] & 0b00111111) << 12 | (string[pos + 2] & 0b00111111) << 6 | (string[pos + 3] & 0b00111111);
113+
// Check code point valid value
114+
// - must be > 0xFFFF (otherwise it's overlong)
115+
// - must not exceed Unicode max (0x10FFFF)
116+
if (code_point <= 0xffff || 0x10ffff < code_point) {
117+
return false;
118+
}
119+
}
120+
else {
121+
// Any other pattern is invalid:
122+
// e.g. a continuation byte without a proper leading byte
123+
return false;
124+
}
125+
// Move to the next character after validating the current one
126+
pos = next_pos;
127+
}
128+
return true;
129+
}
130+
} // namespace hud::unicode
131+
132+
#endif // HD_INC_CORE_STRING_UNICODE_UTF8_H

0 commit comments

Comments
 (0)