Skip to content

Commit fb0c2c6

Browse files
Wilco DijkstraSteve Kondik
authored andcommitted
libc: ARM64: improve performance in strlen
Change-Id: Ic20f93a0052a49bd76cd6795f51e8606ccfbf11c
1 parent cac9627 commit fb0c2c6

File tree

1 file changed

+172
-71
lines changed

1 file changed

+172
-71
lines changed
Lines changed: 172 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,16 @@
1-
/* Copyright (c) 2014, Linaro Limited
1+
/* Copyright (c) 2013-2015, Linaro Limited
22
All rights reserved.
33

44
Redistribution and use in source and binary forms, with or without
55
modification, are permitted provided that the following conditions are met:
66
* Redistributions of source code must retain the above copyright
7-
notice, this list of conditions and the following disclaimer.
7+
notice, this list of conditions and the following disclaimer.
88
* Redistributions in binary form must reproduce the above copyright
9-
notice, this list of conditions and the following disclaimer in the
10-
documentation and/or other materials provided with the distribution.
9+
notice, this list of conditions and the following disclaimer in the
10+
documentation and/or other materials provided with the distribution.
1111
* Neither the name of the Linaro nor the
12-
names of its contributors may be used to endorse or promote products
13-
derived from this software without specific prior written permission.
12+
names of its contributors may be used to endorse or promote products
13+
derived from this software without specific prior written permission.
1414

1515
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
1616
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
@@ -22,16 +22,19 @@
2222
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
2323
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
2424
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25-
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26-
*/
25+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
2726

2827
/* Assumptions:
2928
*
30-
* ARMv8-a, AArch64
29+
* ARMv8-a, AArch64, unaligned accesses, min page size 4k.
3130
*/
3231

3332
#include <private/bionic_asm.h>
3433

34+
/* To test the page crossing code path more thoroughly, compile with
35+
-DTEST_PAGE_CROSS - this will force all calls through the slower
36+
entry path. This option is not intended for production use. */
37+
3538
/* Arguments and results. */
3639
#define srcin x0
3740
#define len x0
@@ -40,87 +43,185 @@
4043
#define src x1
4144
#define data1 x2
4245
#define data2 x3
43-
#define data2a x4
44-
#define has_nul1 x5
45-
#define has_nul2 x6
46-
#define tmp1 x7
47-
#define tmp2 x8
48-
#define tmp3 x9
49-
#define tmp4 x10
50-
#define zeroones x11
51-
#define pos x12
46+
#define has_nul1 x4
47+
#define has_nul2 x5
48+
#define tmp1 x4
49+
#define tmp2 x5
50+
#define tmp3 x6
51+
#define tmp4 x7
52+
#define zeroones x8
53+
54+
#define L(l) .L ## l
55+
56+
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
57+
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
58+
can be done in parallel across the entire word. A faster check
59+
(X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
60+
false hits for characters 129..255. */
5261

5362
#define REP8_01 0x0101010101010101
5463
#define REP8_7f 0x7f7f7f7f7f7f7f7f
5564
#define REP8_80 0x8080808080808080
5665

57-
/* Start of critial section -- keep to one 64Byte cache line. */
66+
#ifdef TEST_PAGE_CROSS
67+
# define MIN_PAGE_SIZE 15
68+
#else
69+
# define MIN_PAGE_SIZE 4096
70+
#endif
71+
72+
/* Since strings are short on average, we check the first 16 bytes
73+
of the string for a NUL character. In order to do an unaligned ldp
74+
safely we have to do a page cross check first. If there is a NUL
75+
byte we calculate the length from the 2 8-byte words using
76+
conditional select to reduce branch mispredictions (it is unlikely
77+
strlen will be repeatedly called on strings with the same length).
78+
79+
If the string is longer than 16 bytes, we align src so don't need
80+
further page cross checks, and process 32 bytes per iteration
81+
using the fast NUL check. If we encounter non-ASCII characters,
82+
fallback to a second loop using the full NUL check.
83+
84+
If the page cross check fails, we read 16 bytes from an aligned
85+
address, remove any characters before the string, and continue
86+
in the main loop using aligned loads. Since strings crossing a
87+
page in the first 16 bytes are rare (probability of
88+
16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
89+
90+
AArch64 systems have a minimum page size of 4k. We don't bother
91+
checking for larger page sizes - the cost of setting up the correct
92+
page size is just not worth the extra gain from a small reduction in
93+
the cases taking the slow path. Note that we only care about
94+
whether the first fetch, which may be misaligned, crosses a page
95+
boundary. */
96+
5897
ENTRY(strlen)
59-
mov zeroones, #REP8_01
60-
bic src, srcin, #15
61-
ands tmp1, srcin, #15
62-
b.ne .Lmisaligned
63-
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
64-
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
65-
can be done in parallel across the entire word. */
66-
/* The inner loop deals with two Dwords at a time. This has a
67-
slightly higher start-up cost, but we should win quite quickly,
68-
especially on cores with a high number of issue slots per
69-
cycle, as we get much better parallelism out of the operations. */
70-
.Lloop:
71-
ldp data1, data2, [src], #16
72-
.Lrealigned:
98+
and tmp1, srcin, MIN_PAGE_SIZE - 1
99+
mov zeroones, REP8_01
100+
cmp tmp1, MIN_PAGE_SIZE - 16
101+
b.gt L(page_cross)
102+
ldp data1, data2, [srcin]
103+
#ifdef __AARCH64EB__
104+
/* For big-endian, carry propagation (if the final byte in the
105+
string is 0x01) means we cannot use has_nul1/2 directly.
106+
Since we expect strings to be small and early-exit,
107+
byte-swap the data now so has_null1/2 will be correct. */
108+
rev data1, data1
109+
rev data2, data2
110+
#endif
73111
sub tmp1, data1, zeroones
74-
orr tmp2, data1, #REP8_7f
112+
orr tmp2, data1, REP8_7f
75113
sub tmp3, data2, zeroones
76-
orr tmp4, data2, #REP8_7f
77-
bic has_nul1, tmp1, tmp2
78-
bics has_nul2, tmp3, tmp4
79-
ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
80-
b.eq .Lloop
81-
/* End of critical section -- keep to one 64Byte cache line. */
114+
orr tmp4, data2, REP8_7f
115+
bics has_nul1, tmp1, tmp2
116+
bic has_nul2, tmp3, tmp4
117+
ccmp has_nul2, 0, 0, eq
118+
beq L(main_loop_entry)
82119

83-
sub len, src, srcin
84-
cbz has_nul1, .Lnul_in_data2
85-
#ifdef __AARCH64EB__
86-
mov data2, data1
87-
#endif
88-
sub len, len, #8
89-
mov has_nul2, has_nul1
90-
.Lnul_in_data2:
120+
/* Enter with C = has_nul1 == 0. */
121+
csel has_nul1, has_nul1, has_nul2, cc
122+
mov len, 8
123+
rev has_nul1, has_nul1
124+
clz tmp1, has_nul1
125+
csel len, xzr, len, cc
126+
add len, len, tmp1, lsr 3
127+
ret
128+
129+
/* The inner loop processes 32 bytes per iteration and uses the fast
130+
NUL check. If we encounter non-ASCII characters, use a second
131+
loop with the accurate NUL check. */
132+
.p2align 4
133+
L(main_loop_entry):
134+
bic src, srcin, 15
135+
sub src, src, 16
136+
L(main_loop):
137+
ldp data1, data2, [src, 32]!
138+
.Lpage_cross_entry:
139+
sub tmp1, data1, zeroones
140+
sub tmp3, data2, zeroones
141+
orr tmp2, tmp1, tmp3
142+
tst tmp2, zeroones, lsl 7
143+
bne 1f
144+
ldp data1, data2, [src, 16]
145+
sub tmp1, data1, zeroones
146+
sub tmp3, data2, zeroones
147+
orr tmp2, tmp1, tmp3
148+
tst tmp2, zeroones, lsl 7
149+
beq L(main_loop)
150+
add src, src, 16
151+
1:
152+
/* The fast check failed, so do the slower, accurate NUL check. */
153+
orr tmp2, data1, REP8_7f
154+
orr tmp4, data2, REP8_7f
155+
bics has_nul1, tmp1, tmp2
156+
bic has_nul2, tmp3, tmp4
157+
ccmp has_nul2, 0, 0, eq
158+
beq L(nonascii_loop)
159+
160+
/* Enter with C = has_nul1 == 0. */
161+
L(tail):
91162
#ifdef __AARCH64EB__
92163
/* For big-endian, carry propagation (if the final byte in the
93-
string is 0x01) means we cannot use has_nul directly. The
164+
string is 0x01) means we cannot use has_nul1/2 directly. The
94165
easiest way to get the correct byte is to byte-swap the data
95166
and calculate the syndrome a second time. */
96-
rev data2, data2
97-
sub tmp1, data2, zeroones
98-
orr tmp2, data2, #REP8_7f
99-
bic has_nul2, tmp1, tmp2
167+
csel data1, data1, data2, cc
168+
rev data1, data1
169+
sub tmp1, data1, zeroones
170+
orr tmp2, data1, REP8_7f
171+
bic has_nul1, tmp1, tmp2
172+
#else
173+
csel has_nul1, has_nul1, has_nul2, cc
100174
#endif
101-
sub len, len, #8
102-
rev has_nul2, has_nul2
103-
clz pos, has_nul2
104-
add len, len, pos, lsr #3 /* Bits to bytes. */
175+
sub len, src, srcin
176+
rev has_nul1, has_nul1
177+
add tmp2, len, 8
178+
clz tmp1, has_nul1
179+
csel len, len, tmp2, cc
180+
add len, len, tmp1, lsr 3
105181
ret
106182

107-
.Lmisaligned:
108-
cmp tmp1, #8
109-
neg tmp1, tmp1
110-
ldp data1, data2, [src], #16
111-
lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
112-
mov tmp2, #~0
183+
L(nonascii_loop):
184+
ldp data1, data2, [src, 16]!
185+
sub tmp1, data1, zeroones
186+
orr tmp2, data1, REP8_7f
187+
sub tmp3, data2, zeroones
188+
orr tmp4, data2, REP8_7f
189+
bics has_nul1, tmp1, tmp2
190+
bic has_nul2, tmp3, tmp4
191+
ccmp has_nul2, 0, 0, eq
192+
bne L(tail)
193+
ldp data1, data2, [src, 16]!
194+
sub tmp1, data1, zeroones
195+
orr tmp2, data1, REP8_7f
196+
sub tmp3, data2, zeroones
197+
orr tmp4, data2, REP8_7f
198+
bics has_nul1, tmp1, tmp2
199+
bic has_nul2, tmp3, tmp4
200+
ccmp has_nul2, 0, 0, eq
201+
beq L(nonascii_loop)
202+
b L(tail)
203+
204+
/* Load 16 bytes from [srcin & ~15] and force the bytes that precede
205+
srcin to 0x7f, so we ignore any NUL bytes before the string.
206+
Then continue in the aligned loop. */
207+
L(page_cross):
208+
bic src, srcin, 15
209+
ldp data1, data2, [src]
210+
lsl tmp1, srcin, 3
211+
mov tmp4, -1
113212
#ifdef __AARCH64EB__
114-
/* Big-endian. Early bytes are at MSB. */
115-
lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
213+
/* Big-endian. Early bytes are at MSB. */
214+
lsr tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
116215
#else
117216
/* Little-endian. Early bytes are at LSB. */
118-
lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
217+
lsl tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
119218
#endif
120-
orr data1, data1, tmp2
121-
orr data2a, data2, tmp2
122-
csinv data1, data1, xzr, le
123-
csel data2, data2, data2a, le
124-
b .Lrealigned
219+
orr tmp1, tmp1, REP8_80
220+
orn data1, data1, tmp1
221+
orn tmp2, data2, tmp1
222+
tst srcin, 8
223+
csel data1, data1, tmp4, eq
224+
csel data2, data2, tmp2, eq
225+
b L(page_cross_entry)
125226

126227
END(strlen)

0 commit comments

Comments
 (0)