/* strlen-mte.S */
/*
* strlen - calculate the length of a string
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
*
* ARMv8-a, AArch64.
*/
#include "../asmdefs.h"
/* Arguments and results.  srcin and len both name x0, so writing len
   destroys the input pointer. */
#define srcin x0
#define len x0
/* Locals and temporaries.  has_nul1/tmp1 both name x4 and
   has_nul2/tmp2 both name x5: writing one name of a pair overwrites
   the value held under the other. */
#define src x1
#define data1 x2
#define data2 x3
#define has_nul1 x4
#define has_nul2 x5
#define tmp1 x4
#define tmp2 x5
#define tmp3 x6
#define tmp4 x7
#define zeroones x8
#define offset x9
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
can be done in parallel across the entire word. A faster check
(X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
false hits for characters 129..255. */
/* 0x01 and 0x7f replicated into each of the 8 bytes of a register. */
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
/* This implementation is compatible with Memory Tagging. All loads
are 16 bytes in size and 16 bytes aligned. This also avoids the
need for page boundary checks. This implementation is correct
even without Memory Tagging, but other implementations could be
more beneficial if Memory Tagging is not enabled.
First load is aligned down and can contain bytes that are located
before the string. This is handled by modifying the "zeroones"
mask. The bytes that need to be ignored are set to zero.
If the string is aligned in such a way that 8 or more bytes from
the first load should be ignored, there is a special case
(skip_first_8_bytes) which only compares the second 8 bytes.
If there is a NUL byte in the first load, we calculate the length
from the 2 8-byte words using conditional select to reduce branch
mispredictions.
If there is no NUL byte in the first load, we check 32 bytes per
iteration using the fast NUL check (main_loop). If we encounter
non-ASCII characters, we fall back to a second loop
(nonascii_loop) using the full NUL check. */
/* Return in len (x0) the number of bytes before the first NUL byte of
   the string whose address is passed in srcin (x0).  x1-x9 are used as
   temporaries and the condition flags are modified.  ENTRY, END and
   L() are not defined in this file; presumably they come from
   asmdefs.h. */
ENTRY(__strlen_aarch64_mte)
bic src, srcin, 15 /* src = srcin aligned down to 16 bytes. */
mov zeroones, REP8_01
/* (offset & 63) holds number of bits to ignore in a register. */
lsl offset, srcin, 3
/* Load the aligned 16 bytes that contain the start of the string.
   The post-index leaves src 16 bytes below that chunk, so the
   pre-indexed load by 32 in main_loop fetches the chunk after it. */
ldp data1, data2, [src], -16
/* tmp1 = zeroones with the low (offset & 63) / 8 bytes cleared, so the
   bytes located before the string cannot produce a NUL hit. */
lsl tmp1, zeroones, offset /* Shift (offset & 63). */
#ifdef __AARCH64EB__
/* For big-endian, carry propagation (if the final byte in the
string is 0x01) means we cannot use has_nul1/2 directly.
e.g. 0x0100 - 0x0101 = 0xffff, so 0x01 will be mistaken for NUL.
Since we expect strings to be small and early-exit,
byte-swap the data now so has_nul1/2 will be correct. */
rev data1, data1
rev data2, data2
#endif
/* Bit 3 of srcin set: the string starts in the second 8-byte word. */
tbnz srcin, 3, L(skip_first_8_bytes)
/* Accurate NUL check on both words.  data1 uses the masked tmp1,
   data2 uses the full zeroones.  Instruction order matters here:
   bics reads tmp1/tmp2 (x4/x5) before has_nul2 (x5) is written. */
sub tmp1, data1, tmp1
orr tmp2, data1, REP8_7f
sub tmp3, data2, zeroones
orr tmp4, data2, REP8_7f
bics has_nul1, tmp1, tmp2
bic has_nul2, tmp3, tmp4
/* If comparison happens, C flag is always set.  Otherwise all flags
   are cleared, so afterwards Z is set only when both has_nul1 and
   has_nul2 are zero. */
ccmp has_nul2, 0, 0, eq
beq L(main_loop) /* No NUL in the first 16 bytes. */
/* Enter with C = has_nul1 == 0. */
csel has_nul1, has_nul1, has_nul2, cc /* Syndrome of the word holding the first NUL. */
and tmp2, srcin, 7 /* Bytes to ignore. */
rev has_nul1, has_nul1 /* Put the first byte in memory order at the top for clz. */
neg tmp2, tmp2
clz tmp1, has_nul1 /* Count bits before NUL. */
/* Add 8 if NUL byte is not in first register. */
add tmp3, tmp2, 8
csel len, tmp2, tmp3, cc
/* len = -(srcin & 7) (+ 8 if the NUL is in data2) + bits / 8. */
add len, len, tmp1, lsr 3
ret
L(skip_first_8_bytes):
/* Only data2 holds string bytes; tmp1 masks the ones before the
   string. */
sub tmp1, data2, tmp1
orr tmp2, data2, REP8_7f
bics has_nul1, tmp1, tmp2
beq L(main_loop) /* No NUL in the first load. */
rev has_nul1, has_nul1
lsl tmp1, has_nul1, offset /* Ignore bytes before string. */
clz tmp1, tmp1 /* Count bits before NUL. */
lsr len, tmp1, 3 /* Convert the bit count to bytes. */
ret
/* The inner loop processes 32 bytes per iteration and uses the fast
NUL check. If we encounter non-ASCII characters, use a second
loop with the accurate NUL check. */
.p2align 4
L(main_loop):
ldp data1, data2, [src, 32]! /* Advance src by 32, then load 16 bytes. */
sub tmp1, data1, zeroones
sub tmp3, data2, zeroones
orr tmp2, tmp1, tmp3
/* zeroones << 7 is 0x80 in every byte: fast check of both words. */
tst tmp2, zeroones, lsl 7
bne 1f
ldp data1, data2, [src, 16] /* Following 16 bytes; src is not updated. */
sub tmp1, data1, zeroones
sub tmp3, data2, zeroones
orr tmp2, tmp1, tmp3
tst tmp2, zeroones, lsl 7
beq L(main_loop)
/* Possible hit in the second chunk: make src point at it, as the tail
   computes the length from src. */
add src, src, 16
1:
/* The fast check failed, so do the slower, accurate NUL check.
   tmp1 and tmp3 still hold data1 - zeroones and data2 - zeroones. */
orr tmp2, data1, REP8_7f
orr tmp4, data2, REP8_7f
bics has_nul1, tmp1, tmp2
bic has_nul2, tmp3, tmp4
ccmp has_nul2, 0, 0, eq
/* Z set: the fast check hit was caused by a non-ASCII byte, not a NUL. */
beq L(nonascii_loop)
/* Enter with C = has_nul1 == 0.  src points at the 16-byte chunk that
   contains the NUL. */
L(tail):
#ifdef __AARCH64EB__
/* For big-endian, carry propagation (if the final byte in the
string is 0x01) means we cannot use has_nul1/2 directly. The
easiest way to get the correct byte is to byte-swap the data
and calculate the syndrome a second time. */
csel data1, data1, data2, cc
rev data1, data1
sub tmp1, data1, zeroones
orr tmp2, data1, REP8_7f
bic has_nul1, tmp1, tmp2
#else
csel has_nul1, has_nul1, has_nul2, cc /* Syndrome of the word holding the first NUL. */
#endif
sub len, src, srcin /* Bytes from the start of the string to this chunk. */
rev has_nul1, has_nul1
add tmp2, len, 8
clz tmp1, has_nul1 /* Count bits before NUL. */
csel len, len, tmp2, cc /* Add 8 if the NUL is in the second word. */
add len, len, tmp1, lsr 3
ret
/* Accurate NUL check on 16 bytes per step, two steps per iteration.
   Both exits reach the tail with src pointing at the chunk just
   tested and C = has_nul1 == 0. */
L(nonascii_loop):
ldp data1, data2, [src, 16]!
sub tmp1, data1, zeroones
orr tmp2, data1, REP8_7f
sub tmp3, data2, zeroones
orr tmp4, data2, REP8_7f
bics has_nul1, tmp1, tmp2
bic has_nul2, tmp3, tmp4
ccmp has_nul2, 0, 0, eq
bne L(tail)
ldp data1, data2, [src, 16]!
sub tmp1, data1, zeroones
orr tmp2, data1, REP8_7f
sub tmp3, data2, zeroones
orr tmp4, data2, REP8_7f
bics has_nul1, tmp1, tmp2
bic has_nul2, tmp3, tmp4
ccmp has_nul2, 0, 0, eq
beq L(nonascii_loop)
b L(tail)
END(__strlen_aarch64_mte)