strchr-mte.S
3.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
/*
* strchr - find a character in a string
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
*
* ARMv8-a, AArch64
* Neon Available.
*/
#include "../asmdefs.h"
/* Arguments and results. */
/* Incoming string pointer (first argument register). */
#define srcin x0
/* Character to look for; it is broadcast with "dup .16b", so only its
   low 8 bits take part in the comparison. */
#define chrin w1
/* Return value; shares x0 with srcin, so it is only written at the end. */
#define result x0
/* Scratch registers. x2-x5 and v0-v6 are all caller-saved under AAPCS64,
   so nothing needs to be preserved. */
/* Cursor over the 16-byte aligned chunks of the string. */
#define src x2
/* Alignment offset / padding mask on entry, then the 64-bit syndrome. */
#define tmp1 x3
/* Holds the 32-bit magic constant while the lane masks are built. */
#define wtmp2 w4
/* All-ones seed for the padding mask, then the unmasked head syndrome. */
#define tmp3 x5
/* Search character replicated into all 16 byte lanes. */
#define vrepchr v0
/* Q-register and vector views of the same register (v1): the 16-byte
   chunk currently being examined. */
#define qdata q1
#define vdata v1
/* Per-lane compare results: lane is all-ones where the byte is NUL ... */
#define vhas_nul v2
/* ... and where the byte equals the search character. */
#define vhas_chr v3
/* Lane masks selecting the "NUL" bit (0x20022002 per word) ... */
#define vrepmask_0 v4
/* ... and the "character matched" bit (0x10011001 per word). */
#define vrepmask_c v5
/* Combined match vector that is narrowed to the 64-bit syndrome. */
#define vend v6
/* Build an assembler-local label name (".L" prefix) from l. */
#define L(l) .L ## l
/* Core algorithm.
For each 16-byte chunk we calculate a 64-bit syndrome value, with
four bits per byte (LSB is always in bits 0 and 1, for both big
and little-endian systems). For each tuple, bit 0 is set if
the relevant byte matched the requested character; bit 1 is set
if the relevant byte matched the NUL end of string (we trigger
off bit0 for the special case of looking for NUL) and bits 2 and 3
are not used.
Since the bits in the syndrome reflect exactly the order in which
things occur in the original string, a count_trailing_zeros()
operation will identify exactly which byte is causing the termination,
and why. */
/* __strchr_aarch64_mte
 *
 * In:   x0 (srcin)  = pointer to the NUL-terminated string.
 *       w1 (chrin)  = character to find; only the low 8 bits are compared.
 * Out:  x0 (result) = address of the first byte equal to chrin, or 0 if
 *                     the terminating NUL is reached first.  Searching
 *                     for NUL itself returns the address of the terminator.
 * Clobbers: x2-x5, v0-v6 and the condition flags.  No stack is used.
 *
 * Locals and temporaries are the register aliases #defined earlier in
 * this file.
 *
 * Every load is a 16-byte access at a 16-byte aligned address, and no
 * chunk after the one holding the first match/terminator is read.
 *
 * Endianness: the syndrome logic needs vector lane i to hold memory byte
 * i.  "ldr qN" delivers that on little-endian only; on big-endian it
 * byte-reverses the whole 128-bit value, so each load is followed by a
 * full 16-byte reversal (rev64 + ext #8) under __AARCH64EB__.  The
 * little-endian instruction stream is unaffected.
 */
ENTRY(__strchr_aarch64_mte)
	/* Magic constant 0x10011001 to allow us to identify which lane
	   matches the requested byte.  Magic constant 0x20022002 used
	   similarly for NUL termination.  */
	mov	wtmp2, #0x1001
	movk	wtmp2, #0x1001, lsl #16
	dup	vrepchr.16b, chrin
	bic	src, srcin, #15		/* Work with aligned 16-byte chunks.  */
	dup	vrepmask_c.4s, wtmp2
	ands	tmp1, srcin, #15	/* tmp1 = misalignment, Z set if none.  */
	add	vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
	b.eq	L(loop)

	/* Input string is not 16-byte aligned.  Rather than forcing
	   the padding bytes to a safe value, we calculate the syndrome
	   for all the bytes, but then mask off those bits of the
	   syndrome that are related to the padding.  */
	ldr	qdata, [src], #16
#ifdef __AARCH64EB__
	/* Undo the 128-bit byte reversal done by ldr q on big-endian.  */
	rev64	vdata.16b, vdata.16b
	ext	vdata.16b, vdata.16b, vdata.16b, #8
#endif
	cmeq	vhas_nul.16b, vdata.16b, #0
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	and	vhas_nul.16b, vhas_nul.16b, vrepmask_0.16b
	and	vhas_chr.16b, vhas_chr.16b, vrepmask_c.16b
	lsl	tmp1, tmp1, #2		/* 4 syndrome bits per padding byte.  */
	orr	vend.16b, vhas_nul.16b, vhas_chr.16b
	mov	tmp3, #~0
	addp	vend.16b, vend.16b, vend.16b		/* 128->64 */
	lsl	tmp1, tmp3, tmp1	/* Ones above the padding nibbles.  */

	mov	tmp3, vend.d[0]
	ands	tmp1, tmp3, tmp1	/* Mask padding bits.  */
	b.ne	L(tail)			/* Hit in the head chunk.  */

	/* Two chunks per iteration.  src is advanced by 32 with the first
	   load, so the second chunk lives at src - 16.  */
L(loop):
	ldr	qdata, [src], #32
#ifdef __AARCH64EB__
	/* Undo the 128-bit byte reversal done by ldr q on big-endian.  */
	rev64	vdata.16b, vdata.16b
	ext	vdata.16b, vdata.16b, vdata.16b, #8
#endif
	cmeq	vhas_nul.16b, vdata.16b, #0
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	/* Use a fast check for the termination condition.  */
	orr	vend.16b, vhas_nul.16b, vhas_chr.16b
	addp	vend.16b, vend.16b, vend.16b		/* 128->64 */
	mov	tmp1, vend.d[0]
	cbnz	tmp1, L(end)

	ldr	qdata, [src, #-16]
#ifdef __AARCH64EB__
	/* Undo the 128-bit byte reversal done by ldr q on big-endian.  */
	rev64	vdata.16b, vdata.16b
	ext	vdata.16b, vdata.16b, vdata.16b, #8
#endif
	cmeq	vhas_nul.16b, vdata.16b, #0
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	/* Use a fast check for the termination condition.  */
	orr	vend.16b, vhas_nul.16b, vhas_chr.16b
	addp	vend.16b, vend.16b, vend.16b		/* 128->64 */
	mov	tmp1, vend.d[0]
	cbz	tmp1, L(loop)

	/* Adjust src for next two subtractions.  */
	add	src, src, #16
L(end):
	/* Termination condition found.  Now need to establish exactly why
	   we terminated.  vhas_nul/vhas_chr still hold the raw compare
	   results of the chunk that hit; apply the lane masks to turn them
	   into the precise syndrome.  */
	and	vhas_nul.16b, vhas_nul.16b, vrepmask_0.16b
	and	vhas_chr.16b, vhas_chr.16b, vrepmask_c.16b
	sub	src, src, #16

	orr	vend.16b, vhas_nul.16b, vhas_chr.16b
	addp	vend.16b, vend.16b, vend.16b		/* 128->64 */

	mov	tmp1, vend.d[0]
L(tail):
	/* Here src = (address of the chunk that hit) + 16 and tmp1 is its
	   non-zero syndrome.  */
	/* Count the trailing zeros, by bit reversing...  */
	rbit	tmp1, tmp1
	/* Re-bias source.  */
	sub	src, src, #16
	clz	tmp1, tmp1	/* And counting the leading zeros.  */
	/* Tmp1 is even if the target character was found first.  Otherwise
	   we've found the end of string and we weren't looking for NUL.  */
	tst	tmp1, #1
	add	result, src, tmp1, lsr #2	/* 4 syndrome bits per byte.  */
	csel	result, result, xzr, eq		/* NULL when NUL came first.  */
	ret

END(__strchr_aarch64_mte)