/*
 * strchr - find a character in a string
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD (Neon) available.
 * MTE compatible: all loads are 16-byte aligned, so no access
 * crosses a 16-byte tag granule boundary.
 */

#include "../asmdefs.h"

/* Arguments and results.  */
#define srcin		x0
#define chrin		w1

#define result		x0

/* Locals and temporaries.  */
#define src		x2
#define tmp1		x3
#define wtmp2		w4
#define tmp3		x5

#define vrepchr		v0
#define qdata		q1
#define vdata		v1
#define vhas_nul	v2
#define vhas_chr	v3
#define vrepmask_0	v4
#define vrepmask_c	v5
#define vend		v6

#define L(l) .L ## l

/* Core algorithm.

   For each 16-byte chunk we calculate a 64-bit syndrome value, with
   four bits per byte.  The tuple for the string's first byte always
   occupies bits 0-3, on both big- and little-endian systems.  Within
   each tuple, bit 0 is set if the relevant byte matched the requested
   character and bit 1 is set if it matched the NUL end of string (we
   trigger off bit 0 for the special case of looking for NUL); bits 2
   and 3 are unused.
   Since the bits in the syndrome reflect exactly the order in which
   things occur in the original string, a count_trailing_zeros()
   operation identifies exactly which byte caused the termination,
   and why.  */
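
/* Worked example (illustrative only): searching for 'b' in a chunk
   whose first bytes are "ab\0d":

     vhas_chr & vrepmask_c:  00 10 00 00 ..   ('b' matched at byte 1)
     vhas_nul & vrepmask_0:  00 00 02 00 ..   (NUL matched at byte 2)
     orr  ->                 00 10 02 00 ..
     addp ->                 10 02 00 00 ..   (byte pairs packed)

   giving the syndrome 0x...0210.  count_trailing_zeros() = 4, i.e.
   tuple 4 >> 2 = 1 (byte 1), and bit 4 is that tuple's bit 0, so the
   character was found and the result is src + 1.  Later bytes can
   only set higher syndrome bits, which the trailing-zero count
   ignores.  */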

ENTRY(__strchr_aarch64_mte)
	/* Magic constant 0x10011001 to allow us to identify which lane
	   matches the requested byte.  Magic constant 0x20022002 used
	   similarly for NUL termination. */
	mov	wtmp2, #0x1001
	movk	wtmp2, #0x1001, lsl #16
	dup	vrepchr.16b, chrin
	bic	src, srcin, #15		/* Work with aligned 16-byte chunks. */
	dup	vrepmask_c.4s, wtmp2
	ands	tmp1, srcin, #15
	add	vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
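	/* Note (illustrative): vrepmask_c repeats the byte pair 0x01,0x10
	   and vrepmask_0 the pair 0x02,0x20, so the pairwise ADDP used
	   below collapses each pair of source bytes into one syndrome
	   byte holding two four-bit tuples.  */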
	b.eq	L(loop)

	/* Input string is not 16-byte aligned.  Rather than forcing
	   the padding bytes to a safe value, we calculate the syndrome
	   for all the bytes, but then mask off those bits of the
	   syndrome that are related to the padding.  */
	ldr	qdata, [src], #16
	cmeq	vhas_nul.16b, vdata.16b, #0
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	and	vhas_nul.16b, vhas_nul.16b, vrepmask_0.16b
	and	vhas_chr.16b, vhas_chr.16b, vrepmask_c.16b
	lsl	tmp1, tmp1, #2
	orr	vend.16b, vhas_nul.16b, vhas_chr.16b
	mov	tmp3, #~0
	addp	vend.16b, vend.16b, vend.16b		/* 128->64 */
	lsl	tmp1, tmp3, tmp1

	mov	tmp3, vend.d[0]
	ands	tmp1, tmp3, tmp1	/* Mask padding bits. */
	b.ne	L(tail)
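	/* Note (illustrative): with srcin misaligned by 3, for example,
	   tmp1 becomes ~0 << 12, clearing the twelve syndrome bits that
	   belong to the three padding bytes before the string.  */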

L(loop):
	ldr	qdata, [src], #32
	cmeq	vhas_nul.16b, vdata.16b, #0
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	/* Use a fast check for the termination condition.  */
	orr	vend.16b, vhas_nul.16b, vhas_chr.16b
	addp	vend.16b, vend.16b, vend.16b		/* 128->64 */
	mov	tmp1, vend.d[0]
	cbnz	tmp1, L(end)
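	/* The loop is unrolled to handle 32 bytes per iteration.  The
	   ADDP above folds the 128-bit compare result to 64 bits without
	   losing information (any nonzero byte stays nonzero), so one
	   scalar CBNZ tests all 16 bytes at once.  */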

	ldr	qdata, [src, #-16]
	cmeq	vhas_nul.16b, vdata.16b, #0
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	/* Use a fast check for the termination condition.  */
	orr	vend.16b, vhas_nul.16b, vhas_chr.16b
	addp	vend.16b, vend.16b, vend.16b		/* 128->64 */
	mov	tmp1, vend.d[0]
	cbz	tmp1, L(loop)

	/* Adjust src for next two subtractions. */
	add	src, src, #16
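	/* On entry to L(end), src is 32 bytes past the start of the
	   chunk that terminated the loop; the SUB below and the one at
	   L(tail) re-bias it to the start of that chunk.  */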
L(end):
	/* Termination condition found.  Now need to establish exactly why
	   we terminated.  */
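	/* The fast in-loop check used unmasked compare results; apply
	   the syndrome masks and recompute so that each set bit also
	   encodes why we stopped.  */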
	and	vhas_nul.16b, vhas_nul.16b, vrepmask_0.16b
	and	vhas_chr.16b, vhas_chr.16b, vrepmask_c.16b
	sub	src, src, #16
	orr	vend.16b, vhas_nul.16b, vhas_chr.16b
	addp	vend.16b, vend.16b, vend.16b		/* 128->64 */

	mov	tmp1, vend.d[0]
L(tail):
	/* Count the trailing zeros, by bit reversing...  */
	rbit	tmp1, tmp1
	/* Re-bias source.  */
	sub	src, src, #16
	clz	tmp1, tmp1	/* And counting the leading zeros.  */
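	/* The base AArch64 ISA has no count-trailing-zeros instruction,
	   so RBIT + CLZ computes it; tmp1 >> 2 then converts the bit
	   index into a byte offset within the chunk.  */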
	/* tmp1 is even if the target character was found first.  Otherwise
	   we found the end of the string and were not looking for NUL.  */
	tst	tmp1, #1
	add	result, src, tmp1, lsr #2
	csel	result, result, xzr, eq
	ret

END(__strchr_aarch64_mte)