//===-- Memset utils --------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_UTILS_H
#define LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_UTILS_H

#include "src/string/memory_utils/utils.h"

#include <stddef.h> // size_t

namespace __llvm_libc {

// Sets `kBlockSize` bytes starting from `dst` to `value`.
template <size_t kBlockSize> static void SetBlock(char *dst, unsigned value) {
  // In theory the compiler is allowed to call memset here and end up with a
  // recursive call; in practice it doesn't happen. Nevertheless, this should be
  // replaced with `__builtin_memset_inline` once it's available in clang.
  __builtin_memset(dst, value, kBlockSize);
}

// Sets `kBlockSize` bytes from `dst + count - kBlockSize` to `value`.
// Precondition: `count >= kBlockSize`.
template <size_t kBlockSize>
static void SetLastBlock(char *dst, unsigned value, size_t count) {
  SetBlock<kBlockSize>(dst + count - kBlockSize, value);
}

// Sets `kBlockSize` bytes twice with an overlap between the two.
//
// [1234567812345678123]
// [__XXXXXXXXXXXXXX___]
// [__XXXXXXXX_________]
// [________XXXXXXXX___]
//
// Precondition: `count >= kBlockSize && count <= 2 * kBlockSize`.
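//
// For illustration: with `kBlockSize == 8` and `count == 11`, the two writes
// cover bytes [0, 8) and [3, 11), overlapping on [3, 8); together they cover
// [0, 11) exactly because the precondition above holds.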
template <size_t kBlockSize>
static void SetBlockOverlap(char *dst, unsigned value, size_t count) {
  SetBlock<kBlockSize>(dst, value);
  SetLastBlock<kBlockSize>(dst, value, count);
}

// Sets `count` bytes by blocks of `kBlockSize` bytes.
// Sets at the start and end of the buffer are unaligned.
// Sets in the middle of the buffer are aligned to `kBlockSize`.
//
// e.g. with
// [12345678123456781234567812345678]
// [__XXXXXXXXXXXXXXXXXXXXXXXXXXX___]
// [__XXXXXXXX______________________]
// [________XXXXXXXX________________]
// [________________XXXXXXXX________]
// [_____________________XXXXXXXX___]
//
// Precondition: `count > 2 * kBlockSize` for efficiency.
//               `count >= kBlockSize` for correctness.
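//
// Walking through the diagram above (`kBlockSize == 8`, `dst` two bytes past
// an 8-Byte boundary, `count == 27`): the first block covers [0, 8), `offset`
// starts at 8 - 2 == 6, the loop sets [6, 14) and [14, 22), and the last
// block covers [19, 27).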
template <size_t kBlockSize>
static void SetAlignedBlocks(char *dst, unsigned value, size_t count) {
  SetBlock<kBlockSize>(dst, value); // Set first block

  // Set aligned blocks
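  // `offset` is the smallest non-zero offset for which `dst + offset` is
  // `kBlockSize` aligned; the (possibly unaligned) first block above already
  // covered all bytes before it.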
  size_t offset = kBlockSize - offset_from_last_aligned<kBlockSize>(dst);
  for (; offset + kBlockSize < count; offset += kBlockSize)
    SetBlock<kBlockSize>(dst + offset, value);

  SetLastBlock<kBlockSize>(dst, value, count); // Set last block
}

// A general purpose implementation assuming cheap unaligned writes for sizes:
// 1, 2, 4, 8, 16, 32 and 64 Bytes. Note that some architectures can't store 32
// or 64 Bytes at a time; the compiler will expand these stores as needed.
//
// This implementation is subject to change as we benchmark more processors. We
// may also want to customize it for processors with specialized instructions
// that perform better (e.g. `rep stosb`).
//
// A note on the apparent discrepancy in the use of 32- vs 64-Byte writes.
// We want to balance two things here:
//  - The number of redundant writes (when using `SetBlockOverlap`),
//  - The number of conditionals for sizes <=128 (~90% of memset calls are for
//    such sizes).
//
// For the range 64-128:
//  - SetBlockOverlap<64> uses no conditionals but always writes 128 Bytes;
//  this is wasteful near 65 but efficient toward 128.
//  - SetAlignedBlocks<32> would consume between 3 and 4 conditionals and write
//  96 or 128 Bytes.
//  - Another approach could be a hybrid: Set<64>+Overlap<32> for 65-96 and
//  Set<96>+Overlap<32> for 97-128.
//
// Benchmarks showed that redundant writes were cheap (for Intel X86) but
// conditionals were expensive, even on processors that do not support writing
// 64B at a time (pre-AVX512F). We also want to favor short functions that
// allow more hot code to fit in the L1 instruction cache.
//
// Above 128 we have to use conditionals since we don't know the upper bound in
// advance. SetAlignedBlocks<64> may waste up to 63 Bytes; SetAlignedBlocks<32>
// may waste up to 31 Bytes. Benchmarks showed that SetAlignedBlocks<64> was not
// superior for sizes that mattered.
inline static void GeneralPurposeMemset(char *dst, unsigned char value,
                                        size_t count) {
  if (count == 0)
    return;
  if (count == 1)
    return SetBlock<1>(dst, value);
  if (count == 2)
    return SetBlock<2>(dst, value);
  if (count == 3)
    return SetBlock<3>(dst, value);
  if (count == 4)
    return SetBlock<4>(dst, value);
  if (count <= 8)
    return SetBlockOverlap<4>(dst, value, count);
  if (count <= 16)
    return SetBlockOverlap<8>(dst, value, count);
  if (count <= 32)
    return SetBlockOverlap<16>(dst, value, count);
  if (count <= 64)
    return SetBlockOverlap<32>(dst, value, count);
  if (count <= 128)
    return SetBlockOverlap<64>(dst, value, count);
  return SetAlignedBlocks<32>(dst, value, count);
}
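
// Illustrative only: a memset entry point could forward to
// `GeneralPurposeMemset` roughly as sketched below; the exact name and
// location of the real entry point are outside the scope of this header.
//
//   void *memset(void *dst, int value, size_t count) {
//     GeneralPurposeMemset(reinterpret_cast<char *>(dst),
//                          static_cast<unsigned char>(value), count);
//     return dst;
//   }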

} // namespace __llvm_libc

#endif //  LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_UTILS_H