AArch64AdvSIMDScalarPass.cpp 16.1 KB
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412
//===-- AArch64AdvSIMDScalar.cpp - Replace dead defs w/ zero reg --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// When profitable, replace GPR targeting i64 instructions with their
// AdvSIMD scalar equivalents. Generally speaking, "profitable" is defined
// as minimizing the number of cross-class register copies.
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// TODO: Graph based predicate heuristics.
// Walking the instruction list linearly will get many, perhaps most, of
// the cases, but to do a truly thorough job of this, we need a more
// wholistic approach.
//
// This optimization is very similar in spirit to the register allocator's
// spill placement, only here we're determining where to place cross-class
// register copies rather than spills. As such, a similar approach is
// called for.
//
// We want to build up a set of graphs of all instructions which are candidates
// for transformation along with instructions which generate their inputs and
// consume their outputs. For each edge in the graph, we assign a weight
// based on whether there is a copy required there (weight zero if not) and
// the block frequency of the block containing the defining or using
// instruction, whichever is less. Our optimization is then a graph problem
// to minimize the total weight of all the graphs, then transform instructions
// and add or remove copy instructions as called for to implement the
// solution.
//===----------------------------------------------------------------------===//

#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64RegisterInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

#define DEBUG_TYPE "aarch64-simd-scalar"

// Allow forcing all i64 operations with equivalent SIMD instructions to use
// them. For stress-testing the transformation function.
static cl::opt<bool>
TransformAll("aarch64-simd-scalar-force-all",
             cl::desc("Force use of AdvSIMD scalar instructions everywhere"),
             cl::init(false), cl::Hidden);

STATISTIC(NumScalarInsnsUsed, "Number of scalar instructions used");
STATISTIC(NumCopiesDeleted, "Number of cross-class copies deleted");
STATISTIC(NumCopiesInserted, "Number of cross-class copies inserted");

#define AARCH64_ADVSIMD_NAME "AdvSIMD Scalar Operation Optimization"

namespace {
class AArch64AdvSIMDScalar : public MachineFunctionPass {
  MachineRegisterInfo *MRI;
  const TargetInstrInfo *TII;

private:
  // isProfitableToTransform - Predicate function to determine whether an
  // instruction should be transformed to its equivalent AdvSIMD scalar
  // instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example.
  bool isProfitableToTransform(const MachineInstr &MI) const;

  // transformInstruction - Perform the transformation of an instruction
  // to its equivalant AdvSIMD scalar instruction. Update inputs and outputs
  // to be the correct register class, minimizing cross-class copies.
  void transformInstruction(MachineInstr &MI);

  // processMachineBasicBlock - Main optimzation loop.
  bool processMachineBasicBlock(MachineBasicBlock *MBB);

public:
  static char ID; // Pass identification, replacement for typeid.
  explicit AArch64AdvSIMDScalar() : MachineFunctionPass(ID) {
    initializeAArch64AdvSIMDScalarPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &F) override;

  StringRef getPassName() const override { return AARCH64_ADVSIMD_NAME; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};
char AArch64AdvSIMDScalar::ID = 0;
} // end anonymous namespace

INITIALIZE_PASS(AArch64AdvSIMDScalar, "aarch64-simd-scalar",
                AARCH64_ADVSIMD_NAME, false, false)

static bool isGPR64(unsigned Reg, unsigned SubReg,
                    const MachineRegisterInfo *MRI) {
  if (SubReg)
    return false;
  if (Register::isVirtualRegister(Reg))
    return MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::GPR64RegClass);
  return AArch64::GPR64RegClass.contains(Reg);
}

static bool isFPR64(unsigned Reg, unsigned SubReg,
                    const MachineRegisterInfo *MRI) {
  if (Register::isVirtualRegister(Reg))
    return (MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR64RegClass) &&
            SubReg == 0) ||
           (MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR128RegClass) &&
            SubReg == AArch64::dsub);
  // Physical register references just check the register class directly.
  return (AArch64::FPR64RegClass.contains(Reg) && SubReg == 0) ||
         (AArch64::FPR128RegClass.contains(Reg) && SubReg == AArch64::dsub);
}

// getSrcFromCopy - Get the original source register for a GPR64 <--> FPR64
// copy instruction. Return zero_reg if the instruction is not a copy.
static MachineOperand *getSrcFromCopy(MachineInstr *MI,
                                      const MachineRegisterInfo *MRI,
                                      unsigned &SubReg) {
  SubReg = 0;
  // The "FMOV Xd, Dn" instruction is the typical form.
  if (MI->getOpcode() == AArch64::FMOVDXr ||
      MI->getOpcode() == AArch64::FMOVXDr)
    return &MI->getOperand(1);
  // A lane zero extract "UMOV.d Xd, Vn[0]" is equivalent. We shouldn't see
  // these at this stage, but it's easy to check for.
  if (MI->getOpcode() == AArch64::UMOVvi64 && MI->getOperand(2).getImm() == 0) {
    SubReg = AArch64::dsub;
    return &MI->getOperand(1);
  }
  // Or just a plain COPY instruction. This can be directly to/from FPR64,
  // or it can be a dsub subreg reference to an FPR128.
  if (MI->getOpcode() == AArch64::COPY) {
    if (isFPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(),
                MRI) &&
        isGPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(), MRI))
      return &MI->getOperand(1);
    if (isGPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(),
                MRI) &&
        isFPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(),
                MRI)) {
      SubReg = MI->getOperand(1).getSubReg();
      return &MI->getOperand(1);
    }
  }

  // Otherwise, this is some other kind of instruction.
  return nullptr;
}

// getTransformOpcode - For any opcode for which there is an AdvSIMD equivalent
// that we're considering transforming to, return that AdvSIMD opcode. For all
// others, return the original opcode.
static unsigned getTransformOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    break;
  // FIXME: Lots more possibilities.
  case AArch64::ADDXrr:
    return AArch64::ADDv1i64;
  case AArch64::SUBXrr:
    return AArch64::SUBv1i64;
  case AArch64::ANDXrr:
    return AArch64::ANDv8i8;
  case AArch64::EORXrr:
    return AArch64::EORv8i8;
  case AArch64::ORRXrr:
    return AArch64::ORRv8i8;
  }
  // No AdvSIMD equivalent, so just return the original opcode.
  return Opc;
}

static bool isTransformable(const MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  return Opc != getTransformOpcode(Opc);
}

// isProfitableToTransform - Predicate function to determine whether an
// instruction should be transformed to its equivalent AdvSIMD scalar
// instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example.
bool AArch64AdvSIMDScalar::isProfitableToTransform(
    const MachineInstr &MI) const {
  // If this instruction isn't eligible to be transformed (no SIMD equivalent),
  // early exit since that's the common case.
  if (!isTransformable(MI))
    return false;

  // Count the number of copies we'll need to add and approximate the number
  // of copies that a transform will enable us to remove.
  unsigned NumNewCopies = 3;
  unsigned NumRemovableCopies = 0;

  Register OrigSrc0 = MI.getOperand(1).getReg();
  Register OrigSrc1 = MI.getOperand(2).getReg();
  unsigned SubReg0;
  unsigned SubReg1;
  if (!MRI->def_empty(OrigSrc0)) {
    MachineRegisterInfo::def_instr_iterator Def =
        MRI->def_instr_begin(OrigSrc0);
    assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
    MachineOperand *MOSrc0 = getSrcFromCopy(&*Def, MRI, SubReg0);
    // If the source was from a copy, we don't need to insert a new copy.
    if (MOSrc0)
      --NumNewCopies;
    // If there are no other users of the original source, we can delete
    // that instruction.
    if (MOSrc0 && MRI->hasOneNonDBGUse(OrigSrc0))
      ++NumRemovableCopies;
  }
  if (!MRI->def_empty(OrigSrc1)) {
    MachineRegisterInfo::def_instr_iterator Def =
        MRI->def_instr_begin(OrigSrc1);
    assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
    MachineOperand *MOSrc1 = getSrcFromCopy(&*Def, MRI, SubReg1);
    if (MOSrc1)
      --NumNewCopies;
    // If there are no other users of the original source, we can delete
    // that instruction.
    if (MOSrc1 && MRI->hasOneNonDBGUse(OrigSrc1))
      ++NumRemovableCopies;
  }

  // If any of the uses of the original instructions is a cross class copy,
  // that's a copy that will be removable if we transform. Likewise, if
  // any of the uses is a transformable instruction, it's likely the tranforms
  // will chain, enabling us to save a copy there, too. This is an aggressive
  // heuristic that approximates the graph based cost analysis described above.
  Register Dst = MI.getOperand(0).getReg();
  bool AllUsesAreCopies = true;
  for (MachineRegisterInfo::use_instr_nodbg_iterator
           Use = MRI->use_instr_nodbg_begin(Dst),
           E = MRI->use_instr_nodbg_end();
       Use != E; ++Use) {
    unsigned SubReg;
    if (getSrcFromCopy(&*Use, MRI, SubReg) || isTransformable(*Use))
      ++NumRemovableCopies;
    // If the use is an INSERT_SUBREG, that's still something that can
    // directly use the FPR64, so we don't invalidate AllUsesAreCopies. It's
    // preferable to have it use the FPR64 in most cases, as if the source
    // vector is an IMPLICIT_DEF, the INSERT_SUBREG just goes away entirely.
    // Ditto for a lane insert.
    else if (Use->getOpcode() == AArch64::INSERT_SUBREG ||
             Use->getOpcode() == AArch64::INSvi64gpr)
      ;
    else
      AllUsesAreCopies = false;
  }
  // If all of the uses of the original destination register are copies to
  // FPR64, then we won't end up having a new copy back to GPR64 either.
  if (AllUsesAreCopies)
    --NumNewCopies;

  // If a transform will not increase the number of cross-class copies required,
  // return true.
  if (NumNewCopies <= NumRemovableCopies)
    return true;

  // Finally, even if we otherwise wouldn't transform, check if we're forcing
  // transformation of everything.
  return TransformAll;
}

static MachineInstr *insertCopy(const TargetInstrInfo *TII, MachineInstr &MI,
                                unsigned Dst, unsigned Src, bool IsKill) {
  MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
                                    TII->get(AArch64::COPY), Dst)
                                .addReg(Src, getKillRegState(IsKill));
  LLVM_DEBUG(dbgs() << "    adding copy: " << *MIB);
  ++NumCopiesInserted;
  return MIB;
}

// transformInstruction - Perform the transformation of an instruction
// to its equivalant AdvSIMD scalar instruction. Update inputs and outputs
// to be the correct register class, minimizing cross-class copies.
void AArch64AdvSIMDScalar::transformInstruction(MachineInstr &MI) {
  LLVM_DEBUG(dbgs() << "Scalar transform: " << MI);

  MachineBasicBlock *MBB = MI.getParent();
  unsigned OldOpc = MI.getOpcode();
  unsigned NewOpc = getTransformOpcode(OldOpc);
  assert(OldOpc != NewOpc && "transform an instruction to itself?!");

  // Check if we need a copy for the source registers.
  Register OrigSrc0 = MI.getOperand(1).getReg();
  Register OrigSrc1 = MI.getOperand(2).getReg();
  unsigned Src0 = 0, SubReg0;
  unsigned Src1 = 0, SubReg1;
  bool KillSrc0 = false, KillSrc1 = false;
  if (!MRI->def_empty(OrigSrc0)) {
    MachineRegisterInfo::def_instr_iterator Def =
        MRI->def_instr_begin(OrigSrc0);
    assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
    MachineOperand *MOSrc0 = getSrcFromCopy(&*Def, MRI, SubReg0);
    // If there are no other users of the original source, we can delete
    // that instruction.
    if (MOSrc0) {
      Src0 = MOSrc0->getReg();
      KillSrc0 = MOSrc0->isKill();
      // Src0 is going to be reused, thus, it cannot be killed anymore.
      MOSrc0->setIsKill(false);
      if (MRI->hasOneNonDBGUse(OrigSrc0)) {
        assert(MOSrc0 && "Can't delete copy w/o a valid original source!");
        Def->eraseFromParent();
        ++NumCopiesDeleted;
      }
    }
  }
  if (!MRI->def_empty(OrigSrc1)) {
    MachineRegisterInfo::def_instr_iterator Def =
        MRI->def_instr_begin(OrigSrc1);
    assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
    MachineOperand *MOSrc1 = getSrcFromCopy(&*Def, MRI, SubReg1);
    // If there are no other users of the original source, we can delete
    // that instruction.
    if (MOSrc1) {
      Src1 = MOSrc1->getReg();
      KillSrc1 = MOSrc1->isKill();
      // Src0 is going to be reused, thus, it cannot be killed anymore.
      MOSrc1->setIsKill(false);
      if (MRI->hasOneNonDBGUse(OrigSrc1)) {
        assert(MOSrc1 && "Can't delete copy w/o a valid original source!");
        Def->eraseFromParent();
        ++NumCopiesDeleted;
      }
    }
  }
  // If we weren't able to reference the original source directly, create a
  // copy.
  if (!Src0) {
    SubReg0 = 0;
    Src0 = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
    insertCopy(TII, MI, Src0, OrigSrc0, KillSrc0);
    KillSrc0 = true;
  }
  if (!Src1) {
    SubReg1 = 0;
    Src1 = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
    insertCopy(TII, MI, Src1, OrigSrc1, KillSrc1);
    KillSrc1 = true;
  }

  // Create a vreg for the destination.
  // FIXME: No need to do this if the ultimate user expects an FPR64.
  // Check for that and avoid the copy if possible.
  Register Dst = MRI->createVirtualRegister(&AArch64::FPR64RegClass);

  // For now, all of the new instructions have the same simple three-register
  // form, so no need to special case based on what instruction we're
  // building.
  BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(NewOpc), Dst)
      .addReg(Src0, getKillRegState(KillSrc0), SubReg0)
      .addReg(Src1, getKillRegState(KillSrc1), SubReg1);

  // Now copy the result back out to a GPR.
  // FIXME: Try to avoid this if all uses could actually just use the FPR64
  // directly.
  insertCopy(TII, MI, MI.getOperand(0).getReg(), Dst, true);

  // Erase the old instruction.
  MI.eraseFromParent();

  ++NumScalarInsnsUsed;
}

// processMachineBasicBlock - Main optimzation loop.
bool AArch64AdvSIMDScalar::processMachineBasicBlock(MachineBasicBlock *MBB) {
  bool Changed = false;
  for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;) {
    MachineInstr &MI = *I++;
    if (isProfitableToTransform(MI)) {
      transformInstruction(MI);
      Changed = true;
    }
  }
  return Changed;
}

// runOnMachineFunction - Pass entry point from PassManager.
bool AArch64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) {
  bool Changed = false;
  LLVM_DEBUG(dbgs() << "***** AArch64AdvSIMDScalar *****\n");

  if (skipFunction(mf.getFunction()))
    return false;

  MRI = &mf.getRegInfo();
  TII = mf.getSubtarget().getInstrInfo();

  // Just check things on a one-block-at-a-time basis.
  for (MachineFunction::iterator I = mf.begin(), E = mf.end(); I != E; ++I)
    if (processMachineBasicBlock(&*I))
      Changed = true;
  return Changed;
}

// createAArch64AdvSIMDScalar - Factory function used by AArch64TargetMachine
// to add the pass to the PassManager.
FunctionPass *llvm::createAArch64AdvSIMDScalar() {
  return new AArch64AdvSIMDScalar();
}