AArch64A57FPLoadBalancing.cpp 25.7 KB
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723
//===-- AArch64A57FPLoadBalancing.cpp - Balance FP ops statically on A57---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// For best-case performance on Cortex-A57, we should try to use a balanced
// mix of odd and even D-registers when performing a critical sequence of
// independent, non-quadword FP/ASIMD floating-point multiply or
// multiply-accumulate operations.
//
// This pass attempts to detect situations where the register allocation may
// adversely affect this load balancing and to change the registers used so as
// to better utilize the CPU.
//
// Ideally we'd just take each multiply or multiply-accumulate in turn and
// allocate it alternating even or odd registers. However, multiply-accumulates
// are most efficiently performed in the same functional unit as their
// accumulation operand. Therefore this pass tries to find maximal sequences
// ("Chains") of multiply-accumulates linked via their accumulation operand,
// and assign them all the same "color" (oddness/evenness).
//
// This optimization affects S-register and D-register floating point
// multiplies and FMADD/FMAs, as well as vector (floating point only) muls and
// FMADD/FMA. Q register instructions (and 128-bit vector instructions) are
// not affected.
//===----------------------------------------------------------------------===//

#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64Subtarget.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

#define DEBUG_TYPE "aarch64-a57-fp-load-balancing"

// Enforce the algorithm to use the scavenged register even when the original
// destination register is the correct color. Used for testing.
static cl::opt<bool>
TransformAll("aarch64-a57-fp-load-balancing-force-all",
             cl::desc("Always modify dest registers regardless of color"),
             cl::init(false), cl::Hidden);

// Never use the balance information obtained from chains - return a specific
// color always. Used for testing.
static cl::opt<unsigned>
OverrideBalance("aarch64-a57-fp-load-balancing-override",
              cl::desc("Ignore balance information, always return "
                       "(1: Even, 2: Odd)."),
              cl::init(0), cl::Hidden);

//===----------------------------------------------------------------------===//
// Helper functions

// Is the instruction a type of multiply on 64-bit (or 32-bit) FPRs?
static bool isMul(MachineInstr *MI) {
  switch (MI->getOpcode()) {
  case AArch64::FMULSrr:
  case AArch64::FNMULSrr:
  case AArch64::FMULDrr:
  case AArch64::FNMULDrr:
    return true;
  default:
    return false;
  }
}

// Is the instruction a type of FP multiply-accumulate on 64-bit (or 32-bit) FPRs?
static bool isMla(MachineInstr *MI) {
  switch (MI->getOpcode()) {
  case AArch64::FMSUBSrrr:
  case AArch64::FMADDSrrr:
  case AArch64::FNMSUBSrrr:
  case AArch64::FNMADDSrrr:
  case AArch64::FMSUBDrrr:
  case AArch64::FMADDDrrr:
  case AArch64::FNMSUBDrrr:
  case AArch64::FNMADDDrrr:
    return true;
  default:
    return false;
  }
}

//===----------------------------------------------------------------------===//

namespace {
/// A "color", which is either even or odd. Yes, these aren't really colors
/// but the algorithm is conceptually doing two-color graph coloring.
enum class Color { Even, Odd };
#ifndef NDEBUG
static const char *ColorNames[2] = { "Even", "Odd" };
#endif

class Chain;

class AArch64A57FPLoadBalancing : public MachineFunctionPass {
  MachineRegisterInfo *MRI;
  const TargetRegisterInfo *TRI;
  RegisterClassInfo RCI;

public:
  static char ID;
  explicit AArch64A57FPLoadBalancing() : MachineFunctionPass(ID) {
    initializeAArch64A57FPLoadBalancingPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &F) override;

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::NoVRegs);
  }

  StringRef getPassName() const override {
    return "A57 FP Anti-dependency breaker";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

private:
  bool runOnBasicBlock(MachineBasicBlock &MBB);
  bool colorChainSet(std::vector<Chain*> GV, MachineBasicBlock &MBB,
                     int &Balance);
  bool colorChain(Chain *G, Color C, MachineBasicBlock &MBB);
  int scavengeRegister(Chain *G, Color C, MachineBasicBlock &MBB);
  void scanInstruction(MachineInstr *MI, unsigned Idx,
                       std::map<unsigned, Chain*> &Active,
                       std::vector<std::unique_ptr<Chain>> &AllChains);
  void maybeKillChain(MachineOperand &MO, unsigned Idx,
                      std::map<unsigned, Chain*> &RegChains);
  Color getColor(unsigned Register);
  Chain *getAndEraseNext(Color PreferredColor, std::vector<Chain*> &L);
};
}

char AArch64A57FPLoadBalancing::ID = 0;

INITIALIZE_PASS_BEGIN(AArch64A57FPLoadBalancing, DEBUG_TYPE,
                      "AArch64 A57 FP Load-Balancing", false, false)
INITIALIZE_PASS_END(AArch64A57FPLoadBalancing, DEBUG_TYPE,
                    "AArch64 A57 FP Load-Balancing", false, false)

namespace {
/// A Chain is a sequence of instructions that are linked together by
/// an accumulation operand. For example:
///
///   fmul def d0, ?
///   fmla def d1, ?, ?, killed d0
///   fmla def d2, ?, ?, killed d1
///
/// There may be other instructions interleaved in the sequence that
/// do not belong to the chain. These other instructions must not use
/// the "chain" register at any point.
///
/// We currently only support chains where the "chain" operand is killed
/// at each link in the chain for simplicity.
/// A chain has three important instructions - Start, Last and Kill.
///   * The start instruction is the first instruction in the chain.
///   * Last is the final instruction in the chain.
///   * Kill may or may not be defined. If defined, Kill is the instruction
///     where the outgoing value of the Last instruction is killed.
///     This information is important as if we know the outgoing value is
///     killed with no intervening uses, we can safely change its register.
///
/// Without a kill instruction, we must assume the outgoing value escapes
/// beyond our model and either must not change its register or must
/// create a fixup FMOV to keep the old register value consistent.
///
class Chain {
public:
  /// The important (marker) instructions.
  MachineInstr *StartInst, *LastInst, *KillInst;
  /// The index, from the start of the basic block, that each marker
  /// appears. These are stored so we can do quick interval tests.
  unsigned StartInstIdx, LastInstIdx, KillInstIdx;
  /// All instructions in the chain.
  std::set<MachineInstr*> Insts;
  /// True if KillInst cannot be modified. If this is true,
  /// we cannot change LastInst's outgoing register.
  /// This will be true for tied values and regmasks.
  bool KillIsImmutable;
  /// The "color" of LastInst. This will be the preferred chain color,
  /// as changing intermediate nodes is easy but changing the last
  /// instruction can be more tricky.
  Color LastColor;

  Chain(MachineInstr *MI, unsigned Idx, Color C)
      : StartInst(MI), LastInst(MI), KillInst(nullptr),
        StartInstIdx(Idx), LastInstIdx(Idx), KillInstIdx(0),
        LastColor(C) {
    Insts.insert(MI);
  }

  /// Add a new instruction into the chain. The instruction's dest operand
  /// has the given color.
  void add(MachineInstr *MI, unsigned Idx, Color C) {
    LastInst = MI;
    LastInstIdx = Idx;
    LastColor = C;
    assert((KillInstIdx == 0 || LastInstIdx < KillInstIdx) &&
           "Chain: broken invariant. A Chain can only be killed after its last "
           "def");

    Insts.insert(MI);
  }

  /// Return true if MI is a member of the chain.
  bool contains(MachineInstr &MI) { return Insts.count(&MI) > 0; }

  /// Return the number of instructions in the chain.
  unsigned size() const {
    return Insts.size();
  }

  /// Inform the chain that its last active register (the dest register of
  /// LastInst) is killed by MI with no intervening uses or defs.
  void setKill(MachineInstr *MI, unsigned Idx, bool Immutable) {
    KillInst = MI;
    KillInstIdx = Idx;
    KillIsImmutable = Immutable;
    assert((KillInstIdx == 0 || LastInstIdx < KillInstIdx) &&
           "Chain: broken invariant. A Chain can only be killed after its last "
           "def");
  }

  /// Return the first instruction in the chain.
  MachineInstr *getStart() const { return StartInst; }
  /// Return the last instruction in the chain.
  MachineInstr *getLast() const { return LastInst; }
  /// Return the "kill" instruction (as set with setKill()) or NULL.
  MachineInstr *getKill() const { return KillInst; }
  /// Return an instruction that can be used as an iterator for the end
  /// of the chain. This is the maximum of KillInst (if set) and LastInst.
  MachineBasicBlock::iterator end() const {
    return ++MachineBasicBlock::iterator(KillInst ? KillInst : LastInst);
  }
  MachineBasicBlock::iterator begin() const { return getStart(); }

  /// Can the Kill instruction (assuming one exists) be modified?
  bool isKillImmutable() const { return KillIsImmutable; }

  /// Return the preferred color of this chain.
  Color getPreferredColor() {
    if (OverrideBalance != 0)
      return OverrideBalance == 1 ? Color::Even : Color::Odd;
    return LastColor;
  }

  /// Return true if this chain (StartInst..KillInst) overlaps with Other.
  bool rangeOverlapsWith(const Chain &Other) const {
    unsigned End = KillInst ? KillInstIdx : LastInstIdx;
    unsigned OtherEnd = Other.KillInst ?
      Other.KillInstIdx : Other.LastInstIdx;

    return StartInstIdx <= OtherEnd && Other.StartInstIdx <= End;
  }

  /// Return true if this chain starts before Other.
  bool startsBefore(const Chain *Other) const {
    return StartInstIdx < Other->StartInstIdx;
  }

  /// Return true if the group will require a fixup MOV at the end.
  bool requiresFixup() const {
    return (getKill() && isKillImmutable()) || !getKill();
  }

  /// Return a simple string representation of the chain.
  std::string str() const {
    std::string S;
    raw_string_ostream OS(S);

    OS << "{";
    StartInst->print(OS, /* SkipOpers= */true);
    OS << " -> ";
    LastInst->print(OS, /* SkipOpers= */true);
    if (KillInst) {
      OS << " (kill @ ";
      KillInst->print(OS, /* SkipOpers= */true);
      OS << ")";
    }
    OS << "}";

    return OS.str();
  }

};

} // end anonymous namespace

//===----------------------------------------------------------------------===//

bool AArch64A57FPLoadBalancing::runOnMachineFunction(MachineFunction &F) {
  if (skipFunction(F.getFunction()))
    return false;

  if (!F.getSubtarget<AArch64Subtarget>().balanceFPOps())
    return false;

  bool Changed = false;
  LLVM_DEBUG(dbgs() << "***** AArch64A57FPLoadBalancing *****\n");

  MRI = &F.getRegInfo();
  TRI = F.getRegInfo().getTargetRegisterInfo();
  RCI.runOnMachineFunction(F);

  for (auto &MBB : F) {
    Changed |= runOnBasicBlock(MBB);
  }

  return Changed;
}

bool AArch64A57FPLoadBalancing::runOnBasicBlock(MachineBasicBlock &MBB) {
  bool Changed = false;
  LLVM_DEBUG(dbgs() << "Running on MBB: " << MBB
                    << " - scanning instructions...\n");

  // First, scan the basic block producing a set of chains.

  // The currently "active" chains - chains that can be added to and haven't
  // been killed yet. This is keyed by register - all chains can only have one
  // "link" register between each inst in the chain.
  std::map<unsigned, Chain*> ActiveChains;
  std::vector<std::unique_ptr<Chain>> AllChains;
  unsigned Idx = 0;
  for (auto &MI : MBB)
    scanInstruction(&MI, Idx++, ActiveChains, AllChains);

  LLVM_DEBUG(dbgs() << "Scan complete, " << AllChains.size()
                    << " chains created.\n");

  // Group the chains into disjoint sets based on their liveness range. This is
  // a poor-man's version of graph coloring. Ideally we'd create an interference
  // graph and perform full-on graph coloring on that, but;
  //   (a) That's rather heavyweight for only two colors.
  //   (b) We expect multiple disjoint interference regions - in practice the live
  //       range of chains is quite small and they are clustered between loads
  //       and stores.
  EquivalenceClasses<Chain*> EC;
  for (auto &I : AllChains)
    EC.insert(I.get());

  for (auto &I : AllChains)
    for (auto &J : AllChains)
      if (I != J && I->rangeOverlapsWith(*J))
        EC.unionSets(I.get(), J.get());
  LLVM_DEBUG(dbgs() << "Created " << EC.getNumClasses() << " disjoint sets.\n");

  // Now we assume that every member of an equivalence class interferes
  // with every other member of that class, and with no members of other classes.

  // Convert the EquivalenceClasses to a simpler set of sets.
  std::vector<std::vector<Chain*> > V;
  for (auto I = EC.begin(), E = EC.end(); I != E; ++I) {
    std::vector<Chain*> Cs(EC.member_begin(I), EC.member_end());
    if (Cs.empty()) continue;
    V.push_back(std::move(Cs));
  }

  // Now we have a set of sets, order them by start address so
  // we can iterate over them sequentially.
  llvm::sort(V,
             [](const std::vector<Chain *> &A, const std::vector<Chain *> &B) {
               return A.front()->startsBefore(B.front());
             });

  // As we only have two colors, we can track the global (BB-level) balance of
  // odds versus evens. We aim to keep this near zero to keep both execution
  // units fed.
  // Positive means we're even-heavy, negative we're odd-heavy.
  //
  // FIXME: If chains have interdependencies, for example:
  //   mul r0, r1, r2
  //   mul r3, r0, r1
  // We do not model this and may color each one differently, assuming we'll
  // get ILP when we obviously can't. This hasn't been seen to be a problem
  // in practice so far, so we simplify the algorithm by ignoring it.
  int Parity = 0;

  for (auto &I : V)
    Changed |= colorChainSet(std::move(I), MBB, Parity);

  return Changed;
}

Chain *AArch64A57FPLoadBalancing::getAndEraseNext(Color PreferredColor,
                                                  std::vector<Chain*> &L) {
  if (L.empty())
    return nullptr;

  // We try and get the best candidate from L to color next, given that our
  // preferred color is "PreferredColor". L is ordered from larger to smaller
  // chains. It is beneficial to color the large chains before the small chains,
  // but if we can't find a chain of the maximum length with the preferred color,
  // we fuzz the size and look for slightly smaller chains before giving up and
  // returning a chain that must be recolored.

  // FIXME: Does this need to be configurable?
  const unsigned SizeFuzz = 1;
  unsigned MinSize = L.front()->size() - SizeFuzz;
  for (auto I = L.begin(), E = L.end(); I != E; ++I) {
    if ((*I)->size() <= MinSize) {
      // We've gone past the size limit. Return the previous item.
      Chain *Ch = *--I;
      L.erase(I);
      return Ch;
    }

    if ((*I)->getPreferredColor() == PreferredColor) {
      Chain *Ch = *I;
      L.erase(I);
      return Ch;
    }
  }

  // Bailout case - just return the first item.
  Chain *Ch = L.front();
  L.erase(L.begin());
  return Ch;
}

bool AArch64A57FPLoadBalancing::colorChainSet(std::vector<Chain*> GV,
                                              MachineBasicBlock &MBB,
                                              int &Parity) {
  bool Changed = false;
  LLVM_DEBUG(dbgs() << "colorChainSet(): #sets=" << GV.size() << "\n");

  // Sort by descending size order so that we allocate the most important
  // sets first.
  // Tie-break equivalent sizes by sorting chains requiring fixups before
  // those without fixups. The logic here is that we should look at the
  // chains that we cannot change before we look at those we can,
  // so the parity counter is updated and we know what color we should
  // change them to!
  // Final tie-break with instruction order so pass output is stable (i.e. not
  // dependent on malloc'd pointer values).
  llvm::sort(GV, [](const Chain *G1, const Chain *G2) {
    if (G1->size() != G2->size())
      return G1->size() > G2->size();
    if (G1->requiresFixup() != G2->requiresFixup())
      return G1->requiresFixup() > G2->requiresFixup();
    // Make sure startsBefore() produces a stable final order.
    assert((G1 == G2 || (G1->startsBefore(G2) ^ G2->startsBefore(G1))) &&
           "Starts before not total order!");
    return G1->startsBefore(G2);
  });

  Color PreferredColor = Parity < 0 ? Color::Even : Color::Odd;
  while (Chain *G = getAndEraseNext(PreferredColor, GV)) {
    // Start off by assuming we'll color to our own preferred color.
    Color C = PreferredColor;
    if (Parity == 0)
      // But if we really don't care, use the chain's preferred color.
      C = G->getPreferredColor();

    LLVM_DEBUG(dbgs() << " - Parity=" << Parity
                      << ", Color=" << ColorNames[(int)C] << "\n");

    // If we'll need a fixup FMOV, don't bother. Testing has shown that this
    // happens infrequently and when it does it has at least a 50% chance of
    // slowing code down instead of speeding it up.
    if (G->requiresFixup() && C != G->getPreferredColor()) {
      C = G->getPreferredColor();
      LLVM_DEBUG(dbgs() << " - " << G->str()
                        << " - not worthwhile changing; "
                           "color remains "
                        << ColorNames[(int)C] << "\n");
    }

    Changed |= colorChain(G, C, MBB);

    Parity += (C == Color::Even) ? G->size() : -G->size();
    PreferredColor = Parity < 0 ? Color::Even : Color::Odd;
  }

  return Changed;
}

int AArch64A57FPLoadBalancing::scavengeRegister(Chain *G, Color C,
                                                MachineBasicBlock &MBB) {
  // Can we find an appropriate register that is available throughout the life
  // of the chain? Simulate liveness backwards until the end of the chain.
  LiveRegUnits Units(*TRI);
  Units.addLiveOuts(MBB);
  MachineBasicBlock::iterator I = MBB.end();
  MachineBasicBlock::iterator ChainEnd = G->end();
  while (I != ChainEnd) {
    --I;
    Units.stepBackward(*I);
  }

  // Check which register units are alive throughout the chain.
  MachineBasicBlock::iterator ChainBegin = G->begin();
  assert(ChainBegin != ChainEnd && "Chain should contain instructions");
  do {
    --I;
    Units.accumulate(*I);
  } while (I != ChainBegin);

  // Make sure we allocate in-order, to get the cheapest registers first.
  unsigned RegClassID = ChainBegin->getDesc().OpInfo[0].RegClass;
  auto Ord = RCI.getOrder(TRI->getRegClass(RegClassID));
  for (auto Reg : Ord) {
    if (!Units.available(Reg))
      continue;
    if (C == getColor(Reg))
      return Reg;
  }

  return -1;
}

bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C,
                                           MachineBasicBlock &MBB) {
  bool Changed = false;
  LLVM_DEBUG(dbgs() << " - colorChain(" << G->str() << ", "
                    << ColorNames[(int)C] << ")\n");

  // Try and obtain a free register of the right class. Without a register
  // to play with we cannot continue.
  int Reg = scavengeRegister(G, C, MBB);
  if (Reg == -1) {
    LLVM_DEBUG(dbgs() << "Scavenging (thus coloring) failed!\n");
    return false;
  }
  LLVM_DEBUG(dbgs() << " - Scavenged register: " << printReg(Reg, TRI) << "\n");

  std::map<unsigned, unsigned> Substs;
  for (MachineInstr &I : *G) {
    if (!G->contains(I) && (&I != G->getKill() || G->isKillImmutable()))
      continue;

    // I is a member of G, or I is a mutable instruction that kills G.

    std::vector<unsigned> ToErase;
    for (auto &U : I.operands()) {
      if (U.isReg() && U.isUse() && Substs.find(U.getReg()) != Substs.end()) {
        Register OrigReg = U.getReg();
        U.setReg(Substs[OrigReg]);
        if (U.isKill())
          // Don't erase straight away, because there may be other operands
          // that also reference this substitution!
          ToErase.push_back(OrigReg);
      } else if (U.isRegMask()) {
        for (auto J : Substs) {
          if (U.clobbersPhysReg(J.first))
            ToErase.push_back(J.first);
        }
      }
    }
    // Now it's safe to remove the substs identified earlier.
    for (auto J : ToErase)
      Substs.erase(J);

    // Only change the def if this isn't the last instruction.
    if (&I != G->getKill()) {
      MachineOperand &MO = I.getOperand(0);

      bool Change = TransformAll || getColor(MO.getReg()) != C;
      if (G->requiresFixup() && &I == G->getLast())
        Change = false;

      if (Change) {
        Substs[MO.getReg()] = Reg;
        MO.setReg(Reg);

        Changed = true;
      }
    }
  }
  assert(Substs.size() == 0 && "No substitutions should be left active!");

  if (G->getKill()) {
    LLVM_DEBUG(dbgs() << " - Kill instruction seen.\n");
  } else {
    // We didn't have a kill instruction, but we didn't seem to need to change
    // the destination register anyway.
    LLVM_DEBUG(dbgs() << " - Destination register not changed.\n");
  }
  return Changed;
}

void AArch64A57FPLoadBalancing::scanInstruction(
    MachineInstr *MI, unsigned Idx, std::map<unsigned, Chain *> &ActiveChains,
    std::vector<std::unique_ptr<Chain>> &AllChains) {
  // Inspect "MI", updating ActiveChains and AllChains.

  if (isMul(MI)) {

    for (auto &I : MI->uses())
      maybeKillChain(I, Idx, ActiveChains);
    for (auto &I : MI->defs())
      maybeKillChain(I, Idx, ActiveChains);

    // Create a new chain. Multiplies don't require forwarding so can go on any
    // unit.
    Register DestReg = MI->getOperand(0).getReg();

    LLVM_DEBUG(dbgs() << "New chain started for register "
                      << printReg(DestReg, TRI) << " at " << *MI);

    auto G = std::make_unique<Chain>(MI, Idx, getColor(DestReg));
    ActiveChains[DestReg] = G.get();
    AllChains.push_back(std::move(G));

  } else if (isMla(MI)) {

    // It is beneficial to keep MLAs on the same functional unit as their
    // accumulator operand.
    Register DestReg = MI->getOperand(0).getReg();
    Register AccumReg = MI->getOperand(3).getReg();

    maybeKillChain(MI->getOperand(1), Idx, ActiveChains);
    maybeKillChain(MI->getOperand(2), Idx, ActiveChains);
    if (DestReg != AccumReg)
      maybeKillChain(MI->getOperand(0), Idx, ActiveChains);

    if (ActiveChains.find(AccumReg) != ActiveChains.end()) {
      LLVM_DEBUG(dbgs() << "Chain found for accumulator register "
                        << printReg(AccumReg, TRI) << " in MI " << *MI);

      // For simplicity we only chain together sequences of MULs/MLAs where the
      // accumulator register is killed on each instruction. This means we don't
      // need to track other uses of the registers we want to rewrite.
      //
      // FIXME: We could extend to handle the non-kill cases for more coverage.
      if (MI->getOperand(3).isKill()) {
        // Add to chain.
        LLVM_DEBUG(dbgs() << "Instruction was successfully added to chain.\n");
        ActiveChains[AccumReg]->add(MI, Idx, getColor(DestReg));
        // Handle cases where the destination is not the same as the accumulator.
        if (DestReg != AccumReg) {
          ActiveChains[DestReg] = ActiveChains[AccumReg];
          ActiveChains.erase(AccumReg);
        }
        return;
      }

      LLVM_DEBUG(
          dbgs() << "Cannot add to chain because accumulator operand wasn't "
                 << "marked <kill>!\n");
      maybeKillChain(MI->getOperand(3), Idx, ActiveChains);
    }

    LLVM_DEBUG(dbgs() << "Creating new chain for dest register "
                      << printReg(DestReg, TRI) << "\n");
    auto G = std::make_unique<Chain>(MI, Idx, getColor(DestReg));
    ActiveChains[DestReg] = G.get();
    AllChains.push_back(std::move(G));

  } else {

    // Non-MUL or MLA instruction. Invalidate any chain in the uses or defs
    // lists.
    for (auto &I : MI->uses())
      maybeKillChain(I, Idx, ActiveChains);
    for (auto &I : MI->defs())
      maybeKillChain(I, Idx, ActiveChains);

  }
}

void AArch64A57FPLoadBalancing::
maybeKillChain(MachineOperand &MO, unsigned Idx,
               std::map<unsigned, Chain*> &ActiveChains) {
  // Given an operand and the set of active chains (keyed by register),
  // determine if a chain should be ended and remove from ActiveChains.
  MachineInstr *MI = MO.getParent();

  if (MO.isReg()) {

    // If this is a KILL of a current chain, record it.
    if (MO.isKill() && ActiveChains.find(MO.getReg()) != ActiveChains.end()) {
      LLVM_DEBUG(dbgs() << "Kill seen for chain " << printReg(MO.getReg(), TRI)
                        << "\n");
      ActiveChains[MO.getReg()]->setKill(MI, Idx, /*Immutable=*/MO.isTied());
    }
    ActiveChains.erase(MO.getReg());

  } else if (MO.isRegMask()) {

    for (auto I = ActiveChains.begin(), E = ActiveChains.end();
         I != E;) {
      if (MO.clobbersPhysReg(I->first)) {
        LLVM_DEBUG(dbgs() << "Kill (regmask) seen for chain "
                          << printReg(I->first, TRI) << "\n");
        I->second->setKill(MI, Idx, /*Immutable=*/true);
        ActiveChains.erase(I++);
      } else
        ++I;
    }

  }
}

Color AArch64A57FPLoadBalancing::getColor(unsigned Reg) {
  if ((TRI->getEncodingValue(Reg) % 2) == 0)
    return Color::Even;
  else
    return Color::Odd;
}

// Factory function used by AArch64TargetMachine to add the pass to the passmanager.
FunctionPass *llvm::createAArch64A57FPLoadBalancing() {
  return new AArch64A57FPLoadBalancing();
}