Training courses

Kernel and Embedded Linux

Bootlin training courses

Embedded Linux, kernel,
Yocto Project, Buildroot, real-time,
graphics, boot time, debugging...

Bootlin logo

Elixir Cross Referencer

//=== AArch64PostSelectOptimize.cpp ---------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does post-instruction-selection optimizations in the GlobalISel
// pipeline, before the rest of codegen runs.
//
//===----------------------------------------------------------------------===//

#include "AArch64.h"
#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "aarch64-post-select-optimize"

using namespace llvm;

namespace {
class AArch64PostSelectOptimize : public MachineFunctionPass {
public:
  static char ID;

  AArch64PostSelectOptimize();

  StringRef getPassName() const override {
    return "AArch64 Post Select Optimizer";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool optimizeNZCVDefs(MachineBasicBlock &MBB);
};
} // end anonymous namespace

void AArch64PostSelectOptimize::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  MachineFunctionPass::getAnalysisUsage(AU);
}

AArch64PostSelectOptimize::AArch64PostSelectOptimize()
    : MachineFunctionPass(ID) {
  initializeAArch64PostSelectOptimizePass(*PassRegistry::getPassRegistry());
}

unsigned getNonFlagSettingVariant(unsigned Opc) {
  switch (Opc) {
  default:
    return 0;
  case AArch64::SUBSXrr:
    return AArch64::SUBXrr;
  case AArch64::SUBSWrr:
    return AArch64::SUBWrr;
  case AArch64::SUBSXrs:
    return AArch64::SUBXrs;
  case AArch64::SUBSXri:
    return AArch64::SUBXri;
  case AArch64::SUBSWri:
    return AArch64::SUBWri;
  }
}

bool AArch64PostSelectOptimize::optimizeNZCVDefs(MachineBasicBlock &MBB) {
  // Consider the following code:
  //  FCMPSrr %0, %1, implicit-def $nzcv
  //  %sel1:gpr32 = CSELWr %_, %_, 12, implicit $nzcv
  //  %sub:gpr32 = SUBSWrr %_, %_, implicit-def $nzcv
  //  FCMPSrr %0, %1, implicit-def $nzcv
  //  %sel2:gpr32 = CSELWr %_, %_, 12, implicit $nzcv
  // This kind of code where we have 2 FCMPs each feeding a CSEL can happen
  // when we have a single IR fcmp being used by two selects. During selection,
  // to ensure that there can be no clobbering of nzcv between the fcmp and the
  // csel, we have to generate an fcmp immediately before each csel is
  // selected.
  // However, often we can essentially CSE these together later in MachineCSE.
  // This doesn't work though if there are unrelated flag-setting instructions
  // in between the two FCMPs. In this case, the SUBS defines NZCV
  // but it doesn't have any users, being overwritten by the second FCMP.
  //
  // Our solution here is to try to convert flag setting operations between
  // a interval of identical FCMPs, so that CSE will be able to eliminate one.
  bool Changed = false;
  auto &MF = *MBB.getParent();
  auto &Subtarget = MF.getSubtarget();
  const auto &TII = Subtarget.getInstrInfo();
  auto TRI = Subtarget.getRegisterInfo();
  auto RBI = Subtarget.getRegBankInfo();
  auto &MRI = MF.getRegInfo();

  // The first step is to find the first and last FCMPs. If we have found
  // at least two, then set the limit of the bottom-up walk to the first FCMP
  // found since we're only interested in dealing with instructions between
  // them.
  MachineInstr *FirstCmp = nullptr, *LastCmp = nullptr;
  for (auto &MI : instructionsWithoutDebug(MBB.begin(), MBB.end())) {
    if (MI.getOpcode() == AArch64::FCMPSrr ||
        MI.getOpcode() == AArch64::FCMPDrr) {
      if (!FirstCmp)
        FirstCmp = &MI;
      else
        LastCmp = &MI;
    }
  }

  // In addition to converting flag-setting ops in fcmp ranges into non-flag
  // setting ops, across the whole basic block we also detect when nzcv
  // implicit-defs are dead, and mark them as dead. Peephole optimizations need
  // this information later.

  LiveRegUnits LRU(*MBB.getParent()->getSubtarget().getRegisterInfo());
  LRU.addLiveOuts(MBB);
  bool NZCVDead = LRU.available(AArch64::NZCV);
  bool InsideCmpRange = false;
  for (auto &II : instructionsWithoutDebug(MBB.rbegin(), MBB.rend())) {
    LRU.stepBackward(II);

    if (LastCmp) { // There's a range present in this block.
      // If we're inside an fcmp range, look for begin instruction.
      if (InsideCmpRange && &II == FirstCmp)
        InsideCmpRange = false;
      else if (&II == LastCmp)
        InsideCmpRange = true;
    }

    // Did this instruction define NZCV?
    bool NZCVDeadAtCurrInstr = LRU.available(AArch64::NZCV);
    if (NZCVDead && NZCVDeadAtCurrInstr && II.definesRegister(AArch64::NZCV)) {
      // If we have a def and NZCV is dead, then we may convert this op.
      unsigned NewOpc = getNonFlagSettingVariant(II.getOpcode());
      int DeadNZCVIdx = II.findRegisterDefOperandIdx(AArch64::NZCV);
      if (DeadNZCVIdx != -1) {
        // If we're inside an fcmp range, then convert flag setting ops.
        if (InsideCmpRange && NewOpc) {
          LLVM_DEBUG(dbgs() << "Post-select optimizer: converting flag-setting "
                               "op in fcmp range: "
                            << II);
          II.setDesc(TII->get(NewOpc));
          II.RemoveOperand(DeadNZCVIdx);
          // Changing the opcode can result in differing regclass requirements,
          // e.g. SUBSWri uses gpr32 for the dest, whereas SUBWri uses gpr32sp.
          // Constrain the regclasses, possibly introducing a copy.
          constrainOperandRegClass(MF, *TRI, MRI, *TII, *RBI, II, II.getDesc(),
                                   II.getOperand(0), 0);
          Changed |= true;
        } else {
          // Otherwise, we just set the nzcv imp-def operand to be dead, so the
          // peephole optimizations can optimize them further.
          II.getOperand(DeadNZCVIdx).setIsDead();
        }
      }
    }

    NZCVDead = NZCVDeadAtCurrInstr;
  }
  return Changed;
}

bool AArch64PostSelectOptimize::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  assert(MF.getProperties().hasProperty(
             MachineFunctionProperties::Property::Selected) &&
         "Expected a selected MF");

  bool Changed = false;
  for (auto &BB : MF)
    Changed |= optimizeNZCVDefs(BB);
  return true;
}

char AArch64PostSelectOptimize::ID = 0;
INITIALIZE_PASS_BEGIN(AArch64PostSelectOptimize, DEBUG_TYPE,
                      "Optimize AArch64 selected instructions",
                      false, false)
INITIALIZE_PASS_END(AArch64PostSelectOptimize, DEBUG_TYPE,
                    "Optimize AArch64 selected instructions", false,
                    false)

namespace llvm {
FunctionPass *createAArch64PostSelectOptimize() {
  return new AArch64PostSelectOptimize();
}
} // end namespace llvm