[SLP][AMDGPU] Reland: Vectorize operands of non-trivially-vectorizable intrinsic calls #194980
mssefat wants to merge 2 commits into
Conversation
Revert "Revert "[SLP][AMDGPU] Vectorize operands of non-trivially-vectorizable intrinsic calls" (llvm#191153)" This reverts commit 9cf8152.
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-risc-v

Author: Syadus Sefat (mssefat)

Changes

The original patch was reverted by 9cf8152 because it exposed AOMP flang-288613* runtime failures. Those tests have since been fixed in ROCm/aomp#2204. Since the revert, SLP introduced -slp-inst-count-check. This reland updates the affected tests accordingly and adds checks for both enabled and disabled modes.

Patch is 61.42 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/194980.diff

4 Files Affected:
diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
index 877c83291170b..4e87abaa93dae 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -148,6 +148,11 @@ struct SLPVectorizerPass : public PassInfoMixin<SLPVectorizerPass> {
bool vectorizeInserts(InstSetVector &Instructions, BasicBlock *BB,
slpvectorizer::BoUpSLP &R);
+ /// Tries to vectorize the operands of the non-trivially-vectorizable
+ /// intrinsic calls.
+ bool vectorizeNonTriviallyVectrizableIntrinsicCallOperand(
+ InstSetVector &IIs, BasicBlock *BB, slpvectorizer::BoUpSLP &R);
+
/// Scan the basic block and look for patterns that are likely to start
/// a vectorization chain.
bool vectorizeChainsInBlock(BasicBlock *BB, slpvectorizer::BoUpSLP &R);
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index cc5a206cacbd0..e1456f0ebbac2 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -270,6 +270,34 @@ static const int MinScheduleRegionSize = 16;
/// Maximum allowed number of operands in the PHI nodes.
static const unsigned MaxPHINumOperands = 128;
+/// For the non-trivially-vectorizable intrinsic calls, try to vectorize their
+/// operands.
+/// FIXME: Extend for all non-vectorized functions.
+SmallVector<Value *, 4>
+getNonTriviallyVectorizableIntrinsicCallOperand(Value *V) {
+
+ auto *II = dyn_cast<IntrinsicInst>(V);
+ if (!II || isAssumeLikeIntrinsic(II))
+ return {};
+
+ if (isTriviallyVectorizable(II->getIntrinsicID()))
+ return {};
+
+ // Skip vector-returning intrinsics in non-revec mode.
+ if (!SLPReVec && II->getType()->isVectorTy())
+ return {};
+
+ // FIXME: Add non-instructions operands to the list.
+ SmallVector<Value *, 4> Operands;
+ for (Value *ArgOp : II->args()) {
+ if (auto *I = dyn_cast<Instruction>(ArgOp)) {
+ Operands.emplace_back(I);
+ }
+ }
+
+ return Operands;
+}
+
/// Predicate for the element types that the SLP vectorizer supports.
///
/// The most important thing to filter here are types which are invalid in LLVM
@@ -30263,6 +30291,89 @@ bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
return Changed;
}
+bool SLPVectorizerPass::vectorizeNonTriviallyVectrizableIntrinsicCallOperand(
+ InstSetVector &IIs, BasicBlock *BB, BoUpSLP &R) {
+
+ bool Changed = false;
+
+ // Pass1 - try to find horizontal reductions of operands.
+ for (Instruction *I : IIs) {
+ auto *II = dyn_cast<IntrinsicInst>(I);
+ if (!II || R.isDeleted(II))
+ continue;
+ for (Value *Op : II->args())
+ if (auto *RootOp = dyn_cast<Instruction>(Op)) {
+ Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
+ if (R.isDeleted(II))
+ break;
+ }
+ }
+ // Operands sorter.
+ auto OperandSorter = [this](Value *V1, Value *V2) -> bool {
+ if (V1 == V2)
+ return false;
+ auto *I1 = cast<Instruction>(V1);
+ auto *I2 = cast<Instruction>(V2);
+ if (I1->getType()->getTypeID() != I2->getType()->getTypeID())
+ return I1->getType()->getTypeID() < I2->getType()->getTypeID();
+ if (I1->getType()->getScalarSizeInBits() !=
+ I2->getType()->getScalarSizeInBits())
+ return I1->getType()->getScalarSizeInBits() <
+ I2->getType()->getScalarSizeInBits();
+ DomTreeNodeBase<BasicBlock> *Node1 = DT->getNode(I1->getParent());
+ DomTreeNodeBase<BasicBlock> *Node2 = DT->getNode(I2->getParent());
+ if (!Node1)
+ return Node2 != nullptr;
+ if (!Node2)
+ return false;
+ if (Node1->getDFSNumIn() == Node2->getDFSNumIn()) {
+ if (I1->getOpcode() != I2->getOpcode())
+ return I1->getOpcode() < I2->getOpcode();
+ return I1->comesBefore(I2);
+ }
+ return Node1->getDFSNumIn() < Node2->getDFSNumIn();
+ };
+
+ // Compatibility checker for the operands.
+ auto AreCompatibleOperands = [](ArrayRef<Value *> VL, Value *V) -> bool {
+ if (VL.empty() || VL.back() == V)
+ return true;
+ auto *I1 = cast<Instruction>(VL.back());
+ auto *I2 = cast<Instruction>(V);
+ return I1->getType() == I2->getType() &&
+ I1->getParent() == I2->getParent() &&
+ I1->getOpcode() == I2->getOpcode();
+ };
+
+ // Collect the operands of the non-trivially-vectorizable intrinsic calls.
+ SmallVector<Value *, 4> CandidateSeeds;
+ for (Instruction *I : IIs) {
+ auto *II = dyn_cast<IntrinsicInst>(I);
+ if (!II || R.isDeleted(II))
+ continue;
+ SmallVector<Value *, 4> Ops =
+ getNonTriviallyVectorizableIntrinsicCallOperand(II);
+ for (Value *Op : Ops)
+ if (isa<Instruction>(Op))
+ CandidateSeeds.push_back(Op);
+ }
+
+ auto CandidatesFiltered = make_filter_range(CandidateSeeds, [&](Value *V) {
+ auto *I = dyn_cast<Instruction>(V);
+ return I && !R.isDeleted(I) && isValidElementType(I->getType());
+ });
+ SmallVector<Value *, 4> CandidateVec(CandidatesFiltered);
+ // Pass2 - try to vectorize the operands.
+ Changed |= tryToVectorizeSequence<Value>(
+ CandidateVec, OperandSorter, AreCompatibleOperands,
+ [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
+ return tryToVectorizeList(Candidates, R, MaxVFOnly);
+ },
+ /*MaxVFOnly=*/true, R);
+
+ return Changed;
+}
+
bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
BasicBlock *BB, BoUpSLP &R) {
assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
@@ -30535,21 +30646,33 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
InstSetVector PostProcessInserts;
SmallSetVector<CmpInst *, 8> PostProcessCmps;
- // Vectorizes Inserts in `PostProcessInserts` and if `VectorizeCmps` is true
- // also vectorizes `PostProcessCmps`.
- auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
+ InstSetVector PostProcessIntrinsicCalls;
+ // Vectorizes Inserts in `PostProcessInserts` and if `AtTerminator` is true
+ // also vectorizes `PostProcessCmps` and `PostProcessIntrinsicCalls`.
+ auto VectorizeInsertsAndCmps = [&](bool AtTerminator) {
bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
- if (VectorizeCmps) {
+ if (AtTerminator) {
Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
PostProcessCmps.clear();
+ Changed |= vectorizeNonTriviallyVectrizableIntrinsicCallOperand(
+ PostProcessIntrinsicCalls, BB, R);
+ PostProcessIntrinsicCalls.clear();
}
PostProcessInserts.clear();
return Changed;
};
+ auto isNonTriviallyVectorizableIntrinsic = [](const IntrinsicInst *II) {
+ return !isTriviallyVectorizable(II->getIntrinsicID()) &&
+ !isAssumeLikeIntrinsic(II) &&
+ !(!SLPReVec && II->getType()->isVectorTy());
+ };
// Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
auto IsInPostProcessInstrs = [&](Instruction *I) {
if (auto *Cmp = dyn_cast<CmpInst>(I))
return PostProcessCmps.contains(Cmp);
+ if (auto *II = dyn_cast<IntrinsicInst>(I);
+ II && isNonTriviallyVectorizableIntrinsic(II))
+ return PostProcessIntrinsicCalls.contains(II);
return isa<InsertElementInst, InsertValueInst>(I) &&
PostProcessInserts.contains(I);
};
@@ -30560,6 +30683,11 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
return I->use_empty() &&
(I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
};
+ SmallMapVector<std::pair<Intrinsic::ID, unsigned>, // (ID, OpIndex)
+ SmallMapVector<unsigned, // Opcode
+ SmallVector<Value *, 4>, 4>,
+ 4>
+ IntrinsicSeedOps;
for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
// Skip instructions with scalable type. The num of elements is unknown at
// compile-time for scalable type.
@@ -30572,7 +30700,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
// We may go through BB multiple times so skip the one we have checked.
if (!VisitedInstrs.insert(&*It).second) {
if (HasNoUsers(&*It) &&
- VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
+ VectorizeInsertsAndCmps(/*AtTerminator=*/It->isTerminator())) {
// We would like to start over since some instructions are deleted
// and the iterator may become invalid value.
Changed = true;
@@ -30652,7 +30780,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
// top-tree instructions to try to vectorize as many instructions as
// possible.
OpsChanged |=
- VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
+ VectorizeInsertsAndCmps(/*AtTerminator=*/It->isTerminator());
if (OpsChanged) {
// We would like to start over since some instructions are deleted
// and the iterator may become invalid value.
@@ -30667,6 +30795,9 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
PostProcessInserts.insert(&*It);
else if (isa<CmpInst>(It))
PostProcessCmps.insert(cast<CmpInst>(&*It));
+ else if (auto *II = dyn_cast<IntrinsicInst>(&*It);
+ II && isNonTriviallyVectorizableIntrinsic(II))
+ PostProcessIntrinsicCalls.insert(II);
}
return Changed;
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/notriviallyvectorizableintrinsicoperands.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/notriviallyvectorizableintrinsicoperands.ll
index 26d61cb6bc28d..b8df8d44968f0 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/notriviallyvectorizableintrinsicoperands.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/notriviallyvectorizableintrinsicoperands.ll
@@ -1,28 +1,51 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
-; RUN: opt -passes=slp-vectorizer -S -mcpu=gfx1250 -mtriple=amdgcn-amd-amdhsa -o - %s | FileCheck %s --check-prefix GCN
+; RUN: opt -passes=slp-vectorizer -slp-inst-count-check=true -S -mcpu=gfx1250 -mtriple=amdgcn-amd-amdhsa -o - %s | FileCheck %s --check-prefixes GCN,INSTCNT
+; RUN: opt -passes=slp-vectorizer -slp-inst-count-check=false -S -mcpu=gfx1250 -mtriple=amdgcn-amd-amdhsa -o - %s | FileCheck %s --check-prefixes GCN,NO-INSTCNT
define amdgpu_kernel void @test_with_wmma( ptr addrspace(1) %input, ptr addrspace(1) %output, float %scaled_max, <16 x i32> %A, <16 x i32> %B, i32 %scale_idx) {
-; GCN-LABEL: define amdgpu_kernel void @test_with_wmma(
-; GCN-SAME: ptr addrspace(1) [[INPUT:%.*]], ptr addrspace(1) [[OUTPUT:%.*]], float [[SCALED_MAX:%.*]], <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], i32 [[SCALE_IDX:%.*]]) #[[ATTR0:[0-9]+]] {
-; GCN-NEXT: [[ENTRY:.*:]]
-; GCN-NEXT: [[IN0:%.*]] = load float, ptr addrspace(1) [[INPUT]], align 4
-; GCN-NEXT: [[PTR1:%.*]] = getelementptr float, ptr addrspace(1) [[INPUT]], i64 1
-; GCN-NEXT: [[IN1:%.*]] = load float, ptr addrspace(1) [[PTR1]], align 4
-; GCN-NEXT: [[MUL0:%.*]] = fmul contract float [[IN0]], 0x3FC0527DC0000000
-; GCN-NEXT: [[MUL1:%.*]] = fmul contract float [[IN1]], 0x3FC0527DC0000000
-; GCN-NEXT: [[SUB0:%.*]] = fsub contract float [[MUL0]], [[SCALED_MAX]]
-; GCN-NEXT: [[SUB1:%.*]] = fsub contract float [[MUL1]], [[SCALED_MAX]]
-; GCN-NEXT: [[EXP0:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[SUB0]])
-; GCN-NEXT: [[EXP1:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[SUB1]])
-; GCN-NEXT: [[VEC0:%.*]] = insertelement <2 x float> poison, float [[EXP0]], i64 0
-; GCN-NEXT: [[VEC1:%.*]] = insertelement <2 x float> [[VEC0]], float [[EXP1]], i64 1
-; GCN-NEXT: [[VEC_I32:%.*]] = bitcast <2 x float> [[VEC1]] to <2 x i32>
-; GCN-NEXT: [[SCALE0:%.*]] = extractelement <2 x i32> [[VEC_I32]], i64 0
-; GCN-NEXT: [[SCALE1:%.*]] = extractelement <2 x i32> [[VEC_I32]], i64 1
-; GCN-NEXT: [[WMMA0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> [[A]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> zeroinitializer, i32 0, i32 0, i32 [[SCALE0]], i32 0, i32 0, i32 [[SCALE_IDX]], i1 false, i1 false)
-; GCN-NEXT: [[WMMA1:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> [[A]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> [[WMMA0]], i32 0, i32 0, i32 [[SCALE1]], i32 0, i32 0, i32 [[SCALE_IDX]], i1 false, i1 false)
-; GCN-NEXT: store <8 x float> [[WMMA1]], ptr addrspace(1) [[OUTPUT]], align 32
-; GCN-NEXT: ret void
+; INSTCNT-LABEL: define amdgpu_kernel void @test_with_wmma(
+; INSTCNT-SAME: ptr addrspace(1) [[INPUT:%.*]], ptr addrspace(1) [[OUTPUT:%.*]], float [[SCALED_MAX:%.*]], <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], i32 [[SCALE_IDX:%.*]]) #[[ATTR0:[0-9]+]] {
+; INSTCNT-NEXT: [[ENTRY:.*:]]
+; INSTCNT-NEXT: [[IN0:%.*]] = load float, ptr addrspace(1) [[INPUT]], align 4
+; INSTCNT-NEXT: [[PTR1:%.*]] = getelementptr float, ptr addrspace(1) [[INPUT]], i64 1
+; INSTCNT-NEXT: [[IN1:%.*]] = load float, ptr addrspace(1) [[PTR1]], align 4
+; INSTCNT-NEXT: [[MUL0:%.*]] = fmul contract float [[IN0]], 0x3FC0527DC0000000
+; INSTCNT-NEXT: [[MUL1:%.*]] = fmul contract float [[IN1]], 0x3FC0527DC0000000
+; INSTCNT-NEXT: [[SUB0:%.*]] = fsub contract float [[MUL0]], [[SCALED_MAX]]
+; INSTCNT-NEXT: [[SUB1:%.*]] = fsub contract float [[MUL1]], [[SCALED_MAX]]
+; INSTCNT-NEXT: [[EXP0:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[SUB0]])
+; INSTCNT-NEXT: [[EXP1:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[SUB1]])
+; INSTCNT-NEXT: [[VEC0:%.*]] = insertelement <2 x float> poison, float [[EXP0]], i64 0
+; INSTCNT-NEXT: [[VEC1:%.*]] = insertelement <2 x float> [[VEC0]], float [[EXP1]], i64 1
+; INSTCNT-NEXT: [[VEC_I32:%.*]] = bitcast <2 x float> [[VEC1]] to <2 x i32>
+; INSTCNT-NEXT: [[SCALE0:%.*]] = extractelement <2 x i32> [[VEC_I32]], i64 0
+; INSTCNT-NEXT: [[SCALE1:%.*]] = extractelement <2 x i32> [[VEC_I32]], i64 1
+; INSTCNT-NEXT: [[WMMA0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> [[A]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> zeroinitializer, i32 0, i32 0, i32 [[SCALE0]], i32 0, i32 0, i32 [[SCALE_IDX]], i1 false, i1 false)
+; INSTCNT-NEXT: [[WMMA1:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> [[A]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> [[WMMA0]], i32 0, i32 0, i32 [[SCALE1]], i32 0, i32 0, i32 [[SCALE_IDX]], i1 false, i1 false)
+; INSTCNT-NEXT: store <8 x float> [[WMMA1]], ptr addrspace(1) [[OUTPUT]], align 32
+; INSTCNT-NEXT: ret void
+;
+; NO-INSTCNT-LABEL: define amdgpu_kernel void @test_with_wmma(
+; NO-INSTCNT-SAME: ptr addrspace(1) [[INPUT:%.*]], ptr addrspace(1) [[OUTPUT:%.*]], float [[SCALED_MAX:%.*]], <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], i32 [[SCALE_IDX:%.*]]) #[[ATTR0:[0-9]+]] {
+; NO-INSTCNT-NEXT: [[ENTRY:.*:]]
+; NO-INSTCNT-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr addrspace(1) [[INPUT]], align 4
+; NO-INSTCNT-NEXT: [[TMP1:%.*]] = fmul contract <2 x float> [[TMP0]], splat (float 0x3FC0527DC0000000)
+; NO-INSTCNT-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[SCALED_MAX]], i32 0
+; NO-INSTCNT-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer
+; NO-INSTCNT-NEXT: [[TMP4:%.*]] = fsub contract <2 x float> [[TMP1]], [[TMP3]]
+; NO-INSTCNT-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
+; NO-INSTCNT-NEXT: [[EXP0:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[TMP5]])
+; NO-INSTCNT-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
+; NO-INSTCNT-NEXT: [[EXP1:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[TMP6]])
+; NO-INSTCNT-NEXT: [[VEC0:%.*]] = insertelement <2 x float> poison, float [[EXP0]], i64 0
+; NO-INSTCNT-NEXT: [[VEC1:%.*]] = insertelement <2 x float> [[VEC0]], float [[EXP1]], i64 1
+; NO-INSTCNT-NEXT: [[VEC_I32:%.*]] = bitcast <2 x float> [[VEC1]] to <2 x i32>
+; NO-INSTCNT-NEXT: [[SCALE0:%.*]] = extractelement <2 x i32> [[VEC_I32]], i64 0
+; NO-INSTCNT-NEXT: [[SCALE1:%.*]] = extractelement <2 x i32> [[VEC_I32]], i64 1
+; NO-INSTCNT-NEXT: [[WMMA0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> [[A]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> zeroinitializer, i32 0, i32 0, i32 [[SCALE0]], i32 0, i32 0, i32 [[SCALE_IDX]], i1 false, i1 false)
+; NO-INSTCNT-NEXT: [[WMMA1:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> [[A]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> [[WMMA0]], i32 0, i32 0, i32 [[SCALE1]], i32 0, i32 0, i32 [[SCALE_IDX]], i1 false, i1 false)
+; NO-INSTCNT-NEXT: store <8 x float> [[WMMA1]], ptr addrspace(1) [[OUTPUT]], align 32
+; NO-INSTCNT-NEXT: ret void
;
entry:
@@ -61,21 +84,17 @@ entry:
define amdgpu_kernel void @test_amdgcn_exp_log(ptr addrspace(1) %input, ptr addrspace(1) %scales, ptr addrspace(1) %output) {
; GCN-LABEL: define amdgpu_kernel void @test_amdgcn_exp_log(
-; GCN-SAME: ptr addrspace(1) [[INPUT:%.*]], ptr addrspace(1) [[SCALES:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
+; GCN-SAME: ptr addrspace(1) [[INPUT:%.*]], ptr addrspace(1) [[SCALES:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0:[0-9]+]] {
; GCN-NEXT: [[ENTRY:.*:]]
-; GCN-NEXT: [[IN0:%.*]] = load float, ptr addrspace(1) [[INPUT]], align 4
-; GCN-NEXT: [[PTR1:%.*]] = getelementptr float, ptr addrspace(1) [[INPUT]], i64 1
-; GCN-NEXT: [[IN1:%.*]] = load float, ptr addrspace(1) [[PTR1]], align 4
-; GCN-NEXT: [[SCALE0:%.*]] = load float, ptr addrspace(1) [[SCALES]], align 4
-; GCN-NEXT: [[SPTR1:%.*]] = getelementptr float, ptr addrspace(1) [[SCALES]], i64 1
-; GCN-NEXT: [[SCALE1:%.*]] = load float, ptr addrspace(1) [[SPTR1]], align 4
-; GCN-NEXT: [[MUL0:%.*]] = fmul contract float [[IN0]], 0x3FC0527DC0000000
-; GCN-NEXT: [[MUL1:%.*]] = fmul contract float [[IN1]], 0x3FC0527DC0000000
-; GCN-NEXT: [[SUB0:%.*]] = fsub contract float [[MUL0]], [[SCALE0]]
-; GCN-NEXT: [[SUB1:%.*]] = fsub contract float [[MUL1]], [[SCALE1]]
-; GCN-NEXT: [[EXP0:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[SUB0]])
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr addrspace(1) [[INPUT]], align 4
+; GCN-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(1) [[SCALES]], align 4
+; GCN-NEXT: [[TMP2:%.*]] = fmul contract <2 x float> [[TMP0]], splat (float 0x3FC0527DC0000000)
+; GCN-NEXT: [[TMP3:%.*]] = fsub contract <2 x float> [[TMP2]], [[TMP1]]
+; GCN-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; GCN-NEXT: [[EXP0:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[TMP4]])
; GCN-NEXT: [[LOG0:%.*]] = tail call float @llvm.amdgcn.log.f32(float [[EXP0]])
-; GCN-NEXT: [[EXP1:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[SUB1]])
+; GCN-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; GCN-NEXT: [[EXP1:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[TMP5]])
; GCN-NEXT: [[LOG1:%.*]] = tail call float @llvm.amdgcn.log.f32(float [[EXP1]])
; GCN-NEXT: [[SUM:%.*]] = fadd fast float [[LOG0]], [[LOG1]]
; GCN-NEXT: store float [[SUM]], ptr addrspace(1) [[OUTPUT]], align 4
@@ -105,18 +124,14 @@ define amdgpu_kernel void @test_amdgcn_exp_f16(ptr addrspace(1) %input, ptr addr
; GCN-LABEL: define amdgpu_kernel void @test_amdgcn_exp_f16(
; GCN-SAME: ptr addrspace(1) [[INPUT:%.*]], ptr addrspace(1) [[SCALES:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
; GCN-NEXT: [[ENTRY:.*:]]
-; GCN-NEXT: [[IN0:%.*]] = load half, ptr addrspace(1) [[INPUT]], align 2
-; GCN-NEXT: [[PTR1:%.*]] = getelementptr half, ptr addrspace(1) [[INPUT]], i64 1
-; GCN-NEXT: [[IN1:%.*]] = load half, ptr addrspace(1) [[PTR1]], align 2
-; GCN-NEXT: [[SCALE0:%.*]] = load half, ptr addrspace(1) [[SCALES]], align 2
-; GCN-NEXT: [[SPTR1:%.*]] = getelementptr half, ptr a...
[truncated]
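For context, here is a minimal before/after sketch of what this patch enables, condensed from the @test_amdgcn_exp_log checks above (value names are illustrative rather than the autogenerated ones): llvm.amdgcn.exp2 is not trivially vectorizable, so the calls themselves stay scalar, but the operand chains feeding them can now be vectorized.

```llvm
; Before: two independent scalar chains feed the exp2 calls.
%in0    = load float, ptr addrspace(1) %input, align 4
%ptr1   = getelementptr float, ptr addrspace(1) %input, i64 1
%in1    = load float, ptr addrspace(1) %ptr1, align 4
%scale0 = load float, ptr addrspace(1) %scales, align 4
%sptr1  = getelementptr float, ptr addrspace(1) %scales, i64 1
%scale1 = load float, ptr addrspace(1) %sptr1, align 4
%mul0   = fmul contract float %in0, 0x3FC0527DC0000000
%mul1   = fmul contract float %in1, 0x3FC0527DC0000000
%sub0   = fsub contract float %mul0, %scale0
%sub1   = fsub contract float %mul1, %scale1
%exp0   = tail call float @llvm.amdgcn.exp2.f32(float %sub0)
%exp1   = tail call float @llvm.amdgcn.exp2.f32(float %sub1)

; After: the operand chain becomes <2 x float>; only the intrinsic
; calls remain scalar, consuming lanes extracted from the vector.
%v0   = load <2 x float>, ptr addrspace(1) %input, align 4
%v1   = load <2 x float>, ptr addrspace(1) %scales, align 4
%vmul = fmul contract <2 x float> %v0, splat (float 0x3FC0527DC0000000)
%vsub = fsub contract <2 x float> %vmul, %v1
%s0   = extractelement <2 x float> %vsub, i32 0
%exp0 = tail call float @llvm.amdgcn.exp2.f32(float %s0)
%s1   = extractelement <2 x float> %vsub, i32 1
%exp1 = tail call float @llvm.amdgcn.exp2.f32(float %s1)
```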
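Note that the pass only bundles operand candidates that AreCompatibleOperands accepts: same type, same parent block, and same opcode; OperandSorter orders candidates by type ID, scalar size, dominator-tree DFS number, opcode, and program order so compatible ones end up adjacent. A small hand-written illustration (hypothetical values, assuming the overloaded f16 variant of the intrinsic):

```llvm
; %sub0 and %sub1 share opcode (fsub), type (float), and block,
; so they can form one vectorization group.
%sub0 = fsub float %a0, %b0
%sub1 = fsub float %a1, %b1
%e0 = call float @llvm.amdgcn.exp2.f32(float %sub0)
%e1 = call float @llvm.amdgcn.exp2.f32(float %sub1)

; %subh has a different scalar size (half), so the sorter places it in
; a separate bucket and it is never grouped with the float fsubs.
%subh = fsub half %ah, %bh
%eh = call half @llvm.amdgcn.exp2.f16(half %subh)
```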
I was not sure whether you were going to continue work on this one, so I prepared a more general version in #194248, which supports not only intrinsics but also non-vectorizable functions, plus a few other candidates.
Since #194248 replaces and extends this PR, I am going to close this. |