diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
index 877c83291170b..4e87abaa93dae 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -148,6 +148,11 @@ struct SLPVectorizerPass : public PassInfoMixin<SLPVectorizerPass> {
   bool vectorizeInserts(InstSetVector &Instructions, BasicBlock *BB,
                         slpvectorizer::BoUpSLP &R);
 
+  /// Tries to vectorize the operands of non-trivially-vectorizable intrinsic
+  /// calls.
+  bool vectorizeNonTriviallyVectorizableIntrinsicCallOperand(
+      InstSetVector &IIs, BasicBlock *BB, slpvectorizer::BoUpSLP &R);
+
   /// Scan the basic block and look for patterns that are likely to start
   /// a vectorization chain.
   bool vectorizeChainsInBlock(BasicBlock *BB, slpvectorizer::BoUpSLP &R);
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index cc5a206cacbd0..e1456f0ebbac2 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -270,6 +270,34 @@ static const int MinScheduleRegionSize = 16;
 /// Maximum allowed number of operands in the PHI nodes.
 static const unsigned MaxPHINumOperands = 128;
 
+/// For a non-trivially vectorizable intrinsic call, collect the instruction
+/// operands that are candidates for vectorization.
+/// FIXME: Extend for all non-vectorized functions.
+SmallVector<Value *>
+getNonTriviallyVectorizableIntrinsicCallOperand(Value *V) {
+
+  auto *II = dyn_cast<IntrinsicInst>(V);
+  if (!II || isAssumeLikeIntrinsic(II))
+    return {};
+
+  if (isTriviallyVectorizable(II->getIntrinsicID()))
+    return {};
+
+  // Skip vector-returning intrinsics in non-revec mode.
+  if (!SLPReVec && II->getType()->isVectorTy())
+    return {};
+
+  // FIXME: Add non-instruction operands to the list.
+  SmallVector<Value *> Operands;
+  for (Value *ArgOp : II->args()) {
+    if (auto *I = dyn_cast<Instruction>(ArgOp)) {
+      Operands.emplace_back(I);
+    }
+  }
+
+  return Operands;
+}
+
 /// Predicate for the element types that the SLP vectorizer supports.
 ///
 /// The most important thing to filter here are types which are invalid in LLVM
@@ -30263,6 +30291,89 @@ bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
   return Changed;
 }
 
+bool SLPVectorizerPass::vectorizeNonTriviallyVectorizableIntrinsicCallOperand(
+    InstSetVector &IIs, BasicBlock *BB, BoUpSLP &R) {
+
+  bool Changed = false;
+
+  // Pass 1: try to find horizontal reductions of the operands.
+  for (Instruction *I : IIs) {
+    auto *II = dyn_cast<IntrinsicInst>(I);
+    if (!II || R.isDeleted(II))
+      continue;
+    for (Value *Op : II->args())
+      if (auto *RootOp = dyn_cast<Instruction>(Op)) {
+        Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
+        if (R.isDeleted(II))
+          break;
+      }
+  }
+  // Operand sorter.
+  auto OperandSorter = [this](Value *V1, Value *V2) -> bool {
+    if (V1 == V2)
+      return false;
+    auto *I1 = cast<Instruction>(V1);
+    auto *I2 = cast<Instruction>(V2);
+    if (I1->getType()->getTypeID() != I2->getType()->getTypeID())
+      return I1->getType()->getTypeID() < I2->getType()->getTypeID();
+    if (I1->getType()->getScalarSizeInBits() !=
+        I2->getType()->getScalarSizeInBits())
+      return I1->getType()->getScalarSizeInBits() <
+             I2->getType()->getScalarSizeInBits();
+    DomTreeNodeBase<BasicBlock> *Node1 = DT->getNode(I1->getParent());
+    DomTreeNodeBase<BasicBlock> *Node2 = DT->getNode(I2->getParent());
+    if (!Node1)
+      return Node2 != nullptr;
+    if (!Node2)
+      return false;
+    if (Node1->getDFSNumIn() == Node2->getDFSNumIn()) {
+      if (I1->getOpcode() != I2->getOpcode())
+        return I1->getOpcode() < I2->getOpcode();
+      return I1->comesBefore(I2);
+    }
+    return Node1->getDFSNumIn() < Node2->getDFSNumIn();
+  };
+
+  // Compatibility checker for the operands.
+  auto AreCompatibleOperands = [](ArrayRef<Value *> VL, Value *V) -> bool {
+    if (VL.empty() || VL.back() == V)
+      return true;
+    auto *I1 = cast<Instruction>(VL.back());
+    auto *I2 = cast<Instruction>(V);
+    return I1->getType() == I2->getType() &&
+           I1->getParent() == I2->getParent() &&
+           I1->getOpcode() == I2->getOpcode();
+  };
+
+  // Collect the operands of the non-trivially-vectorizable intrinsic calls.
+  SmallVector<Value *> CandidateSeeds;
+  for (Instruction *I : IIs) {
+    auto *II = dyn_cast<IntrinsicInst>(I);
+    if (!II || R.isDeleted(II))
+      continue;
+    SmallVector<Value *> Ops =
+        getNonTriviallyVectorizableIntrinsicCallOperand(II);
+    for (Value *Op : Ops)
+      if (isa<Instruction>(Op))
+        CandidateSeeds.push_back(Op);
+  }
+
+  auto CandidatesFiltered = make_filter_range(CandidateSeeds, [&](Value *V) {
+    auto *I = dyn_cast<Instruction>(V);
+    return I && !R.isDeleted(I) && isValidElementType(I->getType());
+  });
+  SmallVector<Value *> CandidateVec(CandidatesFiltered);
+  // Pass 2: try to vectorize the operands.
+  Changed |= tryToVectorizeSequence<Value>(
+      CandidateVec, OperandSorter, AreCompatibleOperands,
+      [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
+        return tryToVectorizeList(Candidates, R, MaxVFOnly);
+      },
+      /*MaxVFOnly=*/true, R);
+
+  return Changed;
+}
+
 bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
                                          BasicBlock *BB, BoUpSLP &R) {
   assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
@@ -30535,21 +30646,33 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
   InstSetVector PostProcessInserts;
   SmallSetVector<CmpInst *, 8> PostProcessCmps;
-  // Vectorizes Inserts in `PostProcessInserts` and if `VectorizeCmps` is true
-  // also vectorizes `PostProcessCmps`.
-  auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
+  InstSetVector PostProcessIntrinsicCalls;
+  // Vectorizes Inserts in `PostProcessInserts` and if `AtTerminator` is true
+  // also vectorizes `PostProcessCmps` and `PostProcessIntrinsicCalls`.
+  auto VectorizeInsertsAndCmps = [&](bool AtTerminator) {
     bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
-    if (VectorizeCmps) {
+    if (AtTerminator) {
       Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
       PostProcessCmps.clear();
+      Changed |= vectorizeNonTriviallyVectorizableIntrinsicCallOperand(
+          PostProcessIntrinsicCalls, BB, R);
+      PostProcessIntrinsicCalls.clear();
     }
     PostProcessInserts.clear();
    return Changed;
   };
+  auto isNonTriviallyVectorizableIntrinsic = [](const IntrinsicInst *II) {
+    return !isTriviallyVectorizable(II->getIntrinsicID()) &&
+           !isAssumeLikeIntrinsic(II) &&
+           !(!SLPReVec && II->getType()->isVectorTy());
+  };
-  // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
+  // Returns true if `I` is in `PostProcessInserts`, `PostProcessCmps` or `PostProcessIntrinsicCalls`.
   auto IsInPostProcessInstrs = [&](Instruction *I) {
     if (auto *Cmp = dyn_cast<CmpInst>(I))
       return PostProcessCmps.contains(Cmp);
+    if (auto *II = dyn_cast<IntrinsicInst>(I);
+        II && isNonTriviallyVectorizableIntrinsic(II))
+      return PostProcessIntrinsicCalls.contains(II);
     return isa<InsertElementInst, InsertValueInst>(I) &&
            PostProcessInserts.contains(I);
   };
@@ -30560,6 +30683,11 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
     return I->use_empty() &&
            (I->getType()->isVoidTy() || isa<CallBase>(I));
   };
+  SmallMapVector<std::pair<Intrinsic::ID, unsigned>, // (ID, OpIndex)
+                 SmallMapVector<Type *, SmallVector<Instruction *>, 4>,
+                 4>
+      IntrinsicSeedOps;
   for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
     // Skip instructions with scalable type. The num of elements is unknown at
     // compile-time for scalable type.
@@ -30572,7 +30700,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
     // We may go through BB multiple times so skip the one we have checked.
     if (!VisitedInstrs.insert(&*It).second) {
       if (HasNoUsers(&*It) &&
-          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
+          VectorizeInsertsAndCmps(/*AtTerminator=*/It->isTerminator())) {
         // We would like to start over since some instructions are deleted
         // and the iterator may become invalid value.
         Changed = true;
@@ -30652,7 +30780,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
       // top-tree instructions to try to vectorize as many instructions as
       // possible.
       OpsChanged |=
-          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
+          VectorizeInsertsAndCmps(/*AtTerminator=*/It->isTerminator());
       if (OpsChanged) {
         // We would like to start over since some instructions are deleted
         // and the iterator may become invalid value.
@@ -30667,6 +30795,9 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
       PostProcessInserts.insert(&*It);
     else if (isa<CmpInst>(It))
      PostProcessCmps.insert(cast<CmpInst>(&*It));
+    else if (auto *II = dyn_cast<IntrinsicInst>(&*It);
+             II && isNonTriviallyVectorizableIntrinsic(II))
+      PostProcessIntrinsicCalls.insert(II);
   }
 
   return Changed;
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/notriviallyvectorizableintrinsicoperands.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/notriviallyvectorizableintrinsicoperands.ll
index 26d61cb6bc28d..b8df8d44968f0 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/notriviallyvectorizableintrinsicoperands.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/notriviallyvectorizableintrinsicoperands.ll
@@ -1,28 +1,51 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
-; RUN: opt -passes=slp-vectorizer -S -mcpu=gfx1250 -mtriple=amdgcn-amd-amdhsa -o - %s | FileCheck %s --check-prefix GCN
+; RUN: opt -passes=slp-vectorizer -slp-inst-count-check=true -S -mcpu=gfx1250 -mtriple=amdgcn-amd-amdhsa -o - %s | FileCheck %s --check-prefixes GCN,INSTCNT
+; RUN: opt -passes=slp-vectorizer -slp-inst-count-check=false -S -mcpu=gfx1250 -mtriple=amdgcn-amd-amdhsa -o - %s | FileCheck %s --check-prefixes GCN,NO-INSTCNT
 
 define amdgpu_kernel void @test_with_wmma(
     ptr addrspace(1) %input, ptr addrspace(1) %output, float %scaled_max, <16 x i32> %A, <16 x i32> %B, i32 %scale_idx) {
-; GCN-LABEL: define amdgpu_kernel void @test_with_wmma(
-; GCN-SAME: ptr addrspace(1) [[INPUT:%.*]], ptr addrspace(1) [[OUTPUT:%.*]], float [[SCALED_MAX:%.*]], <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], i32 [[SCALE_IDX:%.*]]) #[[ATTR0:[0-9]+]] {
-; GCN-NEXT: [[ENTRY:.*:]]
-; GCN-NEXT: [[IN0:%.*]] = load float, ptr addrspace(1) [[INPUT]], align 4
-; GCN-NEXT: [[PTR1:%.*]] = getelementptr float, ptr addrspace(1) [[INPUT]], i64 1
-; GCN-NEXT: [[IN1:%.*]] = load float, ptr addrspace(1) [[PTR1]], align 4
-; GCN-NEXT: [[MUL0:%.*]] = fmul contract float [[IN0]], 0x3FC0527DC0000000
-; GCN-NEXT: [[MUL1:%.*]] = fmul contract float [[IN1]], 0x3FC0527DC0000000
-; GCN-NEXT: [[SUB0:%.*]] = fsub contract float [[MUL0]], [[SCALED_MAX]]
-; GCN-NEXT: [[SUB1:%.*]] = fsub contract float [[MUL1]], [[SCALED_MAX]]
-; GCN-NEXT: [[EXP0:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[SUB0]])
-; GCN-NEXT: [[EXP1:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[SUB1]])
-; GCN-NEXT: [[VEC0:%.*]] = insertelement <2 x float> poison, float [[EXP0]], i64 0
-; GCN-NEXT: [[VEC1:%.*]] = insertelement <2 x float> [[VEC0]], float [[EXP1]], i64 1
-; GCN-NEXT: [[VEC_I32:%.*]] = bitcast <2 x float> [[VEC1]] to <2 x i32>
-; GCN-NEXT: [[SCALE0:%.*]] = extractelement <2 x i32> [[VEC_I32]], i64 0
-; GCN-NEXT: [[SCALE1:%.*]] = extractelement <2 x i32> [[VEC_I32]], i64 1
-; GCN-NEXT: [[WMMA0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> [[A]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> zeroinitializer, i32 0, i32 0, i32 [[SCALE0]], i32 0, i32 0, i32 [[SCALE_IDX]], i1 false, i1 false)
-; GCN-NEXT: [[WMMA1:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> [[A]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> [[WMMA0]], i32 0, i32 0, i32 [[SCALE1]], i32 0, i32 0, i32 [[SCALE_IDX]], i1 false, i1 false)
-; GCN-NEXT: store <8 x float> [[WMMA1]], ptr addrspace(1) [[OUTPUT]], align 32
-; GCN-NEXT: ret void
+; INSTCNT-LABEL: define amdgpu_kernel void @test_with_wmma(
+; INSTCNT-SAME: ptr addrspace(1) [[INPUT:%.*]], ptr addrspace(1) [[OUTPUT:%.*]], float [[SCALED_MAX:%.*]], <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], i32 [[SCALE_IDX:%.*]]) #[[ATTR0:[0-9]+]] {
+; INSTCNT-NEXT: [[ENTRY:.*:]]
+; INSTCNT-NEXT: [[IN0:%.*]] = load float, ptr addrspace(1) [[INPUT]], align 4
+; INSTCNT-NEXT: [[PTR1:%.*]] = getelementptr float, ptr addrspace(1) [[INPUT]], i64 1
+; INSTCNT-NEXT: [[IN1:%.*]] = load float, ptr addrspace(1) [[PTR1]], align 4
+; INSTCNT-NEXT: [[MUL0:%.*]] = fmul contract float [[IN0]], 0x3FC0527DC0000000
+; INSTCNT-NEXT: [[MUL1:%.*]] = fmul contract float [[IN1]], 0x3FC0527DC0000000
+; INSTCNT-NEXT: [[SUB0:%.*]] = fsub contract float [[MUL0]], [[SCALED_MAX]]
+; INSTCNT-NEXT: [[SUB1:%.*]] = fsub contract float [[MUL1]], [[SCALED_MAX]]
+; INSTCNT-NEXT: [[EXP0:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[SUB0]])
+; INSTCNT-NEXT: [[EXP1:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[SUB1]])
+; INSTCNT-NEXT: [[VEC0:%.*]] = insertelement <2 x float> poison, float [[EXP0]], i64 0
+; INSTCNT-NEXT: [[VEC1:%.*]] = insertelement <2 x float> [[VEC0]], float [[EXP1]], i64 1
+; INSTCNT-NEXT: [[VEC_I32:%.*]] = bitcast <2 x float> [[VEC1]] to <2 x i32>
+; INSTCNT-NEXT: [[SCALE0:%.*]] = extractelement <2 x i32> [[VEC_I32]], i64 0
+; INSTCNT-NEXT: [[SCALE1:%.*]] = extractelement <2 x i32> [[VEC_I32]], i64 1
+; INSTCNT-NEXT: [[WMMA0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> [[A]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> zeroinitializer, i32 0, i32 0, i32 [[SCALE0]], i32 0, i32 0, i32 [[SCALE_IDX]], i1 false, i1 false)
+; INSTCNT-NEXT: [[WMMA1:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> [[A]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> [[WMMA0]], i32 0, i32 0, i32 [[SCALE1]], i32 0, i32 0, i32 [[SCALE_IDX]], i1 false, i1 false)
+; INSTCNT-NEXT: store <8 x float> [[WMMA1]], ptr addrspace(1) [[OUTPUT]], align 32
+; INSTCNT-NEXT: ret void
+;
+; NO-INSTCNT-LABEL: define amdgpu_kernel void @test_with_wmma(
+; NO-INSTCNT-SAME: ptr addrspace(1) [[INPUT:%.*]], ptr addrspace(1) [[OUTPUT:%.*]], float [[SCALED_MAX:%.*]], <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], i32 [[SCALE_IDX:%.*]]) #[[ATTR0:[0-9]+]] {
+; NO-INSTCNT-NEXT: [[ENTRY:.*:]]
+; NO-INSTCNT-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr addrspace(1) [[INPUT]], align 4
+; NO-INSTCNT-NEXT: [[TMP1:%.*]] = fmul contract <2 x float> [[TMP0]], splat (float 0x3FC0527DC0000000)
+; NO-INSTCNT-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[SCALED_MAX]], i32 0
+; NO-INSTCNT-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer
+; NO-INSTCNT-NEXT: [[TMP4:%.*]] = fsub contract <2 x float> [[TMP1]], [[TMP3]]
+; NO-INSTCNT-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
+; NO-INSTCNT-NEXT: [[EXP0:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[TMP5]])
+; NO-INSTCNT-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
+; NO-INSTCNT-NEXT: [[EXP1:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[TMP6]])
+; NO-INSTCNT-NEXT: [[VEC0:%.*]] = insertelement <2 x float> poison, float [[EXP0]], i64 0
+; NO-INSTCNT-NEXT: [[VEC1:%.*]] = insertelement <2 x float> [[VEC0]], float [[EXP1]], i64 1
+; NO-INSTCNT-NEXT: [[VEC_I32:%.*]] = bitcast <2 x float> [[VEC1]] to <2 x i32>
+; NO-INSTCNT-NEXT: [[SCALE0:%.*]] = extractelement <2 x i32> [[VEC_I32]], i64 0
+; NO-INSTCNT-NEXT: [[SCALE1:%.*]] = extractelement <2 x i32> [[VEC_I32]], i64 1
+; NO-INSTCNT-NEXT: [[WMMA0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> [[A]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> zeroinitializer, i32 0, i32 0, i32 [[SCALE0]], i32 0, i32 0, i32 [[SCALE_IDX]], i1 false, i1 false)
+; NO-INSTCNT-NEXT: [[WMMA1:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> [[A]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> [[WMMA0]], i32 0, i32 0, i32 [[SCALE1]], i32 0, i32 0, i32 [[SCALE_IDX]], i1 false, i1 false)
+; NO-INSTCNT-NEXT: store <8 x float> [[WMMA1]], ptr addrspace(1) [[OUTPUT]], align 32
+; NO-INSTCNT-NEXT: ret void
 ;
 entry:
@@ -61,21 +84,17 @@
 
 define amdgpu_kernel void @test_amdgcn_exp_log(ptr addrspace(1) %input, ptr addrspace(1) %scales, ptr addrspace(1) %output) {
 ; GCN-LABEL: define amdgpu_kernel void @test_amdgcn_exp_log(
-; GCN-SAME: ptr addrspace(1) [[INPUT:%.*]], ptr addrspace(1) [[SCALES:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
+; GCN-SAME: ptr addrspace(1) [[INPUT:%.*]], ptr addrspace(1) [[SCALES:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0:[0-9]+]] {
 ; GCN-NEXT: [[ENTRY:.*:]]
-; GCN-NEXT: [[IN0:%.*]] = load float, ptr addrspace(1) [[INPUT]], align 4
-; GCN-NEXT: [[PTR1:%.*]] = getelementptr float, ptr addrspace(1) [[INPUT]], i64 1
-; GCN-NEXT: [[IN1:%.*]] = load float, ptr addrspace(1) [[PTR1]], align 4
-; GCN-NEXT: [[SCALE0:%.*]] = load float, ptr addrspace(1) [[SCALES]], align 4
-; GCN-NEXT: [[SPTR1:%.*]] = getelementptr float, ptr addrspace(1) [[SCALES]], i64 1
-; GCN-NEXT: [[SCALE1:%.*]] = load float, ptr addrspace(1) [[SPTR1]], align 4
-; GCN-NEXT: [[MUL0:%.*]] = fmul contract float [[IN0]], 0x3FC0527DC0000000
-; GCN-NEXT: [[MUL1:%.*]] = fmul contract float [[IN1]], 0x3FC0527DC0000000
-; GCN-NEXT: [[SUB0:%.*]] = fsub contract float [[MUL0]], [[SCALE0]]
-; GCN-NEXT: [[SUB1:%.*]] = fsub contract float [[MUL1]], [[SCALE1]]
-; GCN-NEXT: [[EXP0:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[SUB0]])
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr addrspace(1) [[INPUT]], align 4
+; GCN-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(1) [[SCALES]], align 4
+; GCN-NEXT: [[TMP2:%.*]] = fmul contract <2 x float> [[TMP0]], splat (float 0x3FC0527DC0000000)
+; GCN-NEXT: [[TMP3:%.*]] = fsub contract <2 x float> [[TMP2]], [[TMP1]]
+; GCN-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; GCN-NEXT: [[EXP0:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[TMP4]])
 ; GCN-NEXT: [[LOG0:%.*]] = tail call float @llvm.amdgcn.log.f32(float [[EXP0]])
-; GCN-NEXT: [[EXP1:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[SUB1]])
+; GCN-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; GCN-NEXT: [[EXP1:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[TMP5]])
 ; GCN-NEXT: [[LOG1:%.*]] = tail call float @llvm.amdgcn.log.f32(float [[EXP1]])
 ; GCN-NEXT: [[SUM:%.*]] = fadd fast float [[LOG0]], [[LOG1]]
 ; GCN-NEXT: store float [[SUM]], ptr addrspace(1) [[OUTPUT]], align 4
@@ -105,18 +124,14 @@ define amdgpu_kernel void @test_amdgcn_exp_f16(ptr addrspace(1) %input, ptr addr
 ; GCN-LABEL: define amdgpu_kernel void @test_amdgcn_exp_f16(
 ; GCN-SAME: ptr addrspace(1) [[INPUT:%.*]], ptr addrspace(1) [[SCALES:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
 ; GCN-NEXT: [[ENTRY:.*:]]
-; GCN-NEXT: [[IN0:%.*]] = load half, ptr addrspace(1) [[INPUT]], align 2
-; GCN-NEXT: [[PTR1:%.*]] = getelementptr half, ptr addrspace(1) [[INPUT]], i64 1
-; GCN-NEXT: [[IN1:%.*]] = load half, ptr addrspace(1) [[PTR1]], align 2
-; GCN-NEXT: [[SCALE0:%.*]] = load half, ptr addrspace(1) [[SCALES]], align 2
-; GCN-NEXT: [[SPTR1:%.*]] = getelementptr half, ptr addrspace(1) [[SCALES]], i64 1
-; GCN-NEXT: [[SCALE1:%.*]] = load half, ptr addrspace(1) [[SPTR1]], align 2
-; GCN-NEXT: [[MUL0:%.*]] = fmul contract half [[IN0]], 0xH3E14
-; GCN-NEXT: [[MUL1:%.*]] = fmul contract half [[IN1]], 0xH3E14
-; GCN-NEXT: [[SUB0:%.*]] = fsub contract half [[MUL0]], [[SCALE0]]
-; GCN-NEXT: [[SUB1:%.*]] = fsub contract half [[MUL1]], [[SCALE1]]
-; GCN-NEXT: [[EXP0:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[SUB0]])
-; GCN-NEXT: [[EXP1:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[SUB1]])
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[INPUT]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[SCALES]], align 2
+; GCN-NEXT: [[TMP2:%.*]] = fmul contract <2 x half> [[TMP0]], splat (half 0xH3E14)
+; GCN-NEXT: [[TMP3:%.*]] = fsub contract <2 x half> [[TMP2]], [[TMP1]]
+; GCN-NEXT: [[TMP4:%.*]] = extractelement <2 x half> [[TMP3]], i32 0
+; GCN-NEXT: [[EXP0:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP4]])
+; GCN-NEXT: [[TMP5:%.*]] = extractelement <2 x half> [[TMP3]], i32 1
+; GCN-NEXT: [[EXP1:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP5]])
 ; GCN-NEXT: [[SUM:%.*]] = fadd fast half [[EXP0]], [[EXP1]]
 ; GCN-NEXT: store half [[SUM]], ptr addrspace(1) [[OUTPUT]], align 2
 ; GCN-NEXT: ret void
@@ -143,18 +158,14 @@ define amdgpu_kernel void @kernel_f16(ptr addrspace(1) %input, ptr addrspace(1)
 ; GCN-LABEL: define amdgpu_kernel void @kernel_f16(
 ; GCN-SAME: ptr addrspace(1) [[INPUT:%.*]], ptr addrspace(1) [[SCALES:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
 ; GCN-NEXT: [[ENTRY:.*:]]
-; GCN-NEXT: [[IN0:%.*]] = load half, ptr addrspace(1) [[INPUT]], align 2
-; GCN-NEXT: [[PTR1:%.*]] = getelementptr half, ptr addrspace(1) [[INPUT]], i64 1
-; GCN-NEXT: [[IN1:%.*]] = load half, ptr addrspace(1) [[PTR1]], align 2
-; GCN-NEXT: [[SCALE0:%.*]] = load half, ptr addrspace(1) [[SCALES]], align 2
-; GCN-NEXT: [[SPTR1:%.*]] = getelementptr half, ptr addrspace(1) [[SCALES]], i64 1
-; GCN-NEXT: [[SCALE1:%.*]] = load half, ptr addrspace(1) [[SPTR1]], align 2
-; GCN-NEXT: [[MUL0:%.*]] = fmul contract half [[IN0]], 0xH3E14
-; GCN-NEXT: [[MUL1:%.*]] = fmul contract half [[IN1]], 0xH3E14
-; GCN-NEXT: [[SUB0:%.*]] = fsub contract half [[MUL0]], [[SCALE0]]
-; GCN-NEXT: [[SUB1:%.*]] = fsub contract half [[MUL1]], [[SCALE1]]
-; GCN-NEXT: [[EXP0:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[SUB0]])
-; GCN-NEXT: [[EXP1:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[SUB1]])
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[INPUT]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[SCALES]], align 2
+; GCN-NEXT: [[TMP2:%.*]] = fmul contract <2 x half> [[TMP0]], splat (half 0xH3E14)
+; GCN-NEXT: [[TMP3:%.*]] = fsub contract <2 x half> [[TMP2]], [[TMP1]]
+; GCN-NEXT: [[TMP4:%.*]] = extractelement <2 x half> [[TMP3]], i32 0
+; GCN-NEXT: [[EXP0:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP4]])
+; GCN-NEXT: [[TMP5:%.*]] = extractelement <2 x half> [[TMP3]], i32 1
+; GCN-NEXT: [[EXP1:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP5]])
 ; GCN-NEXT: [[LOG0:%.*]] = tail call half @llvm.amdgcn.log.f16(half [[EXP0]])
 ; GCN-NEXT: [[LOG1:%.*]] = tail call half @llvm.amdgcn.log.f16(half [[EXP1]])
 ; GCN-NEXT: [[SUM:%.*]] = fadd fast half [[LOG0]], [[LOG1]]
@@ -185,22 +196,18 @@ define amdgpu_kernel void @look_through_reuse_shuffle(
 ; GCN-LABEL: define amdgpu_kernel void @look_through_reuse_shuffle(
 ; GCN-SAME: ptr addrspace(1) noalias [[INPUT:%.*]], ptr addrspace(1) noalias [[SCALES:%.*]], ptr addrspace(1) noalias [[OUTPUT:%.*]]) #[[ATTR0]] {
 ; GCN-NEXT: [[ENTRY:.*:]]
-; GCN-NEXT: [[IPTR1:%.*]] = getelementptr half, ptr addrspace(1) [[INPUT]], i64 1
-; GCN-NEXT: [[SPTR1:%.*]] = getelementptr half, ptr addrspace(1) [[SCALES]], i64 1
-; GCN-NEXT: [[IN0:%.*]] = load half, ptr addrspace(1) [[INPUT]], align 2
-; GCN-NEXT: [[IN1:%.*]] = load half, ptr addrspace(1) [[IPTR1]], align 2
-; GCN-NEXT: [[S0:%.*]] = load half, ptr addrspace(1) [[SCALES]], align 2
-; GCN-NEXT: [[S1:%.*]] = load half, ptr addrspace(1) [[SPTR1]], align 2
-; GCN-NEXT: [[ADD0:%.*]] = fadd contract half [[IN0]], 0xH3E14
-; GCN-NEXT: [[ADD1:%.*]] = fadd contract half [[IN1]], 0xH3E14
-; GCN-NEXT: [[MUL0:%.*]] = fmul contract half [[ADD0]], [[S0]]
-; GCN-NEXT: [[MUL1:%.*]] = fmul contract half [[ADD1]], [[S1]]
-; GCN-NEXT: [[EXP0:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[MUL0]])
-; GCN-NEXT: [[EXP1:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[MUL1]])
-; GCN-NEXT: [[TMP0:%.*]] = insertelement <4 x half> poison, half [[EXP0]], i32 0
-; GCN-NEXT: [[TMP1:%.*]] = insertelement <4 x half> [[TMP0]], half [[EXP1]], i32 1
-; GCN-NEXT: [[TMP2:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-; GCN-NEXT: store <4 x half> [[TMP2]], ptr addrspace(1) [[OUTPUT]], align 2
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[INPUT]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[SCALES]], align 2
+; GCN-NEXT: [[TMP2:%.*]] = fadd contract <2 x half> [[TMP0]], splat (half 0xH3E14)
+; GCN-NEXT: [[TMP3:%.*]] = fmul contract <2 x half> [[TMP2]], [[TMP1]]
+; GCN-NEXT: [[TMP4:%.*]] = extractelement <2 x half> [[TMP3]], i32 0
+; GCN-NEXT: [[EXP0:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP4]])
+; GCN-NEXT: [[TMP5:%.*]] = extractelement <2 x half> [[TMP3]], i32 1
+; GCN-NEXT: [[EXP1:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP5]])
+; GCN-NEXT: [[TMP6:%.*]] = insertelement <4 x half> poison, half [[EXP0]], i32 0
+; GCN-NEXT: [[TMP7:%.*]] = insertelement <4 x half> [[TMP6]], half [[EXP1]], i32 1
+; GCN-NEXT: [[TMP8:%.*]] = shufflevector <4 x half> [[TMP7]], <4 x half> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; GCN-NEXT: store <4 x half> [[TMP8]], ptr addrspace(1) [[OUTPUT]], align 2
 ; GCN-NEXT: ret void
 ;
     ptr addrspace(1) noalias %input, ptr addrspace(1) noalias %scales,
@@ -237,32 +244,24 @@ define amdgpu_kernel void @wider_exp2_f32(ptr addrspace(1) %input, ptr addrspace
 ; GCN-LABEL: define amdgpu_kernel void @wider_exp2_f32(
 ; GCN-SAME: ptr addrspace(1) [[INPUT:%.*]], ptr addrspace(1) [[SCALES:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
 ; GCN-NEXT: [[ENTRY:.*:]]
-; GCN-NEXT: [[IN0:%.*]] = load float, ptr addrspace(1) [[INPUT]], align 4
-; GCN-NEXT: [[PTR1:%.*]] = getelementptr float, ptr addrspace(1) [[INPUT]], i64 1
-; GCN-NEXT: [[IN1:%.*]] = load float, ptr addrspace(1) [[PTR1]], align 4
 ; GCN-NEXT: [[PTR2:%.*]] = getelementptr float, ptr addrspace(1) [[INPUT]], i64 2
-; GCN-NEXT: [[IN2:%.*]] = load float, ptr addrspace(1) [[PTR2]], align 4
-; GCN-NEXT: [[PTR3:%.*]] = getelementptr float, ptr addrspace(1) [[INPUT]], i64 3
-; GCN-NEXT: [[IN3:%.*]] = load float, ptr addrspace(1) [[PTR3]], align 4
-; GCN-NEXT: [[SCALE0:%.*]] = load float, ptr addrspace(1) [[SCALES]], align 4
-; GCN-NEXT: [[SPTR1:%.*]] = getelementptr float, ptr addrspace(1) [[SCALES]], i64 1
-; GCN-NEXT: [[SCALE1:%.*]] = load float, ptr addrspace(1) [[SPTR1]], align 4
 ; GCN-NEXT: [[SPTR2:%.*]] = getelementptr float, ptr addrspace(1) [[SCALES]], i64 2
-; GCN-NEXT: [[SCALE2:%.*]] = load float, ptr addrspace(1) [[SPTR2]], align 4
-; GCN-NEXT: [[SPTR3:%.*]] = getelementptr float, ptr addrspace(1) [[SCALES]], i64 3
-; GCN-NEXT: [[SCALE3:%.*]] = load float, ptr addrspace(1) [[SPTR3]], align 4
-; GCN-NEXT: [[MUL0:%.*]] = fmul contract float [[IN0]], 0x3FC0527DC0000000
-; GCN-NEXT: [[MUL1:%.*]] = fmul contract float [[IN1]], 0x3FC0527DC0000000
-; GCN-NEXT: [[MUL2:%.*]] = fmul contract float [[IN2]], 0x3FC0527DC0000000
-; GCN-NEXT: [[MUL3:%.*]] = fmul contract float [[IN3]], 0x3FC0527DC0000000
-; GCN-NEXT: [[SUB0:%.*]] = fsub contract float [[MUL0]], [[SCALE0]]
-; GCN-NEXT: [[SUB1:%.*]] = fsub contract float [[MUL1]], [[SCALE1]]
-; GCN-NEXT: [[SUB2:%.*]] = fsub contract float [[MUL2]], [[SCALE2]]
-; GCN-NEXT: [[SUB3:%.*]] = fsub contract float [[MUL3]], [[SCALE3]]
-; GCN-NEXT: [[EXP0:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[SUB0]])
-; GCN-NEXT: [[EXP1:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[SUB1]])
-; GCN-NEXT: [[EXP2:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[SUB2]])
-; GCN-NEXT: [[EXP3:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[SUB3]])
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr addrspace(1) [[INPUT]], align 4
+; GCN-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(1) [[SCALES]], align 4
+; GCN-NEXT: [[TMP2:%.*]] = fmul contract <2 x float> [[TMP0]], splat (float 0x3FC0527DC0000000)
+; GCN-NEXT: [[TMP3:%.*]] = fsub contract <2 x float> [[TMP2]], [[TMP1]]
+; GCN-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr addrspace(1) [[PTR2]], align 4
+; GCN-NEXT: [[TMP5:%.*]] = load <2 x float>, ptr addrspace(1) [[SPTR2]], align 4
+; GCN-NEXT: [[TMP6:%.*]] = fmul contract <2 x float> [[TMP4]], splat (float 0x3FC0527DC0000000)
+; GCN-NEXT: [[TMP7:%.*]] = fsub contract <2 x float> [[TMP6]], [[TMP5]]
+; GCN-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; GCN-NEXT: [[EXP0:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[TMP8]])
+; GCN-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; GCN-NEXT: [[EXP1:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[TMP9]])
+; GCN-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP7]], i32 0
+; GCN-NEXT: [[EXP2:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[TMP10]])
+; GCN-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP7]], i32 1
+; GCN-NEXT: [[EXP3:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[TMP11]])
 ; GCN-NEXT: [[SUM01:%.*]] = fadd fast float [[EXP0]], [[EXP1]]
 ; GCN-NEXT: [[SUM23:%.*]] = fadd fast float [[EXP2]], [[EXP3]]
 ; GCN-NEXT: [[SUM:%.*]] = fadd fast float [[SUM01]], [[SUM23]]
@@ -313,32 +312,24 @@ define amdgpu_kernel void @wider_exp2_half(ptr addrspace(1) %input, ptr addrspac
 ; GCN-LABEL: define amdgpu_kernel void @wider_exp2_half(
 ; GCN-SAME: ptr addrspace(1) [[INPUT:%.*]], ptr addrspace(1) [[SCALES:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
 ; GCN-NEXT: [[ENTRY:.*:]]
-; GCN-NEXT: [[IN0:%.*]] = load half, ptr addrspace(1) [[INPUT]], align 2
-; GCN-NEXT: [[PTR1:%.*]] = getelementptr half, ptr addrspace(1) [[INPUT]], i64 1
-; GCN-NEXT: [[IN1:%.*]] = load half, ptr addrspace(1) [[PTR1]], align 2
 ; GCN-NEXT: [[PTR2:%.*]] = getelementptr half, ptr addrspace(1) [[INPUT]], i64 2
-; GCN-NEXT: [[IN2:%.*]] = load half, ptr addrspace(1) [[PTR2]], align 2
-; GCN-NEXT: [[PTR3:%.*]] = getelementptr half, ptr addrspace(1) [[INPUT]], i64 3
-; GCN-NEXT: [[IN3:%.*]] = load half, ptr addrspace(1) [[PTR3]], align 2
-; GCN-NEXT: [[SCALE0:%.*]] = load half, ptr addrspace(1) [[SCALES]], align 2
-; GCN-NEXT: [[SPTR1:%.*]] = getelementptr half, ptr addrspace(1) [[SCALES]], i64 1
-; GCN-NEXT: [[SCALE1:%.*]] = load half, ptr addrspace(1) [[SPTR1]], align 2
 ; GCN-NEXT: [[SPTR2:%.*]] = getelementptr half, ptr addrspace(1) [[SCALES]], i64 2
-; GCN-NEXT: [[SCALE2:%.*]] = load half, ptr addrspace(1) [[SPTR2]], align 2
-; GCN-NEXT: [[SPTR3:%.*]] = getelementptr half, ptr addrspace(1) [[SCALES]], i64 3
-; GCN-NEXT: [[SCALE3:%.*]] = load half, ptr addrspace(1) [[SPTR3]], align 2
-; GCN-NEXT: [[MUL0:%.*]] = fmul contract half [[IN0]], 0xH3E14
-; GCN-NEXT: [[MUL1:%.*]] = fmul contract half [[IN1]], 0xH3E14
-; GCN-NEXT: [[MUL2:%.*]] = fmul contract half [[IN2]], 0xH3E14
-; GCN-NEXT: [[MUL3:%.*]] = fmul contract half [[IN3]], 0xH3E14
-; GCN-NEXT: [[SUB0:%.*]] = fsub contract half [[MUL0]], [[SCALE0]]
-; GCN-NEXT: [[SUB1:%.*]] = fsub contract half [[MUL1]], [[SCALE1]]
-; GCN-NEXT: [[SUB2:%.*]] = fsub contract half [[MUL2]], [[SCALE2]]
-; GCN-NEXT: [[SUB3:%.*]] = fsub contract half [[MUL3]], [[SCALE3]]
-; GCN-NEXT: [[EXP0:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[SUB0]])
-; GCN-NEXT: [[EXP1:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[SUB1]])
-; GCN-NEXT: [[EXP2:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[SUB2]])
-; GCN-NEXT: [[EXP3:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[SUB3]])
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[INPUT]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[SCALES]], align 2
+; GCN-NEXT: [[TMP2:%.*]] = fmul contract <2 x half> [[TMP0]], splat (half 0xH3E14)
+; GCN-NEXT: [[TMP3:%.*]] = fsub contract <2 x half> [[TMP2]], [[TMP1]]
+; GCN-NEXT: [[TMP4:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR2]], align 2
+; GCN-NEXT: [[TMP5:%.*]] = load <2 x half>, ptr addrspace(1) [[SPTR2]], align 2
+; GCN-NEXT: [[TMP6:%.*]] = fmul contract <2 x half> [[TMP4]], splat (half 0xH3E14)
+; GCN-NEXT: [[TMP7:%.*]] = fsub contract <2 x half> [[TMP6]], [[TMP5]]
+; GCN-NEXT: [[TMP8:%.*]] = extractelement <2 x half> [[TMP3]], i32 0
+; GCN-NEXT: [[EXP0:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP8]])
+; GCN-NEXT: [[TMP9:%.*]] = extractelement <2 x half> [[TMP3]], i32 1
+; GCN-NEXT: [[EXP1:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP9]])
+; GCN-NEXT: [[TMP10:%.*]] = extractelement <2 x half> [[TMP7]], i32 0
+; GCN-NEXT: [[EXP2:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP10]])
+; GCN-NEXT: [[TMP11:%.*]] = extractelement <2 x half> [[TMP7]], i32 1
+; GCN-NEXT: [[EXP3:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP11]])
 ; GCN-NEXT: [[SUM01:%.*]] = fadd fast half [[EXP0]], [[EXP1]]
 ; GCN-NEXT: [[SUM23:%.*]] = fadd fast half [[EXP2]], [[EXP3]]
 ; GCN-NEXT: [[SUM:%.*]] = fadd fast half [[SUM01]], [[SUM23]]
@@ -388,35 +379,64 @@ entry:
 }
 
 define amdgpu_kernel void @kernel_div_scale(ptr addrspace(1) %num, ptr addrspace(1) %den, ptr addrspace(1) %output) {
-; GCN-LABEL: define amdgpu_kernel void @kernel_div_scale(
-; GCN-SAME: ptr addrspace(1) [[NUM:%.*]], ptr addrspace(1) [[DEN:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
-; GCN-NEXT: [[ENTRY:.*:]]
-; GCN-NEXT: [[N0:%.*]] = load float, ptr addrspace(1) [[NUM]], align 4
-; GCN-NEXT: [[NPTR1:%.*]] = getelementptr float, ptr addrspace(1) [[NUM]], i64 1
-; GCN-NEXT: [[N1:%.*]] = load float, ptr addrspace(1) [[NPTR1]], align 4
-; GCN-NEXT: [[NPTR2:%.*]] = getelementptr float, ptr addrspace(1) [[NUM]], i64 2
-; GCN-NEXT: [[N2:%.*]] = load float, ptr addrspace(1) [[NPTR2]], align 4
-; GCN-NEXT: [[D0:%.*]] = load float, ptr addrspace(1) [[DEN]], align 4
-; GCN-NEXT: [[DPTR1:%.*]] = getelementptr float, ptr addrspace(1) [[DEN]], i64 1
-; GCN-NEXT: [[D1:%.*]] = load float, ptr addrspace(1) [[DPTR1]], align 4
-; GCN-NEXT: [[DPTR2:%.*]] = getelementptr float, ptr addrspace(1) [[DEN]], i64 2
-; GCN-NEXT: [[D2:%.*]] = load float, ptr addrspace(1) [[DPTR2]], align 4
-; GCN-NEXT: [[MUL_N0:%.*]] = fmul float [[N0]], 2.000000e+00
-; GCN-NEXT: [[MUL_N1:%.*]] = fmul float [[N1]], 2.000000e+00
-; GCN-NEXT: [[MUL_N2:%.*]] = fmul float [[N2]], 2.000000e+00
-; GCN-NEXT: [[MUL_D0:%.*]] = fmul float [[D0]], 4.000000e+00
-; GCN-NEXT: [[MUL_D1:%.*]] = fmul float [[D1]], 4.000000e+00
-; GCN-NEXT: [[MUL_D2:%.*]] = fmul float [[D2]], 4.000000e+00
-; GCN-NEXT: [[DS0:%.*]] = call { float, i1 } @llvm.amdgcn.div.scale.f32(float [[MUL_N0]], float [[MUL_D0]], i1 false)
-; GCN-NEXT: [[DS1:%.*]] = call { float, i1 } @llvm.amdgcn.div.scale.f32(float [[MUL_N1]], float [[MUL_D1]], i1 false)
-; GCN-NEXT: [[DS2:%.*]] = call { float, i1 } @llvm.amdgcn.div.scale.f32(float [[MUL_N2]], float [[MUL_D2]], i1 false)
-; GCN-NEXT: [[R0:%.*]] = extractvalue { float, i1 } [[DS0]], 0
-; GCN-NEXT: [[R1:%.*]] = extractvalue { float, i1 } [[DS1]], 0
-; GCN-NEXT: [[R2:%.*]] = extractvalue { float, i1 } [[DS2]], 0
-; GCN-NEXT: [[SUM01:%.*]] = fadd float [[R0]], [[R1]]
-; GCN-NEXT: [[SUM:%.*]] = fadd float [[SUM01]], [[R2]]
-; GCN-NEXT: store float [[SUM]], ptr addrspace(1) [[OUTPUT]], align 4
-; GCN-NEXT: ret void
+; INSTCNT-LABEL: define amdgpu_kernel void @kernel_div_scale(
+; INSTCNT-SAME: ptr addrspace(1) [[NUM:%.*]], ptr addrspace(1) [[DEN:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
+; INSTCNT-NEXT: [[ENTRY:.*:]]
+; INSTCNT-NEXT: [[NPTR2:%.*]] = getelementptr float, ptr addrspace(1) [[NUM]], i64 2
+; INSTCNT-NEXT: [[N2:%.*]] = load float, ptr addrspace(1) [[NPTR2]], align 4
+; INSTCNT-NEXT: [[DPTR2:%.*]] = getelementptr float, ptr addrspace(1) [[DEN]], i64 2
+; INSTCNT-NEXT: [[D2:%.*]] = load float, ptr addrspace(1) [[DPTR2]], align 4
+; INSTCNT-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr addrspace(1) [[NUM]], align 4
+; INSTCNT-NEXT: [[TMP1:%.*]] = fmul <2 x float> [[TMP0]], splat (float 2.000000e+00)
+; INSTCNT-NEXT: [[MUL_N2:%.*]] = fmul float [[N2]], 2.000000e+00
+; INSTCNT-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr addrspace(1) [[DEN]], align 4
+; INSTCNT-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP2]], splat (float 4.000000e+00)
+; INSTCNT-NEXT: [[MUL_D2:%.*]] = fmul float [[D2]], 4.000000e+00
+; INSTCNT-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
+; INSTCNT-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; INSTCNT-NEXT: [[DS0:%.*]] = call { float, i1 } @llvm.amdgcn.div.scale.f32(float [[TMP4]], float [[TMP5]], i1 false)
+; INSTCNT-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+; INSTCNT-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; INSTCNT-NEXT: [[DS1:%.*]] = call { float, i1 } @llvm.amdgcn.div.scale.f32(float [[TMP6]], float [[TMP7]], i1 false)
+; INSTCNT-NEXT: [[DS2:%.*]] = call { float, i1 } @llvm.amdgcn.div.scale.f32(float [[MUL_N2]], float [[MUL_D2]], i1 false)
+; INSTCNT-NEXT: [[R0:%.*]] = extractvalue { float, i1 } [[DS0]], 0
+; INSTCNT-NEXT: [[R1:%.*]] = extractvalue { float, i1 } [[DS1]], 0
+; INSTCNT-NEXT: [[R2:%.*]] = extractvalue { float, i1 } [[DS2]], 0
+; INSTCNT-NEXT: [[SUM01:%.*]] = fadd float [[R0]], [[R1]]
+; INSTCNT-NEXT: [[SUM:%.*]] = fadd float [[SUM01]], [[R2]]
+; INSTCNT-NEXT: store float [[SUM]], ptr addrspace(1) [[OUTPUT]], align 4
+; INSTCNT-NEXT: ret void
+;
+; NO-INSTCNT-LABEL: define amdgpu_kernel void @kernel_div_scale(
+; NO-INSTCNT-SAME: ptr addrspace(1) [[NUM:%.*]], ptr addrspace(1) [[DEN:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
+; NO-INSTCNT-NEXT: [[ENTRY:.*:]]
+; NO-INSTCNT-NEXT: [[NPTR2:%.*]] = getelementptr float, ptr addrspace(1) [[NUM]], i64 2
+; NO-INSTCNT-NEXT: [[N2:%.*]] = load float, ptr addrspace(1) [[NPTR2]], align 4
+; NO-INSTCNT-NEXT: [[D0:%.*]] = load float, ptr addrspace(1) [[DEN]], align 4
+; NO-INSTCNT-NEXT: [[DPTR1:%.*]] = getelementptr float, ptr addrspace(1) [[DEN]], i64 1
+; NO-INSTCNT-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr addrspace(1) [[NUM]], align 4
+; NO-INSTCNT-NEXT: [[TMP1:%.*]] = fmul <2 x float> [[TMP0]], splat (float 2.000000e+00)
+; NO-INSTCNT-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[N2]], i32 0
+; NO-INSTCNT-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[D0]], i32 1
+; NO-INSTCNT-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[TMP3]], <float 2.000000e+00, float 4.000000e+00>
+; NO-INSTCNT-NEXT: [[TMP5:%.*]] = load <2 x float>, ptr addrspace(1) [[DPTR1]], align 4
+; NO-INSTCNT-NEXT: [[TMP6:%.*]] = fmul <2 x float> [[TMP5]], splat (float 4.000000e+00)
+; NO-INSTCNT-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
+; NO-INSTCNT-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
+; NO-INSTCNT-NEXT: [[DS0:%.*]] = call { float, i1 } @llvm.amdgcn.div.scale.f32(float [[TMP7]], float [[TMP8]], i1 false)
+; NO-INSTCNT-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+; NO-INSTCNT-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP6]], i32 0
+; NO-INSTCNT-NEXT: [[DS1:%.*]] = call { float, i1 } @llvm.amdgcn.div.scale.f32(float [[TMP9]], float [[TMP10]], i1 false)
+; NO-INSTCNT-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
+; NO-INSTCNT-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP6]], i32 1
+; NO-INSTCNT-NEXT: [[DS2:%.*]] = call { float, i1 } @llvm.amdgcn.div.scale.f32(float [[TMP11]], float [[TMP12]], i1 false)
+; NO-INSTCNT-NEXT: [[R0:%.*]] = extractvalue { float, i1 } [[DS0]], 0
+; NO-INSTCNT-NEXT: [[R1:%.*]] = extractvalue { float, i1 } [[DS1]], 0
+; NO-INSTCNT-NEXT: [[R2:%.*]] = extractvalue { float, i1 } [[DS2]], 0
+; NO-INSTCNT-NEXT: [[SUM01:%.*]] = fadd float [[R0]], [[R1]]
+; NO-INSTCNT-NEXT: [[SUM:%.*]] = fadd float [[SUM01]], [[R2]]
+; NO-INSTCNT-NEXT: store float [[SUM]], ptr addrspace(1) [[OUTPUT]], align 4
+; NO-INSTCNT-NEXT: ret void
 ;
 entry:
   %n0 = load float, ptr addrspace(1) %num, align 4
@@ -451,21 +471,18 @@ define amdgpu_kernel void @kernel_fmed3(ptr addrspace(1) %a, ptr addrspace(1) %b
 ; GCN-LABEL: define amdgpu_kernel void @kernel_fmed3(
 ; GCN-SAME: ptr addrspace(1) [[A:%.*]], ptr addrspace(1) [[B:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
 ; GCN-NEXT: [[ENTRY:.*:]]
-; GCN-NEXT: [[A0:%.*]] = load float, ptr addrspace(1) [[A]], align 4
-; GCN-NEXT: [[APTR1:%.*]] = getelementptr float, ptr addrspace(1) [[A]], i64 1
-; GCN-NEXT: [[A1:%.*]] = load float, ptr addrspace(1) [[APTR1]], align 4
 ; GCN-NEXT: [[APTR2:%.*]] = getelementptr float, ptr addrspace(1) [[A]], i64 2
 ; GCN-NEXT: [[A2:%.*]] = load float, ptr addrspace(1) [[APTR2]], align 4
-; GCN-NEXT: [[B0:%.*]] = load float, ptr addrspace(1) [[B]], align 4
-; GCN-NEXT: [[BPTR1:%.*]] = getelementptr float, ptr addrspace(1) [[B]], i64 1
-; GCN-NEXT: [[B1:%.*]] = load float, ptr addrspace(1) [[BPTR1]], align 4
 ; GCN-NEXT: [[BPTR2:%.*]] = getelementptr float, ptr addrspace(1) [[B]], i64 2
 ; GCN-NEXT: [[B2:%.*]] = load float, ptr addrspace(1) [[BPTR2]], align 4
-; GCN-NEXT: [[ADD0:%.*]] = fadd float [[A0]], [[B0]]
-; GCN-NEXT: [[ADD1:%.*]] = fadd float [[A1]], [[B1]]
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr addrspace(1) [[A]], align 4
+; GCN-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(1) [[B]], align 4
+; GCN-NEXT: [[TMP2:%.*]] = fadd <2 x float> [[TMP0]], [[TMP1]]
 ; GCN-NEXT: [[ADD2:%.*]] = fadd float [[A2]], [[B2]]
-; GCN-NEXT: [[MED0:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[ADD0]], float [[ADD0]], float 1.000000e+00)
-; GCN-NEXT: [[MED1:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[ADD1]], float [[ADD1]], float 1.000000e+00)
+; GCN-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
+; GCN-NEXT: [[MED0:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[TMP3]], float [[TMP3]], float 1.000000e+00)
+; GCN-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
+; GCN-NEXT: [[MED1:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[TMP4]], float [[TMP4]], float 1.000000e+00)
 ; GCN-NEXT: [[MED2:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[ADD2]], float [[ADD2]], float 1.000000e+00)
 ; GCN-NEXT: [[SUM01:%.*]] = fadd float [[MED0]], [[MED1]]
 ; GCN-NEXT: [[SUM:%.*]] = fadd float [[SUM01]], [[MED2]]
@@ -500,40 +517,75 @@ entry:
 }
 
 define amdgpu_kernel void @kernel_fmed3_1(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %output) {
-; GCN-LABEL: define amdgpu_kernel void @kernel_fmed3_1(
-; GCN-SAME: ptr addrspace(1) [[A:%.*]], ptr addrspace(1) [[B:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
-; GCN-NEXT: [[ENTRY:.*:]]
-; GCN-NEXT: [[A0:%.*]] = load float, ptr addrspace(1) [[A]], align 4
-; GCN-NEXT: [[APTR1:%.*]] = getelementptr float, ptr addrspace(1) [[A]], i64 1
-; GCN-NEXT: [[A1:%.*]] = load float, ptr addrspace(1) [[APTR1]], align 4
-; GCN-NEXT: [[APTR2:%.*]] = getelementptr float, ptr addrspace(1) [[A]], i64 2
-; GCN-NEXT: [[A2:%.*]] = load float, ptr addrspace(1) [[APTR2]], align 4
-; GCN-NEXT: [[APTR3:%.*]] = getelementptr float, ptr addrspace(1) [[A]], i64 3
-; GCN-NEXT: [[A3:%.*]] = load float, ptr addrspace(1) [[APTR3]], align 4
-; GCN-NEXT: [[B0:%.*]] = load float, ptr addrspace(1) [[B]], align 4
-; GCN-NEXT: [[BPTR1:%.*]] = getelementptr float, ptr addrspace(1) [[B]], i64 1
-; GCN-NEXT: [[B1:%.*]] = load float, ptr addrspace(1) [[BPTR1]], align 4
-; GCN-NEXT: [[BPTR2:%.*]] = getelementptr float, ptr addrspace(1) [[B]], i64 2
-; GCN-NEXT: [[B2:%.*]] = load float, ptr addrspace(1) [[BPTR2]], align 4
-; GCN-NEXT: [[BPTR3:%.*]] = getelementptr float, ptr addrspace(1) [[B]], i64 3
-; GCN-NEXT: [[B3:%.*]] = load float, ptr addrspace(1) [[BPTR3]], align 4
-; GCN-NEXT: [[ADD0:%.*]] = fadd float 5.000000e+00, [[B0]]
-; GCN-NEXT: [[ADD1:%.*]] = fadd float 5.000000e+00, [[B1]]
-; GCN-NEXT: [[ADD2:%.*]] = fadd float 5.000000e+00, [[B2]]
-; GCN-NEXT: [[ADD3:%.*]] = fadd float 5.000000e+00, [[B3]]
-; GCN-NEXT: [[SUB0:%.*]] = fadd float 1.000000e+00, [[B0]]
-; GCN-NEXT: [[SUB1:%.*]] = fadd float 1.000000e+00, [[B1]]
-; GCN-NEXT: [[SUB2:%.*]] = fadd float 1.000000e+00, [[B2]]
-; GCN-NEXT: [[SUB3:%.*]] = fadd float 1.000000e+00, [[B3]]
-; GCN-NEXT: [[MED0:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[ADD0]], float [[SUB0]], float 1.000000e+00)
-; GCN-NEXT: [[MED1:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[ADD1]], float [[SUB1]], float 1.000000e+00)
-; GCN-NEXT: [[MED2:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[ADD2]], float [[SUB2]], float 1.000000e+00)
-; GCN-NEXT: [[MED3:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[ADD3]], float [[SUB3]], float 1.000000e+00)
-; GCN-NEXT: [[SUM01:%.*]] = fadd float [[MED0]], [[MED1]]
-; GCN-NEXT: [[SUM02:%.*]] = fadd float [[MED2]], [[MED3]]
-; GCN-NEXT: [[SUM:%.*]] = fadd float [[SUM01]], [[SUM02]]
-; GCN-NEXT: store float [[SUM]], ptr addrspace(1) [[OUTPUT]], align 4
-; GCN-NEXT: ret void
+; INSTCNT-LABEL: define amdgpu_kernel void @kernel_fmed3_1(
+; INSTCNT-SAME: ptr addrspace(1) [[A:%.*]], ptr addrspace(1) [[B:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
+; INSTCNT-NEXT: [[ENTRY:.*:]]
+; INSTCNT-NEXT: [[A0:%.*]] = load float, ptr addrspace(1) [[A]], align 4
+; INSTCNT-NEXT: [[APTR1:%.*]] = getelementptr float, ptr addrspace(1) [[A]], i64 1
+; INSTCNT-NEXT: [[A1:%.*]] = load float, ptr addrspace(1) [[APTR1]], align 4
+; INSTCNT-NEXT: [[APTR2:%.*]] = getelementptr float, ptr addrspace(1) [[A]], i64 2
+; INSTCNT-NEXT: [[A2:%.*]] = load float, ptr addrspace(1) [[APTR2]], align 4
+; INSTCNT-NEXT: [[APTR3:%.*]] = getelementptr float, ptr addrspace(1) [[A]], i64 3
+; INSTCNT-NEXT: [[A3:%.*]] = load float, ptr addrspace(1) [[APTR3]], align 4
+; INSTCNT-NEXT: [[B0:%.*]] = load float, ptr addrspace(1) [[B]], align 4
+; INSTCNT-NEXT: [[BPTR1:%.*]] = getelementptr float, ptr addrspace(1) [[B]], i64 1
+; INSTCNT-NEXT: [[B1:%.*]] = load float, ptr addrspace(1) [[BPTR1]], align 4
+; INSTCNT-NEXT: [[BPTR2:%.*]] = getelementptr float, ptr addrspace(1) [[B]], i64 2
+; INSTCNT-NEXT: [[B2:%.*]] = load float, ptr addrspace(1) [[BPTR2]], align 4
+; INSTCNT-NEXT: [[BPTR3:%.*]] = getelementptr float, ptr addrspace(1) [[B]], i64 3
+; INSTCNT-NEXT: [[B3:%.*]] = load float, ptr addrspace(1) [[BPTR3]], align 4
+; INSTCNT-NEXT: [[ADD0:%.*]] = fadd float 5.000000e+00, [[B0]]
+; INSTCNT-NEXT: [[ADD1:%.*]] = fadd float 5.000000e+00, [[B1]]
+; INSTCNT-NEXT: [[ADD2:%.*]] = fadd float 5.000000e+00, [[B2]]
+; INSTCNT-NEXT: [[ADD3:%.*]] = fadd float 5.000000e+00, [[B3]]
+; INSTCNT-NEXT: [[SUB0:%.*]] = fadd float 1.000000e+00, [[B0]]
+; INSTCNT-NEXT: [[SUB1:%.*]] = fadd float 1.000000e+00, [[B1]]
+; INSTCNT-NEXT: [[SUB2:%.*]] = fadd float 1.000000e+00, [[B2]]
+; INSTCNT-NEXT: [[SUB3:%.*]] = fadd float 1.000000e+00, [[B3]]
+; INSTCNT-NEXT: [[MED0:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[ADD0]], float [[SUB0]], float 1.000000e+00)
+; INSTCNT-NEXT: [[MED1:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[ADD1]], float [[SUB1]], float 1.000000e+00)
+; INSTCNT-NEXT: [[MED2:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[ADD2]], float [[SUB2]], float 1.000000e+00)
+; INSTCNT-NEXT: [[MED3:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[ADD3]], float [[SUB3]], float 1.000000e+00)
+; INSTCNT-NEXT: [[SUM01:%.*]] = fadd float [[MED0]], [[MED1]]
+; INSTCNT-NEXT: [[SUM02:%.*]] = fadd float [[MED2]], [[MED3]]
+; INSTCNT-NEXT: [[SUM:%.*]] = fadd float [[SUM01]], [[SUM02]]
+; INSTCNT-NEXT: store float [[SUM]], ptr addrspace(1) [[OUTPUT]], align 4
+; INSTCNT-NEXT: ret void
+;
+; NO-INSTCNT-LABEL: define amdgpu_kernel void @kernel_fmed3_1(
+; NO-INSTCNT-SAME: ptr addrspace(1) [[A:%.*]], ptr addrspace(1) [[B:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
+; NO-INSTCNT-NEXT: [[ENTRY:.*:]]
+; NO-INSTCNT-NEXT: [[A0:%.*]] = load float, ptr addrspace(1) [[A]], align 4
+; NO-INSTCNT-NEXT: [[APTR1:%.*]] = getelementptr float, ptr addrspace(1) [[A]], i64 1
+; NO-INSTCNT-NEXT: [[A1:%.*]] = load float, ptr addrspace(1) [[APTR1]], align 4
+; NO-INSTCNT-NEXT: [[APTR2:%.*]] = getelementptr float, ptr addrspace(1) [[A]], i64 2
+; NO-INSTCNT-NEXT: [[A2:%.*]] = load float, ptr addrspace(1) [[APTR2]], align 4
+; NO-INSTCNT-NEXT: [[APTR3:%.*]] = getelementptr float, ptr addrspace(1) [[A]], i64 3
+; NO-INSTCNT-NEXT: [[A3:%.*]] = load float, ptr addrspace(1) [[APTR3]], align 4
+; NO-INSTCNT-NEXT: [[BPTR2:%.*]] = getelementptr float, ptr addrspace(1) [[B]], i64 2
+; NO-INSTCNT-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr addrspace(1) [[B]], align 4
+; NO-INSTCNT-NEXT: [[TMP1:%.*]] = fadd <2 x float> splat (float 5.000000e+00), [[TMP0]]
+; NO-INSTCNT-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr addrspace(1) [[BPTR2]], align 4
+; NO-INSTCNT-NEXT: [[TMP3:%.*]] = fadd <2 x float> splat (float 5.000000e+00), [[TMP2]]
+; NO-INSTCNT-NEXT: [[TMP4:%.*]] = fadd <2 x float> splat (float 1.000000e+00), [[TMP0]]
+; NO-INSTCNT-NEXT: [[TMP5:%.*]] = fadd <2 x float> splat (float 1.000000e+00), [[TMP2]]
+; NO-INSTCNT-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
+; NO-INSTCNT-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
+; NO-INSTCNT-NEXT: [[MED0:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[TMP6]], float [[TMP7]], float 1.000000e+00)
+; NO-INSTCNT-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+; NO-INSTCNT-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
+; NO-INSTCNT-NEXT: [[MED1:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[TMP8]], float [[TMP9]], float 1.000000e+00)
+; NO-INSTCNT-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; NO-INSTCNT-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP5]], i32 0
+; NO-INSTCNT-NEXT: [[MED2:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[TMP10]], float [[TMP11]], float 1.000000e+00)
+; NO-INSTCNT-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; NO-INSTCNT-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP5]], i32 1
+; NO-INSTCNT-NEXT: [[MED3:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[TMP12]], float [[TMP13]], float 1.000000e+00)
+; NO-INSTCNT-NEXT: [[SUM01:%.*]] = fadd float [[MED0]], [[MED1]]
+; NO-INSTCNT-NEXT: [[SUM02:%.*]] = fadd float [[MED2]], [[MED3]]
+; NO-INSTCNT-NEXT: [[SUM:%.*]] = fadd float [[SUM01]], [[SUM02]]
+; NO-INSTCNT-NEXT: store float [[SUM]], ptr addrspace(1) [[OUTPUT]], align 4
+; NO-INSTCNT-NEXT: ret void
 ;
 entry:
   %a0 = load float, ptr addrspace(1) %a, align 4
@@ -680,44 +732,81 @@ entry:
 }
 
 define amdgpu_kernel void @kernel_alternate(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %output) {
-; GCN-LABEL: define amdgpu_kernel void @kernel_alternate(
-; GCN-SAME: ptr addrspace(1) [[A:%.*]], ptr addrspace(1) [[B:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
-; GCN-NEXT: [[ENTRY:.*:]]
-; GCN-NEXT: [[A0:%.*]] = load float, ptr addrspace(1) [[A]], align 4
-; GCN-NEXT: [[APTR1:%.*]] = getelementptr float, ptr addrspace(1) [[A]], i64 1
-; GCN-NEXT: [[A1:%.*]] = load float, ptr addrspace(1) [[APTR1]], align 4
-; GCN-NEXT: [[APTR2:%.*]] = getelementptr float, ptr addrspace(1) [[A]], i64 2
-; GCN-NEXT: [[A2:%.*]] = load float, ptr addrspace(1) [[APTR2]], align 4
-; GCN-NEXT: [[B0:%.*]] = load float, ptr addrspace(1) [[B]], align 4
-; GCN-NEXT: [[BPTR1:%.*]] = getelementptr float, ptr addrspace(1) [[B]], i64 1
-; GCN-NEXT: [[B1:%.*]] = load float, ptr addrspace(1) [[BPTR1]], align 4
-; GCN-NEXT: [[BPTR2:%.*]] = getelementptr float, ptr addrspace(1) [[B]], i64 2
-; GCN-NEXT: [[B2:%.*]] = load float, ptr addrspace(1) [[BPTR2]], align 4
-; GCN-NEXT: [[ADD0:%.*]] = fadd float [[A0]], [[B0]]
-; GCN-NEXT: [[SUB0:%.*]] = fsub float [[A0]], [[B0]]
-; GCN-NEXT: [[ADD1:%.*]] = fadd float [[A1]], [[B1]]
-; GCN-NEXT: [[SUB1:%.*]] = fsub float [[A1]], [[B1]]
-; GCN-NEXT: [[ADD2:%.*]] = fadd float [[A2]], [[B2]]
-; GCN-NEXT: [[SUB2:%.*]] = fsub float [[A2]], [[B2]]
-; GCN-NEXT: [[E0:%.*]] = call float @llvm.amdgcn.exp2.f32(float [[ADD0]])
-; GCN-NEXT: [[E1:%.*]] = call float @llvm.amdgcn.exp2.f32(float [[SUB0]])
-; GCN-NEXT: [[E2:%.*]] = call float @llvm.amdgcn.log.f32(float [[ADD1]])
-; GCN-NEXT: [[E3:%.*]] = call float @llvm.amdgcn.log.f32(float [[SUB1]])
-; GCN-NEXT: [[E4:%.*]] = call float @llvm.amdgcn.exp2.f32(float [[ADD2]])
-; GCN-NEXT: [[E5:%.*]] = call float @llvm.amdgcn.exp2.f32(float [[SUB2]])
-; GCN-NEXT: [[OPTR0:%.*]] = getelementptr float, ptr addrspace(1) [[OUTPUT]], i64 0
-; GCN-NEXT: [[OPTR1:%.*]] = getelementptr float, ptr addrspace(1) [[OUTPUT]], i64 1
-; GCN-NEXT: [[OPTR2:%.*]] = getelementptr float, ptr addrspace(1) [[OUTPUT]], i64 2
-; GCN-NEXT: [[OPTR3:%.*]] = getelementptr float, ptr addrspace(1) [[OUTPUT]], i64 3
-; GCN-NEXT: [[OPTR4:%.*]] = getelementptr float, ptr addrspace(1) [[OUTPUT]], i64 4
-; GCN-NEXT: [[OPTR5:%.*]] = getelementptr float, ptr addrspace(1) [[OUTPUT]], i64 5
-; GCN-NEXT: store float [[E0]], ptr addrspace(1) [[OPTR0]], align 4
-; GCN-NEXT: store float [[E1]], ptr addrspace(1) [[OPTR1]], align 4
-; GCN-NEXT: store float [[E2]], ptr addrspace(1) [[OPTR2]], align 4
-; GCN-NEXT: store float [[E3]], ptr addrspace(1) [[OPTR3]], align 4
-; GCN-NEXT: store float [[E4]], ptr addrspace(1) [[OPTR4]], align 4
-; GCN-NEXT: store float [[E5]], ptr addrspace(1) [[OPTR5]], align 4
-; GCN-NEXT: ret void
+; INSTCNT-LABEL: define amdgpu_kernel void @kernel_alternate(
+; INSTCNT-SAME: ptr addrspace(1) [[A:%.*]], ptr addrspace(1) [[B:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
+; INSTCNT-NEXT: [[ENTRY:.*:]]
+; INSTCNT-NEXT: [[A0:%.*]] = load float, ptr addrspace(1) [[A]], align 4
+; INSTCNT-NEXT: [[APTR1:%.*]] = getelementptr float, ptr addrspace(1) [[A]], i64 1
+; INSTCNT-NEXT: [[A1:%.*]] = load float, ptr addrspace(1) [[APTR1]], align 4
+; INSTCNT-NEXT: [[APTR2:%.*]] = getelementptr float, ptr addrspace(1) [[A]], i64 2
+; INSTCNT-NEXT: [[A2:%.*]] = load float, ptr addrspace(1) [[APTR2]], align 4
+; INSTCNT-NEXT: [[B0:%.*]] = load float, ptr addrspace(1) [[B]], align 4
+; INSTCNT-NEXT: [[BPTR1:%.*]] = getelementptr float, ptr addrspace(1) [[B]], i64 1
+; INSTCNT-NEXT: [[B1:%.*]] = load float, ptr addrspace(1) [[BPTR1]], align 4
+; INSTCNT-NEXT: [[BPTR2:%.*]] = getelementptr float, ptr addrspace(1) [[B]], i64 2
+; INSTCNT-NEXT: [[B2:%.*]] = load float, ptr addrspace(1) [[BPTR2]], align 4
+; INSTCNT-NEXT: [[ADD0:%.*]] = fadd float [[A0]], [[B0]]
+; INSTCNT-NEXT: [[SUB0:%.*]] = fsub float [[A0]], [[B0]]
+; INSTCNT-NEXT: [[ADD1:%.*]] = fadd float [[A1]], [[B1]]
+; INSTCNT-NEXT: [[SUB1:%.*]] = fsub float [[A1]], [[B1]]
+; INSTCNT-NEXT: [[ADD2:%.*]] = fadd float [[A2]], [[B2]]
+; INSTCNT-NEXT: [[SUB2:%.*]] = fsub float [[A2]], [[B2]]
+; INSTCNT-NEXT: [[E0:%.*]] = call float @llvm.amdgcn.exp2.f32(float [[ADD0]])
+; INSTCNT-NEXT: [[E1:%.*]] = call float @llvm.amdgcn.exp2.f32(float [[SUB0]])
+; INSTCNT-NEXT: [[E2:%.*]] = call float @llvm.amdgcn.log.f32(float [[ADD1]])
+; INSTCNT-NEXT: [[E3:%.*]] = call float @llvm.amdgcn.log.f32(float [[SUB1]])
+; INSTCNT-NEXT: [[E4:%.*]] = call float @llvm.amdgcn.exp2.f32(float [[ADD2]])
+; INSTCNT-NEXT: [[E5:%.*]] = call float @llvm.amdgcn.exp2.f32(float [[SUB2]])
+; INSTCNT-NEXT: [[OPTR0:%.*]] = getelementptr float, ptr addrspace(1) [[OUTPUT]], i64 0
+; INSTCNT-NEXT: [[OPTR1:%.*]] = getelementptr float, ptr addrspace(1) [[OUTPUT]], i64 1
+; INSTCNT-NEXT: [[OPTR2:%.*]] = getelementptr float, ptr addrspace(1) [[OUTPUT]], i64 2
+; INSTCNT-NEXT: [[OPTR3:%.*]] = getelementptr float, ptr addrspace(1) [[OUTPUT]], i64 3
+; INSTCNT-NEXT: [[OPTR4:%.*]] = getelementptr float, ptr addrspace(1) [[OUTPUT]], i64 4
+; INSTCNT-NEXT: [[OPTR5:%.*]] = getelementptr float, ptr addrspace(1) [[OUTPUT]], i64 5
+; INSTCNT-NEXT: store float [[E0]], ptr addrspace(1) [[OPTR0]], align 4
+; INSTCNT-NEXT: store float [[E1]], ptr addrspace(1) [[OPTR1]], align 4
+; INSTCNT-NEXT: store float [[E2]], ptr addrspace(1) [[OPTR2]], align 4
+; INSTCNT-NEXT: store float [[E3]], ptr addrspace(1) [[OPTR3]], align 4
+; INSTCNT-NEXT: store float [[E4]], ptr addrspace(1) [[OPTR4]], align 4
+; INSTCNT-NEXT: store float [[E5]], ptr addrspace(1) [[OPTR5]], align 4
+; INSTCNT-NEXT: ret void
+;
+; NO-INSTCNT-LABEL: define amdgpu_kernel void @kernel_alternate(
+; NO-INSTCNT-SAME: ptr addrspace(1) [[A:%.*]], ptr addrspace(1) [[B:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
+; NO-INSTCNT-NEXT: [[ENTRY:.*:]]
+; NO-INSTCNT-NEXT: [[APTR2:%.*]] = getelementptr float, ptr addrspace(1) [[A]], i64 2
+; NO-INSTCNT-NEXT: [[A2:%.*]] = load float, ptr addrspace(1) [[APTR2]], align 4
+; NO-INSTCNT-NEXT: [[BPTR2:%.*]] = getelementptr float, ptr addrspace(1) [[B]], i64 2
+; NO-INSTCNT-NEXT: [[B2:%.*]] = load float, ptr addrspace(1) [[BPTR2]], align 4
+; NO-INSTCNT-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr addrspace(1) [[A]], align 4
+; NO-INSTCNT-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(1) [[B]], align 4
+; NO-INSTCNT-NEXT: [[TMP2:%.*]] = fadd <2 x float> [[TMP0]], [[TMP1]]
+; NO-INSTCNT-NEXT: [[TMP3:%.*]] = fsub <2 x float> [[TMP0]], [[TMP1]]
+; NO-INSTCNT-NEXT: [[ADD2:%.*]] = fadd float [[A2]], [[B2]]
+; NO-INSTCNT-NEXT: [[SUB2:%.*]] = fsub float [[A2]], [[B2]]
+; NO-INSTCNT-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
+; NO-INSTCNT-NEXT: [[E0:%.*]] = call float @llvm.amdgcn.exp2.f32(float [[TMP4]])
+; NO-INSTCNT-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; NO-INSTCNT-NEXT: [[E1:%.*]] = call float @llvm.amdgcn.exp2.f32(float [[TMP5]])
+; NO-INSTCNT-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
+; NO-INSTCNT-NEXT: [[E2:%.*]] = call float @llvm.amdgcn.log.f32(float [[TMP6]])
+; NO-INSTCNT-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; NO-INSTCNT-NEXT: [[E3:%.*]] = call float @llvm.amdgcn.log.f32(float [[TMP7]])
+; NO-INSTCNT-NEXT: [[E4:%.*]] = call float @llvm.amdgcn.exp2.f32(float [[ADD2]])
+; NO-INSTCNT-NEXT: [[E5:%.*]] = call float @llvm.amdgcn.exp2.f32(float [[SUB2]])
+; NO-INSTCNT-NEXT: [[OPTR0:%.*]] = getelementptr float, ptr addrspace(1) [[OUTPUT]], i64 0
+; NO-INSTCNT-NEXT: [[OPTR1:%.*]] = getelementptr float, ptr addrspace(1) [[OUTPUT]], i64 1
+; NO-INSTCNT-NEXT: [[OPTR2:%.*]] = getelementptr float, ptr addrspace(1) [[OUTPUT]], i64 2
+; NO-INSTCNT-NEXT: [[OPTR3:%.*]] = getelementptr float, ptr addrspace(1) [[OUTPUT]], i64 3
+; NO-INSTCNT-NEXT: [[OPTR4:%.*]] = getelementptr float, ptr addrspace(1) [[OUTPUT]], i64 4
+; NO-INSTCNT-NEXT: [[OPTR5:%.*]] = getelementptr float, ptr addrspace(1) [[OUTPUT]], i64 5
+; NO-INSTCNT-NEXT: store float [[E0]], ptr addrspace(1) [[OPTR0]], align 4
+; NO-INSTCNT-NEXT: store float [[E1]], ptr addrspace(1) [[OPTR1]], align 4
+; NO-INSTCNT-NEXT: store float [[E2]], ptr addrspace(1) [[OPTR2]], align 4
+; NO-INSTCNT-NEXT: store float [[E3]], ptr addrspace(1) [[OPTR3]], align 4
+; NO-INSTCNT-NEXT: store float [[E4]], ptr addrspace(1) [[OPTR4]], align 4
+; NO-INSTCNT-NEXT: store float [[E5]], ptr addrspace(1) [[OPTR5]], align 4
+; NO-INSTCNT-NEXT: ret void
 ;
 entry:
   %a0 = load float, ptr addrspace(1) %a, align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
index e13dfce8c29f3..016726e5ae371 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
@@ -177,11 +177,10 @@ define ptr @test4() {
 ; NONPOWEROF2-NEXT: [[TMP8:%.*]] = phi <6 x float> [ poison, [[TMP6:%.*]] ], [ [[TMP5]], [[TMP0:%.*]] ]
 ; NONPOWEROF2-NEXT: br label [[TMP9:%.*]]
 ; NONPOWEROF2: 10:
-; NONPOWEROF2-NEXT: [[TMP10:%.*]] = shufflevector <6 x float> [[TMP8]], <6 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
-; NONPOWEROF2-NEXT: [[TMP11:%.*]] = fmul <3 x float> zeroinitializer, [[TMP10]]
-; NONPOWEROF2-NEXT: [[TMP12:%.*]] = shufflevector <6 x float> [[TMP8]], <6 x float> poison, <3 x i32> <i32 3, i32 4, i32 5>
-; NONPOWEROF2-NEXT: [[TMP13:%.*]] = fmul <3 x float> zeroinitializer, [[TMP12]]
+; NONPOWEROF2-NEXT: [[TMP12:%.*]] = fmul <6 x float> zeroinitializer, [[TMP8]]
+; NONPOWEROF2-NEXT: [[TMP11:%.*]] = shufflevector <6 x float> [[TMP12]], <6 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
 ; NONPOWEROF2-NEXT: [[TMP14:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP11]])
+; NONPOWEROF2-NEXT: [[TMP13:%.*]] = shufflevector <6 x float> [[TMP12]], <6 x float> poison, <3 x i32> <i32 3, i32 4, i32 5>
 ; NONPOWEROF2-NEXT: [[TMP15:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP13]])
 ; NONPOWEROF2-NEXT: [[TMP16:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP14]])
 ; NONPOWEROF2-NEXT: [[TMP17:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP15]])
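
Note: for readers following the patch, here is a minimal standalone sketch of the pattern the new code path targets, distilled from the @test_amdgcn_exp_log checks above. The kernel name @exp2_pair and its body are illustrative only and are not part of the patch:

  ; RUN: opt -passes=slp-vectorizer -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 %s
  define amdgpu_kernel void @exp2_pair(ptr addrspace(1) %in, ptr addrspace(1) %out) {
  entry:
    %p1 = getelementptr float, ptr addrspace(1) %in, i64 1
    %a0 = load float, ptr addrspace(1) %in, align 4
    %a1 = load float, ptr addrspace(1) %p1, align 4
    ; @llvm.amdgcn.exp2.f32 is not trivially vectorizable, so the two calls
    ; below stay scalar; the patch lets SLP vectorize the chain that feeds
    ; their operands instead.
    %s0 = fsub float %a0, 1.000000e+00
    %s1 = fsub float %a1, 1.000000e+00
    %e0 = tail call float @llvm.amdgcn.exp2.f32(float %s0)
    %e1 = tail call float @llvm.amdgcn.exp2.f32(float %s1)
    %sum = fadd float %e0, %e1
    store float %sum, ptr addrspace(1) %out, align 4
    ret void
  }
  declare float @llvm.amdgcn.exp2.f32(float)

With the patch applied, the two loads and the two fsubs should collapse into <2 x float> operations followed by two extractelements feeding the still-scalar exp2 calls, matching the updated GCN check lines in the test above.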