diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
index 877c83291170b..4e87abaa93dae 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -148,6 +148,11 @@ struct SLPVectorizerPass : public PassInfoMixin<SLPVectorizerPass> {
   bool vectorizeInserts(InstSetVector &Instructions, BasicBlock *BB,
                         slpvectorizer::BoUpSLP &R);
 
+  /// Tries to vectorize the operands of non-trivially-vectorizable intrinsic
+  /// calls.
+  bool vectorizeNonTriviallyVectorizableIntrinsicCallOperand(
+      InstSetVector &IIs, BasicBlock *BB, slpvectorizer::BoUpSLP &R);
+
   /// Scan the basic block and look for patterns that are likely to start
   /// a vectorization chain.
   bool vectorizeChainsInBlock(BasicBlock *BB, slpvectorizer::BoUpSLP &R);
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index cc5a206cacbd0..e1456f0ebbac2 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -270,6 +270,34 @@ static const int MinScheduleRegionSize = 16;
 /// Maximum allowed number of operands in the PHI nodes.
 static const unsigned MaxPHINumOperands = 128;
 
+/// For a non-trivially vectorizable intrinsic call, collect the instruction
+/// operands that are candidates for vectorization.
+/// FIXME: Extend for all non-vectorized functions.
+SmallVector<Value *>
+getNonTriviallyVectorizableIntrinsicCallOperand(Value *V) {
+
+  auto *II = dyn_cast<IntrinsicInst>(V);
+  if (!II || isAssumeLikeIntrinsic(II))
+    return {};
+
+  if (isTriviallyVectorizable(II->getIntrinsicID()))
+    return {};
+
+  // Skip vector-returning intrinsics in non-revec mode.
+  if (!SLPReVec && II->getType()->isVectorTy())
+    return {};
+
+  // FIXME: Add non-instruction operands to the list.
+  SmallVector<Value *> Operands;
+  for (Value *ArgOp : II->args()) {
+    if (auto *I = dyn_cast<Instruction>(ArgOp)) {
+      Operands.emplace_back(I);
+    }
+  }
+
+  return Operands;
+}
+
 /// Predicate for the element types that the SLP vectorizer supports.
 ///
 /// The most important thing to filter here are types which are invalid in LLVM
@@ -30263,6 +30291,89 @@ bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
   return Changed;
 }
 
+bool SLPVectorizerPass::vectorizeNonTriviallyVectorizableIntrinsicCallOperand(
+    InstSetVector &IIs, BasicBlock *BB, BoUpSLP &R) {
+
+  bool Changed = false;
+
+  // Pass 1: try to find horizontal reductions of the operands.
+  for (Instruction *I : IIs) {
+    auto *II = dyn_cast<IntrinsicInst>(I);
+    if (!II || R.isDeleted(II))
+      continue;
+    for (Value *Op : II->args())
+      if (auto *RootOp = dyn_cast<Instruction>(Op)) {
+        Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
+        if (R.isDeleted(II))
+          break;
+      }
+  }
+  // Operand sorter.
+  auto OperandSorter = [this](Value *V1, Value *V2) -> bool {
+    if (V1 == V2)
+      return false;
+    auto *I1 = cast<Instruction>(V1);
+    auto *I2 = cast<Instruction>(V2);
+    if (I1->getType()->getTypeID() != I2->getType()->getTypeID())
+      return I1->getType()->getTypeID() < I2->getType()->getTypeID();
+    if (I1->getType()->getScalarSizeInBits() !=
+        I2->getType()->getScalarSizeInBits())
+      return I1->getType()->getScalarSizeInBits() <
+             I2->getType()->getScalarSizeInBits();
+    DomTreeNodeBase<BasicBlock> *Node1 = DT->getNode(I1->getParent());
+    DomTreeNodeBase<BasicBlock> *Node2 = DT->getNode(I2->getParent());
+    if (!Node1)
+      return Node2 != nullptr;
+    if (!Node2)
+      return false;
+    if (Node1->getDFSNumIn() == Node2->getDFSNumIn()) {
+      if (I1->getOpcode() != I2->getOpcode())
+        return I1->getOpcode() < I2->getOpcode();
+      return I1->comesBefore(I2);
+    }
+    return Node1->getDFSNumIn() < Node2->getDFSNumIn();
+  };
+
+  // Compatibility checker for the operands.
+  auto AreCompatibleOperands = [](ArrayRef<Value *> VL, Value *V) -> bool {
+    if (VL.empty() || VL.back() == V)
+      return true;
+    auto *I1 = cast<Instruction>(VL.back());
+    auto *I2 = cast<Instruction>(V);
+    return I1->getType() == I2->getType() &&
+           I1->getParent() == I2->getParent() &&
+           I1->getOpcode() == I2->getOpcode();
+  };
+
+  // Collect the operands of the non-trivially-vectorizable intrinsic calls.
+  SmallVector<Value *> CandidateSeeds;
+  for (Instruction *I : IIs) {
+    auto *II = dyn_cast<IntrinsicInst>(I);
+    if (!II || R.isDeleted(II))
+      continue;
+    SmallVector<Value *> Ops =
+        getNonTriviallyVectorizableIntrinsicCallOperand(II);
+    for (Value *Op : Ops)
+      if (isa<Instruction>(Op))
+        CandidateSeeds.push_back(Op);
+  }
+
+  auto CandidatesFiltered = make_filter_range(CandidateSeeds, [&](Value *V) {
+    auto *I = dyn_cast<Instruction>(V);
+    return I && !R.isDeleted(I) && isValidElementType(I->getType());
+  });
+  SmallVector<Value *> CandidateVec(CandidatesFiltered);
+  // Pass 2: try to vectorize the operands.
+  Changed |= tryToVectorizeSequence<Value>(
+      CandidateVec, OperandSorter, AreCompatibleOperands,
+      [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
+        return tryToVectorizeList(Candidates, R, MaxVFOnly);
+      },
+      /*MaxVFOnly=*/true, R);
+
+  return Changed;
+}
+
 bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
                                          BasicBlock *BB, BoUpSLP &R) {
   assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
@@ -30535,21 +30646,33 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
   InstSetVector PostProcessInserts;
   SmallSetVector<CmpInst *, 8> PostProcessCmps;
-  // Vectorizes Inserts in `PostProcessInserts` and if `VectorizeCmps` is true
-  // also vectorizes `PostProcessCmps`.
-  auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
+  InstSetVector PostProcessIntrinsicCalls;
+  // Vectorizes Inserts in `PostProcessInserts` and if `AtTerminator` is true
+  // also vectorizes `PostProcessCmps` and `PostProcessIntrinsicCalls`.
+  auto VectorizeInsertsAndCmps = [&](bool AtTerminator) {
     bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
-    if (VectorizeCmps) {
+    if (AtTerminator) {
       Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
       PostProcessCmps.clear();
+      Changed |= vectorizeNonTriviallyVectorizableIntrinsicCallOperand(
+          PostProcessIntrinsicCalls, BB, R);
+      PostProcessIntrinsicCalls.clear();
     }
     PostProcessInserts.clear();
    return Changed;
   };
+  auto isNonTriviallyVectorizableIntrinsic = [](const IntrinsicInst *II) {
+    return !isTriviallyVectorizable(II->getIntrinsicID()) &&
+           !isAssumeLikeIntrinsic(II) &&
+           !(!SLPReVec && II->getType()->isVectorTy());
+  };
-  // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
+  // Returns true if `I` is in `PostProcessInserts`, `PostProcessCmps` or `PostProcessIntrinsicCalls`.
   auto IsInPostProcessInstrs = [&](Instruction *I) {
     if (auto *Cmp = dyn_cast<CmpInst>(I))
       return PostProcessCmps.contains(Cmp);
+    if (auto *II = dyn_cast<IntrinsicInst>(I);
+        II && isNonTriviallyVectorizableIntrinsic(II))
+      return PostProcessIntrinsicCalls.contains(II);
     return isa<InsertElementInst, InsertValueInst>(I) &&
            PostProcessInserts.contains(I);
   };
@@ -30560,6 +30683,11 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
     return I->use_empty() &&
            (I->getType()->isVoidTy() || isa<CallBase>(I));
   };
+  SmallMapVector<std::pair<Intrinsic::ID, unsigned>, // (ID, OpIndex)
+                 SmallMapVector<Type *, SmallVector<Instruction *>, 4>,
+                 4>
+      IntrinsicSeedOps;
   for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
     // Skip instructions with scalable type. The num of elements is unknown at
     // compile-time for scalable type.
@@ -30572,7 +30700,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
     // We may go through BB multiple times so skip the one we have checked.
     if (!VisitedInstrs.insert(&*It).second) {
       if (HasNoUsers(&*It) &&
-          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
+          VectorizeInsertsAndCmps(/*AtTerminator=*/It->isTerminator())) {
         // We would like to start over since some instructions are deleted
         // and the iterator may become invalid value.
         Changed = true;
@@ -30652,7 +30780,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
       // top-tree instructions to try to vectorize as many instructions as
       // possible.
       OpsChanged |=
-          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
+          VectorizeInsertsAndCmps(/*AtTerminator=*/It->isTerminator());
       if (OpsChanged) {
         // We would like to start over since some instructions are deleted
         // and the iterator may become invalid value.
@@ -30667,6 +30795,9 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
       PostProcessInserts.insert(&*It);
     else if (isa<CmpInst>(It))
      PostProcessCmps.insert(cast<CmpInst>(&*It));
+    else if (auto *II = dyn_cast<IntrinsicInst>(&*It);
+             II && isNonTriviallyVectorizableIntrinsic(II))
+      PostProcessIntrinsicCalls.insert(II);
   }
 
   return Changed;
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/notriviallyvectorizableintrinsicoperands.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/notriviallyvectorizableintrinsicoperands.ll
index 26d61cb6bc28d..b8df8d44968f0 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/notriviallyvectorizableintrinsicoperands.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/notriviallyvectorizableintrinsicoperands.ll
@@ -1,28 +1,51 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
-; RUN: opt -passes=slp-vectorizer -S -mcpu=gfx1250 -mtriple=amdgcn-amd-amdhsa -o - %s | FileCheck %s --check-prefix GCN
+; RUN: opt -passes=slp-vectorizer -slp-inst-count-check=true -S -mcpu=gfx1250 -mtriple=amdgcn-amd-amdhsa -o - %s | FileCheck %s --check-prefixes GCN,INSTCNT
+; RUN: opt -passes=slp-vectorizer -slp-inst-count-check=false -S -mcpu=gfx1250 -mtriple=amdgcn-amd-amdhsa -o - %s | FileCheck %s --check-prefixes GCN,NO-INSTCNT
 
 define amdgpu_kernel void @test_with_wmma(
     ptr addrspace(1) %input, ptr addrspace(1) %output, float %scaled_max, <16 x i32> %A, <16 x i32> %B, i32 %scale_idx) {
-; GCN-LABEL: define amdgpu_kernel void @test_with_wmma(
-; GCN-SAME: ptr addrspace(1) [[INPUT:%.*]], ptr addrspace(1) [[OUTPUT:%.*]], float [[SCALED_MAX:%.*]], <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], i32 [[SCALE_IDX:%.*]]) #[[ATTR0:[0-9]+]] {
-; GCN-NEXT: [[ENTRY:.*:]]
-; GCN-NEXT: [[IN0:%.*]] = load float, ptr addrspace(1) [[INPUT]], align 4
-; GCN-NEXT: [[PTR1:%.*]] = getelementptr float, ptr addrspace(1) [[INPUT]], i64 1
-; GCN-NEXT: [[IN1:%.*]] = load float, ptr addrspace(1) [[PTR1]], align 4
-; GCN-NEXT: [[MUL0:%.*]] = fmul contract float [[IN0]], 0x3FC0527DC0000000
-; GCN-NEXT: [[MUL1:%.*]] = fmul contract float [[IN1]], 0x3FC0527DC0000000
-; GCN-NEXT: [[SUB0:%.*]] = fsub contract float [[MUL0]], [[SCALED_MAX]]
-; GCN-NEXT: [[SUB1:%.*]] = fsub contract float [[MUL1]], [[SCALED_MAX]]
-; GCN-NEXT: [[EXP0:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[SUB0]])
-; GCN-NEXT: [[EXP1:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[SUB1]])
-; GCN-NEXT: [[VEC0:%.*]] = insertelement <2 x float> poison, float [[EXP0]], i64 0
-; GCN-NEXT: [[VEC1:%.*]] = insertelement <2 x float> [[VEC0]], float [[EXP1]], i64 1
-; GCN-NEXT: [[VEC_I32:%.*]] = bitcast <2 x float> [[VEC1]] to <2 x i32>
-; GCN-NEXT: [[SCALE0:%.*]] = extractelement <2 x i32> [[VEC_I32]], i64 0
-; GCN-NEXT: [[SCALE1:%.*]] = extractelement <2 x i32> [[VEC_I32]], i64 1
-; GCN-NEXT: [[WMMA0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> [[A]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> zeroinitializer, i32 0, i32 0, i32 [[SCALE0]], i32 0, i32 0, i32 [[SCALE_IDX]], i1 false, i1 false)
-; GCN-NEXT: [[WMMA1:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> [[A]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> [[WMMA0]], i32 0, i32 0, i32 [[SCALE1]], i32 0, i32 0, i32 [[SCALE_IDX]], i1 false, i1 false)
-; GCN-NEXT: store <8 x float> [[WMMA1]], ptr addrspace(1) [[OUTPUT]], align 32
-; GCN-NEXT: ret void
+; INSTCNT-LABEL: define amdgpu_kernel void @test_with_wmma(
+; INSTCNT-SAME: ptr addrspace(1) [[INPUT:%.*]], ptr addrspace(1) [[OUTPUT:%.*]], float [[SCALED_MAX:%.*]], <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], i32 [[SCALE_IDX:%.*]]) #[[ATTR0:[0-9]+]] {
+; INSTCNT-NEXT: [[ENTRY:.*:]]
+; INSTCNT-NEXT: [[IN0:%.*]] = load float, ptr addrspace(1) [[INPUT]], align 4
+; INSTCNT-NEXT: [[PTR1:%.*]] = getelementptr float, ptr addrspace(1) [[INPUT]], i64 1
+; INSTCNT-NEXT: [[IN1:%.*]] = load float, ptr addrspace(1) [[PTR1]], align 4
+; INSTCNT-NEXT: [[MUL0:%.*]] = fmul contract float [[IN0]], 0x3FC0527DC0000000
+; INSTCNT-NEXT: [[MUL1:%.*]] = fmul contract float [[IN1]], 0x3FC0527DC0000000
+; INSTCNT-NEXT: [[SUB0:%.*]] = fsub contract float [[MUL0]], [[SCALED_MAX]]
+; INSTCNT-NEXT: [[SUB1:%.*]] = fsub contract float [[MUL1]], [[SCALED_MAX]]
+; INSTCNT-NEXT: [[EXP0:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[SUB0]])
+; INSTCNT-NEXT: [[EXP1:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[SUB1]])
+; INSTCNT-NEXT: [[VEC0:%.*]] = insertelement <2 x float> poison, float [[EXP0]], i64 0
+; INSTCNT-NEXT: [[VEC1:%.*]] = insertelement <2 x float> [[VEC0]], float [[EXP1]], i64 1
+; INSTCNT-NEXT: [[VEC_I32:%.*]] = bitcast <2 x float> [[VEC1]] to <2 x i32>
+; INSTCNT-NEXT: [[SCALE0:%.*]] = extractelement <2 x i32> [[VEC_I32]], i64 0
+; INSTCNT-NEXT: [[SCALE1:%.*]] = extractelement <2 x i32> [[VEC_I32]], i64 1
+; INSTCNT-NEXT: [[WMMA0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> [[A]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> zeroinitializer, i32 0, i32 0, i32 [[SCALE0]], i32 0, i32 0, i32 [[SCALE_IDX]], i1 false, i1 false)
+; INSTCNT-NEXT: [[WMMA1:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> [[A]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> [[WMMA0]], i32 0, i32 0, i32 [[SCALE1]], i32 0, i32 0, i32 [[SCALE_IDX]], i1 false, i1 false)
+; INSTCNT-NEXT: store <8 x float> [[WMMA1]], ptr addrspace(1) [[OUTPUT]], align 32
+; INSTCNT-NEXT: ret void
+;
+; NO-INSTCNT-LABEL: define amdgpu_kernel void @test_with_wmma(
+; NO-INSTCNT-SAME: ptr addrspace(1) [[INPUT:%.*]], ptr addrspace(1) [[OUTPUT:%.*]], float [[SCALED_MAX:%.*]], <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], i32 [[SCALE_IDX:%.*]]) #[[ATTR0:[0-9]+]] {
+; NO-INSTCNT-NEXT: [[ENTRY:.*:]]
+; NO-INSTCNT-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr addrspace(1) [[INPUT]], align 4
+; NO-INSTCNT-NEXT: [[TMP1:%.*]] = fmul contract <2 x float> [[TMP0]], splat (float 0x3FC0527DC0000000)
+; NO-INSTCNT-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[SCALED_MAX]], i32 0
+; NO-INSTCNT-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer
+; NO-INSTCNT-NEXT: [[TMP4:%.*]] = fsub contract <2 x float> [[TMP1]], [[TMP3]]
+; NO-INSTCNT-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
+; NO-INSTCNT-NEXT: [[EXP0:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[TMP5]])
+; NO-INSTCNT-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
+; NO-INSTCNT-NEXT: [[EXP1:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[TMP6]])
+; NO-INSTCNT-NEXT: [[VEC0:%.*]] = insertelement <2 x float> poison, float [[EXP0]], i64 0
+; NO-INSTCNT-NEXT: [[VEC1:%.*]] = insertelement <2 x float> [[VEC0]], float [[EXP1]], i64 1
+; NO-INSTCNT-NEXT: [[VEC_I32:%.*]] = bitcast <2 x float> [[VEC1]] to <2 x i32>
+; NO-INSTCNT-NEXT: [[SCALE0:%.*]] = extractelement <2 x i32> [[VEC_I32]], i64 0
+; NO-INSTCNT-NEXT: [[SCALE1:%.*]] = extractelement <2 x i32> [[VEC_I32]], i64 1
+; NO-INSTCNT-NEXT: [[WMMA0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> [[A]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> zeroinitializer, i32 0, i32 0, i32 [[SCALE0]], i32 0, i32 0, i32 [[SCALE_IDX]], i1 false, i1 false)
+; NO-INSTCNT-NEXT: [[WMMA1:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> [[A]], i32 0, <16 x i32> [[B]], i16 0, <8 x float> [[WMMA0]], i32 0, i32 0, i32 [[SCALE1]], i32 0, i32 0, i32 [[SCALE_IDX]], i1 false, i1 false)
+; NO-INSTCNT-NEXT: store <8 x float> [[WMMA1]], ptr addrspace(1) [[OUTPUT]], align 32
+; NO-INSTCNT-NEXT: ret void
 ;
 entry:
@@ -61,21 +84,17 @@
 
 define amdgpu_kernel void @test_amdgcn_exp_log(ptr addrspace(1) %input, ptr addrspace(1) %scales, ptr addrspace(1) %output) {
 ; GCN-LABEL: define amdgpu_kernel void @test_amdgcn_exp_log(
-; GCN-SAME: ptr addrspace(1) [[INPUT:%.*]], ptr addrspace(1) [[SCALES:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
+; GCN-SAME: ptr addrspace(1) [[INPUT:%.*]], ptr addrspace(1) [[SCALES:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0:[0-9]+]] {
 ; GCN-NEXT: [[ENTRY:.*:]]
-; GCN-NEXT: [[IN0:%.*]] = load float, ptr addrspace(1) [[INPUT]], align 4
-; GCN-NEXT: [[PTR1:%.*]] = getelementptr float, ptr addrspace(1) [[INPUT]], i64 1
-; GCN-NEXT: [[IN1:%.*]] = load float, ptr addrspace(1) [[PTR1]], align 4
-; GCN-NEXT: [[SCALE0:%.*]] = load float, ptr addrspace(1) [[SCALES]], align 4
-; GCN-NEXT: [[SPTR1:%.*]] = getelementptr float, ptr addrspace(1) [[SCALES]], i64 1
-; GCN-NEXT: [[SCALE1:%.*]] = load float, ptr addrspace(1) [[SPTR1]], align 4
-; GCN-NEXT: [[MUL0:%.*]] = fmul contract float [[IN0]], 0x3FC0527DC0000000
-; GCN-NEXT: [[MUL1:%.*]] = fmul contract float [[IN1]], 0x3FC0527DC0000000
-; GCN-NEXT: [[SUB0:%.*]] = fsub contract float [[MUL0]], [[SCALE0]]
-; GCN-NEXT: [[SUB1:%.*]] = fsub contract float [[MUL1]], [[SCALE1]]
-; GCN-NEXT: [[EXP0:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[SUB0]])
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr addrspace(1) [[INPUT]], align 4
+; GCN-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(1) [[SCALES]], align 4
+; GCN-NEXT: [[TMP2:%.*]] = fmul contract <2 x float> [[TMP0]], splat (float 0x3FC0527DC0000000)
+; GCN-NEXT: [[TMP3:%.*]] = fsub contract <2 x float> [[TMP2]], [[TMP1]]
+; GCN-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; GCN-NEXT: [[EXP0:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[TMP4]])
 ; GCN-NEXT: [[LOG0:%.*]] = tail call float @llvm.amdgcn.log.f32(float [[EXP0]])
-; GCN-NEXT: [[EXP1:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[SUB1]])
+; GCN-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; GCN-NEXT: [[EXP1:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[TMP5]])
 ; GCN-NEXT: [[LOG1:%.*]] = tail call float @llvm.amdgcn.log.f32(float [[EXP1]])
 ; GCN-NEXT: [[SUM:%.*]] = fadd fast float [[LOG0]], [[LOG1]]
 ; GCN-NEXT: store float [[SUM]], ptr addrspace(1) [[OUTPUT]], align 4
@@ -105,18 +124,14 @@ define amdgpu_kernel void @test_amdgcn_exp_f16(ptr addrspace(1) %input, ptr addr
 ; GCN-LABEL: define amdgpu_kernel void @test_amdgcn_exp_f16(
 ; GCN-SAME: ptr addrspace(1) [[INPUT:%.*]], ptr addrspace(1) [[SCALES:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
 ; GCN-NEXT: [[ENTRY:.*:]]
-; GCN-NEXT: [[IN0:%.*]] = load half, ptr addrspace(1) [[INPUT]], align 2
-; GCN-NEXT: [[PTR1:%.*]] = getelementptr half, ptr addrspace(1) [[INPUT]], i64 1
-; GCN-NEXT: [[IN1:%.*]] = load half, ptr addrspace(1) [[PTR1]], align 2
-; GCN-NEXT: [[SCALE0:%.*]] = load half, ptr addrspace(1) [[SCALES]], align 2
-; GCN-NEXT: [[SPTR1:%.*]] = getelementptr half, ptr addrspace(1) [[SCALES]], i64 1
-; GCN-NEXT: [[SCALE1:%.*]] = load half, ptr addrspace(1) [[SPTR1]], align 2
-; GCN-NEXT: [[MUL0:%.*]] = fmul contract half [[IN0]], 0xH3E14
-; GCN-NEXT: [[MUL1:%.*]] = fmul contract half [[IN1]], 0xH3E14
-; GCN-NEXT: [[SUB0:%.*]] = fsub contract half [[MUL0]], [[SCALE0]]
-; GCN-NEXT: [[SUB1:%.*]] = fsub contract half [[MUL1]], [[SCALE1]]
-; GCN-NEXT: [[EXP0:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[SUB0]])
-; GCN-NEXT: [[EXP1:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[SUB1]])
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[INPUT]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[SCALES]], align 2
+; GCN-NEXT: [[TMP2:%.*]] = fmul contract <2 x half> [[TMP0]], splat (half 0xH3E14)
+; GCN-NEXT: [[TMP3:%.*]] = fsub contract <2 x half> [[TMP2]], [[TMP1]]
+; GCN-NEXT: [[TMP4:%.*]] = extractelement <2 x half> [[TMP3]], i32 0
+; GCN-NEXT: [[EXP0:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP4]])
+; GCN-NEXT: [[TMP5:%.*]] = extractelement <2 x half> [[TMP3]], i32 1
+; GCN-NEXT: [[EXP1:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP5]])
 ; GCN-NEXT: [[SUM:%.*]] = fadd fast half [[EXP0]], [[EXP1]]
 ; GCN-NEXT: store half [[SUM]], ptr addrspace(1) [[OUTPUT]], align 2
 ; GCN-NEXT: ret void
@@ -143,18 +158,14 @@ define amdgpu_kernel void @kernel_f16(ptr addrspace(1) %input, ptr addrspace(1)
 ; GCN-LABEL: define amdgpu_kernel void @kernel_f16(
 ; GCN-SAME: ptr addrspace(1) [[INPUT:%.*]], ptr addrspace(1) [[SCALES:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
 ; GCN-NEXT: [[ENTRY:.*:]]
-; GCN-NEXT: [[IN0:%.*]] = load half, ptr addrspace(1) [[INPUT]], align 2
-; GCN-NEXT: [[PTR1:%.*]] = getelementptr half, ptr addrspace(1) [[INPUT]], i64 1
-; GCN-NEXT: [[IN1:%.*]] = load half, ptr addrspace(1) [[PTR1]], align 2
-; GCN-NEXT: [[SCALE0:%.*]] = load half, ptr addrspace(1) [[SCALES]], align 2
-; GCN-NEXT: [[SPTR1:%.*]] = getelementptr half, ptr addrspace(1) [[SCALES]], i64 1
-; GCN-NEXT: [[SCALE1:%.*]] = load half, ptr addrspace(1) [[SPTR1]], align 2
-; GCN-NEXT: [[MUL0:%.*]] = fmul contract half [[IN0]], 0xH3E14
-; GCN-NEXT: [[MUL1:%.*]] = fmul contract half [[IN1]], 0xH3E14
-; GCN-NEXT: [[SUB0:%.*]] = fsub contract half [[MUL0]], [[SCALE0]]
-; GCN-NEXT: [[SUB1:%.*]] = fsub contract half [[MUL1]], [[SCALE1]]
-; GCN-NEXT: [[EXP0:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[SUB0]])
-; GCN-NEXT: [[EXP1:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[SUB1]])
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[INPUT]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[SCALES]], align 2
+; GCN-NEXT: [[TMP2:%.*]] = fmul contract <2 x half> [[TMP0]], splat (half 0xH3E14)
+; GCN-NEXT: [[TMP3:%.*]] = fsub contract <2 x half> [[TMP2]], [[TMP1]]
+; GCN-NEXT: [[TMP4:%.*]] = extractelement <2 x half> [[TMP3]], i32 0
+; GCN-NEXT: [[EXP0:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP4]])
+; GCN-NEXT: [[TMP5:%.*]] = extractelement <2 x half> [[TMP3]], i32 1
+; GCN-NEXT: [[EXP1:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP5]])
 ; GCN-NEXT: [[LOG0:%.*]] = tail call half @llvm.amdgcn.log.f16(half [[EXP0]])
 ; GCN-NEXT: [[LOG1:%.*]] = tail call half @llvm.amdgcn.log.f16(half [[EXP1]])
 ; GCN-NEXT: [[SUM:%.*]] = fadd fast half [[LOG0]], [[LOG1]]
@@ -185,22 +196,18 @@ define amdgpu_kernel void @look_through_reuse_shuffle(
 ; GCN-LABEL: define amdgpu_kernel void @look_through_reuse_shuffle(
 ; GCN-SAME: ptr addrspace(1) noalias [[INPUT:%.*]], ptr addrspace(1) noalias [[SCALES:%.*]], ptr addrspace(1) noalias [[OUTPUT:%.*]]) #[[ATTR0]] {
 ; GCN-NEXT: [[ENTRY:.*:]]
-; GCN-NEXT: [[IPTR1:%.*]] = getelementptr half, ptr addrspace(1) [[INPUT]], i64 1
-; GCN-NEXT: [[SPTR1:%.*]] = getelementptr half, ptr addrspace(1) [[SCALES]], i64 1
-; GCN-NEXT: [[IN0:%.*]] = load half, ptr addrspace(1) [[INPUT]], align 2
-; GCN-NEXT: [[IN1:%.*]] = load half, ptr addrspace(1) [[IPTR1]], align 2
-; GCN-NEXT: [[S0:%.*]] = load half, ptr addrspace(1) [[SCALES]], align 2
-; GCN-NEXT: [[S1:%.*]] = load half, ptr addrspace(1) [[SPTR1]], align 2
-; GCN-NEXT: [[ADD0:%.*]] = fadd contract half [[IN0]], 0xH3E14
-; GCN-NEXT: [[ADD1:%.*]] = fadd contract half [[IN1]], 0xH3E14
-; GCN-NEXT: [[MUL0:%.*]] = fmul contract half [[ADD0]], [[S0]]
-; GCN-NEXT: [[MUL1:%.*]] = fmul contract half [[ADD1]], [[S1]]
-; GCN-NEXT: [[EXP0:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[MUL0]])
-; GCN-NEXT: [[EXP1:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[MUL1]])
-; GCN-NEXT: [[TMP0:%.*]] = insertelement <4 x half> poison, half [[EXP0]], i32 0
-; GCN-NEXT: [[TMP1:%.*]] = insertelement <4 x half> [[TMP0]], half [[EXP1]], i32 1
-; GCN-NEXT: [[TMP2:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-; GCN-NEXT: store <4 x half> [[TMP2]], ptr addrspace(1) [[OUTPUT]], align 2
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[INPUT]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[SCALES]], align 2
+; GCN-NEXT: [[TMP2:%.*]] = fadd contract <2 x half> [[TMP0]], splat (half 0xH3E14)
+; GCN-NEXT: [[TMP3:%.*]] = fmul contract <2 x half> [[TMP2]], [[TMP1]]
+; GCN-NEXT: [[TMP4:%.*]] = extractelement <2 x half> [[TMP3]], i32 0
+; GCN-NEXT: [[EXP0:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP4]])
+; GCN-NEXT: [[TMP5:%.*]] = extractelement <2 x half> [[TMP3]], i32 1
+; GCN-NEXT: [[EXP1:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP5]])
+; GCN-NEXT: [[TMP6:%.*]] = insertelement <4 x half> poison, half [[EXP0]], i32 0
+; GCN-NEXT: [[TMP7:%.*]] = insertelement <4 x half> [[TMP6]], half [[EXP1]], i32 1
+; GCN-NEXT: [[TMP8:%.*]] = shufflevector <4 x half> [[TMP7]], <4 x half> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; GCN-NEXT: store <4 x half> [[TMP8]], ptr addrspace(1) [[OUTPUT]], align 2
 ; GCN-NEXT: ret void
 ;
     ptr addrspace(1) noalias %input, ptr addrspace(1) noalias %scales,
@@ -237,32 +244,24 @@ define amdgpu_kernel void @wider_exp2_f32(ptr addrspace(1) %input, ptr addrspace
 ; GCN-LABEL: define amdgpu_kernel void @wider_exp2_f32(
 ; GCN-SAME: ptr addrspace(1) [[INPUT:%.*]], ptr addrspace(1) [[SCALES:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
 ; GCN-NEXT: [[ENTRY:.*:]]
-; GCN-NEXT: [[IN0:%.*]] = load float, ptr addrspace(1) [[INPUT]], align 4
-; GCN-NEXT: [[PTR1:%.*]] = getelementptr float, ptr addrspace(1) [[INPUT]], i64 1
-; GCN-NEXT: [[IN1:%.*]] = load float, ptr addrspace(1) [[PTR1]], align 4
 ; GCN-NEXT: [[PTR2:%.*]] = getelementptr float, ptr addrspace(1) [[INPUT]], i64 2
-; GCN-NEXT: [[IN2:%.*]] = load float, ptr addrspace(1) [[PTR2]], align 4
-; GCN-NEXT: [[PTR3:%.*]] = getelementptr float, ptr addrspace(1) [[INPUT]], i64 3
-; GCN-NEXT: [[IN3:%.*]] = load float, ptr addrspace(1) [[PTR3]], align 4
-; GCN-NEXT: [[SCALE0:%.*]] = load float, ptr addrspace(1) [[SCALES]], align 4
-; GCN-NEXT: [[SPTR1:%.*]] = getelementptr float, ptr addrspace(1) [[SCALES]], i64 1
-; GCN-NEXT: [[SCALE1:%.*]] = load float, ptr addrspace(1) [[SPTR1]], align 4
 ; GCN-NEXT: [[SPTR2:%.*]] = getelementptr float, ptr addrspace(1) [[SCALES]], i64 2
-; GCN-NEXT: [[SCALE2:%.*]] = load float, ptr addrspace(1) [[SPTR2]], align 4
-; GCN-NEXT: [[SPTR3:%.*]] = getelementptr float, ptr addrspace(1) [[SCALES]], i64 3
-; GCN-NEXT: [[SCALE3:%.*]] = load float, ptr addrspace(1) [[SPTR3]], align 4
-; GCN-NEXT: [[MUL0:%.*]] = fmul contract float [[IN0]], 0x3FC0527DC0000000
-; GCN-NEXT: [[MUL1:%.*]] = fmul contract float [[IN1]], 0x3FC0527DC0000000
-; GCN-NEXT: [[MUL2:%.*]] = fmul contract float [[IN2]], 0x3FC0527DC0000000
-; GCN-NEXT: [[MUL3:%.*]] = fmul contract float [[IN3]], 0x3FC0527DC0000000
-; GCN-NEXT: [[SUB0:%.*]] = fsub contract float [[MUL0]], [[SCALE0]]
-; GCN-NEXT: [[SUB1:%.*]] = fsub contract float [[MUL1]], [[SCALE1]]
-; GCN-NEXT: [[SUB2:%.*]] = fsub contract float [[MUL2]], [[SCALE2]]
-; GCN-NEXT: [[SUB3:%.*]] = fsub contract float [[MUL3]], [[SCALE3]]
-; GCN-NEXT: [[EXP0:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[SUB0]])
-; GCN-NEXT: [[EXP1:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[SUB1]])
-; GCN-NEXT: [[EXP2:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[SUB2]])
-; GCN-NEXT: [[EXP3:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[SUB3]])
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr addrspace(1) [[INPUT]], align 4
+; GCN-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(1) [[SCALES]], align 4
+; GCN-NEXT: [[TMP2:%.*]] = fmul contract <2 x float> [[TMP0]], splat (float 0x3FC0527DC0000000)
+; GCN-NEXT: [[TMP3:%.*]] = fsub contract <2 x float> [[TMP2]], [[TMP1]]
+; GCN-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr addrspace(1) [[PTR2]], align 4
+; GCN-NEXT: [[TMP5:%.*]] = load <2 x float>, ptr addrspace(1) [[SPTR2]], align 4
+; GCN-NEXT: [[TMP6:%.*]] = fmul contract <2 x float> [[TMP4]], splat (float 0x3FC0527DC0000000)
+; GCN-NEXT: [[TMP7:%.*]] = fsub contract <2 x float> [[TMP6]], [[TMP5]]
+; GCN-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; GCN-NEXT: [[EXP0:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[TMP8]])
+; GCN-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; GCN-NEXT: [[EXP1:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[TMP9]])
+; GCN-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP7]], i32 0
+; GCN-NEXT: [[EXP2:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[TMP10]])
+; GCN-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP7]], i32 1
+; GCN-NEXT: [[EXP3:%.*]] = tail call float @llvm.amdgcn.exp2.f32(float [[TMP11]])
 ; GCN-NEXT: [[SUM01:%.*]] = fadd fast float [[EXP0]], [[EXP1]]
 ; GCN-NEXT: [[SUM23:%.*]] = fadd fast float [[EXP2]], [[EXP3]]
 ; GCN-NEXT: [[SUM:%.*]] = fadd fast float [[SUM01]], [[SUM23]]
@@ -313,32 +312,24 @@ define amdgpu_kernel void @wider_exp2_half(ptr addrspace(1) %input, ptr addrspac
 ; GCN-LABEL: define amdgpu_kernel void @wider_exp2_half(
 ; GCN-SAME: ptr addrspace(1) [[INPUT:%.*]], ptr addrspace(1) [[SCALES:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
 ; GCN-NEXT: [[ENTRY:.*:]]
-; GCN-NEXT: [[IN0:%.*]] = load half, ptr addrspace(1) [[INPUT]], align 2
-; GCN-NEXT: [[PTR1:%.*]] = getelementptr half, ptr addrspace(1) [[INPUT]], i64 1
-; GCN-NEXT: [[IN1:%.*]] = load half, ptr addrspace(1) [[PTR1]], align 2
 ; GCN-NEXT: [[PTR2:%.*]] = getelementptr half, ptr addrspace(1) [[INPUT]], i64 2
-; GCN-NEXT: [[IN2:%.*]] = load half, ptr addrspace(1) [[PTR2]], align 2
-; GCN-NEXT: [[PTR3:%.*]] = getelementptr half, ptr addrspace(1) [[INPUT]], i64 3
-; GCN-NEXT: [[IN3:%.*]] = load half, ptr addrspace(1) [[PTR3]], align 2
-; GCN-NEXT: [[SCALE0:%.*]] = load half, ptr addrspace(1) [[SCALES]], align 2
-; GCN-NEXT: [[SPTR1:%.*]] = getelementptr half, ptr addrspace(1) [[SCALES]], i64 1
-; GCN-NEXT: [[SCALE1:%.*]] = load half, ptr addrspace(1) [[SPTR1]], align 2
 ; GCN-NEXT: [[SPTR2:%.*]] = getelementptr half, ptr addrspace(1) [[SCALES]], i64 2
-; GCN-NEXT: [[SCALE2:%.*]] = load half, ptr addrspace(1) [[SPTR2]], align 2
-; GCN-NEXT: [[SPTR3:%.*]] = getelementptr half, ptr addrspace(1) [[SCALES]], i64 3
-; GCN-NEXT: [[SCALE3:%.*]] = load half, ptr addrspace(1) [[SPTR3]], align 2
-; GCN-NEXT: [[MUL0:%.*]] = fmul contract half [[IN0]], 0xH3E14
-; GCN-NEXT: [[MUL1:%.*]] = fmul contract half [[IN1]], 0xH3E14
-; GCN-NEXT: [[MUL2:%.*]] = fmul contract half [[IN2]], 0xH3E14
-; GCN-NEXT: [[MUL3:%.*]] = fmul contract half [[IN3]], 0xH3E14
-; GCN-NEXT: [[SUB0:%.*]] = fsub contract half [[MUL0]], [[SCALE0]]
-; GCN-NEXT: [[SUB1:%.*]] = fsub contract half [[MUL1]], [[SCALE1]]
-; GCN-NEXT: [[SUB2:%.*]] = fsub contract half [[MUL2]], [[SCALE2]]
-; GCN-NEXT: [[SUB3:%.*]] = fsub contract half [[MUL3]], [[SCALE3]]
-; GCN-NEXT: [[EXP0:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[SUB0]])
-; GCN-NEXT: [[EXP1:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[SUB1]])
-; GCN-NEXT: [[EXP2:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[SUB2]])
-; GCN-NEXT: [[EXP3:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[SUB3]])
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[INPUT]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[SCALES]], align 2
+; GCN-NEXT: [[TMP2:%.*]] = fmul contract <2 x half> [[TMP0]], splat (half 0xH3E14)
+; GCN-NEXT: [[TMP3:%.*]] = fsub contract <2 x half> [[TMP2]], [[TMP1]]
+; GCN-NEXT: [[TMP4:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR2]], align 2
+; GCN-NEXT: [[TMP5:%.*]] = load <2 x half>, ptr addrspace(1) [[SPTR2]], align 2
+; GCN-NEXT: [[TMP6:%.*]] = fmul contract <2 x half> [[TMP4]], splat (half 0xH3E14)
+; GCN-NEXT: [[TMP7:%.*]] = fsub contract <2 x half> [[TMP6]], [[TMP5]]
+; GCN-NEXT: [[TMP8:%.*]] = extractelement <2 x half> [[TMP3]], i32 0
+; GCN-NEXT: [[EXP0:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP8]])
+; GCN-NEXT: [[TMP9:%.*]] = extractelement <2 x half> [[TMP3]], i32 1
+; GCN-NEXT: [[EXP1:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP9]])
+; GCN-NEXT: [[TMP10:%.*]] = extractelement <2 x half> [[TMP7]], i32 0
+; GCN-NEXT: [[EXP2:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP10]])
+; GCN-NEXT: [[TMP11:%.*]] = extractelement <2 x half> [[TMP7]], i32 1
+; GCN-NEXT: [[EXP3:%.*]] = tail call half @llvm.amdgcn.exp2.f16(half [[TMP11]])
 ; GCN-NEXT: [[SUM01:%.*]] = fadd fast half [[EXP0]], [[EXP1]]
 ; GCN-NEXT: [[SUM23:%.*]] = fadd fast half [[EXP2]], [[EXP3]]
 ; GCN-NEXT: [[SUM:%.*]] = fadd fast half [[SUM01]], [[SUM23]]
@@ -388,35 +379,64 @@ entry:
 }
 
 define amdgpu_kernel void @kernel_div_scale(ptr addrspace(1) %num, ptr addrspace(1) %den, ptr addrspace(1) %output) {
-; GCN-LABEL: define amdgpu_kernel void @kernel_div_scale(
-; GCN-SAME: ptr addrspace(1) [[NUM:%.*]], ptr addrspace(1) [[DEN:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
-; GCN-NEXT: [[ENTRY:.*:]]
-; GCN-NEXT: [[N0:%.*]] = load float, ptr addrspace(1) [[NUM]], align 4
-; GCN-NEXT: [[NPTR1:%.*]] = getelementptr float, ptr addrspace(1) [[NUM]], i64 1
-; GCN-NEXT: [[N1:%.*]] = load float, ptr addrspace(1) [[NPTR1]], align 4
-; GCN-NEXT: [[NPTR2:%.*]] = getelementptr float, ptr addrspace(1) [[NUM]], i64 2
-; GCN-NEXT: [[N2:%.*]] = load float, ptr addrspace(1) [[NPTR2]], align 4
-; GCN-NEXT: [[D0:%.*]] = load float, ptr addrspace(1) [[DEN]], align 4
-; GCN-NEXT: [[DPTR1:%.*]] = getelementptr float, ptr addrspace(1) [[DEN]], i64 1
-; GCN-NEXT: [[D1:%.*]] = load float, ptr addrspace(1) [[DPTR1]], align 4
-; GCN-NEXT: [[DPTR2:%.*]] = getelementptr float, ptr addrspace(1) [[DEN]], i64 2
-; GCN-NEXT: [[D2:%.*]] = load float, ptr addrspace(1) [[DPTR2]], align 4
-; GCN-NEXT: [[MUL_N0:%.*]] = fmul float [[N0]], 2.000000e+00
-; GCN-NEXT: [[MUL_N1:%.*]] = fmul float [[N1]], 2.000000e+00
-; GCN-NEXT: [[MUL_N2:%.*]] = fmul float [[N2]], 2.000000e+00
-; GCN-NEXT: [[MUL_D0:%.*]] = fmul float [[D0]], 4.000000e+00
-; GCN-NEXT: [[MUL_D1:%.*]] = fmul float [[D1]], 4.000000e+00
-; GCN-NEXT: [[MUL_D2:%.*]] = fmul float [[D2]], 4.000000e+00
-; GCN-NEXT: [[DS0:%.*]] = call { float, i1 } @llvm.amdgcn.div.scale.f32(float [[MUL_N0]], float [[MUL_D0]], i1 false)
-; GCN-NEXT: [[DS1:%.*]] = call { float, i1 } @llvm.amdgcn.div.scale.f32(float [[MUL_N1]], float [[MUL_D1]], i1 false)
-; GCN-NEXT: [[DS2:%.*]] = call { float, i1 } @llvm.amdgcn.div.scale.f32(float [[MUL_N2]], float [[MUL_D2]], i1 false)
-; GCN-NEXT: [[R0:%.*]] = extractvalue { float, i1 } [[DS0]], 0
-; GCN-NEXT: [[R1:%.*]] = extractvalue { float, i1 } [[DS1]], 0
-; GCN-NEXT: [[R2:%.*]] = extractvalue { float, i1 } [[DS2]], 0
-; GCN-NEXT: [[SUM01:%.*]] = fadd float [[R0]], [[R1]]
-; GCN-NEXT: [[SUM:%.*]] = fadd float [[SUM01]], [[R2]]
-; GCN-NEXT: store float [[SUM]], ptr addrspace(1) [[OUTPUT]], align 4
-; GCN-NEXT: ret void
+; INSTCNT-LABEL: define amdgpu_kernel void @kernel_div_scale(
+; INSTCNT-SAME: ptr addrspace(1) [[NUM:%.*]], ptr addrspace(1) [[DEN:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
+; INSTCNT-NEXT: [[ENTRY:.*:]]
+; INSTCNT-NEXT: [[NPTR2:%.*]] = getelementptr float, ptr addrspace(1) [[NUM]], i64 2
+; INSTCNT-NEXT: [[N2:%.*]] = load float, ptr addrspace(1) [[NPTR2]], align 4
+; INSTCNT-NEXT: [[DPTR2:%.*]] = getelementptr float, ptr addrspace(1) [[DEN]], i64 2
+; INSTCNT-NEXT: [[D2:%.*]] = load float, ptr addrspace(1) [[DPTR2]], align 4
+; INSTCNT-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr addrspace(1) [[NUM]], align 4
+; INSTCNT-NEXT: [[TMP1:%.*]] = fmul <2 x float> [[TMP0]], splat (float 2.000000e+00)
+; INSTCNT-NEXT: [[MUL_N2:%.*]] = fmul float [[N2]], 2.000000e+00
+; INSTCNT-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr addrspace(1) [[DEN]], align 4
+; INSTCNT-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP2]], splat (float 4.000000e+00)
+; INSTCNT-NEXT: [[MUL_D2:%.*]] = fmul float [[D2]], 4.000000e+00
+; INSTCNT-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
+; INSTCNT-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; INSTCNT-NEXT: [[DS0:%.*]] = call { float, i1 } @llvm.amdgcn.div.scale.f32(float [[TMP4]], float [[TMP5]], i1 false)
+; INSTCNT-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+; INSTCNT-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; INSTCNT-NEXT: [[DS1:%.*]] = call { float, i1 } @llvm.amdgcn.div.scale.f32(float [[TMP6]], float [[TMP7]], i1 false)
+; INSTCNT-NEXT: [[DS2:%.*]] = call { float, i1 } @llvm.amdgcn.div.scale.f32(float [[MUL_N2]], float [[MUL_D2]], i1 false)
+; INSTCNT-NEXT: [[R0:%.*]] = extractvalue { float, i1 } [[DS0]], 0
+; INSTCNT-NEXT: [[R1:%.*]] = extractvalue { float, i1 } [[DS1]], 0
+; INSTCNT-NEXT: [[R2:%.*]] = extractvalue { float, i1 } [[DS2]], 0
+; INSTCNT-NEXT: [[SUM01:%.*]] = fadd float [[R0]], [[R1]]
+; INSTCNT-NEXT: [[SUM:%.*]] = fadd float [[SUM01]], [[R2]]
+; INSTCNT-NEXT: store float [[SUM]], ptr addrspace(1) [[OUTPUT]], align 4
+; INSTCNT-NEXT: ret void
+;
+; NO-INSTCNT-LABEL: define amdgpu_kernel void @kernel_div_scale(
+; NO-INSTCNT-SAME: ptr addrspace(1) [[NUM:%.*]], ptr addrspace(1) [[DEN:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
+; NO-INSTCNT-NEXT: [[ENTRY:.*:]]
+; NO-INSTCNT-NEXT: [[NPTR2:%.*]] = getelementptr float, ptr addrspace(1) [[NUM]], i64 2
+; NO-INSTCNT-NEXT: [[N2:%.*]] = load float, ptr addrspace(1) [[NPTR2]], align 4
+; NO-INSTCNT-NEXT: [[D0:%.*]] = load float, ptr addrspace(1) [[DEN]], align 4
+; NO-INSTCNT-NEXT: [[DPTR1:%.*]] = getelementptr float, ptr addrspace(1) [[DEN]], i64 1
+; NO-INSTCNT-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr addrspace(1) [[NUM]], align 4
+; NO-INSTCNT-NEXT: [[TMP1:%.*]] = fmul <2 x float> [[TMP0]], splat (float 2.000000e+00)
+; NO-INSTCNT-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[N2]], i32 0
+; NO-INSTCNT-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[D0]], i32 1
+; NO-INSTCNT-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[TMP3]], <float 2.000000e+00, float 4.000000e+00>
+; NO-INSTCNT-NEXT: [[TMP5:%.*]] = load <2 x float>, ptr addrspace(1) [[DPTR1]], align 4
+; NO-INSTCNT-NEXT: [[TMP6:%.*]] = fmul <2 x float> [[TMP5]], splat (float 4.000000e+00)
+; NO-INSTCNT-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
+; NO-INSTCNT-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
+; NO-INSTCNT-NEXT: [[DS0:%.*]] = call { float, i1 } @llvm.amdgcn.div.scale.f32(float [[TMP7]], float [[TMP8]], i1 false)
+; NO-INSTCNT-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+; NO-INSTCNT-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP6]], i32 0
+; NO-INSTCNT-NEXT: [[DS1:%.*]] = call { float, i1 } @llvm.amdgcn.div.scale.f32(float [[TMP9]], float [[TMP10]], i1 false)
+; NO-INSTCNT-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
+; NO-INSTCNT-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP6]], i32 1
+; NO-INSTCNT-NEXT: [[DS2:%.*]] = call { float, i1 } @llvm.amdgcn.div.scale.f32(float [[TMP11]], float [[TMP12]], i1 false)
+; NO-INSTCNT-NEXT: [[R0:%.*]] = extractvalue { float, i1 } [[DS0]], 0
+; NO-INSTCNT-NEXT: [[R1:%.*]] = extractvalue { float, i1 } [[DS1]], 0
+; NO-INSTCNT-NEXT: [[R2:%.*]] = extractvalue { float, i1 } [[DS2]], 0
+; NO-INSTCNT-NEXT: [[SUM01:%.*]] = fadd float [[R0]], [[R1]]
+; NO-INSTCNT-NEXT: [[SUM:%.*]] = fadd float [[SUM01]], [[R2]]
+; NO-INSTCNT-NEXT: store float [[SUM]], ptr addrspace(1) [[OUTPUT]], align 4
+; NO-INSTCNT-NEXT: ret void
 ;
 entry:
   %n0 = load float, ptr addrspace(1) %num, align 4
@@ -451,21 +471,18 @@ define amdgpu_kernel void @kernel_fmed3(ptr addrspace(1) %a, ptr addrspace(1) %b
 ; GCN-LABEL: define amdgpu_kernel void @kernel_fmed3(
 ; GCN-SAME: ptr addrspace(1) [[A:%.*]], ptr addrspace(1) [[B:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
 ; GCN-NEXT: [[ENTRY:.*:]]
-; GCN-NEXT: [[A0:%.*]] = load float, ptr addrspace(1) [[A]], align 4
-; GCN-NEXT: [[APTR1:%.*]] = getelementptr float, ptr addrspace(1) [[A]], i64 1
-; GCN-NEXT: [[A1:%.*]] = load float, ptr addrspace(1) [[APTR1]], align 4
 ; GCN-NEXT: [[APTR2:%.*]] = getelementptr float, ptr addrspace(1) [[A]], i64 2
 ; GCN-NEXT: [[A2:%.*]] = load float, ptr addrspace(1) [[APTR2]], align 4
-; GCN-NEXT: [[B0:%.*]] = load float, ptr addrspace(1) [[B]], align 4
-; GCN-NEXT: [[BPTR1:%.*]] = getelementptr float, ptr addrspace(1) [[B]], i64 1
-; GCN-NEXT: [[B1:%.*]] = load float, ptr addrspace(1) [[BPTR1]], align 4
 ; GCN-NEXT: [[BPTR2:%.*]] = getelementptr float, ptr addrspace(1) [[B]], i64 2
 ; GCN-NEXT: [[B2:%.*]] = load float, ptr addrspace(1) [[BPTR2]], align 4
-; GCN-NEXT: [[ADD0:%.*]] = fadd float [[A0]], [[B0]]
-; GCN-NEXT: [[ADD1:%.*]] = fadd float [[A1]], [[B1]]
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr addrspace(1) [[A]], align 4
+; GCN-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(1) [[B]], align 4
+; GCN-NEXT: [[TMP2:%.*]] = fadd <2 x float> [[TMP0]], [[TMP1]]
 ; GCN-NEXT: [[ADD2:%.*]] = fadd float [[A2]], [[B2]]
-; GCN-NEXT: [[MED0:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[ADD0]], float [[ADD0]], float 1.000000e+00)
-; GCN-NEXT: [[MED1:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[ADD1]], float [[ADD1]], float 1.000000e+00)
+; GCN-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
+; GCN-NEXT: [[MED0:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[TMP3]], float [[TMP3]], float 1.000000e+00)
+; GCN-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
+; GCN-NEXT: [[MED1:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[TMP4]], float [[TMP4]], float 1.000000e+00)
 ; GCN-NEXT: [[MED2:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[ADD2]], float [[ADD2]], float 1.000000e+00)
 ; GCN-NEXT: [[SUM01:%.*]] = fadd float [[MED0]], [[MED1]]
 ; GCN-NEXT: [[SUM:%.*]] = fadd float [[SUM01]], [[MED2]]
@@ -500,40 +517,75 @@ entry:
 }
 
 define amdgpu_kernel void @kernel_fmed3_1(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %output) {
-; GCN-LABEL: define amdgpu_kernel void @kernel_fmed3_1(
-; GCN-SAME: ptr addrspace(1) [[A:%.*]], ptr addrspace(1) [[B:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
-; GCN-NEXT: [[ENTRY:.*:]]
-; GCN-NEXT: [[A0:%.*]] = load float, ptr addrspace(1) [[A]], align 4
-; GCN-NEXT: [[APTR1:%.*]] = getelementptr float, ptr addrspace(1) [[A]], i64 1
-; GCN-NEXT: [[A1:%.*]] = load float, ptr addrspace(1) [[APTR1]], align 4
-; GCN-NEXT: [[APTR2:%.*]] = getelementptr float, ptr addrspace(1) [[A]], i64 2
-; GCN-NEXT: [[A2:%.*]] = load float, ptr addrspace(1) [[APTR2]], align 4
-; GCN-NEXT: [[APTR3:%.*]] = getelementptr float, ptr addrspace(1) [[A]], i64 3
-; GCN-NEXT: [[A3:%.*]] = load float, ptr addrspace(1) [[APTR3]], align 4
-; GCN-NEXT: [[B0:%.*]] = load float, ptr addrspace(1) [[B]], align 4
-; GCN-NEXT: [[BPTR1:%.*]] = getelementptr float, ptr addrspace(1) [[B]], i64 1
-; GCN-NEXT: [[B1:%.*]] = load float, ptr addrspace(1) [[BPTR1]], align 4
-; GCN-NEXT: [[BPTR2:%.*]] = getelementptr float, ptr addrspace(1) [[B]], i64 2
-; GCN-NEXT: [[B2:%.*]] = load float, ptr addrspace(1) [[BPTR2]], align 4
-; GCN-NEXT: [[BPTR3:%.*]] = getelementptr float, ptr addrspace(1) [[B]], i64 3
-; GCN-NEXT: [[B3:%.*]] = load float, ptr addrspace(1) [[BPTR3]], align 4
-; GCN-NEXT: [[ADD0:%.*]] = fadd float 5.000000e+00, [[B0]]
-; GCN-NEXT: [[ADD1:%.*]] = fadd float 5.000000e+00, [[B1]]
-; GCN-NEXT: [[ADD2:%.*]] = fadd float 5.000000e+00, [[B2]]
-; GCN-NEXT: [[ADD3:%.*]] = fadd float 5.000000e+00, [[B3]]
-; GCN-NEXT: [[SUB0:%.*]] = fadd float 1.000000e+00, [[B0]]
-; GCN-NEXT: [[SUB1:%.*]] = fadd float 1.000000e+00, [[B1]]
-; GCN-NEXT: [[SUB2:%.*]] = fadd float 1.000000e+00, [[B2]]
-; GCN-NEXT: [[SUB3:%.*]] = fadd float 1.000000e+00, [[B3]]
-; GCN-NEXT: [[MED0:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[ADD0]], float [[SUB0]], float 1.000000e+00)
-; GCN-NEXT: [[MED1:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[ADD1]], float [[SUB1]], float 1.000000e+00)
-; GCN-NEXT: [[MED2:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[ADD2]], float [[SUB2]], float 1.000000e+00)
-; GCN-NEXT: [[MED3:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[ADD3]], float [[SUB3]], float 1.000000e+00)
-; GCN-NEXT: [[SUM01:%.*]] = fadd float [[MED0]], [[MED1]]
-; GCN-NEXT: [[SUM02:%.*]] = fadd float [[MED2]], [[MED3]]
-; GCN-NEXT: [[SUM:%.*]] = fadd float [[SUM01]], [[SUM02]]
-; GCN-NEXT: store float [[SUM]], ptr addrspace(1) [[OUTPUT]], align 4
-; GCN-NEXT: ret void
+; INSTCNT-LABEL: define amdgpu_kernel void @kernel_fmed3_1(
+; INSTCNT-SAME: ptr addrspace(1) [[A:%.*]], ptr addrspace(1) [[B:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
+; INSTCNT-NEXT: [[ENTRY:.*:]]
+; INSTCNT-NEXT: [[A0:%.*]] = load float, ptr addrspace(1) [[A]], align 4
+; INSTCNT-NEXT: [[APTR1:%.*]] = getelementptr float, ptr addrspace(1) [[A]], i64 1
+; INSTCNT-NEXT: [[A1:%.*]] = load float, ptr addrspace(1) [[APTR1]], align 4
+; INSTCNT-NEXT: [[APTR2:%.*]] = getelementptr float, ptr addrspace(1) [[A]], i64 2
+; INSTCNT-NEXT: [[A2:%.*]] = load float, ptr addrspace(1) [[APTR2]], align 4
+; INSTCNT-NEXT: [[APTR3:%.*]] = getelementptr float, ptr addrspace(1) [[A]], i64 3
+; INSTCNT-NEXT: [[A3:%.*]] = load float, ptr addrspace(1) [[APTR3]], align 4
+; INSTCNT-NEXT: [[B0:%.*]] = load float, ptr addrspace(1) [[B]], align 4
+; INSTCNT-NEXT: [[BPTR1:%.*]] = getelementptr float, ptr addrspace(1) [[B]], i64 1
+; INSTCNT-NEXT: [[B1:%.*]] = load float, ptr addrspace(1) [[BPTR1]], align 4
+; INSTCNT-NEXT: [[BPTR2:%.*]] = getelementptr float, ptr addrspace(1) [[B]], i64 2
+; INSTCNT-NEXT: [[B2:%.*]] = load float, ptr addrspace(1) [[BPTR2]], align 4
+; INSTCNT-NEXT: [[BPTR3:%.*]] = getelementptr float, ptr addrspace(1) [[B]], i64 3
+; INSTCNT-NEXT: [[B3:%.*]] = load float, ptr addrspace(1) [[BPTR3]], align 4
+; INSTCNT-NEXT: [[ADD0:%.*]] = fadd float 5.000000e+00, [[B0]]
+; INSTCNT-NEXT: [[ADD1:%.*]] = fadd float 5.000000e+00, [[B1]]
+; INSTCNT-NEXT: [[ADD2:%.*]] = fadd float 5.000000e+00, [[B2]]
+; INSTCNT-NEXT: [[ADD3:%.*]] = fadd float 5.000000e+00, [[B3]]
+; INSTCNT-NEXT: [[SUB0:%.*]] = fadd float 1.000000e+00, [[B0]]
+; INSTCNT-NEXT: [[SUB1:%.*]] = fadd float 1.000000e+00, [[B1]]
+; INSTCNT-NEXT: [[SUB2:%.*]] = fadd float 1.000000e+00, [[B2]]
+; INSTCNT-NEXT: [[SUB3:%.*]] = fadd float 1.000000e+00, [[B3]]
+; INSTCNT-NEXT: [[MED0:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[ADD0]], float [[SUB0]], float 1.000000e+00)
+; INSTCNT-NEXT: [[MED1:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[ADD1]], float [[SUB1]], float 1.000000e+00)
+; INSTCNT-NEXT: [[MED2:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[ADD2]], float [[SUB2]], float 1.000000e+00)
+; INSTCNT-NEXT: [[MED3:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[ADD3]], float [[SUB3]], float 1.000000e+00)
+; INSTCNT-NEXT: [[SUM01:%.*]] = fadd float [[MED0]], [[MED1]]
+; INSTCNT-NEXT: [[SUM02:%.*]] = fadd float [[MED2]], [[MED3]]
+; INSTCNT-NEXT: [[SUM:%.*]] = fadd float [[SUM01]], [[SUM02]]
+; INSTCNT-NEXT: store float [[SUM]], ptr addrspace(1) [[OUTPUT]], align 4
+; INSTCNT-NEXT: ret void
+;
+; NO-INSTCNT-LABEL: define amdgpu_kernel void @kernel_fmed3_1(
+; NO-INSTCNT-SAME: ptr addrspace(1) [[A:%.*]], ptr addrspace(1) [[B:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
+; NO-INSTCNT-NEXT: [[ENTRY:.*:]]
+; NO-INSTCNT-NEXT: [[A0:%.*]] = load float, ptr addrspace(1) [[A]], align 4
+; NO-INSTCNT-NEXT: [[APTR1:%.*]] = getelementptr float, ptr addrspace(1) [[A]], i64 1
+; NO-INSTCNT-NEXT: [[A1:%.*]] = load float, ptr addrspace(1) [[APTR1]], align 4
+; NO-INSTCNT-NEXT: [[APTR2:%.*]] = getelementptr float, ptr addrspace(1) [[A]], i64 2
+; NO-INSTCNT-NEXT: [[A2:%.*]] = load float, ptr addrspace(1) [[APTR2]], align 4
+; NO-INSTCNT-NEXT: [[APTR3:%.*]] = getelementptr float, ptr addrspace(1) [[A]], i64 3
+; NO-INSTCNT-NEXT: [[A3:%.*]] = load float, ptr addrspace(1) [[APTR3]], align 4
+; NO-INSTCNT-NEXT: [[BPTR2:%.*]] = getelementptr float, ptr addrspace(1) [[B]], i64 2
+; NO-INSTCNT-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr addrspace(1) [[B]], align 4
+; NO-INSTCNT-NEXT: [[TMP1:%.*]] = fadd <2 x float> splat (float 5.000000e+00), [[TMP0]]
+; NO-INSTCNT-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr addrspace(1) [[BPTR2]], align 4
+; NO-INSTCNT-NEXT: [[TMP3:%.*]] = fadd <2 x float> splat (float 5.000000e+00), [[TMP2]]
+; NO-INSTCNT-NEXT: [[TMP4:%.*]] = fadd <2 x float> splat (float 1.000000e+00), [[TMP0]]
+; NO-INSTCNT-NEXT: [[TMP5:%.*]] = fadd <2 x float> splat (float 1.000000e+00), [[TMP2]]
+; NO-INSTCNT-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
+; NO-INSTCNT-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
+; NO-INSTCNT-NEXT: [[MED0:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[TMP6]], float [[TMP7]], float 1.000000e+00)
+; NO-INSTCNT-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+; NO-INSTCNT-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
+; NO-INSTCNT-NEXT: [[MED1:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[TMP8]], float [[TMP9]], float 1.000000e+00)
+; NO-INSTCNT-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; NO-INSTCNT-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP5]], i32 0
+; NO-INSTCNT-NEXT: [[MED2:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[TMP10]], float [[TMP11]], float 1.000000e+00)
+; NO-INSTCNT-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; NO-INSTCNT-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP5]], i32 1
+; NO-INSTCNT-NEXT: [[MED3:%.*]] = call float @llvm.amdgcn.fmed3.f32(float [[TMP12]], float [[TMP13]], float 1.000000e+00)
+; NO-INSTCNT-NEXT: [[SUM01:%.*]] = fadd float [[MED0]], [[MED1]]
+; NO-INSTCNT-NEXT: [[SUM02:%.*]] = fadd float [[MED2]], [[MED3]]
+; NO-INSTCNT-NEXT: [[SUM:%.*]] = fadd float [[SUM01]], [[SUM02]]
+; NO-INSTCNT-NEXT: store float [[SUM]], ptr addrspace(1) [[OUTPUT]], align 4
+; NO-INSTCNT-NEXT: ret void
 ;
 entry:
   %a0 = load float, ptr addrspace(1) %a, align 4
@@ -680,44 +732,81 @@ entry:
 }
 
 define amdgpu_kernel void @kernel_alternate(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %output) {
-; GCN-LABEL: define amdgpu_kernel void @kernel_alternate(
-; GCN-SAME: ptr addrspace(1) [[A:%.*]], ptr addrspace(1) [[B:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
-; GCN-NEXT: [[ENTRY:.*:]]
-; GCN-NEXT: [[A0:%.*]] = load float, ptr addrspace(1) [[A]], align 4
-; GCN-NEXT: [[APTR1:%.*]] = getelementptr float, ptr addrspace(1) [[A]], i64 1
-; GCN-NEXT: [[A1:%.*]] = load float, ptr addrspace(1) [[APTR1]], align 4
-; GCN-NEXT: [[APTR2:%.*]] = getelementptr float, ptr addrspace(1) [[A]], i64 2
-; GCN-NEXT: [[A2:%.*]] = load float, ptr addrspace(1) [[APTR2]], align 4
-; GCN-NEXT: [[B0:%.*]] = load float, ptr addrspace(1) [[B]], align 4
-; GCN-NEXT: [[BPTR1:%.*]] = getelementptr float, ptr addrspace(1) [[B]], i64 1
-; GCN-NEXT: [[B1:%.*]] = load float, ptr addrspace(1) [[BPTR1]], align 4
-; GCN-NEXT: [[BPTR2:%.*]] = getelementptr float, ptr addrspace(1) [[B]], i64 2
-; GCN-NEXT: [[B2:%.*]] = load float, ptr addrspace(1) [[BPTR2]], align 4
-; GCN-NEXT: [[ADD0:%.*]] = fadd float [[A0]], [[B0]]
-; GCN-NEXT: [[SUB0:%.*]] = fsub float [[A0]], [[B0]]
-; GCN-NEXT: [[ADD1:%.*]] = fadd float [[A1]], [[B1]]
-; GCN-NEXT: [[SUB1:%.*]] = fsub float [[A1]], [[B1]]
-; GCN-NEXT: [[ADD2:%.*]] = fadd float [[A2]], [[B2]]
-; GCN-NEXT: [[SUB2:%.*]] = fsub float [[A2]], [[B2]]
-; GCN-NEXT: [[E0:%.*]] = call float @llvm.amdgcn.exp2.f32(float [[ADD0]])
-; GCN-NEXT: [[E1:%.*]] = call float @llvm.amdgcn.exp2.f32(float [[SUB0]])
-; GCN-NEXT: [[E2:%.*]] = call float @llvm.amdgcn.log.f32(float [[ADD1]])
-; GCN-NEXT: [[E3:%.*]] = call float @llvm.amdgcn.log.f32(float [[SUB1]])
-; GCN-NEXT: [[E4:%.*]] = call float @llvm.amdgcn.exp2.f32(float [[ADD2]])
-; GCN-NEXT: [[E5:%.*]] = call float @llvm.amdgcn.exp2.f32(float [[SUB2]])
-; GCN-NEXT: [[OPTR0:%.*]] = getelementptr float, ptr addrspace(1) [[OUTPUT]], i64 0
-; GCN-NEXT: [[OPTR1:%.*]] = getelementptr float, ptr addrspace(1) [[OUTPUT]], i64 1
-; GCN-NEXT: [[OPTR2:%.*]] = getelementptr float, ptr addrspace(1) [[OUTPUT]], i64 2
-; GCN-NEXT: [[OPTR3:%.*]] = getelementptr float, ptr addrspace(1) [[OUTPUT]], i64 3
-; GCN-NEXT: [[OPTR4:%.*]] = getelementptr float, ptr addrspace(1) [[OUTPUT]], i64 4
-; GCN-NEXT: [[OPTR5:%.*]] = getelementptr float, ptr addrspace(1) [[OUTPUT]], i64 5
-; GCN-NEXT: store float [[E0]], ptr addrspace(1) [[OPTR0]], align 4
-; GCN-NEXT: store float [[E1]], ptr addrspace(1) [[OPTR1]], align 4
-; GCN-NEXT: store float [[E2]], ptr addrspace(1) [[OPTR2]], align 4
-; GCN-NEXT: store float [[E3]], ptr addrspace(1) [[OPTR3]], align 4
-; GCN-NEXT: store float [[E4]], ptr addrspace(1) [[OPTR4]], align 4
-; GCN-NEXT: store float [[E5]], ptr addrspace(1) [[OPTR5]], align 4
-; GCN-NEXT: ret void
+; INSTCNT-LABEL: define amdgpu_kernel void @kernel_alternate(
+; INSTCNT-SAME: ptr addrspace(1) [[A:%.*]], ptr addrspace(1) [[B:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
+; INSTCNT-NEXT: [[ENTRY:.*:]]
+; INSTCNT-NEXT: [[A0:%.*]] = load float, ptr addrspace(1) [[A]], align 4
+; INSTCNT-NEXT: [[APTR1:%.*]] = getelementptr float, ptr addrspace(1) [[A]], i64 1
+; INSTCNT-NEXT: [[A1:%.*]] = load float, ptr addrspace(1) [[APTR1]], align 4
+; INSTCNT-NEXT: [[APTR2:%.*]] = getelementptr float, ptr addrspace(1) [[A]], i64 2
+; INSTCNT-NEXT: [[A2:%.*]] = load float, ptr addrspace(1) [[APTR2]], align 4
+; INSTCNT-NEXT: [[B0:%.*]] = load float, ptr addrspace(1) [[B]], align 4
+; INSTCNT-NEXT: [[BPTR1:%.*]] = getelementptr float, ptr addrspace(1) [[B]], i64 1
+; INSTCNT-NEXT: [[B1:%.*]] = load float, ptr addrspace(1) [[BPTR1]], align 4
+; INSTCNT-NEXT: [[BPTR2:%.*]] = getelementptr float, ptr addrspace(1) [[B]], i64 2
+; INSTCNT-NEXT: [[B2:%.*]] = load float, ptr addrspace(1) [[BPTR2]], align 4
+; INSTCNT-NEXT: [[ADD0:%.*]] = fadd float [[A0]], [[B0]]
+; INSTCNT-NEXT: [[SUB0:%.*]] = fsub float [[A0]], [[B0]]
+; INSTCNT-NEXT: [[ADD1:%.*]] = fadd float [[A1]], [[B1]]
+; INSTCNT-NEXT: [[SUB1:%.*]] = fsub float [[A1]], [[B1]]
+; INSTCNT-NEXT: [[ADD2:%.*]] = fadd float [[A2]], [[B2]]
+; INSTCNT-NEXT: [[SUB2:%.*]] = fsub float [[A2]], [[B2]]
+; INSTCNT-NEXT: [[E0:%.*]] = call float @llvm.amdgcn.exp2.f32(float [[ADD0]])
+; INSTCNT-NEXT: [[E1:%.*]] = call float @llvm.amdgcn.exp2.f32(float [[SUB0]])
+; INSTCNT-NEXT: [[E2:%.*]] = call float @llvm.amdgcn.log.f32(float [[ADD1]])
+; INSTCNT-NEXT: [[E3:%.*]] = call float @llvm.amdgcn.log.f32(float [[SUB1]])
+; INSTCNT-NEXT: [[E4:%.*]] = call float @llvm.amdgcn.exp2.f32(float [[ADD2]])
+; INSTCNT-NEXT: [[E5:%.*]] = call float @llvm.amdgcn.exp2.f32(float [[SUB2]])
+; INSTCNT-NEXT: [[OPTR0:%.*]] = getelementptr float, ptr addrspace(1) [[OUTPUT]], i64 0
+; INSTCNT-NEXT: [[OPTR1:%.*]] = getelementptr float, ptr addrspace(1) [[OUTPUT]], i64 1
+; INSTCNT-NEXT: [[OPTR2:%.*]] = getelementptr float, ptr addrspace(1) [[OUTPUT]], i64 2
+; INSTCNT-NEXT: [[OPTR3:%.*]] = getelementptr float, ptr addrspace(1) [[OUTPUT]], i64 3
+; INSTCNT-NEXT: [[OPTR4:%.*]] = getelementptr float, ptr addrspace(1) [[OUTPUT]], i64 4
+; INSTCNT-NEXT: [[OPTR5:%.*]] = getelementptr float, ptr addrspace(1) [[OUTPUT]], i64 5
+; INSTCNT-NEXT: store float [[E0]], ptr addrspace(1) [[OPTR0]], align 4
+; INSTCNT-NEXT: store float [[E1]], ptr addrspace(1) [[OPTR1]], align 4
+; INSTCNT-NEXT: store float [[E2]], ptr addrspace(1) [[OPTR2]], align 4
+; INSTCNT-NEXT: store float [[E3]], ptr addrspace(1) [[OPTR3]], align 4
+; INSTCNT-NEXT: store float [[E4]], ptr addrspace(1) [[OPTR4]], align 4
+; INSTCNT-NEXT: store float [[E5]], ptr addrspace(1) [[OPTR5]], align 4
+; INSTCNT-NEXT: ret void
+;
+; NO-INSTCNT-LABEL: define amdgpu_kernel void @kernel_alternate(
+; NO-INSTCNT-SAME: ptr addrspace(1) [[A:%.*]], ptr addrspace(1) [[B:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
+; NO-INSTCNT-NEXT: [[ENTRY:.*:]]
+; NO-INSTCNT-NEXT: [[APTR2:%.*]] = getelementptr float, ptr addrspace(1) [[A]], i64 2
+; NO-INSTCNT-NEXT: [[A2:%.*]] = load float, ptr addrspace(1) [[APTR2]], align 4
+; NO-INSTCNT-NEXT: [[BPTR2:%.*]] = getelementptr float, ptr addrspace(1) [[B]], i64 2
+; NO-INSTCNT-NEXT: [[B2:%.*]] = load float, ptr addrspace(1) [[BPTR2]], align 4
+; NO-INSTCNT-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr addrspace(1) [[A]], align 4
+; NO-INSTCNT-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(1) [[B]], align 4
+; NO-INSTCNT-NEXT: [[TMP2:%.*]] = fadd <2 x float> [[TMP0]], [[TMP1]]
+; NO-INSTCNT-NEXT: [[TMP3:%.*]] = fsub <2 x float> [[TMP0]], [[TMP1]]
+; NO-INSTCNT-NEXT: [[ADD2:%.*]] = fadd float [[A2]], [[B2]]
+; NO-INSTCNT-NEXT: [[SUB2:%.*]] = fsub float [[A2]], [[B2]]
+; NO-INSTCNT-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
+; NO-INSTCNT-NEXT: [[E0:%.*]] = call float @llvm.amdgcn.exp2.f32(float [[TMP4]])
+; NO-INSTCNT-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; NO-INSTCNT-NEXT: [[E1:%.*]] = call float @llvm.amdgcn.exp2.f32(float [[TMP5]])
+; NO-INSTCNT-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
+; NO-INSTCNT-NEXT: [[E2:%.*]] = call float @llvm.amdgcn.log.f32(float [[TMP6]])
+; NO-INSTCNT-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; NO-INSTCNT-NEXT: [[E3:%.*]] = call float @llvm.amdgcn.log.f32(float [[TMP7]])
+; NO-INSTCNT-NEXT: [[E4:%.*]] = call float @llvm.amdgcn.exp2.f32(float [[ADD2]])
+; NO-INSTCNT-NEXT: [[E5:%.*]] = call float @llvm.amdgcn.exp2.f32(float [[SUB2]])
+; NO-INSTCNT-NEXT: [[OPTR0:%.*]] = getelementptr float, ptr addrspace(1) [[OUTPUT]], i64 0
+; NO-INSTCNT-NEXT: [[OPTR1:%.*]] = getelementptr float, ptr addrspace(1) [[OUTPUT]], i64 1
+; NO-INSTCNT-NEXT: [[OPTR2:%.*]] = getelementptr float, ptr addrspace(1) [[OUTPUT]], i64 2
+; NO-INSTCNT-NEXT: [[OPTR3:%.*]] = getelementptr float, ptr addrspace(1) [[OUTPUT]], i64 3
+; NO-INSTCNT-NEXT: [[OPTR4:%.*]] = getelementptr float, ptr addrspace(1) [[OUTPUT]], i64 4
+; NO-INSTCNT-NEXT: [[OPTR5:%.*]] = getelementptr float, ptr addrspace(1) [[OUTPUT]], i64 5
+; NO-INSTCNT-NEXT: store float [[E0]], ptr addrspace(1) [[OPTR0]], align 4
+; NO-INSTCNT-NEXT: store float [[E1]], ptr addrspace(1) [[OPTR1]], align 4
+; NO-INSTCNT-NEXT: store float [[E2]], ptr addrspace(1) [[OPTR2]], align 4
+; NO-INSTCNT-NEXT: store float [[E3]], ptr addrspace(1) [[OPTR3]], align 4
+; NO-INSTCNT-NEXT: store float [[E4]], ptr addrspace(1) [[OPTR4]], align 4
+; NO-INSTCNT-NEXT: store float [[E5]], ptr addrspace(1) [[OPTR5]], align 4
+; NO-INSTCNT-NEXT: ret void
 ;
 entry:
   %a0 = load float, ptr addrspace(1) %a, align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
index e13dfce8c29f3..016726e5ae371 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
@@ -177,11 +177,10 @@ define ptr @test4() {
 ; NONPOWEROF2-NEXT: [[TMP8:%.*]] = phi <6 x float> [ poison, [[TMP6:%.*]] ], [ [[TMP5]], [[TMP0:%.*]] ]
 ; NONPOWEROF2-NEXT: br label [[TMP9:%.*]]
 ; NONPOWEROF2: 10:
-; NONPOWEROF2-NEXT: [[TMP10:%.*]] = shufflevector <6 x float> [[TMP8]], <6 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
-; NONPOWEROF2-NEXT: [[TMP11:%.*]] = fmul <3 x float> zeroinitializer, [[TMP10]]
-; NONPOWEROF2-NEXT: [[TMP12:%.*]] = shufflevector <6 x float> [[TMP8]], <6 x float> poison, <3 x i32> <i32 3, i32 4, i32 5>
-; NONPOWEROF2-NEXT: [[TMP13:%.*]] = fmul <3 x float> zeroinitializer, [[TMP12]]
+; NONPOWEROF2-NEXT: [[TMP12:%.*]] = fmul <6 x float> zeroinitializer, [[TMP8]]
+; NONPOWEROF2-NEXT: [[TMP11:%.*]] = shufflevector <6 x float> [[TMP12]], <6 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
 ; NONPOWEROF2-NEXT: [[TMP14:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP11]])
+; NONPOWEROF2-NEXT: [[TMP13:%.*]] = shufflevector <6 x float> [[TMP12]], <6 x float> poison, <3 x i32> <i32 3, i32 4, i32 5>
 ; NONPOWEROF2-NEXT: [[TMP15:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP13]])
 ; NONPOWEROF2-NEXT: [[TMP16:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP14]])
 ; NONPOWEROF2-NEXT: [[TMP17:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP15]])
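
Note: for readers following the patch, here is a minimal standalone sketch of the pattern the new code path targets, distilled from the @test_amdgcn_exp_log checks above. The kernel name @exp2_pair and its body are illustrative only and are not part of the patch:

  ; RUN: opt -passes=slp-vectorizer -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 %s
  define amdgpu_kernel void @exp2_pair(ptr addrspace(1) %in, ptr addrspace(1) %out) {
  entry:
    %p1 = getelementptr float, ptr addrspace(1) %in, i64 1
    %a0 = load float, ptr addrspace(1) %in, align 4
    %a1 = load float, ptr addrspace(1) %p1, align 4
    ; @llvm.amdgcn.exp2.f32 is not trivially vectorizable, so the two calls
    ; below stay scalar; the patch lets SLP vectorize the chain that feeds
    ; their operands instead.
    %s0 = fsub float %a0, 1.000000e+00
    %s1 = fsub float %a1, 1.000000e+00
    %e0 = tail call float @llvm.amdgcn.exp2.f32(float %s0)
    %e1 = tail call float @llvm.amdgcn.exp2.f32(float %s1)
    %sum = fadd float %e0, %e1
    store float %sum, ptr addrspace(1) %out, align 4
    ret void
  }
  declare float @llvm.amdgcn.exp2.f32(float)

With the patch applied, the two loads and the two fsubs should collapse into <2 x float> operations followed by two extractelements feeding the still-scalar exp2 calls, matching the updated GCN check lines in the test above.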