From 13b1b19a46296b87427ad9a2484661c23c84a14b Mon Sep 17 00:00:00 2001 From: xla authors Date: Sun, 19 Jan 2025 12:53:09 -0800 Subject: [PATCH] Integrate LLVM at llvm/llvm-project@13c761789753 Updates LLVM usage to match [13c761789753](https://github.com/llvm/llvm-project/commit/13c761789753) PiperOrigin-RevId: 717293402 --- third_party/llvm/generated.patch | 1337 ++++++++++++-- third_party/llvm/workspace.bzl | 4 +- third_party/shardy/temporary.patch | 1590 ++++++++++++++--- third_party/shardy/workspace.bzl | 4 +- third_party/stablehlo/temporary.patch | 12 + .../triton/llvm_integration/cl717293402.patch | 127 ++ .../triton/llvm_integration/series.bzl | 1 + .../tsl/third_party/llvm/generated.patch | 1337 ++++++++++++-- .../tsl/third_party/llvm/workspace.bzl | 4 +- .../transforms/vectorize_loads_stores.cc | 3 +- 10 files changed, 3800 insertions(+), 619 deletions(-) create mode 100644 third_party/triton/llvm_integration/cl717293402.patch diff --git a/third_party/llvm/generated.patch b/third_party/llvm/generated.patch index 3d2a2525c37a9..8b54ffba772b7 100644 --- a/third_party/llvm/generated.patch +++ b/third_party/llvm/generated.patch @@ -1,207 +1,1156 @@ Auto generated patch. Do not edit or delete it, even if empty. -diff -ruN --strip-trailing-cr a/mlir/include/mlir/IR/TypeRange.h b/mlir/include/mlir/IR/TypeRange.h ---- a/mlir/include/mlir/IR/TypeRange.h -+++ b/mlir/include/mlir/IR/TypeRange.h -@@ -29,12 +29,11 @@ - /// a SmallVector/std::vector. This class should be used in places that are not - /// suitable for a more derived type (e.g. ArrayRef) or a template range - /// parameter. --class TypeRange -- : public llvm::detail::indexed_accessor_range_base< -- TypeRange, -- llvm::PointerUnion, -- Type, Type, Type> { -+class TypeRange : public llvm::detail::indexed_accessor_range_base< -+ TypeRange, -+ llvm::PointerUnion, -+ Type, Type, Type> { - public: - using RangeBaseT::RangeBaseT; - TypeRange(ArrayRef types = std::nullopt); -@@ -45,11 +44,8 @@ - TypeRange(ValueTypeRange values) - : TypeRange(ValueRange(ValueRangeT(values.begin().getCurrent(), - values.end().getCurrent()))) {} -- -- TypeRange(Type type) : TypeRange(type, /*count=*/1) {} -- template , Arg> && -- !std::is_constructible_v>> -+ template , Arg>::value>> - TypeRange(Arg &&arg) : TypeRange(ArrayRef(std::forward(arg))) {} - TypeRange(std::initializer_list types) - : TypeRange(ArrayRef(types)) {} -@@ -60,9 +56,8 @@ - /// * A pointer to the first element of an array of types. - /// * A pointer to the first element of an array of operands. - /// * A pointer to the first element of an array of results. -- /// * A single 'Type' instance. - using OwnerT = llvm::PointerUnion; -+ detail::OpResultImpl *>; - - /// See `llvm::detail::indexed_accessor_range_base` for details. - static OwnerT offset_base(OwnerT object, ptrdiff_t index); -diff -ruN --strip-trailing-cr a/mlir/include/mlir/IR/ValueRange.h b/mlir/include/mlir/IR/ValueRange.h ---- a/mlir/include/mlir/IR/ValueRange.h -+++ b/mlir/include/mlir/IR/ValueRange.h -@@ -374,16 +374,16 @@ - /// SmallVector/std::vector. This class should be used in places that are not - /// suitable for a more derived type (e.g. ArrayRef) or a template range - /// parameter. --class ValueRange final : public llvm::detail::indexed_accessor_range_base< -- ValueRange, -- PointerUnion, -- Value, Value, Value> { -+class ValueRange final -+ : public llvm::detail::indexed_accessor_range_base< -+ ValueRange, -+ PointerUnion, -+ Value, Value, Value> { - public: - /// The type representing the owner of a ValueRange. 
This is either a list of -- /// values, operands, or results or a single value. -+ /// values, operands, or results. - using OwnerT = -- PointerUnion; -+ PointerUnion; - - using RangeBaseT::RangeBaseT; - -@@ -392,7 +392,7 @@ - std::is_constructible, Arg>::value && - !std::is_convertible::value>> - ValueRange(Arg &&arg) : ValueRange(ArrayRef(std::forward(arg))) {} -- ValueRange(Value value) : ValueRange(value, /*count=*/1) {} -+ ValueRange(const Value &value) : ValueRange(&value, /*count=*/1) {} - ValueRange(const std::initializer_list &values) - : ValueRange(ArrayRef(values)) {} - ValueRange(iterator_range values) -diff -ruN --strip-trailing-cr a/mlir/lib/IR/OperationSupport.cpp b/mlir/lib/IR/OperationSupport.cpp ---- a/mlir/lib/IR/OperationSupport.cpp -+++ b/mlir/lib/IR/OperationSupport.cpp -@@ -653,15 +653,6 @@ - /// See `llvm::detail::indexed_accessor_range_base` for details. - ValueRange::OwnerT ValueRange::offset_base(const OwnerT &owner, - ptrdiff_t index) { -- if (llvm::isa_and_nonnull(owner)) { -- // Prevent out-of-bounds indexing for single values. -- // Note that we do allow an index of 1 as is required by 'slice'ing that -- // returns an empty range. This also matches the usual rules of C++ of being -- // allowed to index past the last element of an array. -- assert(index <= 1 && "out-of-bound offset into single-value 'ValueRange'"); -- // Return nullptr to quickly cause segmentation faults on misuse. -- return index == 0 ? owner : nullptr; -- } - if (const auto *value = llvm::dyn_cast_if_present(owner)) - return {value + index}; - if (auto *operand = llvm::dyn_cast_if_present(owner)) -@@ -670,10 +661,6 @@ +diff -ruN --strip-trailing-cr a/flang/include/flang/Optimizer/Builder/HLFIRTools.h b/flang/include/flang/Optimizer/Builder/HLFIRTools.h +--- a/flang/include/flang/Optimizer/Builder/HLFIRTools.h ++++ b/flang/include/flang/Optimizer/Builder/HLFIRTools.h +@@ -513,12 +513,6 @@ + Entity loadElementAt(mlir::Location loc, fir::FirOpBuilder &builder, + Entity entity, mlir::ValueRange oneBasedIndices); + +-/// Return a vector of extents for the given entity. +-/// The function creates new operations, but tries to clean-up +-/// after itself. +-llvm::SmallVector +-genExtentsVector(mlir::Location loc, fir::FirOpBuilder &builder, Entity entity); +- + } // namespace hlfir + + #endif // FORTRAN_OPTIMIZER_BUILDER_HLFIRTOOLS_H +diff -ruN --strip-trailing-cr a/flang/lib/Optimizer/Builder/HLFIRTools.cpp b/flang/lib/Optimizer/Builder/HLFIRTools.cpp +--- a/flang/lib/Optimizer/Builder/HLFIRTools.cpp ++++ b/flang/lib/Optimizer/Builder/HLFIRTools.cpp +@@ -1421,15 +1421,3 @@ + return loadTrivialScalar(loc, builder, + getElementAt(loc, builder, entity, oneBasedIndices)); } - /// See `llvm::detail::indexed_accessor_range_base` for details. 
- Value ValueRange::dereference_iterator(const OwnerT &owner, ptrdiff_t index) { -- if (auto value = llvm::dyn_cast_if_present(owner)) { -- assert(index == 0 && "cannot offset into single-value 'ValueRange'"); -- return value; +- +-llvm::SmallVector +-hlfir::genExtentsVector(mlir::Location loc, fir::FirOpBuilder &builder, +- hlfir::Entity entity) { +- entity = hlfir::derefPointersAndAllocatables(loc, builder, entity); +- mlir::Value shape = hlfir::genShape(loc, builder, entity); +- llvm::SmallVector extents = +- hlfir::getExplicitExtentsFromShape(shape, builder); +- if (shape.getUses().empty()) +- shape.getDefiningOp()->erase(); +- return extents; +-} +diff -ruN --strip-trailing-cr a/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp b/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp +--- a/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp ++++ b/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp +@@ -37,79 +37,6 @@ + + namespace { + +-// Helper class to generate operations related to computing +-// product of values. +-class ProductFactory { +-public: +- ProductFactory(mlir::Location loc, fir::FirOpBuilder &builder) +- : loc(loc), builder(builder) {} +- +- // Generate an update of the inner product value: +- // acc += v1 * v2, OR +- // acc += CONJ(v1) * v2, OR +- // acc ||= v1 && v2 +- // +- // CONJ parameter specifies whether the first complex product argument +- // needs to be conjugated. +- template +- mlir::Value genAccumulateProduct(mlir::Value acc, mlir::Value v1, +- mlir::Value v2) { +- mlir::Type resultType = acc.getType(); +- acc = castToProductType(acc, resultType); +- v1 = castToProductType(v1, resultType); +- v2 = castToProductType(v2, resultType); +- mlir::Value result; +- if (mlir::isa(resultType)) { +- result = builder.create( +- loc, acc, builder.create(loc, v1, v2)); +- } else if (mlir::isa(resultType)) { +- if constexpr (CONJ) +- result = fir::IntrinsicLibrary{builder, loc}.genConjg(resultType, v1); +- else +- result = v1; +- +- result = builder.create( +- loc, acc, builder.create(loc, result, v2)); +- } else if (mlir::isa(resultType)) { +- result = builder.create( +- loc, acc, builder.create(loc, v1, v2)); +- } else if (mlir::isa(resultType)) { +- result = builder.create( +- loc, acc, builder.create(loc, v1, v2)); +- } else { +- llvm_unreachable("unsupported type"); +- } +- +- return builder.createConvert(loc, resultType, result); - } - if (const auto *value = llvm::dyn_cast_if_present(owner)) - return value[index]; - if (auto *operand = llvm::dyn_cast_if_present(owner)) -diff -ruN --strip-trailing-cr a/mlir/lib/IR/TypeRange.cpp b/mlir/lib/IR/TypeRange.cpp ---- a/mlir/lib/IR/TypeRange.cpp -+++ b/mlir/lib/IR/TypeRange.cpp -@@ -31,23 +31,12 @@ - this->base = result; - else if (auto *operand = llvm::dyn_cast_if_present(owner)) - this->base = operand; -- else if (auto value = llvm::dyn_cast_if_present(owner)) -- this->base = value.getType(); - else - this->base = cast(owner); - } +- +-private: +- mlir::Location loc; +- fir::FirOpBuilder &builder; +- +- mlir::Value castToProductType(mlir::Value value, mlir::Type type) { +- if (mlir::isa(type)) +- return builder.createConvert(loc, builder.getIntegerType(1), value); +- +- // TODO: the multiplications/additions by/of zero resulting from +- // complex * real are optimized by LLVM under -fno-signed-zeros +- // -fno-honor-nans. 
+- // We can make them disappear by default if we: +- // * either expand the complex multiplication into real +- // operations, OR +- // * set nnan nsz fast-math flags to the complex operations. +- if (fir::isa_complex(type) && !fir::isa_complex(value.getType())) { +- mlir::Value zeroCmplx = fir::factory::createZeroValue(builder, loc, type); +- fir::factory::Complex helper(builder, loc); +- mlir::Type partType = helper.getComplexPartType(type); +- return helper.insertComplexPart(zeroCmplx, +- castToProductType(value, partType), +- /*isImagPart=*/false); +- } +- return builder.createConvert(loc, type, value); +- } +-}; +- + class TransposeAsElementalConversion + : public mlir::OpRewritePattern { + public: +@@ -163,8 +90,11 @@ + static mlir::Value genResultShape(mlir::Location loc, + fir::FirOpBuilder &builder, + hlfir::Entity array) { +- llvm::SmallVector inExtents = +- hlfir::genExtentsVector(loc, builder, array); ++ mlir::Value inShape = hlfir::genShape(loc, builder, array); ++ llvm::SmallVector inExtents = ++ hlfir::getExplicitExtentsFromShape(inShape, builder); ++ if (inShape.getUses().empty()) ++ inShape.getDefiningOp()->erase(); + + // transpose indices + assert(inExtents.size() == 2 && "checked in TransposeOp::validate"); +@@ -207,7 +137,7 @@ + mlir::Value resultShape, dimExtent; + llvm::SmallVector arrayExtents; + if (isTotalReduction) +- arrayExtents = hlfir::genExtentsVector(loc, builder, array); ++ arrayExtents = genArrayExtents(loc, builder, array); + else + std::tie(resultShape, dimExtent) = + genResultShapeForPartialReduction(loc, builder, array, dimVal); +@@ -233,8 +163,7 @@ + // If DIM is not present, do total reduction. + + // Initial value for the reduction. +- mlir::Value reductionInitValue = +- fir::factory::createZeroValue(builder, loc, elementType); ++ mlir::Value reductionInitValue = genInitValue(loc, builder, elementType); + + // The reduction loop may be unordered if FastMathFlags::reassoc + // transformations are allowed. The integer reduction is always +@@ -335,6 +264,17 @@ + } + + private: ++ static llvm::SmallVector ++ genArrayExtents(mlir::Location loc, fir::FirOpBuilder &builder, ++ hlfir::Entity array) { ++ mlir::Value inShape = hlfir::genShape(loc, builder, array); ++ llvm::SmallVector inExtents = ++ hlfir::getExplicitExtentsFromShape(inShape, builder); ++ if (inShape.getUses().empty()) ++ inShape.getDefiningOp()->erase(); ++ return inExtents; ++ } ++ + // Return fir.shape specifying the shape of the result + // of a SUM reduction with DIM=dimVal. The second return value + // is the extent of the DIM dimension. +@@ -343,7 +283,7 @@ + fir::FirOpBuilder &builder, + hlfir::Entity array, int64_t dimVal) { + llvm::SmallVector inExtents = +- hlfir::genExtentsVector(loc, builder, array); ++ genArrayExtents(loc, builder, array); + assert(dimVal > 0 && dimVal <= static_cast(inExtents.size()) && + "DIM must be present and a positive constant not exceeding " + "the array's rank"); +@@ -353,6 +293,26 @@ + return {builder.create(loc, inExtents), dimExtent}; + } + ++ // Generate the initial value for a SUM reduction with the given ++ // data type. 
++ static mlir::Value genInitValue(mlir::Location loc, ++ fir::FirOpBuilder &builder, ++ mlir::Type elementType) { ++ if (auto ty = mlir::dyn_cast(elementType)) { ++ const llvm::fltSemantics &sem = ty.getFloatSemantics(); ++ return builder.createRealConstant(loc, elementType, ++ llvm::APFloat::getZero(sem)); ++ } else if (auto ty = mlir::dyn_cast(elementType)) { ++ mlir::Value initValue = genInitValue(loc, builder, ty.getElementType()); ++ return fir::factory::Complex{builder, loc}.createComplex(ty, initValue, ++ initValue); ++ } else if (mlir::isa(elementType)) { ++ return builder.createIntegerConstant(loc, elementType, 0); ++ } ++ ++ llvm_unreachable("unsupported SUM reduction type"); ++ } ++ + // Generate scalar addition of the two values (of the same data type). + static mlir::Value genScalarAdd(mlir::Location loc, + fir::FirOpBuilder &builder, +@@ -610,10 +570,16 @@ + static std::tuple + genResultShape(mlir::Location loc, fir::FirOpBuilder &builder, + hlfir::Entity input1, hlfir::Entity input2) { +- llvm::SmallVector input1Extents = +- hlfir::genExtentsVector(loc, builder, input1); +- llvm::SmallVector input2Extents = +- hlfir::genExtentsVector(loc, builder, input2); ++ mlir::Value input1Shape = hlfir::genShape(loc, builder, input1); ++ llvm::SmallVector input1Extents = ++ hlfir::getExplicitExtentsFromShape(input1Shape, builder); ++ if (input1Shape.getUses().empty()) ++ input1Shape.getDefiningOp()->erase(); ++ mlir::Value input2Shape = hlfir::genShape(loc, builder, input2); ++ llvm::SmallVector input2Extents = ++ hlfir::getExplicitExtentsFromShape(input2Shape, builder); ++ if (input2Shape.getUses().empty()) ++ input2Shape.getDefiningOp()->erase(); - /// See `llvm::detail::indexed_accessor_range_base` for details. - TypeRange::OwnerT TypeRange::offset_base(OwnerT object, ptrdiff_t index) { -- if (llvm::isa_and_nonnull(object)) { -- // Prevent out-of-bounds indexing for single values. -- // Note that we do allow an index of 1 as is required by 'slice'ing that -- // returns an empty range. This also matches the usual rules of C++ of being -- // allowed to index past the last element of an array. -- assert(index <= 1 && "out-of-bound offset into single-value 'ValueRange'"); -- // Return nullptr to quickly cause segmentation faults on misuse. -- return index == 0 ? object : nullptr; + llvm::SmallVector newExtents; + mlir::Value innerProduct1Extent, innerProduct2Extent; +@@ -661,6 +627,60 @@ + innerProductExtent[0]}; + } + ++ static mlir::Value castToProductType(mlir::Location loc, ++ fir::FirOpBuilder &builder, ++ mlir::Value value, mlir::Type type) { ++ if (mlir::isa(type)) ++ return builder.createConvert(loc, builder.getIntegerType(1), value); ++ ++ // TODO: the multiplications/additions by/of zero resulting from ++ // complex * real are optimized by LLVM under -fno-signed-zeros ++ // -fno-honor-nans. ++ // We can make them disappear by default if we: ++ // * either expand the complex multiplication into real ++ // operations, OR ++ // * set nnan nsz fast-math flags to the complex operations. 
++ if (fir::isa_complex(type) && !fir::isa_complex(value.getType())) { ++ mlir::Value zeroCmplx = fir::factory::createZeroValue(builder, loc, type); ++ fir::factory::Complex helper(builder, loc); ++ mlir::Type partType = helper.getComplexPartType(type); ++ return helper.insertComplexPart( ++ zeroCmplx, castToProductType(loc, builder, value, partType), ++ /*isImagPart=*/false); ++ } ++ return builder.createConvert(loc, type, value); ++ } ++ ++ // Generate an update of the inner product value: ++ // acc += v1 * v2, OR ++ // acc ||= v1 && v2 ++ static mlir::Value genAccumulateProduct(mlir::Location loc, ++ fir::FirOpBuilder &builder, ++ mlir::Type resultType, ++ mlir::Value acc, mlir::Value v1, ++ mlir::Value v2) { ++ acc = castToProductType(loc, builder, acc, resultType); ++ v1 = castToProductType(loc, builder, v1, resultType); ++ v2 = castToProductType(loc, builder, v2, resultType); ++ mlir::Value result; ++ if (mlir::isa(resultType)) ++ result = builder.create( ++ loc, acc, builder.create(loc, v1, v2)); ++ else if (mlir::isa(resultType)) ++ result = builder.create( ++ loc, acc, builder.create(loc, v1, v2)); ++ else if (mlir::isa(resultType)) ++ result = builder.create( ++ loc, acc, builder.create(loc, v1, v2)); ++ else if (mlir::isa(resultType)) ++ result = builder.create( ++ loc, acc, builder.create(loc, v1, v2)); ++ else ++ llvm_unreachable("unsupported type"); ++ ++ return builder.createConvert(loc, resultType, result); ++ } ++ + static mlir::LogicalResult + genContiguousMatmul(mlir::Location loc, fir::FirOpBuilder &builder, + hlfir::Entity result, mlir::Value resultShape, +@@ -728,9 +748,9 @@ + hlfir::loadElementAt(loc, builder, lhs, {I, K}); + hlfir::Entity rhsElementValue = + hlfir::loadElementAt(loc, builder, rhs, {K, J}); +- mlir::Value productValue = +- ProductFactory{loc, builder}.genAccumulateProduct( +- resultElementValue, lhsElementValue, rhsElementValue); ++ mlir::Value productValue = genAccumulateProduct( ++ loc, builder, resultElementType, resultElementValue, ++ lhsElementValue, rhsElementValue); + builder.create(loc, productValue, resultElement); + return {}; + }; +@@ -765,9 +785,9 @@ + hlfir::loadElementAt(loc, builder, lhs, {J, K}); + hlfir::Entity rhsElementValue = + hlfir::loadElementAt(loc, builder, rhs, {K}); +- mlir::Value productValue = +- ProductFactory{loc, builder}.genAccumulateProduct( +- resultElementValue, lhsElementValue, rhsElementValue); ++ mlir::Value productValue = genAccumulateProduct( ++ loc, builder, resultElementType, resultElementValue, ++ lhsElementValue, rhsElementValue); + builder.create(loc, productValue, resultElement); + return {}; + }; +@@ -797,9 +817,9 @@ + hlfir::loadElementAt(loc, builder, lhs, {K}); + hlfir::Entity rhsElementValue = + hlfir::loadElementAt(loc, builder, rhs, {K, J}); +- mlir::Value productValue = +- ProductFactory{loc, builder}.genAccumulateProduct( +- resultElementValue, lhsElementValue, rhsElementValue); ++ mlir::Value productValue = genAccumulateProduct( ++ loc, builder, resultElementType, resultElementValue, ++ lhsElementValue, rhsElementValue); + builder.create(loc, productValue, resultElement); + return {}; + }; +@@ -865,9 +885,9 @@ + hlfir::loadElementAt(loc, builder, lhs, lhsIndices); + hlfir::Entity rhsElementValue = + hlfir::loadElementAt(loc, builder, rhs, rhsIndices); +- mlir::Value productValue = +- ProductFactory{loc, builder}.genAccumulateProduct( +- reductionArgs[0], lhsElementValue, rhsElementValue); ++ mlir::Value productValue = genAccumulateProduct( ++ loc, builder, resultElementType, reductionArgs[0], 
lhsElementValue, ++ rhsElementValue); + return {productValue}; + }; + llvm::SmallVector innerProductValue = +@@ -884,73 +904,6 @@ + } + }; + +-class DotProductConversion +- : public mlir::OpRewritePattern { +-public: +- using mlir::OpRewritePattern::OpRewritePattern; +- +- llvm::LogicalResult +- matchAndRewrite(hlfir::DotProductOp product, +- mlir::PatternRewriter &rewriter) const override { +- hlfir::Entity op = hlfir::Entity{product}; +- if (!op.isScalar()) +- return rewriter.notifyMatchFailure(product, "produces non-scalar result"); +- +- mlir::Location loc = product.getLoc(); +- fir::FirOpBuilder builder{rewriter, product.getOperation()}; +- hlfir::Entity lhs = hlfir::Entity{product.getLhs()}; +- hlfir::Entity rhs = hlfir::Entity{product.getRhs()}; +- mlir::Type resultElementType = product.getType(); +- bool isUnordered = mlir::isa(resultElementType) || +- mlir::isa(resultElementType) || +- static_cast(builder.getFastMathFlags() & +- mlir::arith::FastMathFlags::reassoc); +- +- mlir::Value extent = genProductExtent(loc, builder, lhs, rhs); +- +- auto genBody = [&](mlir::Location loc, fir::FirOpBuilder &builder, +- mlir::ValueRange oneBasedIndices, +- mlir::ValueRange reductionArgs) +- -> llvm::SmallVector { +- hlfir::Entity lhsElementValue = +- hlfir::loadElementAt(loc, builder, lhs, oneBasedIndices); +- hlfir::Entity rhsElementValue = +- hlfir::loadElementAt(loc, builder, rhs, oneBasedIndices); +- mlir::Value productValue = +- ProductFactory{loc, builder}.genAccumulateProduct( +- reductionArgs[0], lhsElementValue, rhsElementValue); +- return {productValue}; +- }; +- +- mlir::Value initValue = +- fir::factory::createZeroValue(builder, loc, resultElementType); +- +- llvm::SmallVector result = hlfir::genLoopNestWithReductions( +- loc, builder, {extent}, +- /*reductionInits=*/{initValue}, genBody, isUnordered); +- +- rewriter.replaceOp(product, result[0]); +- return mlir::success(); - } - if (const auto *value = llvm::dyn_cast_if_present(object)) - return {value + index}; - if (auto *operand = llvm::dyn_cast_if_present(object)) -@@ -59,10 +48,6 @@ - - /// See `llvm::detail::indexed_accessor_range_base` for details. 
- Type TypeRange::dereference_iterator(OwnerT object, ptrdiff_t index) { -- if (auto type = llvm::dyn_cast_if_present(object)) { -- assert(index == 0 && "cannot offset into single-value 'TypeRange'"); -- return type; +- +-private: +- static mlir::Value genProductExtent(mlir::Location loc, +- fir::FirOpBuilder &builder, +- hlfir::Entity input1, +- hlfir::Entity input2) { +- llvm::SmallVector input1Extents = +- hlfir::genExtentsVector(loc, builder, input1); +- llvm::SmallVector input2Extents = +- hlfir::genExtentsVector(loc, builder, input2); +- +- assert(input1Extents.size() == 1 && input2Extents.size() == 1 && +- "hlfir.dot_product arguments must be vectors"); +- llvm::SmallVector extent = +- fir::factory::deduceOptimalExtents(input1Extents, input2Extents); +- return extent[0]; - } - if (const auto *value = llvm::dyn_cast_if_present(object)) - return (value + index)->getType(); - if (auto *operand = llvm::dyn_cast_if_present(object)) -diff -ruN --strip-trailing-cr a/mlir/unittests/IR/OperationSupportTest.cpp b/mlir/unittests/IR/OperationSupportTest.cpp ---- a/mlir/unittests/IR/OperationSupportTest.cpp -+++ b/mlir/unittests/IR/OperationSupportTest.cpp -@@ -313,21 +313,4 @@ - op2->destroy(); +-}; +- + class SimplifyHLFIRIntrinsics + : public hlfir::impl::SimplifyHLFIRIntrinsicsBase { + public: +@@ -986,8 +939,6 @@ + if (forceMatmulAsElemental || this->allowNewSideEffects) + patterns.insert>(context); + +- patterns.insert(context); +- + if (mlir::failed(mlir::applyPatternsGreedily( + getOperation(), std::move(patterns), config))) { + mlir::emitError(getOperation()->getLoc(), +diff -ruN --strip-trailing-cr a/flang/test/HLFIR/simplify-hlfir-intrinsics-dotproduct.fir b/flang/test/HLFIR/simplify-hlfir-intrinsics-dotproduct.fir +--- a/flang/test/HLFIR/simplify-hlfir-intrinsics-dotproduct.fir ++++ b/flang/test/HLFIR/simplify-hlfir-intrinsics-dotproduct.fir +@@ -1,144 +0,0 @@ +-// Test hlfir.dot_product simplification to a reduction loop: +-// RUN: fir-opt --simplify-hlfir-intrinsics %s | FileCheck %s +- +-func.func @dot_product_integer(%arg0: !hlfir.expr, %arg1: !hlfir.expr) -> i32 { +- %res = hlfir.dot_product %arg0 %arg1 : (!hlfir.expr, !hlfir.expr) -> i32 +- return %res : i32 +-} +-// CHECK-LABEL: func.func @dot_product_integer( +-// CHECK-SAME: %[[VAL_0:.*]]: !hlfir.expr, +-// CHECK-SAME: %[[VAL_1:.*]]: !hlfir.expr) -> i32 { +-// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index +-// CHECK: %[[VAL_3:.*]] = arith.constant 0 : i32 +-// CHECK: %[[VAL_4:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr) -> !fir.shape<1> +-// CHECK: %[[VAL_5:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 0 : index} : (!fir.shape<1>) -> index +-// CHECK: %[[VAL_6:.*]] = fir.do_loop %[[VAL_7:.*]] = %[[VAL_2]] to %[[VAL_5]] step %[[VAL_2]] unordered iter_args(%[[VAL_8:.*]] = %[[VAL_3]]) -> (i32) { +-// CHECK: %[[VAL_9:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_7]] : (!hlfir.expr, index) -> i16 +-// CHECK: %[[VAL_10:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_7]] : (!hlfir.expr, index) -> i32 +-// CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_9]] : (i16) -> i32 +-// CHECK: %[[VAL_12:.*]] = arith.muli %[[VAL_11]], %[[VAL_10]] : i32 +-// CHECK: %[[VAL_13:.*]] = arith.addi %[[VAL_8]], %[[VAL_12]] : i32 +-// CHECK: fir.result %[[VAL_13]] : i32 +-// CHECK: } +-// CHECK: return %[[VAL_6]] : i32 +-// CHECK: } +- +-func.func @dot_product_real(%arg0: !hlfir.expr, %arg1: !hlfir.expr) -> f32 { +- %res = hlfir.dot_product %arg0 %arg1 : (!hlfir.expr, !hlfir.expr) -> f32 +- return %res : f32 +-} +-// CHECK-LABEL: func.func @dot_product_real( +-// 
CHECK-SAME: %[[VAL_0:.*]]: !hlfir.expr, +-// CHECK-SAME: %[[VAL_1:.*]]: !hlfir.expr) -> f32 { +-// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index +-// CHECK: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32 +-// CHECK: %[[VAL_4:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr) -> !fir.shape<1> +-// CHECK: %[[VAL_5:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 0 : index} : (!fir.shape<1>) -> index +-// CHECK: %[[VAL_6:.*]] = fir.do_loop %[[VAL_7:.*]] = %[[VAL_2]] to %[[VAL_5]] step %[[VAL_2]] iter_args(%[[VAL_8:.*]] = %[[VAL_3]]) -> (f32) { +-// CHECK: %[[VAL_9:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_7]] : (!hlfir.expr, index) -> f32 +-// CHECK: %[[VAL_10:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_7]] : (!hlfir.expr, index) -> f16 +-// CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_10]] : (f16) -> f32 +-// CHECK: %[[VAL_12:.*]] = arith.mulf %[[VAL_9]], %[[VAL_11]] : f32 +-// CHECK: %[[VAL_13:.*]] = arith.addf %[[VAL_8]], %[[VAL_12]] : f32 +-// CHECK: fir.result %[[VAL_13]] : f32 +-// CHECK: } +-// CHECK: return %[[VAL_6]] : f32 +-// CHECK: } +- +-func.func @dot_product_complex(%arg0: !hlfir.expr>, %arg1: !hlfir.expr>) -> complex { +- %res = hlfir.dot_product %arg0 %arg1 : (!hlfir.expr>, !hlfir.expr>) -> complex +- return %res : complex +-} +-// CHECK-LABEL: func.func @dot_product_complex( +-// CHECK-SAME: %[[VAL_0:.*]]: !hlfir.expr>, +-// CHECK-SAME: %[[VAL_1:.*]]: !hlfir.expr>) -> complex { +-// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index +-// CHECK: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32 +-// CHECK: %[[VAL_4:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr>) -> !fir.shape<1> +-// CHECK: %[[VAL_5:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 0 : index} : (!fir.shape<1>) -> index +-// CHECK: %[[VAL_6:.*]] = fir.undefined complex +-// CHECK: %[[VAL_7:.*]] = fir.insert_value %[[VAL_6]], %[[VAL_3]], [0 : index] : (complex, f32) -> complex +-// CHECK: %[[VAL_8:.*]] = fir.insert_value %[[VAL_7]], %[[VAL_3]], [1 : index] : (complex, f32) -> complex +-// CHECK: %[[VAL_9:.*]] = fir.do_loop %[[VAL_10:.*]] = %[[VAL_2]] to %[[VAL_5]] step %[[VAL_2]] iter_args(%[[VAL_11:.*]] = %[[VAL_8]]) -> (complex) { +-// CHECK: %[[VAL_12:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_10]] : (!hlfir.expr>, index) -> complex +-// CHECK: %[[VAL_13:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_10]] : (!hlfir.expr>, index) -> complex +-// CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_13]] : (complex) -> complex +-// CHECK: %[[VAL_15:.*]] = fir.extract_value %[[VAL_12]], [1 : index] : (complex) -> f32 +-// CHECK: %[[VAL_16:.*]] = arith.negf %[[VAL_15]] : f32 +-// CHECK: %[[VAL_17:.*]] = fir.insert_value %[[VAL_12]], %[[VAL_16]], [1 : index] : (complex, f32) -> complex +-// CHECK: %[[VAL_18:.*]] = fir.mulc %[[VAL_17]], %[[VAL_14]] : complex +-// CHECK: %[[VAL_19:.*]] = fir.addc %[[VAL_11]], %[[VAL_18]] : complex +-// CHECK: fir.result %[[VAL_19]] : complex +-// CHECK: } +-// CHECK: return %[[VAL_9]] : complex +-// CHECK: } +- +-func.func @dot_product_real_complex(%arg0: !hlfir.expr, %arg1: !hlfir.expr>) -> complex { +- %res = hlfir.dot_product %arg0 %arg1 : (!hlfir.expr, !hlfir.expr>) -> complex +- return %res : complex +-} +-// CHECK-LABEL: func.func @dot_product_real_complex( +-// CHECK-SAME: %[[VAL_0:.*]]: !hlfir.expr, +-// CHECK-SAME: %[[VAL_1:.*]]: !hlfir.expr>) -> complex { +-// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index +-// CHECK: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32 +-// CHECK: %[[VAL_4:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr) -> !fir.shape<1> +-// CHECK: %[[VAL_5:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 0 : 
index} : (!fir.shape<1>) -> index +-// CHECK: %[[VAL_6:.*]] = fir.undefined complex +-// CHECK: %[[VAL_7:.*]] = fir.insert_value %[[VAL_6]], %[[VAL_3]], [0 : index] : (complex, f32) -> complex +-// CHECK: %[[VAL_8:.*]] = fir.insert_value %[[VAL_7]], %[[VAL_3]], [1 : index] : (complex, f32) -> complex +-// CHECK: %[[VAL_9:.*]] = fir.do_loop %[[VAL_10:.*]] = %[[VAL_2]] to %[[VAL_5]] step %[[VAL_2]] iter_args(%[[VAL_11:.*]] = %[[VAL_8]]) -> (complex) { +-// CHECK: %[[VAL_12:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_10]] : (!hlfir.expr, index) -> f32 +-// CHECK: %[[VAL_13:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_10]] : (!hlfir.expr>, index) -> complex +-// CHECK: %[[VAL_14:.*]] = fir.undefined complex +-// CHECK: %[[VAL_15:.*]] = fir.insert_value %[[VAL_14]], %[[VAL_3]], [0 : index] : (complex, f32) -> complex +-// CHECK: %[[VAL_16:.*]] = fir.insert_value %[[VAL_15]], %[[VAL_3]], [1 : index] : (complex, f32) -> complex +-// CHECK: %[[VAL_17:.*]] = fir.insert_value %[[VAL_16]], %[[VAL_12]], [0 : index] : (complex, f32) -> complex +-// CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_13]] : (complex) -> complex +-// CHECK: %[[VAL_19:.*]] = fir.extract_value %[[VAL_17]], [1 : index] : (complex) -> f32 +-// CHECK: %[[VAL_20:.*]] = arith.negf %[[VAL_19]] : f32 +-// CHECK: %[[VAL_21:.*]] = fir.insert_value %[[VAL_17]], %[[VAL_20]], [1 : index] : (complex, f32) -> complex +-// CHECK: %[[VAL_22:.*]] = fir.mulc %[[VAL_21]], %[[VAL_18]] : complex +-// CHECK: %[[VAL_23:.*]] = fir.addc %[[VAL_11]], %[[VAL_22]] : complex +-// CHECK: fir.result %[[VAL_23]] : complex +-// CHECK: } +-// CHECK: return %[[VAL_9]] : complex +-// CHECK: } +- +-func.func @dot_product_logical(%arg0: !hlfir.expr>, %arg1: !hlfir.expr>) -> !fir.logical<4> { +- %res = hlfir.dot_product %arg0 %arg1 : (!hlfir.expr>, !hlfir.expr>) -> !fir.logical<4> +- return %res : !fir.logical<4> +-} +-// CHECK-LABEL: func.func @dot_product_logical( +-// CHECK-SAME: %[[VAL_0:.*]]: !hlfir.expr>, +-// CHECK-SAME: %[[VAL_1:.*]]: !hlfir.expr>) -> !fir.logical<4> { +-// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index +-// CHECK: %[[VAL_3:.*]] = arith.constant false +-// CHECK: %[[VAL_4:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr>) -> !fir.shape<1> +-// CHECK: %[[VAL_5:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 0 : index} : (!fir.shape<1>) -> index +-// CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_3]] : (i1) -> !fir.logical<4> +-// CHECK: %[[VAL_7:.*]] = fir.do_loop %[[VAL_8:.*]] = %[[VAL_2]] to %[[VAL_5]] step %[[VAL_2]] unordered iter_args(%[[VAL_9:.*]] = %[[VAL_6]]) -> (!fir.logical<4>) { +-// CHECK: %[[VAL_10:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_8]] : (!hlfir.expr>, index) -> !fir.logical<1> +-// CHECK: %[[VAL_11:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_8]] : (!hlfir.expr>, index) -> !fir.logical<4> +-// CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_9]] : (!fir.logical<4>) -> i1 +-// CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_10]] : (!fir.logical<1>) -> i1 +-// CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_11]] : (!fir.logical<4>) -> i1 +-// CHECK: %[[VAL_15:.*]] = arith.andi %[[VAL_13]], %[[VAL_14]] : i1 +-// CHECK: %[[VAL_16:.*]] = arith.ori %[[VAL_12]], %[[VAL_15]] : i1 +-// CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (i1) -> !fir.logical<4> +-// CHECK: fir.result %[[VAL_17]] : !fir.logical<4> +-// CHECK: } +-// CHECK: return %[[VAL_7]] : !fir.logical<4> +-// CHECK: } +- +-func.func @dot_product_known_dim(%arg0: !hlfir.expr<10xf32>, %arg1: !hlfir.expr) -> f32 { +- %res1 = hlfir.dot_product %arg0 %arg1 : (!hlfir.expr<10xf32>, !hlfir.expr) -> f32 +- %res2 = 
hlfir.dot_product %arg1 %arg0 : (!hlfir.expr, !hlfir.expr<10xf32>) -> f32 +- %res = arith.addf %res1, %res2 : f32 +- return %res : f32 +-} +-// CHECK-LABEL: func.func @dot_product_known_dim( +-// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index +-// CHECK: %[[VAL_4:.*]] = arith.constant 10 : index +-// CHECK: fir.do_loop %{{.*}} = %[[VAL_2]] to %[[VAL_4]] step %[[VAL_2]] +-// CHECK: fir.do_loop %{{.*}} = %[[VAL_2]] to %[[VAL_4]] step %[[VAL_2]] +diff -ruN --strip-trailing-cr a/libcxx/include/__config b/libcxx/include/__config +--- a/libcxx/include/__config ++++ b/libcxx/include/__config +@@ -1166,9 +1166,7 @@ + # define _LIBCPP_NOESCAPE + # endif + +-// FIXME: Expand this to [[__gnu__::__nodebug__]] again once the testcase reported in +-// https://github.com/llvm/llvm-project/pull/118710 has been analyzed +-# define _LIBCPP_NODEBUG ++# define _LIBCPP_NODEBUG [[__gnu__::__nodebug__]] + + # if __has_attribute(__standalone_debug__) + # define _LIBCPP_STANDALONE_DEBUG __attribute__((__standalone_debug__)) +diff -ruN --strip-trailing-cr a/libcxx/test/tools/clang_tidy_checks/libcpp_module.cpp b/libcxx/test/tools/clang_tidy_checks/libcpp_module.cpp +--- a/libcxx/test/tools/clang_tidy_checks/libcpp_module.cpp ++++ b/libcxx/test/tools/clang_tidy_checks/libcpp_module.cpp +@@ -27,7 +27,7 @@ + check_factories.registerCheck("libcpp-header-exportable-declarations"); + check_factories.registerCheck("libcpp-hide-from-abi"); + check_factories.registerCheck("libcpp-internal-ftms"); +- // check_factories.registerCheck("libcpp-nodebug-on-aliases"); ++ check_factories.registerCheck("libcpp-nodebug-on-aliases"); + check_factories.registerCheck("libcpp-cpp-version-check"); + check_factories.registerCheck("libcpp-robust-against-adl"); + check_factories.registerCheck("libcpp-uglify-attributes"); +diff -ruN --strip-trailing-cr a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp ++++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +@@ -1140,8 +1140,6 @@ + + setTargetDAGCombine(ISD::SCALAR_TO_VECTOR); + +- setTargetDAGCombine(ISD::SHL); +- + // In case of strict alignment, avoid an excessive number of byte wide stores. + MaxStoresPerMemsetOptSize = 8; + MaxStoresPerMemset = +@@ -26473,43 +26471,6 @@ + return NVCAST; } --TEST(ValueRangeTest, ValueConstructable) { -- MLIRContext context; -- Builder builder(&context); +-/// If the operand is a bitwise AND with a constant RHS, and the shift has a +-/// constant RHS and is the only use, we can pull it out of the shift, i.e. +-/// +-/// (shl (and X, C1), C2) -> (and (shl X, C2), (shl C1, C2)) +-/// +-/// We prefer this canonical form to match existing isel patterns. +-static SDValue performSHLCombine(SDNode *N, +- TargetLowering::DAGCombinerInfo &DCI, +- SelectionDAG &DAG) { +- if (DCI.isBeforeLegalizeOps()) +- return SDValue(); - -- Operation *useOp = -- createOp(&context, /*operands=*/std::nullopt, builder.getIntegerType(16)); -- // Valid construction despite a temporary 'OpResult'. 
-- ValueRange operands = useOp->getResult(0); +- SDValue Op0 = N->getOperand(0); +- if (Op0.getOpcode() != ISD::AND || !Op0.hasOneUse()) +- return SDValue(); - -- useOp->setOperands(operands); -- EXPECT_EQ(useOp->getNumOperands(), 1u); -- EXPECT_EQ(useOp->getOperand(0), useOp->getResult(0)); +- SDValue C1 = Op0->getOperand(1); +- SDValue C2 = N->getOperand(1); +- if (!isa(C1) || !isa(C2)) +- return SDValue(); - -- useOp->dropAllUses(); -- useOp->destroy(); +- // Might be folded into shifted op, do not lower. +- if (N->hasOneUse()) { +- unsigned UseOpc = N->user_begin()->getOpcode(); +- if (UseOpc == ISD::ADD || UseOpc == ISD::SUB || UseOpc == ISD::SETCC || +- UseOpc == AArch64ISD::ADDS || UseOpc == AArch64ISD::SUBS) +- return SDValue(); +- } +- +- SDLoc DL(N); +- EVT VT = N->getValueType(0); +- SDValue X = Op0->getOperand(0); +- SDValue NewRHS = DAG.getNode(ISD::SHL, DL, VT, C1, C2); +- SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, X, C2); +- return DAG.getNode(ISD::AND, DL, VT, NewShift, NewRHS); +-} +- + SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; +@@ -26855,8 +26816,6 @@ + return performCTLZCombine(N, DAG, Subtarget); + case ISD::SCALAR_TO_VECTOR: + return performScalarToVectorCombine(N, DCI, DAG); +- case ISD::SHL: +- return performSHLCombine(N, DCI, DAG); + } + return SDValue(); + } +diff -ruN --strip-trailing-cr a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp ++++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +@@ -4979,7 +4979,7 @@ + // the subvector length. + const unsigned VecVF = getNumElements(Vec->getType()); + SmallVector Mask(VecVF, PoisonMaskElem); +- std::iota(Mask.begin(), std::next(Mask.begin(), Index), 0); ++ std::iota(Mask.begin(), Mask.end(), 0); + for (unsigned I : seq(SubVecVF)) + Mask[I + Index] = I + VecVF; + if (Generator) { +@@ -13956,11 +13956,12 @@ + Instruction *InsElt; + if (auto *VecTy = dyn_cast(Scalar->getType())) { + assert(SLPReVec && "FixedVectorType is not expected."); +- Vec = InsElt = cast(createInsertVector( +- Builder, Vec, Scalar, Pos * getNumElements(VecTy))); +- auto *II = dyn_cast(InsElt); ++ Vec = ++ createInsertVector(Builder, Vec, Scalar, Pos * getNumElements(VecTy)); ++ auto *II = dyn_cast(Vec); + if (!II || II->getIntrinsicID() != Intrinsic::vector_insert) + return Vec; ++ InsElt = II; + } else { + Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos)); + InsElt = dyn_cast(Vec); +diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/AArch64/const-shift-of-constmasked.ll b/llvm/test/CodeGen/AArch64/const-shift-of-constmasked.ll +--- a/llvm/test/CodeGen/AArch64/const-shift-of-constmasked.ll ++++ b/llvm/test/CodeGen/AArch64/const-shift-of-constmasked.ll +@@ -190,7 +190,8 @@ + define i8 @test_i8_7_mask_shl_1(i8 %a0) { + ; CHECK-LABEL: test_i8_7_mask_shl_1: + ; CHECK: // %bb.0: +-; CHECK-NEXT: ubfiz w0, w0, #1, #3 ++; CHECK-NEXT: and w8, w0, #0x7 ++; CHECK-NEXT: lsl w0, w8, #1 + ; CHECK-NEXT: ret + %t0 = and i8 %a0, 7 + %t1 = shl i8 %t0, 1 +@@ -199,7 +200,8 @@ + define i8 @test_i8_7_mask_shl_4(i8 %a0) { + ; CHECK-LABEL: test_i8_7_mask_shl_4: + ; CHECK: // %bb.0: +-; CHECK-NEXT: ubfiz w0, w0, #4, #3 ++; CHECK-NEXT: and w8, w0, #0x7 ++; CHECK-NEXT: lsl w0, w8, #4 + ; CHECK-NEXT: ret + %t0 = and i8 %a0, 7 + %t1 = shl i8 %t0, 4 +@@ -227,8 +229,8 @@ + define i8 @test_i8_28_mask_shl_1(i8 %a0) { + ; CHECK-LABEL: test_i8_28_mask_shl_1: + ; CHECK: // 
%bb.0: +-; CHECK-NEXT: lsl w8, w0, #1 +-; CHECK-NEXT: and w0, w8, #0x38 ++; CHECK-NEXT: and w8, w0, #0x1c ++; CHECK-NEXT: lsl w0, w8, #1 + ; CHECK-NEXT: ret + %t0 = and i8 %a0, 28 + %t1 = shl i8 %t0, 1 +@@ -237,8 +239,8 @@ + define i8 @test_i8_28_mask_shl_2(i8 %a0) { + ; CHECK-LABEL: test_i8_28_mask_shl_2: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsl w8, w0, #2 +-; CHECK-NEXT: and w0, w8, #0x70 ++; CHECK-NEXT: and w8, w0, #0x1c ++; CHECK-NEXT: lsl w0, w8, #2 + ; CHECK-NEXT: ret + %t0 = and i8 %a0, 28 + %t1 = shl i8 %t0, 2 +@@ -247,8 +249,8 @@ + define i8 @test_i8_28_mask_shl_3(i8 %a0) { + ; CHECK-LABEL: test_i8_28_mask_shl_3: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsl w8, w0, #3 +-; CHECK-NEXT: and w0, w8, #0xe0 ++; CHECK-NEXT: and w8, w0, #0x1c ++; CHECK-NEXT: lsl w0, w8, #3 + ; CHECK-NEXT: ret + %t0 = and i8 %a0, 28 + %t1 = shl i8 %t0, 3 +@@ -257,8 +259,8 @@ + define i8 @test_i8_28_mask_shl_4(i8 %a0) { + ; CHECK-LABEL: test_i8_28_mask_shl_4: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsl w8, w0, #4 +-; CHECK-NEXT: and w0, w8, #0xc0 ++; CHECK-NEXT: and w8, w0, #0xc ++; CHECK-NEXT: lsl w0, w8, #4 + ; CHECK-NEXT: ret + %t0 = and i8 %a0, 28 + %t1 = shl i8 %t0, 4 +@@ -268,8 +270,8 @@ + define i8 @test_i8_224_mask_shl_1(i8 %a0) { + ; CHECK-LABEL: test_i8_224_mask_shl_1: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsl w8, w0, #1 +-; CHECK-NEXT: and w0, w8, #0xc0 ++; CHECK-NEXT: and w8, w0, #0x60 ++; CHECK-NEXT: lsl w0, w8, #1 + ; CHECK-NEXT: ret + %t0 = and i8 %a0, 224 + %t1 = shl i8 %t0, 1 +@@ -463,7 +465,8 @@ + define i16 @test_i16_127_mask_shl_1(i16 %a0) { + ; CHECK-LABEL: test_i16_127_mask_shl_1: + ; CHECK: // %bb.0: +-; CHECK-NEXT: ubfiz w0, w0, #1, #7 ++; CHECK-NEXT: and w8, w0, #0x7f ++; CHECK-NEXT: lsl w0, w8, #1 + ; CHECK-NEXT: ret + %t0 = and i16 %a0, 127 + %t1 = shl i16 %t0, 1 +@@ -472,7 +475,8 @@ + define i16 @test_i16_127_mask_shl_8(i16 %a0) { + ; CHECK-LABEL: test_i16_127_mask_shl_8: + ; CHECK: // %bb.0: +-; CHECK-NEXT: ubfiz w0, w0, #8, #7 ++; CHECK-NEXT: and w8, w0, #0x7f ++; CHECK-NEXT: lsl w0, w8, #8 + ; CHECK-NEXT: ret + %t0 = and i16 %a0, 127 + %t1 = shl i16 %t0, 8 +@@ -500,8 +504,8 @@ + define i16 @test_i16_2032_mask_shl_3(i16 %a0) { + ; CHECK-LABEL: test_i16_2032_mask_shl_3: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsl w8, w0, #3 +-; CHECK-NEXT: and w0, w8, #0x3f80 ++; CHECK-NEXT: and w8, w0, #0x7f0 ++; CHECK-NEXT: lsl w0, w8, #3 + ; CHECK-NEXT: ret + %t0 = and i16 %a0, 2032 + %t1 = shl i16 %t0, 3 +@@ -510,8 +514,8 @@ + define i16 @test_i16_2032_mask_shl_4(i16 %a0) { + ; CHECK-LABEL: test_i16_2032_mask_shl_4: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsl w8, w0, #4 +-; CHECK-NEXT: and w0, w8, #0x7f00 ++; CHECK-NEXT: and w8, w0, #0x7f0 ++; CHECK-NEXT: lsl w0, w8, #4 + ; CHECK-NEXT: ret + %t0 = and i16 %a0, 2032 + %t1 = shl i16 %t0, 4 +@@ -520,8 +524,8 @@ + define i16 @test_i16_2032_mask_shl_5(i16 %a0) { + ; CHECK-LABEL: test_i16_2032_mask_shl_5: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsl w8, w0, #5 +-; CHECK-NEXT: and w0, w8, #0xfe00 ++; CHECK-NEXT: and w8, w0, #0x7f0 ++; CHECK-NEXT: lsl w0, w8, #5 + ; CHECK-NEXT: ret + %t0 = and i16 %a0, 2032 + %t1 = shl i16 %t0, 5 +@@ -530,8 +534,8 @@ + define i16 @test_i16_2032_mask_shl_6(i16 %a0) { + ; CHECK-LABEL: test_i16_2032_mask_shl_6: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsl w8, w0, #6 +-; CHECK-NEXT: and w0, w8, #0xfc00 ++; CHECK-NEXT: and w8, w0, #0x3f0 ++; CHECK-NEXT: lsl w0, w8, #6 + ; CHECK-NEXT: ret + %t0 = and i16 %a0, 2032 + %t1 = shl i16 %t0, 6 +@@ -541,8 +545,8 @@ + define i16 @test_i16_65024_mask_shl_1(i16 %a0) { + ; CHECK-LABEL: 
test_i16_65024_mask_shl_1: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsl w8, w0, #1 +-; CHECK-NEXT: and w0, w8, #0xfc00 ++; CHECK-NEXT: and w8, w0, #0x7e00 ++; CHECK-NEXT: lsl w0, w8, #1 + ; CHECK-NEXT: ret + %t0 = and i16 %a0, 65024 + %t1 = shl i16 %t0, 1 +@@ -736,7 +740,8 @@ + define i32 @test_i32_32767_mask_shl_1(i32 %a0) { + ; CHECK-LABEL: test_i32_32767_mask_shl_1: + ; CHECK: // %bb.0: +-; CHECK-NEXT: ubfiz w0, w0, #1, #15 ++; CHECK-NEXT: and w8, w0, #0x7fff ++; CHECK-NEXT: lsl w0, w8, #1 + ; CHECK-NEXT: ret + %t0 = and i32 %a0, 32767 + %t1 = shl i32 %t0, 1 +@@ -745,7 +750,8 @@ + define i32 @test_i32_32767_mask_shl_16(i32 %a0) { + ; CHECK-LABEL: test_i32_32767_mask_shl_16: + ; CHECK: // %bb.0: +-; CHECK-NEXT: ubfiz w0, w0, #16, #15 ++; CHECK-NEXT: and w8, w0, #0x7fff ++; CHECK-NEXT: lsl w0, w8, #16 + ; CHECK-NEXT: ret + %t0 = and i32 %a0, 32767 + %t1 = shl i32 %t0, 16 +@@ -773,8 +779,8 @@ + define i32 @test_i32_8388352_mask_shl_7(i32 %a0) { + ; CHECK-LABEL: test_i32_8388352_mask_shl_7: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsl w8, w0, #7 +-; CHECK-NEXT: and w0, w8, #0x3fff8000 ++; CHECK-NEXT: and w8, w0, #0x7fff00 ++; CHECK-NEXT: lsl w0, w8, #7 + ; CHECK-NEXT: ret + %t0 = and i32 %a0, 8388352 + %t1 = shl i32 %t0, 7 +@@ -783,8 +789,8 @@ + define i32 @test_i32_8388352_mask_shl_8(i32 %a0) { + ; CHECK-LABEL: test_i32_8388352_mask_shl_8: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsl w8, w0, #8 +-; CHECK-NEXT: and w0, w8, #0x7fff0000 ++; CHECK-NEXT: and w8, w0, #0x7fff00 ++; CHECK-NEXT: lsl w0, w8, #8 + ; CHECK-NEXT: ret + %t0 = and i32 %a0, 8388352 + %t1 = shl i32 %t0, 8 +@@ -793,8 +799,8 @@ + define i32 @test_i32_8388352_mask_shl_9(i32 %a0) { + ; CHECK-LABEL: test_i32_8388352_mask_shl_9: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsl w8, w0, #9 +-; CHECK-NEXT: and w0, w8, #0xfffe0000 ++; CHECK-NEXT: and w8, w0, #0x7fff00 ++; CHECK-NEXT: lsl w0, w8, #9 + ; CHECK-NEXT: ret + %t0 = and i32 %a0, 8388352 + %t1 = shl i32 %t0, 9 +@@ -803,8 +809,8 @@ + define i32 @test_i32_8388352_mask_shl_10(i32 %a0) { + ; CHECK-LABEL: test_i32_8388352_mask_shl_10: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsl w8, w0, #10 +-; CHECK-NEXT: and w0, w8, #0xfffc0000 ++; CHECK-NEXT: and w8, w0, #0x3fff00 ++; CHECK-NEXT: lsl w0, w8, #10 + ; CHECK-NEXT: ret + %t0 = and i32 %a0, 8388352 + %t1 = shl i32 %t0, 10 +@@ -814,8 +820,8 @@ + define i32 @test_i32_4294836224_mask_shl_1(i32 %a0) { + ; CHECK-LABEL: test_i32_4294836224_mask_shl_1: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsl w8, w0, #1 +-; CHECK-NEXT: and w0, w8, #0xfffc0000 ++; CHECK-NEXT: and w8, w0, #0x7ffe0000 ++; CHECK-NEXT: lsl w0, w8, #1 + ; CHECK-NEXT: ret + %t0 = and i32 %a0, 4294836224 + %t1 = shl i32 %t0, 1 +@@ -1009,7 +1015,8 @@ + define i64 @test_i64_2147483647_mask_shl_1(i64 %a0) { + ; CHECK-LABEL: test_i64_2147483647_mask_shl_1: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsl w0, w0, #1 ++; CHECK-NEXT: and x8, x0, #0x7fffffff ++; CHECK-NEXT: lsl x0, x8, #1 + ; CHECK-NEXT: ret + %t0 = and i64 %a0, 2147483647 + %t1 = shl i64 %t0, 1 +@@ -1047,8 +1054,8 @@ + define i64 @test_i64_140737488289792_mask_shl_15(i64 %a0) { + ; CHECK-LABEL: test_i64_140737488289792_mask_shl_15: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsl x8, x0, #15 +-; CHECK-NEXT: and x0, x8, #0x3fffffff80000000 ++; CHECK-NEXT: and x8, x0, #0x7fffffff0000 ++; CHECK-NEXT: lsl x0, x8, #15 + ; CHECK-NEXT: ret + %t0 = and i64 %a0, 140737488289792 + %t1 = shl i64 %t0, 15 +@@ -1057,8 +1064,8 @@ + define i64 @test_i64_140737488289792_mask_shl_16(i64 %a0) { + ; CHECK-LABEL: test_i64_140737488289792_mask_shl_16: + ; CHECK: // %bb.0: +-; 
CHECK-NEXT: lsl x8, x0, #16 +-; CHECK-NEXT: and x0, x8, #0x7fffffff00000000 ++; CHECK-NEXT: and x8, x0, #0x7fffffff0000 ++; CHECK-NEXT: lsl x0, x8, #16 + ; CHECK-NEXT: ret + %t0 = and i64 %a0, 140737488289792 + %t1 = shl i64 %t0, 16 +@@ -1067,8 +1074,8 @@ + define i64 @test_i64_140737488289792_mask_shl_17(i64 %a0) { + ; CHECK-LABEL: test_i64_140737488289792_mask_shl_17: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsl x8, x0, #17 +-; CHECK-NEXT: and x0, x8, #0xfffffffe00000000 ++; CHECK-NEXT: and x8, x0, #0x7fffffff0000 ++; CHECK-NEXT: lsl x0, x8, #17 + ; CHECK-NEXT: ret + %t0 = and i64 %a0, 140737488289792 + %t1 = shl i64 %t0, 17 +@@ -1077,8 +1084,8 @@ + define i64 @test_i64_140737488289792_mask_shl_18(i64 %a0) { + ; CHECK-LABEL: test_i64_140737488289792_mask_shl_18: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsl x8, x0, #18 +-; CHECK-NEXT: and x0, x8, #0xfffffffc00000000 ++; CHECK-NEXT: and x8, x0, #0x3fffffff0000 ++; CHECK-NEXT: lsl x0, x8, #18 + ; CHECK-NEXT: ret + %t0 = and i64 %a0, 140737488289792 + %t1 = shl i64 %t0, 18 +@@ -1088,8 +1095,8 @@ + define i64 @test_i64_18446744065119617024_mask_shl_1(i64 %a0) { + ; CHECK-LABEL: test_i64_18446744065119617024_mask_shl_1: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsl x8, x0, #1 +-; CHECK-NEXT: and x0, x8, #0xfffffffc00000000 ++; CHECK-NEXT: and x8, x0, #0x7ffffffe00000000 ++; CHECK-NEXT: lsl x0, x8, #1 + ; CHECK-NEXT: ret + %t0 = and i64 %a0, 18446744065119617024 + %t1 = shl i64 %t0, 1 +diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/AArch64/extract-bits.ll b/llvm/test/CodeGen/AArch64/extract-bits.ll +--- a/llvm/test/CodeGen/AArch64/extract-bits.ll ++++ b/llvm/test/CodeGen/AArch64/extract-bits.ll +@@ -1013,8 +1013,8 @@ + define i32 @c2_i32(i32 %arg) nounwind { + ; CHECK-LABEL: c2_i32: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsr w8, w0, #17 +-; CHECK-NEXT: and w0, w8, #0xffc ++; CHECK-NEXT: ubfx w8, w0, #19, #10 ++; CHECK-NEXT: lsl w0, w8, #2 + ; CHECK-NEXT: ret + %tmp0 = lshr i32 %arg, 19 + %tmp1 = and i32 %tmp0, 1023 +@@ -1063,8 +1063,8 @@ + define i64 @c2_i64(i64 %arg) nounwind { + ; CHECK-LABEL: c2_i64: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsr x8, x0, #49 +-; CHECK-NEXT: and x0, x8, #0xffc ++; CHECK-NEXT: ubfx x8, x0, #51, #10 ++; CHECK-NEXT: lsl x0, x8, #2 + ; CHECK-NEXT: ret + %tmp0 = lshr i64 %arg, 51 + %tmp1 = and i64 %tmp0, 1023 +@@ -1120,8 +1120,8 @@ + define void @c7_i32(i32 %arg, ptr %ptr) nounwind { + ; CHECK-LABEL: c7_i32: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsr w8, w0, #17 +-; CHECK-NEXT: and w8, w8, #0xffc ++; CHECK-NEXT: ubfx w8, w0, #19, #10 ++; CHECK-NEXT: lsl w8, w8, #2 + ; CHECK-NEXT: str w8, [x1] + ; CHECK-NEXT: ret + %tmp0 = lshr i32 %arg, 19 +@@ -1163,8 +1163,8 @@ + define void @c7_i64(i64 %arg, ptr %ptr) nounwind { + ; CHECK-LABEL: c7_i64: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsr x8, x0, #49 +-; CHECK-NEXT: and x8, x8, #0xffc ++; CHECK-NEXT: ubfx x8, x0, #51, #10 ++; CHECK-NEXT: lsl x8, x8, #2 + ; CHECK-NEXT: str x8, [x1] + ; CHECK-NEXT: ret + %tmp0 = lshr i64 %arg, 51 +diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/AArch64/fpenv.ll b/llvm/test/CodeGen/AArch64/fpenv.ll +--- a/llvm/test/CodeGen/AArch64/fpenv.ll ++++ b/llvm/test/CodeGen/AArch64/fpenv.ll +@@ -4,11 +4,11 @@ + define void @func_set_rounding_dyn(i32 %rm) { + ; CHECK-LABEL: func_set_rounding_dyn: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsl w9, w0, #22 ++; CHECK-NEXT: sub w9, w0, #1 + ; CHECK-NEXT: mrs x8, FPCR ++; CHECK-NEXT: and w9, w9, #0x3 + ; CHECK-NEXT: and x8, x8, #0xffffffffff3fffff +-; CHECK-NEXT: sub w9, w9, #1024, lsl #12 // =4194304 +-; CHECK-NEXT: and w9, w9, 
#0xc00000 ++; CHECK-NEXT: lsl w9, w9, #22 + ; CHECK-NEXT: orr x8, x8, x9 + ; CHECK-NEXT: msr FPCR, x8 + ; CHECK-NEXT: ret +diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/AArch64/xbfiz.ll b/llvm/test/CodeGen/AArch64/xbfiz.ll +--- a/llvm/test/CodeGen/AArch64/xbfiz.ll ++++ b/llvm/test/CodeGen/AArch64/xbfiz.ll +@@ -69,19 +69,3 @@ + %and = and i64 %shl, 4294967295 + ret i64 %and + } +- +-define i64 @lsl_zext_i8_i64(i8 %b) { +-; CHECK-LABEL: lsl_zext_i8_i64: +-; CHECK: ubfiz x0, x0, #1, #8 +- %1 = zext i8 %b to i64 +- %2 = shl i64 %1, 1 +- ret i64 %2 -} - - } // namespace -diff -ruN --strip-trailing-cr a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel ---- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel -+++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel -@@ -43,10 +43,7 @@ - - gentbl( - name = "diagnostic_defs_gen", -- tbl_outs = [( -- "-gen-clang-diags-defs -clang-component=%s" % c, -- "include/clang/Basic/Diagnostic%sKinds.inc" % c, -- ) for c in [ -+ tbl_outs = [out for c in [ - "AST", - "Analysis", - "Comment", -@@ -60,6 +57,15 @@ - "Refactoring", - "Sema", - "Serialization", -+ ] for out in [ -+ ( -+ "-gen-clang-diags-defs -clang-component=%s" % c, -+ "include/clang/Basic/Diagnostic%sKinds.inc" % c, -+ ), -+ ( -+ "-gen-clang-diags-enums -clang-component=%s" % c, -+ "include/clang/Basic/Diagnostic%sEnums.inc" % c, -+ ), - ]] + [ - ( - "-gen-clang-diag-groups", +-define i64 @lsl_zext_i16_i64(i16 %b) { +-; CHECK-LABEL: lsl_zext_i16_i64: +-; CHECK: ubfiz x0, x0, #1, #16 +- %1 = zext i16 %b to i64 +- %2 = shl i64 %1, 1 +- ret i64 %2 +-} +diff -ruN --strip-trailing-cr a/llvm/test/Transforms/SLPVectorizer/X86/insert-subvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-subvector.ll +--- a/llvm/test/Transforms/SLPVectorizer/X86/insert-subvector.ll ++++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-subvector.ll +@@ -0,0 +1,81 @@ ++; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 ++; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s ++ ++define <16 x double> @test(ptr %x, double %v, double %a) { ++; CHECK-LABEL: define <16 x double> @test( ++; CHECK-SAME: ptr [[X:%.*]], double [[V:%.*]], double [[A:%.*]]) { ++; CHECK-NEXT: [[GEP6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 8 ++; CHECK-NEXT: [[GEP8:%.*]] = getelementptr inbounds double, ptr [[X]], i64 9 ++; CHECK-NEXT: [[TMP1:%.*]] = load <6 x double>, ptr [[X]], align 4 ++; CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, ptr [[GEP6]], align 4 ++; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, ptr [[GEP8]], align 4 ++; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x double> poison, double [[A]], i32 0 ++; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x double> [[TMP4]], <16 x double> poison, <16 x i32> zeroinitializer ++; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> poison, double [[V]], i32 0 ++; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <4 x i32> zeroinitializer ++; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> poison, double [[V]], i32 0 ++; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> poison, <2 x i32> zeroinitializer ++; CHECK-NEXT: [[TMP10:%.*]] = call <16 x double> @llvm.vector.insert.v16f64.v6f64(<16 x double> poison, <6 x double> [[TMP1]], i64 0) ++; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <16 x i32> ++; CHECK-NEXT: [[TMP12:%.*]] = 
shufflevector <16 x double> [[TMP10]], <16 x double> [[TMP11]], <16 x i32> ++; CHECK-NEXT: [[TMP13:%.*]] = call <16 x double> @llvm.vector.insert.v16f64.v2f64(<16 x double> [[TMP12]], <2 x double> [[TMP6]], i64 6) ++; CHECK-NEXT: [[TMP14:%.*]] = call <16 x double> @llvm.vector.insert.v16f64.v2f64(<16 x double> [[TMP13]], <2 x double> [[TMP7]], i64 8) ++; CHECK-NEXT: [[TMP15:%.*]] = call <16 x double> @llvm.vector.insert.v16f64.v2f64(<16 x double> [[TMP14]], <2 x double> [[TMP9]], i64 10) ++; CHECK-NEXT: [[TMP16:%.*]] = call <16 x double> @llvm.vector.insert.v16f64.v2f64(<16 x double> [[TMP15]], <2 x double> [[TMP9]], i64 12) ++; CHECK-NEXT: [[TMP17:%.*]] = call <16 x double> @llvm.vector.insert.v16f64.v2f64(<16 x double> [[TMP16]], <2 x double> [[TMP9]], i64 14) ++; CHECK-NEXT: [[TMP18:%.*]] = fadd <16 x double> [[TMP5]], [[TMP17]] ++; CHECK-NEXT: ret <16 x double> [[TMP18]] ++; ++ %gep1 = getelementptr inbounds double, ptr %x, i64 1 ++ %gep2 = getelementptr inbounds double, ptr %x, i64 2 ++ %gep3 = getelementptr inbounds double, ptr %x, i64 3 ++ %gep4 = getelementptr inbounds double, ptr %x, i64 4 ++ %gep5 = getelementptr inbounds double, ptr %x, i64 5 ++ %gep6 = getelementptr inbounds double, ptr %x, i64 8 ++ %gep7 = getelementptr inbounds double, ptr %x, i64 9 ++ %gep8 = getelementptr inbounds double, ptr %x, i64 9 ++ %gep9 = getelementptr inbounds double, ptr %x, i64 10 ++ %x0 = load double, ptr %x, align 4 ++ %x1 = load double, ptr %gep1, align 4 ++ %x2 = load double, ptr %gep2, align 4 ++ %x3 = load double, ptr %gep3, align 4 ++ %x4 = load double, ptr %gep4, align 4 ++ %x5 = load double, ptr %gep5, align 4 ++ %x6 = load double, ptr %gep6, align 4 ++ %x7 = load double, ptr %gep7, align 4 ++ %x8 = load double, ptr %gep8, align 4 ++ %x9 = load double, ptr %gep9, align 4 ++ %add1 = fadd double %a, %x0 ++ %add2 = fadd double %a, %x1 ++ %add3 = fadd double %a, %x2 ++ %add4 = fadd double %a, %x3 ++ %add5 = fadd double %a, %x4 ++ %add6 = fadd double %a, %x5 ++ %add7 = fadd double %a, %x6 ++ %add8 = fadd double %a, %x7 ++ %add9 = fadd double %a, %x8 ++ %add10 = fadd double %a, %x9 ++ %add11 = fadd double %a, %v ++ %add12 = fadd double %a, %v ++ %add13 = fadd double %a, %v ++ %add14 = fadd double %a, %v ++ %add15 = fadd double %a, %v ++ %add16 = fadd double %a, %v ++ %i0 = insertelement <16 x double> poison, double %add1, i32 0 ++ %i1 = insertelement <16 x double> %i0, double %add2, i32 1 ++ %i2 = insertelement <16 x double> %i1, double %add3, i32 2 ++ %i3 = insertelement <16 x double> %i2, double %add4, i32 3 ++ %i4 = insertelement <16 x double> %i3, double %add5, i32 4 ++ %i5 = insertelement <16 x double> %i4, double %add6, i32 5 ++ %i6 = insertelement <16 x double> %i5, double %add7, i32 6 ++ %i7 = insertelement <16 x double> %i6, double %add8, i32 7 ++ %i8 = insertelement <16 x double> %i7, double %add9, i32 8 ++ %i9 = insertelement <16 x double> %i8, double %add10, i32 9 ++ %i10 = insertelement <16 x double> %i9, double %add11, i32 10 ++ %i11 = insertelement <16 x double> %i10, double %add12, i32 11 ++ %i12 = insertelement <16 x double> %i11, double %add13, i32 12 ++ %i13 = insertelement <16 x double> %i12, double %add14, i32 13 ++ %i14 = insertelement <16 x double> %i13, double %add15, i32 14 ++ %i15 = insertelement <16 x double> %i14, double %add16, i32 15 ++ ret <16 x double> %i15 ++} diff --git a/third_party/llvm/workspace.bzl b/third_party/llvm/workspace.bzl index 4706c63c0e1cc..cb092919de358 100644 --- a/third_party/llvm/workspace.bzl +++ b/third_party/llvm/workspace.bzl @@ -4,8 
+4,8 @@ load("//third_party:repo.bzl", "tf_http_archive") def repo(name): """Imports LLVM.""" - LLVM_COMMIT = "bf17016a92bc8a23d2cdd2b51355dd4eb5019c68" - LLVM_SHA256 = "ba09f12e5019f5aca531b1733275f0a10b181d6f894deb1a4610e017f76b172a" + LLVM_COMMIT = "13c761789753862a7cc31a2a26f23010afa668b9" + LLVM_SHA256 = "587f3eda6d00d751cbfc69fa5a15475ae4232e191ace04031b343e4e8ae16355" tf_http_archive( name = name, diff --git a/third_party/shardy/temporary.patch b/third_party/shardy/temporary.patch index d68a9c7c5255c..6a074a44f8e2c 100644 --- a/third_party/shardy/temporary.patch +++ b/third_party/shardy/temporary.patch @@ -1,246 +1,1390 @@ diff --git a/third_party/llvm/generated.patch b/third_party/llvm/generated.patch -index 2331b44..3d2a252 100644 +index 3d2a252..8b54ffb 100644 --- a/third_party/llvm/generated.patch +++ b/third_party/llvm/generated.patch -@@ -1,22 +1,207 @@ +@@ -1,207 +1,1156 @@ Auto generated patch. Do not edit or delete it, even if empty. --diff -ruN --strip-trailing-cr a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp ----- a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp --+++ b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp --@@ -1291,7 +1291,7 @@ -- /// bitwise ops to avoid leaving LLVM to scramble with peephole optimizations. -- static Value rewriteI4ToI8Ext(PatternRewriter &rewriter, Location loc, -- Value srcValue, const ExtractNBitsFn &extFn) { --- auto srcVecType = cast(srcValue.getType()); --+ [[maybe_unused]] auto srcVecType = cast(srcValue.getType()); -- assert(srcVecType.getElementType().isSignlessInteger(4) && -- "Expected i4 type"); -+diff -ruN --strip-trailing-cr a/mlir/include/mlir/IR/TypeRange.h b/mlir/include/mlir/IR/TypeRange.h -+--- a/mlir/include/mlir/IR/TypeRange.h -++++ b/mlir/include/mlir/IR/TypeRange.h -+@@ -29,12 +29,11 @@ -+ /// a SmallVector/std::vector. This class should be used in places that are not -+ /// suitable for a more derived type (e.g. ArrayRef) or a template range -+ /// parameter. -+-class TypeRange -+- : public llvm::detail::indexed_accessor_range_base< -+- TypeRange, -+- llvm::PointerUnion, -+- Type, Type, Type> { -++class TypeRange : public llvm::detail::indexed_accessor_range_base< -++ TypeRange, -++ llvm::PointerUnion, -++ Type, Type, Type> { -+ public: -+ using RangeBaseT::RangeBaseT; -+ TypeRange(ArrayRef types = std::nullopt); -+@@ -45,11 +44,8 @@ -+ TypeRange(ValueTypeRange values) -+ : TypeRange(ValueRange(ValueRangeT(values.begin().getCurrent(), -+ values.end().getCurrent()))) {} -+- -+- TypeRange(Type type) : TypeRange(type, /*count=*/1) {} -+- template , Arg> && -+- !std::is_constructible_v>> -++ template , Arg>::value>> -+ TypeRange(Arg &&arg) : TypeRange(ArrayRef(std::forward(arg))) {} -+ TypeRange(std::initializer_list types) -+ : TypeRange(ArrayRef(types)) {} -+@@ -60,9 +56,8 @@ -+ /// * A pointer to the first element of an array of types. -+ /// * A pointer to the first element of an array of operands. -+ /// * A pointer to the first element of an array of results. -+- /// * A single 'Type' instance. -+ using OwnerT = llvm::PointerUnion; -++ detail::OpResultImpl *>; - --@@ -1311,7 +1311,7 @@ -- /// bitwise ops to avoid leaving LLVM to scramble with peephole optimizations. 
-- static Value rewriteI2ToI8Ext(PatternRewriter &rewriter, Location loc, -- Value srcValue, const ExtractNBitsFn &extFn) { --- VectorType srcVecType = cast(srcValue.getType()); --+ [[maybe_unused]] VectorType srcVecType = cast(srcValue.getType()); -- assert(srcVecType.getElementType().isSignlessInteger(2) && -- "Expected i2 type"); -+ /// See `llvm::detail::indexed_accessor_range_base` for details. -+ static OwnerT offset_base(OwnerT object, ptrdiff_t index); -+diff -ruN --strip-trailing-cr a/mlir/include/mlir/IR/ValueRange.h b/mlir/include/mlir/IR/ValueRange.h -+--- a/mlir/include/mlir/IR/ValueRange.h -++++ b/mlir/include/mlir/IR/ValueRange.h -+@@ -374,16 +374,16 @@ -+ /// SmallVector/std::vector. This class should be used in places that are not -+ /// suitable for a more derived type (e.g. ArrayRef) or a template range -+ /// parameter. -+-class ValueRange final : public llvm::detail::indexed_accessor_range_base< -+- ValueRange, -+- PointerUnion, -+- Value, Value, Value> { -++class ValueRange final -++ : public llvm::detail::indexed_accessor_range_base< -++ ValueRange, -++ PointerUnion, -++ Value, Value, Value> { +-diff -ruN --strip-trailing-cr a/mlir/include/mlir/IR/TypeRange.h b/mlir/include/mlir/IR/TypeRange.h +---- a/mlir/include/mlir/IR/TypeRange.h +-+++ b/mlir/include/mlir/IR/TypeRange.h +-@@ -29,12 +29,11 @@ +- /// a SmallVector/std::vector. This class should be used in places that are not +- /// suitable for a more derived type (e.g. ArrayRef) or a template range +- /// parameter. +--class TypeRange +-- : public llvm::detail::indexed_accessor_range_base< +-- TypeRange, +-- llvm::PointerUnion, +-- Type, Type, Type> { +-+class TypeRange : public llvm::detail::indexed_accessor_range_base< +-+ TypeRange, +-+ llvm::PointerUnion, +-+ Type, Type, Type> { +- public: +- using RangeBaseT::RangeBaseT; +- TypeRange(ArrayRef types = std::nullopt); +-@@ -45,11 +44,8 @@ +- TypeRange(ValueTypeRange values) +- : TypeRange(ValueRange(ValueRangeT(values.begin().getCurrent(), +- values.end().getCurrent()))) {} +-- +-- TypeRange(Type type) : TypeRange(type, /*count=*/1) {} +-- template , Arg> && +-- !std::is_constructible_v>> +-+ template , Arg>::value>> +- TypeRange(Arg &&arg) : TypeRange(ArrayRef(std::forward(arg))) {} +- TypeRange(std::initializer_list types) +- : TypeRange(ArrayRef(types)) {} +-@@ -60,9 +56,8 @@ +- /// * A pointer to the first element of an array of types. +- /// * A pointer to the first element of an array of operands. +- /// * A pointer to the first element of an array of results. +-- /// * A single 'Type' instance. +- using OwnerT = llvm::PointerUnion; +-+ detail::OpResultImpl *>; +- +- /// See `llvm::detail::indexed_accessor_range_base` for details. +- static OwnerT offset_base(OwnerT object, ptrdiff_t index); +-diff -ruN --strip-trailing-cr a/mlir/include/mlir/IR/ValueRange.h b/mlir/include/mlir/IR/ValueRange.h +---- a/mlir/include/mlir/IR/ValueRange.h +-+++ b/mlir/include/mlir/IR/ValueRange.h +-@@ -374,16 +374,16 @@ +- /// SmallVector/std::vector. This class should be used in places that are not +- /// suitable for a more derived type (e.g. ArrayRef) or a template range +- /// parameter. +--class ValueRange final : public llvm::detail::indexed_accessor_range_base< +-- ValueRange, +-- PointerUnion, +-- Value, Value, Value> { +-+class ValueRange final +-+ : public llvm::detail::indexed_accessor_range_base< +-+ ValueRange, +-+ PointerUnion, +-+ Value, Value, Value> { +- public: +- /// The type representing the owner of a ValueRange. 
This is either a list of +-- /// values, operands, or results or a single value. +-+ /// values, operands, or results. +- using OwnerT = +-- PointerUnion; +-+ PointerUnion; +- +- using RangeBaseT::RangeBaseT; +- +-@@ -392,7 +392,7 @@ +- std::is_constructible, Arg>::value && +- !std::is_convertible::value>> +- ValueRange(Arg &&arg) : ValueRange(ArrayRef(std::forward(arg))) {} +-- ValueRange(Value value) : ValueRange(value, /*count=*/1) {} +-+ ValueRange(const Value &value) : ValueRange(&value, /*count=*/1) {} +- ValueRange(const std::initializer_list &values) +- : ValueRange(ArrayRef(values)) {} +- ValueRange(iterator_range values) +-diff -ruN --strip-trailing-cr a/mlir/lib/IR/OperationSupport.cpp b/mlir/lib/IR/OperationSupport.cpp +---- a/mlir/lib/IR/OperationSupport.cpp +-+++ b/mlir/lib/IR/OperationSupport.cpp +-@@ -653,15 +653,6 @@ +- /// See `llvm::detail::indexed_accessor_range_base` for details. +- ValueRange::OwnerT ValueRange::offset_base(const OwnerT &owner, +- ptrdiff_t index) { +-- if (llvm::isa_and_nonnull(owner)) { +-- // Prevent out-of-bounds indexing for single values. +-- // Note that we do allow an index of 1 as is required by 'slice'ing that +-- // returns an empty range. This also matches the usual rules of C++ of being +-- // allowed to index past the last element of an array. +-- assert(index <= 1 && "out-of-bound offset into single-value 'ValueRange'"); +-- // Return nullptr to quickly cause segmentation faults on misuse. +-- return index == 0 ? owner : nullptr; +-- } +- if (const auto *value = llvm::dyn_cast_if_present(owner)) +- return {value + index}; +- if (auto *operand = llvm::dyn_cast_if_present(owner)) +-@@ -670,10 +661,6 @@ ++diff -ruN --strip-trailing-cr a/flang/include/flang/Optimizer/Builder/HLFIRTools.h b/flang/include/flang/Optimizer/Builder/HLFIRTools.h ++--- a/flang/include/flang/Optimizer/Builder/HLFIRTools.h +++++ b/flang/include/flang/Optimizer/Builder/HLFIRTools.h ++@@ -513,12 +513,6 @@ ++ Entity loadElementAt(mlir::Location loc, fir::FirOpBuilder &builder, ++ Entity entity, mlir::ValueRange oneBasedIndices); ++ ++-/// Return a vector of extents for the given entity. ++-/// The function creates new operations, but tries to clean-up ++-/// after itself. ++-llvm::SmallVector ++-genExtentsVector(mlir::Location loc, fir::FirOpBuilder &builder, Entity entity); ++- ++ } // namespace hlfir ++ ++ #endif // FORTRAN_OPTIMIZER_BUILDER_HLFIRTOOLS_H ++diff -ruN --strip-trailing-cr a/flang/lib/Optimizer/Builder/HLFIRTools.cpp b/flang/lib/Optimizer/Builder/HLFIRTools.cpp ++--- a/flang/lib/Optimizer/Builder/HLFIRTools.cpp +++++ b/flang/lib/Optimizer/Builder/HLFIRTools.cpp ++@@ -1421,15 +1421,3 @@ ++ return loadTrivialScalar(loc, builder, ++ getElementAt(loc, builder, entity, oneBasedIndices)); + } +- /// See `llvm::detail::indexed_accessor_range_base` for details. 
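
dereference_iterator below is the other half of the contract these nested hunks toggle: when the owner union carried a single inline Value, both offset_base and dereference_iterator needed special cases, and the alternative constructor ValueRange(const Value &value) : ValueRange(&value, /*count=*/1) instead stores a pointer to a caller-owned value so the generic pointer arithmetic covers everything. A standalone sketch of the two owner models, in plain C++17 with none of the MLIR types:

#include <cassert>
#include <cstddef>
#include <variant>

class IntRange {
public:
  IntRange(const int *data, std::size_t count) : owner(data), count(count) {}
  // Inline single-value owner, analogous to the 'Value' union member.
  explicit IntRange(int value) : owner(value), count(1) {}

  int operator[](std::size_t index) const {
    if (const int *inlineValue = std::get_if<int>(&owner)) {
      // The special case the hunks above delete: only index 0 exists.
      assert(index == 0 && "cannot offset into single-value range");
      return *inlineValue;
    }
    assert(index < count && "out-of-bounds index");
    return std::get<const int *>(owner)[index];
  }

private:
  std::variant<const int *, int> owner;
  std::size_t count;
};

int main() {
  int xs[] = {1, 2, 3};
  IntRange many(xs, 3);
  IntRange one(42);
  return (many[2] == 3 && one[0] == 42) ? 0 : 1;
}

The pointer-plus-count model keeps the range trivially cheap, at the cost that the referenced value must outlive the range; that lifetime question is exactly what the deleted ValueConstructable unit test further down exercised with a temporary OpResult.
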
+- Value ValueRange::dereference_iterator(const OwnerT &owner, ptrdiff_t index) { +-- if (auto value = llvm::dyn_cast_if_present(owner)) { +-- assert(index == 0 && "cannot offset into single-value 'ValueRange'"); +-- return value; ++- ++-llvm::SmallVector ++-hlfir::genExtentsVector(mlir::Location loc, fir::FirOpBuilder &builder, ++- hlfir::Entity entity) { ++- entity = hlfir::derefPointersAndAllocatables(loc, builder, entity); ++- mlir::Value shape = hlfir::genShape(loc, builder, entity); ++- llvm::SmallVector extents = ++- hlfir::getExplicitExtentsFromShape(shape, builder); ++- if (shape.getUses().empty()) ++- shape.getDefiningOp()->erase(); ++- return extents; ++-} ++diff -ruN --strip-trailing-cr a/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp b/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp ++--- a/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp +++++ b/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp ++@@ -37,79 +37,6 @@ ++ ++ namespace { ++ ++-// Helper class to generate operations related to computing ++-// product of values. ++-class ProductFactory { ++-public: ++- ProductFactory(mlir::Location loc, fir::FirOpBuilder &builder) ++- : loc(loc), builder(builder) {} ++- ++- // Generate an update of the inner product value: ++- // acc += v1 * v2, OR ++- // acc += CONJ(v1) * v2, OR ++- // acc ||= v1 && v2 ++- // ++- // CONJ parameter specifies whether the first complex product argument ++- // needs to be conjugated. ++- template ++- mlir::Value genAccumulateProduct(mlir::Value acc, mlir::Value v1, ++- mlir::Value v2) { ++- mlir::Type resultType = acc.getType(); ++- acc = castToProductType(acc, resultType); ++- v1 = castToProductType(v1, resultType); ++- v2 = castToProductType(v2, resultType); ++- mlir::Value result; ++- if (mlir::isa(resultType)) { ++- result = builder.create( ++- loc, acc, builder.create(loc, v1, v2)); ++- } else if (mlir::isa(resultType)) { ++- if constexpr (CONJ) ++- result = fir::IntrinsicLibrary{builder, loc}.genConjg(resultType, v1); ++- else ++- result = v1; ++- ++- result = builder.create( ++- loc, acc, builder.create(loc, result, v2)); ++- } else if (mlir::isa(resultType)) { ++- result = builder.create( ++- loc, acc, builder.create(loc, v1, v2)); ++- } else if (mlir::isa(resultType)) { ++- result = builder.create( ++- loc, acc, builder.create(loc, v1, v2)); ++- } else { ++- llvm_unreachable("unsupported type"); ++- } ++- ++- return builder.createConvert(loc, resultType, result); + - } +- if (const auto *value = llvm::dyn_cast_if_present(owner)) +- return value[index]; +- if (auto *operand = llvm::dyn_cast_if_present(owner)) +-diff -ruN --strip-trailing-cr a/mlir/lib/IR/TypeRange.cpp b/mlir/lib/IR/TypeRange.cpp +---- a/mlir/lib/IR/TypeRange.cpp +-+++ b/mlir/lib/IR/TypeRange.cpp +-@@ -31,23 +31,12 @@ +- this->base = result; +- else if (auto *operand = llvm::dyn_cast_if_present(owner)) +- this->base = operand; +-- else if (auto value = llvm::dyn_cast_if_present(owner)) +-- this->base = value.getType(); +- else +- this->base = cast(owner); +- } ++- ++-private: ++- mlir::Location loc; ++- fir::FirOpBuilder &builder; ++- ++- mlir::Value castToProductType(mlir::Value value, mlir::Type type) { ++- if (mlir::isa(type)) ++- return builder.createConvert(loc, builder.getIntegerType(1), value); ++- ++- // TODO: the multiplications/additions by/of zero resulting from ++- // complex * real are optimized by LLVM under -fno-signed-zeros ++- // -fno-honor-nans. 
++- // We can make them disappear by default if we: ++- // * either expand the complex multiplication into real ++- // operations, OR ++- // * set nnan nsz fast-math flags to the complex operations. ++- if (fir::isa_complex(type) && !fir::isa_complex(value.getType())) { ++- mlir::Value zeroCmplx = fir::factory::createZeroValue(builder, loc, type); ++- fir::factory::Complex helper(builder, loc); ++- mlir::Type partType = helper.getComplexPartType(type); ++- return helper.insertComplexPart(zeroCmplx, ++- castToProductType(value, partType), ++- /*isImagPart=*/false); ++- } ++- return builder.createConvert(loc, type, value); ++- } ++-}; ++- ++ class TransposeAsElementalConversion ++ : public mlir::OpRewritePattern { + public: -+ /// The type representing the owner of a ValueRange. This is either a list of -+- /// values, operands, or results or a single value. -++ /// values, operands, or results. -+ using OwnerT = -+- PointerUnion; -++ PointerUnion; ++@@ -163,8 +90,11 @@ ++ static mlir::Value genResultShape(mlir::Location loc, ++ fir::FirOpBuilder &builder, ++ hlfir::Entity array) { ++- llvm::SmallVector inExtents = ++- hlfir::genExtentsVector(loc, builder, array); +++ mlir::Value inShape = hlfir::genShape(loc, builder, array); +++ llvm::SmallVector inExtents = +++ hlfir::getExplicitExtentsFromShape(inShape, builder); +++ if (inShape.getUses().empty()) +++ inShape.getDefiningOp()->erase(); ++ ++ // transpose indices ++ assert(inExtents.size() == 2 && "checked in TransposeOp::validate"); ++@@ -207,7 +137,7 @@ ++ mlir::Value resultShape, dimExtent; ++ llvm::SmallVector arrayExtents; ++ if (isTotalReduction) ++- arrayExtents = hlfir::genExtentsVector(loc, builder, array); +++ arrayExtents = genArrayExtents(loc, builder, array); ++ else ++ std::tie(resultShape, dimExtent) = ++ genResultShapeForPartialReduction(loc, builder, array, dimVal); ++@@ -233,8 +163,7 @@ ++ // If DIM is not present, do total reduction. ++ ++ // Initial value for the reduction. ++- mlir::Value reductionInitValue = ++- fir::factory::createZeroValue(builder, loc, elementType); +++ mlir::Value reductionInitValue = genInitValue(loc, builder, elementType); ++ ++ // The reduction loop may be unordered if FastMathFlags::reassoc ++ // transformations are allowed. The integer reduction is always ++@@ -335,6 +264,17 @@ ++ } ++ ++ private: +++ static llvm::SmallVector +++ genArrayExtents(mlir::Location loc, fir::FirOpBuilder &builder, +++ hlfir::Entity array) { +++ mlir::Value inShape = hlfir::genShape(loc, builder, array); +++ llvm::SmallVector inExtents = +++ hlfir::getExplicitExtentsFromShape(inShape, builder); +++ if (inShape.getUses().empty()) +++ inShape.getDefiningOp()->erase(); +++ return inExtents; +++ } +++ ++ // Return fir.shape specifying the shape of the result ++ // of a SUM reduction with DIM=dimVal. The second return value ++ // is the extent of the DIM dimension. ++@@ -343,7 +283,7 @@ ++ fir::FirOpBuilder &builder, ++ hlfir::Entity array, int64_t dimVal) { ++ llvm::SmallVector inExtents = ++- hlfir::genExtentsVector(loc, builder, array); +++ genArrayExtents(loc, builder, array); ++ assert(dimVal > 0 && dimVal <= static_cast(inExtents.size()) && ++ "DIM must be present and a positive constant not exceeding " ++ "the array's rank"); ++@@ -353,6 +293,26 @@ ++ return {builder.create(loc, inExtents), dimExtent}; ++ } ++ +++ // Generate the initial value for a SUM reduction with the given +++ // data type. 
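
The genInitValue helper that follows builds the additive identity with FIR builders; in plain-C++ terms (a sketch only, with std::complex standing in for the FIR complex type, and assuming, as the llvm_unreachable below does, that only floating, complex, and integer element types reach a SUM reduction):

#include <complex>
#include <type_traits>

// Additive identity for a SUM accumulator: +0.0 for floats (matching
// APFloat::getZero's positive zero), zero real and imaginary parts for
// complex, and plain 0 for integers.
template <typename T>
T sumInitValue() {
  if constexpr (std::is_floating_point_v<T>)
    return T(0);
  else if constexpr (std::is_integral_v<T>)
    return T(0);
  else
    return T(typename T::value_type(0), typename T::value_type(0));
}

int main() {
  return (sumInitValue<double>() == 0.0 && sumInitValue<int>() == 0 &&
          sumInitValue<std::complex<float>>() == std::complex<float>(0, 0))
             ? 0
             : 1;
}
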
+++ static mlir::Value genInitValue(mlir::Location loc, +++ fir::FirOpBuilder &builder, +++ mlir::Type elementType) { +++ if (auto ty = mlir::dyn_cast(elementType)) { +++ const llvm::fltSemantics &sem = ty.getFloatSemantics(); +++ return builder.createRealConstant(loc, elementType, +++ llvm::APFloat::getZero(sem)); +++ } else if (auto ty = mlir::dyn_cast(elementType)) { +++ mlir::Value initValue = genInitValue(loc, builder, ty.getElementType()); +++ return fir::factory::Complex{builder, loc}.createComplex(ty, initValue, +++ initValue); +++ } else if (mlir::isa(elementType)) { +++ return builder.createIntegerConstant(loc, elementType, 0); +++ } +++ +++ llvm_unreachable("unsupported SUM reduction type"); +++ } +++ ++ // Generate scalar addition of the two values (of the same data type). ++ static mlir::Value genScalarAdd(mlir::Location loc, ++ fir::FirOpBuilder &builder, ++@@ -610,10 +570,16 @@ ++ static std::tuple ++ genResultShape(mlir::Location loc, fir::FirOpBuilder &builder, ++ hlfir::Entity input1, hlfir::Entity input2) { ++- llvm::SmallVector input1Extents = ++- hlfir::genExtentsVector(loc, builder, input1); ++- llvm::SmallVector input2Extents = ++- hlfir::genExtentsVector(loc, builder, input2); +++ mlir::Value input1Shape = hlfir::genShape(loc, builder, input1); +++ llvm::SmallVector input1Extents = +++ hlfir::getExplicitExtentsFromShape(input1Shape, builder); +++ if (input1Shape.getUses().empty()) +++ input1Shape.getDefiningOp()->erase(); +++ mlir::Value input2Shape = hlfir::genShape(loc, builder, input2); +++ llvm::SmallVector input2Extents = +++ hlfir::getExplicitExtentsFromShape(input2Shape, builder); +++ if (input2Shape.getUses().empty()) +++ input2Shape.getDefiningOp()->erase(); -+ using RangeBaseT::RangeBaseT; -+ -+@@ -392,7 +392,7 @@ -+ std::is_constructible, Arg>::value && -+ !std::is_convertible::value>> -+ ValueRange(Arg &&arg) : ValueRange(ArrayRef(std::forward(arg))) {} -+- ValueRange(Value value) : ValueRange(value, /*count=*/1) {} -++ ValueRange(const Value &value) : ValueRange(&value, /*count=*/1) {} -+ ValueRange(const std::initializer_list &values) -+ : ValueRange(ArrayRef(values)) {} -+ ValueRange(iterator_range values) -+diff -ruN --strip-trailing-cr a/mlir/lib/IR/OperationSupport.cpp b/mlir/lib/IR/OperationSupport.cpp -+--- a/mlir/lib/IR/OperationSupport.cpp -++++ b/mlir/lib/IR/OperationSupport.cpp -+@@ -653,15 +653,6 @@ -+ /// See `llvm::detail::indexed_accessor_range_base` for details. -+ ValueRange::OwnerT ValueRange::offset_base(const OwnerT &owner, -+ ptrdiff_t index) { -+- if (llvm::isa_and_nonnull(owner)) { -+- // Prevent out-of-bounds indexing for single values. -+- // Note that we do allow an index of 1 as is required by 'slice'ing that -+- // returns an empty range. This also matches the usual rules of C++ of being -+- // allowed to index past the last element of an array. -+- assert(index <= 1 && "out-of-bound offset into single-value 'ValueRange'"); -+- // Return nullptr to quickly cause segmentation faults on misuse. -+- return index == 0 ? owner : nullptr; -+- } -+ if (const auto *value = llvm::dyn_cast_if_present(owner)) -+ return {value + index}; -+ if (auto *operand = llvm::dyn_cast_if_present(owner)) -+@@ -670,10 +661,6 @@ -+ } -+ /// See `llvm::detail::indexed_accessor_range_base` for details. 
-+ Value ValueRange::dereference_iterator(const OwnerT &owner, ptrdiff_t index) { -+- if (auto value = llvm::dyn_cast_if_present(owner)) { -+- assert(index == 0 && "cannot offset into single-value 'ValueRange'"); -+- return value; -+- } -+ if (const auto *value = llvm::dyn_cast_if_present(owner)) -+ return value[index]; -+ if (auto *operand = llvm::dyn_cast_if_present(owner)) -+diff -ruN --strip-trailing-cr a/mlir/lib/IR/TypeRange.cpp b/mlir/lib/IR/TypeRange.cpp -+--- a/mlir/lib/IR/TypeRange.cpp -++++ b/mlir/lib/IR/TypeRange.cpp -+@@ -31,23 +31,12 @@ -+ this->base = result; -+ else if (auto *operand = llvm::dyn_cast_if_present(owner)) -+ this->base = operand; -+- else if (auto value = llvm::dyn_cast_if_present(owner)) -+- this->base = value.getType(); -+ else -+ this->base = cast(owner); -+ } +- /// See `llvm::detail::indexed_accessor_range_base` for details. +- TypeRange::OwnerT TypeRange::offset_base(OwnerT object, ptrdiff_t index) { +-- if (llvm::isa_and_nonnull(object)) { +-- // Prevent out-of-bounds indexing for single values. +-- // Note that we do allow an index of 1 as is required by 'slice'ing that +-- // returns an empty range. This also matches the usual rules of C++ of being +-- // allowed to index past the last element of an array. +-- assert(index <= 1 && "out-of-bound offset into single-value 'ValueRange'"); +-- // Return nullptr to quickly cause segmentation faults on misuse. +-- return index == 0 ? object : nullptr; ++ llvm::SmallVector newExtents; ++ mlir::Value innerProduct1Extent, innerProduct2Extent; ++@@ -661,6 +627,60 @@ ++ innerProductExtent[0]}; ++ } + -+ /// See `llvm::detail::indexed_accessor_range_base` for details. -+ TypeRange::OwnerT TypeRange::offset_base(OwnerT object, ptrdiff_t index) { -+- if (llvm::isa_and_nonnull(object)) { -+- // Prevent out-of-bounds indexing for single values. -+- // Note that we do allow an index of 1 as is required by 'slice'ing that -+- // returns an empty range. This also matches the usual rules of C++ of being -+- // allowed to index past the last element of an array. -+- assert(index <= 1 && "out-of-bound offset into single-value 'ValueRange'"); -+- // Return nullptr to quickly cause segmentation faults on misuse. -+- return index == 0 ? object : nullptr; -+- } -+ if (const auto *value = llvm::dyn_cast_if_present(object)) -+ return {value + index}; -+ if (auto *operand = llvm::dyn_cast_if_present(object)) -+@@ -59,10 +48,6 @@ -+ -+ /// See `llvm::detail::indexed_accessor_range_base` for details. 
-+ Type TypeRange::dereference_iterator(OwnerT object, ptrdiff_t index) { -+- if (auto type = llvm::dyn_cast_if_present(object)) { -+- assert(index == 0 && "cannot offset into single-value 'TypeRange'"); -+- return type; -+- } -+ if (const auto *value = llvm::dyn_cast_if_present(object)) -+ return (value + index)->getType(); -+ if (auto *operand = llvm::dyn_cast_if_present(object)) -+diff -ruN --strip-trailing-cr a/mlir/unittests/IR/OperationSupportTest.cpp b/mlir/unittests/IR/OperationSupportTest.cpp -+--- a/mlir/unittests/IR/OperationSupportTest.cpp -++++ b/mlir/unittests/IR/OperationSupportTest.cpp -+@@ -313,21 +313,4 @@ -+ op2->destroy(); -+ } +++ static mlir::Value castToProductType(mlir::Location loc, +++ fir::FirOpBuilder &builder, +++ mlir::Value value, mlir::Type type) { +++ if (mlir::isa(type)) +++ return builder.createConvert(loc, builder.getIntegerType(1), value); +++ +++ // TODO: the multiplications/additions by/of zero resulting from +++ // complex * real are optimized by LLVM under -fno-signed-zeros +++ // -fno-honor-nans. +++ // We can make them disappear by default if we: +++ // * either expand the complex multiplication into real +++ // operations, OR +++ // * set nnan nsz fast-math flags to the complex operations. +++ if (fir::isa_complex(type) && !fir::isa_complex(value.getType())) { +++ mlir::Value zeroCmplx = fir::factory::createZeroValue(builder, loc, type); +++ fir::factory::Complex helper(builder, loc); +++ mlir::Type partType = helper.getComplexPartType(type); +++ return helper.insertComplexPart( +++ zeroCmplx, castToProductType(loc, builder, value, partType), +++ /*isImagPart=*/false); +++ } +++ return builder.createConvert(loc, type, value); +++ } +++ +++ // Generate an update of the inner product value: +++ // acc += v1 * v2, OR +++ // acc ||= v1 && v2 +++ static mlir::Value genAccumulateProduct(mlir::Location loc, +++ fir::FirOpBuilder &builder, +++ mlir::Type resultType, +++ mlir::Value acc, mlir::Value v1, +++ mlir::Value v2) { +++ acc = castToProductType(loc, builder, acc, resultType); +++ v1 = castToProductType(loc, builder, v1, resultType); +++ v2 = castToProductType(loc, builder, v2, resultType); +++ mlir::Value result; +++ if (mlir::isa(resultType)) +++ result = builder.create( +++ loc, acc, builder.create(loc, v1, v2)); +++ else if (mlir::isa(resultType)) +++ result = builder.create( +++ loc, acc, builder.create(loc, v1, v2)); +++ else if (mlir::isa(resultType)) +++ result = builder.create( +++ loc, acc, builder.create(loc, v1, v2)); +++ else if (mlir::isa(resultType)) +++ result = builder.create( +++ loc, acc, builder.create(loc, v1, v2)); +++ else +++ llvm_unreachable("unsupported type"); +++ +++ return builder.createConvert(loc, resultType, result); +++ } +++ ++ static mlir::LogicalResult ++ genContiguousMatmul(mlir::Location loc, fir::FirOpBuilder &builder, ++ hlfir::Entity result, mlir::Value resultShape, ++@@ -728,9 +748,9 @@ ++ hlfir::loadElementAt(loc, builder, lhs, {I, K}); ++ hlfir::Entity rhsElementValue = ++ hlfir::loadElementAt(loc, builder, rhs, {K, J}); ++- mlir::Value productValue = ++- ProductFactory{loc, builder}.genAccumulateProduct( ++- resultElementValue, lhsElementValue, rhsElementValue); +++ mlir::Value productValue = genAccumulateProduct( +++ loc, builder, resultElementType, resultElementValue, +++ lhsElementValue, rhsElementValue); ++ builder.create(loc, productValue, resultElement); ++ return {}; ++ }; ++@@ -765,9 +785,9 @@ ++ hlfir::loadElementAt(loc, builder, lhs, {J, K}); ++ hlfir::Entity rhsElementValue = ++ 
hlfir::loadElementAt(loc, builder, rhs, {K}); ++- mlir::Value productValue = ++- ProductFactory{loc, builder}.genAccumulateProduct( ++- resultElementValue, lhsElementValue, rhsElementValue); +++ mlir::Value productValue = genAccumulateProduct( +++ loc, builder, resultElementType, resultElementValue, +++ lhsElementValue, rhsElementValue); ++ builder.create(loc, productValue, resultElement); ++ return {}; ++ }; ++@@ -797,9 +817,9 @@ ++ hlfir::loadElementAt(loc, builder, lhs, {K}); ++ hlfir::Entity rhsElementValue = ++ hlfir::loadElementAt(loc, builder, rhs, {K, J}); ++- mlir::Value productValue = ++- ProductFactory{loc, builder}.genAccumulateProduct( ++- resultElementValue, lhsElementValue, rhsElementValue); +++ mlir::Value productValue = genAccumulateProduct( +++ loc, builder, resultElementType, resultElementValue, +++ lhsElementValue, rhsElementValue); ++ builder.create(loc, productValue, resultElement); ++ return {}; ++ }; ++@@ -865,9 +885,9 @@ ++ hlfir::loadElementAt(loc, builder, lhs, lhsIndices); ++ hlfir::Entity rhsElementValue = ++ hlfir::loadElementAt(loc, builder, rhs, rhsIndices); ++- mlir::Value productValue = ++- ProductFactory{loc, builder}.genAccumulateProduct( ++- reductionArgs[0], lhsElementValue, rhsElementValue); +++ mlir::Value productValue = genAccumulateProduct( +++ loc, builder, resultElementType, reductionArgs[0], lhsElementValue, +++ rhsElementValue); ++ return {productValue}; ++ }; ++ llvm::SmallVector innerProductValue = ++@@ -884,73 +904,6 @@ ++ } ++ }; ++ ++-class DotProductConversion ++- : public mlir::OpRewritePattern { ++-public: ++- using mlir::OpRewritePattern::OpRewritePattern; ++- ++- llvm::LogicalResult ++- matchAndRewrite(hlfir::DotProductOp product, ++- mlir::PatternRewriter &rewriter) const override { ++- hlfir::Entity op = hlfir::Entity{product}; ++- if (!op.isScalar()) ++- return rewriter.notifyMatchFailure(product, "produces non-scalar result"); ++- ++- mlir::Location loc = product.getLoc(); ++- fir::FirOpBuilder builder{rewriter, product.getOperation()}; ++- hlfir::Entity lhs = hlfir::Entity{product.getLhs()}; ++- hlfir::Entity rhs = hlfir::Entity{product.getRhs()}; ++- mlir::Type resultElementType = product.getType(); ++- bool isUnordered = mlir::isa(resultElementType) || ++- mlir::isa(resultElementType) || ++- static_cast(builder.getFastMathFlags() & ++- mlir::arith::FastMathFlags::reassoc); ++- ++- mlir::Value extent = genProductExtent(loc, builder, lhs, rhs); ++- ++- auto genBody = [&](mlir::Location loc, fir::FirOpBuilder &builder, ++- mlir::ValueRange oneBasedIndices, ++- mlir::ValueRange reductionArgs) ++- -> llvm::SmallVector { ++- hlfir::Entity lhsElementValue = ++- hlfir::loadElementAt(loc, builder, lhs, oneBasedIndices); ++- hlfir::Entity rhsElementValue = ++- hlfir::loadElementAt(loc, builder, rhs, oneBasedIndices); ++- mlir::Value productValue = ++- ProductFactory{loc, builder}.genAccumulateProduct( ++- reductionArgs[0], lhsElementValue, rhsElementValue); ++- return {productValue}; ++- }; ++- ++- mlir::Value initValue = ++- fir::factory::createZeroValue(builder, loc, resultElementType); ++- ++- llvm::SmallVector result = hlfir::genLoopNestWithReductions( ++- loc, builder, {extent}, ++- /*reductionInits=*/{initValue}, genBody, isUnordered); ++- ++- rewriter.replaceOp(product, result[0]); ++- return mlir::success(); + - } +- if (const auto *value = llvm::dyn_cast_if_present(object)) +- return {value + index}; +- if (auto *operand = llvm::dyn_cast_if_present(object)) +-@@ -59,10 +48,6 @@ +- +- /// See 
`llvm::detail::indexed_accessor_range_base` for details. +- Type TypeRange::dereference_iterator(OwnerT object, ptrdiff_t index) { +-- if (auto type = llvm::dyn_cast_if_present(object)) { +-- assert(index == 0 && "cannot offset into single-value 'TypeRange'"); +-- return type; ++- ++-private: ++- static mlir::Value genProductExtent(mlir::Location loc, ++- fir::FirOpBuilder &builder, ++- hlfir::Entity input1, ++- hlfir::Entity input2) { ++- llvm::SmallVector input1Extents = ++- hlfir::genExtentsVector(loc, builder, input1); ++- llvm::SmallVector input2Extents = ++- hlfir::genExtentsVector(loc, builder, input2); ++- ++- assert(input1Extents.size() == 1 && input2Extents.size() == 1 && ++- "hlfir.dot_product arguments must be vectors"); ++- llvm::SmallVector extent = ++- fir::factory::deduceOptimalExtents(input1Extents, input2Extents); ++- return extent[0]; + - } +- if (const auto *value = llvm::dyn_cast_if_present(object)) +- return (value + index)->getType(); +- if (auto *operand = llvm::dyn_cast_if_present(object)) +-diff -ruN --strip-trailing-cr a/mlir/unittests/IR/OperationSupportTest.cpp b/mlir/unittests/IR/OperationSupportTest.cpp +---- a/mlir/unittests/IR/OperationSupportTest.cpp +-+++ b/mlir/unittests/IR/OperationSupportTest.cpp +-@@ -313,21 +313,4 @@ +- op2->destroy(); ++-}; ++- ++ class SimplifyHLFIRIntrinsics ++ : public hlfir::impl::SimplifyHLFIRIntrinsicsBase { ++ public: ++@@ -986,8 +939,6 @@ ++ if (forceMatmulAsElemental || this->allowNewSideEffects) ++ patterns.insert>(context); + -+-TEST(ValueRangeTest, ValueConstructable) { -+- MLIRContext context; -+- Builder builder(&context); ++- patterns.insert(context); ++- ++ if (mlir::failed(mlir::applyPatternsGreedily( ++ getOperation(), std::move(patterns), config))) { ++ mlir::emitError(getOperation()->getLoc(), ++diff -ruN --strip-trailing-cr a/flang/test/HLFIR/simplify-hlfir-intrinsics-dotproduct.fir b/flang/test/HLFIR/simplify-hlfir-intrinsics-dotproduct.fir ++--- a/flang/test/HLFIR/simplify-hlfir-intrinsics-dotproduct.fir +++++ b/flang/test/HLFIR/simplify-hlfir-intrinsics-dotproduct.fir ++@@ -1,144 +0,0 @@ ++-// Test hlfir.dot_product simplification to a reduction loop: ++-// RUN: fir-opt --simplify-hlfir-intrinsics %s | FileCheck %s ++- ++-func.func @dot_product_integer(%arg0: !hlfir.expr, %arg1: !hlfir.expr) -> i32 { ++- %res = hlfir.dot_product %arg0 %arg1 : (!hlfir.expr, !hlfir.expr) -> i32 ++- return %res : i32 ++-} ++-// CHECK-LABEL: func.func @dot_product_integer( ++-// CHECK-SAME: %[[VAL_0:.*]]: !hlfir.expr, ++-// CHECK-SAME: %[[VAL_1:.*]]: !hlfir.expr) -> i32 { ++-// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index ++-// CHECK: %[[VAL_3:.*]] = arith.constant 0 : i32 ++-// CHECK: %[[VAL_4:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr) -> !fir.shape<1> ++-// CHECK: %[[VAL_5:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 0 : index} : (!fir.shape<1>) -> index ++-// CHECK: %[[VAL_6:.*]] = fir.do_loop %[[VAL_7:.*]] = %[[VAL_2]] to %[[VAL_5]] step %[[VAL_2]] unordered iter_args(%[[VAL_8:.*]] = %[[VAL_3]]) -> (i32) { ++-// CHECK: %[[VAL_9:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_7]] : (!hlfir.expr, index) -> i16 ++-// CHECK: %[[VAL_10:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_7]] : (!hlfir.expr, index) -> i32 ++-// CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_9]] : (i16) -> i32 ++-// CHECK: %[[VAL_12:.*]] = arith.muli %[[VAL_11]], %[[VAL_10]] : i32 ++-// CHECK: %[[VAL_13:.*]] = arith.addi %[[VAL_8]], %[[VAL_12]] : i32 ++-// CHECK: fir.result %[[VAL_13]] : i32 ++-// CHECK: } ++-// CHECK: return %[[VAL_6]] : i32 ++-// CHECK: } ++- 
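
Before the floating-point variant below, it is worth spelling out what the CHECK lines above encode: for integer operands the retired DotProductConversion emitted an unordered fir.do_loop carrying one iter_args accumulator, converting the narrower element up before the multiply-accumulate. A plain-C++ sketch of that reduction (an illustration of the lowering, not the compiler's actual output):

#include <cstdint>
#include <vector>

// One accumulator, i16 element widened to i32 before multiply-accumulate,
// mirroring the fir.convert + arith.muli + arith.addi sequence checked above.
int32_t dotProduct(const std::vector<int16_t> &lhs,
                   const std::vector<int32_t> &rhs) {
  int32_t acc = 0;  // reduction init value
  for (std::size_t i = 0; i < lhs.size() && i < rhs.size(); ++i)
    acc += static_cast<int32_t>(lhs[i]) * rhs[i];
  return acc;
}

int main() {
  std::vector<int16_t> a{1, 2, 3};
  std::vector<int32_t> b{4, 5, 6};
  return dotProduct(a, b) == 32 ? 0 : 1;
}

Integer addition is associative, hence the unordered on the loop above; the f32 test that follows keeps the loop ordered unless reassociating fast-math flags are set.
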
++-func.func @dot_product_real(%arg0: !hlfir.expr, %arg1: !hlfir.expr) -> f32 { ++- %res = hlfir.dot_product %arg0 %arg1 : (!hlfir.expr, !hlfir.expr) -> f32 ++- return %res : f32 ++-} ++-// CHECK-LABEL: func.func @dot_product_real( ++-// CHECK-SAME: %[[VAL_0:.*]]: !hlfir.expr, ++-// CHECK-SAME: %[[VAL_1:.*]]: !hlfir.expr) -> f32 { ++-// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index ++-// CHECK: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32 ++-// CHECK: %[[VAL_4:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr) -> !fir.shape<1> ++-// CHECK: %[[VAL_5:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 0 : index} : (!fir.shape<1>) -> index ++-// CHECK: %[[VAL_6:.*]] = fir.do_loop %[[VAL_7:.*]] = %[[VAL_2]] to %[[VAL_5]] step %[[VAL_2]] iter_args(%[[VAL_8:.*]] = %[[VAL_3]]) -> (f32) { ++-// CHECK: %[[VAL_9:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_7]] : (!hlfir.expr, index) -> f32 ++-// CHECK: %[[VAL_10:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_7]] : (!hlfir.expr, index) -> f16 ++-// CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_10]] : (f16) -> f32 ++-// CHECK: %[[VAL_12:.*]] = arith.mulf %[[VAL_9]], %[[VAL_11]] : f32 ++-// CHECK: %[[VAL_13:.*]] = arith.addf %[[VAL_8]], %[[VAL_12]] : f32 ++-// CHECK: fir.result %[[VAL_13]] : f32 ++-// CHECK: } ++-// CHECK: return %[[VAL_6]] : f32 ++-// CHECK: } ++- ++-func.func @dot_product_complex(%arg0: !hlfir.expr>, %arg1: !hlfir.expr>) -> complex { ++- %res = hlfir.dot_product %arg0 %arg1 : (!hlfir.expr>, !hlfir.expr>) -> complex ++- return %res : complex ++-} ++-// CHECK-LABEL: func.func @dot_product_complex( ++-// CHECK-SAME: %[[VAL_0:.*]]: !hlfir.expr>, ++-// CHECK-SAME: %[[VAL_1:.*]]: !hlfir.expr>) -> complex { ++-// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index ++-// CHECK: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32 ++-// CHECK: %[[VAL_4:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr>) -> !fir.shape<1> ++-// CHECK: %[[VAL_5:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 0 : index} : (!fir.shape<1>) -> index ++-// CHECK: %[[VAL_6:.*]] = fir.undefined complex ++-// CHECK: %[[VAL_7:.*]] = fir.insert_value %[[VAL_6]], %[[VAL_3]], [0 : index] : (complex, f32) -> complex ++-// CHECK: %[[VAL_8:.*]] = fir.insert_value %[[VAL_7]], %[[VAL_3]], [1 : index] : (complex, f32) -> complex ++-// CHECK: %[[VAL_9:.*]] = fir.do_loop %[[VAL_10:.*]] = %[[VAL_2]] to %[[VAL_5]] step %[[VAL_2]] iter_args(%[[VAL_11:.*]] = %[[VAL_8]]) -> (complex) { ++-// CHECK: %[[VAL_12:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_10]] : (!hlfir.expr>, index) -> complex ++-// CHECK: %[[VAL_13:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_10]] : (!hlfir.expr>, index) -> complex ++-// CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_13]] : (complex) -> complex ++-// CHECK: %[[VAL_15:.*]] = fir.extract_value %[[VAL_12]], [1 : index] : (complex) -> f32 ++-// CHECK: %[[VAL_16:.*]] = arith.negf %[[VAL_15]] : f32 ++-// CHECK: %[[VAL_17:.*]] = fir.insert_value %[[VAL_12]], %[[VAL_16]], [1 : index] : (complex, f32) -> complex ++-// CHECK: %[[VAL_18:.*]] = fir.mulc %[[VAL_17]], %[[VAL_14]] : complex ++-// CHECK: %[[VAL_19:.*]] = fir.addc %[[VAL_11]], %[[VAL_18]] : complex ++-// CHECK: fir.result %[[VAL_19]] : complex ++-// CHECK: } ++-// CHECK: return %[[VAL_9]] : complex ++-// CHECK: } +- -+- Operation *useOp = -+- createOp(&context, /*operands=*/std::nullopt, builder.getIntegerType(16)); -+- // Valid construction despite a temporary 'OpResult'. 
-+- ValueRange operands = useOp->getResult(0); ++-func.func @dot_product_real_complex(%arg0: !hlfir.expr, %arg1: !hlfir.expr>) -> complex { ++- %res = hlfir.dot_product %arg0 %arg1 : (!hlfir.expr, !hlfir.expr>) -> complex ++- return %res : complex ++-} ++-// CHECK-LABEL: func.func @dot_product_real_complex( ++-// CHECK-SAME: %[[VAL_0:.*]]: !hlfir.expr, ++-// CHECK-SAME: %[[VAL_1:.*]]: !hlfir.expr>) -> complex { ++-// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index ++-// CHECK: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32 ++-// CHECK: %[[VAL_4:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr) -> !fir.shape<1> ++-// CHECK: %[[VAL_5:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 0 : index} : (!fir.shape<1>) -> index ++-// CHECK: %[[VAL_6:.*]] = fir.undefined complex ++-// CHECK: %[[VAL_7:.*]] = fir.insert_value %[[VAL_6]], %[[VAL_3]], [0 : index] : (complex, f32) -> complex ++-// CHECK: %[[VAL_8:.*]] = fir.insert_value %[[VAL_7]], %[[VAL_3]], [1 : index] : (complex, f32) -> complex ++-// CHECK: %[[VAL_9:.*]] = fir.do_loop %[[VAL_10:.*]] = %[[VAL_2]] to %[[VAL_5]] step %[[VAL_2]] iter_args(%[[VAL_11:.*]] = %[[VAL_8]]) -> (complex) { ++-// CHECK: %[[VAL_12:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_10]] : (!hlfir.expr, index) -> f32 ++-// CHECK: %[[VAL_13:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_10]] : (!hlfir.expr>, index) -> complex ++-// CHECK: %[[VAL_14:.*]] = fir.undefined complex ++-// CHECK: %[[VAL_15:.*]] = fir.insert_value %[[VAL_14]], %[[VAL_3]], [0 : index] : (complex, f32) -> complex ++-// CHECK: %[[VAL_16:.*]] = fir.insert_value %[[VAL_15]], %[[VAL_3]], [1 : index] : (complex, f32) -> complex ++-// CHECK: %[[VAL_17:.*]] = fir.insert_value %[[VAL_16]], %[[VAL_12]], [0 : index] : (complex, f32) -> complex ++-// CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_13]] : (complex) -> complex ++-// CHECK: %[[VAL_19:.*]] = fir.extract_value %[[VAL_17]], [1 : index] : (complex) -> f32 ++-// CHECK: %[[VAL_20:.*]] = arith.negf %[[VAL_19]] : f32 ++-// CHECK: %[[VAL_21:.*]] = fir.insert_value %[[VAL_17]], %[[VAL_20]], [1 : index] : (complex, f32) -> complex ++-// CHECK: %[[VAL_22:.*]] = fir.mulc %[[VAL_21]], %[[VAL_18]] : complex ++-// CHECK: %[[VAL_23:.*]] = fir.addc %[[VAL_11]], %[[VAL_22]] : complex ++-// CHECK: fir.result %[[VAL_23]] : complex ++-// CHECK: } ++-// CHECK: return %[[VAL_9]] : complex ++-// CHECK: } ++- ++-func.func @dot_product_logical(%arg0: !hlfir.expr>, %arg1: !hlfir.expr>) -> !fir.logical<4> { ++- %res = hlfir.dot_product %arg0 %arg1 : (!hlfir.expr>, !hlfir.expr>) -> !fir.logical<4> ++- return %res : !fir.logical<4> ++-} ++-// CHECK-LABEL: func.func @dot_product_logical( ++-// CHECK-SAME: %[[VAL_0:.*]]: !hlfir.expr>, ++-// CHECK-SAME: %[[VAL_1:.*]]: !hlfir.expr>) -> !fir.logical<4> { ++-// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index ++-// CHECK: %[[VAL_3:.*]] = arith.constant false ++-// CHECK: %[[VAL_4:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr>) -> !fir.shape<1> ++-// CHECK: %[[VAL_5:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 0 : index} : (!fir.shape<1>) -> index ++-// CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_3]] : (i1) -> !fir.logical<4> ++-// CHECK: %[[VAL_7:.*]] = fir.do_loop %[[VAL_8:.*]] = %[[VAL_2]] to %[[VAL_5]] step %[[VAL_2]] unordered iter_args(%[[VAL_9:.*]] = %[[VAL_6]]) -> (!fir.logical<4>) { ++-// CHECK: %[[VAL_10:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_8]] : (!hlfir.expr>, index) -> !fir.logical<1> ++-// CHECK: %[[VAL_11:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_8]] : (!hlfir.expr>, index) -> !fir.logical<4> ++-// CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_9]] : 
(!fir.logical<4>) -> i1 ++-// CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_10]] : (!fir.logical<1>) -> i1 ++-// CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_11]] : (!fir.logical<4>) -> i1 ++-// CHECK: %[[VAL_15:.*]] = arith.andi %[[VAL_13]], %[[VAL_14]] : i1 ++-// CHECK: %[[VAL_16:.*]] = arith.ori %[[VAL_12]], %[[VAL_15]] : i1 ++-// CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (i1) -> !fir.logical<4> ++-// CHECK: fir.result %[[VAL_17]] : !fir.logical<4> ++-// CHECK: } ++-// CHECK: return %[[VAL_7]] : !fir.logical<4> ++-// CHECK: } ++- ++-func.func @dot_product_known_dim(%arg0: !hlfir.expr<10xf32>, %arg1: !hlfir.expr) -> f32 { ++- %res1 = hlfir.dot_product %arg0 %arg1 : (!hlfir.expr<10xf32>, !hlfir.expr) -> f32 ++- %res2 = hlfir.dot_product %arg1 %arg0 : (!hlfir.expr, !hlfir.expr<10xf32>) -> f32 ++- %res = arith.addf %res1, %res2 : f32 ++- return %res : f32 ++-} ++-// CHECK-LABEL: func.func @dot_product_known_dim( ++-// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index ++-// CHECK: %[[VAL_4:.*]] = arith.constant 10 : index ++-// CHECK: fir.do_loop %{{.*}} = %[[VAL_2]] to %[[VAL_4]] step %[[VAL_2]] ++-// CHECK: fir.do_loop %{{.*}} = %[[VAL_2]] to %[[VAL_4]] step %[[VAL_2]] ++diff -ruN --strip-trailing-cr a/libcxx/include/__config b/libcxx/include/__config ++--- a/libcxx/include/__config +++++ b/libcxx/include/__config ++@@ -1166,9 +1166,7 @@ ++ # define _LIBCPP_NOESCAPE ++ # endif ++ ++-// FIXME: Expand this to [[__gnu__::__nodebug__]] again once the testcase reported in ++-// https://github.com/llvm/llvm-project/pull/118710 has been analyzed ++-# define _LIBCPP_NODEBUG +++# define _LIBCPP_NODEBUG [[__gnu__::__nodebug__]] ++ ++ # if __has_attribute(__standalone_debug__) ++ # define _LIBCPP_STANDALONE_DEBUG __attribute__((__standalone_debug__)) ++diff -ruN --strip-trailing-cr a/libcxx/test/tools/clang_tidy_checks/libcpp_module.cpp b/libcxx/test/tools/clang_tidy_checks/libcpp_module.cpp ++--- a/libcxx/test/tools/clang_tidy_checks/libcpp_module.cpp +++++ b/libcxx/test/tools/clang_tidy_checks/libcpp_module.cpp ++@@ -27,7 +27,7 @@ ++ check_factories.registerCheck("libcpp-header-exportable-declarations"); ++ check_factories.registerCheck("libcpp-hide-from-abi"); ++ check_factories.registerCheck("libcpp-internal-ftms"); ++- // check_factories.registerCheck("libcpp-nodebug-on-aliases"); +++ check_factories.registerCheck("libcpp-nodebug-on-aliases"); ++ check_factories.registerCheck("libcpp-cpp-version-check"); ++ check_factories.registerCheck("libcpp-robust-against-adl"); ++ check_factories.registerCheck("libcpp-uglify-attributes"); ++diff -ruN --strip-trailing-cr a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp ++--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp ++@@ -1140,8 +1140,6 @@ ++ ++ setTargetDAGCombine(ISD::SCALAR_TO_VECTOR); ++ ++- setTargetDAGCombine(ISD::SHL); +- -+- useOp->setOperands(operands); -+- EXPECT_EQ(useOp->getNumOperands(), 1u); -+- EXPECT_EQ(useOp->getOperand(0), useOp->getResult(0)); ++ // In case of strict alignment, avoid an excessive number of byte wide stores. ++ MaxStoresPerMemsetOptSize = 8; ++ MaxStoresPerMemset = ++@@ -26473,43 +26471,6 @@ ++ return NVCAST; + } + +--TEST(ValueRangeTest, ValueConstructable) { +-- MLIRContext context; +-- Builder builder(&context); ++-/// If the operand is a bitwise AND with a constant RHS, and the shift has a ++-/// constant RHS and is the only use, we can pull it out of the shift, i.e. 
++-/// ++-/// (shl (and X, C1), C2) -> (and (shl X, C2), (shl C1, C2)) ++-/// ++-/// We prefer this canonical form to match existing isel patterns. ++-static SDValue performSHLCombine(SDNode *N, ++- TargetLowering::DAGCombinerInfo &DCI, ++- SelectionDAG &DAG) { ++- if (DCI.isBeforeLegalizeOps()) ++- return SDValue(); + - +-- Operation *useOp = +-- createOp(&context, /*operands=*/std::nullopt, builder.getIntegerType(16)); +-- // Valid construction despite a temporary 'OpResult'. +-- ValueRange operands = useOp->getResult(0); ++- SDValue Op0 = N->getOperand(0); ++- if (Op0.getOpcode() != ISD::AND || !Op0.hasOneUse()) ++- return SDValue(); + - +-- useOp->setOperands(operands); +-- EXPECT_EQ(useOp->getNumOperands(), 1u); +-- EXPECT_EQ(useOp->getOperand(0), useOp->getResult(0)); ++- SDValue C1 = Op0->getOperand(1); ++- SDValue C2 = N->getOperand(1); ++- if (!isa(C1) || !isa(C2)) ++- return SDValue(); + - +-- useOp->dropAllUses(); +-- useOp->destroy(); ++- // Might be folded into shifted op, do not lower. ++- if (N->hasOneUse()) { ++- unsigned UseOpc = N->user_begin()->getOpcode(); ++- if (UseOpc == ISD::ADD || UseOpc == ISD::SUB || UseOpc == ISD::SETCC || ++- UseOpc == AArch64ISD::ADDS || UseOpc == AArch64ISD::SUBS) ++- return SDValue(); ++- } +- -+- useOp->dropAllUses(); -+- useOp->destroy(); ++- SDLoc DL(N); ++- EVT VT = N->getValueType(0); ++- SDValue X = Op0->getOperand(0); ++- SDValue NewRHS = DAG.getNode(ISD::SHL, DL, VT, C1, C2); ++- SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, X, C2); ++- return DAG.getNode(ISD::AND, DL, VT, NewShift, NewRHS); +-} +- -+ } // namespace -+diff -ruN --strip-trailing-cr a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel -+--- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel -++++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel -+@@ -43,10 +43,7 @@ -+ -+ gentbl( -+ name = "diagnostic_defs_gen", -+- tbl_outs = [( -+- "-gen-clang-diags-defs -clang-component=%s" % c, -+- "include/clang/Basic/Diagnostic%sKinds.inc" % c, -+- ) for c in [ -++ tbl_outs = [out for c in [ -+ "AST", -+ "Analysis", -+ "Comment", -+@@ -60,6 +57,15 @@ -+ "Refactoring", -+ "Sema", -+ "Serialization", -++ ] for out in [ -++ ( -++ "-gen-clang-diags-defs -clang-component=%s" % c, -++ "include/clang/Basic/Diagnostic%sKinds.inc" % c, -++ ), -++ ( -++ "-gen-clang-diags-enums -clang-component=%s" % c, -++ "include/clang/Basic/Diagnostic%sEnums.inc" % c, -++ ), -+ ]] + [ -+ ( -+ "-gen-clang-diag-groups", ++ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, ++ DAGCombinerInfo &DCI) const { ++ SelectionDAG &DAG = DCI.DAG; ++@@ -26855,8 +26816,6 @@ ++ return performCTLZCombine(N, DAG, Subtarget); ++ case ISD::SCALAR_TO_VECTOR: ++ return performScalarToVectorCombine(N, DCI, DAG); ++- case ISD::SHL: ++- return performSHLCombine(N, DCI, DAG); ++ } ++ return SDValue(); ++ } ++diff -ruN --strip-trailing-cr a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp ++--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp ++@@ -4979,7 +4979,7 @@ ++ // the subvector length. 
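
The one-line SLPVectorizer change above is about how much of the shuffle mask receives the identity mapping before the subvector lanes are overlaid. A standalone sketch of the mask construction, reusing the hunk's names for clarity (an illustration, not the LLVM sources):

#include <cstddef>
#include <numeric>
#include <vector>

// Lane i of a VecVF-wide shuffle picks element Mask[i]: indices 0..VecVF-1
// refer to the original vector, VecVF.. to the subvector inserted at Index.
std::vector<int> insertSubvectorMask(std::size_t VecVF, std::size_t SubVecVF,
                                     std::size_t Index) {
  std::vector<int> Mask(VecVF);
  // Identity over the whole mask, so lanes past the inserted subvector keep
  // the original vector's elements; the earlier code filled only [0, Index)
  // and left the tail as poison sentinels.
  std::iota(Mask.begin(), Mask.end(), 0);
  for (std::size_t I = 0; I < SubVecVF; ++I)
    Mask[I + Index] = static_cast<int>(I + VecVF);
  return Mask;
}

int main() {
  // VecVF=8, SubVecVF=2, Index=4 -> {0, 1, 2, 3, 8, 9, 6, 7}
  std::vector<int> m = insertSubvectorMask(8, 2, 4);
  std::vector<int> expect{0, 1, 2, 3, 8, 9, 6, 7};
  return m == expect ? 0 : 1;
}
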
++ const unsigned VecVF = getNumElements(Vec->getType()); ++ SmallVector Mask(VecVF, PoisonMaskElem); ++- std::iota(Mask.begin(), std::next(Mask.begin(), Index), 0); +++ std::iota(Mask.begin(), Mask.end(), 0); ++ for (unsigned I : seq(SubVecVF)) ++ Mask[I + Index] = I + VecVF; ++ if (Generator) { ++@@ -13956,11 +13956,12 @@ ++ Instruction *InsElt; ++ if (auto *VecTy = dyn_cast(Scalar->getType())) { ++ assert(SLPReVec && "FixedVectorType is not expected."); ++- Vec = InsElt = cast(createInsertVector( ++- Builder, Vec, Scalar, Pos * getNumElements(VecTy))); ++- auto *II = dyn_cast(InsElt); +++ Vec = +++ createInsertVector(Builder, Vec, Scalar, Pos * getNumElements(VecTy)); +++ auto *II = dyn_cast(Vec); ++ if (!II || II->getIntrinsicID() != Intrinsic::vector_insert) ++ return Vec; +++ InsElt = II; ++ } else { ++ Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos)); ++ InsElt = dyn_cast(Vec); ++diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/AArch64/const-shift-of-constmasked.ll b/llvm/test/CodeGen/AArch64/const-shift-of-constmasked.ll ++--- a/llvm/test/CodeGen/AArch64/const-shift-of-constmasked.ll +++++ b/llvm/test/CodeGen/AArch64/const-shift-of-constmasked.ll ++@@ -190,7 +190,8 @@ ++ define i8 @test_i8_7_mask_shl_1(i8 %a0) { ++ ; CHECK-LABEL: test_i8_7_mask_shl_1: ++ ; CHECK: // %bb.0: ++-; CHECK-NEXT: ubfiz w0, w0, #1, #3 +++; CHECK-NEXT: and w8, w0, #0x7 +++; CHECK-NEXT: lsl w0, w8, #1 ++ ; CHECK-NEXT: ret ++ %t0 = and i8 %a0, 7 ++ %t1 = shl i8 %t0, 1 ++@@ -199,7 +200,8 @@ ++ define i8 @test_i8_7_mask_shl_4(i8 %a0) { ++ ; CHECK-LABEL: test_i8_7_mask_shl_4: ++ ; CHECK: // %bb.0: ++-; CHECK-NEXT: ubfiz w0, w0, #4, #3 +++; CHECK-NEXT: and w8, w0, #0x7 +++; CHECK-NEXT: lsl w0, w8, #4 ++ ; CHECK-NEXT: ret ++ %t0 = and i8 %a0, 7 ++ %t1 = shl i8 %t0, 4 ++@@ -227,8 +229,8 @@ ++ define i8 @test_i8_28_mask_shl_1(i8 %a0) { ++ ; CHECK-LABEL: test_i8_28_mask_shl_1: ++ ; CHECK: // %bb.0: ++-; CHECK-NEXT: lsl w8, w0, #1 ++-; CHECK-NEXT: and w0, w8, #0x38 +++; CHECK-NEXT: and w8, w0, #0x1c +++; CHECK-NEXT: lsl w0, w8, #1 ++ ; CHECK-NEXT: ret ++ %t0 = and i8 %a0, 28 ++ %t1 = shl i8 %t0, 1 ++@@ -237,8 +239,8 @@ ++ define i8 @test_i8_28_mask_shl_2(i8 %a0) { ++ ; CHECK-LABEL: test_i8_28_mask_shl_2: ++ ; CHECK: // %bb.0: ++-; CHECK-NEXT: lsl w8, w0, #2 ++-; CHECK-NEXT: and w0, w8, #0x70 +++; CHECK-NEXT: and w8, w0, #0x1c +++; CHECK-NEXT: lsl w0, w8, #2 ++ ; CHECK-NEXT: ret ++ %t0 = and i8 %a0, 28 ++ %t1 = shl i8 %t0, 2 ++@@ -247,8 +249,8 @@ ++ define i8 @test_i8_28_mask_shl_3(i8 %a0) { ++ ; CHECK-LABEL: test_i8_28_mask_shl_3: ++ ; CHECK: // %bb.0: ++-; CHECK-NEXT: lsl w8, w0, #3 ++-; CHECK-NEXT: and w0, w8, #0xe0 +++; CHECK-NEXT: and w8, w0, #0x1c +++; CHECK-NEXT: lsl w0, w8, #3 ++ ; CHECK-NEXT: ret ++ %t0 = and i8 %a0, 28 ++ %t1 = shl i8 %t0, 3 ++@@ -257,8 +259,8 @@ ++ define i8 @test_i8_28_mask_shl_4(i8 %a0) { ++ ; CHECK-LABEL: test_i8_28_mask_shl_4: ++ ; CHECK: // %bb.0: ++-; CHECK-NEXT: lsl w8, w0, #4 ++-; CHECK-NEXT: and w0, w8, #0xc0 +++; CHECK-NEXT: and w8, w0, #0xc +++; CHECK-NEXT: lsl w0, w8, #4 ++ ; CHECK-NEXT: ret ++ %t0 = and i8 %a0, 28 ++ %t1 = shl i8 %t0, 4 ++@@ -268,8 +270,8 @@ ++ define i8 @test_i8_224_mask_shl_1(i8 %a0) { ++ ; CHECK-LABEL: test_i8_224_mask_shl_1: ++ ; CHECK: // %bb.0: ++-; CHECK-NEXT: lsl w8, w0, #1 ++-; CHECK-NEXT: and w0, w8, #0xc0 +++; CHECK-NEXT: and w8, w0, #0x60 +++; CHECK-NEXT: lsl w0, w8, #1 ++ ; CHECK-NEXT: ret ++ %t0 = and i8 %a0, 224 ++ %t1 = shl i8 %t0, 1 ++@@ -463,7 +465,8 @@ ++ define i16 @test_i16_127_mask_shl_1(i16 %a0) { ++ ; CHECK-LABEL: 
test_i16_127_mask_shl_1: ++ ; CHECK: // %bb.0: ++-; CHECK-NEXT: ubfiz w0, w0, #1, #7 +++; CHECK-NEXT: and w8, w0, #0x7f +++; CHECK-NEXT: lsl w0, w8, #1 ++ ; CHECK-NEXT: ret ++ %t0 = and i16 %a0, 127 ++ %t1 = shl i16 %t0, 1 ++@@ -472,7 +475,8 @@ ++ define i16 @test_i16_127_mask_shl_8(i16 %a0) { ++ ; CHECK-LABEL: test_i16_127_mask_shl_8: ++ ; CHECK: // %bb.0: ++-; CHECK-NEXT: ubfiz w0, w0, #8, #7 +++; CHECK-NEXT: and w8, w0, #0x7f +++; CHECK-NEXT: lsl w0, w8, #8 ++ ; CHECK-NEXT: ret ++ %t0 = and i16 %a0, 127 ++ %t1 = shl i16 %t0, 8 ++@@ -500,8 +504,8 @@ ++ define i16 @test_i16_2032_mask_shl_3(i16 %a0) { ++ ; CHECK-LABEL: test_i16_2032_mask_shl_3: ++ ; CHECK: // %bb.0: ++-; CHECK-NEXT: lsl w8, w0, #3 ++-; CHECK-NEXT: and w0, w8, #0x3f80 +++; CHECK-NEXT: and w8, w0, #0x7f0 +++; CHECK-NEXT: lsl w0, w8, #3 ++ ; CHECK-NEXT: ret ++ %t0 = and i16 %a0, 2032 ++ %t1 = shl i16 %t0, 3 ++@@ -510,8 +514,8 @@ ++ define i16 @test_i16_2032_mask_shl_4(i16 %a0) { ++ ; CHECK-LABEL: test_i16_2032_mask_shl_4: ++ ; CHECK: // %bb.0: ++-; CHECK-NEXT: lsl w8, w0, #4 ++-; CHECK-NEXT: and w0, w8, #0x7f00 +++; CHECK-NEXT: and w8, w0, #0x7f0 +++; CHECK-NEXT: lsl w0, w8, #4 ++ ; CHECK-NEXT: ret ++ %t0 = and i16 %a0, 2032 ++ %t1 = shl i16 %t0, 4 ++@@ -520,8 +524,8 @@ ++ define i16 @test_i16_2032_mask_shl_5(i16 %a0) { ++ ; CHECK-LABEL: test_i16_2032_mask_shl_5: ++ ; CHECK: // %bb.0: ++-; CHECK-NEXT: lsl w8, w0, #5 ++-; CHECK-NEXT: and w0, w8, #0xfe00 +++; CHECK-NEXT: and w8, w0, #0x7f0 +++; CHECK-NEXT: lsl w0, w8, #5 ++ ; CHECK-NEXT: ret ++ %t0 = and i16 %a0, 2032 ++ %t1 = shl i16 %t0, 5 ++@@ -530,8 +534,8 @@ ++ define i16 @test_i16_2032_mask_shl_6(i16 %a0) { ++ ; CHECK-LABEL: test_i16_2032_mask_shl_6: ++ ; CHECK: // %bb.0: ++-; CHECK-NEXT: lsl w8, w0, #6 ++-; CHECK-NEXT: and w0, w8, #0xfc00 +++; CHECK-NEXT: and w8, w0, #0x3f0 +++; CHECK-NEXT: lsl w0, w8, #6 ++ ; CHECK-NEXT: ret ++ %t0 = and i16 %a0, 2032 ++ %t1 = shl i16 %t0, 6 ++@@ -541,8 +545,8 @@ ++ define i16 @test_i16_65024_mask_shl_1(i16 %a0) { ++ ; CHECK-LABEL: test_i16_65024_mask_shl_1: ++ ; CHECK: // %bb.0: ++-; CHECK-NEXT: lsl w8, w0, #1 ++-; CHECK-NEXT: and w0, w8, #0xfc00 +++; CHECK-NEXT: and w8, w0, #0x7e00 +++; CHECK-NEXT: lsl w0, w8, #1 ++ ; CHECK-NEXT: ret ++ %t0 = and i16 %a0, 65024 ++ %t1 = shl i16 %t0, 1 ++@@ -736,7 +740,8 @@ ++ define i32 @test_i32_32767_mask_shl_1(i32 %a0) { ++ ; CHECK-LABEL: test_i32_32767_mask_shl_1: ++ ; CHECK: // %bb.0: ++-; CHECK-NEXT: ubfiz w0, w0, #1, #15 +++; CHECK-NEXT: and w8, w0, #0x7fff +++; CHECK-NEXT: lsl w0, w8, #1 ++ ; CHECK-NEXT: ret ++ %t0 = and i32 %a0, 32767 ++ %t1 = shl i32 %t0, 1 ++@@ -745,7 +750,8 @@ ++ define i32 @test_i32_32767_mask_shl_16(i32 %a0) { ++ ; CHECK-LABEL: test_i32_32767_mask_shl_16: ++ ; CHECK: // %bb.0: ++-; CHECK-NEXT: ubfiz w0, w0, #16, #15 +++; CHECK-NEXT: and w8, w0, #0x7fff +++; CHECK-NEXT: lsl w0, w8, #16 ++ ; CHECK-NEXT: ret ++ %t0 = and i32 %a0, 32767 ++ %t1 = shl i32 %t0, 16 ++@@ -773,8 +779,8 @@ ++ define i32 @test_i32_8388352_mask_shl_7(i32 %a0) { ++ ; CHECK-LABEL: test_i32_8388352_mask_shl_7: ++ ; CHECK: // %bb.0: ++-; CHECK-NEXT: lsl w8, w0, #7 ++-; CHECK-NEXT: and w0, w8, #0x3fff8000 +++; CHECK-NEXT: and w8, w0, #0x7fff00 +++; CHECK-NEXT: lsl w0, w8, #7 ++ ; CHECK-NEXT: ret ++ %t0 = and i32 %a0, 8388352 ++ %t1 = shl i32 %t0, 7 ++@@ -783,8 +789,8 @@ ++ define i32 @test_i32_8388352_mask_shl_8(i32 %a0) { ++ ; CHECK-LABEL: test_i32_8388352_mask_shl_8: ++ ; CHECK: // %bb.0: ++-; CHECK-NEXT: lsl w8, w0, #8 ++-; CHECK-NEXT: and w0, w8, #0x7fff0000 +++; CHECK-NEXT: and w8, w0, #0x7fff00 +++; 
CHECK-NEXT: lsl w0, w8, #8 ++ ; CHECK-NEXT: ret ++ %t0 = and i32 %a0, 8388352 ++ %t1 = shl i32 %t0, 8 ++@@ -793,8 +799,8 @@ ++ define i32 @test_i32_8388352_mask_shl_9(i32 %a0) { ++ ; CHECK-LABEL: test_i32_8388352_mask_shl_9: ++ ; CHECK: // %bb.0: ++-; CHECK-NEXT: lsl w8, w0, #9 ++-; CHECK-NEXT: and w0, w8, #0xfffe0000 +++; CHECK-NEXT: and w8, w0, #0x7fff00 +++; CHECK-NEXT: lsl w0, w8, #9 ++ ; CHECK-NEXT: ret ++ %t0 = and i32 %a0, 8388352 ++ %t1 = shl i32 %t0, 9 ++@@ -803,8 +809,8 @@ ++ define i32 @test_i32_8388352_mask_shl_10(i32 %a0) { ++ ; CHECK-LABEL: test_i32_8388352_mask_shl_10: ++ ; CHECK: // %bb.0: ++-; CHECK-NEXT: lsl w8, w0, #10 ++-; CHECK-NEXT: and w0, w8, #0xfffc0000 +++; CHECK-NEXT: and w8, w0, #0x3fff00 +++; CHECK-NEXT: lsl w0, w8, #10 ++ ; CHECK-NEXT: ret ++ %t0 = and i32 %a0, 8388352 ++ %t1 = shl i32 %t0, 10 ++@@ -814,8 +820,8 @@ ++ define i32 @test_i32_4294836224_mask_shl_1(i32 %a0) { ++ ; CHECK-LABEL: test_i32_4294836224_mask_shl_1: ++ ; CHECK: // %bb.0: ++-; CHECK-NEXT: lsl w8, w0, #1 ++-; CHECK-NEXT: and w0, w8, #0xfffc0000 +++; CHECK-NEXT: and w8, w0, #0x7ffe0000 +++; CHECK-NEXT: lsl w0, w8, #1 ++ ; CHECK-NEXT: ret ++ %t0 = and i32 %a0, 4294836224 ++ %t1 = shl i32 %t0, 1 ++@@ -1009,7 +1015,8 @@ ++ define i64 @test_i64_2147483647_mask_shl_1(i64 %a0) { ++ ; CHECK-LABEL: test_i64_2147483647_mask_shl_1: ++ ; CHECK: // %bb.0: ++-; CHECK-NEXT: lsl w0, w0, #1 +++; CHECK-NEXT: and x8, x0, #0x7fffffff +++; CHECK-NEXT: lsl x0, x8, #1 ++ ; CHECK-NEXT: ret ++ %t0 = and i64 %a0, 2147483647 ++ %t1 = shl i64 %t0, 1 ++@@ -1047,8 +1054,8 @@ ++ define i64 @test_i64_140737488289792_mask_shl_15(i64 %a0) { ++ ; CHECK-LABEL: test_i64_140737488289792_mask_shl_15: ++ ; CHECK: // %bb.0: ++-; CHECK-NEXT: lsl x8, x0, #15 ++-; CHECK-NEXT: and x0, x8, #0x3fffffff80000000 +++; CHECK-NEXT: and x8, x0, #0x7fffffff0000 +++; CHECK-NEXT: lsl x0, x8, #15 ++ ; CHECK-NEXT: ret ++ %t0 = and i64 %a0, 140737488289792 ++ %t1 = shl i64 %t0, 15 ++@@ -1057,8 +1064,8 @@ ++ define i64 @test_i64_140737488289792_mask_shl_16(i64 %a0) { ++ ; CHECK-LABEL: test_i64_140737488289792_mask_shl_16: ++ ; CHECK: // %bb.0: ++-; CHECK-NEXT: lsl x8, x0, #16 ++-; CHECK-NEXT: and x0, x8, #0x7fffffff00000000 +++; CHECK-NEXT: and x8, x0, #0x7fffffff0000 +++; CHECK-NEXT: lsl x0, x8, #16 ++ ; CHECK-NEXT: ret ++ %t0 = and i64 %a0, 140737488289792 ++ %t1 = shl i64 %t0, 16 ++@@ -1067,8 +1074,8 @@ ++ define i64 @test_i64_140737488289792_mask_shl_17(i64 %a0) { ++ ; CHECK-LABEL: test_i64_140737488289792_mask_shl_17: ++ ; CHECK: // %bb.0: ++-; CHECK-NEXT: lsl x8, x0, #17 ++-; CHECK-NEXT: and x0, x8, #0xfffffffe00000000 +++; CHECK-NEXT: and x8, x0, #0x7fffffff0000 +++; CHECK-NEXT: lsl x0, x8, #17 ++ ; CHECK-NEXT: ret ++ %t0 = and i64 %a0, 140737488289792 ++ %t1 = shl i64 %t0, 17 ++@@ -1077,8 +1084,8 @@ ++ define i64 @test_i64_140737488289792_mask_shl_18(i64 %a0) { ++ ; CHECK-LABEL: test_i64_140737488289792_mask_shl_18: ++ ; CHECK: // %bb.0: ++-; CHECK-NEXT: lsl x8, x0, #18 ++-; CHECK-NEXT: and x0, x8, #0xfffffffc00000000 +++; CHECK-NEXT: and x8, x0, #0x3fffffff0000 +++; CHECK-NEXT: lsl x0, x8, #18 ++ ; CHECK-NEXT: ret ++ %t0 = and i64 %a0, 140737488289792 ++ %t1 = shl i64 %t0, 18 ++@@ -1088,8 +1095,8 @@ ++ define i64 @test_i64_18446744065119617024_mask_shl_1(i64 %a0) { ++ ; CHECK-LABEL: test_i64_18446744065119617024_mask_shl_1: ++ ; CHECK: // %bb.0: ++-; CHECK-NEXT: lsl x8, x0, #1 ++-; CHECK-NEXT: and x0, x8, #0xfffffffc00000000 +++; CHECK-NEXT: and x8, x0, #0x7ffffffe00000000 +++; CHECK-NEXT: lsl x0, x8, #1 ++ ; CHECK-NEXT: ret ++ %t0 = and i64 
%a0, 18446744065119617024 ++ %t1 = shl i64 %t0, 1 ++diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/AArch64/extract-bits.ll b/llvm/test/CodeGen/AArch64/extract-bits.ll ++--- a/llvm/test/CodeGen/AArch64/extract-bits.ll +++++ b/llvm/test/CodeGen/AArch64/extract-bits.ll ++@@ -1013,8 +1013,8 @@ ++ define i32 @c2_i32(i32 %arg) nounwind { ++ ; CHECK-LABEL: c2_i32: ++ ; CHECK: // %bb.0: ++-; CHECK-NEXT: lsr w8, w0, #17 ++-; CHECK-NEXT: and w0, w8, #0xffc +++; CHECK-NEXT: ubfx w8, w0, #19, #10 +++; CHECK-NEXT: lsl w0, w8, #2 ++ ; CHECK-NEXT: ret ++ %tmp0 = lshr i32 %arg, 19 ++ %tmp1 = and i32 %tmp0, 1023 ++@@ -1063,8 +1063,8 @@ ++ define i64 @c2_i64(i64 %arg) nounwind { ++ ; CHECK-LABEL: c2_i64: ++ ; CHECK: // %bb.0: ++-; CHECK-NEXT: lsr x8, x0, #49 ++-; CHECK-NEXT: and x0, x8, #0xffc +++; CHECK-NEXT: ubfx x8, x0, #51, #10 +++; CHECK-NEXT: lsl x0, x8, #2 ++ ; CHECK-NEXT: ret ++ %tmp0 = lshr i64 %arg, 51 ++ %tmp1 = and i64 %tmp0, 1023 ++@@ -1120,8 +1120,8 @@ ++ define void @c7_i32(i32 %arg, ptr %ptr) nounwind { ++ ; CHECK-LABEL: c7_i32: ++ ; CHECK: // %bb.0: ++-; CHECK-NEXT: lsr w8, w0, #17 ++-; CHECK-NEXT: and w8, w8, #0xffc +++; CHECK-NEXT: ubfx w8, w0, #19, #10 +++; CHECK-NEXT: lsl w8, w8, #2 ++ ; CHECK-NEXT: str w8, [x1] ++ ; CHECK-NEXT: ret ++ %tmp0 = lshr i32 %arg, 19 ++@@ -1163,8 +1163,8 @@ ++ define void @c7_i64(i64 %arg, ptr %ptr) nounwind { ++ ; CHECK-LABEL: c7_i64: ++ ; CHECK: // %bb.0: ++-; CHECK-NEXT: lsr x8, x0, #49 ++-; CHECK-NEXT: and x8, x8, #0xffc +++; CHECK-NEXT: ubfx x8, x0, #51, #10 +++; CHECK-NEXT: lsl x8, x8, #2 ++ ; CHECK-NEXT: str x8, [x1] ++ ; CHECK-NEXT: ret ++ %tmp0 = lshr i64 %arg, 51 ++diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/AArch64/fpenv.ll b/llvm/test/CodeGen/AArch64/fpenv.ll ++--- a/llvm/test/CodeGen/AArch64/fpenv.ll +++++ b/llvm/test/CodeGen/AArch64/fpenv.ll ++@@ -4,11 +4,11 @@ ++ define void @func_set_rounding_dyn(i32 %rm) { ++ ; CHECK-LABEL: func_set_rounding_dyn: ++ ; CHECK: // %bb.0: ++-; CHECK-NEXT: lsl w9, w0, #22 +++; CHECK-NEXT: sub w9, w0, #1 ++ ; CHECK-NEXT: mrs x8, FPCR +++; CHECK-NEXT: and w9, w9, #0x3 ++ ; CHECK-NEXT: and x8, x8, #0xffffffffff3fffff ++-; CHECK-NEXT: sub w9, w9, #1024, lsl #12 // =4194304 ++-; CHECK-NEXT: and w9, w9, #0xc00000 +++; CHECK-NEXT: lsl w9, w9, #22 ++ ; CHECK-NEXT: orr x8, x8, x9 ++ ; CHECK-NEXT: msr FPCR, x8 ++ ; CHECK-NEXT: ret ++diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/AArch64/xbfiz.ll b/llvm/test/CodeGen/AArch64/xbfiz.ll ++--- a/llvm/test/CodeGen/AArch64/xbfiz.ll +++++ b/llvm/test/CodeGen/AArch64/xbfiz.ll ++@@ -69,19 +69,3 @@ ++ %and = and i64 %shl, 4294967295 ++ ret i64 %and ++ } ++- ++-define i64 @lsl_zext_i8_i64(i8 %b) { ++-; CHECK-LABEL: lsl_zext_i8_i64: ++-; CHECK: ubfiz x0, x0, #1, #8 ++- %1 = zext i8 %b to i64 ++- %2 = shl i64 %1, 1 ++- ret i64 %2 + -} + - +- } // namespace +-diff -ruN --strip-trailing-cr a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel +---- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel +-+++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel +-@@ -43,10 +43,7 @@ +- +- gentbl( +- name = "diagnostic_defs_gen", +-- tbl_outs = [( +-- "-gen-clang-diags-defs -clang-component=%s" % c, +-- "include/clang/Basic/Diagnostic%sKinds.inc" % c, +-- ) for c in [ +-+ tbl_outs = [out for c in [ +- "AST", +- "Analysis", +- "Comment", +-@@ -60,6 +57,15 @@ +- "Refactoring", +- "Sema", +- "Serialization", +-+ ] for out in [ +-+ ( +-+ "-gen-clang-diags-defs -clang-component=%s" % c, +-+ "include/clang/Basic/Diagnostic%sKinds.inc" % 
c, +-+ ), +-+ ( +-+ "-gen-clang-diags-enums -clang-component=%s" % c, +-+ "include/clang/Basic/Diagnostic%sEnums.inc" % c, +-+ ), +- ]] + [ +- ( +- "-gen-clang-diag-groups", ++-define i64 @lsl_zext_i16_i64(i16 %b) { ++-; CHECK-LABEL: lsl_zext_i16_i64: ++-; CHECK: ubfiz x0, x0, #1, #16 ++- %1 = zext i16 %b to i64 ++- %2 = shl i64 %1, 1 ++- ret i64 %2 ++-} ++diff -ruN --strip-trailing-cr a/llvm/test/Transforms/SLPVectorizer/X86/insert-subvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-subvector.ll ++--- a/llvm/test/Transforms/SLPVectorizer/X86/insert-subvector.ll +++++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-subvector.ll ++@@ -0,0 +1,81 @@ +++; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 +++; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s +++ +++define <16 x double> @test(ptr %x, double %v, double %a) { +++; CHECK-LABEL: define <16 x double> @test( +++; CHECK-SAME: ptr [[X:%.*]], double [[V:%.*]], double [[A:%.*]]) { +++; CHECK-NEXT: [[GEP6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 8 +++; CHECK-NEXT: [[GEP8:%.*]] = getelementptr inbounds double, ptr [[X]], i64 9 +++; CHECK-NEXT: [[TMP1:%.*]] = load <6 x double>, ptr [[X]], align 4 +++; CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, ptr [[GEP6]], align 4 +++; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, ptr [[GEP8]], align 4 +++; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x double> poison, double [[A]], i32 0 +++; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x double> [[TMP4]], <16 x double> poison, <16 x i32> zeroinitializer +++; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> poison, double [[V]], i32 0 +++; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <4 x i32> zeroinitializer +++; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> poison, double [[V]], i32 0 +++; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> poison, <2 x i32> zeroinitializer +++; CHECK-NEXT: [[TMP10:%.*]] = call <16 x double> @llvm.vector.insert.v16f64.v6f64(<16 x double> poison, <6 x double> [[TMP1]], i64 0) +++; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <16 x i32> +++; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x double> [[TMP10]], <16 x double> [[TMP11]], <16 x i32> +++; CHECK-NEXT: [[TMP13:%.*]] = call <16 x double> @llvm.vector.insert.v16f64.v2f64(<16 x double> [[TMP12]], <2 x double> [[TMP6]], i64 6) +++; CHECK-NEXT: [[TMP14:%.*]] = call <16 x double> @llvm.vector.insert.v16f64.v2f64(<16 x double> [[TMP13]], <2 x double> [[TMP7]], i64 8) +++; CHECK-NEXT: [[TMP15:%.*]] = call <16 x double> @llvm.vector.insert.v16f64.v2f64(<16 x double> [[TMP14]], <2 x double> [[TMP9]], i64 10) +++; CHECK-NEXT: [[TMP16:%.*]] = call <16 x double> @llvm.vector.insert.v16f64.v2f64(<16 x double> [[TMP15]], <2 x double> [[TMP9]], i64 12) +++; CHECK-NEXT: [[TMP17:%.*]] = call <16 x double> @llvm.vector.insert.v16f64.v2f64(<16 x double> [[TMP16]], <2 x double> [[TMP9]], i64 14) +++; CHECK-NEXT: [[TMP18:%.*]] = fadd <16 x double> [[TMP5]], [[TMP17]] +++; CHECK-NEXT: ret <16 x double> [[TMP18]] +++; +++ %gep1 = getelementptr inbounds double, ptr %x, i64 1 +++ %gep2 = getelementptr inbounds double, ptr %x, i64 2 +++ %gep3 = getelementptr inbounds double, ptr %x, i64 3 +++ %gep4 = getelementptr inbounds double, ptr %x, i64 4 +++ %gep5 = getelementptr inbounds double, ptr %x, i64 5 +++ %gep6 = getelementptr inbounds double, ptr %x, i64 8 +++ %gep7 = 
getelementptr inbounds double, ptr %x, i64 9 +++ %gep8 = getelementptr inbounds double, ptr %x, i64 9 +++ %gep9 = getelementptr inbounds double, ptr %x, i64 10 +++ %x0 = load double, ptr %x, align 4 +++ %x1 = load double, ptr %gep1, align 4 +++ %x2 = load double, ptr %gep2, align 4 +++ %x3 = load double, ptr %gep3, align 4 +++ %x4 = load double, ptr %gep4, align 4 +++ %x5 = load double, ptr %gep5, align 4 +++ %x6 = load double, ptr %gep6, align 4 +++ %x7 = load double, ptr %gep7, align 4 +++ %x8 = load double, ptr %gep8, align 4 +++ %x9 = load double, ptr %gep9, align 4 +++ %add1 = fadd double %a, %x0 +++ %add2 = fadd double %a, %x1 +++ %add3 = fadd double %a, %x2 +++ %add4 = fadd double %a, %x3 +++ %add5 = fadd double %a, %x4 +++ %add6 = fadd double %a, %x5 +++ %add7 = fadd double %a, %x6 +++ %add8 = fadd double %a, %x7 +++ %add9 = fadd double %a, %x8 +++ %add10 = fadd double %a, %x9 +++ %add11 = fadd double %a, %v +++ %add12 = fadd double %a, %v +++ %add13 = fadd double %a, %v +++ %add14 = fadd double %a, %v +++ %add15 = fadd double %a, %v +++ %add16 = fadd double %a, %v +++ %i0 = insertelement <16 x double> poison, double %add1, i32 0 +++ %i1 = insertelement <16 x double> %i0, double %add2, i32 1 +++ %i2 = insertelement <16 x double> %i1, double %add3, i32 2 +++ %i3 = insertelement <16 x double> %i2, double %add4, i32 3 +++ %i4 = insertelement <16 x double> %i3, double %add5, i32 4 +++ %i5 = insertelement <16 x double> %i4, double %add6, i32 5 +++ %i6 = insertelement <16 x double> %i5, double %add7, i32 6 +++ %i7 = insertelement <16 x double> %i6, double %add8, i32 7 +++ %i8 = insertelement <16 x double> %i7, double %add9, i32 8 +++ %i9 = insertelement <16 x double> %i8, double %add10, i32 9 +++ %i10 = insertelement <16 x double> %i9, double %add11, i32 10 +++ %i11 = insertelement <16 x double> %i10, double %add12, i32 11 +++ %i12 = insertelement <16 x double> %i11, double %add13, i32 12 +++ %i13 = insertelement <16 x double> %i12, double %add14, i32 13 +++ %i14 = insertelement <16 x double> %i13, double %add15, i32 14 +++ %i15 = insertelement <16 x double> %i14, double %add16, i32 15 +++ ret <16 x double> %i15 +++} diff --git a/third_party/llvm/workspace.bzl b/third_party/llvm/workspace.bzl -index 4602e35..4706c63 100644 +index 4706c63..cb09291 100644 --- a/third_party/llvm/workspace.bzl +++ b/third_party/llvm/workspace.bzl @@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive") def repo(name): """Imports LLVM.""" -- LLVM_COMMIT = "c24ce324d56328e4b91c8797ea4935545084303e" -- LLVM_SHA256 = "ef9f02427de91c37b2315203fc60fa71cac5caa385860fd2a1daa620b4867091" -+ LLVM_COMMIT = "bf17016a92bc8a23d2cdd2b51355dd4eb5019c68" -+ LLVM_SHA256 = "ba09f12e5019f5aca531b1733275f0a10b181d6f894deb1a4610e017f76b172a" +- LLVM_COMMIT = "bf17016a92bc8a23d2cdd2b51355dd4eb5019c68" +- LLVM_SHA256 = "ba09f12e5019f5aca531b1733275f0a10b181d6f894deb1a4610e017f76b172a" ++ LLVM_COMMIT = "13c761789753862a7cc31a2a26f23010afa668b9" ++ LLVM_SHA256 = "587f3eda6d00d751cbfc69fa5a15475ae4232e191ace04031b343e4e8ae16355" tf_http_archive( name = name, +diff --git a/third_party/stablehlo/temporary.patch b/third_party/stablehlo/temporary.patch +index d19d903..2dd4f17 100755 +--- a/third_party/stablehlo/temporary.patch ++++ b/third_party/stablehlo/temporary.patch +@@ -1,3 +1,15 @@ ++diff --ruN a/stablehlo/examples/c++/ExampleAdd.cpp b/stablehlo/examples/c++/ExampleAdd.cpp ++--- stablehlo/examples/c++/ExampleAdd.cpp +++++ stablehlo/examples/c++/ExampleAdd.cpp ++@@ -49,7 +49,7 @@ ++ /** create function **/ ++ // create 
function argument and result types. ++ auto tensorType = ++- mlir::RankedTensorType::get({3, 4}, mlir::FloatType::getF32(&context)); +++ mlir::RankedTensorType::get({3, 4}, mlir::Float32Type::get(&context)); ++ auto func_type = ++ mlir::FunctionType::get(&context, {tensorType, tensorType}, {tensorType}); ++ + diff --ruN a/stablehlo/stablehlo/conversions/tosa/tests/nullary.mlir b/stablehlo/stablehlo/conversions/tosa/tests/nullary.mlir + --- stablehlo/stablehlo/conversions/tosa/tests/nullary.mlir + +++ stablehlo/stablehlo/conversions/tosa/tests/nullary.mlir diff --git a/third_party/shardy/workspace.bzl b/third_party/shardy/workspace.bzl index 3acd9d8ab4ad2..1c4225dcd7134 100644 --- a/third_party/shardy/workspace.bzl +++ b/third_party/shardy/workspace.bzl @@ -3,8 +3,8 @@ load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls") def repo(): - SHARDY_COMMIT = "293e28a2b7c745c82fc5de99dad207e29340e7e0" - SHARDY_SHA256 = "36e38a2a7d23ba3c5385a4dc8651d682269c6a8dcf71b9a4cca5522cc32b7216" + SHARDY_COMMIT = "a45b0ae83803b4edb0602f3f5b342571a41b8e91" + SHARDY_SHA256 = "29f97d1838f463a6985f255fc29c80aa0517780a6b08fe1d01e3083a7f573942" tf_http_archive( name = "shardy", diff --git a/third_party/stablehlo/temporary.patch b/third_party/stablehlo/temporary.patch index d19d903fccbad..2dd4f1791a6a6 100755 --- a/third_party/stablehlo/temporary.patch +++ b/third_party/stablehlo/temporary.patch @@ -1,3 +1,15 @@ +diff --ruN a/stablehlo/examples/c++/ExampleAdd.cpp b/stablehlo/examples/c++/ExampleAdd.cpp +--- stablehlo/examples/c++/ExampleAdd.cpp ++++ stablehlo/examples/c++/ExampleAdd.cpp +@@ -49,7 +49,7 @@ + /** create function **/ + // create function argument and result types. + auto tensorType = +- mlir::RankedTensorType::get({3, 4}, mlir::FloatType::getF32(&context)); ++ mlir::RankedTensorType::get({3, 4}, mlir::Float32Type::get(&context)); + auto func_type = + mlir::FunctionType::get(&context, {tensorType, tensorType}, {tensorType}); + diff --ruN a/stablehlo/stablehlo/conversions/tosa/tests/nullary.mlir b/stablehlo/stablehlo/conversions/tosa/tests/nullary.mlir --- stablehlo/stablehlo/conversions/tosa/tests/nullary.mlir +++ stablehlo/stablehlo/conversions/tosa/tests/nullary.mlir diff --git a/third_party/triton/llvm_integration/cl717293402.patch b/third_party/triton/llvm_integration/cl717293402.patch new file mode 100644 index 0000000000000..1d051c8b37f7a --- /dev/null +++ b/third_party/triton/llvm_integration/cl717293402.patch @@ -0,0 +1,127 @@ + +--- a/include/triton/Conversion/MLIRTypes.h 2024-07-03 07:14:55.000000000 -0700 ++++ b/include/triton/Conversion/MLIRTypes.h 2025-01-19 13:19:21.000000000 -0800 +@@ -21,10 +21,10 @@ + } + + // Float types +-inline Type f16Ty(MLIRContext *ctx) { return FloatType::getF16(ctx); } +-inline Type f32Ty(MLIRContext *ctx) { return FloatType::getF32(ctx); } +-inline Type f64Ty(MLIRContext *ctx) { return FloatType::getF64(ctx); } +-inline Type bf16Ty(MLIRContext *ctx) { return FloatType::getBF16(ctx); } ++inline Type f16Ty(MLIRContext *ctx) { return Float16Type::get(ctx); } ++inline Type f32Ty(MLIRContext *ctx) { return Float32Type::get(ctx); } ++inline Type f64Ty(MLIRContext *ctx) { return Float64Type::get(ctx); } ++inline Type bf16Ty(MLIRContext *ctx) { return BFloat16Type::get(ctx); } + + inline bool isFloat(Type type) { + return type.isF32() || type.isF64() || type.isF16() || type.isF128() || + +--- a/lib/Dialect/TritonGPU/IR/Ops.cpp 2025-01-15 12:52:52.000000000 -0800 ++++ b/lib/Dialect/TritonGPU/IR/Ops.cpp 2025-01-19 13:19:21.000000000 -0800 +@@ -15,7 +15,7 @@ 
+ auto xTy = getSrc().getType(); + auto scaleTy = getScale().getType(); + +- if (xTy.getElementType() != FloatType::getBF16(getContext()) && ++ if (xTy.getElementType() != BFloat16Type::get(getContext()) && + xTy.getElementType() != IntegerType::get(getContext(), 8)) { + return emitOpError("element type of the first operand must be bf16 or i8"); + } +@@ -111,7 +111,7 @@ + auto newShape = SmallVector(xShape); + if (!encoding) { + newShape.back() *= 2; +- retTy = RankedTensorType::get(xShape, FloatType::getBF16(ctx)); ++ retTy = RankedTensorType::get(xShape, BFloat16Type::get(ctx)); + } else { + auto oldEncoding = cast(encoding); + auto newVEncoding = DotOperandEncodingAttr::get( +@@ -123,7 +123,7 @@ + const bool hasBatch = xShape.size() == 3; + const int kIdx = (opIdx == 0 ? 1 : 0) + hasBatch; + newShape[kIdx] *= 2; +- retTy = RankedTensorType::get(newShape, FloatType::getBF16(ctx), ++ retTy = RankedTensorType::get(newShape, BFloat16Type::get(ctx), + newVEncoding); + } + inferredReturnTypes.push_back(retTy); + +--- a/third_party/nvidia/lib/NVGPUToLLVM/NVGPUToLLVMPass.cpp 2025-01-15 12:52:52.000000000 -0800 ++++ b/third_party/nvidia/lib/NVGPUToLLVM/NVGPUToLLVMPass.cpp 2025-01-19 13:19:22.000000000 -0800 +@@ -56,9 +56,9 @@ + else if (constraint == 'l') + ty = IntegerType::get(rewriter.getContext(), 64); + else if (constraint == 'f') +- ty = FloatType::getF32(rewriter.getContext()); ++ ty = Float32Type::get(rewriter.getContext()); + else if (constraint == 'd') +- ty = FloatType::getF64(rewriter.getContext()); ++ ty = Float64Type::get(rewriter.getContext()); + else { + assert(false && "Unsupported constraint"); + } + +--- a/unittest/Dialect/TritonGPU/DialectTest.cpp 2025-01-15 12:52:52.000000000 -0800 ++++ b/unittest/Dialect/TritonGPU/DialectTest.cpp 2025-01-19 13:19:23.000000000 -0800 +@@ -492,10 +492,10 @@ + llvm::to_vector(llvm::reverse(llvm::seq(rank)))); + + auto srcTy = RankedTensorType::get( +- srcShape, FloatType::getF32(&ctx), ++ srcShape, Float32Type::get(&ctx), + BlockedEncodingAttr::get(&ctx, sizePerThread, threadsPerWarp, + warpsPerCTA, order, ctaLayout)); +- auto dstTy = RankedTensorType::get(dstShape, FloatType::getF32(&ctx)); ++ auto dstTy = RankedTensorType::get(dstShape, Float32Type::get(&ctx)); + + bool couldReshape = false; + testReshape(srcTy, dstTy, /*expectedDstEnc=*/std::nullopt, +@@ -526,7 +526,7 @@ + ctx.getOrLoadDialect(); + ctaLayout = + triton::gpu::CTALayoutAttr::get(&ctx, ctaPerCGA, ctaSplit, ctaOrder); +- f16Ty = FloatType::getF16(&ctx); ++ f16Ty = Float16Type::get(&ctx); + } + + triton::gpu::AMDMfmaEncodingAttr createMFMA(int mDim, int nDim, +@@ -692,7 +692,7 @@ + ASSERT_EQ(linearLayout, expandedLL); + + // Test that methods of DistributedEncoding return the same values +- Type eltTy = FloatType::getF32(&ctx); ++ Type eltTy = Float32Type::get(&ctx); + + ASSERT_EQ(getOrder(distributedEncoding), linearEncoding.getRepOrder()); + ASSERT_EQ(cast(distributedEncoding) + +--- a/unittest/Dialect/TritonGPU/DumpLayoutTest.cpp 2024-10-31 04:36:20.000000000 -0700 ++++ b/unittest/Dialect/TritonGPU/DumpLayoutTest.cpp 2025-01-19 13:19:23.000000000 -0800 +@@ -182,7 +182,7 @@ + {1}, /* ord, row-major */ + {1}); /* cOrd */ + +- auto elemTy = FloatType::getF16(sharedLayout.getContext()); ++ auto elemTy = Float16Type::get(sharedLayout.getContext()); + auto tensorType = RankedTensorType::get({32}, elemTy, sharedLayout); + std::string layout = getLayoutStr(tensorType, /*useHWPointOfView=*/false); + assertSameStr(refStr, layout); +@@ -237,7 +237,7 @@ + {1, 0}, /* ord, row-major */ + 
{1, 0}); /* cOrd */ + +- auto elemTy = FloatType::getF16(sharedLayout.getContext()); ++ auto elemTy = Float16Type::get(sharedLayout.getContext()); + auto tensorType = RankedTensorType::get({8, 32}, elemTy, sharedLayout); + std::string layout = getLayoutStr(tensorType, /*useHWPointOfView=*/false); + assertSameStr(refStr, layout); +@@ -510,7 +510,7 @@ + {1, 0}, /* ord, row-major */ + {1, 0}); /* cOrd */ + +- auto elemTyHW = FloatType::getF16(sharedLayoutHW.getContext()); ++ auto elemTyHW = Float16Type::get(sharedLayoutHW.getContext()); + auto tensorTypeHW = RankedTensorType::get({8, 32}, elemTyHW, sharedLayoutHW); + + std::string layoutHW = getLayoutStr(tensorTypeHW, /*useHWPointOfView=*/true); diff --git a/third_party/triton/llvm_integration/series.bzl b/third_party/triton/llvm_integration/series.bzl index 656b9c894904d..be374e9d18868 100644 --- a/third_party/triton/llvm_integration/series.bzl +++ b/third_party/triton/llvm_integration/series.bzl @@ -8,5 +8,6 @@ LLVM nor MLIR integrator, please do not add any patches to this list. """ llvm_patch_list = [ + "//third_party/triton:llvm_integration/cl717293402.patch", # Add new patches just above this line ] diff --git a/third_party/tsl/third_party/llvm/generated.patch b/third_party/tsl/third_party/llvm/generated.patch index 3d2a2525c37a9..8b54ffba772b7 100644 --- a/third_party/tsl/third_party/llvm/generated.patch +++ b/third_party/tsl/third_party/llvm/generated.patch @@ -1,207 +1,1156 @@ Auto generated patch. Do not edit or delete it, even if empty. -diff -ruN --strip-trailing-cr a/mlir/include/mlir/IR/TypeRange.h b/mlir/include/mlir/IR/TypeRange.h ---- a/mlir/include/mlir/IR/TypeRange.h -+++ b/mlir/include/mlir/IR/TypeRange.h -@@ -29,12 +29,11 @@ - /// a SmallVector/std::vector. This class should be used in places that are not - /// suitable for a more derived type (e.g. ArrayRef) or a template range - /// parameter. --class TypeRange -- : public llvm::detail::indexed_accessor_range_base< -- TypeRange, -- llvm::PointerUnion, -- Type, Type, Type> { -+class TypeRange : public llvm::detail::indexed_accessor_range_base< -+ TypeRange, -+ llvm::PointerUnion, -+ Type, Type, Type> { - public: - using RangeBaseT::RangeBaseT; - TypeRange(ArrayRef types = std::nullopt); -@@ -45,11 +44,8 @@ - TypeRange(ValueTypeRange values) - : TypeRange(ValueRange(ValueRangeT(values.begin().getCurrent(), - values.end().getCurrent()))) {} -- -- TypeRange(Type type) : TypeRange(type, /*count=*/1) {} -- template , Arg> && -- !std::is_constructible_v>> -+ template , Arg>::value>> - TypeRange(Arg &&arg) : TypeRange(ArrayRef(std::forward(arg))) {} - TypeRange(std::initializer_list types) - : TypeRange(ArrayRef(types)) {} -@@ -60,9 +56,8 @@ - /// * A pointer to the first element of an array of types. - /// * A pointer to the first element of an array of operands. - /// * A pointer to the first element of an array of results. -- /// * A single 'Type' instance. - using OwnerT = llvm::PointerUnion; -+ detail::OpResultImpl *>; - - /// See `llvm::detail::indexed_accessor_range_base` for details. - static OwnerT offset_base(OwnerT object, ptrdiff_t index); -diff -ruN --strip-trailing-cr a/mlir/include/mlir/IR/ValueRange.h b/mlir/include/mlir/IR/ValueRange.h ---- a/mlir/include/mlir/IR/ValueRange.h -+++ b/mlir/include/mlir/IR/ValueRange.h -@@ -374,16 +374,16 @@ - /// SmallVector/std::vector. This class should be used in places that are not - /// suitable for a more derived type (e.g. ArrayRef) or a template range - /// parameter. 
--class ValueRange final : public llvm::detail::indexed_accessor_range_base< -- ValueRange, -- PointerUnion, -- Value, Value, Value> { -+class ValueRange final -+ : public llvm::detail::indexed_accessor_range_base< -+ ValueRange, -+ PointerUnion, -+ Value, Value, Value> { - public: - /// The type representing the owner of a ValueRange. This is either a list of -- /// values, operands, or results or a single value. -+ /// values, operands, or results. - using OwnerT = -- PointerUnion; -+ PointerUnion; - - using RangeBaseT::RangeBaseT; - -@@ -392,7 +392,7 @@ - std::is_constructible, Arg>::value && - !std::is_convertible::value>> - ValueRange(Arg &&arg) : ValueRange(ArrayRef(std::forward(arg))) {} -- ValueRange(Value value) : ValueRange(value, /*count=*/1) {} -+ ValueRange(const Value &value) : ValueRange(&value, /*count=*/1) {} - ValueRange(const std::initializer_list &values) - : ValueRange(ArrayRef(values)) {} - ValueRange(iterator_range values) -diff -ruN --strip-trailing-cr a/mlir/lib/IR/OperationSupport.cpp b/mlir/lib/IR/OperationSupport.cpp ---- a/mlir/lib/IR/OperationSupport.cpp -+++ b/mlir/lib/IR/OperationSupport.cpp -@@ -653,15 +653,6 @@ - /// See `llvm::detail::indexed_accessor_range_base` for details. - ValueRange::OwnerT ValueRange::offset_base(const OwnerT &owner, - ptrdiff_t index) { -- if (llvm::isa_and_nonnull(owner)) { -- // Prevent out-of-bounds indexing for single values. -- // Note that we do allow an index of 1 as is required by 'slice'ing that -- // returns an empty range. This also matches the usual rules of C++ of being -- // allowed to index past the last element of an array. -- assert(index <= 1 && "out-of-bound offset into single-value 'ValueRange'"); -- // Return nullptr to quickly cause segmentation faults on misuse. -- return index == 0 ? owner : nullptr; -- } - if (const auto *value = llvm::dyn_cast_if_present(owner)) - return {value + index}; - if (auto *operand = llvm::dyn_cast_if_present(owner)) -@@ -670,10 +661,6 @@ +diff -ruN --strip-trailing-cr a/flang/include/flang/Optimizer/Builder/HLFIRTools.h b/flang/include/flang/Optimizer/Builder/HLFIRTools.h +--- a/flang/include/flang/Optimizer/Builder/HLFIRTools.h ++++ b/flang/include/flang/Optimizer/Builder/HLFIRTools.h +@@ -513,12 +513,6 @@ + Entity loadElementAt(mlir::Location loc, fir::FirOpBuilder &builder, + Entity entity, mlir::ValueRange oneBasedIndices); + +-/// Return a vector of extents for the given entity. +-/// The function creates new operations, but tries to clean-up +-/// after itself. +-llvm::SmallVector +-genExtentsVector(mlir::Location loc, fir::FirOpBuilder &builder, Entity entity); +- + } // namespace hlfir + + #endif // FORTRAN_OPTIMIZER_BUILDER_HLFIRTOOLS_H +diff -ruN --strip-trailing-cr a/flang/lib/Optimizer/Builder/HLFIRTools.cpp b/flang/lib/Optimizer/Builder/HLFIRTools.cpp +--- a/flang/lib/Optimizer/Builder/HLFIRTools.cpp ++++ b/flang/lib/Optimizer/Builder/HLFIRTools.cpp +@@ -1421,15 +1421,3 @@ + return loadTrivialScalar(loc, builder, + getElementAt(loc, builder, entity, oneBasedIndices)); } - /// See `llvm::detail::indexed_accessor_range_base` for details. 
- Value ValueRange::dereference_iterator(const OwnerT &owner, ptrdiff_t index) { -- if (auto value = llvm::dyn_cast_if_present(owner)) { -- assert(index == 0 && "cannot offset into single-value 'ValueRange'"); -- return value; +- +-llvm::SmallVector +-hlfir::genExtentsVector(mlir::Location loc, fir::FirOpBuilder &builder, +- hlfir::Entity entity) { +- entity = hlfir::derefPointersAndAllocatables(loc, builder, entity); +- mlir::Value shape = hlfir::genShape(loc, builder, entity); +- llvm::SmallVector extents = +- hlfir::getExplicitExtentsFromShape(shape, builder); +- if (shape.getUses().empty()) +- shape.getDefiningOp()->erase(); +- return extents; +-} +diff -ruN --strip-trailing-cr a/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp b/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp +--- a/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp ++++ b/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp +@@ -37,79 +37,6 @@ + + namespace { + +-// Helper class to generate operations related to computing +-// product of values. +-class ProductFactory { +-public: +- ProductFactory(mlir::Location loc, fir::FirOpBuilder &builder) +- : loc(loc), builder(builder) {} +- +- // Generate an update of the inner product value: +- // acc += v1 * v2, OR +- // acc += CONJ(v1) * v2, OR +- // acc ||= v1 && v2 +- // +- // CONJ parameter specifies whether the first complex product argument +- // needs to be conjugated. +- template +- mlir::Value genAccumulateProduct(mlir::Value acc, mlir::Value v1, +- mlir::Value v2) { +- mlir::Type resultType = acc.getType(); +- acc = castToProductType(acc, resultType); +- v1 = castToProductType(v1, resultType); +- v2 = castToProductType(v2, resultType); +- mlir::Value result; +- if (mlir::isa(resultType)) { +- result = builder.create( +- loc, acc, builder.create(loc, v1, v2)); +- } else if (mlir::isa(resultType)) { +- if constexpr (CONJ) +- result = fir::IntrinsicLibrary{builder, loc}.genConjg(resultType, v1); +- else +- result = v1; +- +- result = builder.create( +- loc, acc, builder.create(loc, result, v2)); +- } else if (mlir::isa(resultType)) { +- result = builder.create( +- loc, acc, builder.create(loc, v1, v2)); +- } else if (mlir::isa(resultType)) { +- result = builder.create( +- loc, acc, builder.create(loc, v1, v2)); +- } else { +- llvm_unreachable("unsupported type"); +- } +- +- return builder.createConvert(loc, resultType, result); - } - if (const auto *value = llvm::dyn_cast_if_present(owner)) - return value[index]; - if (auto *operand = llvm::dyn_cast_if_present(owner)) -diff -ruN --strip-trailing-cr a/mlir/lib/IR/TypeRange.cpp b/mlir/lib/IR/TypeRange.cpp ---- a/mlir/lib/IR/TypeRange.cpp -+++ b/mlir/lib/IR/TypeRange.cpp -@@ -31,23 +31,12 @@ - this->base = result; - else if (auto *operand = llvm::dyn_cast_if_present(owner)) - this->base = operand; -- else if (auto value = llvm::dyn_cast_if_present(owner)) -- this->base = value.getType(); - else - this->base = cast(owner); - } +- +-private: +- mlir::Location loc; +- fir::FirOpBuilder &builder; +- +- mlir::Value castToProductType(mlir::Value value, mlir::Type type) { +- if (mlir::isa(type)) +- return builder.createConvert(loc, builder.getIntegerType(1), value); +- +- // TODO: the multiplications/additions by/of zero resulting from +- // complex * real are optimized by LLVM under -fno-signed-zeros +- // -fno-honor-nans. 
+- // We can make them disappear by default if we: +- // * either expand the complex multiplication into real +- // operations, OR +- // * set nnan nsz fast-math flags to the complex operations. +- if (fir::isa_complex(type) && !fir::isa_complex(value.getType())) { +- mlir::Value zeroCmplx = fir::factory::createZeroValue(builder, loc, type); +- fir::factory::Complex helper(builder, loc); +- mlir::Type partType = helper.getComplexPartType(type); +- return helper.insertComplexPart(zeroCmplx, +- castToProductType(value, partType), +- /*isImagPart=*/false); +- } +- return builder.createConvert(loc, type, value); +- } +-}; +- + class TransposeAsElementalConversion + : public mlir::OpRewritePattern { + public: +@@ -163,8 +90,11 @@ + static mlir::Value genResultShape(mlir::Location loc, + fir::FirOpBuilder &builder, + hlfir::Entity array) { +- llvm::SmallVector inExtents = +- hlfir::genExtentsVector(loc, builder, array); ++ mlir::Value inShape = hlfir::genShape(loc, builder, array); ++ llvm::SmallVector inExtents = ++ hlfir::getExplicitExtentsFromShape(inShape, builder); ++ if (inShape.getUses().empty()) ++ inShape.getDefiningOp()->erase(); + + // transpose indices + assert(inExtents.size() == 2 && "checked in TransposeOp::validate"); +@@ -207,7 +137,7 @@ + mlir::Value resultShape, dimExtent; + llvm::SmallVector arrayExtents; + if (isTotalReduction) +- arrayExtents = hlfir::genExtentsVector(loc, builder, array); ++ arrayExtents = genArrayExtents(loc, builder, array); + else + std::tie(resultShape, dimExtent) = + genResultShapeForPartialReduction(loc, builder, array, dimVal); +@@ -233,8 +163,7 @@ + // If DIM is not present, do total reduction. + + // Initial value for the reduction. +- mlir::Value reductionInitValue = +- fir::factory::createZeroValue(builder, loc, elementType); ++ mlir::Value reductionInitValue = genInitValue(loc, builder, elementType); + + // The reduction loop may be unordered if FastMathFlags::reassoc + // transformations are allowed. The integer reduction is always +@@ -335,6 +264,17 @@ + } + + private: ++ static llvm::SmallVector ++ genArrayExtents(mlir::Location loc, fir::FirOpBuilder &builder, ++ hlfir::Entity array) { ++ mlir::Value inShape = hlfir::genShape(loc, builder, array); ++ llvm::SmallVector inExtents = ++ hlfir::getExplicitExtentsFromShape(inShape, builder); ++ if (inShape.getUses().empty()) ++ inShape.getDefiningOp()->erase(); ++ return inExtents; ++ } ++ + // Return fir.shape specifying the shape of the result + // of a SUM reduction with DIM=dimVal. The second return value + // is the extent of the DIM dimension. +@@ -343,7 +283,7 @@ + fir::FirOpBuilder &builder, + hlfir::Entity array, int64_t dimVal) { + llvm::SmallVector inExtents = +- hlfir::genExtentsVector(loc, builder, array); ++ genArrayExtents(loc, builder, array); + assert(dimVal > 0 && dimVal <= static_cast(inExtents.size()) && + "DIM must be present and a positive constant not exceeding " + "the array's rank"); +@@ -353,6 +293,26 @@ + return {builder.create(loc, inExtents), dimExtent}; + } + ++ // Generate the initial value for a SUM reduction with the given ++ // data type. 
++ static mlir::Value genInitValue(mlir::Location loc, ++ fir::FirOpBuilder &builder, ++ mlir::Type elementType) { ++ if (auto ty = mlir::dyn_cast(elementType)) { ++ const llvm::fltSemantics &sem = ty.getFloatSemantics(); ++ return builder.createRealConstant(loc, elementType, ++ llvm::APFloat::getZero(sem)); ++ } else if (auto ty = mlir::dyn_cast(elementType)) { ++ mlir::Value initValue = genInitValue(loc, builder, ty.getElementType()); ++ return fir::factory::Complex{builder, loc}.createComplex(ty, initValue, ++ initValue); ++ } else if (mlir::isa(elementType)) { ++ return builder.createIntegerConstant(loc, elementType, 0); ++ } ++ ++ llvm_unreachable("unsupported SUM reduction type"); ++ } ++ + // Generate scalar addition of the two values (of the same data type). + static mlir::Value genScalarAdd(mlir::Location loc, + fir::FirOpBuilder &builder, +@@ -610,10 +570,16 @@ + static std::tuple + genResultShape(mlir::Location loc, fir::FirOpBuilder &builder, + hlfir::Entity input1, hlfir::Entity input2) { +- llvm::SmallVector input1Extents = +- hlfir::genExtentsVector(loc, builder, input1); +- llvm::SmallVector input2Extents = +- hlfir::genExtentsVector(loc, builder, input2); ++ mlir::Value input1Shape = hlfir::genShape(loc, builder, input1); ++ llvm::SmallVector input1Extents = ++ hlfir::getExplicitExtentsFromShape(input1Shape, builder); ++ if (input1Shape.getUses().empty()) ++ input1Shape.getDefiningOp()->erase(); ++ mlir::Value input2Shape = hlfir::genShape(loc, builder, input2); ++ llvm::SmallVector input2Extents = ++ hlfir::getExplicitExtentsFromShape(input2Shape, builder); ++ if (input2Shape.getUses().empty()) ++ input2Shape.getDefiningOp()->erase(); - /// See `llvm::detail::indexed_accessor_range_base` for details. - TypeRange::OwnerT TypeRange::offset_base(OwnerT object, ptrdiff_t index) { -- if (llvm::isa_and_nonnull(object)) { -- // Prevent out-of-bounds indexing for single values. -- // Note that we do allow an index of 1 as is required by 'slice'ing that -- // returns an empty range. This also matches the usual rules of C++ of being -- // allowed to index past the last element of an array. -- assert(index <= 1 && "out-of-bound offset into single-value 'ValueRange'"); -- // Return nullptr to quickly cause segmentation faults on misuse. -- return index == 0 ? object : nullptr; + llvm::SmallVector newExtents; + mlir::Value innerProduct1Extent, innerProduct2Extent; +@@ -661,6 +627,60 @@ + innerProductExtent[0]}; + } + ++ static mlir::Value castToProductType(mlir::Location loc, ++ fir::FirOpBuilder &builder, ++ mlir::Value value, mlir::Type type) { ++ if (mlir::isa(type)) ++ return builder.createConvert(loc, builder.getIntegerType(1), value); ++ ++ // TODO: the multiplications/additions by/of zero resulting from ++ // complex * real are optimized by LLVM under -fno-signed-zeros ++ // -fno-honor-nans. ++ // We can make them disappear by default if we: ++ // * either expand the complex multiplication into real ++ // operations, OR ++ // * set nnan nsz fast-math flags to the complex operations. 
++ if (fir::isa_complex(type) && !fir::isa_complex(value.getType())) { ++ mlir::Value zeroCmplx = fir::factory::createZeroValue(builder, loc, type); ++ fir::factory::Complex helper(builder, loc); ++ mlir::Type partType = helper.getComplexPartType(type); ++ return helper.insertComplexPart( ++ zeroCmplx, castToProductType(loc, builder, value, partType), ++ /*isImagPart=*/false); ++ } ++ return builder.createConvert(loc, type, value); ++ } ++ ++ // Generate an update of the inner product value: ++ // acc += v1 * v2, OR ++ // acc ||= v1 && v2 ++ static mlir::Value genAccumulateProduct(mlir::Location loc, ++ fir::FirOpBuilder &builder, ++ mlir::Type resultType, ++ mlir::Value acc, mlir::Value v1, ++ mlir::Value v2) { ++ acc = castToProductType(loc, builder, acc, resultType); ++ v1 = castToProductType(loc, builder, v1, resultType); ++ v2 = castToProductType(loc, builder, v2, resultType); ++ mlir::Value result; ++ if (mlir::isa(resultType)) ++ result = builder.create( ++ loc, acc, builder.create(loc, v1, v2)); ++ else if (mlir::isa(resultType)) ++ result = builder.create( ++ loc, acc, builder.create(loc, v1, v2)); ++ else if (mlir::isa(resultType)) ++ result = builder.create( ++ loc, acc, builder.create(loc, v1, v2)); ++ else if (mlir::isa(resultType)) ++ result = builder.create( ++ loc, acc, builder.create(loc, v1, v2)); ++ else ++ llvm_unreachable("unsupported type"); ++ ++ return builder.createConvert(loc, resultType, result); ++ } ++ + static mlir::LogicalResult + genContiguousMatmul(mlir::Location loc, fir::FirOpBuilder &builder, + hlfir::Entity result, mlir::Value resultShape, +@@ -728,9 +748,9 @@ + hlfir::loadElementAt(loc, builder, lhs, {I, K}); + hlfir::Entity rhsElementValue = + hlfir::loadElementAt(loc, builder, rhs, {K, J}); +- mlir::Value productValue = +- ProductFactory{loc, builder}.genAccumulateProduct( +- resultElementValue, lhsElementValue, rhsElementValue); ++ mlir::Value productValue = genAccumulateProduct( ++ loc, builder, resultElementType, resultElementValue, ++ lhsElementValue, rhsElementValue); + builder.create(loc, productValue, resultElement); + return {}; + }; +@@ -765,9 +785,9 @@ + hlfir::loadElementAt(loc, builder, lhs, {J, K}); + hlfir::Entity rhsElementValue = + hlfir::loadElementAt(loc, builder, rhs, {K}); +- mlir::Value productValue = +- ProductFactory{loc, builder}.genAccumulateProduct( +- resultElementValue, lhsElementValue, rhsElementValue); ++ mlir::Value productValue = genAccumulateProduct( ++ loc, builder, resultElementType, resultElementValue, ++ lhsElementValue, rhsElementValue); + builder.create(loc, productValue, resultElement); + return {}; + }; +@@ -797,9 +817,9 @@ + hlfir::loadElementAt(loc, builder, lhs, {K}); + hlfir::Entity rhsElementValue = + hlfir::loadElementAt(loc, builder, rhs, {K, J}); +- mlir::Value productValue = +- ProductFactory{loc, builder}.genAccumulateProduct( +- resultElementValue, lhsElementValue, rhsElementValue); ++ mlir::Value productValue = genAccumulateProduct( ++ loc, builder, resultElementType, resultElementValue, ++ lhsElementValue, rhsElementValue); + builder.create(loc, productValue, resultElement); + return {}; + }; +@@ -865,9 +885,9 @@ + hlfir::loadElementAt(loc, builder, lhs, lhsIndices); + hlfir::Entity rhsElementValue = + hlfir::loadElementAt(loc, builder, rhs, rhsIndices); +- mlir::Value productValue = +- ProductFactory{loc, builder}.genAccumulateProduct( +- reductionArgs[0], lhsElementValue, rhsElementValue); ++ mlir::Value productValue = genAccumulateProduct( ++ loc, builder, resultElementType, reductionArgs[0], 
lhsElementValue, ++ rhsElementValue); + return {productValue}; + }; + llvm::SmallVector innerProductValue = +@@ -884,73 +904,6 @@ + } + }; + +-class DotProductConversion +- : public mlir::OpRewritePattern { +-public: +- using mlir::OpRewritePattern::OpRewritePattern; +- +- llvm::LogicalResult +- matchAndRewrite(hlfir::DotProductOp product, +- mlir::PatternRewriter &rewriter) const override { +- hlfir::Entity op = hlfir::Entity{product}; +- if (!op.isScalar()) +- return rewriter.notifyMatchFailure(product, "produces non-scalar result"); +- +- mlir::Location loc = product.getLoc(); +- fir::FirOpBuilder builder{rewriter, product.getOperation()}; +- hlfir::Entity lhs = hlfir::Entity{product.getLhs()}; +- hlfir::Entity rhs = hlfir::Entity{product.getRhs()}; +- mlir::Type resultElementType = product.getType(); +- bool isUnordered = mlir::isa(resultElementType) || +- mlir::isa(resultElementType) || +- static_cast(builder.getFastMathFlags() & +- mlir::arith::FastMathFlags::reassoc); +- +- mlir::Value extent = genProductExtent(loc, builder, lhs, rhs); +- +- auto genBody = [&](mlir::Location loc, fir::FirOpBuilder &builder, +- mlir::ValueRange oneBasedIndices, +- mlir::ValueRange reductionArgs) +- -> llvm::SmallVector { +- hlfir::Entity lhsElementValue = +- hlfir::loadElementAt(loc, builder, lhs, oneBasedIndices); +- hlfir::Entity rhsElementValue = +- hlfir::loadElementAt(loc, builder, rhs, oneBasedIndices); +- mlir::Value productValue = +- ProductFactory{loc, builder}.genAccumulateProduct( +- reductionArgs[0], lhsElementValue, rhsElementValue); +- return {productValue}; +- }; +- +- mlir::Value initValue = +- fir::factory::createZeroValue(builder, loc, resultElementType); +- +- llvm::SmallVector result = hlfir::genLoopNestWithReductions( +- loc, builder, {extent}, +- /*reductionInits=*/{initValue}, genBody, isUnordered); +- +- rewriter.replaceOp(product, result[0]); +- return mlir::success(); - } - if (const auto *value = llvm::dyn_cast_if_present(object)) - return {value + index}; - if (auto *operand = llvm::dyn_cast_if_present(object)) -@@ -59,10 +48,6 @@ - - /// See `llvm::detail::indexed_accessor_range_base` for details. 
- Type TypeRange::dereference_iterator(OwnerT object, ptrdiff_t index) { -- if (auto type = llvm::dyn_cast_if_present(object)) { -- assert(index == 0 && "cannot offset into single-value 'TypeRange'"); -- return type; +- +-private: +- static mlir::Value genProductExtent(mlir::Location loc, +- fir::FirOpBuilder &builder, +- hlfir::Entity input1, +- hlfir::Entity input2) { +- llvm::SmallVector input1Extents = +- hlfir::genExtentsVector(loc, builder, input1); +- llvm::SmallVector input2Extents = +- hlfir::genExtentsVector(loc, builder, input2); +- +- assert(input1Extents.size() == 1 && input2Extents.size() == 1 && +- "hlfir.dot_product arguments must be vectors"); +- llvm::SmallVector extent = +- fir::factory::deduceOptimalExtents(input1Extents, input2Extents); +- return extent[0]; - } - if (const auto *value = llvm::dyn_cast_if_present(object)) - return (value + index)->getType(); - if (auto *operand = llvm::dyn_cast_if_present(object)) -diff -ruN --strip-trailing-cr a/mlir/unittests/IR/OperationSupportTest.cpp b/mlir/unittests/IR/OperationSupportTest.cpp ---- a/mlir/unittests/IR/OperationSupportTest.cpp -+++ b/mlir/unittests/IR/OperationSupportTest.cpp -@@ -313,21 +313,4 @@ - op2->destroy(); +-}; +- + class SimplifyHLFIRIntrinsics + : public hlfir::impl::SimplifyHLFIRIntrinsicsBase { + public: +@@ -986,8 +939,6 @@ + if (forceMatmulAsElemental || this->allowNewSideEffects) + patterns.insert>(context); + +- patterns.insert(context); +- + if (mlir::failed(mlir::applyPatternsGreedily( + getOperation(), std::move(patterns), config))) { + mlir::emitError(getOperation()->getLoc(), +diff -ruN --strip-trailing-cr a/flang/test/HLFIR/simplify-hlfir-intrinsics-dotproduct.fir b/flang/test/HLFIR/simplify-hlfir-intrinsics-dotproduct.fir +--- a/flang/test/HLFIR/simplify-hlfir-intrinsics-dotproduct.fir ++++ b/flang/test/HLFIR/simplify-hlfir-intrinsics-dotproduct.fir +@@ -1,144 +0,0 @@ +-// Test hlfir.dot_product simplification to a reduction loop: +-// RUN: fir-opt --simplify-hlfir-intrinsics %s | FileCheck %s +- +-func.func @dot_product_integer(%arg0: !hlfir.expr, %arg1: !hlfir.expr) -> i32 { +- %res = hlfir.dot_product %arg0 %arg1 : (!hlfir.expr, !hlfir.expr) -> i32 +- return %res : i32 +-} +-// CHECK-LABEL: func.func @dot_product_integer( +-// CHECK-SAME: %[[VAL_0:.*]]: !hlfir.expr, +-// CHECK-SAME: %[[VAL_1:.*]]: !hlfir.expr) -> i32 { +-// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index +-// CHECK: %[[VAL_3:.*]] = arith.constant 0 : i32 +-// CHECK: %[[VAL_4:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr) -> !fir.shape<1> +-// CHECK: %[[VAL_5:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 0 : index} : (!fir.shape<1>) -> index +-// CHECK: %[[VAL_6:.*]] = fir.do_loop %[[VAL_7:.*]] = %[[VAL_2]] to %[[VAL_5]] step %[[VAL_2]] unordered iter_args(%[[VAL_8:.*]] = %[[VAL_3]]) -> (i32) { +-// CHECK: %[[VAL_9:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_7]] : (!hlfir.expr, index) -> i16 +-// CHECK: %[[VAL_10:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_7]] : (!hlfir.expr, index) -> i32 +-// CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_9]] : (i16) -> i32 +-// CHECK: %[[VAL_12:.*]] = arith.muli %[[VAL_11]], %[[VAL_10]] : i32 +-// CHECK: %[[VAL_13:.*]] = arith.addi %[[VAL_8]], %[[VAL_12]] : i32 +-// CHECK: fir.result %[[VAL_13]] : i32 +-// CHECK: } +-// CHECK: return %[[VAL_6]] : i32 +-// CHECK: } +- +-func.func @dot_product_real(%arg0: !hlfir.expr, %arg1: !hlfir.expr) -> f32 { +- %res = hlfir.dot_product %arg0 %arg1 : (!hlfir.expr, !hlfir.expr) -> f32 +- return %res : f32 +-} +-// CHECK-LABEL: func.func @dot_product_real( +-// 
CHECK-SAME: %[[VAL_0:.*]]: !hlfir.expr, +-// CHECK-SAME: %[[VAL_1:.*]]: !hlfir.expr) -> f32 { +-// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index +-// CHECK: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32 +-// CHECK: %[[VAL_4:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr) -> !fir.shape<1> +-// CHECK: %[[VAL_5:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 0 : index} : (!fir.shape<1>) -> index +-// CHECK: %[[VAL_6:.*]] = fir.do_loop %[[VAL_7:.*]] = %[[VAL_2]] to %[[VAL_5]] step %[[VAL_2]] iter_args(%[[VAL_8:.*]] = %[[VAL_3]]) -> (f32) { +-// CHECK: %[[VAL_9:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_7]] : (!hlfir.expr, index) -> f32 +-// CHECK: %[[VAL_10:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_7]] : (!hlfir.expr, index) -> f16 +-// CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_10]] : (f16) -> f32 +-// CHECK: %[[VAL_12:.*]] = arith.mulf %[[VAL_9]], %[[VAL_11]] : f32 +-// CHECK: %[[VAL_13:.*]] = arith.addf %[[VAL_8]], %[[VAL_12]] : f32 +-// CHECK: fir.result %[[VAL_13]] : f32 +-// CHECK: } +-// CHECK: return %[[VAL_6]] : f32 +-// CHECK: } +- +-func.func @dot_product_complex(%arg0: !hlfir.expr>, %arg1: !hlfir.expr>) -> complex { +- %res = hlfir.dot_product %arg0 %arg1 : (!hlfir.expr>, !hlfir.expr>) -> complex +- return %res : complex +-} +-// CHECK-LABEL: func.func @dot_product_complex( +-// CHECK-SAME: %[[VAL_0:.*]]: !hlfir.expr>, +-// CHECK-SAME: %[[VAL_1:.*]]: !hlfir.expr>) -> complex { +-// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index +-// CHECK: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32 +-// CHECK: %[[VAL_4:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr>) -> !fir.shape<1> +-// CHECK: %[[VAL_5:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 0 : index} : (!fir.shape<1>) -> index +-// CHECK: %[[VAL_6:.*]] = fir.undefined complex +-// CHECK: %[[VAL_7:.*]] = fir.insert_value %[[VAL_6]], %[[VAL_3]], [0 : index] : (complex, f32) -> complex +-// CHECK: %[[VAL_8:.*]] = fir.insert_value %[[VAL_7]], %[[VAL_3]], [1 : index] : (complex, f32) -> complex +-// CHECK: %[[VAL_9:.*]] = fir.do_loop %[[VAL_10:.*]] = %[[VAL_2]] to %[[VAL_5]] step %[[VAL_2]] iter_args(%[[VAL_11:.*]] = %[[VAL_8]]) -> (complex) { +-// CHECK: %[[VAL_12:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_10]] : (!hlfir.expr>, index) -> complex +-// CHECK: %[[VAL_13:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_10]] : (!hlfir.expr>, index) -> complex +-// CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_13]] : (complex) -> complex +-// CHECK: %[[VAL_15:.*]] = fir.extract_value %[[VAL_12]], [1 : index] : (complex) -> f32 +-// CHECK: %[[VAL_16:.*]] = arith.negf %[[VAL_15]] : f32 +-// CHECK: %[[VAL_17:.*]] = fir.insert_value %[[VAL_12]], %[[VAL_16]], [1 : index] : (complex, f32) -> complex +-// CHECK: %[[VAL_18:.*]] = fir.mulc %[[VAL_17]], %[[VAL_14]] : complex +-// CHECK: %[[VAL_19:.*]] = fir.addc %[[VAL_11]], %[[VAL_18]] : complex +-// CHECK: fir.result %[[VAL_19]] : complex +-// CHECK: } +-// CHECK: return %[[VAL_9]] : complex +-// CHECK: } +- +-func.func @dot_product_real_complex(%arg0: !hlfir.expr, %arg1: !hlfir.expr>) -> complex { +- %res = hlfir.dot_product %arg0 %arg1 : (!hlfir.expr, !hlfir.expr>) -> complex +- return %res : complex +-} +-// CHECK-LABEL: func.func @dot_product_real_complex( +-// CHECK-SAME: %[[VAL_0:.*]]: !hlfir.expr, +-// CHECK-SAME: %[[VAL_1:.*]]: !hlfir.expr>) -> complex { +-// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index +-// CHECK: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32 +-// CHECK: %[[VAL_4:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr) -> !fir.shape<1> +-// CHECK: %[[VAL_5:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 0 : 
index} : (!fir.shape<1>) -> index +-// CHECK: %[[VAL_6:.*]] = fir.undefined complex +-// CHECK: %[[VAL_7:.*]] = fir.insert_value %[[VAL_6]], %[[VAL_3]], [0 : index] : (complex, f32) -> complex +-// CHECK: %[[VAL_8:.*]] = fir.insert_value %[[VAL_7]], %[[VAL_3]], [1 : index] : (complex, f32) -> complex +-// CHECK: %[[VAL_9:.*]] = fir.do_loop %[[VAL_10:.*]] = %[[VAL_2]] to %[[VAL_5]] step %[[VAL_2]] iter_args(%[[VAL_11:.*]] = %[[VAL_8]]) -> (complex) { +-// CHECK: %[[VAL_12:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_10]] : (!hlfir.expr, index) -> f32 +-// CHECK: %[[VAL_13:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_10]] : (!hlfir.expr>, index) -> complex +-// CHECK: %[[VAL_14:.*]] = fir.undefined complex +-// CHECK: %[[VAL_15:.*]] = fir.insert_value %[[VAL_14]], %[[VAL_3]], [0 : index] : (complex, f32) -> complex +-// CHECK: %[[VAL_16:.*]] = fir.insert_value %[[VAL_15]], %[[VAL_3]], [1 : index] : (complex, f32) -> complex +-// CHECK: %[[VAL_17:.*]] = fir.insert_value %[[VAL_16]], %[[VAL_12]], [0 : index] : (complex, f32) -> complex +-// CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_13]] : (complex) -> complex +-// CHECK: %[[VAL_19:.*]] = fir.extract_value %[[VAL_17]], [1 : index] : (complex) -> f32 +-// CHECK: %[[VAL_20:.*]] = arith.negf %[[VAL_19]] : f32 +-// CHECK: %[[VAL_21:.*]] = fir.insert_value %[[VAL_17]], %[[VAL_20]], [1 : index] : (complex, f32) -> complex +-// CHECK: %[[VAL_22:.*]] = fir.mulc %[[VAL_21]], %[[VAL_18]] : complex +-// CHECK: %[[VAL_23:.*]] = fir.addc %[[VAL_11]], %[[VAL_22]] : complex +-// CHECK: fir.result %[[VAL_23]] : complex +-// CHECK: } +-// CHECK: return %[[VAL_9]] : complex +-// CHECK: } +- +-func.func @dot_product_logical(%arg0: !hlfir.expr>, %arg1: !hlfir.expr>) -> !fir.logical<4> { +- %res = hlfir.dot_product %arg0 %arg1 : (!hlfir.expr>, !hlfir.expr>) -> !fir.logical<4> +- return %res : !fir.logical<4> +-} +-// CHECK-LABEL: func.func @dot_product_logical( +-// CHECK-SAME: %[[VAL_0:.*]]: !hlfir.expr>, +-// CHECK-SAME: %[[VAL_1:.*]]: !hlfir.expr>) -> !fir.logical<4> { +-// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index +-// CHECK: %[[VAL_3:.*]] = arith.constant false +-// CHECK: %[[VAL_4:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr>) -> !fir.shape<1> +-// CHECK: %[[VAL_5:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 0 : index} : (!fir.shape<1>) -> index +-// CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_3]] : (i1) -> !fir.logical<4> +-// CHECK: %[[VAL_7:.*]] = fir.do_loop %[[VAL_8:.*]] = %[[VAL_2]] to %[[VAL_5]] step %[[VAL_2]] unordered iter_args(%[[VAL_9:.*]] = %[[VAL_6]]) -> (!fir.logical<4>) { +-// CHECK: %[[VAL_10:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_8]] : (!hlfir.expr>, index) -> !fir.logical<1> +-// CHECK: %[[VAL_11:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_8]] : (!hlfir.expr>, index) -> !fir.logical<4> +-// CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_9]] : (!fir.logical<4>) -> i1 +-// CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_10]] : (!fir.logical<1>) -> i1 +-// CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_11]] : (!fir.logical<4>) -> i1 +-// CHECK: %[[VAL_15:.*]] = arith.andi %[[VAL_13]], %[[VAL_14]] : i1 +-// CHECK: %[[VAL_16:.*]] = arith.ori %[[VAL_12]], %[[VAL_15]] : i1 +-// CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (i1) -> !fir.logical<4> +-// CHECK: fir.result %[[VAL_17]] : !fir.logical<4> +-// CHECK: } +-// CHECK: return %[[VAL_7]] : !fir.logical<4> +-// CHECK: } +- +-func.func @dot_product_known_dim(%arg0: !hlfir.expr<10xf32>, %arg1: !hlfir.expr) -> f32 { +- %res1 = hlfir.dot_product %arg0 %arg1 : (!hlfir.expr<10xf32>, !hlfir.expr) -> f32 +- %res2 = 
hlfir.dot_product %arg1 %arg0 : (!hlfir.expr, !hlfir.expr<10xf32>) -> f32 +- %res = arith.addf %res1, %res2 : f32 +- return %res : f32 +-} +-// CHECK-LABEL: func.func @dot_product_known_dim( +-// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index +-// CHECK: %[[VAL_4:.*]] = arith.constant 10 : index +-// CHECK: fir.do_loop %{{.*}} = %[[VAL_2]] to %[[VAL_4]] step %[[VAL_2]] +-// CHECK: fir.do_loop %{{.*}} = %[[VAL_2]] to %[[VAL_4]] step %[[VAL_2]] +diff -ruN --strip-trailing-cr a/libcxx/include/__config b/libcxx/include/__config +--- a/libcxx/include/__config ++++ b/libcxx/include/__config +@@ -1166,9 +1166,7 @@ + # define _LIBCPP_NOESCAPE + # endif + +-// FIXME: Expand this to [[__gnu__::__nodebug__]] again once the testcase reported in +-// https://github.com/llvm/llvm-project/pull/118710 has been analyzed +-# define _LIBCPP_NODEBUG ++# define _LIBCPP_NODEBUG [[__gnu__::__nodebug__]] + + # if __has_attribute(__standalone_debug__) + # define _LIBCPP_STANDALONE_DEBUG __attribute__((__standalone_debug__)) +diff -ruN --strip-trailing-cr a/libcxx/test/tools/clang_tidy_checks/libcpp_module.cpp b/libcxx/test/tools/clang_tidy_checks/libcpp_module.cpp +--- a/libcxx/test/tools/clang_tidy_checks/libcpp_module.cpp ++++ b/libcxx/test/tools/clang_tidy_checks/libcpp_module.cpp +@@ -27,7 +27,7 @@ + check_factories.registerCheck("libcpp-header-exportable-declarations"); + check_factories.registerCheck("libcpp-hide-from-abi"); + check_factories.registerCheck("libcpp-internal-ftms"); +- // check_factories.registerCheck("libcpp-nodebug-on-aliases"); ++ check_factories.registerCheck("libcpp-nodebug-on-aliases"); + check_factories.registerCheck("libcpp-cpp-version-check"); + check_factories.registerCheck("libcpp-robust-against-adl"); + check_factories.registerCheck("libcpp-uglify-attributes"); +diff -ruN --strip-trailing-cr a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp ++++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +@@ -1140,8 +1140,6 @@ + + setTargetDAGCombine(ISD::SCALAR_TO_VECTOR); + +- setTargetDAGCombine(ISD::SHL); +- + // In case of strict alignment, avoid an excessive number of byte wide stores. + MaxStoresPerMemsetOptSize = 8; + MaxStoresPerMemset = +@@ -26473,43 +26471,6 @@ + return NVCAST; } --TEST(ValueRangeTest, ValueConstructable) { -- MLIRContext context; -- Builder builder(&context); +-/// If the operand is a bitwise AND with a constant RHS, and the shift has a +-/// constant RHS and is the only use, we can pull it out of the shift, i.e. +-/// +-/// (shl (and X, C1), C2) -> (and (shl X, C2), (shl C1, C2)) +-/// +-/// We prefer this canonical form to match existing isel patterns. +-static SDValue performSHLCombine(SDNode *N, +- TargetLowering::DAGCombinerInfo &DCI, +- SelectionDAG &DAG) { +- if (DCI.isBeforeLegalizeOps()) +- return SDValue(); - -- Operation *useOp = -- createOp(&context, /*operands=*/std::nullopt, builder.getIntegerType(16)); -- // Valid construction despite a temporary 'OpResult'. 
-- ValueRange operands = useOp->getResult(0); +- SDValue Op0 = N->getOperand(0); +- if (Op0.getOpcode() != ISD::AND || !Op0.hasOneUse()) +- return SDValue(); - -- useOp->setOperands(operands); -- EXPECT_EQ(useOp->getNumOperands(), 1u); -- EXPECT_EQ(useOp->getOperand(0), useOp->getResult(0)); +- SDValue C1 = Op0->getOperand(1); +- SDValue C2 = N->getOperand(1); +- if (!isa(C1) || !isa(C2)) +- return SDValue(); - -- useOp->dropAllUses(); -- useOp->destroy(); +- // Might be folded into shifted op, do not lower. +- if (N->hasOneUse()) { +- unsigned UseOpc = N->user_begin()->getOpcode(); +- if (UseOpc == ISD::ADD || UseOpc == ISD::SUB || UseOpc == ISD::SETCC || +- UseOpc == AArch64ISD::ADDS || UseOpc == AArch64ISD::SUBS) +- return SDValue(); +- } +- +- SDLoc DL(N); +- EVT VT = N->getValueType(0); +- SDValue X = Op0->getOperand(0); +- SDValue NewRHS = DAG.getNode(ISD::SHL, DL, VT, C1, C2); +- SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, X, C2); +- return DAG.getNode(ISD::AND, DL, VT, NewShift, NewRHS); +-} +- + SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; +@@ -26855,8 +26816,6 @@ + return performCTLZCombine(N, DAG, Subtarget); + case ISD::SCALAR_TO_VECTOR: + return performScalarToVectorCombine(N, DCI, DAG); +- case ISD::SHL: +- return performSHLCombine(N, DCI, DAG); + } + return SDValue(); + } +diff -ruN --strip-trailing-cr a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp ++++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +@@ -4979,7 +4979,7 @@ + // the subvector length. + const unsigned VecVF = getNumElements(Vec->getType()); + SmallVector Mask(VecVF, PoisonMaskElem); +- std::iota(Mask.begin(), std::next(Mask.begin(), Index), 0); ++ std::iota(Mask.begin(), Mask.end(), 0); + for (unsigned I : seq(SubVecVF)) + Mask[I + Index] = I + VecVF; + if (Generator) { +@@ -13956,11 +13956,12 @@ + Instruction *InsElt; + if (auto *VecTy = dyn_cast(Scalar->getType())) { + assert(SLPReVec && "FixedVectorType is not expected."); +- Vec = InsElt = cast(createInsertVector( +- Builder, Vec, Scalar, Pos * getNumElements(VecTy))); +- auto *II = dyn_cast(InsElt); ++ Vec = ++ createInsertVector(Builder, Vec, Scalar, Pos * getNumElements(VecTy)); ++ auto *II = dyn_cast(Vec); + if (!II || II->getIntrinsicID() != Intrinsic::vector_insert) + return Vec; ++ InsElt = II; + } else { + Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos)); + InsElt = dyn_cast(Vec); +diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/AArch64/const-shift-of-constmasked.ll b/llvm/test/CodeGen/AArch64/const-shift-of-constmasked.ll +--- a/llvm/test/CodeGen/AArch64/const-shift-of-constmasked.ll ++++ b/llvm/test/CodeGen/AArch64/const-shift-of-constmasked.ll +@@ -190,7 +190,8 @@ + define i8 @test_i8_7_mask_shl_1(i8 %a0) { + ; CHECK-LABEL: test_i8_7_mask_shl_1: + ; CHECK: // %bb.0: +-; CHECK-NEXT: ubfiz w0, w0, #1, #3 ++; CHECK-NEXT: and w8, w0, #0x7 ++; CHECK-NEXT: lsl w0, w8, #1 + ; CHECK-NEXT: ret + %t0 = and i8 %a0, 7 + %t1 = shl i8 %t0, 1 +@@ -199,7 +200,8 @@ + define i8 @test_i8_7_mask_shl_4(i8 %a0) { + ; CHECK-LABEL: test_i8_7_mask_shl_4: + ; CHECK: // %bb.0: +-; CHECK-NEXT: ubfiz w0, w0, #4, #3 ++; CHECK-NEXT: and w8, w0, #0x7 ++; CHECK-NEXT: lsl w0, w8, #4 + ; CHECK-NEXT: ret + %t0 = and i8 %a0, 7 + %t1 = shl i8 %t0, 4 +@@ -227,8 +229,8 @@ + define i8 @test_i8_28_mask_shl_1(i8 %a0) { + ; CHECK-LABEL: test_i8_28_mask_shl_1: + ; CHECK: // 
%bb.0: +-; CHECK-NEXT: lsl w8, w0, #1 +-; CHECK-NEXT: and w0, w8, #0x38 ++; CHECK-NEXT: and w8, w0, #0x1c ++; CHECK-NEXT: lsl w0, w8, #1 + ; CHECK-NEXT: ret + %t0 = and i8 %a0, 28 + %t1 = shl i8 %t0, 1 +@@ -237,8 +239,8 @@ + define i8 @test_i8_28_mask_shl_2(i8 %a0) { + ; CHECK-LABEL: test_i8_28_mask_shl_2: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsl w8, w0, #2 +-; CHECK-NEXT: and w0, w8, #0x70 ++; CHECK-NEXT: and w8, w0, #0x1c ++; CHECK-NEXT: lsl w0, w8, #2 + ; CHECK-NEXT: ret + %t0 = and i8 %a0, 28 + %t1 = shl i8 %t0, 2 +@@ -247,8 +249,8 @@ + define i8 @test_i8_28_mask_shl_3(i8 %a0) { + ; CHECK-LABEL: test_i8_28_mask_shl_3: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsl w8, w0, #3 +-; CHECK-NEXT: and w0, w8, #0xe0 ++; CHECK-NEXT: and w8, w0, #0x1c ++; CHECK-NEXT: lsl w0, w8, #3 + ; CHECK-NEXT: ret + %t0 = and i8 %a0, 28 + %t1 = shl i8 %t0, 3 +@@ -257,8 +259,8 @@ + define i8 @test_i8_28_mask_shl_4(i8 %a0) { + ; CHECK-LABEL: test_i8_28_mask_shl_4: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsl w8, w0, #4 +-; CHECK-NEXT: and w0, w8, #0xc0 ++; CHECK-NEXT: and w8, w0, #0xc ++; CHECK-NEXT: lsl w0, w8, #4 + ; CHECK-NEXT: ret + %t0 = and i8 %a0, 28 + %t1 = shl i8 %t0, 4 +@@ -268,8 +270,8 @@ + define i8 @test_i8_224_mask_shl_1(i8 %a0) { + ; CHECK-LABEL: test_i8_224_mask_shl_1: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsl w8, w0, #1 +-; CHECK-NEXT: and w0, w8, #0xc0 ++; CHECK-NEXT: and w8, w0, #0x60 ++; CHECK-NEXT: lsl w0, w8, #1 + ; CHECK-NEXT: ret + %t0 = and i8 %a0, 224 + %t1 = shl i8 %t0, 1 +@@ -463,7 +465,8 @@ + define i16 @test_i16_127_mask_shl_1(i16 %a0) { + ; CHECK-LABEL: test_i16_127_mask_shl_1: + ; CHECK: // %bb.0: +-; CHECK-NEXT: ubfiz w0, w0, #1, #7 ++; CHECK-NEXT: and w8, w0, #0x7f ++; CHECK-NEXT: lsl w0, w8, #1 + ; CHECK-NEXT: ret + %t0 = and i16 %a0, 127 + %t1 = shl i16 %t0, 1 +@@ -472,7 +475,8 @@ + define i16 @test_i16_127_mask_shl_8(i16 %a0) { + ; CHECK-LABEL: test_i16_127_mask_shl_8: + ; CHECK: // %bb.0: +-; CHECK-NEXT: ubfiz w0, w0, #8, #7 ++; CHECK-NEXT: and w8, w0, #0x7f ++; CHECK-NEXT: lsl w0, w8, #8 + ; CHECK-NEXT: ret + %t0 = and i16 %a0, 127 + %t1 = shl i16 %t0, 8 +@@ -500,8 +504,8 @@ + define i16 @test_i16_2032_mask_shl_3(i16 %a0) { + ; CHECK-LABEL: test_i16_2032_mask_shl_3: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsl w8, w0, #3 +-; CHECK-NEXT: and w0, w8, #0x3f80 ++; CHECK-NEXT: and w8, w0, #0x7f0 ++; CHECK-NEXT: lsl w0, w8, #3 + ; CHECK-NEXT: ret + %t0 = and i16 %a0, 2032 + %t1 = shl i16 %t0, 3 +@@ -510,8 +514,8 @@ + define i16 @test_i16_2032_mask_shl_4(i16 %a0) { + ; CHECK-LABEL: test_i16_2032_mask_shl_4: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsl w8, w0, #4 +-; CHECK-NEXT: and w0, w8, #0x7f00 ++; CHECK-NEXT: and w8, w0, #0x7f0 ++; CHECK-NEXT: lsl w0, w8, #4 + ; CHECK-NEXT: ret + %t0 = and i16 %a0, 2032 + %t1 = shl i16 %t0, 4 +@@ -520,8 +524,8 @@ + define i16 @test_i16_2032_mask_shl_5(i16 %a0) { + ; CHECK-LABEL: test_i16_2032_mask_shl_5: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsl w8, w0, #5 +-; CHECK-NEXT: and w0, w8, #0xfe00 ++; CHECK-NEXT: and w8, w0, #0x7f0 ++; CHECK-NEXT: lsl w0, w8, #5 + ; CHECK-NEXT: ret + %t0 = and i16 %a0, 2032 + %t1 = shl i16 %t0, 5 +@@ -530,8 +534,8 @@ + define i16 @test_i16_2032_mask_shl_6(i16 %a0) { + ; CHECK-LABEL: test_i16_2032_mask_shl_6: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsl w8, w0, #6 +-; CHECK-NEXT: and w0, w8, #0xfc00 ++; CHECK-NEXT: and w8, w0, #0x3f0 ++; CHECK-NEXT: lsl w0, w8, #6 + ; CHECK-NEXT: ret + %t0 = and i16 %a0, 2032 + %t1 = shl i16 %t0, 6 +@@ -541,8 +545,8 @@ + define i16 @test_i16_65024_mask_shl_1(i16 %a0) { + ; CHECK-LABEL: 
test_i16_65024_mask_shl_1: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsl w8, w0, #1 +-; CHECK-NEXT: and w0, w8, #0xfc00 ++; CHECK-NEXT: and w8, w0, #0x7e00 ++; CHECK-NEXT: lsl w0, w8, #1 + ; CHECK-NEXT: ret + %t0 = and i16 %a0, 65024 + %t1 = shl i16 %t0, 1 +@@ -736,7 +740,8 @@ + define i32 @test_i32_32767_mask_shl_1(i32 %a0) { + ; CHECK-LABEL: test_i32_32767_mask_shl_1: + ; CHECK: // %bb.0: +-; CHECK-NEXT: ubfiz w0, w0, #1, #15 ++; CHECK-NEXT: and w8, w0, #0x7fff ++; CHECK-NEXT: lsl w0, w8, #1 + ; CHECK-NEXT: ret + %t0 = and i32 %a0, 32767 + %t1 = shl i32 %t0, 1 +@@ -745,7 +750,8 @@ + define i32 @test_i32_32767_mask_shl_16(i32 %a0) { + ; CHECK-LABEL: test_i32_32767_mask_shl_16: + ; CHECK: // %bb.0: +-; CHECK-NEXT: ubfiz w0, w0, #16, #15 ++; CHECK-NEXT: and w8, w0, #0x7fff ++; CHECK-NEXT: lsl w0, w8, #16 + ; CHECK-NEXT: ret + %t0 = and i32 %a0, 32767 + %t1 = shl i32 %t0, 16 +@@ -773,8 +779,8 @@ + define i32 @test_i32_8388352_mask_shl_7(i32 %a0) { + ; CHECK-LABEL: test_i32_8388352_mask_shl_7: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsl w8, w0, #7 +-; CHECK-NEXT: and w0, w8, #0x3fff8000 ++; CHECK-NEXT: and w8, w0, #0x7fff00 ++; CHECK-NEXT: lsl w0, w8, #7 + ; CHECK-NEXT: ret + %t0 = and i32 %a0, 8388352 + %t1 = shl i32 %t0, 7 +@@ -783,8 +789,8 @@ + define i32 @test_i32_8388352_mask_shl_8(i32 %a0) { + ; CHECK-LABEL: test_i32_8388352_mask_shl_8: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsl w8, w0, #8 +-; CHECK-NEXT: and w0, w8, #0x7fff0000 ++; CHECK-NEXT: and w8, w0, #0x7fff00 ++; CHECK-NEXT: lsl w0, w8, #8 + ; CHECK-NEXT: ret + %t0 = and i32 %a0, 8388352 + %t1 = shl i32 %t0, 8 +@@ -793,8 +799,8 @@ + define i32 @test_i32_8388352_mask_shl_9(i32 %a0) { + ; CHECK-LABEL: test_i32_8388352_mask_shl_9: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsl w8, w0, #9 +-; CHECK-NEXT: and w0, w8, #0xfffe0000 ++; CHECK-NEXT: and w8, w0, #0x7fff00 ++; CHECK-NEXT: lsl w0, w8, #9 + ; CHECK-NEXT: ret + %t0 = and i32 %a0, 8388352 + %t1 = shl i32 %t0, 9 +@@ -803,8 +809,8 @@ + define i32 @test_i32_8388352_mask_shl_10(i32 %a0) { + ; CHECK-LABEL: test_i32_8388352_mask_shl_10: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsl w8, w0, #10 +-; CHECK-NEXT: and w0, w8, #0xfffc0000 ++; CHECK-NEXT: and w8, w0, #0x3fff00 ++; CHECK-NEXT: lsl w0, w8, #10 + ; CHECK-NEXT: ret + %t0 = and i32 %a0, 8388352 + %t1 = shl i32 %t0, 10 +@@ -814,8 +820,8 @@ + define i32 @test_i32_4294836224_mask_shl_1(i32 %a0) { + ; CHECK-LABEL: test_i32_4294836224_mask_shl_1: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsl w8, w0, #1 +-; CHECK-NEXT: and w0, w8, #0xfffc0000 ++; CHECK-NEXT: and w8, w0, #0x7ffe0000 ++; CHECK-NEXT: lsl w0, w8, #1 + ; CHECK-NEXT: ret + %t0 = and i32 %a0, 4294836224 + %t1 = shl i32 %t0, 1 +@@ -1009,7 +1015,8 @@ + define i64 @test_i64_2147483647_mask_shl_1(i64 %a0) { + ; CHECK-LABEL: test_i64_2147483647_mask_shl_1: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsl w0, w0, #1 ++; CHECK-NEXT: and x8, x0, #0x7fffffff ++; CHECK-NEXT: lsl x0, x8, #1 + ; CHECK-NEXT: ret + %t0 = and i64 %a0, 2147483647 + %t1 = shl i64 %t0, 1 +@@ -1047,8 +1054,8 @@ + define i64 @test_i64_140737488289792_mask_shl_15(i64 %a0) { + ; CHECK-LABEL: test_i64_140737488289792_mask_shl_15: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsl x8, x0, #15 +-; CHECK-NEXT: and x0, x8, #0x3fffffff80000000 ++; CHECK-NEXT: and x8, x0, #0x7fffffff0000 ++; CHECK-NEXT: lsl x0, x8, #15 + ; CHECK-NEXT: ret + %t0 = and i64 %a0, 140737488289792 + %t1 = shl i64 %t0, 15 +@@ -1057,8 +1064,8 @@ + define i64 @test_i64_140737488289792_mask_shl_16(i64 %a0) { + ; CHECK-LABEL: test_i64_140737488289792_mask_shl_16: + ; CHECK: // %bb.0: +-; 
CHECK-NEXT: lsl x8, x0, #16 +-; CHECK-NEXT: and x0, x8, #0x7fffffff00000000 ++; CHECK-NEXT: and x8, x0, #0x7fffffff0000 ++; CHECK-NEXT: lsl x0, x8, #16 + ; CHECK-NEXT: ret + %t0 = and i64 %a0, 140737488289792 + %t1 = shl i64 %t0, 16 +@@ -1067,8 +1074,8 @@ + define i64 @test_i64_140737488289792_mask_shl_17(i64 %a0) { + ; CHECK-LABEL: test_i64_140737488289792_mask_shl_17: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsl x8, x0, #17 +-; CHECK-NEXT: and x0, x8, #0xfffffffe00000000 ++; CHECK-NEXT: and x8, x0, #0x7fffffff0000 ++; CHECK-NEXT: lsl x0, x8, #17 + ; CHECK-NEXT: ret + %t0 = and i64 %a0, 140737488289792 + %t1 = shl i64 %t0, 17 +@@ -1077,8 +1084,8 @@ + define i64 @test_i64_140737488289792_mask_shl_18(i64 %a0) { + ; CHECK-LABEL: test_i64_140737488289792_mask_shl_18: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsl x8, x0, #18 +-; CHECK-NEXT: and x0, x8, #0xfffffffc00000000 ++; CHECK-NEXT: and x8, x0, #0x3fffffff0000 ++; CHECK-NEXT: lsl x0, x8, #18 + ; CHECK-NEXT: ret + %t0 = and i64 %a0, 140737488289792 + %t1 = shl i64 %t0, 18 +@@ -1088,8 +1095,8 @@ + define i64 @test_i64_18446744065119617024_mask_shl_1(i64 %a0) { + ; CHECK-LABEL: test_i64_18446744065119617024_mask_shl_1: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsl x8, x0, #1 +-; CHECK-NEXT: and x0, x8, #0xfffffffc00000000 ++; CHECK-NEXT: and x8, x0, #0x7ffffffe00000000 ++; CHECK-NEXT: lsl x0, x8, #1 + ; CHECK-NEXT: ret + %t0 = and i64 %a0, 18446744065119617024 + %t1 = shl i64 %t0, 1 +diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/AArch64/extract-bits.ll b/llvm/test/CodeGen/AArch64/extract-bits.ll +--- a/llvm/test/CodeGen/AArch64/extract-bits.ll ++++ b/llvm/test/CodeGen/AArch64/extract-bits.ll +@@ -1013,8 +1013,8 @@ + define i32 @c2_i32(i32 %arg) nounwind { + ; CHECK-LABEL: c2_i32: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsr w8, w0, #17 +-; CHECK-NEXT: and w0, w8, #0xffc ++; CHECK-NEXT: ubfx w8, w0, #19, #10 ++; CHECK-NEXT: lsl w0, w8, #2 + ; CHECK-NEXT: ret + %tmp0 = lshr i32 %arg, 19 + %tmp1 = and i32 %tmp0, 1023 +@@ -1063,8 +1063,8 @@ + define i64 @c2_i64(i64 %arg) nounwind { + ; CHECK-LABEL: c2_i64: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsr x8, x0, #49 +-; CHECK-NEXT: and x0, x8, #0xffc ++; CHECK-NEXT: ubfx x8, x0, #51, #10 ++; CHECK-NEXT: lsl x0, x8, #2 + ; CHECK-NEXT: ret + %tmp0 = lshr i64 %arg, 51 + %tmp1 = and i64 %tmp0, 1023 +@@ -1120,8 +1120,8 @@ + define void @c7_i32(i32 %arg, ptr %ptr) nounwind { + ; CHECK-LABEL: c7_i32: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsr w8, w0, #17 +-; CHECK-NEXT: and w8, w8, #0xffc ++; CHECK-NEXT: ubfx w8, w0, #19, #10 ++; CHECK-NEXT: lsl w8, w8, #2 + ; CHECK-NEXT: str w8, [x1] + ; CHECK-NEXT: ret + %tmp0 = lshr i32 %arg, 19 +@@ -1163,8 +1163,8 @@ + define void @c7_i64(i64 %arg, ptr %ptr) nounwind { + ; CHECK-LABEL: c7_i64: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsr x8, x0, #49 +-; CHECK-NEXT: and x8, x8, #0xffc ++; CHECK-NEXT: ubfx x8, x0, #51, #10 ++; CHECK-NEXT: lsl x8, x8, #2 + ; CHECK-NEXT: str x8, [x1] + ; CHECK-NEXT: ret + %tmp0 = lshr i64 %arg, 51 +diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/AArch64/fpenv.ll b/llvm/test/CodeGen/AArch64/fpenv.ll +--- a/llvm/test/CodeGen/AArch64/fpenv.ll ++++ b/llvm/test/CodeGen/AArch64/fpenv.ll +@@ -4,11 +4,11 @@ + define void @func_set_rounding_dyn(i32 %rm) { + ; CHECK-LABEL: func_set_rounding_dyn: + ; CHECK: // %bb.0: +-; CHECK-NEXT: lsl w9, w0, #22 ++; CHECK-NEXT: sub w9, w0, #1 + ; CHECK-NEXT: mrs x8, FPCR ++; CHECK-NEXT: and w9, w9, #0x3 + ; CHECK-NEXT: and x8, x8, #0xffffffffff3fffff +-; CHECK-NEXT: sub w9, w9, #1024, lsl #12 // =4194304 +-; CHECK-NEXT: and w9, w9, 
#0xc00000 ++; CHECK-NEXT: lsl w9, w9, #22 + ; CHECK-NEXT: orr x8, x8, x9 + ; CHECK-NEXT: msr FPCR, x8 + ; CHECK-NEXT: ret +diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/AArch64/xbfiz.ll b/llvm/test/CodeGen/AArch64/xbfiz.ll +--- a/llvm/test/CodeGen/AArch64/xbfiz.ll ++++ b/llvm/test/CodeGen/AArch64/xbfiz.ll +@@ -69,19 +69,3 @@ + %and = and i64 %shl, 4294967295 + ret i64 %and + } +- +-define i64 @lsl_zext_i8_i64(i8 %b) { +-; CHECK-LABEL: lsl_zext_i8_i64: +-; CHECK: ubfiz x0, x0, #1, #8 +- %1 = zext i8 %b to i64 +- %2 = shl i64 %1, 1 +- ret i64 %2 -} - - } // namespace -diff -ruN --strip-trailing-cr a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel ---- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel -+++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel -@@ -43,10 +43,7 @@ - - gentbl( - name = "diagnostic_defs_gen", -- tbl_outs = [( -- "-gen-clang-diags-defs -clang-component=%s" % c, -- "include/clang/Basic/Diagnostic%sKinds.inc" % c, -- ) for c in [ -+ tbl_outs = [out for c in [ - "AST", - "Analysis", - "Comment", -@@ -60,6 +57,15 @@ - "Refactoring", - "Sema", - "Serialization", -+ ] for out in [ -+ ( -+ "-gen-clang-diags-defs -clang-component=%s" % c, -+ "include/clang/Basic/Diagnostic%sKinds.inc" % c, -+ ), -+ ( -+ "-gen-clang-diags-enums -clang-component=%s" % c, -+ "include/clang/Basic/Diagnostic%sEnums.inc" % c, -+ ), - ]] + [ - ( - "-gen-clang-diag-groups", +-define i64 @lsl_zext_i16_i64(i16 %b) { +-; CHECK-LABEL: lsl_zext_i16_i64: +-; CHECK: ubfiz x0, x0, #1, #16 +- %1 = zext i16 %b to i64 +- %2 = shl i64 %1, 1 +- ret i64 %2 +-} +diff -ruN --strip-trailing-cr a/llvm/test/Transforms/SLPVectorizer/X86/insert-subvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-subvector.ll +--- a/llvm/test/Transforms/SLPVectorizer/X86/insert-subvector.ll ++++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-subvector.ll +@@ -0,0 +1,81 @@ ++; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 ++; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s ++ ++define <16 x double> @test(ptr %x, double %v, double %a) { ++; CHECK-LABEL: define <16 x double> @test( ++; CHECK-SAME: ptr [[X:%.*]], double [[V:%.*]], double [[A:%.*]]) { ++; CHECK-NEXT: [[GEP6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 8 ++; CHECK-NEXT: [[GEP8:%.*]] = getelementptr inbounds double, ptr [[X]], i64 9 ++; CHECK-NEXT: [[TMP1:%.*]] = load <6 x double>, ptr [[X]], align 4 ++; CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, ptr [[GEP6]], align 4 ++; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, ptr [[GEP8]], align 4 ++; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x double> poison, double [[A]], i32 0 ++; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x double> [[TMP4]], <16 x double> poison, <16 x i32> zeroinitializer ++; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> poison, double [[V]], i32 0 ++; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <4 x i32> zeroinitializer ++; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> poison, double [[V]], i32 0 ++; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> poison, <2 x i32> zeroinitializer ++; CHECK-NEXT: [[TMP10:%.*]] = call <16 x double> @llvm.vector.insert.v16f64.v6f64(<16 x double> poison, <6 x double> [[TMP1]], i64 0) ++; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <16 x i32> ++; CHECK-NEXT: [[TMP12:%.*]] = 
shufflevector <16 x double> [[TMP10]], <16 x double> [[TMP11]], <16 x i32> ++; CHECK-NEXT: [[TMP13:%.*]] = call <16 x double> @llvm.vector.insert.v16f64.v2f64(<16 x double> [[TMP12]], <2 x double> [[TMP6]], i64 6) ++; CHECK-NEXT: [[TMP14:%.*]] = call <16 x double> @llvm.vector.insert.v16f64.v2f64(<16 x double> [[TMP13]], <2 x double> [[TMP7]], i64 8) ++; CHECK-NEXT: [[TMP15:%.*]] = call <16 x double> @llvm.vector.insert.v16f64.v2f64(<16 x double> [[TMP14]], <2 x double> [[TMP9]], i64 10) ++; CHECK-NEXT: [[TMP16:%.*]] = call <16 x double> @llvm.vector.insert.v16f64.v2f64(<16 x double> [[TMP15]], <2 x double> [[TMP9]], i64 12) ++; CHECK-NEXT: [[TMP17:%.*]] = call <16 x double> @llvm.vector.insert.v16f64.v2f64(<16 x double> [[TMP16]], <2 x double> [[TMP9]], i64 14) ++; CHECK-NEXT: [[TMP18:%.*]] = fadd <16 x double> [[TMP5]], [[TMP17]] ++; CHECK-NEXT: ret <16 x double> [[TMP18]] ++; ++ %gep1 = getelementptr inbounds double, ptr %x, i64 1 ++ %gep2 = getelementptr inbounds double, ptr %x, i64 2 ++ %gep3 = getelementptr inbounds double, ptr %x, i64 3 ++ %gep4 = getelementptr inbounds double, ptr %x, i64 4 ++ %gep5 = getelementptr inbounds double, ptr %x, i64 5 ++ %gep6 = getelementptr inbounds double, ptr %x, i64 8 ++ %gep7 = getelementptr inbounds double, ptr %x, i64 9 ++ %gep8 = getelementptr inbounds double, ptr %x, i64 9 ++ %gep9 = getelementptr inbounds double, ptr %x, i64 10 ++ %x0 = load double, ptr %x, align 4 ++ %x1 = load double, ptr %gep1, align 4 ++ %x2 = load double, ptr %gep2, align 4 ++ %x3 = load double, ptr %gep3, align 4 ++ %x4 = load double, ptr %gep4, align 4 ++ %x5 = load double, ptr %gep5, align 4 ++ %x6 = load double, ptr %gep6, align 4 ++ %x7 = load double, ptr %gep7, align 4 ++ %x8 = load double, ptr %gep8, align 4 ++ %x9 = load double, ptr %gep9, align 4 ++ %add1 = fadd double %a, %x0 ++ %add2 = fadd double %a, %x1 ++ %add3 = fadd double %a, %x2 ++ %add4 = fadd double %a, %x3 ++ %add5 = fadd double %a, %x4 ++ %add6 = fadd double %a, %x5 ++ %add7 = fadd double %a, %x6 ++ %add8 = fadd double %a, %x7 ++ %add9 = fadd double %a, %x8 ++ %add10 = fadd double %a, %x9 ++ %add11 = fadd double %a, %v ++ %add12 = fadd double %a, %v ++ %add13 = fadd double %a, %v ++ %add14 = fadd double %a, %v ++ %add15 = fadd double %a, %v ++ %add16 = fadd double %a, %v ++ %i0 = insertelement <16 x double> poison, double %add1, i32 0 ++ %i1 = insertelement <16 x double> %i0, double %add2, i32 1 ++ %i2 = insertelement <16 x double> %i1, double %add3, i32 2 ++ %i3 = insertelement <16 x double> %i2, double %add4, i32 3 ++ %i4 = insertelement <16 x double> %i3, double %add5, i32 4 ++ %i5 = insertelement <16 x double> %i4, double %add6, i32 5 ++ %i6 = insertelement <16 x double> %i5, double %add7, i32 6 ++ %i7 = insertelement <16 x double> %i6, double %add8, i32 7 ++ %i8 = insertelement <16 x double> %i7, double %add9, i32 8 ++ %i9 = insertelement <16 x double> %i8, double %add10, i32 9 ++ %i10 = insertelement <16 x double> %i9, double %add11, i32 10 ++ %i11 = insertelement <16 x double> %i10, double %add12, i32 11 ++ %i12 = insertelement <16 x double> %i11, double %add13, i32 12 ++ %i13 = insertelement <16 x double> %i12, double %add14, i32 13 ++ %i14 = insertelement <16 x double> %i13, double %add15, i32 14 ++ %i15 = insertelement <16 x double> %i14, double %add16, i32 15 ++ ret <16 x double> %i15 ++} diff --git a/third_party/tsl/third_party/llvm/workspace.bzl b/third_party/tsl/third_party/llvm/workspace.bzl index 4706c63c0e1cc..cb092919de358 100644 --- a/third_party/tsl/third_party/llvm/workspace.bzl 
+++ b/third_party/tsl/third_party/llvm/workspace.bzl
@@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive")
 
 def repo(name):
 """Imports LLVM."""
- LLVM_COMMIT = "bf17016a92bc8a23d2cdd2b51355dd4eb5019c68"
- LLVM_SHA256 = "ba09f12e5019f5aca531b1733275f0a10b181d6f894deb1a4610e017f76b172a"
+ LLVM_COMMIT = "13c761789753862a7cc31a2a26f23010afa668b9"
+ LLVM_SHA256 = "587f3eda6d00d751cbfc69fa5a15475ae4232e191ace04031b343e4e8ae16355"
 tf_http_archive(
 name = name,
diff --git a/xla/backends/gpu/codegen/emitters/transforms/vectorize_loads_stores.cc b/xla/backends/gpu/codegen/emitters/transforms/vectorize_loads_stores.cc
index 2c14e7e299772..a493ffcd2c4bd 100644
--- a/xla/backends/gpu/codegen/emitters/transforms/vectorize_loads_stores.cc
+++ b/xla/backends/gpu/codegen/emitters/transforms/vectorize_loads_stores.cc
@@ -176,8 +176,7 @@ mlir::VectorType GetVectorTypeForAtomicRMW(mlir::RankedTensorType tensor_type,
 return nullptr;
 }
 
- if (tensor_type.getElementType() !=
- mlir::FloatType::getF32(loop.getContext()))
+ if (tensor_type.getElementType() != mlir::Float32Type::get(loop.getContext()))
 return nullptr;
 
 if (mlir::getConstantIntValue(loop.getStep()) != 1 ||