[Matrix] Fix miscompile for NT matmul if the transpose has other use

We should only add the fake lowering entry for the matrix remark if the transpose is not lowered on its own. `MapVector::insert` is used to insert the entry during proper lowering which does not overwrite the fake entry in the map. We actually had test coverage for this but the reference output code was wrong; it was storing undef rather than the transposed column. Also add an assert that would have caught this. Differential Revision: https://reviews.llvm.org/D106457
2025-01-31 20:51:52 +01:00 · 2021-07-21 09:46:47 -07:00 · 2021-07-21 09:46:47 -07:00 · cf091349e0
commit cf091349e0
parent 54bb3c6144
2 changed files with 7 additions and 6 deletions
--- a/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@ -1140,7 +1140,8 @@ public:
  /// deletion.
  void finalizeLowering(Instruction *Inst, MatrixTy Matrix,
                        IRBuilder<> &Builder) {
-    Inst2ColumnMatrix.insert(std::make_pair(Inst, Matrix));
+    auto inserted = Inst2ColumnMatrix.insert(std::make_pair(Inst, Matrix));
+    assert(inserted.second && "multiple matrix lowering mapping");

    ToRemove.push_back(Inst);
    Value *Flattened = nullptr;
@ -1540,11 +1541,11 @@ public:
      if (Transpose->hasOneUse()) {
        FusedInsts.insert(cast<Instruction>(Transpose));
        ToRemove.push_back(cast<Instruction>(Transpose));
+        // TODO: add a fake entry for the folded instruction so that this is
+        // included in the expression in the remark.
+        Inst2ColumnMatrix[Transpose] = MatrixTy(M, C, EltType);
      }
      finalizeLowering(MatMul, Result, Builder);
-      // TODO: add a fake entry for the folded instruction so that this is
-      // included in the expression in the remark.
-      Inst2ColumnMatrix[Transpose] = MatrixTy(M, C, EltType);
      return;
    }

--- a/test/Transforms/LowerMatrixIntrinsics/multiply-right-transpose.ll
+++ b/test/Transforms/LowerMatrixIntrinsics/multiply-right-transpose.ll
@ -91,10 +91,10 @@ define <4 x double> @multiply_right_transpose_2x3x2(<6 x double> %a, <6 x double
 ; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <3 x double> [[TMP9]], double [[TMP10]], i64 2
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <6 x double>* [[P:%.*]] to double*
 ; CHECK-NEXT:    [[VEC_CAST:%.*]] = bitcast double* [[TMP12]] to <3 x double>*
-; CHECK-NEXT:    store <3 x double> undef, <3 x double>* [[VEC_CAST]], align 16
+; CHECK-NEXT:    store <3 x double> [[TMP5]], <3 x double>* [[VEC_CAST]], align 16
 ; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP12]], i64 3
 ; CHECK-NEXT:    [[VEC_CAST42:%.*]] = bitcast double* [[VEC_GEP]] to <3 x double>*
-; CHECK-NEXT:    store <3 x double> undef, <3 x double>* [[VEC_CAST42]], align 8
+; CHECK-NEXT:    store <3 x double> [[TMP11]], <3 x double>* [[VEC_CAST42]], align 8
 ; CHECK-NEXT:    [[SPLIT:%.*]] = shufflevector <6 x double> [[A:%.*]], <6 x double> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    [[SPLIT1:%.*]] = shufflevector <6 x double> [[A]], <6 x double> poison, <2 x i32> <i32 2, i32 3>
 ; CHECK-NEXT:    [[SPLIT2:%.*]] = shufflevector <6 x double> [[A]], <6 x double> poison, <2 x i32> <i32 4, i32 5>