[Matrix] Add forward shape propagation and first shape-aware lowerings.

This patch adds infrastructure for forward shape propagation to
LowerMatrixIntrinsics. It also updates the pass to use the shape
information to break up larger vector operations and to eliminate
unnecessary conversions between columnwise matrices and flattened
vectors: if shape information is available for an instruction, the
operation is lowered to a set of instructions operating on columns.
For example, a store of a matrix is broken down into separate stores
for each column. For users that do not have shape information (e.g.
because they do not yet support shape-aware lowering), we pack the
result columns into a flat vector and update those users.

It also adds shape-aware lowering for the first non-intrinsic
instruction: vector stores.
Example:

For

  %c = call <4 x double> @llvm.matrix.transpose(<4 x double> %a, i32 2, i32 2)
  store <4 x double> %c, <4 x double>* %Ptr

we generate the code below without shape propagation. Note %9, which
combines the columns of the transposed matrix into a flat vector:
  %split = shufflevector <4 x double> %a, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  %split1 = shufflevector <4 x double> %a, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  %1 = extractelement <2 x double> %split, i64 0
  %2 = insertelement <2 x double> undef, double %1, i64 0
  %3 = extractelement <2 x double> %split1, i64 0
  %4 = insertelement <2 x double> %2, double %3, i64 1
  %5 = extractelement <2 x double> %split, i64 1
  %6 = insertelement <2 x double> undef, double %5, i64 0
  %7 = extractelement <2 x double> %split1, i64 1
  %8 = insertelement <2 x double> %6, double %7, i64 1
  %9 = shufflevector <2 x double> %4, <2 x double> %8, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  store <4 x double> %9, <4 x double>* %Ptr
With this patch, we propagate the 2x2 shape information from the
transpose to the store and generate the code below. Note that we store
the columns directly and do not need the extra shuffle:
  %9 = bitcast <4 x double>* %Ptr to double*
  %10 = bitcast double* %9 to <2 x double>*
  store <2 x double> %4, <2 x double>* %10, align 8
  %11 = getelementptr double, double* %9, i32 2
  %12 = bitcast double* %11 to <2 x double>*
  store <2 x double> %8, <2 x double>* %12, align 8
Reviewers: anemet, Gerolf, reames, hfinkel, andrew.w.kaylor
Reviewed By: anemet
Differential Revision: https://reviews.llvm.org/D70897
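
To illustrate the mechanics, below is a minimal sketch (not the actual pass
code) of how a shape-aware store lowering can split a flattened, column-major
matrix value into one vector store per column using IRBuilder. The helper
name and signature are hypothetical:

  #include "llvm/ADT/SmallVector.h"
  #include "llvm/IR/IRBuilder.h"
  #include "llvm/Support/Alignment.h"

  using namespace llvm;

  // Hypothetical helper: store the flattened, column-major matrix value
  // Matrix (NumRows x NumCols elements) to Ptr, one column at a time.
  static void emitColumnwiseStore(IRBuilder<> &Builder, Value *Matrix,
                                  Value *Ptr, unsigned NumRows,
                                  unsigned NumCols) {
    Type *EltTy = cast<FixedVectorType>(Matrix->getType())->getElementType();
    // View the destination as a flat array of matrix elements.
    Value *EltPtr = Builder.CreateBitCast(Ptr, EltTy->getPointerTo());
    for (unsigned C = 0; C != NumCols; ++C) {
      // Extract column C: elements [C*NumRows, C*NumRows + NumRows).
      SmallVector<int, 8> Mask;
      for (unsigned R = 0; R != NumRows; ++R)
        Mask.push_back(C * NumRows + R);
      Value *Column = Builder.CreateShuffleVector(Matrix, Mask);
      // Columns are contiguous, so column C starts at element C * NumRows.
      Value *ColEltPtr = Builder.CreateConstGEP1_64(EltTy, EltPtr, C * NumRows);
      Value *ColPtr =
          Builder.CreateBitCast(ColEltPtr, Column->getType()->getPointerTo());
      Builder.CreateAlignedStore(Column, ColPtr, Align(8));
    }
  }

For the 2x2 example above, this emits two <2 x double> stores at element
offsets 0 and 2, matching the IR shown after the patch.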
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -lower-matrix-intrinsics -S < %s | FileCheck %s
; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s
; Currently we only lower stores with shape information, but need to embed the
; matrix in a flat vector for function calls and returns.
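; In the checks below, the store of %transposed is therefore split into one
; store per column, while the call to @foo and the return still use the
; re-flattened <8 x double> vector ([[TMP19]]).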
define <8 x double> @strided_load_4x4(<8 x double> %in, <8 x double>* %Ptr) {
; CHECK-LABEL: @strided_load_4x4(
; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <8 x double> [[IN:%.*]], <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <8 x double> [[IN]], <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x double> [[SPLIT]], i64 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[TMP1]], i64 0
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x double> [[SPLIT1]], i64 0
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP3]], i64 1
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x double> [[SPLIT]], i64 1
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> undef, double [[TMP5]], i64 0
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x double> [[SPLIT1]], i64 1
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP6]], double [[TMP7]], i64 1
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x double> [[SPLIT]], i64 2
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x double> undef, double [[TMP9]], i64 0
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x double> [[SPLIT1]], i64 2
; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x double> [[TMP10]], double [[TMP11]], i64 1
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x double> [[SPLIT]], i64 3
; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x double> undef, double [[TMP13]], i64 0
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x double> [[SPLIT1]], i64 3
; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x double> [[TMP14]], double [[TMP15]], i64 1
; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x double> [[TMP12]], <2 x double> [[TMP16]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x double> [[TMP17]], <4 x double> [[TMP18]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP20:%.*]] = bitcast <8 x double>* [[PTR:%.*]] to double*
; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast double* [[TMP20]] to <2 x double>*
; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[VEC_CAST]], align 8
; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP20]], i64 2
; CHECK-NEXT: [[VEC_CAST2:%.*]] = bitcast double* [[VEC_GEP]] to <2 x double>*
; CHECK-NEXT: store <2 x double> [[TMP8]], <2 x double>* [[VEC_CAST2]], align 8
; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, double* [[TMP20]], i64 4
; CHECK-NEXT: [[VEC_CAST4:%.*]] = bitcast double* [[VEC_GEP3]] to <2 x double>*
; CHECK-NEXT: store <2 x double> [[TMP12]], <2 x double>* [[VEC_CAST4]], align 8
; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, double* [[TMP20]], i64 6
; CHECK-NEXT: [[VEC_CAST6:%.*]] = bitcast double* [[VEC_GEP5]] to <2 x double>*
; CHECK-NEXT: store <2 x double> [[TMP16]], <2 x double>* [[VEC_CAST6]], align 8
; CHECK-NEXT: call void @foo(<8 x double> [[TMP19]])
; CHECK-NEXT: ret <8 x double> [[TMP19]]
;
  %transposed = call <8 x double> @llvm.matrix.transpose(<8 x double> %in, i32 4, i32 2)
  store <8 x double> %transposed, <8 x double>* %Ptr, align 8
  call void @foo(<8 x double> %transposed)
  ret <8 x double> %transposed
}
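
; The transpose intrinsic operates on the flattened, column-major matrix;
; the trailing i32 arguments are the row and column counts of its input.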
declare <8 x double> @llvm.matrix.transpose(<8 x double>, i32, i32)
declare void @foo(<8 x double>)