llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-llvm-transforms Author: Matthias Braun (MatzeB) <details> <summary>Changes</summary> As shufflevector is effectively commutative (its two inputs can be swapped as long as the mask indices are remapped accordingly), we should apply the same logic as for other commutative operations, where we order the inputs by their `getComplexity()` value. This will put things like `undef`, `poison` and constants on the right-hand side where possible. We had a rule moving `undef` to the right-hand side that is superseded by this. --- Patch is 55.34 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/113212.diff 10 Files Affected: - (modified) clang/test/CodeGen/X86/avx-shuffle-builtins.c (+450-72) - (modified) clang/test/CodeGen/X86/sse.c (+5-6) - (modified) llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp (+18-13) - (modified) llvm/test/Analysis/ValueTracking/knownbits-x86-hadd-hsub.ll (+1-1) - (modified) llvm/test/Transforms/InstCombine/insert-extract-shuffle-inseltpoison.ll (+2-2) - (modified) llvm/test/Transforms/InstCombine/insert-extract-shuffle.ll (+3-3) - (modified) llvm/test/Transforms/InstCombine/vec_shuffle-inseltpoison.ll (+1-1) - (modified) llvm/test/Transforms/InstCombine/vec_shuffle.ll (+41-2) - (modified) llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll (+1-1) - (modified) llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll (+1-1) ``````````diff diff --git a/clang/test/CodeGen/X86/avx-shuffle-builtins.c b/clang/test/CodeGen/X86/avx-shuffle-builtins.c index d184d28f3e07aa..8d5b2c1d8c4394 100644 --- a/clang/test/CodeGen/X86/avx-shuffle-builtins.c +++ b/clang/test/CodeGen/X86/avx-shuffle-builtins.c @@ -1,7 +1,7 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 // REQUIRES: x86-registered-target -// RUN: %clang_cc1 -ffreestanding %s -O3 -triple=x86_64-apple-darwin -target-feature +avx -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,X64 -// RUN: %clang_cc1 -ffreestanding %s -O3 
-triple=i386-apple-darwin -target-feature +avx -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,X86 -// FIXME: This is testing optimized generation of shuffle instructions and should be fixed. +// RUN: %clang_cc1 -ffreestanding %s -target-feature +avx -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -ffreestanding %s -target-feature +avx -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK #include <immintrin.h> @@ -10,201 +10,579 @@ // Test LLVM IR codegen of shuffle instructions, checking if the masks are correct // +// CHECK-LABEL: define dso_local <8 x float> @x( +// CHECK-SAME: <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x float>, align 32 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <8 x float>, align 32 +// CHECK-NEXT: store <8 x float> [[A]], ptr [[A_ADDR]], align 32 +// CHECK-NEXT: store <8 x float> [[B]], ptr [[B_ADDR]], align 32 +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32 +// CHECK-NEXT: [[SHUFP:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> [[TMP1]], <8 x i32> <i32 3, i32 2, i32 8, i32 11, i32 7, i32 6, i32 12, i32 15> +// CHECK-NEXT: ret <8 x float> [[SHUFP]] +// __m256 x(__m256 a, __m256 b) { - // CHECK-LABEL: x - // CHECK: shufflevector{{.*}}<i32 3, i32 2, i32 8, i32 11, i32 7, i32 6, i32 12, i32 15> return _mm256_shuffle_ps(a, b, 203); } +// CHECK-LABEL: define dso_local <2 x double> @test_mm_permute_pd( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16 +// CHECK-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[PERMIL:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> 
<i32 1, i32 0> +// CHECK-NEXT: ret <2 x double> [[PERMIL]] +// __m128d test_mm_permute_pd(__m128d a) { - // CHECK-LABEL: test_mm_permute_pd - // CHECK: shufflevector{{.*}}<i32 1, i32 0> return _mm_permute_pd(a, 1); } +// CHECK-LABEL: define dso_local <4 x double> @test_mm256_permute_pd( +// CHECK-SAME: <4 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x double>, align 32 +// CHECK-NEXT: store <4 x double> [[A]], ptr [[A_ADDR]], align 32 +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32 +// CHECK-NEXT: [[PERMIL:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2> +// CHECK-NEXT: ret <4 x double> [[PERMIL]] +// __m256d test_mm256_permute_pd(__m256d a) { - // CHECK-LABEL: test_mm256_permute_pd - // CHECK: shufflevector{{.*}}<i32 1, i32 0, i32 3, i32 2> return _mm256_permute_pd(a, 5); } +// CHECK-LABEL: define dso_local <4 x float> @test_mm_permute_ps( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x float>, align 16 +// CHECK-NEXT: store <4 x float> [[A]], ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[PERMIL:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> +// CHECK-NEXT: ret <4 x float> [[PERMIL]] +// __m128 test_mm_permute_ps(__m128 a) { - // CHECK-LABEL: test_mm_permute_ps - // CHECK: shufflevector{{.*}}<i32 3, i32 2, i32 1, i32 0> return _mm_permute_ps(a, 0x1b); } // Test case for PR12401 +// CHECK-LABEL: define dso_local <4 x float> @test_mm_permute_ps2( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x float>, align 16 +// CHECK-NEXT: store <4 x float> [[A]], ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x 
float>, ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[PERMIL:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> <i32 2, i32 1, i32 2, i32 3> +// CHECK-NEXT: ret <4 x float> [[PERMIL]] +// __m128 test_mm_permute_ps2(__m128 a) { - // CHECK-LABEL: test_mm_permute_ps2 - // CHECK: shufflevector{{.*}}<i32 2, i32 1, i32 2, i32 3> return _mm_permute_ps(a, 0xe6); } +// CHECK-LABEL: define dso_local <8 x float> @test_mm256_permute_ps( +// CHECK-SAME: <8 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x float>, align 32 +// CHECK-NEXT: store <8 x float> [[A]], ptr [[A_ADDR]], align 32 +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32 +// CHECK-NEXT: [[PERMIL:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> +// CHECK-NEXT: ret <8 x float> [[PERMIL]] +// __m256 test_mm256_permute_ps(__m256 a) { - // CHECK-LABEL: test_mm256_permute_ps - // CHECK: shufflevector{{.*}}<i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> return _mm256_permute_ps(a, 0x1b); } +// CHECK-LABEL: define dso_local <4 x double> @test_mm256_permute2f128_pd( +// CHECK-SAME: <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x double>, align 32 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <4 x double>, align 32 +// CHECK-NEXT: store <4 x double> [[A]], ptr [[A_ADDR]], align 32 +// CHECK-NEXT: store <4 x double> [[B]], ptr [[B_ADDR]], align 32 +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32 +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32 +// CHECK-NEXT: [[VPERM:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> [[TMP1]], <4 x i32> <i32 2, i32 3, i32 6, i32 7> +// CHECK-NEXT: ret <4 x double> [[VPERM]] +// __m256d test_mm256_permute2f128_pd(__m256d a, __m256d b) { - 
// CHECK-LABEL: test_mm256_permute2f128_pd - // CHECK: shufflevector{{.*}}<i32 2, i32 3, i32 6, i32 7> return _mm256_permute2f128_pd(a, b, 0x31); } +// CHECK-LABEL: define dso_local <8 x float> @test_mm256_permute2f128_ps( +// CHECK-SAME: <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x float>, align 32 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <8 x float>, align 32 +// CHECK-NEXT: store <8 x float> [[A]], ptr [[A_ADDR]], align 32 +// CHECK-NEXT: store <8 x float> [[B]], ptr [[B_ADDR]], align 32 +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32 +// CHECK-NEXT: [[VPERM:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP0]], <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15> +// CHECK-NEXT: ret <8 x float> [[VPERM]] +// __m256 test_mm256_permute2f128_ps(__m256 a, __m256 b) { - // CHECK-LABEL: test_mm256_permute2f128_ps - // CHECK: shufflevector{{.*}}<i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15> return _mm256_permute2f128_ps(a, b, 0x13); } +// CHECK-LABEL: define dso_local <4 x i64> @test_mm256_permute2f128_si256( +// CHECK-SAME: <4 x i64> noundef [[A:%.*]], <4 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x i64>, align 32 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <4 x i64>, align 32 +// CHECK-NEXT: store <4 x i64> [[A]], ptr [[A_ADDR]], align 32 +// CHECK-NEXT: store <4 x i64> [[B]], ptr [[B_ADDR]], align 32 +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr [[A_ADDR]], align 32 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i64> [[TMP0]] to <8 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr [[B_ADDR]], align 32 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[TMP2]] to <8 x i32> +// CHECK-NEXT: [[VPERM:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> 
[[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[VPERM]] to <4 x i64> +// CHECK-NEXT: ret <4 x i64> [[TMP4]] +// __m256i test_mm256_permute2f128_si256(__m256i a, __m256i b) { - // CHECK-LABEL: test_mm256_permute2f128_si256 - // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 4, i32 5> return _mm256_permute2f128_si256(a, b, 0x20); } __m128 +// CHECK-LABEL: define dso_local <4 x float> @test_mm_broadcast_ss( +// CHECK-SAME: ptr noundef [[__A:%.*]]) #[[ATTR1]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[__A_ADDR_I:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[__F_I:%.*]] = alloca float, align 4 +// CHECK-NEXT: [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <4 x float>, align 16 +// CHECK-NEXT: [[__A_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: store ptr [[__A]], ptr [[__A_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__A_ADDR]], align 8 +// CHECK-NEXT: store ptr [[TMP0]], ptr [[__A_ADDR_I]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__A_ADDR_I]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[TMP1]], align 1 +// CHECK-NEXT: store float [[TMP2]], ptr [[__F_I]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[__F_I]], align 4 +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i32 0 +// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[__F_I]], align 4 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[TMP4]], i32 1 +// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[__F_I]], align 4 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[TMP5]], i32 2 +// CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[__F_I]], align 4 +// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <4 x float> [[VECINIT3_I]], float [[TMP6]], i32 3 +// CHECK-NEXT: store <4 x float> [[VECINIT4_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16 +// CHECK-NEXT: [[TMP7:%.*]] = load <4 x float>, 
ptr [[DOTCOMPOUNDLITERAL_I]], align 16 +// CHECK-NEXT: ret <4 x float> [[TMP7]] +// test_mm_broadcast_ss(float const *__a) { - // CHECK-LABEL: test_mm_broadcast_ss - // CHECK: insertelement <4 x float> {{.*}}, i64 0 - // CHECK: shufflevector <4 x float> {{.*}}, <4 x float> poison, <4 x i32> zeroinitializer return _mm_broadcast_ss(__a); } __m256d +// CHECK-LABEL: define dso_local <4 x double> @test_mm256_broadcast_sd( +// CHECK-SAME: ptr noundef [[__A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[__A_ADDR_I:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[__D_I:%.*]] = alloca double, align 8 +// CHECK-NEXT: [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <4 x double>, align 32 +// CHECK-NEXT: [[__A_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: store ptr [[__A]], ptr [[__A_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__A_ADDR]], align 8 +// CHECK-NEXT: store ptr [[TMP0]], ptr [[__A_ADDR_I]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__A_ADDR_I]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[TMP1]], align 1 +// CHECK-NEXT: store double [[TMP2]], ptr [[__D_I]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load double, ptr [[__D_I]], align 8 +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x double> poison, double [[TMP3]], i32 0 +// CHECK-NEXT: [[TMP4:%.*]] = load double, ptr [[__D_I]], align 8 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x double> [[VECINIT_I]], double [[TMP4]], i32 1 +// CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[__D_I]], align 8 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x double> [[VECINIT2_I]], double [[TMP5]], i32 2 +// CHECK-NEXT: [[TMP6:%.*]] = load double, ptr [[__D_I]], align 8 +// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <4 x double> [[VECINIT3_I]], double [[TMP6]], i32 3 +// CHECK-NEXT: store <4 x double> [[VECINIT4_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 32 +// CHECK-NEXT: [[TMP7:%.*]] = load <4 x double>, ptr [[DOTCOMPOUNDLITERAL_I]], align 32 +// 
CHECK-NEXT: ret <4 x double> [[TMP7]] +// test_mm256_broadcast_sd(double const *__a) { - // CHECK-LABEL: test_mm256_broadcast_sd - // CHECK: insertelement <4 x double> {{.*}}, i64 0 - // CHECK: shufflevector <4 x double> {{.*}}, <4 x double> poison, <4 x i32> zeroinitializer return _mm256_broadcast_sd(__a); } __m256 +// CHECK-LABEL: define dso_local <8 x float> @test_mm256_broadcast_ss( +// CHECK-SAME: ptr noundef [[__A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[__A_ADDR_I:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[__F_I:%.*]] = alloca float, align 4 +// CHECK-NEXT: [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x float>, align 32 +// CHECK-NEXT: [[__A_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: store ptr [[__A]], ptr [[__A_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__A_ADDR]], align 8 +// CHECK-NEXT: store ptr [[TMP0]], ptr [[__A_ADDR_I]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__A_ADDR_I]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[TMP1]], align 1 +// CHECK-NEXT: store float [[TMP2]], ptr [[__F_I]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[__F_I]], align 4 +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0 +// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[__F_I]], align 4 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x float> [[VECINIT_I]], float [[TMP4]], i32 1 +// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[__F_I]], align 4 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x float> [[VECINIT2_I]], float [[TMP5]], i32 2 +// CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[__F_I]], align 4 +// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x float> [[VECINIT3_I]], float [[TMP6]], i32 3 +// CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[__F_I]], align 4 +// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x float> [[VECINIT4_I]], float [[TMP7]], i32 4 +// CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[__F_I]], align 4 +// 
CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x float> [[VECINIT5_I]], float [[TMP8]], i32 5 +// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[__F_I]], align 4 +// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x float> [[VECINIT6_I]], float [[TMP9]], i32 6 +// CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[__F_I]], align 4 +// CHECK-NEXT: [[VECINIT8_I:%.*]] = insertelement <8 x float> [[VECINIT7_I]], float [[TMP10]], i32 7 +// CHECK-NEXT: store <8 x float> [[VECINIT8_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 32 +// CHECK-NEXT: [[TMP11:%.*]] = load <8 x float>, ptr [[DOTCOMPOUNDLITERAL_I]], align 32 +// CHECK-NEXT: ret <8 x float> [[TMP11]] +// test_mm256_broadcast_ss(float const *__a) { - // CHECK-LABEL: test_mm256_broadcast_ss - // CHECK: insertelement <8 x float> {{.*}}, i64 0 - // CHECK: shufflevector <8 x float> {{.*}}, <8 x float> poison, <8 x i32> zeroinitializer return _mm256_broadcast_ss(__a); } // Make sure we have the correct mask for each insertf128 case. +// CHECK-LABEL: define dso_local <8 x float> @test_mm256_insertf128_ps_0( +// CHECK-SAME: <8 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x float>, align 32 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <4 x float>, align 16 +// CHECK-NEXT: store <8 x float> [[A]], ptr [[A_ADDR]], align 32 +// CHECK-NEXT: store <4 x float> [[B]], ptr [[B_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32 +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16 +// CHECK-NEXT: [[WIDEN:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +// CHECK-NEXT: [[INSERT:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> [[WIDEN]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7> +// CHECK-NEXT: ret <8 x float> [[INSERT]] +// __m256 test_mm256_insertf128_ps_0(__m256 a, __m128 
b) { - // CHECK-LABEL: test_mm256_insertf128_ps_0 - // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15> return _mm256_insertf128_ps(a, b, 0); } +// CHECK-LABEL: define dso_local <4 x double> @test_mm256_insertf128_pd_0( +// CHECK-SAME: <4 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x double>, align 32 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <2 x double>, align 16 +// CHECK-NEXT: store <4 x double> [[A]], ptr [[A_ADDR]], align 32 +// CHECK-NEXT: store <2 x double> [[B]], ptr [[B_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16 +// CHECK-NEXT: [[WIDEN:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +// CHECK-NEXT: [[INSERT:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> [[WIDEN]], <4 x i32> <i32 4, i32 5, i32 2, i32 3> +// CHECK-NEXT: ret <4 x double> [[INSERT]] +// __m256d test_mm256_insertf128_pd_0(__m256d a, __m128d b) { - // CHECK-LABEL: test_mm256_insertf128_pd_0 - // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 6, i32 7> return _mm256_insertf128_pd(a, b, 0); } +// CHECK-LABEL: define dso_local <4 x i64> @test_mm256_insertf128_si256_0( +// CHECK-SAME: <4 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x i64>, align 32 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <2 x i64>, align 16 +// CHECK-NEXT: store <4 x i64> [[A]], ptr [[A_ADDR]], align 32 +// CHECK-NEXT: store <2 x i64> [[B]], ptr [[B_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr [[A_ADDR]], align 32 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i64> [[TMP0]] to <8 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast 
<2 x i64> [[TMP2]] to <4 x i32> +// CHECK-NEXT: [[WIDEN:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +// CHECK-NEXT: [[INSERT:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[WIDEN]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[INSERT]] to <4 x i64> +// CHECK-NEXT: ret <4 x i64> [[TMP4]] +// __m256i test_mm256_insertf128_si256_0(__m256i a, __m128i b) { - // CHECK-LABEL: test_mm256_insertf128_si256_0 - // X64: shufflevector{{.*}}<i32 0, i32 1, i32 6, i32 7> - // X86: shufflevector{{.*}}<i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15> return _mm256_insertf128_si256(a, b, 0); } +// CHECK-LABEL: define dso_local <8 x float> @test_mm256_insertf128_ps_1( +// CHECK-SAME: <8 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x float>, align 32 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <4 x float>, align 16 +// CHECK-NEXT: store <8 x float> [[A]], ptr [[A_ADDR]], align 32 +// CHECK-NEXT: store <4 x float> [[B]], ptr [[B_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = l... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/113212 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits