Author: Simon Pilgrim
Date: 2023-11-30T10:07:00Z
New Revision: abc60e9808820c3f6614e6815909d43ed085460e
URL: https://github.com/llvm/llvm-project/commit/abc60e9808820c3f6614e6815909d43ed085460e DIFF: https://github.com/llvm/llvm-project/commit/abc60e9808820c3f6614e6815909d43ed085460e.diff LOG: [X86] vec_fabs.ll - add SSE test coverage Added: Modified: llvm/test/CodeGen/X86/vec_fabs.ll Removed: ################################################################################ diff --git a/llvm/test/CodeGen/X86/vec_fabs.ll b/llvm/test/CodeGen/X86/vec_fabs.ll index ec02dfda30c8502..c17341c2c8b077e 100644 --- a/llvm/test/CodeGen/X86/vec_fabs.ll +++ b/llvm/test/CodeGen/X86/vec_fabs.ll @@ -1,24 +1,31 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X86,X86-AVX,X86-AVX1 -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X86,X86-AVX,X86-AVX2 -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=X86,X86-AVX512,X86-AVX512VL -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 | FileCheck %s --check-prefixes=X86,X86-AVX512,X86-AVX512FP16 -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=X86,X86-AVX512,X86-AVX512VLDQ -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=X64,X64-AVX512,X64-AVX512VL -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 | FileCheck %s --check-prefixes=X64,X64-AVX512,X64-AVX512FP16 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=X64,X64-AVX512,X64-AVX512VLDQ +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X86,X86-SSE +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X86,X86-AVX,X86-AVX1OR2,X86-AVX1 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X86,X86-AVX,X86-AVX1OR2,X86-AVX2 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=X86,X86-AVX,X86-AVX512,X86-AVX512VL +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 | FileCheck %s --check-prefixes=X86,X86-AVX,X86-AVX512,X86-AVX512FP16 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=X86,X86-AVX,X86-AVX512,X86-AVX512VLDQ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X64,X64-SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1OR2,X64-AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1OR2,X64-AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX512,X64-AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX512,X64-AVX512FP16 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX512,X64-AVX512VLDQ ; ; 128-bit Vectors ; -define <2 x double> @fabs_v2f64(<2 x double> %p) { -; X86-AVX-LABEL: 
fabs_v2f64: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: retl +define <2 x double> @fabs_v2f64(<2 x double> %p) nounwind { +; X86-SSE-LABEL: fabs_v2f64: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: retl +; +; X86-AVX1OR2-LABEL: fabs_v2f64: +; X86-AVX1OR2: # %bb.0: +; X86-AVX1OR2-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1OR2-NEXT: retl ; ; X86-AVX512VL-LABEL: fabs_v2f64: ; X86-AVX512VL: # %bb.0: @@ -35,10 +42,15 @@ define <2 x double> @fabs_v2f64(<2 x double> %p) { ; X86-AVX512VLDQ-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}{1to2}, %xmm0, %xmm0 ; X86-AVX512VLDQ-NEXT: retl ; -; X64-AVX-LABEL: fabs_v2f64: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-SSE-LABEL: fabs_v2f64: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: retq +; +; X64-AVX1OR2-LABEL: fabs_v2f64: +; X64-AVX1OR2: # %bb.0: +; X64-AVX1OR2-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1OR2-NEXT: retq ; ; X64-AVX512VL-LABEL: fabs_v2f64: ; X64-AVX512VL: # %bb.0: @@ -59,7 +71,12 @@ define <2 x double> @fabs_v2f64(<2 x double> %p) { } declare <2 x double> @llvm.fabs.v2f64(<2 x double> %p) -define <4 x float> @fabs_v4f32(<4 x float> %p) { +define <4 x float> @fabs_v4f32(<4 x float> %p) nounwind { +; X86-SSE-LABEL: fabs_v4f32: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: retl +; ; X86-AVX1-LABEL: fabs_v4f32: ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 @@ -86,6 +103,11 @@ define <4 x float> @fabs_v4f32(<4 x float> %p) { ; X86-AVX512VLDQ-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0 ; X86-AVX512VLDQ-NEXT: retl ; +; X64-SSE-LABEL: fabs_v4f32: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: retq +; ; X64-AVX1-LABEL: fabs_v4f32: ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 @@ -116,7 +138,134 @@ define <4 x float> @fabs_v4f32(<4 x float> %p) { } declare <4 x float> @llvm.fabs.v4f32(<4 x float> %p) -define <8 x half> @fabs_v8f16(ptr %p) { +define <8 x half> @fabs_v8f16(ptr %p) nounwind { +; X86-SSE-LABEL: fabs_v8f16: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: subl $148, %esp +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movdqa (%eax), %xmm0 +; X86-SSE-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, 
{{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: psrlq $48, %xmm0 +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: psrld $16, %xmm0 +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: fstps 
{{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm1, (%esp) +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X86-SSE-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-SSE-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86-SSE-NEXT: movhps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; X86-SSE-NEXT: # xmm1 = xmm1[0,1],mem[0,1] +; X86-SSE-NEXT: movaps %xmm1, %xmm0 +; X86-SSE-NEXT: addl $148, %esp +; X86-SSE-NEXT: retl +; ; X86-AVX1-LABEL: fabs_v8f16: ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -138,6 +287,74 @@ define <8 x half> @fabs_v8f16(ptr %p) { ; X86-AVX512-NEXT: vpand (%eax), %xmm0, %xmm0 ; X86-AVX512-NEXT: retl ; +; X64-SSE-LABEL: fabs_v8f16: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: subq $72, %rsp +; X64-SSE-NEXT: movdqa (%rdi), %xmm0 +; X64-SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; X64-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; X64-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; X64-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: psrldq {{.*#+}} xmm0 = 
xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; X64-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; X64-SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; X64-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; X64-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: psrlq $48, %xmm0 +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; X64-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; X64-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: psrld $16, %xmm0 +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X64-SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; X64-SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; X64-SSE-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; X64-SSE-NEXT: # xmm1 = xmm1[0],mem[0] +; X64-SSE-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE-NEXT: addq $72, %rsp +; X64-SSE-NEXT: retq +; ; X64-AVX1-LABEL: fabs_v8f16: ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vmovaps (%rdi), %xmm0 @@ -165,7 +382,14 @@ declare <8 x half> @llvm.fabs.v8f16(<8 x half> %p) ; 256-bit Vectors ; -define <4 x double> @fabs_v4f64(<4 x double> %p) { +define <4 x double> @fabs_v4f64(<4 x double> %p) nounwind { +; X86-SSE-LABEL: fabs_v4f64: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movaps {{.*#+}} xmm2 = [NaN,NaN] +; X86-SSE-NEXT: andps %xmm2, %xmm0 +; X86-SSE-NEXT: andps %xmm2, %xmm1 +; X86-SSE-NEXT: retl +; ; X86-AVX1-LABEL: fabs_v4f64: ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 @@ -192,6 +416,13 @@ define <4 x double> @fabs_v4f64(<4 x double> %p) { ; X86-AVX512VLDQ-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %ymm0, %ymm0 ; X86-AVX512VLDQ-NEXT: retl ; +; X64-SSE-LABEL: 
fabs_v4f64: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movaps {{.*#+}} xmm2 = [NaN,NaN] +; X64-SSE-NEXT: andps %xmm2, %xmm0 +; X64-SSE-NEXT: andps %xmm2, %xmm1 +; X64-SSE-NEXT: retq +; ; X64-AVX1-LABEL: fabs_v4f64: ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 @@ -222,7 +453,14 @@ define <4 x double> @fabs_v4f64(<4 x double> %p) { } declare <4 x double> @llvm.fabs.v4f64(<4 x double> %p) -define <8 x float> @fabs_v8f32(<8 x float> %p) { +define <8 x float> @fabs_v8f32(<8 x float> %p) nounwind { +; X86-SSE-LABEL: fabs_v8f32: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movaps {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN] +; X86-SSE-NEXT: andps %xmm2, %xmm0 +; X86-SSE-NEXT: andps %xmm2, %xmm1 +; X86-SSE-NEXT: retl +; ; X86-AVX1-LABEL: fabs_v8f32: ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 @@ -249,6 +487,13 @@ define <8 x float> @fabs_v8f32(<8 x float> %p) { ; X86-AVX512VLDQ-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0 ; X86-AVX512VLDQ-NEXT: retl ; +; X64-SSE-LABEL: fabs_v8f32: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movaps {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN] +; X64-SSE-NEXT: andps %xmm2, %xmm0 +; X64-SSE-NEXT: andps %xmm2, %xmm1 +; X64-SSE-NEXT: retq +; ; X64-AVX1-LABEL: fabs_v8f32: ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 @@ -279,7 +524,256 @@ define <8 x float> @fabs_v8f32(<8 x float> %p) { } declare <8 x float> @llvm.fabs.v8f32(<8 x float> %p) -define <16 x half> @fabs_v16f16(ptr %p) { +define <16 x half> @fabs_v16f16(ptr %p) nounwind { +; X86-SSE-LABEL: fabs_v16f16: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: subl $324, %esp # imm = 0x144 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movaps (%eax), %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movdqa 16(%eax), %xmm0 +; X86-SSE-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: psrlq $48, %xmm0 +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] +; X86-SSE-NEXT: 
pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: psrld $16, %xmm0 +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __truncsfhf2 +; 
X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: psrlq $48, %xmm0 +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; 
X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: psrld $16, %xmm0 +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm1, (%esp) +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X86-SSE-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; X86-SSE-NEXT: movdqu %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = 
xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X86-SSE-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-SSE-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86-SSE-NEXT: movhps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; X86-SSE-NEXT: # xmm1 = xmm1[0,1],mem[0,1] +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: addl $324, %esp # imm = 0x144 +; X86-SSE-NEXT: retl +; ; X86-AVX1-LABEL: fabs_v16f16: ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -301,6 +795,138 @@ define <16 x half> @fabs_v16f16(ptr %p) { ; X86-AVX512-NEXT: vpand (%eax), %ymm0, %ymm0 ; X86-AVX512-NEXT: retl ; +; X64-SSE-LABEL: fabs_v16f16: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: subq $88, %rsp +; X64-SSE-NEXT: movdqa (%rdi), %xmm1 +; X64-SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; X64-SSE-NEXT: movaps 16(%rdi), %xmm0 +; X64-SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; X64-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; X64-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; X64-SSE-NEXT: # xmm0 = 
xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; X64-SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; X64-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; X64-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: psrlq $48, %xmm0 +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; X64-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; X64-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: psrld $16, %xmm0 +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X64-SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; X64-SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; X64-SSE-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; X64-SSE-NEXT: # xmm1 = xmm1[0],mem[0] +; X64-SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; X64-SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: punpcklwd (%rsp), %xmm0 # 16-byte Folded Reload +; X64-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; X64-SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; X64-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: pand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; X64-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; X64-SSE-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload +; X64-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; X64-SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; X64-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: psrlq $48, %xmm0 +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; X64-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; X64-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: psrld $16, %xmm0 +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X64-SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; X64-SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; X64-SSE-NEXT: punpcklqdq (%rsp), %xmm1 # 16-byte Folded Reload +; X64-SSE-NEXT: # xmm1 = xmm1[0],mem[0] +; X64-SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: addq $88, %rsp +; X64-SSE-NEXT: retq +; ; X64-AVX1-LABEL: fabs_v16f16: ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vmovaps (%rdi), %ymm0 @@ -328,13 +954,28 @@ declare <16 x half> @llvm.fabs.v16f16(<16 x half> %p) ; 512-bit Vectors ; -define <8 x double> @fabs_v8f64(<8 x double> %p) { -; X86-AVX-LABEL: fabs_v8f64: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN] -; X86-AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 -; X86-AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 -; X86-AVX-NEXT: retl +define <8 x double> @fabs_v8f64(<8 x double> %p) nounwind { +; X86-SSE-LABEL: fabs_v8f64: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pushl %ebp +; X86-SSE-NEXT: movl %esp, %ebp +; X86-SSE-NEXT: andl $-16, %esp +; X86-SSE-NEXT: subl $16, %esp +; X86-SSE-NEXT: movaps {{.*#+}} xmm3 = [NaN,NaN] +; X86-SSE-NEXT: andps %xmm3, %xmm0 +; X86-SSE-NEXT: andps %xmm3, %xmm1 +; X86-SSE-NEXT: andps %xmm3, %xmm2 +; X86-SSE-NEXT: andps 8(%ebp), %xmm3 +; X86-SSE-NEXT: movl %ebp, %esp +; X86-SSE-NEXT: popl %ebp +; X86-SSE-NEXT: retl +; +; X86-AVX1OR2-LABEL: fabs_v8f64: +; X86-AVX1OR2: # %bb.0: +; X86-AVX1OR2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN] +; X86-AVX1OR2-NEXT: vandps %ymm2, %ymm0, %ymm0 +; X86-AVX1OR2-NEXT: vandps %ymm2, %ymm1, %ymm1 +; 
X86-AVX1OR2-NEXT: retl ; ; X86-AVX512VL-LABEL: fabs_v8f64: ; X86-AVX512VL: # %bb.0: @@ -351,12 +992,21 @@ define <8 x double> @fabs_v8f64(<8 x double> %p) { ; X86-AVX512VLDQ-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %zmm0, %zmm0 ; X86-AVX512VLDQ-NEXT: retl ; -; X64-AVX-LABEL: fabs_v8f64: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN] -; X64-AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 -; X64-AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 -; X64-AVX-NEXT: retq +; X64-SSE-LABEL: fabs_v8f64: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movaps {{.*#+}} xmm4 = [NaN,NaN] +; X64-SSE-NEXT: andps %xmm4, %xmm0 +; X64-SSE-NEXT: andps %xmm4, %xmm1 +; X64-SSE-NEXT: andps %xmm4, %xmm2 +; X64-SSE-NEXT: andps %xmm4, %xmm3 +; X64-SSE-NEXT: retq +; +; X64-AVX1OR2-LABEL: fabs_v8f64: +; X64-AVX1OR2: # %bb.0: +; X64-AVX1OR2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN] +; X64-AVX1OR2-NEXT: vandps %ymm2, %ymm0, %ymm0 +; X64-AVX1OR2-NEXT: vandps %ymm2, %ymm1, %ymm1 +; X64-AVX1OR2-NEXT: retq ; ; X64-AVX512VL-LABEL: fabs_v8f64: ; X64-AVX512VL: # %bb.0: @@ -377,13 +1027,28 @@ define <8 x double> @fabs_v8f64(<8 x double> %p) { } declare <8 x double> @llvm.fabs.v8f64(<8 x double> %p) -define <16 x float> @fabs_v16f32(<16 x float> %p) { -; X86-AVX-LABEL: fabs_v16f32: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vbroadcastss {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] -; X86-AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 -; X86-AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 -; X86-AVX-NEXT: retl +define <16 x float> @fabs_v16f32(<16 x float> %p) nounwind { +; X86-SSE-LABEL: fabs_v16f32: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pushl %ebp +; X86-SSE-NEXT: movl %esp, %ebp +; X86-SSE-NEXT: andl $-16, %esp +; X86-SSE-NEXT: subl $16, %esp +; X86-SSE-NEXT: movaps {{.*#+}} xmm3 = [NaN,NaN,NaN,NaN] +; X86-SSE-NEXT: andps %xmm3, %xmm0 +; X86-SSE-NEXT: andps %xmm3, %xmm1 +; X86-SSE-NEXT: andps %xmm3, %xmm2 +; X86-SSE-NEXT: andps 8(%ebp), %xmm3 +; X86-SSE-NEXT: movl %ebp, %esp +; X86-SSE-NEXT: popl %ebp +; X86-SSE-NEXT: retl +; +; X86-AVX1OR2-LABEL: fabs_v16f32: +; X86-AVX1OR2: # %bb.0: +; X86-AVX1OR2-NEXT: vbroadcastss {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; X86-AVX1OR2-NEXT: vandps %ymm2, %ymm0, %ymm0 +; X86-AVX1OR2-NEXT: vandps %ymm2, %ymm1, %ymm1 +; X86-AVX1OR2-NEXT: retl ; ; X86-AVX512VL-LABEL: fabs_v16f32: ; X86-AVX512VL: # %bb.0: @@ -400,12 +1065,21 @@ define <16 x float> @fabs_v16f32(<16 x float> %p) { ; X86-AVX512VLDQ-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0 ; X86-AVX512VLDQ-NEXT: retl ; -; X64-AVX-LABEL: fabs_v16f32: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vbroadcastss {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] -; X64-AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 -; X64-AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 -; X64-AVX-NEXT: retq +; X64-SSE-LABEL: fabs_v16f32: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movaps {{.*#+}} xmm4 = [NaN,NaN,NaN,NaN] +; X64-SSE-NEXT: andps %xmm4, %xmm0 +; X64-SSE-NEXT: andps %xmm4, %xmm1 +; X64-SSE-NEXT: andps %xmm4, %xmm2 +; X64-SSE-NEXT: andps %xmm4, %xmm3 +; X64-SSE-NEXT: retq +; +; X64-AVX1OR2-LABEL: fabs_v16f32: +; X64-AVX1OR2: # %bb.0: +; X64-AVX1OR2-NEXT: vbroadcastss {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; X64-AVX1OR2-NEXT: vandps %ymm2, %ymm0, %ymm0 +; X64-AVX1OR2-NEXT: vandps %ymm2, %ymm1, %ymm1 +; X64-AVX1OR2-NEXT: retq ; ; X64-AVX512VL-LABEL: fabs_v16f32: ; X64-AVX512VL: # %bb.0: @@ -426,7 +1100,502 @@ define <16 x float> @fabs_v16f32(<16 x float> %p) { } declare <16 x float> @llvm.fabs.v16f32(<16 x float> %p) -define <32 x half> @fabs_v32f16(ptr %p) 
{ +define <32 x half> @fabs_v32f16(ptr %p) nounwind { +; X86-SSE-LABEL: fabs_v32f16: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: subl $644, %esp # imm = 0x284 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movaps (%eax), %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movaps 16(%eax), %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movaps 32(%eax), %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movdqa 48(%eax), %xmm0 +; X86-SSE-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: psrlq $48, %xmm0 +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movups 
{{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: psrld $16, %xmm0 +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __truncsfhf2 +; 
X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: psrlq $48, %xmm0 +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: psrld $16, %xmm0 +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; 
X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: psrlq $48, %xmm0 +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, 
{{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: psrld $16, %xmm0 +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: 
movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: psrlq $48, %xmm0 +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: psrldq {{.*#+}} xmm0 = 
xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: psrld $16, %xmm0 +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __extendhfsf2 +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-SSE-NEXT: movss %xmm1, (%esp) +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X86-SSE-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; X86-SSE-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X86-SSE-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; X86-SSE-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X86-SSE-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; X86-SSE-NEXT: movdqu %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X86-SSE-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-SSE-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: calll __truncsfhf2 +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm3 # 16-byte Reload +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; X86-SSE-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; X86-SSE-NEXT: movhps {{[-0-9]+}}(%e{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; X86-SSE-NEXT: # xmm3 = xmm3[0,1],mem[0,1] +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload +; X86-SSE-NEXT: addl $644, %esp # imm = 0x284 +; X86-SSE-NEXT: retl +; ; X86-AVX1-LABEL: fabs_v32f16: ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -466,6 +1635,266 @@ define <32 x half> @fabs_v32f16(ptr %p) { ; X86-AVX512VLDQ-NEXT: vpandq (%eax), %zmm0, %zmm0 ; X86-AVX512VLDQ-NEXT: retl ; +; X64-SSE-LABEL: fabs_v32f16: +; X64-SSE: # %bb.0: +; 
X64-SSE-NEXT: subq $120, %rsp +; X64-SSE-NEXT: movdqa (%rdi), %xmm1 +; X64-SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movaps 16(%rdi), %xmm0 +; X64-SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; X64-SSE-NEXT: movaps 32(%rdi), %xmm0 +; X64-SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movaps 48(%rdi), %xmm0 +; X64-SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; X64-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; X64-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; X64-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; X64-SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; X64-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; X64-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: psrlq $48, %xmm0 +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; X64-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; X64-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm0 # 16-byte Reload +; X64-SSE-NEXT: psrld $16, %xmm0 +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X64-SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; X64-SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; X64-SSE-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; X64-SSE-NEXT: # xmm1 = xmm1[0],mem[0] +; X64-SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; X64-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; X64-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; X64-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; X64-SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; X64-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; X64-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: psrlq $48, %xmm0 +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; X64-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; X64-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), 
%xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: psrld $16, %xmm0 +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X64-SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; X64-SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; X64-SSE-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; X64-SSE-NEXT: # xmm1 = xmm1[0],mem[0] +; X64-SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; X64-SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: punpcklwd (%rsp), %xmm0 # 16-byte Folded Reload +; X64-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; X64-SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; X64-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; X64-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; X64-SSE-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload +; X64-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; X64-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: psrlq $48, %xmm0 +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; X64-SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: punpcklwd (%rsp), %xmm0 # 16-byte Folded Reload +; X64-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; X64-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
X64-SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; X64-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: psrld $16, %xmm0 +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X64-SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; X64-SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; X64-SSE-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; X64-SSE-NEXT: # xmm1 = xmm1[0],mem[0] +; X64-SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; X64-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; X64-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; X64-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; X64-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; X64-SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; X64-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; X64-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: psrlq $48, %xmm0 +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; 
X64-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; X64-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; X64-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: psrld $16, %xmm0 +; X64-SSE-NEXT: callq __extendhfsf2@PLT +; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __truncsfhf2@PLT +; X64-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; X64-SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; X64-SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] +; X64-SSE-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; X64-SSE-NEXT: # xmm3 = xmm3[0],mem[0] +; X64-SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; X64-SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload +; X64-SSE-NEXT: addq $120, %rsp +; X64-SSE-NEXT: retq +; ; X64-AVX1-LABEL: fabs_v32f16: ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] @@ -520,7 +1949,7 @@ declare <32 x half> @llvm.fabs.v32f16(<32 x half> %p) ; We should generate: ; mov (put constant value in return register) -define i64 @fabs_v2f32_1() { +define i64 @fabs_v2f32_1() nounwind { ; X86-LABEL: fabs_v2f32_1: ; X86: # %bb.0: ; X86-NEXT: xorl %eax, %eax @@ -537,7 +1966,7 @@ define i64 @fabs_v2f32_1() { ret i64 %ret } -define i64 @fabs_v2f32_2() { +define i64 @fabs_v2f32_2() nounwind { ; X86-LABEL: fabs_v2f32_2: ; X86: # %bb.0: ; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF @@ -557,28 +1986,52 @@ define i64 @fabs_v2f32_2() { declare <2 x float> @llvm.fabs.v2f32(<2 x float> %p) ; PR70947 - remove duplicate xmm/ymm constant loads -define void @PR70947(ptr %src, ptr %dst) { -; X86-LABEL: PR70947: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: vbroadcastsd {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN] -; X86-NEXT: vandps (%ecx), %ymm0, %ymm1 -; X86-NEXT: vandps 32(%ecx), %xmm0, %xmm0 -; X86-NEXT: vmovups %ymm1, (%eax) -; X86-NEXT: vmovups %xmm0, 16(%eax) -; X86-NEXT: vzeroupper -; X86-NEXT: retl +define void @PR70947(ptr %src, ptr %dst) nounwind { +; X86-SSE-LABEL: PR70947: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movups (%ecx), %xmm0 +; X86-SSE-NEXT: movups 32(%ecx), %xmm1 +; X86-SSE-NEXT: movaps {{.*#+}} xmm2 = [NaN,NaN] +; X86-SSE-NEXT: andps %xmm2, %xmm0 +; X86-SSE-NEXT: andps %xmm2, %xmm1 +; X86-SSE-NEXT: movups %xmm0, (%eax) +; X86-SSE-NEXT: movups %xmm1, 16(%eax) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: PR70947: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: vbroadcastsd {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN] +; X86-AVX-NEXT: vandps (%ecx), %ymm0, %ymm1 +; X86-AVX-NEXT: vandps 32(%ecx), %xmm0, %xmm0 +; X86-AVX-NEXT: vmovups %ymm1, 
(%eax) +; X86-AVX-NEXT: vmovups %xmm0, 16(%eax) +; X86-AVX-NEXT: vzeroupper +; X86-AVX-NEXT: retl ; -; X64-LABEL: PR70947: -; X64: # %bb.0: -; X64-NEXT: vbroadcastsd {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN] -; X64-NEXT: vandps (%rdi), %ymm0, %ymm1 -; X64-NEXT: vandps 32(%rdi), %xmm0, %xmm0 -; X64-NEXT: vmovups %ymm1, (%rsi) -; X64-NEXT: vmovups %xmm0, 16(%rsi) -; X64-NEXT: vzeroupper -; X64-NEXT: retq +; X64-SSE-LABEL: PR70947: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movups (%rdi), %xmm0 +; X64-SSE-NEXT: movups 32(%rdi), %xmm1 +; X64-SSE-NEXT: movaps {{.*#+}} xmm2 = [NaN,NaN] +; X64-SSE-NEXT: andps %xmm2, %xmm0 +; X64-SSE-NEXT: andps %xmm2, %xmm1 +; X64-SSE-NEXT: movups %xmm0, (%rsi) +; X64-SSE-NEXT: movups %xmm1, 16(%rsi) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: PR70947: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vbroadcastsd {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN] +; X64-AVX-NEXT: vandps (%rdi), %ymm0, %ymm1 +; X64-AVX-NEXT: vandps 32(%rdi), %xmm0, %xmm0 +; X64-AVX-NEXT: vmovups %ymm1, (%rsi) +; X64-AVX-NEXT: vmovups %xmm0, 16(%rsi) +; X64-AVX-NEXT: vzeroupper +; X64-AVX-NEXT: retq %src4 = getelementptr inbounds double, ptr %src, i64 4 %dst4 = getelementptr inbounds i32, ptr %dst, i64 4 %ld0 = load <4 x double>, ptr %src, align 8
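Context for the new SSE check blocks above: without native FP16 support, fabs on half vectors is scalarized, each lane round-tripping through float via the __extendhfsf2 / __truncsfhf2 libcalls with the sign bit cleared by an andps/pand against a constant-pool mask, which is why the X86-SSE/X64-SSE bodies for fabs_v32f16 are so long. As a rough, hypothetical illustration only (the test body itself is not shown in this diff, so the function name and body below are assumptions), the shape of IR these checks exercise is:

; Hypothetical reduced example; the real test's body and attributes may differ.
define <32 x half> @fabs_v32f16_sketch(ptr %p) nounwind {
  ; Load 32 half elements and take their absolute value. On SSE2-only targets
  ; each element is extended to float, masked, and truncated back via libcalls.
  %v = load <32 x half>, ptr %p
  %abs = call <32 x half> @llvm.fabs.v32f16(<32 x half> %v)
  ret <32 x half> %abs
}
declare <32 x half> @llvm.fabs.v32f16(<32 x half>)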