The attached patch adds SSE alternatives to sse2_cvtpi2pd, sse2_cvtpd2pi and sse2_cvttpd2pi to avoid MMX registers when, e.g., the _mm_cvtepi32_pd intrinsic is used. Without the patch, the testcase compiles to (-O2 -mavx):
_Z7prepareii: vmovd %edi, %xmm1 vpinsrd $1, %esi, %xmm1, %xmm0 movdq2q %xmm0, %mm0 cvtpi2pd %mm0, %xmm0 vhaddpd %xmm0, %xmm0, %xmm0 ret while patched gcc generates: vmovd %edi, %xmm1 vpinsrd $1, %esi, %xmm1, %xmm0 vcvtdq2pd %xmm0, %xmm0 vhaddpd %xmm0, %xmm0, %xmm0 ret The latter avoids the transition of the FPU to MMX mode. 2019-01-23 Uroš Bizjak <ubiz...@gmail.com> PR target/88998 * config/i386/sse.md (sse2_cvtpi2pd): Add SSE alternatives. Disparage MMX alternative. (sse2_cvtpd2pi): Ditto. (sse2_cvttpd2pi): Ditto. testsuite/ChangeLog: 2019-01-23 Uroš Bizjak <ubiz...@gmail.com> PR target/88998 * g++.target/i386/pr88998.C: New test. Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}. Committed to mainline, will be backported to release branches. Uros.
Index: config/i386/sse.md =================================================================== --- config/i386/sse.md (revision 268188) +++ config/i386/sse.md (working copy) @@ -4997,37 +4997,49 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (define_insn "sse2_cvtpi2pd" - [(set (match_operand:V2DF 0 "register_operand" "=x,x") - (float:V2DF (match_operand:V2SI 1 "nonimmediate_operand" "y,m")))] + [(set (match_operand:V2DF 0 "register_operand" "=v,x") + (float:V2DF (match_operand:V2SI 1 "nonimmediate_operand" "vBm,?!y")))] "TARGET_SSE2" - "cvtpi2pd\t{%1, %0|%0, %1}" + "@ + %vcvtdq2pd\t{%1, %0|%0, %1} + cvtpi2pd\t{%1, %0|%0, %1}" [(set_attr "type" "ssecvt") - (set_attr "unit" "mmx,*") - (set_attr "prefix_data16" "1,*") + (set_attr "unit" "*,mmx") + (set_attr "prefix_data16" "*,1") + (set_attr "prefix" "maybe_vex,*") (set_attr "mode" "V2DF")]) (define_insn "sse2_cvtpd2pi" - [(set (match_operand:V2SI 0 "register_operand" "=y") - (unspec:V2SI [(match_operand:V2DF 1 "nonimmediate_operand" "xm")] + [(set (match_operand:V2SI 0 "register_operand" "=v,?!y") + (unspec:V2SI [(match_operand:V2DF 1 "nonimmediate_operand" "vBm,xm")] UNSPEC_FIX_NOTRUNC))] "TARGET_SSE2" - "cvtpd2pi\t{%1, %0|%0, %1}" + "@ + * return TARGET_AVX ? 
\"vcvtpd2dq{x}\t{%1, %0|%0, %1}\" : \"cvtpd2dq\t{%1, %0|%0, %1}\"; + cvtpd2pi\t{%1, %0|%0, %1}" [(set_attr "type" "ssecvt") - (set_attr "unit" "mmx") + (set_attr "unit" "*,mmx") + (set_attr "amdfam10_decode" "double") + (set_attr "athlon_decode" "vector") (set_attr "bdver1_decode" "double") - (set_attr "btver2_decode" "direct") - (set_attr "prefix_data16" "1") - (set_attr "mode" "DI")]) + (set_attr "prefix_data16" "*,1") + (set_attr "prefix" "maybe_vex,*") + (set_attr "mode" "TI")]) (define_insn "sse2_cvttpd2pi" - [(set (match_operand:V2SI 0 "register_operand" "=y") - (fix:V2SI (match_operand:V2DF 1 "nonimmediate_operand" "xm")))] + [(set (match_operand:V2SI 0 "register_operand" "=v,?!y") + (fix:V2SI (match_operand:V2DF 1 "nonimmediate_operand" "vBm,xm")))] "TARGET_SSE2" - "cvttpd2pi\t{%1, %0|%0, %1}" + "@ + * return TARGET_AVX ? \"vcvttpd2dq{x}\t{%1, %0|%0, %1}\" : \"cvttpd2dq\t{%1, %0|%0, %1}\"; + cvttpd2pi\t{%1, %0|%0, %1}" [(set_attr "type" "ssecvt") - (set_attr "unit" "mmx") + (set_attr "unit" "*,mmx") + (set_attr "amdfam10_decode" "double") + (set_attr "athlon_decode" "vector") (set_attr "bdver1_decode" "double") - (set_attr "prefix_data16" "1") + (set_attr "prefix_data16" "*,1") + (set_attr "prefix" "maybe_vex,*") (set_attr "mode" "TI")]) (define_insn "sse2_cvtsi2sd" Index: testsuite/g++.target/i386/pr88998.C =================================================================== --- testsuite/g++.target/i386/pr88998.C (nonexistent) +++ testsuite/g++.target/i386/pr88998.C (working copy) @@ -0,0 +1,31 @@ +// PR target/88998 +// { dg-do run { target sse2_runtime } } +// { dg-options "-O2 -msse2 -mfpmath=387" } +// { dg-require-effective-target c++11 } + +#include <cassert> +#include <unordered_map> +#include <x86intrin.h> + +double +__attribute__((noinline)) +prepare (int a, int b) +{ + __m128i is = _mm_setr_epi32 (a, b, 0, 0); + __m128d ds = _mm_cvtepi32_pd (is); + return ds[0] + ds[1]; +} + +int +main (int, char **) +{ + double d = prepare (1, 2); + + 
std::unordered_map < int, int >m; + m.insert ({0, 0}); + m.insert ({1, 1}); + assert (m.load_factor () <= m.max_load_factor ()); + + assert (d == 3); + return 0; +}