> we need to generate
> 
>       vxorp[ds]       %xmmN, %xmmN, %xmmN
>       ...
>       vcvtss2sd       f(%rip), %xmmN, %xmmX
>       ...
>       vcvtsi2ss       i(%rip), %xmmN, %xmmY
>
> to avoid a partial XMM register stall.  This patch adds a pass to generate
> a single
> 
>       vxorps          %xmmN, %xmmN, %xmmN
> 
> at function entry, which is shared by all SF and DF conversions, instead
> of generating one
> 
>       vxorp[ds]       %xmmN, %xmmN, %xmmN
> 
> for each SF/DF conversion.
> 
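For context: vcvtss2sd, vcvtsd2ss, vcvtsi2ss and vcvtsi2sd only write
the low element of the destination XMM register, so without the zeroing
xor they carry a false dependency on whatever last wrote %xmmN.  A
minimal C example that hits this path (identifiers made up; roughly
-O2 -mavx on x86-64):

      /* Each of the two conversions below used to get its own
         vxorp[ds] to break the false dependency; with the patch
         both reuse the single vxorps emitted at function entry.  */
      float f;
      int i;

      double
      use_both (void)
      {
        return (double) f + (double) i; /* vcvtss2sd + vcvtsi2sd */
      }
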
> Performance impacts on SPEC CPU 2017 rate with 1 copy using
> 
> -Ofast -march=native -mfpmath=sse -fno-associative-math -funroll-loops
> 
> are
> 
> 1. On Broadwell server:
> 
> 500.perlbench_r (-0.82%)
> 502.gcc_r (0.73%)
> 505.mcf_r (-0.24%)
> 520.omnetpp_r (-2.22%)
> 523.xalancbmk_r (-1.47%)
> 525.x264_r (0.31%)
> 531.deepsjeng_r (0.27%)
> 541.leela_r (0.85%)
> 548.exchange2_r (-0.11%)
> 557.xz_r (-0.34%)
> Geomean: (-0.23%)
> 
> 503.bwaves_r (0.00%)
> 507.cactuBSSN_r (-1.88%)
> 508.namd_r (0.00%)
> 510.parest_r (-0.56%)
> 511.povray_r (0.49%)
> 519.lbm_r (-1.28%)
> 521.wrf_r (-0.28%)
> 526.blender_r (0.55%)
> 527.cam4_r (-0.20%)
> 538.imagick_r (2.52%)
> 544.nab_r (-0.18%)
> 549.fotonik3d_r (-0.51%)
> 554.roms_r (-0.22%)
> Geomean: (0.00%)

I wonder why the patch seems to have more effect on SPECint, which should
not care much about float<->double conversions?

> number of vxorp[ds]:
> 
> before    after    difference
> 14570     4515     -69%
> 
> OK for trunk?

This looks very nice though.

+/* At function entry, generate a single
+       vxorps %xmmN, %xmmN, %xmmN
+   for all
+       vcvtss2sd  op, %xmmN, %xmmX
+       vcvtsd2ss  op, %xmmN, %xmmX
+       vcvtsi2ss  op, %xmmN, %xmmX
+       vcvtsi2sd  op, %xmmN, %xmmX
+ */
+
+static unsigned int
+remove_partial_avx_dependency (void)
+{
+  timevar_push (TV_MACH_DEP);
+
+  calculate_dominance_info (CDI_DOMINATORS);
+  df_set_flags (DF_DEFER_INSN_RESCAN);
+  df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
+  df_md_add_problem ();
+  df_analyze ();
+
+  basic_block bb;
+  rtx_insn *insn, *set_insn;
+  rtx set;
+  rtx v4sf_const0 = NULL_RTX;
+
+  FOR_EACH_BB_FN (bb, cfun)
+    {
+      FOR_BB_INSNS (bb, insn)
+       {
+         if (!NONDEBUG_INSN_P (insn))
+           continue;
+
+         set = single_set (insn);
+         if (set)
+           {
+             machine_mode dest_vecmode, dest_mode;
+             rtx src = SET_SRC (set);
+             rtx dest, vec, zero;
+
+             /* Check for conversions to SF or DF.  */
+             switch (GET_CODE (src))
+               {
+               case FLOAT_TRUNCATE:
+                 /* DF -> SF.  */
+                 if (GET_MODE (XEXP (src, 0)) != DFmode)
+                   continue;
+                 /* Fall through.  */
+               case FLOAT_EXTEND:
+                 /* SF -> DF.  */
+               case FLOAT:
+                 /* SI -> SF, SI -> DF, DI -> SF, DI -> DF.  */
+                 dest = SET_DEST (set);
+                 dest_mode = GET_MODE (dest);
+                 switch (dest_mode)
+                   {
+                   case E_SFmode:
+                     dest_vecmode = V4SFmode;
+                     break;
+                   case E_DFmode:
+                     dest_vecmode = V2DFmode;
+                     break;
+                   default:
+                     continue;
+                   }
+
+                 if (!TARGET_64BIT
+                     && GET_MODE (XEXP (src, 0)) == DImode)
+                   continue;
+
+                 if (!v4sf_const0)
+                   v4sf_const0 = gen_reg_rtx (V4SFmode);
+
+                 if (dest_vecmode == V4SFmode)
+                   zero = v4sf_const0;
+                 else
+                   zero = gen_rtx_SUBREG (V2DFmode, v4sf_const0, 0);
+
+                 /* Change source to vector mode.  */
+                 src = gen_rtx_VEC_DUPLICATE (dest_vecmode, src);
+                 src = gen_rtx_VEC_MERGE (dest_vecmode, src, zero,
+                                          GEN_INT (HOST_WIDE_INT_1U));
+                 /* Change destination to vector mode.  */
+                 vec = gen_reg_rtx (dest_vecmode);
+                 /* Generate a XMM vector SET.  */
+                 set = gen_rtx_SET (vec, src);
+                 set_insn = emit_insn_before (set, insn);
+                 df_insn_rescan (set_insn);
+
+                 src = gen_rtx_SUBREG (dest_mode, vec, 0);
+                 set = gen_rtx_SET (dest, src);
+
+                 /* Drop possible dead definitions.  */
+                 PATTERN (insn) = set;
+
+                 INSN_CODE (insn) = -1;
+                 recog_memoized (insn);
+                 df_insn_rescan (insn);
+                 break;
+
+               default:
+                 break;
+               }
+           }
+       }
+    }
+
+  if (v4sf_const0)
+    {
+      /* Generate a single vxorps at function entry and perform df
+        rescan.  */
+      bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
+      insn = BB_HEAD (bb);
+      set = gen_rtx_SET (v4sf_const0, CONST0_RTX (V4SFmode));
+      set_insn = emit_insn_after (set, insn);
+      df_insn_rescan (set_insn);
+      df_process_deferred_rescans ();
+    }
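
To make the rewrite concrete: for a DFmode destination the original
scalar set is replaced by a pair of insns along these lines (pseudo
register numbers invented, RTL-dump style):

      (set (reg:V2DF 91)
           (vec_merge:V2DF
             (vec_duplicate:V2DF (float_extend:DF (mem:SF ...)))
             (subreg:V2DF (reg:V4SF 90) 0)    ;; the shared zero
             (const_int 1)))
      (set (reg:DF 92) (subreg:DF (reg:V2DF 91) 0))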

It seems suboptimal to place the const0 at the entry of the function - if the
conversion happens in a cold region of the function this will just increase
register pressure.  I guess the right answer would be to look for the
postdominance frontier of the set of all uses of the zero register?
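Something along these lines, as an untested sketch on top of the patch
(using the nearest common dominator of all blocks containing a
rewritten conversion, a simpler stand-in for a real
postdominance-frontier computation; CDI_DOMINATORS info is already
computed above):

      /* Track a block that dominates every rewritten conversion.  */
      basic_block insert_bb = NULL;
      ...
          /* After rewriting a conversion insn in BB:  */
          insert_bb = (insert_bb
                       ? nearest_common_dominator (CDI_DOMINATORS,
                                                   insert_bb, bb)
                       : bb);
      ...
      if (v4sf_const0)
        {
          /* Emit the single vxorps at the head of INSERT_BB rather
             than in the entry block, so the zero does not stay live
             across the whole function when all uses are cold.  */
          set = gen_rtx_SET (v4sf_const0, CONST0_RTX (V4SFmode));
          set_insn = emit_insn_after (set, BB_HEAD (insert_bb));
          df_insn_rescan (set_insn);
          df_process_deferred_rescans ();
        }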

Honza
