Re: [PATCH] PR target/66749: Add -march=iamcu to optimize for IA MCU

Uros Bizjak Mon, 06 Jul 2015 08:14:10 -0700

On Mon, Jul 6, 2015 at 3:28 PM, H.J. Lu <hjl.to...@gmail.com> wrote:
> IA MCU is based on Intel Pentium ISA without x87 and passing parameters
> in registers.  We want to optimize for IA MCU without changing existing
> Pentium codegen.  This patch adds PROCESSOR_IAMCU for -march=iamcu,
> which is based on -march=pentium with updated cost tables.
>
> OK for trunk?
>
> Thanks.
>
>
> H.J.
> --
> gcc/
>
>         PR target/66749
>         * config/i386/i386.c (iamcu_cost): New.
>         (m_IAMCU): Likewise.
>         (initial_ix86_arch_features): Disable X86_ARCH_CMOV for m_IAMCU.
>         (processor_target_table): Add an entry for "iamcu".
>         (processor_alias_table): Likewise.
>         (ix86_issue_rate): Handle PROCESSOR_IAMCU.
>         (ix86_adjust_cost): Likewise.
>         (ia32_multipass_dfa_lookahead): Likewise.
>         * config/i386/i386.h (processor_type): Add PROCESSOR_IAMCU.
>         * config/i386/x86-tune.def: Updated for m_IAMCU.
>
> gcc/testsuite/
>
>         PR target/66749
>         * gcc.target/i386/pr66749.c: New test.


I assume there will be a separate patch for the configure bits that will set
-march=iamcu for the i[34567]86-*-elfiamcu target.

This part is OK.

Thanks,
Uros.

> ---
>  gcc/config/i386/i386.c                  | 76 ++++++++++++++++++++++++++++++++-
>  gcc/config/i386/i386.h                  |  1 +
>  gcc/config/i386/x86-tune.def            | 36 +++++++++-------
>  gcc/testsuite/gcc.target/i386/pr66749.c | 14 ++++++
>  4 files changed, 111 insertions(+), 16 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr66749.c
>
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index 7d26e8c..98250c4 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -426,6 +426,74 @@ struct processor_costs pentium_cost = {
>    1,                                   /* cond_not_taken_branch_cost.  */
>  };
>
> +static const
> +struct processor_costs iamcu_cost = {
> +  COSTS_N_INSNS (1),                   /* cost of an add instruction */
> +  COSTS_N_INSNS (1) + 1,               /* cost of a lea instruction */
> +  COSTS_N_INSNS (4),                   /* variable shift costs */
> +  COSTS_N_INSNS (1),                   /* constant shift costs */
> +  {COSTS_N_INSNS (11),                 /* cost of starting multiply for QI */
> +   COSTS_N_INSNS (11),                 /*                               HI */
> +   COSTS_N_INSNS (11),                 /*                               SI */
> +   COSTS_N_INSNS (11),                 /*                               DI */
> +   COSTS_N_INSNS (11)},                        /*                            other */
> +  0,                                   /* cost of multiply per each bit set */
> +  {COSTS_N_INSNS (25),                 /* cost of a divide/mod for QI */
> +   COSTS_N_INSNS (25),                 /*                          HI */
> +   COSTS_N_INSNS (25),                 /*                          SI */
> +   COSTS_N_INSNS (25),                 /*                          DI */
> +   COSTS_N_INSNS (25)},                        /*                          other */
> +  COSTS_N_INSNS (3),                   /* cost of movsx */
> +  COSTS_N_INSNS (2),                   /* cost of movzx */
> +  8,                                   /* "large" insn */
> +  6,                                   /* MOVE_RATIO */
> +  6,                                /* cost for loading QImode using movzbl */
> +  {2, 4, 2},                           /* cost of loading integer registers
> +                                          in QImode, HImode and SImode.
> +                                          Relative to reg-reg move (2).  */
> +  {2, 4, 2},                           /* cost of storing integer registers */
> +  2,                                   /* cost of reg,reg fld/fst */
> +  {2, 2, 6},                           /* cost of loading fp registers
> +                                          in SFmode, DFmode and XFmode */
> +  {4, 4, 6},                           /* cost of storing fp registers
> +                                          in SFmode, DFmode and XFmode */
> +  8,                                   /* cost of moving MMX register */
> +  {8, 8},                              /* cost of loading MMX registers
> +                                          in SImode and DImode */
> +  {8, 8},                              /* cost of storing MMX registers
> +                                          in SImode and DImode */
> +  2,                                   /* cost of moving SSE register */
> +  {4, 8, 16},                          /* cost of loading SSE registers
> +                                          in SImode, DImode and TImode */
> +  {4, 8, 16},                          /* cost of storing SSE registers
> +                                          in SImode, DImode and TImode */
> +  3,                                   /* MMX or SSE register to integer */
> +  8,                                   /* size of l1 cache.  */
> +  8,                                   /* size of l2 cache  */
> +  0,                                   /* size of prefetch block */
> +  0,                                   /* number of parallel prefetches */
> +  2,                                   /* Branch cost */
> +  COSTS_N_INSNS (3),                   /* cost of FADD and FSUB insns.  */
> +  COSTS_N_INSNS (3),                   /* cost of FMUL instruction.  */
> +  COSTS_N_INSNS (39),                  /* cost of FDIV instruction.  */
> +  COSTS_N_INSNS (1),                   /* cost of FABS instruction.  */
> +  COSTS_N_INSNS (1),                   /* cost of FCHS instruction.  */
> +  COSTS_N_INSNS (70),                  /* cost of FSQRT instruction.  */
> +  pentium_memcpy,
> +  pentium_memset,
> +  1,                                   /* scalar_stmt_cost.  */
> +  1,                                   /* scalar load_cost.  */
> +  1,                                   /* scalar_store_cost.  */
> +  1,                                   /* vec_stmt_cost.  */
> +  1,                                   /* vec_to_scalar_cost.  */
> +  1,                                   /* scalar_to_vec_cost.  */
> +  1,                                   /* vec_align_load_cost.  */
> +  2,                                   /* vec_unalign_load_cost.  */
> +  1,                                   /* vec_store_cost.  */
> +  3,                                   /* cond_taken_branch_cost.  */
> +  1,                                   /* cond_not_taken_branch_cost.  */
> +};
> +
>  /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
>     (we ensure the alignment).  For small blocks inline loop is still a
>     noticeable win, for bigger blocks either rep movsl or rep movsb is
> @@ -2027,6 +2095,7 @@ const struct processor_costs *ix86_cost = &pentium_cost;
>  #define m_386 (1<<PROCESSOR_I386)
>  #define m_486 (1<<PROCESSOR_I486)
>  #define m_PENT (1<<PROCESSOR_PENTIUM)
> +#define m_IAMCU (1<<PROCESSOR_IAMCU)
>  #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
>  #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
>  #define m_NOCONA (1<<PROCESSOR_NOCONA)
> @@ -2086,7 +2155,7 @@ unsigned char ix86_arch_features[X86_ARCH_LAST];
>     ix86_arch_features based on the processor mask.  */
>  static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
>    /* X86_ARCH_CMOV: Conditional move was added for pentiumpro.  */
> -  ~(m_386 | m_486 | m_PENT | m_K6),
> +  ~(m_386 | m_486 | m_PENT | m_IAMCU | m_K6),
>
>    /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486.  */
>    ~m_386,
> @@ -2497,6 +2566,7 @@ static const struct ptt processor_target_table[PROCESSOR_max] =
>    {"i386", &i386_cost, 4, 3, 4, 3, 4},
>    {"i486", &i486_cost, 16, 15, 16, 15, 16},
>    {"pentium", &pentium_cost, 16, 7, 16, 7, 16},
> +  {"iamcu", &iamcu_cost, 16, 7, 16, 7, 16},
>    {"pentiumpro", &pentiumpro_cost, 16, 15, 16, 10, 16},
>    {"pentium4", &pentium4_cost, 0, 0, 0, 0, 0},
>    {"nocona", &nocona_cost, 0, 0, 0, 0, 0},
> @@ -3246,6 +3316,7 @@ ix86_option_override_internal (bool main_args_p,
>        {"i486", PROCESSOR_I486, CPU_NONE, 0},
>        {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
>        {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
> +      {"iamcu", PROCESSOR_IAMCU, CPU_PENTIUM, 0},
>        {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
>        {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
>        {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
> @@ -26138,6 +26209,7 @@ ix86_issue_rate (void)
>    switch (ix86_tune)
>      {
>      case PROCESSOR_PENTIUM:
> +    case PROCESSOR_IAMCU:
>      case PROCESSOR_BONNELL:
>      case PROCESSOR_SILVERMONT:
>      case PROCESSOR_KNL:
> @@ -26324,6 +26396,7 @@ ix86_adjust_cost (rtx_insn *insn, rtx link, rtx_insn *dep_insn, int cost)
>    switch (ix86_tune)
>      {
>      case PROCESSOR_PENTIUM:
> +    case PROCESSOR_IAMCU:
>        /* Address Generation Interlock adds a cycle of latency.  */
>        if (insn_type == TYPE_LEA)
>         {
> @@ -26533,6 +26606,7 @@ ia32_multipass_dfa_lookahead (void)
>    switch (ix86_tune)
>      {
>      case PROCESSOR_PENTIUM:
> +    case PROCESSOR_IAMCU:
>        return 2;
>
>      case PROCESSOR_PENTIUMPRO:
> diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> index d710b3d..f357e79 100644
> --- a/gcc/config/i386/i386.h
> +++ b/gcc/config/i386/i386.h
> @@ -2266,6 +2266,7 @@ enum processor_type
>    PROCESSOR_I386,                      /* 80386 */
>    PROCESSOR_I486,                      /* 80486DX, 80486SX, 80486DX[24] */
>    PROCESSOR_PENTIUM,
> +  PROCESSOR_IAMCU,
>    PROCESSOR_PENTIUMPRO,
>    PROCESSOR_PENTIUM4,
>    PROCESSOR_NOCONA,
> diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
> index bb3209d..42a560b 100644
> --- a/gcc/config/i386/x86-tune.def
> +++ b/gcc/config/i386/x86-tune.def
> @@ -40,8 +40,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
>
>  /* X86_TUNE_SCHEDULE: Enable scheduling.  */
>  DEF_TUNE (X86_TUNE_SCHEDULE, "schedule",
> -          m_PENT | m_PPRO | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL
> -         | m_KNL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC)
> +          m_PENT | m_IAMCU | m_PPRO | m_CORE_ALL | m_BONNELL | m_SILVERMONT
> +         | m_INTEL | m_KNL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC)
>
>  /* X86_TUNE_PARTIAL_REG_DEPENDENCY: Enable more register renaming
>     on modern chips.  Preffer stores affecting whole integer register
> @@ -172,19 +172,21 @@ DEF_TUNE (X86_TUNE_PUSH_MEMORY, "push_memory",
>  /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
>     over esp subtraction.  */
>  DEF_TUNE (X86_TUNE_SINGLE_PUSH, "single_push", m_386 | m_486 | m_PENT
> -          | m_K6_GEODE)
> +         | m_IAMCU | m_K6_GEODE)
>
>  /* X86_TUNE_DOUBLE_PUSH. Enable if double push insn is preferred
>     over esp subtraction.  */
> -DEF_TUNE (X86_TUNE_DOUBLE_PUSH, "double_push", m_PENT | m_K6_GEODE)
> +DEF_TUNE (X86_TUNE_DOUBLE_PUSH, "double_push", m_PENT | m_IAMCU
> +         | m_K6_GEODE)
>
>  /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
>     over esp addition.  */
> -DEF_TUNE (X86_TUNE_SINGLE_POP, "single_pop", m_386 | m_486 | m_PENT | m_PPRO)
> +DEF_TUNE (X86_TUNE_SINGLE_POP, "single_pop", m_386 | m_486 | m_PENT
> +         | m_IAMCU | m_PPRO)
>
>  /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
>     over esp addition.  */
> -DEF_TUNE (X86_TUNE_DOUBLE_POP, "double_pop", m_PENT)
> +DEF_TUNE (X86_TUNE_DOUBLE_POP, "double_pop", m_PENT | m_IAMCU)
>
>  /*****************************************************************************/
>  /* Branch predictor tuning                                                  */
> @@ -224,7 +226,7 @@ DEF_TUNE (X86_TUNE_LCP_STALL, "lcp_stall", m_CORE_ALL | m_GENERIC)
>
>  /* X86_TUNE_READ_MODIFY: Enable use of read-modify instructions such
>     as "add mem, reg".  */
> -DEF_TUNE (X86_TUNE_READ_MODIFY, "read_modify", ~(m_PENT | m_PPRO))
> +DEF_TUNE (X86_TUNE_READ_MODIFY, "read_modify", ~(m_PENT | m_IAMCU | m_PPRO))
>
>  /* X86_TUNE_USE_INCDEC: Enable use of inc/dec instructions.   */
>  DEF_TUNE (X86_TUNE_USE_INCDEC, "use_incdec",
> @@ -284,7 +286,8 @@ DEF_TUNE (X86_TUNE_USE_SAHF, "use_sahf",
>
>  /* X86_TUNE_USE_CLTD: Controls use of CLTD and CTQO instructions.  */
>  DEF_TUNE (X86_TUNE_USE_CLTD, "use_cltd",
> -         ~(m_PENT | m_BONNELL | m_SILVERMONT | m_KNL | m_INTEL  | m_K6))
> +         ~(m_PENT | m_IAMCU | m_BONNELL | m_SILVERMONT | m_KNL | m_INTEL
> +           | m_K6))
>
>  /* X86_TUNE_USE_BT: Enable use of BT (bit test) instructions.  */
>  DEF_TUNE (X86_TUNE_USE_BT, "use_bt",
> @@ -304,8 +307,8 @@ DEF_TUNE (X86_TUNE_USE_HIMODE_FIOP, "use_himode_fiop",
>  /* X86_TUNE_USE_SIMODE_FIOP: Enables use of x87 instructions with 32bit
>     integer operand.  */
>  DEF_TUNE (X86_TUNE_USE_SIMODE_FIOP, "use_simode_fiop",
> -          ~(m_PENT | m_PPRO | m_CORE_ALL | m_BONNELL | m_SILVERMONT
> -           | m_KNL | m_INTEL | m_AMD_MULTIPLE | m_GENERIC))
> +          ~(m_PENT | m_IAMCU | m_PPRO | m_CORE_ALL | m_BONNELL
> +           | m_SILVERMONT | m_KNL | m_INTEL | m_AMD_MULTIPLE | m_GENERIC))
>
>  /* X86_TUNE_USE_FFREEP: Use freep instruction instead of fstp.  */
>  DEF_TUNE (X86_TUNE_USE_FFREEP, "use_ffreep", m_AMD_MULTIPLE)
> @@ -444,7 +447,8 @@ DEF_TUNE (X86_TUNE_SHIFT1, "shift1", ~m_486)
>
>  /* X86_TUNE_ZERO_EXTEND_WITH_AND: Use AND instruction instead
>     of mozbl/movwl.  */
> -DEF_TUNE (X86_TUNE_ZERO_EXTEND_WITH_AND, "zero_extend_with_and",  m_486 | m_PENT)
> +DEF_TUNE (X86_TUNE_ZERO_EXTEND_WITH_AND, "zero_extend_with_and",
> +         m_486 | m_PENT | m_IAMCU)
>
>  /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have same latency for HImode
>     and SImode multiply, but 386 and 486 do HImode multiply faster.  */
> @@ -454,19 +458,21 @@ DEF_TUNE (X86_TUNE_PROMOTE_HIMODE_IMUL, "promote_himode_imul",
>  /* X86_TUNE_FAST_PREFIX: Enable demoting some 32bit or 64bit arithmetic
>     into 16bit/8bit when resulting sequence is shorter.  For example
>     for "and $-65536, reg" to 16bit store of 0.  */
> -DEF_TUNE (X86_TUNE_FAST_PREFIX, "fast_prefix", ~(m_386 | m_486 | m_PENT))
> +DEF_TUNE (X86_TUNE_FAST_PREFIX, "fast_prefix",
> +         ~(m_386 | m_486 | m_PENT | m_IAMCU))
>
>  /* X86_TUNE_READ_MODIFY_WRITE: Enable use of read modify write instructions
>     such as "add $1, mem".  */
> -DEF_TUNE (X86_TUNE_READ_MODIFY_WRITE, "read_modify_write", ~m_PENT)
> +DEF_TUNE (X86_TUNE_READ_MODIFY_WRITE, "read_modify_write",
> +         ~(m_PENT | m_IAMCU))
>
>  /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
>     than a MOV.  */
> -DEF_TUNE (X86_TUNE_MOVE_M1_VIA_OR, "move_m1_via_or", m_PENT)
> +DEF_TUNE (X86_TUNE_MOVE_M1_VIA_OR, "move_m1_via_or", m_PENT | m_IAMCU)
>
>  /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
>     but one byte longer.  */
> -DEF_TUNE (X86_TUNE_NOT_UNPAIRABLE, "not_unpairable", m_PENT)
> +DEF_TUNE (X86_TUNE_NOT_UNPAIRABLE, "not_unpairable", m_PENT | m_IAMCU)
>
>  /* X86_TUNE_PARTIAL_REG_STALL: Pentium pro, unlike later chips, handled
>     use of partial registers by renaming.  This improved performance of 16bit
> diff --git a/gcc/testsuite/gcc.target/i386/pr66749.c b/gcc/testsuite/gcc.target/i386/pr66749.c
> new file mode 100644
> index 0000000..affda08
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr66749.c
> @@ -0,0 +1,14 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target ia32 } */
> +/* { dg-require-effective-target nonpic } */
> +/* { dg-options "-O2 -miamcu -mtune=iamcu" } */
> +
> +char a[10], b[10];
> +
> +int f(int i)
> +{
> +  return a[i+1] + b[i+1];
> +}
> +
> +/* { dg-final { scan-assembler "a\\+1" } } */
> +/* { dg-final { scan-assembler "b\\+1" } } */
> --
> 2.4.3
>

Re: [PATCH] PR target/66749: Add -march=iamcu to optimize for IA MCU

Reply via email to