On Mon, Jul 6, 2015 at 3:28 PM, H.J. Lu <hjl.to...@gmail.com> wrote: > IA MCU is based on Intel Pentium ISA without x87 and passing parameters > in registers. We want to optimize for IA MCU without changing existing > Pentium codegen. This patch adds PROCESSOR_IAMCU for -march=iamcu, > which is based on -march=pentium with updated cost tables. > > OK for trunk? > > Thanks. > > > H.J. > -- > gcc/ > > PR target/66749 > * config/i386/i386.c (iamcu_cost): New. > (m_IAMCU): Likewise. > (initial_ix86_arch_features): Disable X86_ARCH_CMOV for m_IAMCU. > (processor_target_table): Add an entry for "iamcu". > (processor_alias_table): Likewise. > (ix86_issue_rate): Handle PROCESSOR_IAMCU. > (ix86_adjust_cost): Likewise. > (ia32_multipass_dfa_lookahead): Likewise. > * config/i386/i386.h (processor_type): Add PROCESSOR_IAMCU. > * config/i386/x86-tune.def: Updated for m_IAMCU. > > gcc/testsuite/ > > PR target/66749 > * gcc.target/i386/pr66749.c: New test.
I assume there will be a separate patch for the configure bits that will set -march=iamcu for the i[34567]86-*-elfiamcu target. This part is OK. Thanks, Uros. > --- > gcc/config/i386/i386.c | 76 > ++++++++++++++++++++++++++++++++- > gcc/config/i386/i386.h | 1 + > gcc/config/i386/x86-tune.def | 36 +++++++++------- > gcc/testsuite/gcc.target/i386/pr66749.c | 14 ++++++ > 4 files changed, 111 insertions(+), 16 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr66749.c > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c > index 7d26e8c..98250c4 100644 > --- a/gcc/config/i386/i386.c > +++ b/gcc/config/i386/i386.c > @@ -426,6 +426,74 @@ struct processor_costs pentium_cost = { > 1, /* cond_not_taken_branch_cost. */ > }; > > +static const > +struct processor_costs iamcu_cost = { > + COSTS_N_INSNS (1), /* cost of an add instruction */ > + COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ > + COSTS_N_INSNS (4), /* variable shift costs */ > + COSTS_N_INSNS (1), /* constant shift costs */ > + {COSTS_N_INSNS (11), /* cost of starting multiply for QI */ > + COSTS_N_INSNS (11), /* HI */ > + COSTS_N_INSNS (11), /* SI */ > + COSTS_N_INSNS (11), /* DI */ > + COSTS_N_INSNS (11)}, /* > other */ > + 0, /* cost of multiply per each bit set > */ > + {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */ > + COSTS_N_INSNS (25), /* HI */ > + COSTS_N_INSNS (25), /* SI */ > + COSTS_N_INSNS (25), /* DI */ > + COSTS_N_INSNS (25)}, /* > other */ > + COSTS_N_INSNS (3), /* cost of movsx */ > + COSTS_N_INSNS (2), /* cost of movzx */ > + 8, /* "large" insn */ > + 6, /* MOVE_RATIO */ > + 6, /* cost for loading QImode using movzbl > */ > + {2, 4, 2}, /* cost of loading integer registers > + in QImode, HImode and SImode. > + Relative to reg-reg move (2). 
*/ > + {2, 4, 2}, /* cost of storing integer registers > */ > + 2, /* cost of reg,reg fld/fst */ > + {2, 2, 6}, /* cost of loading fp registers > + in SFmode, DFmode and XFmode */ > + {4, 4, 6}, /* cost of storing fp registers > + in SFmode, DFmode and XFmode */ > + 8, /* cost of moving MMX register */ > + {8, 8}, /* cost of loading MMX registers > + in SImode and DImode */ > + {8, 8}, /* cost of storing MMX registers > + in SImode and DImode */ > + 2, /* cost of moving SSE register */ > + {4, 8, 16}, /* cost of loading SSE registers > + in SImode, DImode and TImode */ > + {4, 8, 16}, /* cost of storing SSE registers > + in SImode, DImode and TImode */ > + 3, /* MMX or SSE register to integer */ > + 8, /* size of l1 cache. */ > + 8, /* size of l2 cache */ > + 0, /* size of prefetch block */ > + 0, /* number of parallel prefetches */ > + 2, /* Branch cost */ > + COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ > + COSTS_N_INSNS (3), /* cost of FMUL instruction. */ > + COSTS_N_INSNS (39), /* cost of FDIV instruction. */ > + COSTS_N_INSNS (1), /* cost of FABS instruction. */ > + COSTS_N_INSNS (1), /* cost of FCHS instruction. */ > + COSTS_N_INSNS (70), /* cost of FSQRT instruction. */ > + pentium_memcpy, > + pentium_memset, > + 1, /* scalar_stmt_cost. */ > + 1, /* scalar load_cost. */ > + 1, /* scalar_store_cost. */ > + 1, /* vec_stmt_cost. */ > + 1, /* vec_to_scalar_cost. */ > + 1, /* scalar_to_vec_cost. */ > + 1, /* vec_align_load_cost. */ > + 2, /* vec_unalign_load_cost. */ > + 1, /* vec_store_cost. */ > + 3, /* cond_taken_branch_cost. */ > + 1, /* cond_not_taken_branch_cost. */ > +}; > + > /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes > (we ensure the alignment). 
For small blocks inline loop is still a > noticeable win, for bigger blocks either rep movsl or rep movsb is > @@ -2027,6 +2095,7 @@ const struct processor_costs *ix86_cost = &pentium_cost; > #define m_386 (1<<PROCESSOR_I386) > #define m_486 (1<<PROCESSOR_I486) > #define m_PENT (1<<PROCESSOR_PENTIUM) > +#define m_IAMCU (1<<PROCESSOR_IAMCU) > #define m_PPRO (1<<PROCESSOR_PENTIUMPRO) > #define m_PENT4 (1<<PROCESSOR_PENTIUM4) > #define m_NOCONA (1<<PROCESSOR_NOCONA) > @@ -2086,7 +2155,7 @@ unsigned char ix86_arch_features[X86_ARCH_LAST]; > ix86_arch_features based on the processor mask. */ > static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = { > /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */ > - ~(m_386 | m_486 | m_PENT | m_K6), > + ~(m_386 | m_486 | m_PENT | m_IAMCU | m_K6), > > /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */ > ~m_386, > @@ -2497,6 +2566,7 @@ static const struct ptt > processor_target_table[PROCESSOR_max] = > {"i386", &i386_cost, 4, 3, 4, 3, 4}, > {"i486", &i486_cost, 16, 15, 16, 15, 16}, > {"pentium", &pentium_cost, 16, 7, 16, 7, 16}, > + {"iamcu", &iamcu_cost, 16, 7, 16, 7, 16}, > {"pentiumpro", &pentiumpro_cost, 16, 15, 16, 10, 16}, > {"pentium4", &pentium4_cost, 0, 0, 0, 0, 0}, > {"nocona", &nocona_cost, 0, 0, 0, 0, 0}, > @@ -3246,6 +3316,7 @@ ix86_option_override_internal (bool main_args_p, > {"i486", PROCESSOR_I486, CPU_NONE, 0}, > {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0}, > {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0}, > + {"iamcu", PROCESSOR_IAMCU, CPU_PENTIUM, 0}, > {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX}, > {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX}, > {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW | > PTA_PRFCHW}, > @@ -26138,6 +26209,7 @@ ix86_issue_rate (void) > switch (ix86_tune) > { > case PROCESSOR_PENTIUM: > + case PROCESSOR_IAMCU: > case PROCESSOR_BONNELL: > case PROCESSOR_SILVERMONT: > case PROCESSOR_KNL: > @@ -26324,6 +26396,7 @@ 
ix86_adjust_cost (rtx_insn *insn, rtx link, rtx_insn > *dep_insn, int cost) > switch (ix86_tune) > { > case PROCESSOR_PENTIUM: > + case PROCESSOR_IAMCU: > /* Address Generation Interlock adds a cycle of latency. */ > if (insn_type == TYPE_LEA) > { > @@ -26533,6 +26606,7 @@ ia32_multipass_dfa_lookahead (void) > switch (ix86_tune) > { > case PROCESSOR_PENTIUM: > + case PROCESSOR_IAMCU: > return 2; > > case PROCESSOR_PENTIUMPRO: > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h > index d710b3d..f357e79 100644 > --- a/gcc/config/i386/i386.h > +++ b/gcc/config/i386/i386.h > @@ -2266,6 +2266,7 @@ enum processor_type > PROCESSOR_I386, /* 80386 */ > PROCESSOR_I486, /* 80486DX, 80486SX, 80486DX[24] */ > PROCESSOR_PENTIUM, > + PROCESSOR_IAMCU, > PROCESSOR_PENTIUMPRO, > PROCESSOR_PENTIUM4, > PROCESSOR_NOCONA, > diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def > index bb3209d..42a560b 100644 > --- a/gcc/config/i386/x86-tune.def > +++ b/gcc/config/i386/x86-tune.def > @@ -40,8 +40,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. > If not, see > > /* X86_TUNE_SCHEDULE: Enable scheduling. */ > DEF_TUNE (X86_TUNE_SCHEDULE, "schedule", > - m_PENT | m_PPRO | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL > - | m_KNL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC) > + m_PENT | m_IAMCU | m_PPRO | m_CORE_ALL | m_BONNELL | m_SILVERMONT > + | m_INTEL | m_KNL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC) > > /* X86_TUNE_PARTIAL_REG_DEPENDENCY: Enable more register renaming > on modern chips. Preffer stores affecting whole integer register > @@ -172,19 +172,21 @@ DEF_TUNE (X86_TUNE_PUSH_MEMORY, "push_memory", > /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred > over esp subtraction. */ > DEF_TUNE (X86_TUNE_SINGLE_PUSH, "single_push", m_386 | m_486 | m_PENT > - | m_K6_GEODE) > + | m_IAMCU | m_K6_GEODE) > > /* X86_TUNE_DOUBLE_PUSH. Enable if double push insn is preferred > over esp subtraction. 
*/ > -DEF_TUNE (X86_TUNE_DOUBLE_PUSH, "double_push", m_PENT | m_K6_GEODE) > +DEF_TUNE (X86_TUNE_DOUBLE_PUSH, "double_push", m_PENT | m_IAMCU > + | m_K6_GEODE) > > /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred > over esp addition. */ > -DEF_TUNE (X86_TUNE_SINGLE_POP, "single_pop", m_386 | m_486 | m_PENT | m_PPRO) > +DEF_TUNE (X86_TUNE_SINGLE_POP, "single_pop", m_386 | m_486 | m_PENT > + | m_IAMCU | m_PPRO) > > /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred > over esp addition. */ > -DEF_TUNE (X86_TUNE_DOUBLE_POP, "double_pop", m_PENT) > +DEF_TUNE (X86_TUNE_DOUBLE_POP, "double_pop", m_PENT | m_IAMCU) > > > /*****************************************************************************/ > /* Branch predictor tuning > */ > @@ -224,7 +226,7 @@ DEF_TUNE (X86_TUNE_LCP_STALL, "lcp_stall", m_CORE_ALL | > m_GENERIC) > > /* X86_TUNE_READ_MODIFY: Enable use of read-modify instructions such > as "add mem, reg". */ > -DEF_TUNE (X86_TUNE_READ_MODIFY, "read_modify", ~(m_PENT | m_PPRO)) > +DEF_TUNE (X86_TUNE_READ_MODIFY, "read_modify", ~(m_PENT | m_IAMCU | m_PPRO)) > > /* X86_TUNE_USE_INCDEC: Enable use of inc/dec instructions. */ > DEF_TUNE (X86_TUNE_USE_INCDEC, "use_incdec", > @@ -284,7 +286,8 @@ DEF_TUNE (X86_TUNE_USE_SAHF, "use_sahf", > > /* X86_TUNE_USE_CLTD: Controls use of CLTD and CTQO instructions. */ > DEF_TUNE (X86_TUNE_USE_CLTD, "use_cltd", > - ~(m_PENT | m_BONNELL | m_SILVERMONT | m_KNL | m_INTEL | m_K6)) > + ~(m_PENT | m_IAMCU | m_BONNELL | m_SILVERMONT | m_KNL | m_INTEL > + | m_K6)) > > /* X86_TUNE_USE_BT: Enable use of BT (bit test) instructions. */ > DEF_TUNE (X86_TUNE_USE_BT, "use_bt", > @@ -304,8 +307,8 @@ DEF_TUNE (X86_TUNE_USE_HIMODE_FIOP, "use_himode_fiop", > /* X86_TUNE_USE_SIMODE_FIOP: Enables use of x87 instructions with 32bit > integer operand. 
*/ > DEF_TUNE (X86_TUNE_USE_SIMODE_FIOP, "use_simode_fiop", > - ~(m_PENT | m_PPRO | m_CORE_ALL | m_BONNELL | m_SILVERMONT > - | m_KNL | m_INTEL | m_AMD_MULTIPLE | m_GENERIC)) > + ~(m_PENT | m_IAMCU | m_PPRO | m_CORE_ALL | m_BONNELL > + | m_SILVERMONT | m_KNL | m_INTEL | m_AMD_MULTIPLE | m_GENERIC)) > > /* X86_TUNE_USE_FFREEP: Use freep instruction instead of fstp. */ > DEF_TUNE (X86_TUNE_USE_FFREEP, "use_ffreep", m_AMD_MULTIPLE) > @@ -444,7 +447,8 @@ DEF_TUNE (X86_TUNE_SHIFT1, "shift1", ~m_486) > > /* X86_TUNE_ZERO_EXTEND_WITH_AND: Use AND instruction instead > of mozbl/movwl. */ > -DEF_TUNE (X86_TUNE_ZERO_EXTEND_WITH_AND, "zero_extend_with_and", m_486 | > m_PENT) > +DEF_TUNE (X86_TUNE_ZERO_EXTEND_WITH_AND, "zero_extend_with_and", > + m_486 | m_PENT | m_IAMCU) > > /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have same latency for HImode > and SImode multiply, but 386 and 486 do HImode multiply faster. */ > @@ -454,19 +458,21 @@ DEF_TUNE (X86_TUNE_PROMOTE_HIMODE_IMUL, > "promote_himode_imul", > /* X86_TUNE_FAST_PREFIX: Enable demoting some 32bit or 64bit arithmetic > into 16bit/8bit when resulting sequence is shorter. For example > for "and $-65536, reg" to 16bit store of 0. */ > -DEF_TUNE (X86_TUNE_FAST_PREFIX, "fast_prefix", ~(m_386 | m_486 | m_PENT)) > +DEF_TUNE (X86_TUNE_FAST_PREFIX, "fast_prefix", > + ~(m_386 | m_486 | m_PENT | m_IAMCU)) > > /* X86_TUNE_READ_MODIFY_WRITE: Enable use of read modify write instructions > such as "add $1, mem". */ > -DEF_TUNE (X86_TUNE_READ_MODIFY_WRITE, "read_modify_write", ~m_PENT) > +DEF_TUNE (X86_TUNE_READ_MODIFY_WRITE, "read_modify_write", > + ~(m_PENT | m_IAMCU)) > > /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR > than a MOV. */ > -DEF_TUNE (X86_TUNE_MOVE_M1_VIA_OR, "move_m1_via_or", m_PENT) > +DEF_TUNE (X86_TUNE_MOVE_M1_VIA_OR, "move_m1_via_or", m_PENT | m_IAMCU) > > /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is, > but one byte longer. 
*/ > -DEF_TUNE (X86_TUNE_NOT_UNPAIRABLE, "not_unpairable", m_PENT) > +DEF_TUNE (X86_TUNE_NOT_UNPAIRABLE, "not_unpairable", m_PENT | m_IAMCU) > > /* X86_TUNE_PARTIAL_REG_STALL: Pentium pro, unlike later chips, handled > use of partial registers by renaming. This improved performance of 16bit > diff --git a/gcc/testsuite/gcc.target/i386/pr66749.c > b/gcc/testsuite/gcc.target/i386/pr66749.c > new file mode 100644 > index 0000000..affda08 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr66749.c > @@ -0,0 +1,14 @@ > +/* { dg-do compile } */ > +/* { dg-require-effective-target ia32 } */ > +/* { dg-require-effective-target nonpic } */ > +/* { dg-options "-O2 -miamcu -mtune=iamcu" } */ > + > +char a[10], b[10]; > + > +int f(int i) > +{ > + return a[i+1] + b[i+1]; > +} > + > +/* { dg-final { scan-assembler "a\\+1" } } */ > +/* { dg-final { scan-assembler "b\\+1" } } */ > -- > 2.4.3 >