IA MCU is based on the Intel Pentium ISA, but without x87 instructions and with parameters passed in registers.  We want to optimize for IA MCU without changing the existing Pentium code generation.  This patch adds PROCESSOR_IAMCU for -march=iamcu, which is based on -march=pentium with updated cost tables.
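To illustrate the kind of code this tuning is aimed at, here is a small example (the function name is made up and it is not part of the patch; the calling-convention and soft-float details in the comments are my reading of the IA MCU ABI, stated as assumptions rather than something this patch implements):

  /* Illustrative only -- not part of the patch.  Assumption: under
     the IA MCU ABI the leading integer arguments arrive in registers
     rather than on the stack, and with no x87 any floating point is
     lowered to soft-float library calls.  */
  int
  scale_and_add (int x, int y)
  {
    /* With options like -m32 -miamcu -mtune=iamcu -O2 (the new test
       uses -O2 -miamcu -mtune=iamcu), this stays plain integer code;
       cmov is never used because X86_ARCH_CMOV is disabled for
       m_IAMCU.  */
    return x * 3 + y;
  }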
OK for trunk?

Thanks.

H.J.
--
gcc/

	PR target/66749
	* config/i386/i386.c (iamcu_cost): New.
	(m_IAMCU): Likewise.
	(initial_ix86_arch_features): Disable X86_ARCH_CMOV for m_IAMCU.
	(processor_target_table): Add an entry for "iamcu".
	(processor_alias_table): Likewise.
	(ix86_issue_rate): Handle PROCESSOR_IAMCU.
	(ix86_adjust_cost): Likewise.
	(ia32_multipass_dfa_lookahead): Likewise.
	* config/i386/i386.h (processor_type): Add PROCESSOR_IAMCU.
	* config/i386/x86-tune.def: Updated for m_IAMCU.

gcc/testsuite/

	PR target/66749
	* gcc.target/i386/pr66749.c: New test.
---
 gcc/config/i386/i386.c                  | 76 ++++++++++++++++++++++++++++++++-
 gcc/config/i386/i386.h                  |  1 +
 gcc/config/i386/x86-tune.def            | 36 +++++++++-------
 gcc/testsuite/gcc.target/i386/pr66749.c | 14 ++++++
 4 files changed, 111 insertions(+), 16 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr66749.c

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 7d26e8c..98250c4 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -426,6 +426,74 @@ struct processor_costs pentium_cost = {
   1,                                    /* cond_not_taken_branch_cost.  */
 };
 
+static const
+struct processor_costs iamcu_cost = {
+  COSTS_N_INSNS (1),                    /* cost of an add instruction */
+  COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
+  COSTS_N_INSNS (4),                    /* variable shift costs */
+  COSTS_N_INSNS (1),                    /* constant shift costs */
+  {COSTS_N_INSNS (11),                  /* cost of starting multiply for QI */
+   COSTS_N_INSNS (11),                  /*                               HI */
+   COSTS_N_INSNS (11),                  /*                               SI */
+   COSTS_N_INSNS (11),                  /*                               DI */
+   COSTS_N_INSNS (11)},                 /*                            other */
+  0,                                    /* cost of multiply per each bit set */
+  {COSTS_N_INSNS (25),                  /* cost of a divide/mod for QI */
+   COSTS_N_INSNS (25),                  /*                          HI */
+   COSTS_N_INSNS (25),                  /*                          SI */
+   COSTS_N_INSNS (25),                  /*                          DI */
+   COSTS_N_INSNS (25)},                 /*                       other */
+  COSTS_N_INSNS (3),                    /* cost of movsx */
+  COSTS_N_INSNS (2),                    /* cost of movzx */
+  8,                                    /* "large" insn */
+  6,                                    /* MOVE_RATIO */
+  6,                                    /* cost for loading QImode using movzbl */
+  {2, 4, 2},                            /* cost of loading integer registers
+                                           in QImode, HImode and SImode.
+                                           Relative to reg-reg move (2).  */
+  {2, 4, 2},                            /* cost of storing integer registers */
+  2,                                    /* cost of reg,reg fld/fst */
+  {2, 2, 6},                            /* cost of loading fp registers
+                                           in SFmode, DFmode and XFmode */
+  {4, 4, 6},                            /* cost of storing fp registers
+                                           in SFmode, DFmode and XFmode */
+  8,                                    /* cost of moving MMX register */
+  {8, 8},                               /* cost of loading MMX registers
+                                           in SImode and DImode */
+  {8, 8},                               /* cost of storing MMX registers
+                                           in SImode and DImode */
+  2,                                    /* cost of moving SSE register */
+  {4, 8, 16},                           /* cost of loading SSE registers
+                                           in SImode, DImode and TImode */
+  {4, 8, 16},                           /* cost of storing SSE registers
+                                           in SImode, DImode and TImode */
+  3,                                    /* MMX or SSE register to integer */
+  8,                                    /* size of l1 cache.  */
+  8,                                    /* size of l2 cache  */
+  0,                                    /* size of prefetch block */
+  0,                                    /* number of parallel prefetches */
+  2,                                    /* Branch cost */
+  COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
+  COSTS_N_INSNS (3),                    /* cost of FMUL instruction.  */
+  COSTS_N_INSNS (39),                   /* cost of FDIV instruction.  */
+  COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
+  COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
+  COSTS_N_INSNS (70),                   /* cost of FSQRT instruction.  */
+  pentium_memcpy,
+  pentium_memset,
+  1,                                    /* scalar_stmt_cost.  */
+  1,                                    /* scalar load_cost.  */
+  1,                                    /* scalar_store_cost.  */
+  1,                                    /* vec_stmt_cost.  */
+  1,                                    /* vec_to_scalar_cost.  */
+  1,                                    /* scalar_to_vec_cost.  */
+  1,                                    /* vec_align_load_cost.  */
+  2,                                    /* vec_unalign_load_cost.  */
+  1,                                    /* vec_store_cost.  */
+  3,                                    /* cond_taken_branch_cost.  */
+  1,                                    /* cond_not_taken_branch_cost.  */
+};
+
 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
    (we ensure the alignment).  For small blocks inline loop is still a
    noticeable win, for bigger blocks either rep movsl or rep movsb is
@@ -2027,6 +2095,7 @@ const struct processor_costs *ix86_cost = &pentium_cost;
 #define m_386 (1<<PROCESSOR_I386)
 #define m_486 (1<<PROCESSOR_I486)
 #define m_PENT (1<<PROCESSOR_PENTIUM)
+#define m_IAMCU (1<<PROCESSOR_IAMCU)
 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
 #define m_NOCONA (1<<PROCESSOR_NOCONA)
@@ -2086,7 +2155,7 @@ unsigned char ix86_arch_features[X86_ARCH_LAST];
    ix86_arch_features based on the processor mask.  */
 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
   /* X86_ARCH_CMOV: Conditional move was added for pentiumpro.  */
-  ~(m_386 | m_486 | m_PENT | m_K6),
+  ~(m_386 | m_486 | m_PENT | m_IAMCU | m_K6),
 
   /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486.  */
   ~m_386,
@@ -2497,6 +2566,7 @@ static const struct ptt processor_target_table[PROCESSOR_max] =
   {"i386", &i386_cost, 4, 3, 4, 3, 4},
   {"i486", &i486_cost, 16, 15, 16, 15, 16},
   {"pentium", &pentium_cost, 16, 7, 16, 7, 16},
+  {"iamcu", &iamcu_cost, 16, 7, 16, 7, 16},
   {"pentiumpro", &pentiumpro_cost, 16, 15, 16, 10, 16},
   {"pentium4", &pentium4_cost, 0, 0, 0, 0, 0},
   {"nocona", &nocona_cost, 0, 0, 0, 0, 0},
@@ -3246,6 +3316,7 @@ ix86_option_override_internal (bool main_args_p,
       {"i486", PROCESSOR_I486, CPU_NONE, 0},
       {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
       {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
+      {"iamcu", PROCESSOR_IAMCU, CPU_PENTIUM, 0},
       {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
       {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
       {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
@@ -26138,6 +26209,7 @@ ix86_issue_rate (void)
   switch (ix86_tune)
     {
     case PROCESSOR_PENTIUM:
+    case PROCESSOR_IAMCU:
     case PROCESSOR_BONNELL:
     case PROCESSOR_SILVERMONT:
     case PROCESSOR_KNL:
@@ -26324,6 +26396,7 @@ ix86_adjust_cost (rtx_insn *insn, rtx link, rtx_insn *dep_insn, int cost)
   switch (ix86_tune)
     {
     case PROCESSOR_PENTIUM:
+    case PROCESSOR_IAMCU:
       /* Address Generation Interlock adds a cycle of latency.  */
       if (insn_type == TYPE_LEA)
 	{
@@ -26533,6 +26606,7 @@ ia32_multipass_dfa_lookahead (void)
   switch (ix86_tune)
     {
     case PROCESSOR_PENTIUM:
+    case PROCESSOR_IAMCU:
       return 2;
 
     case PROCESSOR_PENTIUMPRO:
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index d710b3d..f357e79 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -2266,6 +2266,7 @@ enum processor_type
   PROCESSOR_I386,                       /* 80386 */
   PROCESSOR_I486,                       /* 80486DX, 80486SX, 80486DX[24] */
   PROCESSOR_PENTIUM,
+  PROCESSOR_IAMCU,
   PROCESSOR_PENTIUMPRO,
   PROCESSOR_PENTIUM4,
   PROCESSOR_NOCONA,
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index bb3209d..42a560b 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -40,8 +40,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
 
 /* X86_TUNE_SCHEDULE: Enable scheduling.  */
 DEF_TUNE (X86_TUNE_SCHEDULE, "schedule",
-          m_PENT | m_PPRO | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL
-          | m_KNL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC)
+          m_PENT | m_IAMCU | m_PPRO | m_CORE_ALL | m_BONNELL | m_SILVERMONT
+          | m_INTEL | m_KNL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC)
 
 /* X86_TUNE_PARTIAL_REG_DEPENDENCY: Enable more register renaming on modern
    chips.  Preffer stores affecting whole integer register
@@ -172,19 +172,21 @@ DEF_TUNE (X86_TUNE_PUSH_MEMORY, "push_memory",
 /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
    over esp subtraction.  */
 DEF_TUNE (X86_TUNE_SINGLE_PUSH, "single_push", m_386 | m_486 | m_PENT
-          | m_K6_GEODE)
+          | m_IAMCU | m_K6_GEODE)
 
 /* X86_TUNE_DOUBLE_PUSH. Enable if double push insn is preferred
    over esp subtraction.  */
-DEF_TUNE (X86_TUNE_DOUBLE_PUSH, "double_push", m_PENT | m_K6_GEODE)
+DEF_TUNE (X86_TUNE_DOUBLE_PUSH, "double_push", m_PENT | m_IAMCU
+          | m_K6_GEODE)
 
 /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred over esp
    addition.  */
-DEF_TUNE (X86_TUNE_SINGLE_POP, "single_pop", m_386 | m_486 | m_PENT | m_PPRO)
+DEF_TUNE (X86_TUNE_SINGLE_POP, "single_pop", m_386 | m_486 | m_PENT
+          | m_IAMCU | m_PPRO)
 
 /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred over esp
    addition.  */
-DEF_TUNE (X86_TUNE_DOUBLE_POP, "double_pop", m_PENT)
+DEF_TUNE (X86_TUNE_DOUBLE_POP, "double_pop", m_PENT | m_IAMCU)
 
 /*****************************************************************************/
 /* Branch predictor tuning                                                   */
@@ -224,7 +226,7 @@ DEF_TUNE (X86_TUNE_LCP_STALL, "lcp_stall", m_CORE_ALL | m_GENERIC)
 
 /* X86_TUNE_READ_MODIFY: Enable use of read-modify instructions such
    as "add mem, reg".  */
-DEF_TUNE (X86_TUNE_READ_MODIFY, "read_modify", ~(m_PENT | m_PPRO))
+DEF_TUNE (X86_TUNE_READ_MODIFY, "read_modify", ~(m_PENT | m_IAMCU | m_PPRO))
 
 /* X86_TUNE_USE_INCDEC: Enable use of inc/dec instructions.  */
 DEF_TUNE (X86_TUNE_USE_INCDEC, "use_incdec",
@@ -284,7 +286,8 @@ DEF_TUNE (X86_TUNE_USE_SAHF, "use_sahf",
 
 /* X86_TUNE_USE_CLTD: Controls use of CLTD and CTQO instructions.  */
 DEF_TUNE (X86_TUNE_USE_CLTD, "use_cltd",
-          ~(m_PENT | m_BONNELL | m_SILVERMONT | m_KNL | m_INTEL | m_K6))
+          ~(m_PENT | m_IAMCU | m_BONNELL | m_SILVERMONT | m_KNL | m_INTEL
+          | m_K6))
 
 /* X86_TUNE_USE_BT: Enable use of BT (bit test) instructions.  */
 DEF_TUNE (X86_TUNE_USE_BT, "use_bt",
@@ -304,8 +307,8 @@ DEF_TUNE (X86_TUNE_USE_HIMODE_FIOP, "use_himode_fiop",
 /* X86_TUNE_USE_SIMODE_FIOP: Enables use of x87 instructions with 32bit
    integer operand.  */
 DEF_TUNE (X86_TUNE_USE_SIMODE_FIOP, "use_simode_fiop",
-          ~(m_PENT | m_PPRO | m_CORE_ALL | m_BONNELL | m_SILVERMONT
-            | m_KNL | m_INTEL | m_AMD_MULTIPLE | m_GENERIC))
+          ~(m_PENT | m_IAMCU | m_PPRO | m_CORE_ALL | m_BONNELL
+            | m_SILVERMONT | m_KNL | m_INTEL | m_AMD_MULTIPLE | m_GENERIC))
 
 /* X86_TUNE_USE_FFREEP: Use freep instruction instead of fstp.  */
 DEF_TUNE (X86_TUNE_USE_FFREEP, "use_ffreep", m_AMD_MULTIPLE)
@@ -444,7 +447,8 @@ DEF_TUNE (X86_TUNE_SHIFT1, "shift1", ~m_486)
 
 /* X86_TUNE_ZERO_EXTEND_WITH_AND: Use AND instruction instead
    of mozbl/movwl.  */
-DEF_TUNE (X86_TUNE_ZERO_EXTEND_WITH_AND, "zero_extend_with_and", m_486 | m_PENT)
+DEF_TUNE (X86_TUNE_ZERO_EXTEND_WITH_AND, "zero_extend_with_and",
+          m_486 | m_PENT | m_IAMCU)
 
 /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have same latency for HImode and
    SImode multiply, but 386 and 486 do HImode multiply faster.  */
@@ -454,19 +458,21 @@ DEF_TUNE (X86_TUNE_PROMOTE_HIMODE_IMUL, "promote_himode_imul",
 /* X86_TUNE_FAST_PREFIX: Enable demoting some 32bit or 64bit arithmetic
    into 16bit/8bit when resulting sequence is shorter. For example
    for "and $-65536, reg" to 16bit store of 0.  */
-DEF_TUNE (X86_TUNE_FAST_PREFIX, "fast_prefix", ~(m_386 | m_486 | m_PENT))
+DEF_TUNE (X86_TUNE_FAST_PREFIX, "fast_prefix",
+          ~(m_386 | m_486 | m_PENT | m_IAMCU))
 
 /* X86_TUNE_READ_MODIFY_WRITE: Enable use of read modify write instructions
    such as "add $1, mem".  */
-DEF_TUNE (X86_TUNE_READ_MODIFY_WRITE, "read_modify_write", ~m_PENT)
+DEF_TUNE (X86_TUNE_READ_MODIFY_WRITE, "read_modify_write",
+          ~(m_PENT | m_IAMCU))
 
 /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
    than a MOV.  */
-DEF_TUNE (X86_TUNE_MOVE_M1_VIA_OR, "move_m1_via_or", m_PENT)
+DEF_TUNE (X86_TUNE_MOVE_M1_VIA_OR, "move_m1_via_or", m_PENT | m_IAMCU)
 
 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
    but one byte longer.  */
-DEF_TUNE (X86_TUNE_NOT_UNPAIRABLE, "not_unpairable", m_PENT)
+DEF_TUNE (X86_TUNE_NOT_UNPAIRABLE, "not_unpairable", m_PENT | m_IAMCU)
 
 /* X86_TUNE_PARTIAL_REG_STALL: Pentium pro, unlike later chips, handled
    use of partial registers by renaming.  This improved performance of 16bit
diff --git a/gcc/testsuite/gcc.target/i386/pr66749.c b/gcc/testsuite/gcc.target/i386/pr66749.c
new file mode 100644
index 0000000..affda08
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr66749.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target ia32 } */
+/* { dg-require-effective-target nonpic } */
+/* { dg-options "-O2 -miamcu -mtune=iamcu" } */
+
+char a[10], b[10];
+
+int f(int i)
+{
+  return a[i+1] + b[i+1];
+}
+
+/* { dg-final { scan-assembler "a\\+1" } } */
+/* { dg-final { scan-assembler "b\\+1" } } */
-- 
2.4.3