On Thursday 22 October 2009, Aurelien Jarno wrote:
> Probably the second. Changing the instruction pointer in the helper
> instead of using the proper goto_tb TCG op prevents TB chaining, and
> therefore has a huge impact on performance.
>
> It's something not difficult to implement, and that I would definitely
> want to see in the patch before getting it merged.
OK, I implemented it, and the surprising result is that performance does
not get any better; in fact it even suffers a little bit. (My standard
quick test, the polarssl test suite, shows about a 2% performance impact
when profiled with cachegrind).
Is there anything I might have overlooked? I modelled my implementation after
those in the existing targets. (See the attached patch that goes on top
of my other S/390 patches.)
CU
Uli
--
SUSE LINUX Products GmbH, GF: Markus Rex, HRB 16746 (AG Nürnberg)
diff --git a/target-s390/helpers.h b/target-s390/helpers.h
index 0d16760..6009312 100644
--- a/target-s390/helpers.h
+++ b/target-s390/helpers.h
@@ -15,7 +15,6 @@ DEF_HELPER_FLAGS_1(set_cc_comp_s64, TCG_CALL_PURE|TCG_CALL_CONST, i32, s64)
DEF_HELPER_FLAGS_1(set_cc_nz_u32, TCG_CALL_PURE|TCG_CALL_CONST, i32, i32)
DEF_HELPER_FLAGS_1(set_cc_nz_u64, TCG_CALL_PURE|TCG_CALL_CONST, i32, i64)
DEF_HELPER_FLAGS_2(set_cc_icm, TCG_CALL_PURE|TCG_CALL_CONST, i32, i32, i32)
-DEF_HELPER_4(brc, void, i32, i32, i64, s32)
DEF_HELPER_3(brctg, void, i64, i64, s32)
DEF_HELPER_3(brct, void, i32, i64, s32)
DEF_HELPER_4(brcl, void, i32, i32, i64, s64)
diff --git a/target-s390/op_helper.c b/target-s390/op_helper.c
index 637d22f..f7f52ba 100644
--- a/target-s390/op_helper.c
+++ b/target-s390/op_helper.c
@@ -218,17 +218,6 @@ uint32_t HELPER(set_cc_icm)(uint32_t mask, uint32_t val)
return cc;
}
-/* relative conditional branch */
-void HELPER(brc)(uint32_t cc, uint32_t mask, uint64_t pc, int32_t offset)
-{
- if ( mask & ( 1 << (3 - cc) ) ) {
- env->psw.addr = pc + offset;
- }
- else {
- env->psw.addr = pc + 4;
- }
-}
-
/* branch relative on 64-bit count (condition is computed inline, this only
does the branch */
void HELPER(brctg)(uint64_t flag, uint64_t pc, int32_t offset)
diff --git a/target-s390/translate.c b/target-s390/translate.c
index 9ffa7bd..5a7cfe7 100644
--- a/target-s390/translate.c
+++ b/target-s390/translate.c
@@ -49,6 +49,7 @@ struct DisasContext {
uint64_t pc;
int is_jmp;
CPUS390XState *env;
+ struct TranslationBlock *tb;
};
#define DISAS_EXCP 4
@@ -359,23 +360,55 @@ static void gen_bcr(uint32_t mask, int tr, uint64_t offset)
tcg_temp_free(target);
}
-static void gen_brc(uint32_t mask, uint64_t pc, int32_t offset)
+static inline void gen_goto_tb(DisasContext *s, int tb_num, target_ulong pc)
{
- TCGv p;
- TCGv_i32 m, o;
+ TranslationBlock *tb;
+
+ tb = s->tb;
+ /* NOTE: we handle the case where the TB spans two pages here */
+ if ((pc & TARGET_PAGE_MASK) == (tb->pc & TARGET_PAGE_MASK) ||
+ (pc & TARGET_PAGE_MASK) == ((s->pc - 1) & TARGET_PAGE_MASK)) {
+ /* jump to same page: we can use a direct jump */
+ tcg_gen_mov_i32(global_cc, cc);
+ tcg_gen_goto_tb(tb_num);
+ tcg_gen_movi_i64(psw_addr, pc);
+ tcg_gen_exit_tb((long)tb + tb_num);
+ } else {
+ /* jump to another page: currently not optimized */
+ tcg_gen_movi_i64(psw_addr, pc);
+ tcg_gen_mov_i32(global_cc, cc);
+ tcg_gen_exit_tb(0);
+ }
+}
+
+static void gen_brc(uint32_t mask, DisasContext *s, int32_t offset)
+{
+ TCGv_i32 r;
+ TCGv_i32 tmp, tmp2;
+ int skip;
if (mask == 0xf) { /* unconditional */
- tcg_gen_movi_i64(psw_addr, pc + offset);
+ //tcg_gen_movi_i64(psw_addr, s->pc + offset);
+ gen_goto_tb(s, 0, s->pc + offset);
}
else {
- m = tcg_const_i32(mask);
- p = tcg_const_i64(pc);
- o = tcg_const_i32(offset);
- gen_helper_brc(cc, m, p, o);
- tcg_temp_free(m);
- tcg_temp_free(p);
- tcg_temp_free(o);
+ tmp = tcg_const_i32(3);
+ tcg_gen_sub_i32(tmp, tmp, cc); /* 3 - cc */
+ tmp2 = tcg_const_i32(1);
+ tcg_gen_shl_i32(tmp2, tmp2, tmp); /* 1 << (3 - cc) */
+ r = tcg_const_i32(mask);
+ tcg_gen_and_i32(r, r, tmp2); /* mask & (1 << (3 - cc)) */
+ tcg_temp_free(tmp);
+ tcg_temp_free(tmp2);
+ skip = gen_new_label();
+ tcg_gen_brcondi_i32(TCG_COND_EQ, r, 0, skip);
+ gen_goto_tb(s, 0, s->pc + offset);
+ gen_set_label(skip);
+ gen_goto_tb(s, 1, s->pc + 4);
+ tcg_gen_mov_i32(global_cc, cc);
+ tcg_temp_free(r);
}
+ s->is_jmp = DISAS_TB_JUMP;
}
static void gen_set_cc_add64(TCGv v1, TCGv v2, TCGv vr)
@@ -1143,9 +1176,7 @@ static void disas_a7(DisasContext *s, int op, int r1, int i2)
tcg_temp_free(tmp2);
break;
case 0x4: /* brc m1, i2 */
- /* FIXME: optimize m1 == 0xf (unconditional) case */
- gen_brc(r1, s->pc, i2 * 2);
- s->is_jmp = DISAS_JUMP;
+ gen_brc(r1, s, i2 * 2);
return;
case 0x5: /* BRAS R1,I2 [RI] */
tmp = tcg_const_i64(s->pc + 4);
@@ -2739,6 +2770,7 @@ static inline void gen_intermediate_code_internal (CPUState *env,
dc.env = env;
dc.pc = pc_start;
dc.is_jmp = DISAS_NEXT;
+ dc.tb = tb;
gen_opc_end = gen_opc_buf + OPC_MAX_SIZE;
@@ -2778,8 +2810,11 @@ static inline void gen_intermediate_code_internal (CPUState *env,
num_insns++;
} while (!dc.is_jmp && gen_opc_ptr < gen_opc_end && dc.pc < next_page_start
&& num_insns < max_insns && !env->singlestep_enabled);
- tcg_gen_mov_i32(global_cc, cc);
- tcg_temp_free(cc);
+
+ if (dc.is_jmp != DISAS_TB_JUMP) {
+ tcg_gen_mov_i32(global_cc, cc);
+ tcg_temp_free(cc);
+ }
if (!dc.is_jmp) {
tcg_gen_st_i64(tcg_const_i64(dc.pc), cpu_env, offsetof(CPUState, psw.addr));
@@ -2794,7 +2829,9 @@ static inline void gen_intermediate_code_internal (CPUState *env,
if (tb->cflags & CF_LAST_IO)
gen_io_end();
/* Generate the return instruction */
- tcg_gen_exit_tb(0);
+ if (dc.is_jmp != DISAS_TB_JUMP) {
+ tcg_gen_exit_tb(0);
+ }
gen_icount_end(tb, num_insns);
*gen_opc_ptr = INDEX_op_end;
if (search_pc) {