On Thursday 22 October 2009, Aurelien Jarno wrote:
> Probably the second. Changing the instruction pointer in the helper
> instead of using the proper goto_tb TCG op prevents TB chaining, and
> therefore has a huge impact on performance.
>
> It's something not difficult to implement, and that I would definitely
> want to see in the patch before getting it merged.
OK, I implemented it, and the surprising result is that performance does
not get any better; in fact it even suffers a little bit. (My standard
quick test, the polarssl test suite, shows about a 2% performance impact
when profiled with cachegrind).
Is there anything I might have overlooked? I modelled my implementation after
those in the existing targets. (See the attached patch that goes on top
of my other S/390 patches.)
CU
Uli
--
SUSE LINUX Products GmbH, GF: Markus Rex, HRB 16746 (AG Nürnberg)
diff --git a/target-s390/helpers.h b/target-s390/helpers.h
index 0d16760..6009312 100644
--- a/target-s390/helpers.h
+++ b/target-s390/helpers.h
@@ -15,7 +15,6 @@ DEF_HELPER_FLAGS_1(set_cc_comp_s64, TCG_CALL_PURE|TCG_CALL_CONST, i32, s64)
DEF_HELPER_FLAGS_1(set_cc_nz_u32, TCG_CALL_PURE|TCG_CALL_CONST, i32, i32)
DEF_HELPER_FLAGS_1(set_cc_nz_u64, TCG_CALL_PURE|TCG_CALL_CONST, i32, i64)
DEF_HELPER_FLAGS_2(set_cc_icm, TCG_CALL_PURE|TCG_CALL_CONST, i32, i32, i32)
-DEF_HELPER_4(brc, void, i32, i32, i64, s32)
DEF_HELPER_3(brctg, void, i64, i64, s32)
DEF_HELPER_3(brct, void, i32, i64, s32)
DEF_HELPER_4(brcl, void, i32, i32, i64, s64)
diff --git a/target-s390/op_helper.c b/target-s390/op_helper.c
index 637d22f..f7f52ba 100644
--- a/target-s390/op_helper.c
+++ b/target-s390/op_helper.c
@@ -218,17 +218,6 @@ uint32_t HELPER(set_cc_icm)(uint32_t mask, uint32_t val)
return cc;
}
-/* relative conditional branch */
-void HELPER(brc)(uint32_t cc, uint32_t mask, uint64_t pc, int32_t offset)
-{
- if ( mask & ( 1 << (3 - cc) ) ) {
- env->psw.addr = pc + offset;
- }
- else {
- env->psw.addr = pc + 4;
- }
-}
-
/* branch relative on 64-bit count (condition is computed inline, this only
does the branch */
void HELPER(brctg)(uint64_t flag, uint64_t pc, int32_t offset)
diff --git a/target-s390/translate.c b/target-s390/translate.c
index 9ffa7bd..5a7cfe7 100644
--- a/target-s390/translate.c
+++ b/target-s390/translate.c
@@ -49,6 +49,7 @@ struct DisasContext {
uint64_t pc;
int is_jmp;
CPUS390XState *env;
+ struct TranslationBlock *tb;
};
#define DISAS_EXCP 4
@@ -359,23 +360,55 @@ static void gen_bcr(uint32_t mask, int tr, uint64_t offset)
tcg_temp_free(target);
}
-static void gen_brc(uint32_t mask, uint64_t pc, int32_t offset)
+static inline void gen_goto_tb(DisasContext *s, int tb_num, target_ulong pc)
{
- TCGv p;
- TCGv_i32 m, o;
+ TranslationBlock *tb;
+
+ tb = s->tb;
+ /* NOTE: we handle the case where the TB spans two pages here */
+ if ((pc & TARGET_PAGE_MASK) == (tb->pc & TARGET_PAGE_MASK) ||
+ (pc & TARGET_PAGE_MASK) == ((s->pc - 1) & TARGET_PAGE_MASK)) {
+ /* jump to same page: we can use a direct jump */
+ tcg_gen_mov_i32(global_cc, cc);
+ tcg_gen_goto_tb(tb_num);
+ tcg_gen_movi_i64(psw_addr, pc);
+ tcg_gen_exit_tb((long)tb + tb_num);
+ } else {
+ /* jump to another page: currently not optimized */
+ tcg_gen_movi_i64(psw_addr, pc);
+ tcg_gen_mov_i32(global_cc, cc);
+ tcg_gen_exit_tb(0);
+ }
+}
+
+static void gen_brc(uint32_t mask, DisasContext *s, int32_t offset)
+{
+ TCGv_i32 r;
+ TCGv_i32 tmp, tmp2;
+ int skip;
if (mask == 0xf) { /* unconditional */
- tcg_gen_movi_i64(psw_addr, pc + offset);
+ //tcg_gen_movi_i64(psw_addr, s->pc + offset);
+ gen_goto_tb(s, 0, s->pc + offset);
}
else {
- m = tcg_const_i32(mask);
- p = tcg_const_i64(pc);
- o = tcg_const_i32(offset);
- gen_helper_brc(cc, m, p, o);
- tcg_temp_free(m);
- tcg_temp_free(p);
- tcg_temp_free(o);
+ tmp = tcg_const_i32(3);
+ tcg_gen_sub_i32(tmp, tmp, cc); /* 3 - cc */
+ tmp2 = tcg_const_i32(1);
+ tcg_gen_shl_i32(tmp2, tmp2, tmp); /* 1 << (3 - cc) */
+ r = tcg_const_i32(mask);
+ tcg_gen_and_i32(r, r, tmp2); /* mask & (1 << (3 - cc)) */
+ tcg_temp_free(tmp);
+ tcg_temp_free(tmp2);
+ skip = gen_new_label();
+ tcg_gen_brcondi_i32(TCG_COND_EQ, r, 0, skip);
+ gen_goto_tb(s, 0, s->pc + offset);
+ gen_set_label(skip);
+ gen_goto_tb(s, 1, s->pc + 4);
+ tcg_gen_mov_i32(global_cc, cc);
+ tcg_temp_free(r);
}
+ s->is_jmp = DISAS_TB_JUMP;
}
static void gen_set_cc_add64(TCGv v1, TCGv v2, TCGv vr)
@@ -1143,9 +1176,7 @@ static void disas_a7(DisasContext *s, int op, int r1, int i2)
tcg_temp_free(tmp2);
break;
case 0x4: /* brc m1, i2 */
- /* FIXME: optimize m1 == 0xf (unconditional) case */
- gen_brc(r1, s->pc, i2 * 2);
- s->is_jmp = DISAS_JUMP;
+ gen_brc(r1, s, i2 * 2);
return;
case 0x5: /* BRAS R1,I2 [RI] */
tmp = tcg_const_i64(s->pc + 4);
@@ -2739,6 +2770,7 @@ static inline void gen_intermediate_code_internal (CPUState *env,
dc.env = env;
dc.pc = pc_start;
dc.is_jmp = DISAS_NEXT;
+ dc.tb = tb;
gen_opc_end = gen_opc_buf + OPC_MAX_SIZE;
@@ -2778,8 +2810,11 @@ static inline void gen_intermediate_code_internal (CPUState *env,
num_insns++;
} while (!dc.is_jmp && gen_opc_ptr < gen_opc_end && dc.pc < next_page_start
&& num_insns < max_insns && !env->singlestep_enabled);
- tcg_gen_mov_i32(global_cc, cc);
- tcg_temp_free(cc);
+
+ if (dc.is_jmp != DISAS_TB_JUMP) {
+ tcg_gen_mov_i32(global_cc, cc);
+ tcg_temp_free(cc);
+ }
if (!dc.is_jmp) {
tcg_gen_st_i64(tcg_const_i64(dc.pc), cpu_env, offsetof(CPUState, psw.addr));
@@ -2794,7 +2829,9 @@ static inline void gen_intermediate_code_internal (CPUState *env,
if (tb->cflags & CF_LAST_IO)
gen_io_end();
/* Generate the return instruction */
- tcg_gen_exit_tb(0);
+ if (dc.is_jmp != DISAS_TB_JUMP) {
+ tcg_gen_exit_tb(0);
+ }
gen_icount_end(tb, num_insns);
*gen_opc_ptr = INDEX_op_end;
if (search_pc) {