On 06/02/2016 01:07 PM, Michael Rolnik wrote:
Signed-off-by: Michael Rolnik <mrol...@gmail.com>
---
target-avr/translate-inst.c | 2443 +++++++++++++++++++++++++++++++++++++++++++
Is there a reason this code isn't going into translate.c?
You wouldn't need the declarations in translate-inst.h or translate.h.
+/*
+  NOTE: all registers are assumed to hold 8-bit values,
+  so all operations done on registers should preserve this property
+*/
+
+/*
+  NOTE: the flags C,H,V,N,S hold either 0 or 1
+  NOTE: the flag Z has inverted logic: when the value of Zf is 0 the flag is
+  assumed to be set; when non-zero, it is not set
+*/
This documentation belongs with the declarations in cpu.h.
+void gen_add_CHf(TCGv R, TCGv Rd, TCGv Rr);
+void gen_add_Vf(TCGv R, TCGv Rd, TCGv Rr);
+void gen_sub_CHf(TCGv R, TCGv Rd, TCGv Rr);
+void gen_sub_Vf(TCGv R, TCGv Rd, TCGv Rr);
+void gen_ZNSf(TCGv R);
+void gen_push_ret(CPUAVRState *env, int ret);
+void gen_pop_ret(CPUAVRState *env, TCGv ret);
+void gen_jmp_ez(void);
+void gen_jmp_z(void);
+
+void gen_set_addr(TCGv addr, TCGv H, TCGv M, TCGv L); /* H:M:L = addr */
+void gen_set_xaddr(TCGv addr);
+void gen_set_yaddr(TCGv addr);
+void gen_set_zaddr(TCGv addr);
+
+TCGv gen_get_addr(TCGv H, TCGv M, TCGv L); /* addr = H:M:L */
+TCGv gen_get_xaddr(void);
+TCGv gen_get_yaddr(void);
+TCGv gen_get_zaddr(void);
+int sex(int Imm, unsigned bits);
Order these functions properly and you don't need forward declarations.
+
+void gen_add_CHf(TCGv R, TCGv Rd, TCGv Rr)
+{
+ TCGv t1 = tcg_temp_new_i32();
+ TCGv t2 = tcg_temp_new_i32();
+ TCGv t3 = tcg_temp_new_i32();
+
+ tcg_gen_and_tl(t1, Rd, Rr); /* t1 = Rd & Rr */
+ tcg_gen_not_tl(t2, R); /* t2 = Rd & ~R */
+ tcg_gen_and_tl(t2, Rd, t2);
tcg_gen_andc_tl.
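I.e. something like (untested):

    tcg_gen_andc_tl(t2, Rd, R);    /* t2 = Rd & ~R */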
+    tcg_gen_not_tl(t3, R); /* t3 = Rr & ~R */
+ tcg_gen_and_tl(t3, Rr, t3);
Likewise.
+ tcg_gen_or_tl(t1, t1, t2); /* t1 = t1 | t2 | t3 */
+ tcg_gen_or_tl(t1, t1, t3);
While this is exactly the formula in the manual, it's also equal to
((Rd ^ Rr) ^ R) & 16
where we examine the difference between the non-carry addition (Rd ^ Rr) and
the carry addition (R) to find the carry out of bit 3. This reduces the
operation count to 2 from 5.
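Untested, but that sketch would be just:

    tcg_gen_xor_tl(t1, Rd, Rr);    /* non-carry addition */
    tcg_gen_xor_tl(t1, t1, R);     /* carry-out of bit 3 is now in bit 4 */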
+ tcg_gen_shri_tl(cpu_Cf, t1, 7); /* Cf = t1(7) */
Of course, Cf is also bit 8 of R, at least before you masked that off.
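E.g., assuming R is still unmasked at this point:

    tcg_gen_shri_tl(cpu_Cf, R, 8);     /* Cf = carry out of bit 7 */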
+ tcg_gen_shri_tl(cpu_Hf, t1, 3); /* Hf = t1(3) */
+ tcg_gen_andi_tl(cpu_Hf, cpu_Hf, 1);
Consider leaving Hf in bit 3 (or 4, as above) instead of shifting and masking
to bit 0. This makes it (slightly) more complicated to read the full SREG
value, but it saves two instructions per arithmetic instruction. This will add
up quickly.
One can make the same argument for most of the other flags.
Compare how this is handled in target-arm for cpu_[NZCV]F.
+void gen_add_Vf(TCGv R, TCGv Rd, TCGv Rr)
+{
+ TCGv t1 = tcg_temp_new_i32();
+ TCGv t2 = tcg_temp_new_i32();
+
+ tcg_gen_not_tl(t1, Rd); /* t1 = ~Rd & ~Rr & R */
+ tcg_gen_not_tl(t2, Rr);
+ tcg_gen_and_tl(t1, t1, t2);
+ tcg_gen_and_tl(t1, t1, R);
+
+ tcg_gen_not_tl(t2, R); /* t2 = Rd & Rr & ~R */
+ tcg_gen_and_tl(t2, t2, Rd);
+ tcg_gen_and_tl(t2, t2, Rr);
+
+ tcg_gen_or_tl(t1, t1, t2); /* t1 = Rd & Rr & ~R | ~Rd & ~Rr & R */
While this is indeed the formula in the manual, a better expression is
(R ^ Rd) & ~(Rd ^ Rr)
which is 3 operations instead of 5. As alluded above, it might be best to keep
Vf in bit 7 instead of bit 0.
Also note that if you use bit 4 to compute Hf, as above, then one can share a
sub-expression with the computation of Vf.
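A sketch of the 3-op version, still shifting down to bit 0 as you do now
(untested):

    tcg_gen_xor_tl(t1, R, Rd);
    tcg_gen_xor_tl(t2, Rd, Rr);
    tcg_gen_andc_tl(t1, t1, t2);       /* (R ^ Rd) & ~(Rd ^ Rr) */
    tcg_gen_shri_tl(cpu_Vf, t1, 7);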
+void gen_sub_CHf(TCGv R, TCGv Rd, TCGv Rr)
+{
+ TCGv t1 = tcg_temp_new_i32();
+ TCGv t2 = tcg_temp_new_i32();
+ TCGv t3 = tcg_temp_new_i32();
+
+ /* Cf & Hf */
+ tcg_gen_not_tl(t1, Rd); /* t1 = ~Rd */
+ tcg_gen_and_tl(t2, t1, Rr); /* t2 = ~Rd & Rr */
+ tcg_gen_or_tl(t3, t1, Rr); /* t3 = (~Rd | Rr) & R */
+ tcg_gen_and_tl(t3, t3, R);
+ tcg_gen_or_tl(t2, t2, t3); /* t2 = ~Rd & Rr | ~Rd & R | R & Rr */
+ tcg_gen_shri_tl(cpu_Cf, t2, 7); /* Cf = t2(7) */
+ tcg_gen_shri_tl(cpu_Hf, t2, 3); /* Hf = t2(3) */
+ tcg_gen_andi_tl(cpu_Hf, cpu_Hf, 1);
Note that carry and borrow are related, and thus this is *also* computable via
((Rd ^ Rr) ^ R) on bit 4.
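I.e. the same 2-op sketch as for the add:

    tcg_gen_xor_tl(t1, Rd, Rr);
    tcg_gen_xor_tl(t1, t1, R);     /* borrow out of bit 3 in bit 4 */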
+void gen_sub_Vf(TCGv R, TCGv Rd, TCGv Rr)
+{
+ TCGv t1 = tcg_temp_new_i32();
+ TCGv t2 = tcg_temp_new_i32();
+
+ /* Vf */
+ tcg_gen_and_tl(t1, Rr, R); /* t1 = Rd & ~Rr & ~R */
+ tcg_gen_not_tl(t1, t1);
+ tcg_gen_and_tl(t1, t1, Rd);
+ tcg_gen_not_tl(t2, Rd); /* t2 = ~Rd & Rr & R */
+ tcg_gen_and_tl(t2, t2, Rr);
+ tcg_gen_and_tl(t2, t2, R);
+ tcg_gen_or_tl(t1, t1, t2); /* t1 = Rd & ~Rr & ~R | ~Rd & Rr & R */
+ tcg_gen_shri_tl(cpu_Vf, t1, 7); /* Vf = t1(7) */
This is equal to
(R ^ Rd) & (Rd ^ Rr)
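which, untested, would be:

    tcg_gen_xor_tl(t1, R, Rd);
    tcg_gen_xor_tl(t2, Rd, Rr);
    tcg_gen_and_tl(t1, t1, t2);
    tcg_gen_shri_tl(cpu_Vf, t1, 7);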
+void gen_ZNSf(TCGv R)
+{
+ tcg_gen_mov_tl(cpu_Zf, R); /* Zf = R */
+ tcg_gen_shri_tl(cpu_Nf, R, 7); /* Nf = R(7) */
+ tcg_gen_and_tl(cpu_Sf, cpu_Nf, cpu_Vf);/* Sf = Nf & Vf */
This should be xor.
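I.e.:

    tcg_gen_xor_tl(cpu_Sf, cpu_Nf, cpu_Vf);   /* Sf = Nf ^ Vf */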
+void gen_push_ret(CPUAVRState *env, int ret)
+{
+    tcg_gen_qemu_st8(tcg_const_local_i32((ret & 0x0000ff)), cpu_sp, DATA_INDEX);
You don't need a local constant, and you should be freeing these 3 temps.
I'll also note that the piece-wise store is big-endian, so you could perform
this in 1 store for 2_BYTE and 2 stores for 3_BYTE.
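E.g. for the 2_BYTE case, something like this (untested, and modulo however
you adjust SP, which isn't visible in this hunk):

    TCGv t0 = tcg_const_i32(ret & 0xffff);
    tcg_gen_qemu_st_tl(t0, cpu_sp, DATA_INDEX, MO_BEUW);  /* one 16-bit BE store */
    tcg_temp_free_i32(t0);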
+void gen_jmp_ez()
+{
+ tcg_gen_mov_tl(cpu_pc, cpu_eind);
+ tcg_gen_shli_tl(cpu_pc, cpu_pc, 8);
+ tcg_gen_or_tl(cpu_pc, cpu_pc, cpu_r[31]);
+ tcg_gen_shli_tl(cpu_pc, cpu_pc, 8);
+ tcg_gen_or_tl(cpu_pc, cpu_pc, cpu_r[30]);
+ tcg_gen_andi_tl(cpu_pc, cpu_pc, 0xffffff);
+ tcg_gen_exit_tb(0);
+}
+
+void gen_jmp_z()
+{
+ tcg_gen_mov_tl(cpu_pc, cpu_r[31]);
+ tcg_gen_shli_tl(cpu_pc, cpu_pc, 8);
+ tcg_gen_or_tl(cpu_pc, cpu_pc, cpu_r[30]);
+ tcg_gen_andi_tl(cpu_pc, cpu_pc, 0xffff);
Note this is
tcg_gen_deposit_tl(cpu_pc, cpu_r[30], cpu_r[31], 8, 8);
Also consider storing EIND (and perhaps already shifted) so that you can just
OR it in.
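With that, gen_jmp_ez could be something like (untested):

    tcg_gen_deposit_tl(cpu_pc, cpu_r[30], cpu_r[31], 8, 8);
    tcg_gen_deposit_tl(cpu_pc, cpu_pc, cpu_eind, 16, 8);
    tcg_gen_exit_tb(0);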
+void gen_set_addr(TCGv addr, TCGv H, TCGv M, TCGv L)
+{
+ tcg_gen_andi_tl(L, addr, 0xff);
+
+ tcg_gen_shri_tl(addr, addr, 8);
+ tcg_gen_andi_tl(M, addr, 0xff);
+
+ tcg_gen_shri_tl(addr, addr, 8);
+ tcg_gen_andi_tl(H, addr, 0xff);
+}
+
+void gen_set_xaddr(TCGv addr)
+{
+ gen_set_addr(addr, cpu_rampX, cpu_r[27], cpu_r[26]);
+}
+
+void gen_set_yaddr(TCGv addr)
+{
+ gen_set_addr(addr, cpu_rampY, cpu_r[29], cpu_r[28]);
+}
+
+void gen_set_zaddr(TCGv addr)
+{
+ gen_set_addr(addr, cpu_rampZ, cpu_r[31], cpu_r[30]);
+}
+
+TCGv gen_get_addr(TCGv H, TCGv M, TCGv L)
+{
+ TCGv addr= tcg_temp_new_i32();
+
+ tcg_gen_mov_tl(addr, H); /* addr = H:M:L */
+ tcg_gen_shli_tl(addr, addr, 8);
+ tcg_gen_or_tl(addr, addr, M);
+ tcg_gen_shli_tl(addr, addr, 8);
+ tcg_gen_or_tl(addr, addr, L);
+
+ return addr;
+}
+
+TCGv gen_get_xaddr()
+{
+ return gen_get_addr(cpu_rampX, cpu_r[27], cpu_r[26]);
+}
+
+TCGv gen_get_yaddr()
+{
+ return gen_get_addr(cpu_rampY, cpu_r[29], cpu_r[28]);
+}
+
+TCGv gen_get_zaddr()
+{
+ return gen_get_addr(cpu_rampZ, cpu_r[31], cpu_r[30]);
+}
Wow. Um... Surely it would be better to store X and Y internally as whole
24-bit quantities, and Z as a 16-bit quantity (to be extended with rampz,
rampd, or eind as needed).
This would allow normal loads, stores, and autoincrement to operate
efficiently. When you see byte accesses to R[26-31], you extract/deposit the
bytes as required. I assume that happens (significantly) less often than
operating on X/Y/Z as a unit...
One might make the same argument for R[24-25] as it pertains to adiw et al.
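E.g. with a hypothetical whole-register cpu_X, a byte write to r27 is one
deposit, and a byte read one shift+mask (untested):

    tcg_gen_deposit_tl(cpu_X, cpu_X, t0, 8, 8);   /* r27 = t0 */

    tcg_gen_shri_tl(t0, cpu_X, 8);                /* t0 = r27 */
    tcg_gen_andi_tl(t0, t0, 0xff);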
+int sex(int Imm, unsigned bits)
+{
+ Imm <<= 32 - bits;
+ Imm >>= 32 - bits;
+
+ return Imm;
+}
This is sextract32(imm, 0, bits).
+int avr_translate_ADIW(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+ if (avr_feature(env, AVR_FEATURE_ADIW_SBIW) == false) {
+ gen_helper_unsupported(cpu_env);
+
+ return BS_EXCP;
+ }
+
+ TCGv RdL = cpu_r[24 + 2 * ADIW_Rd(opcode)];
+ TCGv RdH = cpu_r[25 + 2 * ADIW_Rd(opcode)];
+ int Imm = (ADIW_hImm(opcode) << 4) | (ADIW_lImm(opcode));
+ TCGv R = tcg_temp_new_i32();
+ TCGv Rd = tcg_temp_new_i32();
+ TCGv t0 = tcg_temp_new_i32();
+
+ /* op */
+ tcg_gen_shli_tl(Rd, RdH, 8); /* R = ((H << 8) | L) + Imm */
+ tcg_gen_or_tl(Rd, RdL, Rd);
+ tcg_gen_addi_tl(R, Rd, Imm);
+ tcg_gen_andi_tl(R, R, 0xffff);/* make it 16 bits */
+
+ /* Cf */
+ tcg_gen_not_tl(t0, R); /* t0 = Rd & ~R */
+ tcg_gen_and_tl(t0, Rd, t0);
+ tcg_gen_shri_tl(cpu_Cf, t0, 15); /* Cf = t0(15) */
Or simply bit 16.
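I.e., before the 0xffff mask:

    tcg_gen_shri_tl(cpu_Cf, R, 16);    /* Cf = carry out of bit 15 */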
+ /* Sf */
+ tcg_gen_and_tl(cpu_Sf, cpu_Nf, cpu_Vf);/* Sf = Nf & Vf */
xor.
+int avr_translate_ASR(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+ TCGv Rd = cpu_r[ASR_Rd(opcode)];
+ TCGv t1 = tcg_temp_new_i32();
+ TCGv t2 = tcg_temp_new_i32();
+
+ /* op */
+ tcg_gen_andi_tl(t1, Rd, 0x80); /* t1 = (Rd & 0x80) | (Rd >> 1) */
+ tcg_gen_shri_tl(t2, Rd, 1);
+ tcg_gen_or_tl(t1, t1, t2);
+
+ /* Cf */
+ tcg_gen_andi_tl(cpu_Cf, Rd, 1); /* Cf = Rd(0) */
+
+ /* Vf */
+ tcg_gen_and_tl(cpu_Vf, cpu_Nf, cpu_Cf);/* Vf = Nf & Cf */
xor.
+/*
+ Copies the T Flag in the SREG (Status Register) to bit b in register Rd.
+*/
+int avr_translate_BLD(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+ TCGv Rd = cpu_r[BLD_Rd(opcode)];
+ TCGv t1 = tcg_temp_new_i32();
+
+ tcg_gen_shli_tl(t1, cpu_Tf, BLD_Bit(opcode));
+ tcg_gen_or_tl(Rd, Rd, t1);
+ tcg_gen_and_tl(Rd, Rd, t1);
Err... extra and? I'm pretty sure this insn simply copies T into bit B, and
doesn't clear all of the others.
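Copying T into bit b without disturbing the others would be something like
(untested):

    tcg_gen_andi_tl(Rd, Rd, ~(1 << BLD_Bit(opcode)));
    tcg_gen_shli_tl(t1, cpu_Tf, BLD_Bit(opcode));
    tcg_gen_or_tl(Rd, Rd, t1);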
+/*
+  Clears the specified bits in register Rd. Performs the logical AND between
+  the contents of register Rd and the complement of the constant mask K. The
+  result will be placed in register Rd.
+*/
+int avr_translate_COM(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+ TCGv Rd = cpu_r[COM_Rd(opcode)];
+ TCGv R = tcg_temp_new_i32();
+
+ tcg_gen_movi_tl(R, 0xff);
+ tcg_gen_sub_tl(Rd, R, Rd);
Better as xor with 0xff.
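I.e.:

    tcg_gen_xori_tl(Rd, Rd, 0xff);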
+/*
+  This instruction performs a compare between two registers Rd and Rr, and
+  skips the next instruction if Rd = Rr.
+*/
+int avr_translate_CPSE(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+ TCGv Rd = cpu_r[CPSE_Rd(opcode)];
+ TCGv Rr = cpu_r[(CPSE_hRr(opcode) << 4) | (CPSE_lRr(opcode))];
+ TCGLabel *skip= gen_new_label();
+
+    tcg_gen_movi_tl(cpu_pc, ctx->inst[1].npc); /* PC if next inst is skipped */
+    tcg_gen_brcond_i32(TCG_COND_EQ, Rd, Rr, skip);
+    tcg_gen_movi_tl(cpu_pc, ctx->inst[0].npc); /* PC if next inst is not skipped */
+ gen_set_label(skip);
Any reason you're not using goto_tb?
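The usual pattern, assuming the TranslationBlock is reachable as ctx->tb,
would look something like (untested, and modulo the goto_tb page-crossing
checks):

    TCGLabel *skip = gen_new_label();

    tcg_gen_brcond_i32(TCG_COND_EQ, Rd, Rr, skip);
    tcg_gen_goto_tb(0);
    tcg_gen_movi_tl(cpu_pc, ctx->inst[0].npc);
    tcg_gen_exit_tb((uintptr_t)ctx->tb + 0);
    gen_set_label(skip);
    tcg_gen_goto_tb(1);
    tcg_gen_movi_tl(cpu_pc, ctx->inst[1].npc);
    tcg_gen_exit_tb((uintptr_t)ctx->tb + 1);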
+/*
+  Subtracts one -1- from the contents of register Rd and places the result in
+  the destination register Rd.
+  The C Flag in SREG is not affected by the operation, thus allowing the DEC
+  instruction to be used on a loop counter in multiple-precision computations.
+  When operating on unsigned values, only BREQ and BRNE branches can be
+  expected to perform consistently.
+  When operating on two’s complement values, all signed branches are available.
+*/
+int avr_translate_DEC(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+ TCGv Rd = cpu_r[DEC_Rd(opcode)];
+
+ tcg_gen_subi_tl(Rd, Rd, 1); /* Rd = Rd - 1 */
+ tcg_gen_andi_tl(Rd, Rd, 0xff); /* make it 8 bits */
+
+    tcg_gen_setcondi_tl(TCG_COND_EQ, cpu_Vf, Rd, 0x7f); /* cpu_Vf = Rd == 0x7f */
This is INC overflow.
+int avr_translate_INC(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+ TCGv Rd = cpu_r[INC_Rd(opcode)];
+
+ tcg_gen_addi_tl(Rd, Rd, 1);
+ tcg_gen_andi_tl(Rd, Rd, 0xff);
+
+    tcg_gen_setcondi_tl(TCG_COND_EQ, cpu_Vf, Rd, 0x80); /* cpu_Vf = Rd == 0x80 */
This is DEC overflow.
+/*
+  Load one byte indirect from data space to register and stores and clears
+  the bits in data space specified by the register. The instruction can only
+  be used towards internal SRAM.
+  The data location is pointed to by the Z (16 bits) Pointer Register in the
+  Register File. Memory access is limited to the current data segment of
+  64KB. To access another data segment in devices with more than 64KB data
+  space, the RAMPZ register in the I/O area has to be changed.
+  The Z-pointer Register is left unchanged by the operation. This instruction
+  is especially suited for clearing status bits stored in SRAM.
+*/
+int avr_translate_LAC(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+ if (avr_feature(env, AVR_FEATURE_RMW) == false) {
+ gen_helper_unsupported(cpu_env);
+
+ return BS_EXCP;
+ }
+
+ TCGv Rr = cpu_r[LAC_Rr(opcode)];
+ TCGv addr= gen_get_zaddr();
+ TCGv t0 = tcg_temp_new_i32();
+ TCGv t1 = tcg_temp_new_i32();
+
+    tcg_gen_qemu_ld8u(t0, addr, DATA_INDEX); /* t0 = mem[addr] */
+    tcg_gen_movi_tl(t1, 0xff); /* t1 = t0 & (0xff - Rr) */
+ tcg_gen_sub_tl(t1, t1, Rr);
+ tcg_gen_and_tl(t1, t0, t1);
These last 3 are
tcg_gen_andc_tl(t1, t0, Rr);
+/*
+ This instruction performs 8-bit x 8-bit -> 16-bit signed multiplication.
+*/
+int avr_translate_MULS(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+ if (avr_feature(env, AVR_FEATURE_MUL) == false) {
+ gen_helper_unsupported(cpu_env);
+
+ return BS_EXCP;
+ }
+
+ TCGv R0 = cpu_r[0];
+ TCGv R1 = cpu_r[1];
+ TCGv Rd = cpu_r[16 + MULS_Rd(opcode)];
+ TCGv Rr = cpu_r[16 + MULS_Rr(opcode)];
+ TCGv R = tcg_temp_new_i32();
+
+ tcg_gen_ext8s_tl(Rd, Rd); /* make Rd full 32 bit signed */
+ tcg_gen_ext8s_tl(Rr, Rr); /* make Rr full 32 bit signed */
You need to either zero-extend these again afterward, or you need to perform
the extensions into a temporary.
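I.e. something like (untested):

    TCGv t0 = tcg_temp_new_i32();
    TCGv t1 = tcg_temp_new_i32();

    tcg_gen_ext8s_tl(t0, Rd);      /* make Rd full 32 bit signed */
    tcg_gen_ext8s_tl(t1, Rr);      /* make Rr full 32 bit signed */
    tcg_gen_mul_tl(R, t0, t1);

    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);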
+/*
+  Replaces the contents of register Rd with its two’s complement; the value
+  $80 is left unchanged.
+*/
+int avr_translate_NEG(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+ TCGv Rd = cpu_r[NEG_Rd(opcode)];
+ TCGv R = tcg_temp_new_i32();
+
+ tcg_gen_neg_tl(R, Rd);
+
+ tcg_gen_setcondi_tl(TCG_COND_NE, cpu_Cf, R, 0x00); /* Cf = R != 0x00 */
+ tcg_gen_setcondi_tl(TCG_COND_EQ, cpu_Vf, R, 0x80); /* Vf = R == 0x80 */
Correct, but surely it's better to re-use the normal subtraction
infrastructure. In particular, setcond is at minimum two operations. The tcg
optimizer should do approximately as well folding away the zero from the subtract.
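I.e. compute 0 - Rd and feed the existing flag helpers (untested):

    TCGv zero = tcg_const_i32(0);

    tcg_gen_sub_tl(R, zero, Rd);
    tcg_gen_andi_tl(R, R, 0xff);
    gen_sub_CHf(R, zero, Rd);
    gen_sub_Vf(R, zero, Rd);
    gen_ZNSf(R);
    tcg_temp_free_i32(zero);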
+    tcg_gen_not_tl(cpu_Hf, Rd); /* Hf = (~Rd | R)(3) */
+    tcg_gen_or_tl(cpu_Hf, cpu_Hf, R);
+    tcg_gen_shri_tl(cpu_Hf, cpu_Hf, 3);
+    tcg_gen_andi_tl(cpu_Hf, cpu_Hf, 1);
+ gen_ZNSf(R);
+
+ tcg_temp_free_i32(R);
You failed to write back the result.
+int avr_translate_XCH(CPUAVRState *env, DisasContext *ctx, uint32_t opcode)
+{
+ if (avr_feature(env, AVR_FEATURE_RMW) == false) {
+ gen_helper_unsupported(cpu_env);
+
+ return BS_EXCP;
+ }
+
+ TCGv Rd = cpu_r[XCH_Rd(opcode)];
+ TCGv t0 = tcg_temp_new_i32();
+ TCGv addr = gen_get_zaddr();
+
+    tcg_gen_qemu_ld8u(addr, t0, DATA_INDEX);
+    tcg_gen_qemu_st8(addr, Rd, DATA_INDEX);
The address and data temporaries are swapped.
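I.e.:

    tcg_gen_qemu_ld8u(t0, addr, DATA_INDEX);
    tcg_gen_qemu_st8(Rd, addr, DATA_INDEX);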
r~