date:20230503

[PATCH v4 06/54] tcg/i386: Introduce prepare_host_addr

2023-05-03 Thread Richard Henderson

Merge tcg_out_tlb_load, add_qemu_ldst_label,
tcg_out_test_alignment, and some code that lived in both
tcg_out_qemu_ld and tcg_out_qemu_st into one function
that returns HostAddress and TCGLabelQemuLdst structures.

Signed-off-by: Richard Henderson 
---
 tcg/i386/tcg-target.c.inc | 344 --
 1 file changed, 143 insertions(+), 201 deletions(-)

diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index aae698121a..237b154194 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -1802,135 +1802,6 @@ static void * const qemu_st_helpers[(MO_SIZE | 
MO_BSWAP) + 1] = {
 [MO_BEUQ] = helper_be_stq_mmu,
 };
 
-/* Perform the TLB load and compare.
-
-   Inputs:
-   ADDRLO and ADDRHI contain the low and high part of the address.
-
-   MEM_INDEX and S_BITS are the memory context and log2 size of the load.
-
-   WHICH is the offset into the CPUTLBEntry structure of the slot to read.
-   This should be offsetof addr_read or addr_write.
-
-   Outputs:
-   LABEL_PTRS is filled with 1 (32-bit addresses) or 2 (64-bit addresses)
-   positions of the displacements of forward jumps to the TLB miss case.
-
-   Second argument register is loaded with the low part of the address.
-   In the TLB hit case, it has been adjusted as indicated by the TLB
-   and so is a host address.  In the TLB miss case, it continues to
-   hold a guest address.
-
-   First argument register is clobbered.  */
-
-static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg 
addrhi,
-int mem_index, MemOp opc,
-tcg_insn_unit **label_ptr, int which)
-{
-TCGType ttype = TCG_TYPE_I32;
-TCGType tlbtype = TCG_TYPE_I32;
-int trexw = 0, hrexw = 0, tlbrexw = 0;
-unsigned a_bits = get_alignment_bits(opc);
-unsigned s_bits = opc & MO_SIZE;
-unsigned a_mask = (1 << a_bits) - 1;
-unsigned s_mask = (1 << s_bits) - 1;
-target_ulong tlb_mask;
-
-if (TCG_TARGET_REG_BITS == 64) {
-if (TARGET_LONG_BITS == 64) {
-ttype = TCG_TYPE_I64;
-trexw = P_REXW;
-}
-if (TCG_TYPE_PTR == TCG_TYPE_I64) {
-hrexw = P_REXW;
-if (TARGET_PAGE_BITS + CPU_TLB_DYN_MAX_BITS > 32) {
-tlbtype = TCG_TYPE_I64;
-tlbrexw = P_REXW;
-}
-}
-}
-
-tcg_out_mov(s, tlbtype, TCG_REG_L0, addrlo);
-tcg_out_shifti(s, SHIFT_SHR + tlbrexw, TCG_REG_L0,
-   TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
-
-tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0,
- TLB_MASK_TABLE_OFS(mem_index) +
- offsetof(CPUTLBDescFast, mask));
-
-tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0,
- TLB_MASK_TABLE_OFS(mem_index) +
- offsetof(CPUTLBDescFast, table));
-
-/* If the required alignment is at least as large as the access, simply
-   copy the address and mask.  For lesser alignments, check that we don't
-   cross pages for the complete access.  */
-if (a_bits >= s_bits) {
-tcg_out_mov(s, ttype, TCG_REG_L1, addrlo);
-} else {
-tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1,
- addrlo, s_mask - a_mask);
-}
-tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask;
-tgen_arithi(s, ARITH_AND + trexw, TCG_REG_L1, tlb_mask, 0);
-
-/* cmp 0(TCG_REG_L0), TCG_REG_L1 */
-tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw,
- TCG_REG_L1, TCG_REG_L0, which);
-
-/* Prepare for both the fast path add of the tlb addend, and the slow
-   path function argument setup.  */
-tcg_out_mov(s, ttype, TCG_REG_L1, addrlo);
-
-/* jne slow_path */
-tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
-label_ptr[0] = s->code_ptr;
-s->code_ptr += 4;
-
-if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
-/* cmp 4(TCG_REG_L0), addrhi */
-tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, TCG_REG_L0, which + 4);
-
-/* jne slow_path */
-tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
-label_ptr[1] = s->code_ptr;
-s->code_ptr += 4;
-}
-
-/* TLB Hit.  */
-
-/* add addend(TCG_REG_L0), TCG_REG_L1 */
-tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L1, TCG_REG_L0,
- offsetof(CPUTLBEntry, addend));
-}
-
-/*
- * Record the context of a call to the out of line helper code for the slow 
path
- * for a load or store, so that we can later generate the correct helper code
- */
-static void add_qemu_ldst_label(TCGContext *s, bool is_ld,
-TCGType type, MemOpIdx oi,
-TCGReg datalo, TCGReg datahi,
-TCGReg addrlo, TCGReg addrhi,
-tcg_insn_unit *raddr,
-

[PATCH v4 10/54] tcg/aarch64: Introduce prepare_host_addr

2023-05-03 Thread Richard Henderson

Merge tcg_out_tlb_load, add_qemu_ldst_label, tcg_out_test_alignment,
and some code that lived in both tcg_out_qemu_ld and tcg_out_qemu_st
into one function that returns HostAddress and TCGLabelQemuLdst structures.

Signed-off-by: Richard Henderson 
---
 tcg/aarch64/tcg-target.c.inc | 313 +++
 1 file changed, 133 insertions(+), 180 deletions(-)

diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
index d8d464e4a0..202b90c001 100644
--- a/tcg/aarch64/tcg-target.c.inc
+++ b/tcg/aarch64/tcg-target.c.inc
@@ -1667,113 +1667,7 @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, 
TCGLabelQemuLdst *lb)
 tcg_out_goto(s, lb->raddr);
 return true;
 }
-
-static void add_qemu_ldst_label(TCGContext *s, bool is_ld, MemOpIdx oi,
-TCGType ext, TCGReg data_reg, TCGReg addr_reg,
-tcg_insn_unit *raddr, tcg_insn_unit *label_ptr)
-{
-TCGLabelQemuLdst *label = new_ldst_label(s);
-
-label->is_ld = is_ld;
-label->oi = oi;
-label->type = ext;
-label->datalo_reg = data_reg;
-label->addrlo_reg = addr_reg;
-label->raddr = tcg_splitwx_to_rx(raddr);
-label->label_ptr[0] = label_ptr;
-}
-
-/* We expect to use a 7-bit scaled negative offset from ENV.  */
-QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) > 0);
-QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) < -512);
-
-/* These offsets are built into the LDP below.  */
-QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, mask) != 0);
-QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, table) != 8);
-
-/* Load and compare a TLB entry, emitting the conditional jump to the
-   slow path for the failure case, which will be patched later when finalizing
-   the slow path. Generated code returns the host addend in X1,
-   clobbers X0,X2,X3,TMP. */
-static void tcg_out_tlb_read(TCGContext *s, TCGReg addr_reg, MemOp opc,
- tcg_insn_unit **label_ptr, int mem_index,
- bool is_read)
-{
-unsigned a_bits = get_alignment_bits(opc);
-unsigned s_bits = opc & MO_SIZE;
-unsigned a_mask = (1u << a_bits) - 1;
-unsigned s_mask = (1u << s_bits) - 1;
-TCGReg x3;
-TCGType mask_type;
-uint64_t compare_mask;
-
-mask_type = (TARGET_PAGE_BITS + CPU_TLB_DYN_MAX_BITS > 32
- ? TCG_TYPE_I64 : TCG_TYPE_I32);
-
-/* Load env_tlb(env)->f[mmu_idx].{mask,table} into {x0,x1}.  */
-tcg_out_insn(s, 3314, LDP, TCG_REG_X0, TCG_REG_X1, TCG_AREG0,
- TLB_MASK_TABLE_OFS(mem_index), 1, 0);
-
-/* Extract the TLB index from the address into X0.  */
-tcg_out_insn(s, 3502S, AND_LSR, mask_type == TCG_TYPE_I64,
- TCG_REG_X0, TCG_REG_X0, addr_reg,
- TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
-
-/* Add the tlb_table pointer, creating the CPUTLBEntry address into X1.  */
-tcg_out_insn(s, 3502, ADD, 1, TCG_REG_X1, TCG_REG_X1, TCG_REG_X0);
-
-/* Load the tlb comparator into X0, and the fast path addend into X1.  */
-tcg_out_ld(s, TCG_TYPE_TL, TCG_REG_X0, TCG_REG_X1, is_read
-   ? offsetof(CPUTLBEntry, addr_read)
-   : offsetof(CPUTLBEntry, addr_write));
-tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_X1, TCG_REG_X1,
-   offsetof(CPUTLBEntry, addend));
-
-/* For aligned accesses, we check the first byte and include the alignment
-   bits within the address.  For unaligned access, we check that we don't
-   cross pages using the address of the last byte of the access.  */
-if (a_bits >= s_bits) {
-x3 = addr_reg;
-} else {
-tcg_out_insn(s, 3401, ADDI, TARGET_LONG_BITS == 64,
- TCG_REG_X3, addr_reg, s_mask - a_mask);
-x3 = TCG_REG_X3;
-}
-compare_mask = (uint64_t)TARGET_PAGE_MASK | a_mask;
-
-/* Store the page mask part of the address into X3.  */
-tcg_out_logicali(s, I3404_ANDI, TARGET_LONG_BITS == 64,
- TCG_REG_X3, x3, compare_mask);
-
-/* Perform the address comparison. */
-tcg_out_cmp(s, TARGET_LONG_BITS == 64, TCG_REG_X0, TCG_REG_X3, 0);
-
-/* If not equal, we jump to the slow path. */
-*label_ptr = s->code_ptr;
-tcg_out_insn(s, 3202, B_C, TCG_COND_NE, 0);
-}
-
 #else
-static void tcg_out_test_alignment(TCGContext *s, bool is_ld, TCGReg addr_reg,
-   unsigned a_bits)
-{
-unsigned a_mask = (1 << a_bits) - 1;
-TCGLabelQemuLdst *label = new_ldst_label(s);
-
-label->is_ld = is_ld;
-label->addrlo_reg = addr_reg;
-
-/* tst addr, #mask */
-tcg_out_logicali(s, I3404_ANDSI, 0, TCG_REG_XZR, addr_reg, a_mask);
-
-label->label_ptr[0] = s->code_ptr;
-
-/* b.ne slow_path */
-tcg_out_insn(s, 3202, B_C, TCG_COND_NE, 0);
-
-label->raddr = tcg_splitwx_to_rx(s->code_ptr);
-}
-
 static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l)
 {
 if (!reloc_pc19(l->label_ptr[0], tcg_splitwx_to_rx(s->code_ptr))) {
@@ -1801,6 +1695,125 @@

[PATCH v4 30/54] tcg: Move TCGLabelQemuLdst to tcg.c

2023-05-03 Thread Richard Henderson

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 tcg/tcg.c  | 13 +
 tcg/tcg-ldst.c.inc | 14 --
 2 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/tcg/tcg.c b/tcg/tcg.c
index cfd3262a4a..6f5daaee5f 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -94,6 +94,19 @@ typedef struct QEMU_PACKED {
 DebugFrameFDEHeader fde;
 } DebugFrameHeader;
 
+typedef struct TCGLabelQemuLdst {
+bool is_ld; /* qemu_ld: true, qemu_st: false */
+MemOpIdx oi;
+TCGType type;   /* result type of a load */
+TCGReg addrlo_reg;  /* reg index for low word of guest virtual addr */
+TCGReg addrhi_reg;  /* reg index for high word of guest virtual addr */
+TCGReg datalo_reg;  /* reg index for low word to be loaded or stored */
+TCGReg datahi_reg;  /* reg index for high word to be loaded or stored 
*/
+const tcg_insn_unit *raddr;   /* addr of the next IR of qemu_ld/st IR */
+tcg_insn_unit *label_ptr[2]; /* label pointers to be updated */
+QSIMPLEQ_ENTRY(TCGLabelQemuLdst) next;
+} TCGLabelQemuLdst;
+
 static void tcg_register_jit_int(const void *buf, size_t size,
  const void *debug_frame,
  size_t debug_frame_size)
diff --git a/tcg/tcg-ldst.c.inc b/tcg/tcg-ldst.c.inc
index 403cbb0f06..ffada04af0 100644
--- a/tcg/tcg-ldst.c.inc
+++ b/tcg/tcg-ldst.c.inc
@@ -20,20 +20,6 @@
  * THE SOFTWARE.
  */
 
-typedef struct TCGLabelQemuLdst {
-bool is_ld; /* qemu_ld: true, qemu_st: false */
-MemOpIdx oi;
-TCGType type;   /* result type of a load */
-TCGReg addrlo_reg;  /* reg index for low word of guest virtual addr */
-TCGReg addrhi_reg;  /* reg index for high word of guest virtual addr */
-TCGReg datalo_reg;  /* reg index for low word to be loaded or stored */
-TCGReg datahi_reg;  /* reg index for high word to be loaded or stored 
*/
-const tcg_insn_unit *raddr;   /* addr of the next IR of qemu_ld/st IR */
-tcg_insn_unit *label_ptr[2]; /* label pointers to be updated */
-QSIMPLEQ_ENTRY(TCGLabelQemuLdst) next;
-} TCGLabelQemuLdst;
-
-
 /*
  * Generate TB finalization at the end of block
  */
-- 
2.34.1

[PATCH v4 15/54] tcg/loongarch64: Introduce HostAddress

2023-05-03 Thread Richard Henderson

Collect the 2 parts of the host address into a struct.
Reorg tcg_out_qemu_{ld,st}_indexed to use it.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 tcg/loongarch64/tcg-target.c.inc | 55 +---
 1 file changed, 30 insertions(+), 25 deletions(-)

diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
index 2e3c67054b..6a87a5e5a3 100644
--- a/tcg/loongarch64/tcg-target.c.inc
+++ b/tcg/loongarch64/tcg-target.c.inc
@@ -1013,36 +1013,41 @@ static TCGReg tcg_out_zext_addr_if_32_bit(TCGContext *s,
 return addr;
 }
 
-static void tcg_out_qemu_ld_indexed(TCGContext *s, TCGReg rd, TCGReg rj,
-   TCGReg rk, MemOp opc, TCGType type)
+typedef struct {
+TCGReg base;
+TCGReg index;
+} HostAddress;
+
+static void tcg_out_qemu_ld_indexed(TCGContext *s, MemOp opc, TCGType type,
+TCGReg rd, HostAddress h)
 {
 /* Byte swapping is left to middle-end expansion.  */
 tcg_debug_assert((opc & MO_BSWAP) == 0);
 
 switch (opc & MO_SSIZE) {
 case MO_UB:
-tcg_out_opc_ldx_bu(s, rd, rj, rk);
+tcg_out_opc_ldx_bu(s, rd, h.base, h.index);
 break;
 case MO_SB:
-tcg_out_opc_ldx_b(s, rd, rj, rk);
+tcg_out_opc_ldx_b(s, rd, h.base, h.index);
 break;
 case MO_UW:
-tcg_out_opc_ldx_hu(s, rd, rj, rk);
+tcg_out_opc_ldx_hu(s, rd, h.base, h.index);
 break;
 case MO_SW:
-tcg_out_opc_ldx_h(s, rd, rj, rk);
+tcg_out_opc_ldx_h(s, rd, h.base, h.index);
 break;
 case MO_UL:
 if (type == TCG_TYPE_I64) {
-tcg_out_opc_ldx_wu(s, rd, rj, rk);
+tcg_out_opc_ldx_wu(s, rd, h.base, h.index);
 break;
 }
 /* fallthrough */
 case MO_SL:
-tcg_out_opc_ldx_w(s, rd, rj, rk);
+tcg_out_opc_ldx_w(s, rd, h.base, h.index);
 break;
 case MO_UQ:
-tcg_out_opc_ldx_d(s, rd, rj, rk);
+tcg_out_opc_ldx_d(s, rd, h.base, h.index);
 break;
 default:
 g_assert_not_reached();
@@ -1053,23 +1058,23 @@ static void tcg_out_qemu_ld(TCGContext *s, TCGReg 
data_reg, TCGReg addr_reg,
 MemOpIdx oi, TCGType data_type)
 {
 MemOp opc = get_memop(oi);
-TCGReg base, index;
+HostAddress h;
 
 #ifdef CONFIG_SOFTMMU
 tcg_insn_unit *label_ptr[1];
 
 tcg_out_tlb_load(s, addr_reg, oi, label_ptr, 1);
-index = TCG_REG_TMP2;
+h.index = TCG_REG_TMP2;
 #else
 unsigned a_bits = get_alignment_bits(opc);
 if (a_bits) {
 tcg_out_test_alignment(s, true, addr_reg, a_bits);
 }
-index = USE_GUEST_BASE ? TCG_GUEST_BASE_REG : TCG_REG_ZERO;
+h.index = USE_GUEST_BASE ? TCG_GUEST_BASE_REG : TCG_REG_ZERO;
 #endif
 
-base = tcg_out_zext_addr_if_32_bit(s, addr_reg, TCG_REG_TMP0);
-tcg_out_qemu_ld_indexed(s, data_reg, base, index, opc, data_type);
+h.base = tcg_out_zext_addr_if_32_bit(s, addr_reg, TCG_REG_TMP0);
+tcg_out_qemu_ld_indexed(s, opc, data_type, data_reg, h);
 
 #ifdef CONFIG_SOFTMMU
 add_qemu_ldst_label(s, true, oi, data_type, data_reg, addr_reg,
@@ -1077,24 +1082,24 @@ static void tcg_out_qemu_ld(TCGContext *s, TCGReg 
data_reg, TCGReg addr_reg,
 #endif
 }
 
-static void tcg_out_qemu_st_indexed(TCGContext *s, TCGReg data,
-   TCGReg rj, TCGReg rk, MemOp opc)
+static void tcg_out_qemu_st_indexed(TCGContext *s, MemOp opc,
+TCGReg rd, HostAddress h)
 {
 /* Byte swapping is left to middle-end expansion.  */
 tcg_debug_assert((opc & MO_BSWAP) == 0);
 
 switch (opc & MO_SIZE) {
 case MO_8:
-tcg_out_opc_stx_b(s, data, rj, rk);
+tcg_out_opc_stx_b(s, rd, h.base, h.index);
 break;
 case MO_16:
-tcg_out_opc_stx_h(s, data, rj, rk);
+tcg_out_opc_stx_h(s, rd, h.base, h.index);
 break;
 case MO_32:
-tcg_out_opc_stx_w(s, data, rj, rk);
+tcg_out_opc_stx_w(s, rd, h.base, h.index);
 break;
 case MO_64:
-tcg_out_opc_stx_d(s, data, rj, rk);
+tcg_out_opc_stx_d(s, rd, h.base, h.index);
 break;
 default:
 g_assert_not_reached();
@@ -1105,23 +1110,23 @@ static void tcg_out_qemu_st(TCGContext *s, TCGReg 
data_reg, TCGReg addr_reg,
 MemOpIdx oi, TCGType data_type)
 {
 MemOp opc = get_memop(oi);
-TCGReg base, index;
+HostAddress h;
 
 #ifdef CONFIG_SOFTMMU
 tcg_insn_unit *label_ptr[1];
 
 tcg_out_tlb_load(s, addr_reg, oi, label_ptr, 0);
-index = TCG_REG_TMP2;
+h.index = TCG_REG_TMP2;
 #else
 unsigned a_bits = get_alignment_bits(opc);
 if (a_bits) {
 tcg_out_test_alignment(s, false, addr_reg, a_bits);
 }
-index = USE_GUEST_BASE ? TCG_GUEST_BASE_REG : TCG_REG_ZERO;
+h.index = USE_GUEST_BASE ? TCG_GUEST_BASE_REG : TCG_REG_ZERO;
 #endif
 
-base = tcg_out_ze

[PATCH v4 36/54] tcg/i386: Convert tcg_out_qemu_st_slow_path

2023-05-03 Thread Richard Henderson

Use tcg_out_st_helper_args.  This eliminates the use of a tail call to
the store helper.  This may or may not be an improvement, depending on
the call/return branch prediction of the host microarchitecture.

Signed-off-by: Richard Henderson 
---
 tcg/i386/tcg-target.c.inc | 57 +++
 1 file changed, 4 insertions(+), 53 deletions(-)

diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index 17ad3c5963..7dbfcbd20f 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -1854,11 +1854,8 @@ static bool tcg_out_qemu_ld_slow_path(TCGContext *s, 
TCGLabelQemuLdst *l)
  */
 static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
 {
-MemOpIdx oi = l->oi;
-MemOp opc = get_memop(oi);
-MemOp s_bits = opc & MO_SIZE;
+MemOp opc = get_memop(l->oi);
 tcg_insn_unit **label_ptr = &l->label_ptr[0];
-TCGReg retaddr;
 
 /* resolve label address */
 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
@@ -1866,56 +1863,10 @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, 
TCGLabelQemuLdst *l)
 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
 }
 
-if (TCG_TARGET_REG_BITS == 32) {
-int ofs = 0;
+tcg_out_st_helper_args(s, l, &ldst_helper_param);
+tcg_out_branch(s, 1, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
 
-tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
-ofs += 4;
-
-tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
-ofs += 4;
-
-if (TARGET_LONG_BITS == 64) {
-tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
-ofs += 4;
-}
-
-tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs);
-ofs += 4;
-
-if (s_bits == MO_64) {
-tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs);
-ofs += 4;
-}
-
-tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
-ofs += 4;
-
-retaddr = TCG_REG_EAX;
-tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
-tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, ofs);
-} else {
-tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
-tcg_out_mov(s, TCG_TYPE_TL, tcg_target_call_iarg_regs[1],
-l->addrlo_reg);
-tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
-tcg_target_call_iarg_regs[2], l->datalo_reg);
-tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi);
-
-if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
-retaddr = tcg_target_call_iarg_regs[4];
-tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
-} else {
-retaddr = TCG_REG_RAX;
-tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
-tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP,
-   TCG_TARGET_CALL_STACK_OFFSET);
-}
-}
-
-/* "Tail call" to the helper, with the return address back inline.  */
-tcg_out_push(s, retaddr);
-tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
+tcg_out_jmp(s, l->raddr);
 return true;
 }
 #else
-- 
2.34.1

[PATCH v4 37/54] tcg/aarch64: Convert tcg_out_qemu_{ld,st}_slow_path

2023-05-03 Thread Richard Henderson

Use tcg_out_ld_helper_args, tcg_out_ld_helper_ret,
and tcg_out_st_helper_args.

Signed-off-by: Richard Henderson 
---
 tcg/aarch64/tcg-target.c.inc | 40 +++-
 1 file changed, 16 insertions(+), 24 deletions(-)

diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
index 202b90c001..62dd22d73c 100644
--- a/tcg/aarch64/tcg-target.c.inc
+++ b/tcg/aarch64/tcg-target.c.inc
@@ -1580,13 +1580,6 @@ static void tcg_out_cltz(TCGContext *s, TCGType ext, 
TCGReg d,
 }
 }
 
-static void tcg_out_adr(TCGContext *s, TCGReg rd, const void *target)
-{
-ptrdiff_t offset = tcg_pcrel_diff(s, target);
-tcg_debug_assert(offset == sextract64(offset, 0, 21));
-tcg_out_insn(s, 3406, ADR, rd, offset);
-}
-
 typedef struct {
 TCGReg base;
 TCGReg index;
@@ -1627,47 +1620,46 @@ static void * const qemu_st_helpers[MO_SIZE + 1] = {
 #endif
 };
 
+static const TCGLdstHelperParam ldst_helper_param = {
+.ntmp = 1, .tmp = { TCG_REG_TMP }
+};
+
 static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
 {
-MemOpIdx oi = lb->oi;
-MemOp opc = get_memop(oi);
+MemOp opc = get_memop(lb->oi);
 
 if (!reloc_pc19(lb->label_ptr[0], tcg_splitwx_to_rx(s->code_ptr))) {
 return false;
 }
 
-tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_X0, TCG_AREG0);
-tcg_out_mov(s, TARGET_LONG_BITS == 64, TCG_REG_X1, lb->addrlo_reg);
-tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_X2, oi);
-tcg_out_adr(s, TCG_REG_X3, lb->raddr);
+tcg_out_ld_helper_args(s, lb, &ldst_helper_param);
 tcg_out_call_int(s, qemu_ld_helpers[opc & MO_SIZE]);
-
-tcg_out_movext(s, lb->type, lb->datalo_reg,
-   TCG_TYPE_REG, opc & MO_SSIZE, TCG_REG_X0);
+tcg_out_ld_helper_ret(s, lb, false, &ldst_helper_param);
 tcg_out_goto(s, lb->raddr);
 return true;
 }
 
 static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
 {
-MemOpIdx oi = lb->oi;
-MemOp opc = get_memop(oi);
-MemOp size = opc & MO_SIZE;
+MemOp opc = get_memop(lb->oi);
 
 if (!reloc_pc19(lb->label_ptr[0], tcg_splitwx_to_rx(s->code_ptr))) {
 return false;
 }
 
-tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_X0, TCG_AREG0);
-tcg_out_mov(s, TARGET_LONG_BITS == 64, TCG_REG_X1, lb->addrlo_reg);
-tcg_out_mov(s, size == MO_64, TCG_REG_X2, lb->datalo_reg);
-tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_X3, oi);
-tcg_out_adr(s, TCG_REG_X4, lb->raddr);
+tcg_out_st_helper_args(s, lb, &ldst_helper_param);
 tcg_out_call_int(s, qemu_st_helpers[opc & MO_SIZE]);
 tcg_out_goto(s, lb->raddr);
 return true;
 }
 #else
+static void tcg_out_adr(TCGContext *s, TCGReg rd, const void *target)
+{
+ptrdiff_t offset = tcg_pcrel_diff(s, target);
+tcg_debug_assert(offset == sextract64(offset, 0, 21));
+tcg_out_insn(s, 3406, ADR, rd, offset);
+}
+
 static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l)
 {
 if (!reloc_pc19(l->label_ptr[0], tcg_splitwx_to_rx(s->code_ptr))) {
-- 
2.34.1

[PATCH v4 39/54] tcg/loongarch64: Convert tcg_out_qemu_{ld, st}_slow_path

2023-05-03 Thread Richard Henderson

Use tcg_out_ld_helper_args, tcg_out_ld_helper_ret,
and tcg_out_st_helper_args.

Signed-off-by: Richard Henderson 
---
 tcg/loongarch64/tcg-target.c.inc | 37 ++--
 1 file changed, 11 insertions(+), 26 deletions(-)

diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
index 2f2c34b930..60d2c904dd 100644
--- a/tcg/loongarch64/tcg-target.c.inc
+++ b/tcg/loongarch64/tcg-target.c.inc
@@ -824,51 +824,36 @@ static bool tcg_out_goto(TCGContext *s, const 
tcg_insn_unit *target)
 return reloc_br_sd10k16(s->code_ptr - 1, target);
 }
 
+static const TCGLdstHelperParam ldst_helper_param = {
+.ntmp = 1, .tmp = { TCG_REG_TMP0 }
+};
+
 static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
 {
-MemOpIdx oi = l->oi;
-MemOp opc = get_memop(oi);
-MemOp size = opc & MO_SIZE;
+MemOp opc = get_memop(l->oi);
 
 /* resolve label address */
 if (!reloc_br_sk16(l->label_ptr[0], tcg_splitwx_to_rx(s->code_ptr))) {
 return false;
 }
 
-/* call load helper */
-tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_A0, TCG_AREG0);
-tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_A1, l->addrlo_reg);
-tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_A2, oi);
-tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_A3, (tcg_target_long)l->raddr);
-
-tcg_out_call_int(s, qemu_ld_helpers[size], false);
-
-tcg_out_movext(s, l->type, l->datalo_reg,
-   TCG_TYPE_REG, opc & MO_SSIZE, TCG_REG_A0);
+tcg_out_ld_helper_args(s, l, &ldst_helper_param);
+tcg_out_call_int(s, qemu_ld_helpers[opc & MO_SIZE], false);
+tcg_out_ld_helper_ret(s, l, false, &ldst_helper_param);
 return tcg_out_goto(s, l->raddr);
 }
 
 static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
 {
-MemOpIdx oi = l->oi;
-MemOp opc = get_memop(oi);
-MemOp size = opc & MO_SIZE;
+MemOp opc = get_memop(l->oi);
 
 /* resolve label address */
 if (!reloc_br_sk16(l->label_ptr[0], tcg_splitwx_to_rx(s->code_ptr))) {
 return false;
 }
 
-/* call store helper */
-tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_A0, TCG_AREG0);
-tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_A1, l->addrlo_reg);
-tcg_out_movext(s, size == MO_64 ? TCG_TYPE_I32 : TCG_TYPE_I32, TCG_REG_A2,
-   l->type, size, l->datalo_reg);
-tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_A3, oi);
-tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_A4, (tcg_target_long)l->raddr);
-
-tcg_out_call_int(s, qemu_st_helpers[size], false);
-
+tcg_out_st_helper_args(s, l, &ldst_helper_param);
+tcg_out_call_int(s, qemu_st_helpers[opc & MO_SIZE], false);
 return tcg_out_goto(s, l->raddr);
 }
 #else
-- 
2.34.1

[PATCH v4 12/54] tcg/arm: Introduce HostAddress

2023-05-03 Thread Richard Henderson

Collect the parts of the host address, and condition, into a struct.
Merge tcg_out_qemu_*_{index,direct} and use it.

Signed-off-by: Richard Henderson 
---
 tcg/arm/tcg-target.c.inc | 248 ++-
 1 file changed, 115 insertions(+), 133 deletions(-)

diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
index 6ce52b9612..b6b4ffc546 100644
--- a/tcg/arm/tcg-target.c.inc
+++ b/tcg/arm/tcg-target.c.inc
@@ -1337,6 +1337,13 @@ static void tcg_out_vldst(TCGContext *s, ARMInsn insn,
 tcg_out32(s, insn | (rn << 16) | encode_vd(rd) | 0xf);
 }
 
+typedef struct {
+ARMCond cond;
+TCGReg base;
+int index;
+bool index_scratch;
+} HostAddress;
+
 #ifdef CONFIG_SOFTMMU
 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
  * int mmu_idx, uintptr_t ra)
@@ -1696,29 +1703,49 @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, 
TCGLabelQemuLdst *l)
 }
 #endif /* SOFTMMU */
 
-static void tcg_out_qemu_ld_index(TCGContext *s, MemOp opc,
-  TCGReg datalo, TCGReg datahi,
-  TCGReg addrlo, TCGReg addend,
-  bool scratch_addend)
+static void tcg_out_qemu_ld_direct(TCGContext *s, MemOp opc, TCGReg datalo,
+   TCGReg datahi, HostAddress h)
 {
+TCGReg base;
+
 /* Byte swapping is left to middle-end expansion. */
 tcg_debug_assert((opc & MO_BSWAP) == 0);
 
 switch (opc & MO_SSIZE) {
 case MO_UB:
-tcg_out_ld8_r(s, COND_AL, datalo, addrlo, addend);
+if (h.index < 0) {
+tcg_out_ld8_12(s, h.cond, datalo, h.base, 0);
+} else {
+tcg_out_ld8_r(s, h.cond, datalo, h.base, h.index);
+}
 break;
 case MO_SB:
-tcg_out_ld8s_r(s, COND_AL, datalo, addrlo, addend);
+if (h.index < 0) {
+tcg_out_ld8s_8(s, h.cond, datalo, h.base, 0);
+} else {
+tcg_out_ld8s_r(s, h.cond, datalo, h.base, h.index);
+}
 break;
 case MO_UW:
-tcg_out_ld16u_r(s, COND_AL, datalo, addrlo, addend);
+if (h.index < 0) {
+tcg_out_ld16u_8(s, h.cond, datalo, h.base, 0);
+} else {
+tcg_out_ld16u_r(s, h.cond, datalo, h.base, h.index);
+}
 break;
 case MO_SW:
-tcg_out_ld16s_r(s, COND_AL, datalo, addrlo, addend);
+if (h.index < 0) {
+tcg_out_ld16s_8(s, h.cond, datalo, h.base, 0);
+} else {
+tcg_out_ld16s_r(s, h.cond, datalo, h.base, h.index);
+}
 break;
 case MO_UL:
-tcg_out_ld32_r(s, COND_AL, datalo, addrlo, addend);
+if (h.index < 0) {
+tcg_out_ld32_12(s, h.cond, datalo, h.base, 0);
+} else {
+tcg_out_ld32_r(s, h.cond, datalo, h.base, h.index);
+}
 break;
 case MO_UQ:
 /* We used pair allocation for datalo, so already should be aligned. */
@@ -1726,87 +1753,59 @@ static void tcg_out_qemu_ld_index(TCGContext *s, MemOp 
opc,
 tcg_debug_assert(datahi == datalo + 1);
 /* LDRD requires alignment; double-check that. */
 if (get_alignment_bits(opc) >= MO_64) {
+if (h.index < 0) {
+tcg_out_ldrd_8(s, h.cond, datalo, h.base, 0);
+break;
+}
 /*
  * Rm (the second address op) must not overlap Rt or Rt + 1.
  * Since datalo is aligned, we can simplify the test via alignment.
  * Flip the two address arguments if that works.
  */
-if ((addend & ~1) != datalo) {
-tcg_out_ldrd_r(s, COND_AL, datalo, addrlo, addend);
+if ((h.index & ~1) != datalo) {
+tcg_out_ldrd_r(s, h.cond, datalo, h.base, h.index);
 break;
 }
-if ((addrlo & ~1) != datalo) {
-tcg_out_ldrd_r(s, COND_AL, datalo, addend, addrlo);
+if ((h.base & ~1) != datalo) {
+tcg_out_ldrd_r(s, h.cond, datalo, h.index, h.base);
 break;
 }
 }
-if (scratch_addend) {
-tcg_out_ld32_rwb(s, COND_AL, datalo, addend, addrlo);
-tcg_out_ld32_12(s, COND_AL, datahi, addend, 4);
+if (h.index < 0) {
+base = h.base;
+if (datalo == h.base) {
+tcg_out_mov_reg(s, h.cond, TCG_REG_TMP, base);
+base = TCG_REG_TMP;
+}
+} else if (h.index_scratch) {
+tcg_out_ld32_rwb(s, h.cond, datalo, h.index, h.base);
+tcg_out_ld32_12(s, h.cond, datahi, h.index, 4);
+break;
 } else {
-tcg_out_dat_reg(s, COND_AL, ARITH_ADD, TCG_REG_TMP,
-addend, addrlo, SHIFT_IMM_LSL(0));
-tcg_out_ld32_12(s, COND_AL, datalo, TCG_REG_TMP, 0);
-tcg_out_ld32_12(s, COND_AL, da

[PATCH v4 08/54] tcg/aarch64: Rationalize args to tcg_out_qemu_{ld, st}

2023-05-03 Thread Richard Henderson

Rename the 'ext' parameter to 'data_type' to make the use clearer;
pass it to tcg_out_qemu_st as well to even out the interfaces.
Rename the 'otype' local to 'addr_type' to make the use clearer.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 tcg/aarch64/tcg-target.c.inc | 36 +---
 1 file changed, 17 insertions(+), 19 deletions(-)

diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
index 4ec3cf3172..ecbf6564fc 100644
--- a/tcg/aarch64/tcg-target.c.inc
+++ b/tcg/aarch64/tcg-target.c.inc
@@ -1851,22 +1851,21 @@ static void tcg_out_qemu_st_direct(TCGContext *s, MemOp 
memop,
 }
 
 static void tcg_out_qemu_ld(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
-MemOpIdx oi, TCGType ext)
+MemOpIdx oi, TCGType data_type)
 {
 MemOp memop = get_memop(oi);
-const TCGType otype = TARGET_LONG_BITS == 64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
+TCGType addr_type = TARGET_LONG_BITS == 64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
 
 /* Byte swapping is left to middle-end expansion. */
 tcg_debug_assert((memop & MO_BSWAP) == 0);
 
 #ifdef CONFIG_SOFTMMU
-unsigned mem_index = get_mmuidx(oi);
 tcg_insn_unit *label_ptr;
 
-tcg_out_tlb_read(s, addr_reg, memop, &label_ptr, mem_index, 1);
-tcg_out_qemu_ld_direct(s, memop, ext, data_reg,
-   TCG_REG_X1, otype, addr_reg);
-add_qemu_ldst_label(s, true, oi, ext, data_reg, addr_reg,
+tcg_out_tlb_read(s, addr_reg, memop, &label_ptr, get_mmuidx(oi), 1);
+tcg_out_qemu_ld_direct(s, memop, data_type, data_reg,
+   TCG_REG_X1, addr_type, addr_reg);
+add_qemu_ldst_label(s, true, oi, data_type, data_reg, addr_reg,
 s->code_ptr, label_ptr);
 #else /* !CONFIG_SOFTMMU */
 unsigned a_bits = get_alignment_bits(memop);
@@ -1874,33 +1873,32 @@ static void tcg_out_qemu_ld(TCGContext *s, TCGReg 
data_reg, TCGReg addr_reg,
 tcg_out_test_alignment(s, true, addr_reg, a_bits);
 }
 if (USE_GUEST_BASE) {
-tcg_out_qemu_ld_direct(s, memop, ext, data_reg,
-   TCG_REG_GUEST_BASE, otype, addr_reg);
+tcg_out_qemu_ld_direct(s, memop, data_type, data_reg,
+   TCG_REG_GUEST_BASE, addr_type, addr_reg);
 } else {
-tcg_out_qemu_ld_direct(s, memop, ext, data_reg,
+tcg_out_qemu_ld_direct(s, memop, data_type, data_reg,
addr_reg, TCG_TYPE_I64, TCG_REG_XZR);
 }
 #endif /* CONFIG_SOFTMMU */
 }
 
 static void tcg_out_qemu_st(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
-MemOpIdx oi)
+MemOpIdx oi, TCGType data_type)
 {
 MemOp memop = get_memop(oi);
-const TCGType otype = TARGET_LONG_BITS == 64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
+TCGType addr_type = TARGET_LONG_BITS == 64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
 
 /* Byte swapping is left to middle-end expansion. */
 tcg_debug_assert((memop & MO_BSWAP) == 0);
 
 #ifdef CONFIG_SOFTMMU
-unsigned mem_index = get_mmuidx(oi);
 tcg_insn_unit *label_ptr;
 
-tcg_out_tlb_read(s, addr_reg, memop, &label_ptr, mem_index, 0);
+tcg_out_tlb_read(s, addr_reg, memop, &label_ptr, get_mmuidx(oi), 0);
 tcg_out_qemu_st_direct(s, memop, data_reg,
-   TCG_REG_X1, otype, addr_reg);
-add_qemu_ldst_label(s, false, oi, (memop & MO_SIZE)== MO_64,
-data_reg, addr_reg, s->code_ptr, label_ptr);
+   TCG_REG_X1, addr_type, addr_reg);
+add_qemu_ldst_label(s, false, oi, data_type, data_reg, addr_reg,
+s->code_ptr, label_ptr);
 #else /* !CONFIG_SOFTMMU */
 unsigned a_bits = get_alignment_bits(memop);
 if (a_bits) {
@@ -1908,7 +1906,7 @@ static void tcg_out_qemu_st(TCGContext *s, TCGReg 
data_reg, TCGReg addr_reg,
 }
 if (USE_GUEST_BASE) {
 tcg_out_qemu_st_direct(s, memop, data_reg,
-   TCG_REG_GUEST_BASE, otype, addr_reg);
+   TCG_REG_GUEST_BASE, addr_type, addr_reg);
 } else {
 tcg_out_qemu_st_direct(s, memop, data_reg,
addr_reg, TCG_TYPE_I64, TCG_REG_XZR);
@@ -2249,7 +2247,7 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
 break;
 case INDEX_op_qemu_st_i32:
 case INDEX_op_qemu_st_i64:
-tcg_out_qemu_st(s, REG0(0), a1, a2);
+tcg_out_qemu_st(s, REG0(0), a1, a2, ext);
 break;
 
 case INDEX_op_bswap64_i64:
-- 
2.34.1

[PATCH v4 28/54] tcg/sparc64: Drop is_64 test from tcg_out_qemu_ld data return

2023-05-03 Thread Richard Henderson

In tcg_canonicalize_memop, we remove MO_SIGN from MO_32 operations
with TCG_TYPE_I32.  Thus MO_SL is never set for a 32-bit load, and the
is_64 half of the test is redundant.  We already have an identical test
just above which does not include is_64.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 tcg/sparc64/tcg-target.c.inc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tcg/sparc64/tcg-target.c.inc b/tcg/sparc64/tcg-target.c.inc
index 4f477d539c..dbe4bf96b9 100644
--- a/tcg/sparc64/tcg-target.c.inc
+++ b/tcg/sparc64/tcg-target.c.inc
@@ -1220,7 +1220,7 @@ static void tcg_out_qemu_ld(TCGContext *s, TCGReg data, 
TCGReg addr,
 tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_O2, oi);
 
 /* We let the helper sign-extend SB and SW, but leave SL for here.  */
-if (is_64 && (memop & MO_SSIZE) == MO_SL) {
+if ((memop & MO_SSIZE) == MO_SL) {
 tcg_out_ext32s(s, data, TCG_REG_O0);
 } else {
 tcg_out_mov(s, TCG_TYPE_REG, data, TCG_REG_O0);
-- 
2.34.1

[PATCH v4 33/54] tcg: Widen helper_*_st[bw]_mmu val arguments

2023-05-03 Thread Richard Henderson

While the old type was correct in the ideal sense, some ABIs require
the argument to be zero-extended.  Using uint32_t for all such values
is a decent compromise.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 include/tcg/tcg-ldst.h | 10 +++---
 accel/tcg/cputlb.c |  6 +++---
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/include/tcg/tcg-ldst.h b/include/tcg/tcg-ldst.h
index 2ba22bd5fe..684e394b06 100644
--- a/include/tcg/tcg-ldst.h
+++ b/include/tcg/tcg-ldst.h
@@ -55,15 +55,19 @@ tcg_target_ulong helper_be_ldsw_mmu(CPUArchState *env, 
target_ulong addr,
 tcg_target_ulong helper_be_ldsl_mmu(CPUArchState *env, target_ulong addr,
 MemOpIdx oi, uintptr_t retaddr);
 
-void helper_ret_stb_mmu(CPUArchState *env, target_ulong addr, uint8_t val,
+/*
+ * Value extended to at least uint32_t, so that some ABIs do not require
+ * zero-extension from uint8_t or uint16_t.
+ */
+void helper_ret_stb_mmu(CPUArchState *env, target_ulong addr, uint32_t val,
 MemOpIdx oi, uintptr_t retaddr);
-void helper_le_stw_mmu(CPUArchState *env, target_ulong addr, uint16_t val,
+void helper_le_stw_mmu(CPUArchState *env, target_ulong addr, uint32_t val,
MemOpIdx oi, uintptr_t retaddr);
 void helper_le_stl_mmu(CPUArchState *env, target_ulong addr, uint32_t val,
MemOpIdx oi, uintptr_t retaddr);
 void helper_le_stq_mmu(CPUArchState *env, target_ulong addr, uint64_t val,
MemOpIdx oi, uintptr_t retaddr);
-void helper_be_stw_mmu(CPUArchState *env, target_ulong addr, uint16_t val,
+void helper_be_stw_mmu(CPUArchState *env, target_ulong addr, uint32_t val,
MemOpIdx oi, uintptr_t retaddr);
 void helper_be_stl_mmu(CPUArchState *env, target_ulong addr, uint32_t val,
MemOpIdx oi, uintptr_t retaddr);
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index c8bd642d0e..3117886af1 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -2508,7 +2508,7 @@ full_stb_mmu(CPUArchState *env, target_ulong addr, 
uint64_t val,
 store_helper(env, addr, val, oi, retaddr, MO_UB);
 }
 
-void helper_ret_stb_mmu(CPUArchState *env, target_ulong addr, uint8_t val,
+void helper_ret_stb_mmu(CPUArchState *env, target_ulong addr, uint32_t val,
 MemOpIdx oi, uintptr_t retaddr)
 {
 full_stb_mmu(env, addr, val, oi, retaddr);
@@ -2521,7 +2521,7 @@ static void full_le_stw_mmu(CPUArchState *env, 
target_ulong addr, uint64_t val,
 store_helper(env, addr, val, oi, retaddr, MO_LEUW);
 }
 
-void helper_le_stw_mmu(CPUArchState *env, target_ulong addr, uint16_t val,
+void helper_le_stw_mmu(CPUArchState *env, target_ulong addr, uint32_t val,
MemOpIdx oi, uintptr_t retaddr)
 {
 full_le_stw_mmu(env, addr, val, oi, retaddr);
@@ -2534,7 +2534,7 @@ static void full_be_stw_mmu(CPUArchState *env, 
target_ulong addr, uint64_t val,
 store_helper(env, addr, val, oi, retaddr, MO_BEUW);
 }
 
-void helper_be_stw_mmu(CPUArchState *env, target_ulong addr, uint16_t val,
+void helper_be_stw_mmu(CPUArchState *env, target_ulong addr, uint32_t val,
MemOpIdx oi, uintptr_t retaddr)
 {
 full_be_stw_mmu(env, addr, val, oi, retaddr);
-- 
2.34.1

[PATCH v4 54/54] tcg/s390x: Simplify constraints on qemu_ld/st

2023-05-03 Thread Richard Henderson

Adjust the softmmu tlb to use R0+R1, not any of the normally available
registers.  Since we handle overlap between inputs and helper arguments,
we can allow any allocatable reg.

Signed-off-by: Richard Henderson 
---
 tcg/s390x/tcg-target-con-set.h |  2 --
 tcg/s390x/tcg-target-con-str.h |  1 -
 tcg/s390x/tcg-target.c.inc | 36 --
 3 files changed, 12 insertions(+), 27 deletions(-)

diff --git a/tcg/s390x/tcg-target-con-set.h b/tcg/s390x/tcg-target-con-set.h
index 15f1c55103..ecc079bb6d 100644
--- a/tcg/s390x/tcg-target-con-set.h
+++ b/tcg/s390x/tcg-target-con-set.h
@@ -10,12 +10,10 @@
  * tcg-target-con-str.h; the constraint combination is inclusive or.
  */
 C_O0_I1(r)
-C_O0_I2(L, L)
 C_O0_I2(r, r)
 C_O0_I2(r, ri)
 C_O0_I2(r, rA)
 C_O0_I2(v, r)
-C_O1_I1(r, L)
 C_O1_I1(r, r)
 C_O1_I1(v, r)
 C_O1_I1(v, v)
diff --git a/tcg/s390x/tcg-target-con-str.h b/tcg/s390x/tcg-target-con-str.h
index 6fa64a1ed6..25675b449e 100644
--- a/tcg/s390x/tcg-target-con-str.h
+++ b/tcg/s390x/tcg-target-con-str.h
@@ -9,7 +9,6 @@
  * REGS(letter, register_mask)
  */
 REGS('r', ALL_GENERAL_REGS)
-REGS('L', ALL_GENERAL_REGS & ~SOFTMMU_RESERVE_REGS)
 REGS('v', ALL_VECTOR_REGS)
 REGS('o', 0x) /* odd numbered general regs */
 
diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
index dd13326670..aacbaf21d5 100644
--- a/tcg/s390x/tcg-target.c.inc
+++ b/tcg/s390x/tcg-target.c.inc
@@ -44,18 +44,6 @@
 #define ALL_GENERAL_REGS MAKE_64BIT_MASK(0, 16)
 #define ALL_VECTOR_REGS  MAKE_64BIT_MASK(32, 32)
 
-/*
- * For softmmu, we need to avoid conflicts with the first 3
- * argument registers to perform the tlb lookup, and to call
- * the helper function.
- */
-#ifdef CONFIG_SOFTMMU
-#define SOFTMMU_RESERVE_REGS MAKE_64BIT_MASK(TCG_REG_R2, 3)
-#else
-#define SOFTMMU_RESERVE_REGS 0
-#endif
-
-
 /* Several places within the instruction set 0 means "no register"
rather than TCG_REG_R0.  */
 #define TCG_REG_NONE0
@@ -1814,13 +1802,13 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext 
*s, HostAddress *h,
 ldst->oi = oi;
 ldst->addrlo_reg = addr_reg;
 
-tcg_out_sh64(s, RSY_SRLG, TCG_REG_R2, addr_reg, TCG_REG_NONE,
+tcg_out_sh64(s, RSY_SRLG, TCG_TMP0, addr_reg, TCG_REG_NONE,
  TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
 
 QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) > 0);
 QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) < -(1 << 19));
-tcg_out_insn(s, RXY, NG, TCG_REG_R2, TCG_AREG0, TCG_REG_NONE, mask_off);
-tcg_out_insn(s, RXY, AG, TCG_REG_R2, TCG_AREG0, TCG_REG_NONE, table_off);
+tcg_out_insn(s, RXY, NG, TCG_TMP0, TCG_AREG0, TCG_REG_NONE, mask_off);
+tcg_out_insn(s, RXY, AG, TCG_TMP0, TCG_AREG0, TCG_REG_NONE, table_off);
 
 /*
  * For aligned accesses, we check the first byte and include the alignment
@@ -1830,10 +1818,10 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext 
*s, HostAddress *h,
 a_off = (a_bits >= s_bits ? 0 : s_mask - a_mask);
 tlb_mask = (uint64_t)TARGET_PAGE_MASK | a_mask;
 if (a_off == 0) {
-tgen_andi_risbg(s, TCG_REG_R3, addr_reg, tlb_mask);
+tgen_andi_risbg(s, TCG_REG_R0, addr_reg, tlb_mask);
 } else {
-tcg_out_insn(s, RX, LA, TCG_REG_R3, addr_reg, TCG_REG_NONE, a_off);
-tgen_andi(s, TCG_TYPE_TL, TCG_REG_R3, tlb_mask);
+tcg_out_insn(s, RX, LA, TCG_REG_R0, addr_reg, TCG_REG_NONE, a_off);
+tgen_andi(s, TCG_TYPE_TL, TCG_REG_R0, tlb_mask);
 }
 
 if (is_ld) {
@@ -1842,16 +1830,16 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext 
*s, HostAddress *h,
 ofs = offsetof(CPUTLBEntry, addr_write);
 }
 if (TARGET_LONG_BITS == 32) {
-tcg_out_insn(s, RX, C, TCG_REG_R3, TCG_REG_R2, TCG_REG_NONE, ofs);
+tcg_out_insn(s, RX, C, TCG_REG_R0, TCG_TMP0, TCG_REG_NONE, ofs);
 } else {
-tcg_out_insn(s, RXY, CG, TCG_REG_R3, TCG_REG_R2, TCG_REG_NONE, ofs);
+tcg_out_insn(s, RXY, CG, TCG_REG_R0, TCG_TMP0, TCG_REG_NONE, ofs);
 }
 
 tcg_out16(s, RI_BRC | (S390_CC_NE << 4));
 ldst->label_ptr[0] = s->code_ptr++;
 
-h->index = TCG_REG_R2;
-tcg_out_insn(s, RXY, LG, h->index, TCG_REG_R2, TCG_REG_NONE,
+h->index = TCG_TMP0;
+tcg_out_insn(s, RXY, LG, h->index, TCG_TMP0, TCG_REG_NONE,
  offsetof(CPUTLBEntry, addend));
 
 if (TARGET_LONG_BITS == 32) {
@@ -3155,10 +3143,10 @@ static TCGConstraintSetIndex 
tcg_target_op_def(TCGOpcode op)
 
 case INDEX_op_qemu_ld_i32:
 case INDEX_op_qemu_ld_i64:
-return C_O1_I1(r, L);
+return C_O1_I1(r, r);
 case INDEX_op_qemu_st_i64:
 case INDEX_op_qemu_st_i32:
-return C_O0_I2(L, L);
+return C_O0_I2(r, r);
 
 case INDEX_op_deposit_i32:
 case INDEX_op_deposit_i64:
-- 
2.34.1

[PATCH v4 13/54] tcg/arm: Introduce prepare_host_addr

2023-05-03 Thread Richard Henderson

Merge tcg_out_tlb_load, add_qemu_ldst_label, and some code that lived
in both tcg_out_qemu_ld and tcg_out_qemu_st into one function that
returns HostAddress and TCGLabelQemuLdst structures.

Signed-off-by: Richard Henderson 
---
 tcg/arm/tcg-target.c.inc | 351 ++-
 1 file changed, 159 insertions(+), 192 deletions(-)

diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
index b6b4ffc546..c744512778 100644
--- a/tcg/arm/tcg-target.c.inc
+++ b/tcg/arm/tcg-target.c.inc
@@ -1434,125 +1434,6 @@ static TCGReg tcg_out_arg_reg64(TCGContext *s, TCGReg 
argreg,
 }
 }
 
-#define TLB_SHIFT  (CPU_TLB_ENTRY_BITS + CPU_TLB_BITS)
-
-/* We expect to use an 9-bit sign-magnitude negative offset from ENV.  */
-QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) > 0);
-QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) < -256);
-
-/* These offsets are built into the LDRD below.  */
-QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, mask) != 0);
-QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, table) != 4);
-
-/* Load and compare a TLB entry, leaving the flags set.  Returns the register
-   containing the addend of the tlb entry.  Clobbers R0, R1, R2, TMP.  */
-
-static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
-   MemOp opc, int mem_index, bool is_load)
-{
-int cmp_off = (is_load ? offsetof(CPUTLBEntry, addr_read)
-   : offsetof(CPUTLBEntry, addr_write));
-int fast_off = TLB_MASK_TABLE_OFS(mem_index);
-unsigned s_mask = (1 << (opc & MO_SIZE)) - 1;
-unsigned a_mask = (1 << get_alignment_bits(opc)) - 1;
-TCGReg t_addr;
-
-/* Load env_tlb(env)->f[mmu_idx].{mask,table} into {r0,r1}.  */
-tcg_out_ldrd_8(s, COND_AL, TCG_REG_R0, TCG_AREG0, fast_off);
-
-/* Extract the tlb index from the address into R0.  */
-tcg_out_dat_reg(s, COND_AL, ARITH_AND, TCG_REG_R0, TCG_REG_R0, addrlo,
-SHIFT_IMM_LSR(TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS));
-
-/*
- * Add the tlb_table pointer, creating the CPUTLBEntry address in R1.
- * Load the tlb comparator into R2/R3 and the fast path addend into R1.
- */
-if (cmp_off == 0) {
-if (TARGET_LONG_BITS == 64) {
-tcg_out_ldrd_rwb(s, COND_AL, TCG_REG_R2, TCG_REG_R1, TCG_REG_R0);
-} else {
-tcg_out_ld32_rwb(s, COND_AL, TCG_REG_R2, TCG_REG_R1, TCG_REG_R0);
-}
-} else {
-tcg_out_dat_reg(s, COND_AL, ARITH_ADD,
-TCG_REG_R1, TCG_REG_R1, TCG_REG_R0, 0);
-if (TARGET_LONG_BITS == 64) {
-tcg_out_ldrd_8(s, COND_AL, TCG_REG_R2, TCG_REG_R1, cmp_off);
-} else {
-tcg_out_ld32_12(s, COND_AL, TCG_REG_R2, TCG_REG_R1, cmp_off);
-}
-}
-
-/* Load the tlb addend.  */
-tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R1,
-offsetof(CPUTLBEntry, addend));
-
-/*
- * Check alignment, check comparators.
- * Do this in 2-4 insns.  Use MOVW for v7, if possible,
- * to reduce the number of sequential conditional instructions.
- * Almost all guests have at least 4k pages, which means that we need
- * to clear at least 9 bits even for an 8-byte memory, which means it
- * isn't worth checking for an immediate operand for BIC.
- *
- * For unaligned accesses, test the page of the last unit of alignment.
- * This leaves the least significant alignment bits unchanged, and of
- * course must be zero.
- */
-t_addr = addrlo;
-if (a_mask < s_mask) {
-t_addr = TCG_REG_R0;
-tcg_out_dat_imm(s, COND_AL, ARITH_ADD, t_addr,
-addrlo, s_mask - a_mask);
-}
-if (use_armv7_instructions && TARGET_PAGE_BITS <= 16) {
-tcg_out_movi32(s, COND_AL, TCG_REG_TMP, ~(TARGET_PAGE_MASK | a_mask));
-tcg_out_dat_reg(s, COND_AL, ARITH_BIC, TCG_REG_TMP,
-t_addr, TCG_REG_TMP, 0);
-tcg_out_dat_reg(s, COND_AL, ARITH_CMP, 0, TCG_REG_R2, TCG_REG_TMP, 0);
-} else {
-if (a_mask) {
-tcg_debug_assert(a_mask <= 0xff);
-tcg_out_dat_imm(s, COND_AL, ARITH_TST, 0, addrlo, a_mask);
-}
-tcg_out_dat_reg(s, COND_AL, ARITH_MOV, TCG_REG_TMP, 0, t_addr,
-SHIFT_IMM_LSR(TARGET_PAGE_BITS));
-tcg_out_dat_reg(s, (a_mask ? COND_EQ : COND_AL), ARITH_CMP,
-0, TCG_REG_R2, TCG_REG_TMP,
-SHIFT_IMM_LSL(TARGET_PAGE_BITS));
-}
-
-if (TARGET_LONG_BITS == 64) {
-tcg_out_dat_reg(s, COND_EQ, ARITH_CMP, 0, TCG_REG_R3, addrhi, 0);
-}
-
-return TCG_REG_R1;
-}
-
-/* Record the context of a call to the out of line helper code for the slow
-   path for a load or store, so that we can later generate the correct
-   helper code.  */
-static void add_qemu_ldst_label(TCGContext *s, bool is_ld,
-MemOpIdx oi, TCGType type,
-TCGReg datalo, TC

[PATCH v4 49/54] tcg/ppc: Adjust constraints on qemu_ld/st

2023-05-03 Thread Richard Henderson

The softmmu tlb uses TCG_REG_{TMP1,TMP2,R0}, not any of the normally
available registers.  Now that we handle overlap between inputs and
helper arguments, we can allow any allocatable reg.

Reviewed-by: Daniel Henrique Barboza 
Signed-off-by: Richard Henderson 
---
 tcg/ppc/tcg-target-con-set.h | 11 ---
 tcg/ppc/tcg-target-con-str.h |  2 --
 tcg/ppc/tcg-target.c.inc | 32 ++--
 3 files changed, 14 insertions(+), 31 deletions(-)

diff --git a/tcg/ppc/tcg-target-con-set.h b/tcg/ppc/tcg-target-con-set.h
index a1a345883d..f206b29205 100644
--- a/tcg/ppc/tcg-target-con-set.h
+++ b/tcg/ppc/tcg-target-con-set.h
@@ -12,18 +12,15 @@
 C_O0_I1(r)
 C_O0_I2(r, r)
 C_O0_I2(r, ri)
-C_O0_I2(S, S)
 C_O0_I2(v, r)
-C_O0_I3(S, S, S)
+C_O0_I3(r, r, r)
 C_O0_I4(r, r, ri, ri)
-C_O0_I4(S, S, S, S)
-C_O1_I1(r, L)
+C_O0_I4(r, r, r, r)
 C_O1_I1(r, r)
 C_O1_I1(v, r)
 C_O1_I1(v, v)
 C_O1_I1(v, vr)
 C_O1_I2(r, 0, rZ)
-C_O1_I2(r, L, L)
 C_O1_I2(r, rI, ri)
 C_O1_I2(r, rI, rT)
 C_O1_I2(r, r, r)
@@ -36,7 +33,7 @@ C_O1_I2(v, v, v)
 C_O1_I3(v, v, v, v)
 C_O1_I4(r, r, ri, rZ, rZ)
 C_O1_I4(r, r, r, ri, ri)
-C_O2_I1(L, L, L)
-C_O2_I2(L, L, L, L)
+C_O2_I1(r, r, r)
+C_O2_I2(r, r, r, r)
 C_O2_I4(r, r, rI, rZM, r, r)
 C_O2_I4(r, r, r, r, rI, rZM)
diff --git a/tcg/ppc/tcg-target-con-str.h b/tcg/ppc/tcg-target-con-str.h
index 298ca20d5b..f3bf030bc3 100644
--- a/tcg/ppc/tcg-target-con-str.h
+++ b/tcg/ppc/tcg-target-con-str.h
@@ -14,8 +14,6 @@ REGS('A', 1u << TCG_REG_R3)
 REGS('B', 1u << TCG_REG_R4)
 REGS('C', 1u << TCG_REG_R5)
 REGS('D', 1u << TCG_REG_R6)
-REGS('L', ALL_QLOAD_REGS)
-REGS('S', ALL_QSTORE_REGS)
 
 /*
  * Define constraint letters for constants:
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index 6850ecbc80..5a4ec0470a 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -93,18 +93,6 @@
 #define ALL_GENERAL_REGS  0xu
 #define ALL_VECTOR_REGS   0xull
 
-#ifdef CONFIG_SOFTMMU
-#define ALL_QLOAD_REGS \
-(ALL_GENERAL_REGS & \
- ~((1 << TCG_REG_R3) | (1 << TCG_REG_R4) | (1 << TCG_REG_R5)))
-#define ALL_QSTORE_REGS \
-(ALL_GENERAL_REGS & ~((1 << TCG_REG_R3) | (1 << TCG_REG_R4) | \
-  (1 << TCG_REG_R5) | (1 << TCG_REG_R6)))
-#else
-#define ALL_QLOAD_REGS  (ALL_GENERAL_REGS & ~(1 << TCG_REG_R3))
-#define ALL_QSTORE_REGS ALL_QLOAD_REGS
-#endif
-
 TCGPowerISA have_isa;
 static bool have_isel;
 bool have_altivec;
@@ -3752,23 +3740,23 @@ static TCGConstraintSetIndex 
tcg_target_op_def(TCGOpcode op)
 
 case INDEX_op_qemu_ld_i32:
 return (TCG_TARGET_REG_BITS == 64 || TARGET_LONG_BITS == 32
-? C_O1_I1(r, L)
-: C_O1_I2(r, L, L));
+? C_O1_I1(r, r)
+: C_O1_I2(r, r, r));
 
 case INDEX_op_qemu_st_i32:
 return (TCG_TARGET_REG_BITS == 64 || TARGET_LONG_BITS == 32
-? C_O0_I2(S, S)
-: C_O0_I3(S, S, S));
+? C_O0_I2(r, r)
+: C_O0_I3(r, r, r));
 
 case INDEX_op_qemu_ld_i64:
-return (TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L)
-: TARGET_LONG_BITS == 32 ? C_O2_I1(L, L, L)
-: C_O2_I2(L, L, L, L));
+return (TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, r)
+: TARGET_LONG_BITS == 32 ? C_O2_I1(r, r, r)
+: C_O2_I2(r, r, r, r));
 
 case INDEX_op_qemu_st_i64:
-return (TCG_TARGET_REG_BITS == 64 ? C_O0_I2(S, S)
-: TARGET_LONG_BITS == 32 ? C_O0_I3(S, S, S)
-: C_O0_I4(S, S, S, S));
+return (TCG_TARGET_REG_BITS == 64 ? C_O0_I2(r, r)
+: TARGET_LONG_BITS == 32 ? C_O0_I3(r, r, r)
+: C_O0_I4(r, r, r, r));
 
 case INDEX_op_add_vec:
 case INDEX_op_sub_vec:
-- 
2.34.1

[PATCH v4 35/54] tcg/i386: Convert tcg_out_qemu_ld_slow_path

2023-05-03 Thread Richard Henderson

Use tcg_out_ld_helper_args and tcg_out_ld_helper_ret.

Signed-off-by: Richard Henderson 
---
 tcg/i386/tcg-target.c.inc | 71 +++
 1 file changed, 28 insertions(+), 43 deletions(-)

diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index 8752968af2..17ad3c5963 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -1802,13 +1802,37 @@ static void * const qemu_st_helpers[(MO_SIZE | 
MO_BSWAP) + 1] = {
 [MO_BEUQ] = helper_be_stq_mmu,
 };
 
+/*
+ * Because i686 has no register parameters and because x86_64 has xchg
+ * to handle addr/data register overlap, we have placed all input arguments
+ * before we might need a scratch reg.
+ *
+ * Even then, a scratch is only needed for l->raddr.  Rather than expose
+ * a general-purpose scratch when we don't actually know it's available,
+ * use the ra_gen hook to load into RAX if needed.
+ */
+#if TCG_TARGET_REG_BITS == 64
+static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg)
+{
+if (arg < 0) {
+arg = TCG_REG_RAX;
+}
+tcg_out_movi(s, TCG_TYPE_PTR, arg, (uintptr_t)l->raddr);
+return arg;
+}
+static const TCGLdstHelperParam ldst_helper_param = {
+.ra_gen = ldst_ra_gen
+};
+#else
+static const TCGLdstHelperParam ldst_helper_param = { };
+#endif
+
 /*
  * Generate code for the slow path for a load at the end of block
  */
 static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
 {
-MemOpIdx oi = l->oi;
-MemOp opc = get_memop(oi);
+MemOp opc = get_memop(l->oi);
 tcg_insn_unit **label_ptr = &l->label_ptr[0];
 
 /* resolve label address */
@@ -1817,49 +1841,10 @@ static bool tcg_out_qemu_ld_slow_path(TCGContext *s, 
TCGLabelQemuLdst *l)
 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
 }
 
-if (TCG_TARGET_REG_BITS == 32) {
-int ofs = 0;
-
-tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
-ofs += 4;
-
-tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
-ofs += 4;
-
-if (TARGET_LONG_BITS == 64) {
-tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
-ofs += 4;
-}
-
-tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
-ofs += 4;
-
-tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)l->raddr, TCG_REG_ESP, ofs);
-} else {
-tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
-tcg_out_mov(s, TCG_TYPE_TL, tcg_target_call_iarg_regs[1],
-l->addrlo_reg);
-tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi);
-tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
- (uintptr_t)l->raddr);
-}
-
+tcg_out_ld_helper_args(s, l, &ldst_helper_param);
 tcg_out_branch(s, 1, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
+tcg_out_ld_helper_ret(s, l, false, &ldst_helper_param);
 
-if (TCG_TARGET_REG_BITS == 32 && (opc & MO_SIZE) == MO_64) {
-TCGMovExtend ext[2] = {
-{ .dst = l->datalo_reg, .dst_type = TCG_TYPE_I32,
-  .src = TCG_REG_EAX, .src_type = TCG_TYPE_I32, .src_ext = MO_UL },
-{ .dst = l->datahi_reg, .dst_type = TCG_TYPE_I32,
-  .src = TCG_REG_EDX, .src_type = TCG_TYPE_I32, .src_ext = MO_UL },
-};
-tcg_out_movext2(s, &ext[0], &ext[1], -1);
-} else {
-tcg_out_movext(s, l->type, l->datalo_reg,
-   TCG_TYPE_REG, opc & MO_SSIZE, TCG_REG_EAX);
-}
-
-/* Jump to the code corresponding to next IR of qemu_st */
 tcg_out_jmp(s, l->raddr);
 return true;
 }
-- 
2.34.1

[PATCH v4 44/54] tcg/loongarch64: Simplify constraints on qemu_ld/st

2023-05-03 Thread Richard Henderson

The softmmu tlb uses TCG_REG_TMP[0-2], not any of the normally available
registers.  Now that we handle overlap between inputs and helper arguments,
we can allow any allocatable reg.

Signed-off-by: Richard Henderson 
---
 tcg/loongarch64/tcg-target-con-set.h |  2 --
 tcg/loongarch64/tcg-target-con-str.h |  1 -
 tcg/loongarch64/tcg-target.c.inc | 23 ---
 3 files changed, 4 insertions(+), 22 deletions(-)

diff --git a/tcg/loongarch64/tcg-target-con-set.h 
b/tcg/loongarch64/tcg-target-con-set.h
index 172c107289..c2bde44613 100644
--- a/tcg/loongarch64/tcg-target-con-set.h
+++ b/tcg/loongarch64/tcg-target-con-set.h
@@ -17,9 +17,7 @@
 C_O0_I1(r)
 C_O0_I2(rZ, r)
 C_O0_I2(rZ, rZ)
-C_O0_I2(LZ, L)
 C_O1_I1(r, r)
-C_O1_I1(r, L)
 C_O1_I2(r, r, rC)
 C_O1_I2(r, r, ri)
 C_O1_I2(r, r, rI)
diff --git a/tcg/loongarch64/tcg-target-con-str.h 
b/tcg/loongarch64/tcg-target-con-str.h
index 541ff47fa9..6e9ccca3ad 100644
--- a/tcg/loongarch64/tcg-target-con-str.h
+++ b/tcg/loongarch64/tcg-target-con-str.h
@@ -14,7 +14,6 @@
  * REGS(letter, register_mask)
  */
 REGS('r', ALL_GENERAL_REGS)
-REGS('L', ALL_GENERAL_REGS & ~SOFTMMU_RESERVE_REGS)
 
 /*
  * Define constraint letters for constants:
diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
index 60d2c904dd..83fa45c802 100644
--- a/tcg/loongarch64/tcg-target.c.inc
+++ b/tcg/loongarch64/tcg-target.c.inc
@@ -133,18 +133,7 @@ static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind 
kind, int slot)
 #define TCG_CT_CONST_C12   0x1000
 #define TCG_CT_CONST_WSZ   0x2000
 
-#define ALL_GENERAL_REGS  MAKE_64BIT_MASK(0, 32)
-/*
- * For softmmu, we need to avoid conflicts with the first 5
- * argument registers to call the helper.  Some of these are
- * also used for the tlb lookup.
- */
-#ifdef CONFIG_SOFTMMU
-#define SOFTMMU_RESERVE_REGS  MAKE_64BIT_MASK(TCG_REG_A0, 5)
-#else
-#define SOFTMMU_RESERVE_REGS  0
-#endif
-
+#define ALL_GENERAL_REGS   MAKE_64BIT_MASK(0, 32)
 
 static inline tcg_target_long sextreg(tcg_target_long val, int pos, int len)
 {
@@ -1541,16 +1530,14 @@ static TCGConstraintSetIndex 
tcg_target_op_def(TCGOpcode op)
 case INDEX_op_st32_i64:
 case INDEX_op_st_i32:
 case INDEX_op_st_i64:
+case INDEX_op_qemu_st_i32:
+case INDEX_op_qemu_st_i64:
 return C_O0_I2(rZ, r);
 
 case INDEX_op_brcond_i32:
 case INDEX_op_brcond_i64:
 return C_O0_I2(rZ, rZ);
 
-case INDEX_op_qemu_st_i32:
-case INDEX_op_qemu_st_i64:
-return C_O0_I2(LZ, L);
-
 case INDEX_op_ext8s_i32:
 case INDEX_op_ext8s_i64:
 case INDEX_op_ext8u_i32:
@@ -1586,11 +1573,9 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode 
op)
 case INDEX_op_ld32u_i64:
 case INDEX_op_ld_i32:
 case INDEX_op_ld_i64:
-return C_O1_I1(r, r);
-
 case INDEX_op_qemu_ld_i32:
 case INDEX_op_qemu_ld_i64:
-return C_O1_I1(r, L);
+return C_O1_I1(r, r);
 
 case INDEX_op_andc_i32:
 case INDEX_op_andc_i64:
-- 
2.34.1

[PATCH v4 43/54] tcg/s390x: Convert tcg_out_qemu_{ld,st}_slow_path

2023-05-03 Thread Richard Henderson

Use tcg_out_ld_helper_args, tcg_out_ld_helper_ret,
and tcg_out_st_helper_args.

Signed-off-by: Richard Henderson 
---
 tcg/s390x/tcg-target.c.inc | 35 ++-
 1 file changed, 10 insertions(+), 25 deletions(-)

diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
index c3157d22be..dfcf4d9e34 100644
--- a/tcg/s390x/tcg-target.c.inc
+++ b/tcg/s390x/tcg-target.c.inc
@@ -1718,26 +1718,22 @@ static void tcg_out_qemu_st_direct(TCGContext *s, MemOp 
opc, TCGReg data,
 }
 
 #if defined(CONFIG_SOFTMMU)
+static const TCGLdstHelperParam ldst_helper_param = {
+.ntmp = 1, .tmp = { TCG_TMP0 }
+};
+
 static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
 {
-TCGReg addr_reg = lb->addrlo_reg;
-TCGReg data_reg = lb->datalo_reg;
-MemOpIdx oi = lb->oi;
-MemOp opc = get_memop(oi);
+MemOp opc = get_memop(lb->oi);
 
 if (!patch_reloc(lb->label_ptr[0], R_390_PC16DBL,
  (intptr_t)tcg_splitwx_to_rx(s->code_ptr), 2)) {
 return false;
 }
 
-tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_R2, TCG_AREG0);
-if (TARGET_LONG_BITS == 64) {
-tcg_out_mov(s, TCG_TYPE_I64, TCG_REG_R3, addr_reg);
-}
-tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R4, oi);
-tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R5, (uintptr_t)lb->raddr);
-tcg_out_call_int(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SSIZE)]);
-tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_R2);
+tcg_out_ld_helper_args(s, lb, &ldst_helper_param);
+tcg_out_call_int(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
+tcg_out_ld_helper_ret(s, lb, false, &ldst_helper_param);
 
 tgen_gotoi(s, S390_CC_ALWAYS, lb->raddr);
 return true;
@@ -1745,25 +1741,14 @@ static bool tcg_out_qemu_ld_slow_path(TCGContext *s, 
TCGLabelQemuLdst *lb)
 
 static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
 {
-TCGReg addr_reg = lb->addrlo_reg;
-TCGReg data_reg = lb->datalo_reg;
-MemOpIdx oi = lb->oi;
-MemOp opc = get_memop(oi);
-MemOp size = opc & MO_SIZE;
+MemOp opc = get_memop(lb->oi);
 
 if (!patch_reloc(lb->label_ptr[0], R_390_PC16DBL,
  (intptr_t)tcg_splitwx_to_rx(s->code_ptr), 2)) {
 return false;
 }
 
-tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_R2, TCG_AREG0);
-if (TARGET_LONG_BITS == 64) {
-tcg_out_mov(s, TCG_TYPE_I64, TCG_REG_R3, addr_reg);
-}
-tcg_out_movext(s, size == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32,
-   TCG_REG_R4, lb->type, size, data_reg);
-tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_R5, oi);
-tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R6, (uintptr_t)lb->raddr);
+tcg_out_st_helper_args(s, lb, &ldst_helper_param);
 tcg_out_call_int(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
 
 tgen_gotoi(s, S390_CC_ALWAYS, lb->raddr);
-- 
2.34.1

[PATCH v4 04/54] tcg/i386: Drop r0+r1 local variables from tcg_out_tlb_load

2023-05-03 Thread Richard Henderson

Use TCG_REG_L[01] constants directly.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 tcg/i386/tcg-target.c.inc | 32 
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index 909eecd4a3..78160f453b 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -1810,8 +1810,6 @@ static inline void tcg_out_tlb_load(TCGContext *s, TCGReg 
addrlo, TCGReg addrhi,
 int mem_index, MemOp opc,
 tcg_insn_unit **label_ptr, int which)
 {
-const TCGReg r0 = TCG_REG_L0;
-const TCGReg r1 = TCG_REG_L1;
 TCGType ttype = TCG_TYPE_I32;
 TCGType tlbtype = TCG_TYPE_I32;
 int trexw = 0, hrexw = 0, tlbrexw = 0;
@@ -1835,15 +1833,15 @@ static inline void tcg_out_tlb_load(TCGContext *s, 
TCGReg addrlo, TCGReg addrhi,
 }
 }
 
-tcg_out_mov(s, tlbtype, r0, addrlo);
-tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0,
+tcg_out_mov(s, tlbtype, TCG_REG_L0, addrlo);
+tcg_out_shifti(s, SHIFT_SHR + tlbrexw, TCG_REG_L0,
TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
 
-tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, r0, TCG_AREG0,
+tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0,
  TLB_MASK_TABLE_OFS(mem_index) +
  offsetof(CPUTLBDescFast, mask));
 
-tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r0, TCG_AREG0,
+tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0,
  TLB_MASK_TABLE_OFS(mem_index) +
  offsetof(CPUTLBDescFast, table));
 
@@ -1851,19 +1849,21 @@ static inline void tcg_out_tlb_load(TCGContext *s, 
TCGReg addrlo, TCGReg addrhi,
copy the address and mask.  For lesser alignments, check that we don't
cross pages for the complete access.  */
 if (a_bits >= s_bits) {
-tcg_out_mov(s, ttype, r1, addrlo);
+tcg_out_mov(s, ttype, TCG_REG_L1, addrlo);
 } else {
-tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo, s_mask - a_mask);
+tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1,
+ addrlo, s_mask - a_mask);
 }
 tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask;
-tgen_arithi(s, ARITH_AND + trexw, r1, tlb_mask, 0);
+tgen_arithi(s, ARITH_AND + trexw, TCG_REG_L1, tlb_mask, 0);
 
-/* cmp 0(r0), r1 */
-tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, r1, r0, which);
+/* cmp 0(TCG_REG_L0), TCG_REG_L1 */
+tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw,
+ TCG_REG_L1, TCG_REG_L0, which);
 
 /* Prepare for both the fast path add of the tlb addend, and the slow
path function argument setup.  */
-tcg_out_mov(s, ttype, r1, addrlo);
+tcg_out_mov(s, ttype, TCG_REG_L1, addrlo);
 
 /* jne slow_path */
 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
@@ -1871,8 +1871,8 @@ static inline void tcg_out_tlb_load(TCGContext *s, TCGReg 
addrlo, TCGReg addrhi,
 s->code_ptr += 4;
 
 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
-/* cmp 4(r0), addrhi */
-tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, which + 4);
+/* cmp 4(TCG_REG_L0), addrhi */
+tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, TCG_REG_L0, which + 4);
 
 /* jne slow_path */
 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
@@ -1882,8 +1882,8 @@ static inline void tcg_out_tlb_load(TCGContext *s, TCGReg 
addrlo, TCGReg addrhi,
 
 /* TLB Hit.  */
 
-/* add addend(r0), r1 */
-tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r1, r0,
+/* add addend(TCG_REG_L0), TCG_REG_L1 */
+tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L1, TCG_REG_L0,
  offsetof(CPUTLBEntry, addend));
 }
 
-- 
2.34.1

[PATCH v4 23/54] tcg/riscv: Rationalize args to tcg_out_qemu_{ld,st}

2023-05-03 Thread Richard Henderson

Interpret the variable argument placement in the caller.  Pass data_type
instead of is64 -- there are several places where we already convert back
from bool to type.  Clean things up by using type throughout.

Reviewed-by: Philippe Mathieu-Daudé 
Reviewed-by: Daniel Henrique Barboza 
Signed-off-by: Richard Henderson 
---
 tcg/riscv/tcg-target.c.inc | 66 ++
 1 file changed, 24 insertions(+), 42 deletions(-)

diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
index 7a674ff5ce..a4cf60ca75 100644
--- a/tcg/riscv/tcg-target.c.inc
+++ b/tcg/riscv/tcg-target.c.inc
@@ -1087,7 +1087,7 @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, 
TCGLabelQemuLdst *l)
 #endif /* CONFIG_SOFTMMU */
 
 static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg val,
-   TCGReg base, MemOp opc, bool is_64)
+   TCGReg base, MemOp opc, TCGType type)
 {
 /* Byte swapping is left to middle-end expansion. */
 tcg_debug_assert((opc & MO_BSWAP) == 0);
@@ -1106,7 +1106,7 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg 
val,
 tcg_out_opc_imm(s, OPC_LH, val, base, 0);
 break;
 case MO_UL:
-if (is_64) {
+if (type == TCG_TYPE_I64) {
 tcg_out_opc_imm(s, OPC_LWU, val, base, 0);
 break;
 }
@@ -1122,30 +1122,21 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, 
TCGReg val,
 }
 }
 
-static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is_64)
+static void tcg_out_qemu_ld(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
+MemOpIdx oi, TCGType data_type)
 {
-TCGReg addr_reg, data_reg;
-MemOpIdx oi;
-MemOp opc;
-#if defined(CONFIG_SOFTMMU)
-tcg_insn_unit *label_ptr[1];
-#else
-unsigned a_bits;
-#endif
+MemOp opc = get_memop(oi);
 TCGReg base;
 
-data_reg = *args++;
-addr_reg = *args++;
-oi = *args++;
-opc = get_memop(oi);
-
 #if defined(CONFIG_SOFTMMU)
+tcg_insn_unit *label_ptr[1];
+
 base = tcg_out_tlb_load(s, addr_reg, oi, label_ptr, 1);
-tcg_out_qemu_ld_direct(s, data_reg, base, opc, is_64);
-add_qemu_ldst_label(s, 1, oi, (is_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
-data_reg, addr_reg, s->code_ptr, label_ptr);
+tcg_out_qemu_ld_direct(s, data_reg, base, opc, data_type);
+add_qemu_ldst_label(s, true, oi, data_type, data_reg, addr_reg,
+s->code_ptr, label_ptr);
 #else
-a_bits = get_alignment_bits(opc);
+unsigned a_bits = get_alignment_bits(opc);
 if (a_bits) {
 tcg_out_test_alignment(s, true, addr_reg, a_bits);
 }
@@ -1158,7 +1149,7 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg 
*args, bool is_64)
 tcg_out_opc_reg(s, OPC_ADD, TCG_REG_TMP0, TCG_GUEST_BASE_REG, base);
 base = TCG_REG_TMP0;
 }
-tcg_out_qemu_ld_direct(s, data_reg, base, opc, is_64);
+tcg_out_qemu_ld_direct(s, data_reg, base, opc, data_type);
 #endif
 }
 
@@ -1186,30 +1177,21 @@ static void tcg_out_qemu_st_direct(TCGContext *s, 
TCGReg val,
 }
 }
 
-static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64)
+static void tcg_out_qemu_st(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
+MemOpIdx oi, TCGType data_type)
 {
-TCGReg addr_reg, data_reg;
-MemOpIdx oi;
-MemOp opc;
-#if defined(CONFIG_SOFTMMU)
-tcg_insn_unit *label_ptr[1];
-#else
-unsigned a_bits;
-#endif
+MemOp opc = get_memop(oi);
 TCGReg base;
 
-data_reg = *args++;
-addr_reg = *args++;
-oi = *args++;
-opc = get_memop(oi);
-
 #if defined(CONFIG_SOFTMMU)
+tcg_insn_unit *label_ptr[1];
+
 base = tcg_out_tlb_load(s, addr_reg, oi, label_ptr, 0);
 tcg_out_qemu_st_direct(s, data_reg, base, opc);
-add_qemu_ldst_label(s, 0, oi, (is_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
-data_reg, addr_reg, s->code_ptr, label_ptr);
+add_qemu_ldst_label(s, false, oi, data_type, data_reg, addr_reg,
+s->code_ptr, label_ptr);
 #else
-a_bits = get_alignment_bits(opc);
+unsigned a_bits = get_alignment_bits(opc);
 if (a_bits) {
 tcg_out_test_alignment(s, false, addr_reg, a_bits);
 }
@@ -1508,16 +1490,16 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
 break;
 
 case INDEX_op_qemu_ld_i32:
-tcg_out_qemu_ld(s, args, false);
+tcg_out_qemu_ld(s, a0, a1, a2, TCG_TYPE_I32);
 break;
 case INDEX_op_qemu_ld_i64:
-tcg_out_qemu_ld(s, args, true);
+tcg_out_qemu_ld(s, a0, a1, a2, TCG_TYPE_I64);
 break;
 case INDEX_op_qemu_st_i32:
-tcg_out_qemu_st(s, args, false);
+tcg_out_qemu_st(s, a0, a1, a2, TCG_TYPE_I32);
 break;
 case INDEX_op_qemu_st_i64:
-tcg_out_qemu_st(s, args, true);
+tcg_out_qemu_st(s, a0, a1, a2, TCG_TYPE_I64);
 break;
 
 case INDEX_

[PATCH v4 01/54] tcg/i386: Rationalize args to tcg_out_qemu_{ld,st}

2023-05-03 Thread Richard Henderson

Interpret the variable argument placement in the caller.  Pass data_type
instead of is64 -- there are several places where we already convert back
from bool to type.  Clean things up by using type throughout.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 tcg/i386/tcg-target.c.inc | 111 +-
 1 file changed, 50 insertions(+), 61 deletions(-)

diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index caf91a3151..cfa2349b03 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -1884,8 +1884,8 @@ static inline void tcg_out_tlb_load(TCGContext *s, TCGReg 
addrlo, TCGReg addrhi,
  * Record the context of a call to the out of line helper code for the slow path
  * for a load or store, so that we can later generate the correct helper code
  */
-static void add_qemu_ldst_label(TCGContext *s, bool is_ld, bool is_64,
-MemOpIdx oi,
+static void add_qemu_ldst_label(TCGContext *s, bool is_ld,
+TCGType type, MemOpIdx oi,
 TCGReg datalo, TCGReg datahi,
 TCGReg addrlo, TCGReg addrhi,
 tcg_insn_unit *raddr,
@@ -1895,7 +1895,7 @@ static void add_qemu_ldst_label(TCGContext *s, bool 
is_ld, bool is_64,
 
 label->is_ld = is_ld;
 label->oi = oi;
-label->type = is_64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
+label->type = type;
 label->datalo_reg = datalo;
 label->datahi_reg = datahi;
 label->addrlo_reg = addrlo;
@@ -2152,11 +2152,10 @@ static inline int setup_guest_base_seg(void)
 
 static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
TCGReg base, int index, intptr_t ofs,
-   int seg, bool is64, MemOp memop)
+   int seg, TCGType type, MemOp memop)
 {
-TCGType type = is64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
 bool use_movbe = false;
-int rexw = is64 * P_REXW;
+int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
 int movop = OPC_MOVL_GvEv;
 
 /* Do big-endian loads with movbe.  */
@@ -2246,50 +2245,34 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, 
TCGReg datalo, TCGReg datahi,
 }
 }
 
-/* XXX: qemu_ld and qemu_st could be modified to clobber only EDX and
-   EAX. It will be useful once fixed registers globals are less
-   common. */
-static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
+static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi,
+TCGReg addrlo, TCGReg addrhi,
+MemOpIdx oi, TCGType data_type)
 {
-TCGReg datalo, datahi, addrlo;
-TCGReg addrhi __attribute__((unused));
-MemOpIdx oi;
-MemOp opc;
+MemOp opc = get_memop(oi);
+
 #if defined(CONFIG_SOFTMMU)
-int mem_index;
 tcg_insn_unit *label_ptr[2];
-#else
-unsigned a_bits;
-#endif
 
-datalo = *args++;
-datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
-addrlo = *args++;
-addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
-oi = *args++;
-opc = get_memop(oi);
-
-#if defined(CONFIG_SOFTMMU)
-mem_index = get_mmuidx(oi);
-
-tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
+tcg_out_tlb_load(s, addrlo, addrhi, get_mmuidx(oi), opc,
  label_ptr, offsetof(CPUTLBEntry, addr_read));
 
 /* TLB Hit.  */
-tcg_out_qemu_ld_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, is64, opc);
+tcg_out_qemu_ld_direct(s, datalo, datahi, TCG_REG_L1,
+   -1, 0, 0, data_type, opc);
 
 /* Record the current context of a load into ldst label */
-add_qemu_ldst_label(s, true, is64, oi, datalo, datahi, addrlo, addrhi,
-s->code_ptr, label_ptr);
+add_qemu_ldst_label(s, true, data_type, oi, datalo, datahi,
+addrlo, addrhi, s->code_ptr, label_ptr);
 #else
-a_bits = get_alignment_bits(opc);
+unsigned a_bits = get_alignment_bits(opc);
 if (a_bits) {
 tcg_out_test_alignment(s, true, addrlo, addrhi, a_bits);
 }
 
 tcg_out_qemu_ld_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
x86_guest_base_offset, x86_guest_base_seg,
-   is64, opc);
+   data_type, opc);
 #endif
 }
 
@@ -2345,40 +2328,26 @@ static void tcg_out_qemu_st_direct(TCGContext *s, 
TCGReg datalo, TCGReg datahi,
 }
 }
 
-static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
+static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi,
+TCGReg addrlo, TCGReg addrhi,
+MemOpIdx oi, TCGType data_type)
 {
-TCGReg datalo, datahi, addrlo;
-TCGReg addrhi __attribute__((unused));
-MemOpIdx oi;
-MemOp opc;
+MemOp opc = get_memo

[PATCH v4 15/57] accel/tcg: Use have_atomic16 in ldst_atomicity.c.inc

2023-05-03 Thread Richard Henderson

Hosts using Intel and AMD AVX cpus are quite common.
Add fast paths through ldst_atomicity using this.

Only enable with CONFIG_INT128; some older clang versions do not
support __int128_t, and the inline assembly won't work on structures.

Signed-off-by: Richard Henderson 
---
 accel/tcg/ldst_atomicity.c.inc | 76 +++---
 1 file changed, 60 insertions(+), 16 deletions(-)

diff --git a/accel/tcg/ldst_atomicity.c.inc b/accel/tcg/ldst_atomicity.c.inc
index c43f101ebe..07bfa5c3c8 100644
--- a/accel/tcg/ldst_atomicity.c.inc
+++ b/accel/tcg/ldst_atomicity.c.inc
@@ -35,6 +35,14 @@
 
 #if defined(CONFIG_ATOMIC128)
 # define HAVE_al16_fast    true
+#elif defined(CONFIG_TCG_INTERPRETER)
+/*
+ * FIXME: host specific detection for this is in tcg/$host/,
+ * but we're using tcg/tci/ instead.
+ */
+# define HAVE_al16_fast    false
+#elif defined(__x86_64__) && defined(CONFIG_INT128)
+# define HAVE_al16_fast    likely(have_atomic16)
 #else
 # define HAVE_al16_fast    false
 #endif
@@ -162,6 +170,12 @@ load_atomic16(void *pv)
 
 r.u = qatomic_read__nocheck(p);
 return r.s;
+#elif defined(__x86_64__) && defined(CONFIG_INT128)
+Int128Alias r;
+
+/* Via HAVE_al16_fast, have_atomic16 is true. */
+asm("vmovdqa %1, %0" : "=x" (r.u) : "m" (*(Int128 *)pv));
+return r.s;
 #else
 qemu_build_not_reached();
 #endif
@@ -383,6 +397,24 @@ load_atom_extract_al16_or_al8(void *pv, int s)
 r = qatomic_read__nocheck(p16);
 }
 return r >> shr;
+#elif defined(__x86_64__) && defined(CONFIG_INT128)
+uintptr_t pi = (uintptr_t)pv;
+int shr = (pi & 7) * 8;
+uint64_t a, b;
+
+/* Via HAVE_al16_fast, have_atomic16 is true. */
+pv = (void *)(pi & ~7);
+if (pi & 8) {
+uint64_t *p8 = __builtin_assume_aligned(pv, 16, 8);
+a = qatomic_read__nocheck(p8);
+b = qatomic_read__nocheck(p8 + 1);
+} else {
+asm("vmovdqa %2, %0\n\tvpextrq $1, %0, %1"
+: "=x"(a), "=r"(b) : "m" (*(__uint128_t *)pv));
+}
+asm("shrd %b2, %1, %0" : "+r"(a) : "r"(b), "c"(shr));
+
+return a;
 #else
 qemu_build_not_reached();
 #endif
@@ -699,23 +731,35 @@ static inline void ATTRIBUTE_ATOMIC128_OPT
 store_atomic16(void *pv, Int128Alias val)
 {
 #if defined(CONFIG_ATOMIC128)
-__uint128_t *pu = __builtin_assume_aligned(pv, 16);
-qatomic_set__nocheck(pu, val.u);
-#elif defined(CONFIG_CMPXCHG128)
-__uint128_t *pu = __builtin_assume_aligned(pv, 16);
-__uint128_t o;
-
-/*
- * Without CONFIG_ATOMIC128, __atomic_compare_exchange_n will always
- * defer to libatomic, so we must use __sync_val_compare_and_swap_16
- * and accept the sequential consistency that comes with it.
- */
-do {
-o = *pu;
-} while (!__sync_bool_compare_and_swap_16(pu, o, val.u));
-#else
-qemu_build_not_reached();
+{
+__uint128_t *pu = __builtin_assume_aligned(pv, 16);
+qatomic_set__nocheck(pu, val.u);
+return;
+}
 #endif
+#if defined(__x86_64__) && defined(CONFIG_INT128)
+if (HAVE_al16_fast) {
+asm("vmovdqa %1, %0" : "=m"(*(__uint128_t *)pv) : "x" (val.u));
+return;
+}
+#endif
+#if defined(CONFIG_CMPXCHG128)
+{
+__uint128_t *pu = __builtin_assume_aligned(pv, 16);
+__uint128_t o;
+
+/*
+ * Without CONFIG_ATOMIC128, __atomic_compare_exchange_n will always
+ * defer to libatomic, so we must use __sync_val_compare_and_swap_16
+ * and accept the sequential consistency that comes with it.
+ */
+do {
+o = *pu;
+} while (!__sync_bool_compare_and_swap_16(pu, o, val.u));
+return;
+}
+#endif
+qemu_build_not_reached();
 }
 
 /**
-- 
2.34.1

[PATCH v4 20/57] tcg: Introduce TCG_OPF_TYPE_MASK

2023-05-03 Thread Richard Henderson

Reorg TCG_OPF_64BIT and TCG_OPF_VECTOR into a two-bit field so
that we can add TCG_OPF_128BIT without requiring another bit.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 include/tcg/tcg.h| 22 --
 tcg/optimize.c   | 15 ---
 tcg/tcg.c|  4 ++--
 tcg/aarch64/tcg-target.c.inc |  8 +---
 tcg/tci/tcg-target.c.inc |  3 ++-
 5 files changed, 33 insertions(+), 19 deletions(-)

diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
index b19e167e1d..efbd891f87 100644
--- a/include/tcg/tcg.h
+++ b/include/tcg/tcg.h
@@ -932,24 +932,26 @@ typedef struct TCGArgConstraint {
 
 /* Bits for TCGOpDef->flags, 8 bits available, all used.  */
 enum {
+/* Two bits describing the output type. */
+TCG_OPF_TYPE_MASK= 0x03,
+TCG_OPF_32BIT= 0x00,
+TCG_OPF_64BIT= 0x01,
+TCG_OPF_VECTOR   = 0x02,
+TCG_OPF_128BIT   = 0x03,
 /* Instruction exits the translation block.  */
-TCG_OPF_BB_EXIT  = 0x01,
+TCG_OPF_BB_EXIT  = 0x04,
 /* Instruction defines the end of a basic block.  */
-TCG_OPF_BB_END   = 0x02,
+TCG_OPF_BB_END   = 0x08,
 /* Instruction clobbers call registers and potentially update globals.  */
-TCG_OPF_CALL_CLOBBER = 0x04,
+TCG_OPF_CALL_CLOBBER = 0x10,
 /* Instruction has side effects: it cannot be removed if its outputs
are not used, and might trigger exceptions.  */
-TCG_OPF_SIDE_EFFECTS = 0x08,
-/* Instruction operands are 64-bits (otherwise 32-bits).  */
-TCG_OPF_64BIT= 0x10,
+TCG_OPF_SIDE_EFFECTS = 0x20,
 /* Instruction is optional and not implemented by the host, or insn
is generic and should not be implemened by the host.  */
-TCG_OPF_NOT_PRESENT  = 0x20,
-/* Instruction operands are vectors.  */
-TCG_OPF_VECTOR   = 0x40,
+TCG_OPF_NOT_PRESENT  = 0x40,
 /* Instruction is a conditional branch. */
-TCG_OPF_COND_BRANCH  = 0x80
+TCG_OPF_COND_BRANCH  = 0x80,
 };
 
 typedef struct TCGOpDef {
diff --git a/tcg/optimize.c b/tcg/optimize.c
index 9614fa3638..37d46f2a1f 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -2051,12 +2051,21 @@ void tcg_optimize(TCGContext *s)
 copy_propagate(&ctx, op, def->nb_oargs, def->nb_iargs);
 
 /* Pre-compute the type of the operation. */
-if (def->flags & TCG_OPF_VECTOR) {
+switch (def->flags & TCG_OPF_TYPE_MASK) {
+case TCG_OPF_VECTOR:
 ctx.type = TCG_TYPE_V64 + TCGOP_VECL(op);
-} else if (def->flags & TCG_OPF_64BIT) {
+break;
+case TCG_OPF_128BIT:
+ctx.type = TCG_TYPE_I128;
+break;
+case TCG_OPF_64BIT:
 ctx.type = TCG_TYPE_I64;
-} else {
+break;
+case TCG_OPF_32BIT:
 ctx.type = TCG_TYPE_I32;
+break;
+default:
+qemu_build_not_reached();
 }
 
 /* Assume all bits affected, no bits known zero, no sign reps. */
diff --git a/tcg/tcg.c b/tcg/tcg.c
index d0afabf194..cb5ca9b612 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -2294,7 +2294,7 @@ static void tcg_dump_ops(TCGContext *s, FILE *f, bool 
have_prefs)
 nb_iargs = def->nb_iargs;
 nb_cargs = def->nb_cargs;
 
-if (def->flags & TCG_OPF_VECTOR) {
+if ((def->flags & TCG_OPF_TYPE_MASK) == TCG_OPF_VECTOR) {
 col += ne_fprintf(f, "v%d,e%d,", 64 << TCGOP_VECL(op),
   8 << TCGOP_VECE(op));
 }
@@ -4782,7 +4782,7 @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp 
*op)
 tcg_out_extrl_i64_i32(s, new_args[0], new_args[1]);
 break;
 default:
-if (def->flags & TCG_OPF_VECTOR) {
+if ((def->flags & TCG_OPF_TYPE_MASK) == TCG_OPF_VECTOR) {
 tcg_out_vec_op(s, op->opc, TCGOP_VECL(op), TCGOP_VECE(op),
new_args, const_args);
 } else {
diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
index 3adc5fd3a3..43acb4fbcb 100644
--- a/tcg/aarch64/tcg-target.c.inc
+++ b/tcg/aarch64/tcg-target.c.inc
@@ -1921,9 +1921,11 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
const TCGArg args[TCG_MAX_OP_ARGS],
const int const_args[TCG_MAX_OP_ARGS])
 {
-/* 99% of the time, we can signal the use of extension registers
-   by looking to see if the opcode handles 64-bit data.  */
-TCGType ext = (tcg_op_defs[opc].flags & TCG_OPF_64BIT) != 0;
+/*
+ * 99% of the time, we can signal the use of extension registers
+ * by looking to see if the opcode handles 32-bit data or not.
+ */
+TCGType ext = (tcg_op_defs[opc].flags & TCG_OPF_TYPE_MASK) != TCG_OPF_32BIT;
 
 /* Hoist the loads of the most common arguments.  */
 TCGArg a0 = args[0];
diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c

[PATCH v4 09/54] tcg/aarch64: Introduce HostAddress

2023-05-03 Thread Richard Henderson

Collect the 3 potential parts of the host address into a struct.
Reorg tcg_out_qemu_{ld,st}_direct to use it.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 tcg/aarch64/tcg-target.c.inc | 86 +---
 1 file changed, 59 insertions(+), 27 deletions(-)

diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
index ecbf6564fc..d8d464e4a0 100644
--- a/tcg/aarch64/tcg-target.c.inc
+++ b/tcg/aarch64/tcg-target.c.inc
@@ -1587,6 +1587,12 @@ static void tcg_out_adr(TCGContext *s, TCGReg rd, const 
void *target)
 tcg_out_insn(s, 3406, ADR, rd, offset);
 }
 
+typedef struct {
+TCGReg base;
+TCGReg index;
+TCGType index_ext;
+} HostAddress;
+
 #ifdef CONFIG_SOFTMMU
 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
  * MemOpIdx oi, uintptr_t ra)
@@ -1796,32 +1802,31 @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, 
TCGLabelQemuLdst *l)
 #endif /* CONFIG_SOFTMMU */
 
 static void tcg_out_qemu_ld_direct(TCGContext *s, MemOp memop, TCGType ext,
-   TCGReg data_r, TCGReg addr_r,
-   TCGType otype, TCGReg off_r)
+   TCGReg data_r, HostAddress h)
 {
 switch (memop & MO_SSIZE) {
 case MO_UB:
-tcg_out_ldst_r(s, I3312_LDRB, data_r, addr_r, otype, off_r);
+tcg_out_ldst_r(s, I3312_LDRB, data_r, h.base, h.index_ext, h.index);
 break;
 case MO_SB:
 tcg_out_ldst_r(s, ext ? I3312_LDRSBX : I3312_LDRSBW,
-   data_r, addr_r, otype, off_r);
+   data_r, h.base, h.index_ext, h.index);
 break;
 case MO_UW:
-tcg_out_ldst_r(s, I3312_LDRH, data_r, addr_r, otype, off_r);
+tcg_out_ldst_r(s, I3312_LDRH, data_r, h.base, h.index_ext, h.index);
 break;
 case MO_SW:
 tcg_out_ldst_r(s, (ext ? I3312_LDRSHX : I3312_LDRSHW),
-   data_r, addr_r, otype, off_r);
+   data_r, h.base, h.index_ext, h.index);
 break;
 case MO_UL:
-tcg_out_ldst_r(s, I3312_LDRW, data_r, addr_r, otype, off_r);
+tcg_out_ldst_r(s, I3312_LDRW, data_r, h.base, h.index_ext, h.index);
 break;
 case MO_SL:
-tcg_out_ldst_r(s, I3312_LDRSWX, data_r, addr_r, otype, off_r);
+tcg_out_ldst_r(s, I3312_LDRSWX, data_r, h.base, h.index_ext, h.index);
 break;
 case MO_UQ:
-tcg_out_ldst_r(s, I3312_LDRX, data_r, addr_r, otype, off_r);
+tcg_out_ldst_r(s, I3312_LDRX, data_r, h.base, h.index_ext, h.index);
 break;
 default:
 g_assert_not_reached();
@@ -1829,21 +1834,20 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, MemOp 
memop, TCGType ext,
 }
 
 static void tcg_out_qemu_st_direct(TCGContext *s, MemOp memop,
-   TCGReg data_r, TCGReg addr_r,
-   TCGType otype, TCGReg off_r)
+   TCGReg data_r, HostAddress h)
 {
 switch (memop & MO_SIZE) {
 case MO_8:
-tcg_out_ldst_r(s, I3312_STRB, data_r, addr_r, otype, off_r);
+tcg_out_ldst_r(s, I3312_STRB, data_r, h.base, h.index_ext, h.index);
 break;
 case MO_16:
-tcg_out_ldst_r(s, I3312_STRH, data_r, addr_r, otype, off_r);
+tcg_out_ldst_r(s, I3312_STRH, data_r, h.base, h.index_ext, h.index);
 break;
 case MO_32:
-tcg_out_ldst_r(s, I3312_STRW, data_r, addr_r, otype, off_r);
+tcg_out_ldst_r(s, I3312_STRW, data_r, h.base, h.index_ext, h.index);
 break;
 case MO_64:
-tcg_out_ldst_r(s, I3312_STRX, data_r, addr_r, otype, off_r);
+tcg_out_ldst_r(s, I3312_STRX, data_r, h.base, h.index_ext, h.index);
 break;
 default:
 g_assert_not_reached();
@@ -1855,6 +1859,7 @@ static void tcg_out_qemu_ld(TCGContext *s, TCGReg 
data_reg, TCGReg addr_reg,
 {
 MemOp memop = get_memop(oi);
 TCGType addr_type = TARGET_LONG_BITS == 64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
+HostAddress h;
 
 /* Byte swapping is left to middle-end expansion. */
 tcg_debug_assert((memop & MO_BSWAP) == 0);
@@ -1863,8 +1868,14 @@ static void tcg_out_qemu_ld(TCGContext *s, TCGReg 
data_reg, TCGReg addr_reg,
 tcg_insn_unit *label_ptr;
 
 tcg_out_tlb_read(s, addr_reg, memop, &label_ptr, get_mmuidx(oi), 1);
-tcg_out_qemu_ld_direct(s, memop, data_type, data_reg,
-   TCG_REG_X1, addr_type, addr_reg);
+
+h = (HostAddress){
+.base = TCG_REG_X1,
+.index = addr_reg,
+.index_ext = addr_type
+};
+tcg_out_qemu_ld_direct(s, memop, data_type, data_reg, h);
+
 add_qemu_ldst_label(s, true, oi, data_type, data_reg, addr_reg,
 s->code_ptr, label_ptr);
 #else /* !CONFIG_SOFTMMU */
@@ -1873,12 +1884,19 @@ static void tcg_out_qemu_ld(TCGContext *s, TCGReg 
data_reg, TCGReg addr_reg

[PATCH v4 10/57] accel/tcg: Implement helper_{ld, st}*_mmu for user-only

2023-05-03 Thread Richard Henderson

TCG backends may need to defer to a helper to implement
the atomicity required by a given operation.  Mirror the
interface used in system mode.

Signed-off-by: Richard Henderson 
---
 include/tcg/tcg-ldst.h |   6 +-
 accel/tcg/user-exec.c  | 393 -
 tcg/tcg.c  |   6 +-
 3 files changed, 278 insertions(+), 127 deletions(-)

diff --git a/include/tcg/tcg-ldst.h b/include/tcg/tcg-ldst.h
index 3d897ca942..57fafa14b1 100644
--- a/include/tcg/tcg-ldst.h
+++ b/include/tcg/tcg-ldst.h
@@ -25,8 +25,6 @@
 #ifndef TCG_LDST_H
 #define TCG_LDST_H
 
-#ifdef CONFIG_SOFTMMU
-
 /* Value zero-extended to tcg register size.  */
 tcg_target_ulong helper_ldub_mmu(CPUArchState *env, target_ulong addr,
  MemOpIdx oi, uintptr_t retaddr);
@@ -58,10 +56,10 @@ void helper_stl_mmu(CPUArchState *env, target_ulong addr, 
uint32_t val,
 void helper_stq_mmu(CPUArchState *env, target_ulong addr, uint64_t val,
 MemOpIdx oi, uintptr_t retaddr);
 
-#else
+#ifdef CONFIG_USER_ONLY
 
 G_NORETURN void helper_unaligned_ld(CPUArchState *env, target_ulong addr);
 G_NORETURN void helper_unaligned_st(CPUArchState *env, target_ulong addr);
 
-#endif /* CONFIG_SOFTMMU */
+#endif /* CONFIG_USER_ONLY */
 #endif /* TCG_LDST_H */
diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
index b89fa35a83..d9f9766b7f 100644
--- a/accel/tcg/user-exec.c
+++ b/accel/tcg/user-exec.c
@@ -889,21 +889,6 @@ void page_reset_target_data(target_ulong start, 
target_ulong last) { }
 
 /* The softmmu versions of these helpers are in cputlb.c.  */
 
-/*
- * Verify that we have passed the correct MemOp to the correct function.
- *
- * We could present one function to target code, and dispatch based on
- * the MemOp, but so far we have worked hard to avoid an indirect function
- * call along the memory path.
- */
-static void validate_memop(MemOpIdx oi, MemOp expected)
-{
-#ifdef CONFIG_DEBUG_TCG
-MemOp have = get_memop(oi) & (MO_SIZE | MO_BSWAP);
-assert(have == expected);
-#endif
-}
-
 void helper_unaligned_ld(CPUArchState *env, target_ulong addr)
 {
 cpu_loop_exit_sigbus(env_cpu(env), addr, MMU_DATA_LOAD, GETPC());
@@ -914,10 +899,9 @@ void helper_unaligned_st(CPUArchState *env, target_ulong 
addr)
 cpu_loop_exit_sigbus(env_cpu(env), addr, MMU_DATA_STORE, GETPC());
 }
 
-static void *cpu_mmu_lookup(CPUArchState *env, target_ulong addr,
-MemOpIdx oi, uintptr_t ra, MMUAccessType type)
+static void *cpu_mmu_lookup(CPUArchState *env, abi_ptr addr,
+MemOp mop, uintptr_t ra, MMUAccessType type)
 {
-MemOp mop = get_memop(oi);
 int a_bits = get_alignment_bits(mop);
 void *ret;
 
@@ -933,100 +917,206 @@ static void *cpu_mmu_lookup(CPUArchState *env, 
target_ulong addr,
 
 #include "ldst_atomicity.c.inc"
 
-uint8_t cpu_ldb_mmu(CPUArchState *env, abi_ptr addr,
-MemOpIdx oi, uintptr_t ra)
+static uint8_t do_ld1_mmu(CPUArchState *env, abi_ptr addr,
+  MemOp mop, uintptr_t ra)
 {
 void *haddr;
 uint8_t ret;
 
-validate_memop(oi, MO_UB);
-haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD);
+tcg_debug_assert((mop & MO_SIZE) == MO_8);
+haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_LOAD);
 ret = ldub_p(haddr);
 clear_helper_retaddr();
+return ret;
+}
+
+tcg_target_ulong helper_ldub_mmu(CPUArchState *env, target_ulong addr,
+ MemOpIdx oi, uintptr_t ra)
+{
+return do_ld1_mmu(env, addr, get_memop(oi), ra);
+}
+
+tcg_target_ulong helper_ldsb_mmu(CPUArchState *env, target_ulong addr,
+ MemOpIdx oi, uintptr_t ra)
+{
+return (int8_t)do_ld1_mmu(env, addr, get_memop(oi), ra);
+}
+
+uint8_t cpu_ldb_mmu(CPUArchState *env, abi_ptr addr,
+MemOpIdx oi, uintptr_t ra)
+{
+uint8_t ret = do_ld1_mmu(env, addr, get_memop(oi), ra);
 qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
 return ret;
 }
 
+static uint16_t do_ld2_he_mmu(CPUArchState *env, abi_ptr addr,
+  MemOp mop, uintptr_t ra)
+{
+void *haddr;
+uint16_t ret;
+
+tcg_debug_assert((mop & MO_SIZE) == MO_16);
+haddr = cpu_mmu_lookup(env, addr, mop, ra, MMU_DATA_LOAD);
+ret = load_atom_2(env, ra, haddr, mop);
+clear_helper_retaddr();
+return ret;
+}
+
+tcg_target_ulong helper_lduw_mmu(CPUArchState *env, target_ulong addr,
+ MemOpIdx oi, uintptr_t ra)
+{
+MemOp mop = get_memop(oi);
+uint16_t ret = do_ld2_he_mmu(env, addr, mop, ra);
+
+if (mop & MO_BSWAP) {
+ret = bswap16(ret);
+}
+return ret;
+}
+
+tcg_target_ulong helper_ldsw_mmu(CPUArchState *env, target_ulong addr,
+ MemOpIdx oi, uintptr_t ra)
+{
+MemOp mop = get_memop(oi);
+int16_t ret = do_ld2_he_mmu(env, addr, mop, ra);
+
+if (mop & MO_BSWAP) {
+

[PATCH v4 51/54] tcg/ppc: Remove unused constraint J

2023-05-03 Thread Richard Henderson

Never used since its introduction.

Fixes: 3d582c6179c ("tcg-ppc64: Rearrange integer constant constraints")
Signed-off-by: Richard Henderson 
---
 tcg/ppc/tcg-target-con-str.h | 1 -
 tcg/ppc/tcg-target.c.inc | 3 ---
 2 files changed, 4 deletions(-)

diff --git a/tcg/ppc/tcg-target-con-str.h b/tcg/ppc/tcg-target-con-str.h
index 9dcbc3df50..094613cbcb 100644
--- a/tcg/ppc/tcg-target-con-str.h
+++ b/tcg/ppc/tcg-target-con-str.h
@@ -16,7 +16,6 @@ REGS('v', ALL_VECTOR_REGS)
  * CONST(letter, TCG_CT_CONST_* bit set)
  */
 CONST('I', TCG_CT_CONST_S16)
-CONST('J', TCG_CT_CONST_U16)
 CONST('M', TCG_CT_CONST_MONE)
 CONST('T', TCG_CT_CONST_S32)
 CONST('U', TCG_CT_CONST_U32)
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index 5a4ec0470a..0a14c3e997 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -83,7 +83,6 @@
 #define SZR  (TCG_TARGET_REG_BITS / 8)
 
 #define TCG_CT_CONST_S16  0x100
-#define TCG_CT_CONST_U16  0x200
 #define TCG_CT_CONST_S32  0x400
 #define TCG_CT_CONST_U32  0x800
 #define TCG_CT_CONST_ZERO 0x1000
@@ -270,8 +269,6 @@ static bool tcg_target_const_match(int64_t val, TCGType 
type, int ct)
 
 if ((ct & TCG_CT_CONST_S16) && val == (int16_t)val) {
 return 1;
-} else if ((ct & TCG_CT_CONST_U16) && val == (uint16_t)val) {
-return 1;
 } else if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
 return 1;
 } else if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
-- 
2.34.1

[PATCH v4 38/54] tcg/arm: Convert tcg_out_qemu_{ld,st}_slow_path

2023-05-03 Thread Richard Henderson

Use tcg_out_ld_helper_args, tcg_out_ld_helper_ret,
and tcg_out_st_helper_args.  This allows our local
tcg_out_arg_* infrastructure to be removed.

Signed-off-by: Richard Henderson 
---
 tcg/arm/tcg-target.c.inc | 140 +--
 1 file changed, 18 insertions(+), 122 deletions(-)

diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
index c744512778..df514e56fc 100644
--- a/tcg/arm/tcg-target.c.inc
+++ b/tcg/arm/tcg-target.c.inc
@@ -690,8 +690,8 @@ tcg_out_ldrd_rwb(TCGContext *s, ARMCond cond, TCGReg rt, 
TCGReg rn, TCGReg rm)
 tcg_out_memop_r(s, cond, INSN_LDRD_REG, rt, rn, rm, 1, 1, 1);
 }
 
-static void tcg_out_strd_8(TCGContext *s, ARMCond cond, TCGReg rt,
-   TCGReg rn, int imm8)
+static void __attribute__((unused))
+tcg_out_strd_8(TCGContext *s, ARMCond cond, TCGReg rt, TCGReg rn, int imm8)
 {
 tcg_out_memop_8(s, cond, INSN_STRD_IMM, rt, rn, imm8, 1, 0);
 }
@@ -969,28 +969,16 @@ static void tcg_out_ext8u(TCGContext *s, TCGReg rd, 
TCGReg rn)
 tcg_out_dat_imm(s, COND_AL, ARITH_AND, rd, rn, 0xff);
 }
 
-static void __attribute__((unused))
-tcg_out_ext8u_cond(TCGContext *s, ARMCond cond, TCGReg rd, TCGReg rn)
-{
-tcg_out_dat_imm(s, cond, ARITH_AND, rd, rn, 0xff);
-}
-
 static void tcg_out_ext16s(TCGContext *s, TCGType t, TCGReg rd, TCGReg rn)
 {
 /* sxth */
 tcg_out32(s, 0x06bf0070 | (COND_AL << 28) | (rd << 12) | rn);
 }
 
-static void tcg_out_ext16u_cond(TCGContext *s, ARMCond cond,
-TCGReg rd, TCGReg rn)
-{
-/* uxth */
-tcg_out32(s, 0x06ff0070 | (cond << 28) | (rd << 12) | rn);
-}
-
 static void tcg_out_ext16u(TCGContext *s, TCGReg rd, TCGReg rn)
 {
-tcg_out_ext16u_cond(s, COND_AL, rd, rn);
+/* uxth */
+tcg_out32(s, 0x06ff0070 | (COND_AL << 28) | (rd << 12) | rn);
 }
 
 static void tcg_out_ext32s(TCGContext *s, TCGReg rd, TCGReg rn)
@@ -1382,92 +1370,29 @@ static void * const qemu_st_helpers[MO_SIZE + 1] = {
 #endif
 };
 
-/* Helper routines for marshalling helper function arguments into
- * the correct registers and stack.
- * argreg is where we want to put this argument, arg is the argument itself.
- * Return value is the updated argreg ready for the next call.
- * Note that argreg 0..3 is real registers, 4+ on stack.
- *
- * We provide routines for arguments which are: immediate, 32 bit
- * value in register, 16 and 8 bit values in register (which must be zero
- * extended before use) and 64 bit value in a lo:hi register pair.
- */
-#define DEFINE_TCG_OUT_ARG(NAME, ARGTYPE, MOV_ARG, EXT_ARG)\
-static TCGReg NAME(TCGContext *s, TCGReg argreg, ARGTYPE arg)  \
-{  \
-if (argreg < 4) {  \
-MOV_ARG(s, COND_AL, argreg, arg);  \
-} else {   \
-int ofs = (argreg - 4) * 4;\
-EXT_ARG;   \
-tcg_debug_assert(ofs + 4 <= TCG_STATIC_CALL_ARGS_SIZE);\
-tcg_out_st32_12(s, COND_AL, arg, TCG_REG_CALL_STACK, ofs); \
-}  \
-return argreg + 1; \
-}
-
-DEFINE_TCG_OUT_ARG(tcg_out_arg_imm32, uint32_t, tcg_out_movi32,
-(tcg_out_movi32(s, COND_AL, TCG_REG_TMP, arg), arg = TCG_REG_TMP))
-DEFINE_TCG_OUT_ARG(tcg_out_arg_reg8, TCGReg, tcg_out_ext8u_cond,
-(tcg_out_ext8u_cond(s, COND_AL, TCG_REG_TMP, arg), arg = TCG_REG_TMP))
-DEFINE_TCG_OUT_ARG(tcg_out_arg_reg16, TCGReg, tcg_out_ext16u_cond,
-(tcg_out_ext16u_cond(s, COND_AL, TCG_REG_TMP, arg), arg = TCG_REG_TMP))
-DEFINE_TCG_OUT_ARG(tcg_out_arg_reg32, TCGReg, tcg_out_mov_reg, )
-
-static TCGReg tcg_out_arg_reg64(TCGContext *s, TCGReg argreg,
-TCGReg arglo, TCGReg arghi)
+static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg)
 {
-/* 64 bit arguments must go in even/odd register pairs
- * and in 8-aligned stack slots.
- */
-if (argreg & 1) {
-argreg++;
-}
-if (argreg >= 4 && (arglo & 1) == 0 && arghi == arglo + 1) {
-tcg_out_strd_8(s, COND_AL, arglo,
-   TCG_REG_CALL_STACK, (argreg - 4) * 4);
-return argreg + 2;
-} else {
-argreg = tcg_out_arg_reg32(s, argreg, arglo);
-argreg = tcg_out_arg_reg32(s, argreg, arghi);
-return argreg;
-}
+/* We arrive at the slow path via "BLNE", so R14 contains l->raddr. */
+return TCG_REG_R14;
 }
 
+static const TCGLdstHelperParam ldst_helper_param = {
+.ra_gen = ldst_ra_gen,
+.ntmp = 1,
+.tmp = { TCG_REG_TMP },
+};
+
 static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
 {
-TCG

[PATCH v4 18/54] tcg/mips: Introduce prepare_host_addr

2023-05-03 Thread Richard Henderson

Merge tcg_out_tlb_load, add_qemu_ldst_label, tcg_out_test_alignment,
and some code that lived in both tcg_out_qemu_ld and tcg_out_qemu_st
into one function that returns HostAddress and TCGLabelQemuLdst structures.

Signed-off-by: Richard Henderson 
---
 tcg/mips/tcg-target.c.inc | 404 --
 1 file changed, 172 insertions(+), 232 deletions(-)

diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
index ef8350e9cd..94708e6ea7 100644
--- a/tcg/mips/tcg-target.c.inc
+++ b/tcg/mips/tcg-target.c.inc
@@ -1181,120 +1181,6 @@ static int tcg_out_call_iarg_reg2(TCGContext *s, int i, 
TCGReg al, TCGReg ah)
 return i;
 }
 
-/* We expect to use a 16-bit negative offset from ENV.  */
-QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) > 0);
-QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) < -32768);
-
-/*
- * Perform the tlb comparison operation.
- * The complete host address is placed in BASE.
- * Clobbers TMP0, TMP1, TMP2, TMP3.
- */
-static void tcg_out_tlb_load(TCGContext *s, TCGReg base, TCGReg addrl,
- TCGReg addrh, MemOpIdx oi,
- tcg_insn_unit *label_ptr[2], bool is_load)
-{
-MemOp opc = get_memop(oi);
-unsigned a_bits = get_alignment_bits(opc);
-unsigned s_bits = opc & MO_SIZE;
-unsigned a_mask = (1 << a_bits) - 1;
-unsigned s_mask = (1 << s_bits) - 1;
-int mem_index = get_mmuidx(oi);
-int fast_off = TLB_MASK_TABLE_OFS(mem_index);
-int mask_off = fast_off + offsetof(CPUTLBDescFast, mask);
-int table_off = fast_off + offsetof(CPUTLBDescFast, table);
-int add_off = offsetof(CPUTLBEntry, addend);
-int cmp_off = (is_load ? offsetof(CPUTLBEntry, addr_read)
-   : offsetof(CPUTLBEntry, addr_write));
-target_ulong tlb_mask;
-
-/* Load tlb_mask[mmu_idx] and tlb_table[mmu_idx].  */
-tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP0, TCG_AREG0, mask_off);
-tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP1, TCG_AREG0, table_off);
-
-/* Extract the TLB index from the address into TMP3.  */
-tcg_out_opc_sa(s, ALIAS_TSRL, TCG_TMP3, addrl,
-   TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
-tcg_out_opc_reg(s, OPC_AND, TCG_TMP3, TCG_TMP3, TCG_TMP0);
-
-/* Add the tlb_table pointer, creating the CPUTLBEntry address in TMP3.  */
-tcg_out_opc_reg(s, ALIAS_PADD, TCG_TMP3, TCG_TMP3, TCG_TMP1);
-
-/* Load the (low-half) tlb comparator.  */
-if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
-tcg_out_ldst(s, OPC_LW, TCG_TMP0, TCG_TMP3, cmp_off + LO_OFF);
-} else {
-tcg_out_ldst(s, (TARGET_LONG_BITS == 64 ? OPC_LD
- : TCG_TARGET_REG_BITS == 64 ? OPC_LWU : OPC_LW),
- TCG_TMP0, TCG_TMP3, cmp_off);
-}
-
-/* Zero extend a 32-bit guest address for a 64-bit host. */
-if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
-tcg_out_ext32u(s, base, addrl);
-addrl = base;
-}
-
-/*
- * Mask the page bits, keeping the alignment bits to compare against.
- * For unaligned accesses, compare against the end of the access to
- * verify that it does not cross a page boundary.
- */
-tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask;
-tcg_out_movi(s, TCG_TYPE_I32, TCG_TMP1, tlb_mask);
-if (a_mask >= s_mask) {
-tcg_out_opc_reg(s, OPC_AND, TCG_TMP1, TCG_TMP1, addrl);
-} else {
-tcg_out_opc_imm(s, ALIAS_PADDI, TCG_TMP2, addrl, s_mask - a_mask);
-tcg_out_opc_reg(s, OPC_AND, TCG_TMP1, TCG_TMP1, TCG_TMP2);
-}
-
-if (TCG_TARGET_REG_BITS >= TARGET_LONG_BITS) {
-/* Load the tlb addend for the fast path.  */
-tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP2, TCG_TMP3, add_off);
-}
-
-label_ptr[0] = s->code_ptr;
-tcg_out_opc_br(s, OPC_BNE, TCG_TMP1, TCG_TMP0);
-
-/* Load and test the high half tlb comparator.  */
-if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
-/* delay slot */
-tcg_out_ldst(s, OPC_LW, TCG_TMP0, TCG_TMP3, cmp_off + HI_OFF);
-
-/* Load the tlb addend for the fast path.  */
-tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP2, TCG_TMP3, add_off);
-
-label_ptr[1] = s->code_ptr;
-tcg_out_opc_br(s, OPC_BNE, addrh, TCG_TMP0);
-}
-
-/* delay slot */
-tcg_out_opc_reg(s, ALIAS_PADD, base, TCG_TMP2, addrl);
-}
-
-static void add_qemu_ldst_label(TCGContext *s, int is_ld, MemOpIdx oi,
-TCGType ext,
-TCGReg datalo, TCGReg datahi,
-TCGReg addrlo, TCGReg addrhi,
-void *raddr, tcg_insn_unit *label_ptr[2])
-{
-TCGLabelQemuLdst *label = new_ldst_label(s);
-
-label->is_ld = is_ld;
-label->oi = oi;
-label->type = ext;
-label->datalo_reg = datalo;
-label->datahi_reg = datahi;
-label->addrlo_reg = addrlo;
-label->addrhi_reg = addrhi;
-label->raddr = tcg_splitwx_to_rx(raddr);
-label->label_ptr[0] = label_ptr[0];
-if (T

[PATCH v4 46/54] tcg/mips: Reorg tlb load within prepare_host_addr

2023-05-03 Thread Richard Henderson

Compare the address vs the tlb entry with sign-extended values.
This simplifies the page+alignment mask constant, and the
generation of the last byte address for the misaligned test.

Move the tlb addend load up, and the zero-extension down.

This frees up a register, which allows us to use TMP3 as the returned base
address register instead of A0, which we were using as a 5th temporary.

Signed-off-by: Richard Henderson 
---
 tcg/mips/tcg-target.c.inc | 38 ++
 1 file changed, 18 insertions(+), 20 deletions(-)

diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
index 31d58e1977..695c137023 100644
--- a/tcg/mips/tcg-target.c.inc
+++ b/tcg/mips/tcg-target.c.inc
@@ -370,6 +370,8 @@ typedef enum {
 ALIAS_PADDI= sizeof(void *) == 4 ? OPC_ADDIU : OPC_DADDIU,
 ALIAS_TSRL = TARGET_LONG_BITS == 32 || TCG_TARGET_REG_BITS == 32
  ? OPC_SRL : OPC_DSRL,
+ALIAS_TADDI= TARGET_LONG_BITS == 32 || TCG_TARGET_REG_BITS == 32
+ ? OPC_ADDIU : OPC_DADDIU,
 } MIPSInsn;
 
 /*
@@ -1263,14 +1265,12 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext 
*s, HostAddress *h,
 int add_off = offsetof(CPUTLBEntry, addend);
 int cmp_off = is_ld ? offsetof(CPUTLBEntry, addr_read)
 : offsetof(CPUTLBEntry, addr_write);
-target_ulong tlb_mask;
 
 ldst = new_ldst_label(s);
 ldst->is_ld = is_ld;
 ldst->oi = oi;
 ldst->addrlo_reg = addrlo;
 ldst->addrhi_reg = addrhi;
-base = TCG_REG_A0;
 
 /* Load tlb_mask[mmu_idx] and tlb_table[mmu_idx].  */
 QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) > 0);
@@ -1290,15 +1290,12 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext 
*s, HostAddress *h,
 if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
 tcg_out_ldst(s, OPC_LW, TCG_TMP0, TCG_TMP3, cmp_off + LO_OFF);
 } else {
-tcg_out_ldst(s, (TARGET_LONG_BITS == 64 ? OPC_LD
- : TCG_TARGET_REG_BITS == 64 ? OPC_LWU : OPC_LW),
- TCG_TMP0, TCG_TMP3, cmp_off);
+tcg_out_ld(s, TCG_TYPE_TL, TCG_TMP0, TCG_TMP3, cmp_off);
 }
 
-/* Zero extend a 32-bit guest address for a 64-bit host. */
-if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
-tcg_out_ext32u(s, base, addrlo);
-addrlo = base;
+if (TCG_TARGET_REG_BITS >= TARGET_LONG_BITS) {
+/* Load the tlb addend for the fast path.  */
+tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP3, TCG_TMP3, add_off);
 }
 
 /*
@@ -1306,18 +1303,18 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext 
*s, HostAddress *h,
  * For unaligned accesses, compare against the end of the access to
  * verify that it does not cross a page boundary.
  */
-tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask;
-tcg_out_movi(s, TCG_TYPE_I32, TCG_TMP1, tlb_mask);
-if (a_mask >= s_mask) {
-tcg_out_opc_reg(s, OPC_AND, TCG_TMP1, TCG_TMP1, addrlo);
-} else {
-tcg_out_opc_imm(s, ALIAS_PADDI, TCG_TMP2, addrlo, s_mask - a_mask);
+tcg_out_movi(s, TCG_TYPE_TL, TCG_TMP1, TARGET_PAGE_MASK | a_mask);
+if (a_mask < s_mask) {
+tcg_out_opc_imm(s, ALIAS_TADDI, TCG_TMP2, addrlo, s_mask - a_mask);
 tcg_out_opc_reg(s, OPC_AND, TCG_TMP1, TCG_TMP1, TCG_TMP2);
+} else {
+tcg_out_opc_reg(s, OPC_AND, TCG_TMP1, TCG_TMP1, addrlo);
 }
 
-if (TCG_TARGET_REG_BITS >= TARGET_LONG_BITS) {
-/* Load the tlb addend for the fast path.  */
-tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP2, TCG_TMP3, add_off);
+/* Zero extend a 32-bit guest address for a 64-bit host. */
+if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
+tcg_out_ext32u(s, TCG_TMP2, addrlo);
+addrlo = TCG_TMP2;
 }
 
 ldst->label_ptr[0] = s->code_ptr;
@@ -1329,14 +1326,15 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext 
*s, HostAddress *h,
 tcg_out_ldst(s, OPC_LW, TCG_TMP0, TCG_TMP3, cmp_off + HI_OFF);
 
 /* Load the tlb addend for the fast path.  */
-tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP2, TCG_TMP3, add_off);
+tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP3, TCG_TMP3, add_off);
 
 ldst->label_ptr[1] = s->code_ptr;
 tcg_out_opc_br(s, OPC_BNE, addrhi, TCG_TMP0);
 }
 
 /* delay slot */
-tcg_out_opc_reg(s, ALIAS_PADD, base, TCG_TMP2, addrlo);
+base = TCG_TMP3;
+tcg_out_opc_reg(s, ALIAS_PADD, base, TCG_TMP3, addrlo);
 #else
 if (a_mask && (use_mips32r6_instructions || a_bits != s_bits)) {
 ldst = new_ldst_label(s);
-- 
2.34.1

[PATCH v4 00/54] tcg: Simplify calls to load/store helpers

2023-05-03 Thread Richard Henderson

v1: 
https://lore.kernel.org/qemu-devel/20230408024314.3357414-1-richard.hender...@linaro.org/
v2: 
https://lore.kernel.org/qemu-devel/20230411010512.5375-1-richard.hender...@linaro.org/
v3: 
https://lore.kernel.org/qemu-devel/20230424054105.1579315-1-richard.hender...@linaro.org/

There are several changes to the load/store helpers coming, and making
sure that those changes are properly reflected across all of the backends
was harrowing.

I have gone back and restarted by hoisting the code out of the backends
and into tcg.c.  We already have all of the parameters for the host
function call ABI for "normal" helpers; we simply need to apply that to
the load/store slow path.

No major changes for v4.  A few patches upstreamed, and one new one
based on Phil's review.


r~


Richard Henderson (54):
  tcg/i386: Rationalize args to tcg_out_qemu_{ld,st}
  tcg/i386: Generalize multi-part load overlap test
  tcg/i386: Introduce HostAddress
  tcg/i386: Drop r0+r1 local variables from tcg_out_tlb_load
  tcg/i386: Introduce tcg_out_testi
  tcg/i386: Introduce prepare_host_addr
  tcg/i386: Use indexed addressing for softmmu fast path
  tcg/aarch64: Rationalize args to tcg_out_qemu_{ld,st}
  tcg/aarch64: Introduce HostAddress
  tcg/aarch64: Introduce prepare_host_addr
  tcg/arm: Rationalize args to tcg_out_qemu_{ld,st}
  tcg/arm: Introduce HostAddress
  tcg/arm: Introduce prepare_host_addr
  tcg/loongarch64: Rationalize args to tcg_out_qemu_{ld,st}
  tcg/loongarch64: Introduce HostAddress
  tcg/loongarch64: Introduce prepare_host_addr
  tcg/mips: Rationalize args to tcg_out_qemu_{ld,st}
  tcg/mips: Introduce prepare_host_addr
  tcg/ppc: Rationalize args to tcg_out_qemu_{ld,st}
  tcg/ppc: Introduce HostAddress
  tcg/ppc: Introduce prepare_host_addr
  tcg/riscv: Require TCG_TARGET_REG_BITS == 64
  tcg/riscv: Rationalize args to tcg_out_qemu_{ld,st}
  tcg/riscv: Introduce prepare_host_addr
  tcg/s390x: Pass TCGType to tcg_out_qemu_{ld,st}
  tcg/s390x: Introduce HostAddress
  tcg/s390x: Introduce prepare_host_addr
  tcg/sparc64: Drop is_64 test from tcg_out_qemu_ld data return
  tcg/sparc64: Pass TCGType to tcg_out_qemu_{ld,st}
  tcg: Move TCGLabelQemuLdst to tcg.c
  tcg: Replace REG_P with arg_loc_reg_p
  tcg: Introduce arg_slot_stk_ofs
  tcg: Widen helper_*_st[bw]_mmu val arguments
  tcg: Add routines for calling slow-path helpers
  tcg/i386: Convert tcg_out_qemu_ld_slow_path
  tcg/i386: Convert tcg_out_qemu_st_slow_path
  tcg/aarch64: Convert tcg_out_qemu_{ld,st}_slow_path
  tcg/arm: Convert tcg_out_qemu_{ld,st}_slow_path
  tcg/loongarch64: Convert tcg_out_qemu_{ld,st}_slow_path
  tcg/mips: Convert tcg_out_qemu_{ld,st}_slow_path
  tcg/ppc: Convert tcg_out_qemu_{ld,st}_slow_path
  tcg/riscv: Convert tcg_out_qemu_{ld,st}_slow_path
  tcg/s390x: Convert tcg_out_qemu_{ld,st}_slow_path
  tcg/loongarch64: Simplify constraints on qemu_ld/st
  tcg/mips: Remove MO_BSWAP handling
  tcg/mips: Reorg tlb load within prepare_host_addr
  tcg/mips: Simplify constraints on qemu_ld/st
  tcg/ppc: Reorg tcg_out_tlb_read
  tcg/ppc: Adjust constraints on qemu_ld/st
  tcg/ppc: Remove unused constraints A, B, C, D
  tcg/ppc: Remove unused constraint J
  tcg/riscv: Simplify constraints on qemu_ld/st
  tcg/s390x: Use ALGFR in constructing softmmu host address
  tcg/s390x: Simplify constraints on qemu_ld/st

 include/tcg/tcg-ldst.h   |  10 +-
 tcg/loongarch64/tcg-target-con-set.h |   2 -
 tcg/loongarch64/tcg-target-con-str.h |   1 -
 tcg/mips/tcg-target-con-set.h|  13 +-
 tcg/mips/tcg-target-con-str.h|   2 -
 tcg/mips/tcg-target.h|   4 +-
 tcg/ppc/tcg-target-con-set.h |  11 +-
 tcg/ppc/tcg-target-con-str.h |   7 -
 tcg/riscv/tcg-target-con-set.h   |  10 -
 tcg/riscv/tcg-target-con-str.h   |   1 -
 tcg/riscv/tcg-target.h   |  22 +-
 tcg/s390x/tcg-target-con-set.h   |   2 -
 tcg/s390x/tcg-target-con-str.h   |   1 -
 tcg/tcg-internal.h   |   4 -
 accel/tcg/cputlb.c   |   6 +-
 tcg/tcg.c| 514 ++-
 tcg/aarch64/tcg-target.c.inc | 363 +--
 tcg/arm/tcg-target.c.inc | 718 
 tcg/i386/tcg-target.c.inc| 700 +---
 tcg/loongarch64/tcg-target.c.inc | 372 ---
 tcg/mips/tcg-target.c.inc| 942 ---
 tcg/ppc/tcg-target.c.inc | 640 --
 tcg/riscv/tcg-target.c.inc   | 534 +--
 tcg/s390x/tcg-target.c.inc   | 393 +--
 tcg/sparc64/tcg-target.c.inc |   8 +-
 tcg/tcg-ldst.c.inc   |  14 -
 26 files changed, 2340 insertions(+), 2954 deletions(-)

-- 
2.34.1

[PATCH v4 19/54] tcg/ppc: Rationalize args to tcg_out_qemu_{ld,st}

2023-05-03 Thread Richard Henderson

Interpret the variable argument placement in the caller.  Pass data_type
instead of is64 -- there are several places where we already convert back
from bool to type.  Clean things up by using type throughout.

Reviewed-by: Philippe Mathieu-Daudé 
Reviewed-by: Daniel Henrique Barboza 
Signed-off-by: Richard Henderson 
---
 tcg/ppc/tcg-target.c.inc | 110 +--
 1 file changed, 59 insertions(+), 51 deletions(-)

diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index 77abb7d20c..d1aa2a9f53 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -2118,7 +2118,8 @@ static TCGReg tcg_out_tlb_read(TCGContext *s, MemOp opc,
 /* Record the context of a call to the out of line helper code for the slow
path for a load or store, so that we can later generate the correct
helper code.  */
-static void add_qemu_ldst_label(TCGContext *s, bool is_ld, MemOpIdx oi,
+static void add_qemu_ldst_label(TCGContext *s, bool is_ld,
+TCGType type, MemOpIdx oi,
 TCGReg datalo_reg, TCGReg datahi_reg,
 TCGReg addrlo_reg, TCGReg addrhi_reg,
 tcg_insn_unit *raddr, tcg_insn_unit *lptr)
@@ -2126,6 +2127,7 @@ static void add_qemu_ldst_label(TCGContext *s, bool 
is_ld, MemOpIdx oi,
 TCGLabelQemuLdst *label = new_ldst_label(s);
 
 label->is_ld = is_ld;
+label->type = type;
 label->oi = oi;
 label->datalo_reg = datalo_reg;
 label->datahi_reg = datahi_reg;
@@ -2288,30 +2290,18 @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, 
TCGLabelQemuLdst *l)
 
 #endif /* SOFTMMU */
 
-static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is_64)
+static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi,
+TCGReg addrlo, TCGReg addrhi,
+MemOpIdx oi, TCGType data_type)
 {
-TCGReg datalo, datahi, addrlo, rbase;
-TCGReg addrhi __attribute__((unused));
-MemOpIdx oi;
-MemOp opc, s_bits;
+MemOp opc = get_memop(oi);
+MemOp s_bits = opc & MO_SIZE;
+TCGReg rbase;
+
 #ifdef CONFIG_SOFTMMU
-int mem_index;
 tcg_insn_unit *label_ptr;
-#else
-unsigned a_bits;
-#endif
 
-datalo = *args++;
-datahi = (TCG_TARGET_REG_BITS == 32 && is_64 ? *args++ : 0);
-addrlo = *args++;
-addrhi = (TCG_TARGET_REG_BITS < TARGET_LONG_BITS ? *args++ : 0);
-oi = *args++;
-opc = get_memop(oi);
-s_bits = opc & MO_SIZE;
-
-#ifdef CONFIG_SOFTMMU
-mem_index = get_mmuidx(oi);
-addrlo = tcg_out_tlb_read(s, opc, addrlo, addrhi, mem_index, true);
+addrlo = tcg_out_tlb_read(s, opc, addrlo, addrhi, get_mmuidx(oi), true);
 
 /* Load a pointer into the current opcode w/conditional branch-link. */
 label_ptr = s->code_ptr;
@@ -2319,7 +2309,7 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg 
*args, bool is_64)
 
 rbase = TCG_REG_R3;
 #else  /* !CONFIG_SOFTMMU */
-a_bits = get_alignment_bits(opc);
+unsigned a_bits = get_alignment_bits(opc);
 if (a_bits) {
 tcg_out_test_alignment(s, true, addrlo, addrhi, a_bits);
 }
@@ -2364,35 +2354,23 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg 
*args, bool is_64)
 }
 
 #ifdef CONFIG_SOFTMMU
-add_qemu_ldst_label(s, true, oi, datalo, datahi, addrlo, addrhi,
-s->code_ptr, label_ptr);
+add_qemu_ldst_label(s, true, data_type, oi, datalo, datahi,
+addrlo, addrhi, s->code_ptr, label_ptr);
 #endif
 }
 
-static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64)
+static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi,
+TCGReg addrlo, TCGReg addrhi,
+MemOpIdx oi, TCGType data_type)
 {
-TCGReg datalo, datahi, addrlo, rbase;
-TCGReg addrhi __attribute__((unused));
-MemOpIdx oi;
-MemOp opc, s_bits;
+MemOp opc = get_memop(oi);
+MemOp s_bits = opc & MO_SIZE;
+TCGReg rbase;
+
 #ifdef CONFIG_SOFTMMU
-int mem_index;
 tcg_insn_unit *label_ptr;
-#else
-unsigned a_bits;
-#endif
 
-datalo = *args++;
-datahi = (TCG_TARGET_REG_BITS == 32 && is_64 ? *args++ : 0);
-addrlo = *args++;
-addrhi = (TCG_TARGET_REG_BITS < TARGET_LONG_BITS ? *args++ : 0);
-oi = *args++;
-opc = get_memop(oi);
-s_bits = opc & MO_SIZE;
-
-#ifdef CONFIG_SOFTMMU
-mem_index = get_mmuidx(oi);
-addrlo = tcg_out_tlb_read(s, opc, addrlo, addrhi, mem_index, false);
+addrlo = tcg_out_tlb_read(s, opc, addrlo, addrhi, get_mmuidx(oi), false);
 
 /* Load a pointer into the current opcode w/conditional branch-link. */
 label_ptr = s->code_ptr;
@@ -2400,7 +2378,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg 
*args, bool is_64)
 
 rbase = TCG_REG_R3;
 #else  /* !CONFIG_SOFTMMU */
-a_bits = get_alignment_bits(opc);
+unsigned a_bits = get_ali

[PATCH v4 14/57] tcg/i386: Add have_atomic16

2023-05-03 Thread Richard Henderson

Notice when Intel or AMD have guaranteed that vmovdqa is atomic.
The new variable will also be used in generated code.

Signed-off-by: Richard Henderson 
---
 include/qemu/cpuid.h  | 18 ++
 tcg/i386/tcg-target.h |  1 +
 tcg/i386/tcg-target.c.inc | 27 +++
 3 files changed, 46 insertions(+)

diff --git a/include/qemu/cpuid.h b/include/qemu/cpuid.h
index 1451e8ef2f..35325f1995 100644
--- a/include/qemu/cpuid.h
+++ b/include/qemu/cpuid.h
@@ -71,6 +71,24 @@
 #define bit_LZCNT   (1 << 5)
 #endif
 
+/*
+ * Signatures for different CPU implementations as returned from Leaf 0.
+ */
+
+#ifndef signature_INTEL_ecx
+/* "Genu" "ineI" "ntel" */
+#define signature_INTEL_ebx 0x756e6547
+#define signature_INTEL_edx 0x49656e69
+#define signature_INTEL_ecx 0x6c65746e
+#endif
+
+#ifndef signature_AMD_ecx
+/* "Auth" "enti" "cAMD" */
+#define signature_AMD_ebx   0x68747541
+#define signature_AMD_edx   0x69746e65
+#define signature_AMD_ecx   0x444d4163
+#endif
+
 static inline unsigned xgetbv_low(unsigned c)
 {
 unsigned a, d;
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index d4f2a6f8c2..0421776cb8 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -120,6 +120,7 @@ extern bool have_avx512dq;
 extern bool have_avx512vbmi2;
 extern bool have_avx512vl;
 extern bool have_movbe;
+extern bool have_atomic16;
 
 /* optional instructions */
 #define TCG_TARGET_HAS_div2_i32 1
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index bb603e7968..f838683fc3 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -185,6 +185,7 @@ bool have_avx512dq;
 bool have_avx512vbmi2;
 bool have_avx512vl;
 bool have_movbe;
+bool have_atomic16;
 
 #ifdef CONFIG_CPUID_H
 static bool have_bmi2;
@@ -4024,6 +4025,32 @@ static void tcg_target_init(TCGContext *s)
 have_avx512dq = (b7 & bit_AVX512DQ) != 0;
 have_avx512vbmi2 = (c7 & bit_AVX512VBMI2) != 0;
 }
+
+/*
+ * The Intel SDM has added:
+ *   Processors that enumerate support for Intel® AVX
+ *   (by setting the feature flag CPUID.01H:ECX.AVX[bit 28])
+ *   guarantee that the 16-byte memory operations performed
+ *   by the following instructions will always be carried
+ *   out atomically:
+ *   - MOVAPD, MOVAPS, and MOVDQA.
+ *   - VMOVAPD, VMOVAPS, and VMOVDQA when encoded with VEX.128.
+ *   - VMOVAPD, VMOVAPS, VMOVDQA32, and VMOVDQA64 when encoded
+ * with EVEX.128 and k0 (masking disabled).
+ * Note that these instructions require the linear addresses
+ * of their memory operands to be 16-byte aligned.
+ *
+ * AMD has provided an even stronger guarantee that processors
+ * with AVX provide 16-byte atomicity for all cachable,
+ * naturally aligned single loads and stores, e.g. MOVDQU.
+ *
+ * See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104688
+ */
+if (have_avx1) {
+__cpuid(0, a, b, c, d);
+have_atomic16 = (c == signature_INTEL_ecx ||
+ c == signature_AMD_ecx);
+}
 }
 }
 }
-- 
2.34.1

[PATCH v4 50/54] tcg/ppc: Remove unused constraints A, B, C, D

2023-05-03 Thread Richard Henderson

These constraints have not been used for quite some time.

Fixes: 77b73de67632 ("Use rem/div[u]_i32 drop div[u]2_i32")
Reviewed-by: Daniel Henrique Barboza 
Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 tcg/ppc/tcg-target-con-str.h | 4 
 1 file changed, 4 deletions(-)

diff --git a/tcg/ppc/tcg-target-con-str.h b/tcg/ppc/tcg-target-con-str.h
index f3bf030bc3..9dcbc3df50 100644
--- a/tcg/ppc/tcg-target-con-str.h
+++ b/tcg/ppc/tcg-target-con-str.h
@@ -10,10 +10,6 @@
  */
 REGS('r', ALL_GENERAL_REGS)
 REGS('v', ALL_VECTOR_REGS)
-REGS('A', 1u << TCG_REG_R3)
-REGS('B', 1u << TCG_REG_R4)
-REGS('C', 1u << TCG_REG_R5)
-REGS('D', 1u << TCG_REG_R6)
 
 /*
  * Define constraint letters for constants:
-- 
2.34.1

[PATCH v4 19/57] accel/tcg: Add have_lse2 support in ldst_atomicity

2023-05-03 Thread Richard Henderson

Add fast paths for FEAT_LSE2, using the detection in tcg.

Signed-off-by: Richard Henderson 
---
 accel/tcg/ldst_atomicity.c.inc | 37 ++
 1 file changed, 33 insertions(+), 4 deletions(-)

diff --git a/accel/tcg/ldst_atomicity.c.inc b/accel/tcg/ldst_atomicity.c.inc
index 2426b09aef..7ed5d4282d 100644
--- a/accel/tcg/ldst_atomicity.c.inc
+++ b/accel/tcg/ldst_atomicity.c.inc
@@ -41,6 +41,8 @@
  * but we're using tcg/tci/ instead.
  */
 # define HAVE_al16_fast    false
+#elif defined(__aarch64__)
+# define HAVE_al16_fast    likely(have_lse2)
 #elif defined(__x86_64__) && defined(CONFIG_INT128)
 # define HAVE_al16_fast    likely(have_atomic16)
 #else
@@ -48,6 +50,8 @@
 #endif
 #if defined(CONFIG_ATOMIC128) || defined(CONFIG_CMPXCHG128)
 # define HAVE_al16 true
+#elif defined(__aarch64__)
+# define HAVE_al16 true
 #else
 # define HAVE_al16 false
 #endif
@@ -170,6 +174,14 @@ load_atomic16(void *pv)
 
 r.u = qatomic_read__nocheck(p);
 return r.s;
+#elif defined(__aarch64__)
+uint64_t l, h;
+
+/* Via HAVE_al16_fast, FEAT_LSE2 is present: LDP becomes atomic. */
+asm("ldp %0, %1, %2" : "=r"(l), "=r"(h) : "m"(*(__uint128_t *)pv));
+
+qemu_build_assert(!HOST_BIG_ENDIAN);
+return int128_make128(l, h);
 #elif defined(__x86_64__) && defined(CONFIG_INT128)
 Int128Alias r;
 
@@ -412,6 +424,18 @@ load_atom_extract_al16_or_al8(void *pv, int s)
 r = qatomic_read__nocheck(p16);
 }
 return r >> shr;
+#elif defined(__aarch64__)
+/*
+ * Via HAVE_al16_fast, FEAT_LSE2 is present.
+ * LDP becomes single-copy atomic if 16-byte aligned, and
+ * single-copy atomic on the parts if 8-byte aligned.
+ */
+uintptr_t pi = (uintptr_t)pv;
+int shr = (pi & 7) * 8;
+uint64_t l, h;
+
+asm("ldp %0, %1, %2" : "=r"(l), "=r"(h) : "m"(*(__uint128_t *)(pi & ~7)));
+return (l >> shr) | (h << (-shr & 63));
 #elif defined(__x86_64__) && defined(CONFIG_INT128)
 uintptr_t pi = (uintptr_t)pv;
 int shr = (pi & 7) * 8;
@@ -767,10 +791,15 @@ store_atomic16(void *pv, Int128Alias val)
 l = int128_getlo(val.s);
 h = int128_gethi(val.s);
 
-asm("0: ldxp %0, xzr, %1\n\t"
-"stxp %w0, %2, %3, %1\n\t"
-"cbnz %w0, 0b"
-: "=&r"(t), "=Q"(*(__uint128_t *)pv) : "r"(l), "r"(h));
+if (HAVE_al16_fast) {
+/* Via HAVE_al16_fast, FEAT_LSE2 is present: STP becomes atomic. */
+asm("stp %1, %2, %0" : "=Q"(*(__uint128_t *)pv) : "r"(l), "r"(h));
+} else {
+asm("0: ldxp %0, xzr, %1\n\t"
+"stxp %w0, %2, %3, %1\n\t"
+"cbnz %w0, 0b"
+: "=&r"(t), "=Q"(*(__uint128_t *)pv) : "r"(l), "r"(h));
+}
 return;
 }
 #elif defined(CONFIG_CMPXCHG128)
-- 
2.34.1

[PATCH v4 11/54] tcg/arm: Rationalize args to tcg_out_qemu_{ld,st}

2023-05-03 Thread Richard Henderson

Interpret the variable argument placement in the caller.
Pass data_type instead of is_64.  We need to set this in
TCGLabelQemuLdst, so plumb this all the way through from tcg_out_op.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 tcg/arm/tcg-target.c.inc | 113 +++
 1 file changed, 56 insertions(+), 57 deletions(-)

diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
index 83c818a58b..6ce52b9612 100644
--- a/tcg/arm/tcg-target.c.inc
+++ b/tcg/arm/tcg-target.c.inc
@@ -1526,15 +1526,18 @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg 
addrlo, TCGReg addrhi,
 /* Record the context of a call to the out of line helper code for the slow
path for a load or store, so that we can later generate the correct
helper code.  */
-static void add_qemu_ldst_label(TCGContext *s, bool is_ld, MemOpIdx oi,
-TCGReg datalo, TCGReg datahi, TCGReg addrlo,
-TCGReg addrhi, tcg_insn_unit *raddr,
+static void add_qemu_ldst_label(TCGContext *s, bool is_ld,
+MemOpIdx oi, TCGType type,
+TCGReg datalo, TCGReg datahi,
+TCGReg addrlo, TCGReg addrhi,
+tcg_insn_unit *raddr,
 tcg_insn_unit *label_ptr)
 {
 TCGLabelQemuLdst *label = new_ldst_label(s);
 
 label->is_ld = is_ld;
 label->oi = oi;
+label->type = type;
 label->datalo_reg = datalo;
 label->datahi_reg = datahi;
 label->addrlo_reg = addrlo;
@@ -1796,41 +1799,28 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, MemOp 
opc, TCGReg datalo,
 }
 #endif
 
-static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
+static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi,
+TCGReg addrlo, TCGReg addrhi,
+MemOpIdx oi, TCGType data_type)
 {
-TCGReg addrlo, datalo, datahi, addrhi __attribute__((unused));
-MemOpIdx oi;
-MemOp opc;
-#ifdef CONFIG_SOFTMMU
-int mem_index;
-TCGReg addend;
-tcg_insn_unit *label_ptr;
-#else
-unsigned a_bits;
-#endif
-
-datalo = *args++;
-datahi = (is64 ? *args++ : 0);
-addrlo = *args++;
-addrhi = (TARGET_LONG_BITS == 64 ? *args++ : 0);
-oi = *args++;
-opc = get_memop(oi);
+MemOp opc = get_memop(oi);
 
 #ifdef CONFIG_SOFTMMU
-mem_index = get_mmuidx(oi);
-addend = tcg_out_tlb_read(s, addrlo, addrhi, opc, mem_index, 1);
+TCGReg addend= tcg_out_tlb_read(s, addrlo, addrhi, opc, get_mmuidx(oi), 1);
 
-/* This a conditional BL only to load a pointer within this opcode into LR
-   for the slow path.  We will not be using the value for a tail call.  */
-label_ptr = s->code_ptr;
+/*
+ * This a conditional BL only to load a pointer within this opcode into
+ * LR for the slow path.  We will not be using the value for a tail call.
+ */
+tcg_insn_unit *label_ptr = s->code_ptr;
 tcg_out_bl_imm(s, COND_NE, 0);
 
 tcg_out_qemu_ld_index(s, opc, datalo, datahi, addrlo, addend, true);
 
-add_qemu_ldst_label(s, true, oi, datalo, datahi, addrlo, addrhi,
-s->code_ptr, label_ptr);
+add_qemu_ldst_label(s, true, oi, data_type, datalo, datahi,
+addrlo, addrhi, s->code_ptr, label_ptr);
 #else /* !CONFIG_SOFTMMU */
-a_bits = get_alignment_bits(opc);
+unsigned a_bits = get_alignment_bits(opc);
 if (a_bits) {
 tcg_out_test_alignment(s, true, addrlo, addrhi, a_bits);
 }
@@ -1918,41 +1908,26 @@ static void tcg_out_qemu_st_direct(TCGContext *s, MemOp 
opc, TCGReg datalo,
 }
 #endif
 
-static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
+static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi,
+TCGReg addrlo, TCGReg addrhi,
+MemOpIdx oi, TCGType data_type)
 {
-TCGReg addrlo, datalo, datahi, addrhi __attribute__((unused));
-MemOpIdx oi;
-MemOp opc;
-#ifdef CONFIG_SOFTMMU
-int mem_index;
-TCGReg addend;
-tcg_insn_unit *label_ptr;
-#else
-unsigned a_bits;
-#endif
-
-datalo = *args++;
-datahi = (is64 ? *args++ : 0);
-addrlo = *args++;
-addrhi = (TARGET_LONG_BITS == 64 ? *args++ : 0);
-oi = *args++;
-opc = get_memop(oi);
+MemOp opc = get_memop(oi);
 
 #ifdef CONFIG_SOFTMMU
-mem_index = get_mmuidx(oi);
-addend = tcg_out_tlb_read(s, addrlo, addrhi, opc, mem_index, 0);
+TCGReg addend = tcg_out_tlb_read(s, addrlo, addrhi, opc, get_mmuidx(oi), 
0);
 
 tcg_out_qemu_st_index(s, COND_EQ, opc, datalo, datahi,
   addrlo, addend, true);
 
 /* The conditional call must come last, as we're going to return here.  */
-label_ptr = s->code_ptr;
+tcg_insn_unit *label_ptr = s->code_ptr;
 tcg_out_bl_imm(s, COND_NE, 0)

[PATCH v4 48/54] tcg/ppc: Reorg tcg_out_tlb_read

2023-05-03 Thread Richard Henderson

Allocate TCG_REG_TMP2.  Use R0, TMP1, TMP2 instead of any of
the normally allocated registers for the tlb load.

Reviewed-by: Daniel Henrique Barboza 
Signed-off-by: Richard Henderson 
---
 tcg/ppc/tcg-target.c.inc | 84 
 1 file changed, 51 insertions(+), 33 deletions(-)

diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index 042136fee7..6850ecbc80 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -68,6 +68,7 @@
 #else
 # define TCG_REG_TMP1   TCG_REG_R12
 #endif
+#define TCG_REG_TMP2    TCG_REG_R11
 
 #define TCG_VEC_TMP1    TCG_REG_V0
 #define TCG_VEC_TMP2    TCG_REG_V1
@@ -2015,13 +2016,11 @@ static TCGReg ldst_ra_gen(TCGContext *s, const 
TCGLabelQemuLdst *l, int arg)
 /*
  * For the purposes of ppc32 sorting 4 input registers into 4 argument
  * registers, there is an outside chance we would require 3 temps.
- * Because of constraints, no inputs are in r3, and env will not be
- * placed into r3 until after the sorting is done, and is thus free.
  */
 static const TCGLdstHelperParam ldst_helper_param = {
 .ra_gen = ldst_ra_gen,
 .ntmp = 3,
-.tmp = { TCG_REG_TMP1, TCG_REG_R0, TCG_REG_R3 }
+.tmp = { TCG_REG_TMP1, TCG_REG_TMP2, TCG_REG_R0 }
 };
 
 static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
@@ -2135,41 +2134,44 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext 
*s, HostAddress *h,
 /* Load tlb_mask[mmu_idx] and tlb_table[mmu_idx].  */
 QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) > 0);
 QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) < -32768);
-tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R3, TCG_AREG0, mask_off);
-tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R4, TCG_AREG0, table_off);
+tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_AREG0, mask_off);
+tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP2, TCG_AREG0, table_off);
 
 /* Extract the page index, shifted into place for tlb index.  */
 if (TCG_TARGET_REG_BITS == 32) {
-tcg_out_shri32(s, TCG_REG_TMP1, addrlo,
+tcg_out_shri32(s, TCG_REG_R0, addrlo,
TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
 } else {
-tcg_out_shri64(s, TCG_REG_TMP1, addrlo,
+tcg_out_shri64(s, TCG_REG_R0, addrlo,
TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
 }
-tcg_out32(s, AND | SAB(TCG_REG_R3, TCG_REG_R3, TCG_REG_TMP1));
+tcg_out32(s, AND | SAB(TCG_REG_TMP1, TCG_REG_TMP1, TCG_REG_R0));
 
-/* Load the TLB comparator.  */
+/* Load the (low part) TLB comparator into TMP2.  */
 if (cmp_off == 0 && TCG_TARGET_REG_BITS >= TARGET_LONG_BITS) {
 uint32_t lxu = (TCG_TARGET_REG_BITS == 32 || TARGET_LONG_BITS == 32
 ? LWZUX : LDUX);
-tcg_out32(s, lxu | TAB(TCG_REG_TMP1, TCG_REG_R3, TCG_REG_R4));
+tcg_out32(s, lxu | TAB(TCG_REG_TMP2, TCG_REG_TMP1, TCG_REG_TMP2));
 } else {
-tcg_out32(s, ADD | TAB(TCG_REG_R3, TCG_REG_R3, TCG_REG_R4));
+tcg_out32(s, ADD | TAB(TCG_REG_TMP1, TCG_REG_TMP1, TCG_REG_TMP2));
 if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
-tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_TMP1, TCG_REG_R3, cmp_off + 4);
-tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_R4, TCG_REG_R3, cmp_off);
+tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_TMP2,
+   TCG_REG_TMP1, cmp_off + 4 * HOST_BIG_ENDIAN);
 } else {
-tcg_out_ld(s, TCG_TYPE_TL, TCG_REG_TMP1, TCG_REG_R3, cmp_off);
+tcg_out_ld(s, TCG_TYPE_TL, TCG_REG_TMP2, TCG_REG_TMP1, cmp_off);
 }
 }
 
-/* Load the TLB addend for use on the fast path.  Do this asap
-   to minimize any load use delay.  */
-h->base = TCG_REG_R3;
-tcg_out_ld(s, TCG_TYPE_PTR, h->base, TCG_REG_R3,
-   offsetof(CPUTLBEntry, addend));
+/*
+ * Load the TLB addend for use on the fast path.
+ * Do this asap to minimize any load use delay.
+ */
+if (TCG_TARGET_REG_BITS >= TARGET_LONG_BITS) {
+tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_REG_TMP1,
+   offsetof(CPUTLBEntry, addend));
+}
 
-/* Clear the non-page, non-alignment bits from the address */
+/* Clear the non-page, non-alignment bits from the address in R0. */
 if (TCG_TARGET_REG_BITS == 32) {
 /* We don't support unaligned accesses on 32-bits.
  * Preserve the bottom bits and thus trigger a comparison
@@ -2200,9 +2202,6 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, 
HostAddress *h,
 if (TARGET_LONG_BITS == 32) {
 tcg_out_rlw(s, RLWINM, TCG_REG_R0, t, 0,
 (32 - a_bits) & 31, 31 - TARGET_PAGE_BITS);
-/* Zero-extend the address for use in the final address.  */
-tcg_out_ext32u(s, TCG_REG_R4, addrlo);
-addrlo = TCG_REG_R4;
 } else if (a_bits == 0) {
 tcg_out_rld(s, RLDICR, TCG_REG_R0, t, 0, 63 - TARGET_PAGE_BITS);
 } else {
@@ -2211,21 +221

[PATCH v4 05/57] accel/tcg: Reorg system mode store helpers

2023-05-03 Thread Richard Henderson

Instead of trying to unify all operations on uint64_t, use
mmu_lookup() to perform the basic tlb hit and resolution.
Create individual functions to handle access by size.

Signed-off-by: Richard Henderson 
---
 accel/tcg/cputlb.c | 408 +
 1 file changed, 193 insertions(+), 215 deletions(-)

diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index dd68514260..f52c7e6da0 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -2531,322 +2531,300 @@ store_memop(void *haddr, uint64_t val, MemOp op)
 }
 }
 
-static void full_stb_mmu(CPUArchState *env, target_ulong addr, uint64_t val,
- MemOpIdx oi, uintptr_t retaddr);
-
-static void __attribute__((noinline))
-store_helper_unaligned(CPUArchState *env, target_ulong addr, uint64_t val,
-   uintptr_t retaddr, size_t size, uintptr_t mmu_idx,
-   bool big_endian)
+/**
+ * do_st_mmio_leN:
+ * @env: cpu context
+ * @p: translation parameters
+ * @val_le: data to store
+ * @mmu_idx: virtual address context
+ * @ra: return address into tcg generated code, or 0
+ *
+ * Store @p->size bytes at @p->addr, which is memory-mapped i/o.
+ * The bytes to store are extracted in little-endian order from @val_le;
+ * return the bytes of @val_le beyond @p->size that have not been stored.
+ */
+static uint64_t do_st_mmio_leN(CPUArchState *env, MMULookupPageData *p,
+   uint64_t val_le, int mmu_idx, uintptr_t ra)
 {
-uintptr_t index, index2;
-CPUTLBEntry *entry, *entry2;
-target_ulong page1, page2, tlb_addr, tlb_addr2;
-MemOpIdx oi;
-size_t size2;
-int i;
+CPUTLBEntryFull *full = p->full;
+target_ulong addr = p->addr;
+int i, size = p->size;
 
-/*
- * Ensure the second page is in the TLB.  Note that the first page
- * is already guaranteed to be filled, and that the second page
- * cannot evict the first.  An exception to this rule is PAGE_WRITE_INV
- * handling: the first page could have evicted itself.
- */
-page1 = addr & TARGET_PAGE_MASK;
-page2 = (addr + size) & TARGET_PAGE_MASK;
-size2 = (addr + size) & ~TARGET_PAGE_MASK;
-index2 = tlb_index(env, mmu_idx, page2);
-entry2 = tlb_entry(env, mmu_idx, page2);
-
-tlb_addr2 = tlb_addr_write(entry2);
-if (page1 != page2 && !tlb_hit_page(tlb_addr2, page2)) {
-if (!victim_tlb_hit(env, mmu_idx, index2, MMU_DATA_STORE, page2)) {
-tlb_fill(env_cpu(env), page2, size2, MMU_DATA_STORE,
- mmu_idx, retaddr);
-index2 = tlb_index(env, mmu_idx, page2);
-entry2 = tlb_entry(env, mmu_idx, page2);
-}
-tlb_addr2 = tlb_addr_write(entry2);
+QEMU_IOTHREAD_LOCK_GUARD();
+for (i = 0; i < size; i++, val_le >>= 8) {
+io_writex(env, full, mmu_idx, val_le, addr + i, ra, MO_UB);
 }
+return val_le;
+}
 
-index = tlb_index(env, mmu_idx, addr);
-entry = tlb_entry(env, mmu_idx, addr);
-tlb_addr = tlb_addr_write(entry);
+/**
+ * do_st_bytes_leN:
+ * @p: translation parameters
+ * @val_le: data to store
+ *
+ * Store @p->size bytes at @p->haddr, which is RAM.
+ * The bytes to store are extracted in little-endian order from @val_le;
+ * return the bytes of @val_le beyond @p->size that have not been stored.
+ */
+static uint64_t do_st_bytes_leN(MMULookupPageData *p, uint64_t val_le)
+{
+uint8_t *haddr = p->haddr;
+int i, size = p->size;
 
-/*
- * Handle watchpoints.  Since this may trap, all checks
- * must happen before any store.
- */
-if (unlikely(tlb_addr & TLB_WATCHPOINT)) {
-cpu_check_watchpoint(env_cpu(env), addr, size - size2,
- env_tlb(env)->d[mmu_idx].fulltlb[index].attrs,
- BP_MEM_WRITE, retaddr);
-}
-if (unlikely(tlb_addr2 & TLB_WATCHPOINT)) {
-cpu_check_watchpoint(env_cpu(env), page2, size2,
- env_tlb(env)->d[mmu_idx].fulltlb[index2].attrs,
- BP_MEM_WRITE, retaddr);
+for (i = 0; i < size; i++, val_le >>= 8) {
+haddr[i] = val_le;
 }
+return val_le;
+}
 
-/*
- * XXX: not efficient, but simple.
- * This loop must go in the forward direction to avoid issues
- * with self-modifying code in Windows 64-bit.
- */
-oi = make_memop_idx(MO_UB, mmu_idx);
-if (big_endian) {
-for (i = 0; i < size; ++i) {
-/* Big-endian extract.  */
-uint8_t val8 = val >> (((size - 1) * 8) - (i * 8));
-full_stb_mmu(env, addr + i, val8, oi, retaddr);
-}
+/*
+ * Wrapper for the above.
+ */
+static uint64_t do_st_leN(CPUArchState *env, MMULookupPageData *p,
+  uint64_t val_le, int mmu_idx, uintptr_t ra)
+{
+if (unlikely(p->flags & TLB_MMIO)) {
+return do_st_mmio_leN(env, p, val_le, mmu_idx, ra);
+} else if (unlikely(p->flags & TLB_DISCARD_WRITE)) {
+

[PATCH v4 17/54] tcg/mips: Rationalize args to tcg_out_qemu_{ld,st}

2023-05-03 Thread Richard Henderson

Interpret the variable argument placement in the caller.  There are
several places where we already convert back from bool to type.
Clean things up by using type throughout.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 tcg/mips/tcg-target.c.inc | 186 +++---
 1 file changed, 95 insertions(+), 91 deletions(-)

diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
index a83ebe8729..ef8350e9cd 100644
--- a/tcg/mips/tcg-target.c.inc
+++ b/tcg/mips/tcg-target.c.inc
@@ -1479,7 +1479,7 @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, 
TCGLabelQemuLdst *l)
 #endif /* SOFTMMU */
 
 static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg lo, TCGReg hi,
-   TCGReg base, MemOp opc, bool is_64)
+   TCGReg base, MemOp opc, TCGType type)
 {
 switch (opc & (MO_SSIZE | MO_BSWAP)) {
 case MO_UB:
@@ -1503,7 +1503,7 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg 
lo, TCGReg hi,
 tcg_out_opc_imm(s, OPC_LH, lo, base, 0);
 break;
 case MO_UL | MO_BSWAP:
-if (TCG_TARGET_REG_BITS == 64 && is_64) {
+if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
 if (use_mips32r2_instructions) {
 tcg_out_opc_imm(s, OPC_LWU, lo, base, 0);
 tcg_out_bswap32(s, lo, lo, TCG_BSWAP_IZ | TCG_BSWAP_OZ);
@@ -1528,7 +1528,7 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg 
lo, TCGReg hi,
 }
 break;
 case MO_UL:
-if (TCG_TARGET_REG_BITS == 64 && is_64) {
+if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
 tcg_out_opc_imm(s, OPC_LWU, lo, base, 0);
 break;
 }
@@ -1583,7 +1583,7 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg 
lo, TCGReg hi,
 }
 
 static void tcg_out_qemu_ld_unalign(TCGContext *s, TCGReg lo, TCGReg hi,
-TCGReg base, MemOp opc, bool is_64)
+TCGReg base, MemOp opc, TCGType type)
 {
 const MIPSInsn lw1 = MIPS_BE ? OPC_LWL : OPC_LWR;
 const MIPSInsn lw2 = MIPS_BE ? OPC_LWR : OPC_LWL;
@@ -1623,7 +1623,7 @@ static void tcg_out_qemu_ld_unalign(TCGContext *s, TCGReg 
lo, TCGReg hi,
 case MO_UL:
 tcg_out_opc_imm(s, lw1, lo, base, 0);
 tcg_out_opc_imm(s, lw2, lo, base, 3);
-if (TCG_TARGET_REG_BITS == 64 && is_64 && !sgn) {
+if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64 && !sgn) {
 tcg_out_ext32u(s, lo, lo);
 }
 break;
@@ -1634,18 +1634,18 @@ static void tcg_out_qemu_ld_unalign(TCGContext *s, 
TCGReg lo, TCGReg hi,
 tcg_out_opc_imm(s, lw1, lo, base, 0);
 tcg_out_opc_imm(s, lw2, lo, base, 3);
 tcg_out_bswap32(s, lo, lo,
-TCG_TARGET_REG_BITS == 64 && is_64
+TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64
 ? (sgn ? TCG_BSWAP_OS : TCG_BSWAP_OZ) : 0);
 } else {
 const tcg_insn_unit *subr =
-(TCG_TARGET_REG_BITS == 64 && is_64 && !sgn
+(TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64 && !sgn
  ? bswap32u_addr : bswap32_addr);
 
 tcg_out_opc_imm(s, lw1, TCG_TMP0, base, 0);
 tcg_out_bswap_subr(s, subr);
 /* delay slot */
 tcg_out_opc_imm(s, lw2, TCG_TMP0, base, 3);
-tcg_out_mov(s, is_64 ? TCG_TYPE_I64 : TCG_TYPE_I32, lo, TCG_TMP3);
+tcg_out_mov(s, type, lo, TCG_TMP3);
 }
 break;
 
@@ -1702,68 +1702,59 @@ static void tcg_out_qemu_ld_unalign(TCGContext *s, 
TCGReg lo, TCGReg hi,
 }
 }
 
-static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is_64)
+static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi,
+TCGReg addrlo, TCGReg addrhi,
+MemOpIdx oi, TCGType data_type)
 {
-TCGReg addr_regl, addr_regh __attribute__((unused));
-TCGReg data_regl, data_regh;
-MemOpIdx oi;
-MemOp opc;
-#if defined(CONFIG_SOFTMMU)
-tcg_insn_unit *label_ptr[2];
-#else
-#endif
-unsigned a_bits, s_bits;
-TCGReg base = TCG_REG_A0;
-
-data_regl = *args++;
-data_regh = (TCG_TARGET_REG_BITS == 32 && is_64 ? *args++ : 0);
-addr_regl = *args++;
-addr_regh = (TCG_TARGET_REG_BITS < TARGET_LONG_BITS ? *args++ : 0);
-oi = *args++;
-opc = get_memop(oi);
-a_bits = get_alignment_bits(opc);
-s_bits = opc & MO_SIZE;
+MemOp opc = get_memop(oi);
+unsigned a_bits = get_alignment_bits(opc);
+unsigned s_bits = opc & MO_SIZE;
+TCGReg base;
 
 /*
  * R6 removes the left/right instructions but requires the
  * system to support misaligned memory accesses.
  */
 #if defined(CONFIG_SOFTMMU)
-tcg_out_tlb_load(s, base, addr_regl, addr_regh, oi, label_ptr, 1);
+

[PATCH v4 04/57] accel/tcg: Reorg system mode load helpers

2023-05-03 Thread Richard Henderson

Instead of trying to unify all operations on uint64_t, pull out
mmu_lookup() to perform the basic tlb hit and resolution.
Create individual functions to handle access by size.

Reviewed-by: Alex Bennée 
Signed-off-by: Richard Henderson 
---
 accel/tcg/cputlb.c | 644 +
 1 file changed, 423 insertions(+), 221 deletions(-)

diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index 5051244c67..dd68514260 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -1716,6 +1716,178 @@ bool tlb_plugin_lookup(CPUState *cpu, target_ulong 
addr, int mmu_idx,
 
 #endif
 
+/*
+ * Probe for a load/store operation.
+ * Return the host address, and store the TLB flags into @flags.
+ */
+
+typedef struct MMULookupPageData {
+CPUTLBEntryFull *full;
+void *haddr;
+target_ulong addr;
+int flags;
+int size;
+} MMULookupPageData;
+
+typedef struct MMULookupLocals {
+MMULookupPageData page[2];
+MemOp memop;
+int mmu_idx;
+} MMULookupLocals;
+
+/**
+ * mmu_lookup1: translate one page
+ * @env: cpu context
+ * @data: lookup parameters
+ * @mmu_idx: virtual address context
+ * @access_type: load/store/code
+ * @ra: return address into tcg generated code, or 0
+ *
+ * Resolve the translation for the one page at @data.addr, filling in
+ * the rest of @data with the results.  If the translation fails,
+ * tlb_fill will longjmp out.  Return true if the softmmu tlb for
+ * @mmu_idx may have resized.
+ */
+static bool mmu_lookup1(CPUArchState *env, MMULookupPageData *data,
+int mmu_idx, MMUAccessType access_type, uintptr_t ra)
+{
+target_ulong addr = data->addr;
+uintptr_t index = tlb_index(env, mmu_idx, addr);
+CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
+target_ulong tlb_addr = tlb_read_idx(entry, access_type);
+bool maybe_resized = false;
+
+/* If the TLB entry is for a different page, reload and try again.  */
+if (!tlb_hit(tlb_addr, addr)) {
+if (!victim_tlb_hit(env, mmu_idx, index, access_type,
+addr & TARGET_PAGE_MASK)) {
+tlb_fill(env_cpu(env), addr, data->size, access_type, mmu_idx, ra);
+maybe_resized = true;
+index = tlb_index(env, mmu_idx, addr);
+entry = tlb_entry(env, mmu_idx, addr);
+}
+tlb_addr = tlb_read_idx(entry, access_type) & ~TLB_INVALID_MASK;
+}
+
+data->flags = tlb_addr & TLB_FLAGS_MASK;
+data->full = &env_tlb(env)->d[mmu_idx].fulltlb[index];
+/* Compute haddr speculatively; depending on flags it might be invalid. */
+data->haddr = (void *)((uintptr_t)addr + entry->addend);
+
+return maybe_resized;
+}
+
+/**
+ * mmu_watch_or_dirty
+ * @env: cpu context
+ * @data: lookup parameters
+ * @access_type: load/store/code
+ * @ra: return address into tcg generated code, or 0
+ *
+ * Trigger watchpoints for @data.addr:@data.size;
+ * record writes to protected clean pages.
+ */
+static void mmu_watch_or_dirty(CPUArchState *env, MMULookupPageData *data,
+   MMUAccessType access_type, uintptr_t ra)
+{
+CPUTLBEntryFull *full = data->full;
+target_ulong addr = data->addr;
+int flags = data->flags;
+int size = data->size;
+
+/* On watchpoint hit, this will longjmp out.  */
+if (flags & TLB_WATCHPOINT) {
+int wp = access_type == MMU_DATA_STORE ? BP_MEM_WRITE : BP_MEM_READ;
+cpu_check_watchpoint(env_cpu(env), addr, size, full->attrs, wp, ra);
+flags &= ~TLB_WATCHPOINT;
+}
+
+if (flags & TLB_NOTDIRTY) {
+notdirty_write(env_cpu(env), addr, size, full, ra);
+flags &= ~TLB_NOTDIRTY;
+}
+data->flags = flags;
+}
+
+/**
+ * mmu_lookup: translate page(s)
+ * @env: cpu context
+ * @addr: virtual address
+ * @oi: combined mmu_idx and MemOp
+ * @ra: return address into tcg generated code, or 0
+ * @access_type: load/store/code
+ * @l: output result
+ *
+ * Resolve the translation for the page(s) beginning at @addr, for MemOp.size
+ * bytes.  Return true if the lookup crosses a page boundary.
+ */
+static bool mmu_lookup(CPUArchState *env, target_ulong addr, MemOpIdx oi,
+   uintptr_t ra, MMUAccessType type, MMULookupLocals *l)
+{
+unsigned a_bits;
+bool crosspage;
+int flags;
+
+l->memop = get_memop(oi);
+l->mmu_idx = get_mmuidx(oi);
+
+tcg_debug_assert(l->mmu_idx < NB_MMU_MODES);
+
+/* Handle CPU specific unaligned behaviour */
+a_bits = get_alignment_bits(l->memop);
+if (addr & ((1 << a_bits) - 1)) {
+cpu_unaligned_access(env_cpu(env), addr, type, l->mmu_idx, ra);
+}
+
+l->page[0].addr = addr;
+l->page[0].size = memop_size(l->memop);
+l->page[1].addr = (addr + l->page[0].size - 1) & TARGET_PAGE_MASK;
+l->page[1].size = 0;
+crosspage = (addr ^ l->page[1].addr) & TARGET_PAGE_MASK;
+
+if (likely(!crosspage)) {
+mmu_lookup1(env, &l->page[0], l->mmu_idx, type, ra);
+
+flags = l->page[0].flags;

[PATCH v4 29/54] tcg/sparc64: Pass TCGType to tcg_out_qemu_{ld,st}

2023-05-03 Thread Richard Henderson

We need to set this in TCGLabelQemuLdst, so plumb this
all the way through from tcg_out_op.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 tcg/sparc64/tcg-target.c.inc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tcg/sparc64/tcg-target.c.inc b/tcg/sparc64/tcg-target.c.inc
index dbe4bf96b9..7e6466d3b6 100644
--- a/tcg/sparc64/tcg-target.c.inc
+++ b/tcg/sparc64/tcg-target.c.inc
@@ -1178,7 +1178,7 @@ static const int qemu_st_opc[(MO_SIZE | MO_BSWAP) + 1] = {
 };
 
 static void tcg_out_qemu_ld(TCGContext *s, TCGReg data, TCGReg addr,
-MemOpIdx oi, bool is_64)
+MemOpIdx oi, TCGType data_type)
 {
 MemOp memop = get_memop(oi);
 tcg_insn_unit *label_ptr;
@@ -1636,10 +1636,10 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
 break;
 
 case INDEX_op_qemu_ld_i32:
-tcg_out_qemu_ld(s, a0, a1, a2, false);
+tcg_out_qemu_ld(s, a0, a1, a2, TCG_TYPE_I32);
 break;
 case INDEX_op_qemu_ld_i64:
-tcg_out_qemu_ld(s, a0, a1, a2, true);
+tcg_out_qemu_ld(s, a0, a1, a2, TCG_TYPE_I64);
 break;
 case INDEX_op_qemu_st_i32:
 tcg_out_qemu_st(s, a0, a1, a2, TCG_TYPE_I32);
-- 
2.34.1

[PATCH v4 08/57] target/loongarch: Do not include tcg-ldst.h

2023-05-03 Thread Richard Henderson

This header is supposed to be private to tcg and in fact
does not need to be included here at all.

Reviewed-by: Song Gao 
Signed-off-by: Richard Henderson 
---
 target/loongarch/csr_helper.c   | 1 -
 target/loongarch/iocsr_helper.c | 1 -
 2 files changed, 2 deletions(-)

diff --git a/target/loongarch/csr_helper.c b/target/loongarch/csr_helper.c
index 7e02787895..6526367946 100644
--- a/target/loongarch/csr_helper.c
+++ b/target/loongarch/csr_helper.c
@@ -15,7 +15,6 @@
 #include "exec/cpu_ldst.h"
 #include "hw/irq.h"
 #include "cpu-csr.h"
-#include "tcg/tcg-ldst.h"
 
 target_ulong helper_csrrd_pgd(CPULoongArchState *env)
 {
diff --git a/target/loongarch/iocsr_helper.c b/target/loongarch/iocsr_helper.c
index 505853e17b..dda9845d6c 100644
--- a/target/loongarch/iocsr_helper.c
+++ b/target/loongarch/iocsr_helper.c
@@ -12,7 +12,6 @@
 #include "exec/helper-proto.h"
 #include "exec/exec-all.h"
 #include "exec/cpu_ldst.h"
-#include "tcg/tcg-ldst.h"
 
 #define GET_MEMTXATTRS(cas) \
 ((MemTxAttrs){.requester_id = env_cpu(cas)->cpu_index})
-- 
2.34.1

[PATCH v4 47/54] tcg/mips: Simplify constraints on qemu_ld/st

2023-05-03 Thread Richard Henderson

The softmmu tlb uses TCG_REG_TMP[0-3], not any of the normally available
registers.  Now that we handle overlap between inputs and helper arguments,
and have eliminated use of A0, we can allow any allocatable reg.

Signed-off-by: Richard Henderson 
---
 tcg/mips/tcg-target-con-set.h | 13 +
 tcg/mips/tcg-target-con-str.h |  2 --
 tcg/mips/tcg-target.c.inc | 30 --
 3 files changed, 13 insertions(+), 32 deletions(-)

diff --git a/tcg/mips/tcg-target-con-set.h b/tcg/mips/tcg-target-con-set.h
index fe3e868a2f..864034f468 100644
--- a/tcg/mips/tcg-target-con-set.h
+++ b/tcg/mips/tcg-target-con-set.h
@@ -12,15 +12,13 @@
 C_O0_I1(r)
 C_O0_I2(rZ, r)
 C_O0_I2(rZ, rZ)
-C_O0_I2(SZ, S)
-C_O0_I3(SZ, S, S)
-C_O0_I3(SZ, SZ, S)
+C_O0_I3(rZ, r, r)
+C_O0_I3(rZ, rZ, r)
 C_O0_I4(rZ, rZ, rZ, rZ)
-C_O0_I4(SZ, SZ, S, S)
-C_O1_I1(r, L)
+C_O0_I4(rZ, rZ, r, r)
 C_O1_I1(r, r)
 C_O1_I2(r, 0, rZ)
-C_O1_I2(r, L, L)
+C_O1_I2(r, r, r)
 C_O1_I2(r, r, ri)
 C_O1_I2(r, r, rI)
 C_O1_I2(r, r, rIK)
@@ -30,7 +28,6 @@ C_O1_I2(r, rZ, rN)
 C_O1_I2(r, rZ, rZ)
 C_O1_I4(r, rZ, rZ, rZ, 0)
 C_O1_I4(r, rZ, rZ, rZ, rZ)
-C_O2_I1(r, r, L)
-C_O2_I2(r, r, L, L)
+C_O2_I1(r, r, r)
 C_O2_I2(r, r, r, r)
 C_O2_I4(r, r, rZ, rZ, rN, rN)
diff --git a/tcg/mips/tcg-target-con-str.h b/tcg/mips/tcg-target-con-str.h
index e4b2965c72..413c280a7a 100644
--- a/tcg/mips/tcg-target-con-str.h
+++ b/tcg/mips/tcg-target-con-str.h
@@ -9,8 +9,6 @@
  * REGS(letter, register_mask)
  */
 REGS('r', ALL_GENERAL_REGS)
-REGS('L', ALL_QLOAD_REGS)
-REGS('S', ALL_QSTORE_REGS)
 
 /*
  * Define constraint letters for constants:
diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
index 695c137023..5ad9867882 100644
--- a/tcg/mips/tcg-target.c.inc
+++ b/tcg/mips/tcg-target.c.inc
@@ -176,20 +176,6 @@ static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
 #define TCG_CT_CONST_WSZ  0x2000   /* word size */
 
 #define ALL_GENERAL_REGS  0xffffffffu
-#define NOA0_REGS (ALL_GENERAL_REGS & ~(1 << TCG_REG_A0))
-
-#ifdef CONFIG_SOFTMMU
-#define ALL_QLOAD_REGS \
-(NOA0_REGS & ~((TCG_TARGET_REG_BITS < TARGET_LONG_BITS) << TCG_REG_A2))
-#define ALL_QSTORE_REGS \
-(NOA0_REGS & ~(TCG_TARGET_REG_BITS < TARGET_LONG_BITS   \
-   ? (1 << TCG_REG_A2) | (1 << TCG_REG_A3)  \
-   : (1 << TCG_REG_A1)))
-#else
-#define ALL_QLOAD_REGS   NOA0_REGS
-#define ALL_QSTORE_REGS  NOA0_REGS
-#endif
-
 
 static bool is_p2m1(tcg_target_long val)
 {
@@ -2232,18 +2218,18 @@ static TCGConstraintSetIndex 
tcg_target_op_def(TCGOpcode op)
 
 case INDEX_op_qemu_ld_i32:
 return (TCG_TARGET_REG_BITS == 64 || TARGET_LONG_BITS == 32
-? C_O1_I1(r, L) : C_O1_I2(r, L, L));
+? C_O1_I1(r, r) : C_O1_I2(r, r, r));
 case INDEX_op_qemu_st_i32:
 return (TCG_TARGET_REG_BITS == 64 || TARGET_LONG_BITS == 32
-? C_O0_I2(SZ, S) : C_O0_I3(SZ, S, S));
+? C_O0_I2(rZ, r) : C_O0_I3(rZ, r, r));
 case INDEX_op_qemu_ld_i64:
-return (TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L)
-: TARGET_LONG_BITS == 32 ? C_O2_I1(r, r, L)
-: C_O2_I2(r, r, L, L));
+return (TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, r)
+: TARGET_LONG_BITS == 32 ? C_O2_I1(r, r, r)
+: C_O2_I2(r, r, r, r));
 case INDEX_op_qemu_st_i64:
-return (TCG_TARGET_REG_BITS == 64 ? C_O0_I2(SZ, S)
-: TARGET_LONG_BITS == 32 ? C_O0_I3(SZ, SZ, S)
-: C_O0_I4(SZ, SZ, S, S));
+return (TCG_TARGET_REG_BITS == 64 ? C_O0_I2(rZ, r)
+: TARGET_LONG_BITS == 32 ? C_O0_I3(rZ, rZ, r)
+: C_O0_I4(rZ, rZ, r, r));
 
 default:
 g_assert_not_reached();
-- 
2.34.1

[PATCH v4 22/54] tcg/riscv: Require TCG_TARGET_REG_BITS == 64

2023-05-03 Thread Richard Henderson

The port currently does not support "oversize" guests, which
means riscv32 can only target 32-bit guests.  We will soon be
building TCG once for all guests.  This implies that we can
only support riscv64.

Since all Linux distributions target riscv64 not riscv32,
this is not much of a restriction and simplifies the code.

The brcond2 and setcond2 opcodes are exclusive to 32-bit hosts,
so we can and should remove the stubs.

Reviewed-by: Philippe Mathieu-Daudé 
Reviewed-by: Daniel Henrique Barboza 
Signed-off-by: Richard Henderson 
---
 tcg/riscv/tcg-target-con-set.h |   8 --
 tcg/riscv/tcg-target.h |  22 ++--
 tcg/riscv/tcg-target.c.inc | 232 +
 3 files changed, 72 insertions(+), 190 deletions(-)

diff --git a/tcg/riscv/tcg-target-con-set.h b/tcg/riscv/tcg-target-con-set.h
index cf0ac4d751..d4cff673b0 100644
--- a/tcg/riscv/tcg-target-con-set.h
+++ b/tcg/riscv/tcg-target-con-set.h
@@ -13,18 +13,10 @@ C_O0_I1(r)
 C_O0_I2(LZ, L)
 C_O0_I2(rZ, r)
 C_O0_I2(rZ, rZ)
-C_O0_I3(LZ, L, L)
-C_O0_I3(LZ, LZ, L)
-C_O0_I4(LZ, LZ, L, L)
-C_O0_I4(rZ, rZ, rZ, rZ)
 C_O1_I1(r, L)
 C_O1_I1(r, r)
-C_O1_I2(r, L, L)
 C_O1_I2(r, r, ri)
 C_O1_I2(r, r, rI)
 C_O1_I2(r, rZ, rN)
 C_O1_I2(r, rZ, rZ)
-C_O1_I4(r, rZ, rZ, rZ, rZ)
-C_O2_I1(r, r, L)
-C_O2_I2(r, r, L, L)
 C_O2_I4(r, r, rZ, rZ, rM, rM)
diff --git a/tcg/riscv/tcg-target.h b/tcg/riscv/tcg-target.h
index 0deb33701f..dddf2486c1 100644
--- a/tcg/riscv/tcg-target.h
+++ b/tcg/riscv/tcg-target.h
@@ -25,11 +25,14 @@
 #ifndef RISCV_TCG_TARGET_H
 #define RISCV_TCG_TARGET_H
 
-#if __riscv_xlen == 32
-# define TCG_TARGET_REG_BITS 32
-#elif __riscv_xlen == 64
-# define TCG_TARGET_REG_BITS 64
+/*
+ * We don't support oversize guests.
+ * Since we will only build tcg once, this in turn requires a 64-bit host.
+ */
+#if __riscv_xlen != 64
+#error "unsupported code generation mode"
 #endif
+#define TCG_TARGET_REG_BITS 64
 
 #define TCG_TARGET_INSN_UNIT_SIZE 4
 #define TCG_TARGET_TLB_DISPLACEMENT_BITS 20
@@ -83,13 +86,8 @@ typedef enum {
 #define TCG_TARGET_STACK_ALIGN  16
 #define TCG_TARGET_CALL_STACK_OFFSET0
 #define TCG_TARGET_CALL_ARG_I32 TCG_CALL_ARG_NORMAL
-#if TCG_TARGET_REG_BITS == 32
-#define TCG_TARGET_CALL_ARG_I64 TCG_CALL_ARG_EVEN
-#define TCG_TARGET_CALL_ARG_I128TCG_CALL_ARG_EVEN
-#else
 #define TCG_TARGET_CALL_ARG_I64 TCG_CALL_ARG_NORMAL
 #define TCG_TARGET_CALL_ARG_I128TCG_CALL_ARG_NORMAL
-#endif
 #define TCG_TARGET_CALL_RET_I128TCG_CALL_RET_NORMAL
 
 /* optional instructions */
@@ -106,8 +104,8 @@ typedef enum {
 #define TCG_TARGET_HAS_sub2_i32 1
 #define TCG_TARGET_HAS_mulu2_i320
 #define TCG_TARGET_HAS_muls2_i320
-#define TCG_TARGET_HAS_muluh_i32(TCG_TARGET_REG_BITS == 32)
-#define TCG_TARGET_HAS_mulsh_i32(TCG_TARGET_REG_BITS == 32)
+#define TCG_TARGET_HAS_muluh_i320
+#define TCG_TARGET_HAS_mulsh_i320
 #define TCG_TARGET_HAS_ext8s_i321
 #define TCG_TARGET_HAS_ext16s_i32   1
 #define TCG_TARGET_HAS_ext8u_i321
@@ -128,7 +126,6 @@ typedef enum {
 #define TCG_TARGET_HAS_setcond2 1
 #define TCG_TARGET_HAS_qemu_st8_i32 0
 
-#if TCG_TARGET_REG_BITS == 64
 #define TCG_TARGET_HAS_movcond_i64  0
 #define TCG_TARGET_HAS_div_i64  1
 #define TCG_TARGET_HAS_rem_i64  1
@@ -165,7 +162,6 @@ typedef enum {
 #define TCG_TARGET_HAS_muls2_i640
 #define TCG_TARGET_HAS_muluh_i641
 #define TCG_TARGET_HAS_mulsh_i641
-#endif
 
 #define TCG_TARGET_DEFAULT_MO (0)
 
diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
index 266fe1433d..7a674ff5ce 100644
--- a/tcg/riscv/tcg-target.c.inc
+++ b/tcg/riscv/tcg-target.c.inc
@@ -137,15 +137,7 @@ static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind 
kind, int slot)
 #define SOFTMMU_RESERVE_REGS  0
 #endif
 
-
-static inline tcg_target_long sextreg(tcg_target_long val, int pos, int len)
-{
-if (TCG_TARGET_REG_BITS == 32) {
-return sextract32(val, pos, len);
-} else {
-return sextract64(val, pos, len);
-}
-}
+#define sextreg  sextract64
 
 /* test if a constant matches the constraint */
 static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
@@ -235,7 +227,6 @@ typedef enum {
 OPC_XOR = 0x4033,
 OPC_XORI = 0x4013,
 
-#if TCG_TARGET_REG_BITS == 64
 OPC_ADDIW = 0x1b,
 OPC_ADDW = 0x3b,
 OPC_DIVUW = 0x200503b,
@@ -250,23 +241,6 @@ typedef enum {
 OPC_SRLIW = 0x501b,
 OPC_SRLW = 0x503b,
 OPC_SUBW = 0x403b,
-#else
-/* Simplify code throughout by defining aliases for RV32.  */
-OPC_ADDIW = OPC_ADDI,
-OPC_ADDW = OPC_ADD,
-OPC_DIVUW = OPC_DIVU,
-OPC_DIVW = OPC_DIV,
-OPC_MULW = OPC_MUL,
-OPC_REMUW = OPC_REMU,
-OPC_REMW = OPC_REM,
-OPC_SLLIW = OPC_SLLI,
-OPC_SLLW = OPC_SLL,
-OPC_SRAIW = OPC_SRAI,
-OPC_SRAW = OPC_SRA,
-OPC_SRLIW = OPC_SRLI,
-OPC_SRLW = OPC_SRL,
-OPC_SUBW = OPC_SUB,
-#endif

[PATCH v4 03/57] accel/tcg: Introduce tlb_read_idx

2023-05-03 Thread Richard Henderson

Instead of playing with offsetof in various places, use
MMUAccessType to index an array.  This is easily defined
instead of the previous dummy padding array in the union.

Reviewed-by: Alex Bennée 
Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 include/exec/cpu-defs.h |   7 ++-
 include/exec/cpu_ldst.h |  26 --
 accel/tcg/cputlb.c  | 104 +---
 3 files changed, 59 insertions(+), 78 deletions(-)

diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h
index e1c498ef4b..a6e0cf1812 100644
--- a/include/exec/cpu-defs.h
+++ b/include/exec/cpu-defs.h
@@ -111,8 +111,11 @@ typedef struct CPUTLBEntry {
use the corresponding iotlb value.  */
 uintptr_t addend;
 };
-/* padding to get a power of two size */
-uint8_t dummy[1 << CPU_TLB_ENTRY_BITS];
+/*
+ * Padding to get a power of two size, as well as index
+ * access to addr_{read,write,code}.
+ */
+target_ulong addr_idx[(1 << CPU_TLB_ENTRY_BITS) / TARGET_LONG_SIZE];
 };
 } CPUTLBEntry;
 
diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
index c141f0394f..7c867c94c3 100644
--- a/include/exec/cpu_ldst.h
+++ b/include/exec/cpu_ldst.h
@@ -360,13 +360,29 @@ static inline void clear_helper_retaddr(void)
 /* Needed for TCG_OVERSIZED_GUEST */
 #include "tcg/tcg.h"
 
+static inline target_ulong tlb_read_idx(const CPUTLBEntry *entry,
+MMUAccessType access_type)
+{
+/* Do not rearrange the CPUTLBEntry structure members. */
+QEMU_BUILD_BUG_ON(offsetof(CPUTLBEntry, addr_read) !=
+  MMU_DATA_LOAD * TARGET_LONG_SIZE);
+QEMU_BUILD_BUG_ON(offsetof(CPUTLBEntry, addr_write) !=
+  MMU_DATA_STORE * TARGET_LONG_SIZE);
+QEMU_BUILD_BUG_ON(offsetof(CPUTLBEntry, addr_code) !=
+  MMU_INST_FETCH * TARGET_LONG_SIZE);
+
+const target_ulong *ptr = &entry->addr_idx[access_type];
+#if TCG_OVERSIZED_GUEST
+return *ptr;
+#else
+/* access_type might select .addr_write, so use qatomic_read */
+return qatomic_read(ptr);
+#endif
+}
+
 static inline target_ulong tlb_addr_write(const CPUTLBEntry *entry)
 {
-#if TCG_OVERSIZED_GUEST
-return entry->addr_write;
-#else
-return qatomic_read(&entry->addr_write);
-#endif
+return tlb_read_idx(entry, MMU_DATA_STORE);
 }
 
 /* Find the TLB index corresponding to the mmu_idx + address pair.  */
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index 3117886af1..5051244c67 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -1441,34 +1441,17 @@ static void io_writex(CPUArchState *env, 
CPUTLBEntryFull *full,
 }
 }
 
-static inline target_ulong tlb_read_ofs(CPUTLBEntry *entry, size_t ofs)
-{
-#if TCG_OVERSIZED_GUEST
-return *(target_ulong *)((uintptr_t)entry + ofs);
-#else
-/* ofs might correspond to .addr_write, so use qatomic_read */
-return qatomic_read((target_ulong *)((uintptr_t)entry + ofs));
-#endif
-}
-
 /* Return true if ADDR is present in the victim tlb, and has been copied
back to the main tlb.  */
 static bool victim_tlb_hit(CPUArchState *env, size_t mmu_idx, size_t index,
-   size_t elt_ofs, target_ulong page)
+   MMUAccessType access_type, target_ulong page)
 {
 size_t vidx;
 
 assert_cpu_is_self(env_cpu(env));
 for (vidx = 0; vidx < CPU_VTLB_SIZE; ++vidx) {
 CPUTLBEntry *vtlb = &env_tlb(env)->d[mmu_idx].vtable[vidx];
-target_ulong cmp;
-
-/* elt_ofs might correspond to .addr_write, so use qatomic_read */
-#if TCG_OVERSIZED_GUEST
-cmp = *(target_ulong *)((uintptr_t)vtlb + elt_ofs);
-#else
-cmp = qatomic_read((target_ulong *)((uintptr_t)vtlb + elt_ofs));
-#endif
+target_ulong cmp = tlb_read_idx(vtlb, access_type);
 
 if (cmp == page) {
 /* Found entry in victim tlb, swap tlb and iotlb.  */
@@ -1490,11 +1473,6 @@ static bool victim_tlb_hit(CPUArchState *env, size_t 
mmu_idx, size_t index,
 return false;
 }
 
-/* Macro to call the above, with local variables from the use context.  */
-#define VICTIM_TLB_HIT(TY, ADDR) \
-  victim_tlb_hit(env, mmu_idx, index, offsetof(CPUTLBEntry, TY), \
- (ADDR) & TARGET_PAGE_MASK)
-
 static void notdirty_write(CPUState *cpu, vaddr mem_vaddr, unsigned size,
CPUTLBEntryFull *full, uintptr_t retaddr)
 {
@@ -1527,29 +1505,12 @@ static int probe_access_internal(CPUArchState *env, 
target_ulong addr,
 {
 uintptr_t index = tlb_index(env, mmu_idx, addr);
 CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
-target_ulong tlb_addr, page_addr;
-size_t elt_ofs;
-int flags;
+target_ulong tlb_addr = tlb_read_idx(entry, access_type);
+target_ulong page_addr = addr & TARGET_PAGE_MASK;
+int flags = TLB_FLAGS_MASK;
 
-switch (access_type) {
-case MMU_DATA_LOAD:
-el

[PATCH v4 21/54] tcg/ppc: Introduce prepare_host_addr

2023-05-03 Thread Richard Henderson

Merge tcg_out_tlb_load, add_qemu_ldst_label, tcg_out_test_alignment,
and some code that lived in both tcg_out_qemu_ld and tcg_out_qemu_st
into one function that returns HostAddress and TCGLabelQemuLdst structures.

Signed-off-by: Richard Henderson 
---
 tcg/ppc/tcg-target.c.inc | 377 +--
 1 file changed, 168 insertions(+), 209 deletions(-)

diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index cd473deb36..7239335bdf 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -2003,140 +2003,6 @@ static void * const qemu_st_helpers[(MO_SIZE | 
MO_BSWAP) + 1] = {
 [MO_BEUQ] = helper_be_stq_mmu,
 };
 
-/* We expect to use a 16-bit negative offset from ENV.  */
-QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) > 0);
-QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) < -32768);
-
-/* Perform the TLB load and compare.  Places the result of the comparison
-   in CR7, loads the addend of the TLB into R3, and returns the register
-   containing the guest address (zero-extended into R4).  Clobbers R0 and R2. 
*/
-
-static TCGReg tcg_out_tlb_read(TCGContext *s, MemOp opc,
-   TCGReg addrlo, TCGReg addrhi,
-   int mem_index, bool is_read)
-{
-int cmp_off
-= (is_read
-   ? offsetof(CPUTLBEntry, addr_read)
-   : offsetof(CPUTLBEntry, addr_write));
-int fast_off = TLB_MASK_TABLE_OFS(mem_index);
-int mask_off = fast_off + offsetof(CPUTLBDescFast, mask);
-int table_off = fast_off + offsetof(CPUTLBDescFast, table);
-unsigned s_bits = opc & MO_SIZE;
-unsigned a_bits = get_alignment_bits(opc);
-
-/* Load tlb_mask[mmu_idx] and tlb_table[mmu_idx].  */
-tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R3, TCG_AREG0, mask_off);
-tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R4, TCG_AREG0, table_off);
-
-/* Extract the page index, shifted into place for tlb index.  */
-if (TCG_TARGET_REG_BITS == 32) {
-tcg_out_shri32(s, TCG_REG_TMP1, addrlo,
-   TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
-} else {
-tcg_out_shri64(s, TCG_REG_TMP1, addrlo,
-   TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
-}
-tcg_out32(s, AND | SAB(TCG_REG_R3, TCG_REG_R3, TCG_REG_TMP1));
-
-/* Load the TLB comparator.  */
-if (cmp_off == 0 && TCG_TARGET_REG_BITS >= TARGET_LONG_BITS) {
-uint32_t lxu = (TCG_TARGET_REG_BITS == 32 || TARGET_LONG_BITS == 32
-? LWZUX : LDUX);
-tcg_out32(s, lxu | TAB(TCG_REG_TMP1, TCG_REG_R3, TCG_REG_R4));
-} else {
-tcg_out32(s, ADD | TAB(TCG_REG_R3, TCG_REG_R3, TCG_REG_R4));
-if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
-tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_TMP1, TCG_REG_R3, cmp_off + 4);
-tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_R4, TCG_REG_R3, cmp_off);
-} else {
-tcg_out_ld(s, TCG_TYPE_TL, TCG_REG_TMP1, TCG_REG_R3, cmp_off);
-}
-}
-
-/* Load the TLB addend for use on the fast path.  Do this asap
-   to minimize any load use delay.  */
-tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R3, TCG_REG_R3,
-   offsetof(CPUTLBEntry, addend));
-
-/* Clear the non-page, non-alignment bits from the address */
-if (TCG_TARGET_REG_BITS == 32) {
-/* We don't support unaligned accesses on 32-bits.
- * Preserve the bottom bits and thus trigger a comparison
- * failure on unaligned accesses.
- */
-if (a_bits < s_bits) {
-a_bits = s_bits;
-}
-tcg_out_rlw(s, RLWINM, TCG_REG_R0, addrlo, 0,
-(32 - a_bits) & 31, 31 - TARGET_PAGE_BITS);
-} else {
-TCGReg t = addrlo;
-
-/* If the access is unaligned, we need to make sure we fail if we
- * cross a page boundary.  The trick is to add the access size-1
- * to the address before masking the low bits.  That will make the
- * address overflow to the next page if we cross a page boundary,
- * which will then force a mismatch of the TLB compare.
- */
-if (a_bits < s_bits) {
-unsigned a_mask = (1 << a_bits) - 1;
-unsigned s_mask = (1 << s_bits) - 1;
-tcg_out32(s, ADDI | TAI(TCG_REG_R0, t, s_mask - a_mask));
-t = TCG_REG_R0;
-}
-
-/* Mask the address for the requested alignment.  */
-if (TARGET_LONG_BITS == 32) {
-tcg_out_rlw(s, RLWINM, TCG_REG_R0, t, 0,
-(32 - a_bits) & 31, 31 - TARGET_PAGE_BITS);
-/* Zero-extend the address for use in the final address.  */
-tcg_out_ext32u(s, TCG_REG_R4, addrlo);
-addrlo = TCG_REG_R4;
-} else if (a_bits == 0) {
-tcg_out_rld(s, RLDICR, TCG_REG_R0, t, 0, 63 - TARGET_PAGE_BITS);
-} else {
-tcg_out_rld(s, RLDICL, TCG_REG_R0, t,
-64 - TARGET_PAGE_BITS, TARGET_PAGE_BITS - a_bits);
-

[PATCH v4 43/57] tcg/i386: Use atom_and_align_for_opc

2023-05-03 Thread Richard Henderson

No change to the ultimate load/store routines yet, so some atomicity
conditions not yet honored, but plumbs the change to alignment through
the relevant functions.

Signed-off-by: Richard Henderson 
---
 tcg/i386/tcg-target.c.inc | 34 ++
 1 file changed, 22 insertions(+), 12 deletions(-)

diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index 7c72bf6684..3e21f067d6 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -1774,6 +1774,8 @@ typedef struct {
 int index;
 int ofs;
 int seg;
+MemOp align;
+MemOp atom;
 } HostAddress;
 
 bool tcg_target_has_memory_bswap(MemOp memop)
@@ -1895,8 +1897,12 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext 
*s, HostAddress *h,
 {
 TCGLabelQemuLdst *ldst = NULL;
 MemOp opc = get_memop(oi);
-unsigned a_bits = get_alignment_bits(opc);
-unsigned a_mask = (1 << a_bits) - 1;
+MemOp atom_u;
+unsigned a_mask;
+
+h->align = atom_and_align_for_opc(s, &h->atom, &atom_u, opc,
+  MO_ATOM_IFALIGN, false);
+a_mask = (1 << h->align) - 1;
 
 #ifdef CONFIG_SOFTMMU
 int cmp_ofs = is_ld ? offsetof(CPUTLBEntry, addr_read)
@@ -1941,10 +1947,12 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext 
*s, HostAddress *h,
  TLB_MASK_TABLE_OFS(mem_index) +
  offsetof(CPUTLBDescFast, table));
 
-/* If the required alignment is at least as large as the access, simply
-   copy the address and mask.  For lesser alignments, check that we don't
-   cross pages for the complete access.  */
-if (a_bits >= s_bits) {
+/*
+ * If the required alignment is at least as large as the access, simply
+ * copy the address and mask.  For lesser alignments, check that we don't
+ * cross pages for the complete access.
+ */
+if (a_mask >= s_mask) {
 tcg_out_mov(s, ttype, TCG_REG_L1, addrlo);
 } else {
 tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1,
@@ -1976,12 +1984,12 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext 
*s, HostAddress *h,
 tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_L0,
offsetof(CPUTLBEntry, addend));
 
-*h = (HostAddress) {
-.base = addrlo,
-.index = TCG_REG_L0,
-};
+h->base = addrlo;
+h->index = TCG_REG_L0;
+h->ofs = 0;
+h->seg = 0;
 #else
-if (a_bits) {
+if (a_mask) {
 ldst = new_ldst_label(s);
 
 ldst->is_ld = is_ld;
@@ -1996,8 +2004,10 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext 
*s, HostAddress *h,
 s->code_ptr += 4;
 }
 
-*h = x86_guest_base;
 h->base = addrlo;
+h->index = x86_guest_base.index;
+h->ofs = x86_guest_base.ofs;
+h->seg = x86_guest_base.seg;
 #endif
 
 return ldst;
-- 
2.34.1

[PATCH v4 45/57] tcg/arm: Use atom_and_align_for_opc

2023-05-03 Thread Richard Henderson

No change to the ultimate load/store routines yet, so some atomicity
conditions not yet honored, but plumbs the change to alignment through
the relevant functions.

Signed-off-by: Richard Henderson 
---
 tcg/arm/tcg-target.c.inc | 13 +++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
index e5aed03247..edd995e04f 100644
--- a/tcg/arm/tcg-target.c.inc
+++ b/tcg/arm/tcg-target.c.inc
@@ -1323,6 +1323,8 @@ typedef struct {
 TCGReg base;
 int index;
 bool index_scratch;
+MemOp align;
+MemOp atom;
 } HostAddress;
 
 bool tcg_target_has_memory_bswap(MemOp memop)
@@ -1379,8 +1381,12 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext 
*s, HostAddress *h,
 {
 TCGLabelQemuLdst *ldst = NULL;
 MemOp opc = get_memop(oi);
-MemOp a_bits = get_alignment_bits(opc);
-unsigned a_mask = (1 << a_bits) - 1;
+MemOp a_bits, atom_a, atom_u;
+unsigned a_mask;
+
+a_bits = atom_and_align_for_opc(s, &atom_a, &atom_u, opc,
+MO_ATOM_IFALIGN, false);
+a_mask = (1 << a_bits) - 1;
 
 #ifdef CONFIG_SOFTMMU
 int mem_index = get_mmuidx(oi);
@@ -1498,6 +1504,9 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, 
HostAddress *h,
 };
 #endif
 
+h->align = a_bits;
+h->atom = atom_a;
+
 return ldst;
 }
 
-- 
2.34.1

[PATCH v4 24/57] tcg/loongarch64: Use full load/store helpers in user-only mode

2023-05-03 Thread Richard Henderson

Instead of using helper_unaligned_{ld,st}, use the full load/store helpers.
This will allow the fast path to increase alignment to implement atomicity
while not immediately raising an alignment exception.

Signed-off-by: Richard Henderson 
---
 tcg/loongarch64/tcg-target.c.inc | 30 --
 1 file changed, 30 deletions(-)

diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
index d1bc29826f..e651ec5c71 100644
--- a/tcg/loongarch64/tcg-target.c.inc
+++ b/tcg/loongarch64/tcg-target.c.inc
@@ -783,7 +783,6 @@ static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg 
val,
  * Load/store helpers for SoftMMU, and qemu_ld/st implementations
  */
 
-#if defined(CONFIG_SOFTMMU)
 static bool tcg_out_goto(TCGContext *s, const tcg_insn_unit *target)
 {
 tcg_out_opc_b(s, 0);
@@ -822,35 +821,6 @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, 
TCGLabelQemuLdst *l)
 tcg_out_call_int(s, qemu_st_helpers[opc & MO_SIZE], false);
 return tcg_out_goto(s, l->raddr);
 }
-#else
-static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l)
-{
-/* resolve label address */
-if (!reloc_br_sk16(l->label_ptr[0], tcg_splitwx_to_rx(s->code_ptr))) {
-return false;
-}
-
-tcg_out_mov(s, TCG_TYPE_TL, TCG_REG_A1, l->addrlo_reg);
-tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_A0, TCG_AREG0);
-
-/* tail call, with the return address back inline. */
-tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_RA, (uintptr_t)l->raddr);
-tcg_out_call_int(s, (const void *)(l->is_ld ? helper_unaligned_ld
-   : helper_unaligned_st), true);
-return true;
-}
-
-static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
-{
-return tcg_out_fail_alignment(s, l);
-}
-
-static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
-{
-return tcg_out_fail_alignment(s, l);
-}
-
-#endif /* CONFIG_SOFTMMU */
 
 typedef struct {
 TCGReg base;
-- 
2.34.1

[PATCH v4 27/57] tcg/arm: Use full load/store helpers in user-only mode

2023-05-03 Thread Richard Henderson

Instead of using helper_unaligned_{ld,st}, use the full load/store helpers.
This will allow the fast path to increase alignment to implement atomicity
while not immediately raising an alignment exception.

Signed-off-by: Richard Henderson 
---
 tcg/arm/tcg-target.c.inc | 45 
 1 file changed, 45 deletions(-)

diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
index a02804dd69..eb0542f32e 100644
--- a/tcg/arm/tcg-target.c.inc
+++ b/tcg/arm/tcg-target.c.inc
@@ -1325,7 +1325,6 @@ typedef struct {
 bool index_scratch;
 } HostAddress;
 
-#ifdef CONFIG_SOFTMMU
 static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg)
 {
 /* We arrive at the slow path via "BLNE", so R14 contains l->raddr. */
@@ -1368,50 +1367,6 @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, 
TCGLabelQemuLdst *lb)
 tcg_out_goto(s, COND_AL, qemu_st_helpers[opc & MO_SIZE]);
 return true;
 }
-#else
-static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l)
-{
-if (!reloc_pc24(l->label_ptr[0], tcg_splitwx_to_rx(s->code_ptr))) {
-return false;
-}
-
-if (TARGET_LONG_BITS == 64) {
-/* 64-bit target address is aligned into R2:R3. */
-TCGMovExtend ext[2] = {
-{ .dst = TCG_REG_R2, .dst_type = TCG_TYPE_I32,
-  .src = l->addrlo_reg,
-  .src_type = TCG_TYPE_I32, .src_ext = MO_UL },
-{ .dst = TCG_REG_R3, .dst_type = TCG_TYPE_I32,
-  .src = l->addrhi_reg,
-  .src_type = TCG_TYPE_I32, .src_ext = MO_UL },
-};
-tcg_out_movext2(s, &ext[0], &ext[1], TCG_REG_TMP);
-} else {
-tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R1, l->addrlo_reg);
-}
-tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_R0, TCG_AREG0);
-
-/*
- * Tail call to the helper, with the return address back inline,
- * just for the clarity of the debugging traceback -- the helper
- * cannot return.  We have used BLNE to arrive here, so LR is
- * already set.
- */
-tcg_out_goto(s, COND_AL, (const void *)
- (l->is_ld ? helper_unaligned_ld : helper_unaligned_st));
-return true;
-}
-
-static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
-{
-return tcg_out_fail_alignment(s, l);
-}
-
-static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
-{
-return tcg_out_fail_alignment(s, l);
-}
-#endif /* SOFTMMU */
 
 static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
TCGReg addrlo, TCGReg addrhi,
-- 
2.34.1

[PATCH v4 38/57] tcg/riscv: Support softmmu unaligned accesses

2023-05-03 Thread Richard Henderson

The system is required to emulate unaligned accesses, even if the
hardware does not support them.  The resulting trap may or may not
be more efficient than the qemu slow path.  There are linux kernel
patches in flight to allow userspace to query hardware support;
we can re-evaluate whether to enable this by default after that.

In the meantime, softmmu now matches useronly, where we already
assumed that unaligned accesses are supported.

Signed-off-by: Richard Henderson 
---
 tcg/riscv/tcg-target.c.inc | 48 ++
 1 file changed, 28 insertions(+), 20 deletions(-)

diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
index 19cd4507fb..415e6c6e15 100644
--- a/tcg/riscv/tcg-target.c.inc
+++ b/tcg/riscv/tcg-target.c.inc
@@ -910,12 +910,13 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, 
TCGReg *pbase,
 
 #ifdef CONFIG_SOFTMMU
 unsigned s_bits = opc & MO_SIZE;
+unsigned s_mask = (1u << s_bits) - 1;
 int mem_index = get_mmuidx(oi);
 int fast_ofs = TLB_MASK_TABLE_OFS(mem_index);
 int mask_ofs = fast_ofs + offsetof(CPUTLBDescFast, mask);
 int table_ofs = fast_ofs + offsetof(CPUTLBDescFast, table);
-TCGReg mask_base = TCG_AREG0, table_base = TCG_AREG0;
-tcg_target_long compare_mask;
+int compare_mask;
+TCGReg addr_adj;
 
 ldst = new_ldst_label(s);
 ldst->is_ld = is_ld;
@@ -924,14 +925,33 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, 
TCGReg *pbase,
 
 QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) > 0);
 QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) < -(1 << 11));
-tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP0, mask_base, mask_ofs);
-tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, table_base, table_ofs);
+tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP0, TCG_AREG0, mask_ofs);
+tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_AREG0, table_ofs);
 
 tcg_out_opc_imm(s, OPC_SRLI, TCG_REG_TMP2, addr_reg,
 TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
 tcg_out_opc_reg(s, OPC_AND, TCG_REG_TMP2, TCG_REG_TMP2, TCG_REG_TMP0);
 tcg_out_opc_reg(s, OPC_ADD, TCG_REG_TMP2, TCG_REG_TMP2, TCG_REG_TMP1);
 
+/*
+ * For aligned accesses, we check the first byte and include the alignment
+ * bits within the address.  For unaligned access, we check that we don't
+ * cross pages using the address of the last byte of the access.
+ */
+addr_adj = addr_reg;
+if (a_bits < s_bits) {
+addr_adj = TCG_REG_TMP0;
+tcg_out_opc_imm(s, TARGET_LONG_BITS == 32 ? OPC_ADDIW : OPC_ADDI,
+addr_adj, addr_reg, s_mask - a_mask);
+}
+compare_mask = TARGET_PAGE_MASK | a_mask;
+if (compare_mask == sextreg(compare_mask, 0, 12)) {
+tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_TMP1, addr_adj, compare_mask);
+} else {
+tcg_out_movi(s, TCG_TYPE_TL, TCG_REG_TMP1, compare_mask);
+tcg_out_opc_reg(s, OPC_AND, TCG_REG_TMP1, TCG_REG_TMP1, addr_adj);
+}
+
 /* Load the tlb comparator and the addend.  */
 tcg_out_ld(s, TCG_TYPE_TL, TCG_REG_TMP0, TCG_REG_TMP2,
is_ld ? offsetof(CPUTLBEntry, addr_read)
@@ -939,29 +959,17 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, 
TCGReg *pbase,
 tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP2, TCG_REG_TMP2,
offsetof(CPUTLBEntry, addend));
 
-/* We don't support unaligned accesses. */
-if (a_bits < s_bits) {
-a_bits = s_bits;
-}
-/* Clear the non-page, non-alignment bits from the address.  */
-compare_mask = (tcg_target_long)TARGET_PAGE_MASK | a_mask;
-if (compare_mask == sextreg(compare_mask, 0, 12)) {
-tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_TMP1, addr_reg, compare_mask);
-} else {
-tcg_out_movi(s, TCG_TYPE_TL, TCG_REG_TMP1, compare_mask);
-tcg_out_opc_reg(s, OPC_AND, TCG_REG_TMP1, TCG_REG_TMP1, addr_reg);
-}
-
 /* Compare masked address with the TLB entry. */
 ldst->label_ptr[0] = s->code_ptr;
 tcg_out_opc_branch(s, OPC_BNE, TCG_REG_TMP0, TCG_REG_TMP1, 0);
 
 /* TLB Hit - translate address using addend.  */
+addr_adj = addr_reg;
 if (TARGET_LONG_BITS == 32) {
-tcg_out_ext32u(s, TCG_REG_TMP0, addr_reg);
-addr_reg = TCG_REG_TMP0;
+addr_adj = TCG_REG_TMP0;
+tcg_out_ext32u(s, addr_adj, addr_reg);
 }
-tcg_out_opc_reg(s, OPC_ADD, TCG_REG_TMP0, TCG_REG_TMP2, addr_reg);
+tcg_out_opc_reg(s, OPC_ADD, TCG_REG_TMP0, TCG_REG_TMP2, addr_adj);
 *pbase = TCG_REG_TMP0;
 #else
 if (a_mask) {
-- 
2.34.1

[PATCH v4 56/57] tcg/ppc: Support 128-bit load/store

2023-05-03 Thread Richard Henderson

Use LQ/STQ with ISA v2.07 when 16-byte atomicity is required.
Note that these instructions do not require 16-byte alignment.

Signed-off-by: Richard Henderson 
---
 tcg/ppc/tcg-target-con-set.h |   2 +
 tcg/ppc/tcg-target-con-str.h |   1 +
 tcg/ppc/tcg-target.h |   3 +-
 tcg/ppc/tcg-target.c.inc | 173 +++
 4 files changed, 158 insertions(+), 21 deletions(-)

diff --git a/tcg/ppc/tcg-target-con-set.h b/tcg/ppc/tcg-target-con-set.h
index f206b29205..bbd7b21247 100644
--- a/tcg/ppc/tcg-target-con-set.h
+++ b/tcg/ppc/tcg-target-con-set.h
@@ -14,6 +14,7 @@ C_O0_I2(r, r)
 C_O0_I2(r, ri)
 C_O0_I2(v, r)
 C_O0_I3(r, r, r)
+C_O0_I3(o, m, r)
 C_O0_I4(r, r, ri, ri)
 C_O0_I4(r, r, r, r)
 C_O1_I1(r, r)
@@ -34,6 +35,7 @@ C_O1_I3(v, v, v, v)
 C_O1_I4(r, r, ri, rZ, rZ)
 C_O1_I4(r, r, r, ri, ri)
 C_O2_I1(r, r, r)
+C_O2_I1(o, m, r)
 C_O2_I2(r, r, r, r)
 C_O2_I4(r, r, rI, rZM, r, r)
 C_O2_I4(r, r, r, r, rI, rZM)
diff --git a/tcg/ppc/tcg-target-con-str.h b/tcg/ppc/tcg-target-con-str.h
index 094613cbcb..20846901de 100644
--- a/tcg/ppc/tcg-target-con-str.h
+++ b/tcg/ppc/tcg-target-con-str.h
@@ -9,6 +9,7 @@
  * REGS(letter, register_mask)
  */
 REGS('r', ALL_GENERAL_REGS)
+REGS('o', ALL_GENERAL_REGS & 0xAAAAAAAAu)  /* odd registers */
 REGS('v', ALL_VECTOR_REGS)
 
 /*
diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
index 0914380bd7..204b70f86a 100644
--- a/tcg/ppc/tcg-target.h
+++ b/tcg/ppc/tcg-target.h
@@ -149,7 +149,8 @@ extern bool have_vsx;
 #define TCG_TARGET_HAS_mulsh_i641
 #endif
 
-#define TCG_TARGET_HAS_qemu_ldst_i128   0
+#define TCG_TARGET_HAS_qemu_ldst_i128   \
+(TCG_TARGET_REG_BITS == 64 && have_isa_2_07)
 
 /*
  * While technically Altivec could support V64, it has no 64-bit store
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index 60375804cd..682743a466 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -295,25 +295,27 @@ static bool tcg_target_const_match(int64_t val, TCGType 
type, int ct)
 
 #define B  OPCD( 18)
 #define BC OPCD( 16)
+
 #define LBZOPCD( 34)
 #define LHZOPCD( 40)
 #define LHAOPCD( 42)
 #define LWZOPCD( 32)
 #define LWZUX  XO31( 55)
-#define STBOPCD( 38)
-#define STHOPCD( 44)
-#define STWOPCD( 36)
-
-#define STDXO62(  0)
-#define STDU   XO62(  1)
-#define STDX   XO31(149)
-
 #define LD XO58(  0)
 #define LDXXO31( 21)
 #define LDUXO58(  1)
 #define LDUX   XO31( 53)
 #define LWAXO58(  2)
 #define LWAX   XO31(341)
+#define LQ OPCD( 56)
+
+#define STBOPCD( 38)
+#define STHOPCD( 44)
+#define STWOPCD( 36)
+#define STDXO62(  0)
+#define STDU   XO62(  1)
+#define STDX   XO31(149)
+#define STQXO62(  2)
 
 #define ADDIC  OPCD( 12)
 #define ADDI   OPCD( 14)
@@ -2015,11 +2017,25 @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, 
TCGLabelQemuLdst *lb)
 typedef struct {
 TCGReg base;
 TCGReg index;
+MemOp align;
+MemOp atom;
 } HostAddress;
 
 bool tcg_target_has_memory_bswap(MemOp memop)
 {
-return true;
+MemOp atom_a, atom_u;
+
+if ((memop & MO_SIZE) <= MO_64) {
+return true;
+}
+
+/*
+ * Reject 16-byte memop with 16-byte atomicity,
+ * but do allow a pair of 64-bit operations.
+ */
+(void)atom_and_align_for_opc(tcg_ctx, &atom_a, &atom_u, memop,
+ MO_ATOM_IFALIGN, true);
+return atom_a <= MO_64;
 }
 
 /*
@@ -2034,7 +2050,7 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, 
HostAddress *h,
 {
 TCGLabelQemuLdst *ldst = NULL;
 MemOp opc = get_memop(oi);
-MemOp a_bits, atom_a, atom_u;
+MemOp a_bits, atom_u, s_bits;
 
 /*
  * Book II, Section 1.4, Single-Copy Atomicity, specifies:
@@ -2046,10 +2062,19 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext 
*s, HostAddress *h,
  * As of 3.0, "the non-atomic access is performed as described in
  * the corresponding list", which matches MO_ATOM_SUBALIGN.
  */
-a_bits = atom_and_align_for_opc(s, &atom_a, &atom_u, opc,
+s_bits = opc & MO_SIZE;
+a_bits = atom_and_align_for_opc(s, &h->atom, &atom_u, opc,
 have_isa_3_00 ? MO_ATOM_SUBALIGN
   : MO_ATOM_IFALIGN,
-false);
+s_bits == MO_128);
+
+if (TCG_TARGET_REG_BITS == 32) {
+/* We don't support unaligned accesses on 32-bits. */
+if (a_bits < s_bits) {
+a_bits = s_bits;
+}
+}
+h->align = a_bits;
 
 #ifdef CONFIG_SOFTMMU
 int mem_index = get_mmuidx(oi);
@@ -2058,7 +2083,6 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, 
HostAddress *h,
 int fast_off = TLB_MASK_TABLE_OFS(mem_index);
 int mask_off = fast_off + offsetof(CPUTLBDescFast, mask);
 int table_off = fast_off + offsetof(CPUTLBDescFast, table);
-unsigned s_bits = opc & MO_SIZE;
 
 ldst = new

[PATCH v4 48/57] tcg/ppc: Use atom_and_align_for_opc

2023-05-03 Thread Richard Henderson

Signed-off-by: Richard Henderson 
---
 tcg/ppc/tcg-target.c.inc | 17 -
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index f0a4118bbb..60375804cd 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -2034,7 +2034,22 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext 
*s, HostAddress *h,
 {
 TCGLabelQemuLdst *ldst = NULL;
 MemOp opc = get_memop(oi);
-unsigned a_bits = get_alignment_bits(opc);
+MemOp a_bits, atom_a, atom_u;
+
+/*
+ * Book II, Section 1.4, Single-Copy Atomicity, specifies:
+ *
+ * Before 3.0, "An access that is not atomic is performed as a set of
+ * smaller disjoint atomic accesses. In general, the number and alignment
+ * of these accesses are implementation-dependent."  Thus MO_ATOM_IFALIGN.
+ *
+ * As of 3.0, "the non-atomic access is performed as described in
+ * the corresponding list", which matches MO_ATOM_SUBALIGN.
+ */
+a_bits = atom_and_align_for_opc(s, &atom_a, &atom_u, opc,
+have_isa_3_00 ? MO_ATOM_SUBALIGN
+  : MO_ATOM_IFALIGN,
+false);
 
 #ifdef CONFIG_SOFTMMU
 int mem_index = get_mmuidx(oi);
-- 
2.34.1

[PATCH v4 32/54] tcg: Introduce arg_slot_stk_ofs

2023-05-03 Thread Richard Henderson

Unify all computation of argument stack offset in one function.
This requires that we adjust ref_slot to be in the same units,
by adding max_reg_slots during init_call_layout.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 tcg/tcg.c | 29 +
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/tcg/tcg.c b/tcg/tcg.c
index fa28db0188..057423c121 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -816,6 +816,15 @@ static inline bool arg_slot_reg_p(unsigned arg_slot)
 return arg_slot < nreg;
 }
 
+static inline int arg_slot_stk_ofs(unsigned arg_slot)
+{
+unsigned max = TCG_STATIC_CALL_ARGS_SIZE / sizeof(tcg_target_long);
+unsigned stk_slot = arg_slot - ARRAY_SIZE(tcg_target_call_iarg_regs);
+
+tcg_debug_assert(stk_slot < max);
+return TCG_TARGET_CALL_STACK_OFFSET + stk_slot * sizeof(tcg_target_long);
+}
+
 typedef struct TCGCumulativeArgs {
 int arg_idx;/* tcg_gen_callN args[] */
 int info_in_idx;/* TCGHelperInfo in[] */
@@ -1055,6 +1064,7 @@ static void init_call_layout(TCGHelperInfo *info)
 }
 }
 assert(ref_base + cum.ref_slot <= max_stk_slots);
+ref_base += max_reg_slots;
 
 if (ref_base != 0) {
 for (int i = cum.info_in_idx - 1; i >= 0; --i) {
@@ -4826,7 +4836,7 @@ static void load_arg_reg(TCGContext *s, TCGReg reg, 
TCGTemp *ts,
 }
 }
 
-static void load_arg_stk(TCGContext *s, int stk_slot, TCGTemp *ts,
+static void load_arg_stk(TCGContext *s, unsigned arg_slot, TCGTemp *ts,
  TCGRegSet allocated_regs)
 {
 /*
@@ -4836,8 +4846,7 @@ static void load_arg_stk(TCGContext *s, int stk_slot, 
TCGTemp *ts,
  */
 temp_load(s, ts, tcg_target_available_regs[ts->type], allocated_regs, 0);
 tcg_out_st(s, ts->type, ts->reg, TCG_REG_CALL_STACK,
-   TCG_TARGET_CALL_STACK_OFFSET +
-   stk_slot * sizeof(tcg_target_long));
+   arg_slot_stk_ofs(arg_slot));
 }
 
 static void load_arg_normal(TCGContext *s, const TCGCallArgumentLoc *l,
@@ -4848,18 +4857,16 @@ static void load_arg_normal(TCGContext *s, const 
TCGCallArgumentLoc *l,
 load_arg_reg(s, reg, ts, *allocated_regs);
 tcg_regset_set_reg(*allocated_regs, reg);
 } else {
-load_arg_stk(s, l->arg_slot - ARRAY_SIZE(tcg_target_call_iarg_regs),
- ts, *allocated_regs);
+load_arg_stk(s, l->arg_slot, ts, *allocated_regs);
 }
 }
 
-static void load_arg_ref(TCGContext *s, int arg_slot, TCGReg ref_base,
+static void load_arg_ref(TCGContext *s, unsigned arg_slot, TCGReg ref_base,
  intptr_t ref_off, TCGRegSet *allocated_regs)
 {
 TCGReg reg;
-int stk_slot = arg_slot - ARRAY_SIZE(tcg_target_call_iarg_regs);
 
-if (stk_slot < 0) {
+if (arg_slot_reg_p(arg_slot)) {
 reg = tcg_target_call_iarg_regs[arg_slot];
 tcg_reg_free(s, reg, *allocated_regs);
 tcg_out_addi_ptr(s, reg, ref_base, ref_off);
@@ -4869,8 +4876,7 @@ static void load_arg_ref(TCGContext *s, int arg_slot, 
TCGReg ref_base,
 *allocated_regs, 0, false);
 tcg_out_addi_ptr(s, reg, ref_base, ref_off);
 tcg_out_st(s, TCG_TYPE_PTR, reg, TCG_REG_CALL_STACK,
-   TCG_TARGET_CALL_STACK_OFFSET
-   + stk_slot * sizeof(tcg_target_long));
+   arg_slot_stk_ofs(arg_slot));
 }
 }
 
@@ -4900,8 +4906,7 @@ static void tcg_reg_alloc_call(TCGContext *s, TCGOp *op)
 case TCG_CALL_ARG_BY_REF:
 load_arg_stk(s, loc->ref_slot, ts, allocated_regs);
 load_arg_ref(s, loc->arg_slot, TCG_REG_CALL_STACK,
- TCG_TARGET_CALL_STACK_OFFSET
- + loc->ref_slot * sizeof(tcg_target_long),
+ arg_slot_stk_ofs(loc->ref_slot),
  &allocated_regs);
 break;
 case TCG_CALL_ARG_BY_REF_N:
-- 
2.34.1

[PATCH v4 35/57] accel/tcg: Remove helper_unaligned_{ld,st}

2023-05-03 Thread Richard Henderson

These functions are now unused.

Signed-off-by: Richard Henderson 
---
 include/tcg/tcg-ldst.h |  6 --
 accel/tcg/user-exec.c  | 10 --
 2 files changed, 16 deletions(-)

diff --git a/include/tcg/tcg-ldst.h b/include/tcg/tcg-ldst.h
index 64f48e6990..7dd57013e9 100644
--- a/include/tcg/tcg-ldst.h
+++ b/include/tcg/tcg-ldst.h
@@ -60,10 +60,4 @@ void helper_stq_mmu(CPUArchState *env, target_ulong addr, 
uint64_t val,
 void helper_st16_mmu(CPUArchState *env, target_ulong addr, Int128 val,
  MemOpIdx oi, uintptr_t retaddr);
 
-#ifdef CONFIG_USER_ONLY
-
-G_NORETURN void helper_unaligned_ld(CPUArchState *env, target_ulong addr);
-G_NORETURN void helper_unaligned_st(CPUArchState *env, target_ulong addr);
-
-#endif /* CONFIG_USER_ONLY */
 #endif /* TCG_LDST_H */
diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
index 8f86254eb4..7b824dcde8 100644
--- a/accel/tcg/user-exec.c
+++ b/accel/tcg/user-exec.c
@@ -889,16 +889,6 @@ void page_reset_target_data(target_ulong start, 
target_ulong last) { }
 
 /* The softmmu versions of these helpers are in cputlb.c.  */
 
-void helper_unaligned_ld(CPUArchState *env, target_ulong addr)
-{
-cpu_loop_exit_sigbus(env_cpu(env), addr, MMU_DATA_LOAD, GETPC());
-}
-
-void helper_unaligned_st(CPUArchState *env, target_ulong addr)
-{
-cpu_loop_exit_sigbus(env_cpu(env), addr, MMU_DATA_STORE, GETPC());
-}
-
 static void *cpu_mmu_lookup(CPUArchState *env, abi_ptr addr,
 MemOp mop, uintptr_t ra, MMUAccessType type)
 {
-- 
2.34.1

[PATCH v4 34/54] tcg: Add routines for calling slow-path helpers

2023-05-03 Thread Richard Henderson

Add tcg_out_ld_helper_args, tcg_out_ld_helper_ret,
and tcg_out_st_helper_args.  These and their subroutines
use the existing knowledge of the host function call abi
to load the function call arguments and return results.

These will be used to simplify the backends in turn.

Signed-off-by: Richard Henderson 
---
 tcg/tcg.c | 456 +-
 1 file changed, 453 insertions(+), 3 deletions(-)

diff --git a/tcg/tcg.c b/tcg/tcg.c
index 057423c121..748be8426a 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -181,6 +181,22 @@ static bool tcg_target_const_match(int64_t val, TCGType 
type, int ct);
 static int tcg_out_ldst_finalize(TCGContext *s);
 #endif
 
+typedef struct TCGLdstHelperParam {
+TCGReg (*ra_gen)(TCGContext *s, const TCGLabelQemuLdst *l, int arg_reg);
+unsigned ntmp;
+int tmp[3];
+} TCGLdstHelperParam;
+
+static void tcg_out_ld_helper_args(TCGContext *s, const TCGLabelQemuLdst *l,
+   const TCGLdstHelperParam *p)
+__attribute__((unused));
+static void tcg_out_ld_helper_ret(TCGContext *s, const TCGLabelQemuLdst *l,
+  bool load_sign, const TCGLdstHelperParam *p)
+__attribute__((unused));
+static void tcg_out_st_helper_args(TCGContext *s, const TCGLabelQemuLdst *l,
+   const TCGLdstHelperParam *p)
+__attribute__((unused));
+
 TCGContext tcg_init_ctx;
 __thread TCGContext *tcg_ctx;
 
@@ -459,9 +475,8 @@ static void tcg_out_movext1(TCGContext *s, const 
TCGMovExtend *i)
  * between the sources and destinations.
  */
 
-static void __attribute__((unused))
-tcg_out_movext2(TCGContext *s, const TCGMovExtend *i1,
-const TCGMovExtend *i2, int scratch)
+static void tcg_out_movext2(TCGContext *s, const TCGMovExtend *i1,
+const TCGMovExtend *i2, int scratch)
 {
 TCGReg src1 = i1->src;
 TCGReg src2 = i2->src;
@@ -715,6 +730,50 @@ static TCGHelperInfo all_helpers[] = {
 };
 static GHashTable *helper_table;
 
+#if TCG_TARGET_REG_BITS == 32
+# define dh_typecode_ttl  dh_typecode_i32
+#else
+# define dh_typecode_ttl  dh_typecode_i64
+#endif
+
+static TCGHelperInfo info_helper_ld32_mmu = {
+.flags = TCG_CALL_NO_WG,
+.typemask = dh_typemask(ttl, 0)  /* return tcg_target_ulong */
+  | dh_typemask(env, 1)
+  | dh_typemask(tl, 2)   /* target_ulong addr */
+  | dh_typemask(i32, 3)  /* unsigned oi */
+  | dh_typemask(ptr, 4)  /* uintptr_t ra */
+};
+
+static TCGHelperInfo info_helper_ld64_mmu = {
+.flags = TCG_CALL_NO_WG,
+.typemask = dh_typemask(i64, 0)  /* return uint64_t */
+  | dh_typemask(env, 1)
+  | dh_typemask(tl, 2)   /* target_ulong addr */
+  | dh_typemask(i32, 3)  /* unsigned oi */
+  | dh_typemask(ptr, 4)  /* uintptr_t ra */
+};
+
+static TCGHelperInfo info_helper_st32_mmu = {
+.flags = TCG_CALL_NO_WG,
+.typemask = dh_typemask(void, 0)
+  | dh_typemask(env, 1)
+  | dh_typemask(tl, 2)   /* target_ulong addr */
+  | dh_typemask(i32, 3)  /* uint32_t data */
+  | dh_typemask(i32, 4)  /* unsigned oi */
+  | dh_typemask(ptr, 5)  /* uintptr_t ra */
+};
+
+static TCGHelperInfo info_helper_st64_mmu = {
+.flags = TCG_CALL_NO_WG,
+.typemask = dh_typemask(void, 0)
+  | dh_typemask(env, 1)
+  | dh_typemask(tl, 2)   /* target_ulong addr */
+  | dh_typemask(i64, 3)  /* uint64_t data */
+  | dh_typemask(i32, 4)  /* unsigned oi */
+  | dh_typemask(ptr, 5)  /* uintptr_t ra */
+};
+
 #ifdef CONFIG_TCG_INTERPRETER
 static ffi_type *typecode_to_ffi(int argmask)
 {
@@ -1126,6 +1185,11 @@ static void tcg_context_init(unsigned max_cpus)
 (gpointer)&all_helpers[i]);
 }
 
+init_call_layout(&info_helper_ld32_mmu);
+init_call_layout(&info_helper_ld64_mmu);
+init_call_layout(&info_helper_st32_mmu);
+init_call_layout(&info_helper_st64_mmu);
+
 #ifdef CONFIG_TCG_INTERPRETER
 init_ffi_layouts();
 #endif
@@ -5011,6 +5075,392 @@ static void tcg_reg_alloc_call(TCGContext *s, TCGOp *op)
 }
 }
 
+/*
+ * Similarly for qemu_ld/st slow path helpers.
+ * We must re-implement tcg_gen_callN and tcg_reg_alloc_call simultaneously,
+ * using only the provided backend tcg_out_* functions.
+ */
+
+static int tcg_out_helper_stk_ofs(TCGType type, unsigned slot)
+{
+int ofs = arg_slot_stk_ofs(slot);
+
+/*
+ * Each stack slot is TCG_TARGET_LONG_BITS.  If the host does not
+ * require extension to uint64_t, adjust the address for uint32_t.
+ */
+if (HOST_BIG_ENDIAN &&
+TCG_TARGET_REG_BITS == 64 &&
+type == TCG_TYPE_I32) {
+ofs += 4;
+}
+return ofs;
+}
+
+static void tcg_out_helper_load_regs(TCGContext *s,
+ unsigned nmov, TCGMovExtend *mov,
+

[PATCH v4 34/57] tcg/sparc64: Use standard slow path for softmmu

2023-05-03 Thread Richard Henderson

Drop the target-specific trampolines for the standard slow path.
This lets us use tcg_out_helper_{ld,st}_args, and handles the new
atomicity bits within MemOp.

At the same time, use the full load/store helpers for user-only mode.
Drop inline unaligned access support for user-only mode, as it does
not handle atomicity.

Use TCG_REG_T[1-3] in the tlb lookup, instead of TCG_REG_O[0-2].
This allows the constraints to be simplified.

Signed-off-by: Richard Henderson 
---
 tcg/sparc64/tcg-target-con-set.h |   2 -
 tcg/sparc64/tcg-target-con-str.h |   1 -
 tcg/sparc64/tcg-target.h |   1 +
 tcg/sparc64/tcg-target.c.inc | 610 +--
 4 files changed, 182 insertions(+), 432 deletions(-)

diff --git a/tcg/sparc64/tcg-target-con-set.h b/tcg/sparc64/tcg-target-con-set.h
index 31e6fea1fc..434bf25072 100644
--- a/tcg/sparc64/tcg-target-con-set.h
+++ b/tcg/sparc64/tcg-target-con-set.h
@@ -12,8 +12,6 @@
 C_O0_I1(r)
 C_O0_I2(rZ, r)
 C_O0_I2(rZ, rJ)
-C_O0_I2(sZ, s)
-C_O1_I1(r, s)
 C_O1_I1(r, r)
 C_O1_I2(r, r, r)
 C_O1_I2(r, rZ, rJ)
diff --git a/tcg/sparc64/tcg-target-con-str.h b/tcg/sparc64/tcg-target-con-str.h
index 8f5c7aef97..0577ec4942 100644
--- a/tcg/sparc64/tcg-target-con-str.h
+++ b/tcg/sparc64/tcg-target-con-str.h
@@ -9,7 +9,6 @@
  * REGS(letter, register_mask)
  */
 REGS('r', ALL_GENERAL_REGS)
-REGS('s', ALL_QLDST_REGS)
 
 /*
  * Define constraint letters for constants:
diff --git a/tcg/sparc64/tcg-target.h b/tcg/sparc64/tcg-target.h
index ffe22b1d21..7434cc99d4 100644
--- a/tcg/sparc64/tcg-target.h
+++ b/tcg/sparc64/tcg-target.h
@@ -155,6 +155,7 @@ extern bool use_vis3_instructions;
 
 #define TCG_TARGET_DEFAULT_MO (0)
 #define TCG_TARGET_HAS_MEMORY_BSWAP 1
+#define TCG_TARGET_NEED_LDST_LABELS
 #define TCG_TARGET_NEED_POOL_LABELS
 
 #endif
diff --git a/tcg/sparc64/tcg-target.c.inc b/tcg/sparc64/tcg-target.c.inc
index 4375a06377..0237188d65 100644
--- a/tcg/sparc64/tcg-target.c.inc
+++ b/tcg/sparc64/tcg-target.c.inc
@@ -27,6 +27,7 @@
 #error "unsupported code generation mode"
 #endif
 
+#include "../tcg-ldst.c.inc"
 #include "../tcg-pool.c.inc"
 
 #ifdef CONFIG_DEBUG_TCG
@@ -70,18 +71,7 @@ static const char * const 
tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
 #define TCG_CT_CONST_S13  0x200
 #define TCG_CT_CONST_ZERO 0x400
 
-/*
- * For softmmu, we need to avoid conflicts with the first 3
- * argument registers to perform the tlb lookup, and to call
- * the helper function.
- */
-#ifdef CONFIG_SOFTMMU
-#define SOFTMMU_RESERVE_REGS MAKE_64BIT_MASK(TCG_REG_O0, 3)
-#else
-#define SOFTMMU_RESERVE_REGS 0
-#endif
-#define ALL_GENERAL_REGS MAKE_64BIT_MASK(0, 32)
-#define ALL_QLDST_REGS   (ALL_GENERAL_REGS & ~SOFTMMU_RESERVE_REGS)
+#define ALL_GENERAL_REGS  MAKE_64BIT_MASK(0, 32)
 
 /* Define some temporary registers.  T3 is used for constant generation.  */
 #define TCG_REG_T1  TCG_REG_G1
@@ -918,82 +908,6 @@ static void tcg_out_mb(TCGContext *s, TCGArg a0)
 tcg_out32(s, MEMBAR | (a0 & TCG_MO_ALL));
 }
 
-#ifdef CONFIG_SOFTMMU
-static const tcg_insn_unit *qemu_ld_trampoline[MO_SSIZE + 1];
-static const tcg_insn_unit *qemu_st_trampoline[MO_SIZE + 1];
-
-static void build_trampolines(TCGContext *s)
-{
-int i;
-
-for (i = 0; i < ARRAY_SIZE(qemu_ld_helpers); ++i) {
-if (qemu_ld_helpers[i] == NULL) {
-continue;
-}
-
-/* May as well align the trampoline.  */
-while ((uintptr_t)s->code_ptr & 15) {
-tcg_out_nop(s);
-}
-qemu_ld_trampoline[i] = tcg_splitwx_to_rx(s->code_ptr);
-
-/* Set the retaddr operand.  */
-tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_O3, TCG_REG_O7);
-/* Tail call.  */
-tcg_out_jmpl_const(s, qemu_ld_helpers[i], true, true);
-/* delay slot -- set the env argument */
-tcg_out_mov_delay(s, TCG_REG_O0, TCG_AREG0);
-}
-
-for (i = 0; i < ARRAY_SIZE(qemu_st_helpers); ++i) {
-if (qemu_st_helpers[i] == NULL) {
-continue;
-}
-
-/* May as well align the trampoline.  */
-while ((uintptr_t)s->code_ptr & 15) {
-tcg_out_nop(s);
-}
-qemu_st_trampoline[i] = tcg_splitwx_to_rx(s->code_ptr);
-
-/* Set the retaddr operand.  */
-tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_O4, TCG_REG_O7);
-
-/* Tail call.  */
-tcg_out_jmpl_const(s, qemu_st_helpers[i], true, true);
-/* delay slot -- set the env argument */
-tcg_out_mov_delay(s, TCG_REG_O0, TCG_AREG0);
-}
-}
-#else
-static const tcg_insn_unit *qemu_unalign_ld_trampoline;
-static const tcg_insn_unit *qemu_unalign_st_trampoline;
-
-static void build_trampolines(TCGContext *s)
-{
-for (int ld = 0; ld < 2; ++ld) {
-void *helper;
-
-while ((uintptr_t)s->code_ptr & 15) {
-tcg_out_nop(s);
-}
-
-if (ld) {
-helper = helper_unaligned_ld;
-qemu_unalign_ld_trampoline = tcg_splitwx_to_rx(s->code_ptr);
-} else {
-

[PATCH v4 52/57] tcg/i386: Honor 64-bit atomicity in 32-bit mode

2023-05-03 Thread Richard Henderson

Use the fpu to perform 64-bit loads and stores.

Signed-off-by: Richard Henderson 
---
 tcg/i386/tcg-target.c.inc | 44 +--
 1 file changed, 38 insertions(+), 6 deletions(-)

diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index 3e21f067d6..5c6c64c48a 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -468,6 +468,10 @@ static bool tcg_target_const_match(int64_t val, TCGType 
type, int ct)
 #define OPC_GRP5(0xff)
 #define OPC_GRP14   (0x73 | P_EXT | P_DATA16)
 
+#define OPC_ESCDF   (0xdf)
+#define ESCDF_FILD_m64  5
+#define ESCDF_FISTP_m64 7
+
 /* Group 1 opcode extensions for 0x80-0x83.
These are also used as modifiers for OPC_ARITH.  */
 #define ARITH_ADD 0
@@ -2091,7 +2095,20 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg 
datalo, TCGReg datahi,
 datalo = datahi;
 datahi = t;
 }
-if (h.base == datalo || h.index == datalo) {
+if (h.atom == MO_64) {
+/*
+ * Atomicity requires that we use a single 8-byte load.
+ * For simplicity and code size, always use the FPU for this.
+ * Similar insns using SSE/AVX are merely larger.
+ * Load from memory in one go, then store back to the stack,
+ * from whence we can load into the correct integer regs.
+ */
+tcg_out_modrm_sib_offset(s, OPC_ESCDF + h.seg, ESCDF_FILD_m64,
+ h.base, h.index, 0, h.ofs);
+tcg_out_modrm_offset(s, OPC_ESCDF, ESCDF_FISTP_m64, TCG_REG_ESP, 
0);
+tcg_out_modrm_offset(s, movop, datalo, TCG_REG_ESP, 0);
+tcg_out_modrm_offset(s, movop, datahi, TCG_REG_ESP, 4);
+} else if (h.base == datalo || h.index == datalo) {
 tcg_out_modrm_sib_offset(s, OPC_LEA, datahi,
  h.base, h.index, 0, h.ofs);
 tcg_out_modrm_offset(s, movop + h.seg, datalo, datahi, 0);
@@ -2161,12 +2178,27 @@ static void tcg_out_qemu_st_direct(TCGContext *s, 
TCGReg datalo, TCGReg datahi,
 if (TCG_TARGET_REG_BITS == 64) {
 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
  h.base, h.index, 0, h.ofs);
+break;
+}
+if (use_movbe) {
+TCGReg t = datalo;
+datalo = datahi;
+datahi = t;
+}
+if (h.atom == MO_64) {
+/*
+ * Atomicity requires that we use use one 8-byte store.
+ * For simplicity, and code size, always use the FPU for this.
+ * Similar insns using SSE/AVX are merely larger.
+ * Assemble the 8-byte quantity in required endianness
+ * on the stack, load to coproc unit, and store.
+ */
+tcg_out_modrm_offset(s, movop, datalo, TCG_REG_ESP, 0);
+tcg_out_modrm_offset(s, movop, datahi, TCG_REG_ESP, 4);
+tcg_out_modrm_offset(s, OPC_ESCDF, ESCDF_FILD_m64, TCG_REG_ESP, 0);
+tcg_out_modrm_sib_offset(s, OPC_ESCDF + h.seg, ESCDF_FISTP_m64,
+ h.base, h.index, 0, h.ofs);
 } else {
-if (use_movbe) {
-TCGReg t = datalo;
-datalo = datahi;
-datahi = t;
-}
 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
  h.base, h.index, 0, h.ofs);
 tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
-- 
2.34.1

[PATCH v4 33/57] tcg/sparc64: Split out tcg_out_movi_s32

2023-05-03 Thread Richard Henderson

Signed-off-by: Richard Henderson 
---
 tcg/sparc64/tcg-target.c.inc | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/tcg/sparc64/tcg-target.c.inc b/tcg/sparc64/tcg-target.c.inc
index e244209890..4375a06377 100644
--- a/tcg/sparc64/tcg-target.c.inc
+++ b/tcg/sparc64/tcg-target.c.inc
@@ -405,6 +405,13 @@ static void tcg_out_movi_s13(TCGContext *s, TCGReg ret, 
int32_t arg)
 tcg_out_arithi(s, ret, TCG_REG_G0, arg, ARITH_OR);
 }
 
+/* A 32-bit constant sign-extended to 64 bits.  */
+static void tcg_out_movi_s32(TCGContext *s, TCGReg ret, int32_t arg)
+{
+tcg_out_sethi(s, ret, ~arg);
+tcg_out_arithi(s, ret, ret, (arg & 0x3ff) | -0x400, ARITH_XOR);
+}
+
 /* A 32-bit constant zero-extended to 64 bits.  */
 static void tcg_out_movi_u32(TCGContext *s, TCGReg ret, uint32_t arg)
 {
@@ -444,8 +451,7 @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, 
TCGReg ret,
 
 /* A 32-bit constant sign-extended to 64-bits.  */
 if (arg == lo) {
-tcg_out_sethi(s, ret, ~arg);
-tcg_out_arithi(s, ret, ret, (arg & 0x3ff) | -0x400, ARITH_XOR);
+tcg_out_movi_s32(s, ret, arg);
 return;
 }
 
-- 
2.34.1

[PATCH v4 42/54] tcg/riscv: Convert tcg_out_qemu_{ld,st}_slow_path

2023-05-03 Thread Richard Henderson

Use tcg_out_ld_helper_args, tcg_out_ld_helper_ret,
and tcg_out_st_helper_args.

Reviewed-by: Daniel Henrique Barboza 
Signed-off-by: Richard Henderson 
---
 tcg/riscv/tcg-target.c.inc | 37 ++---
 1 file changed, 10 insertions(+), 27 deletions(-)

diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
index 2b2d313fe2..c22d1e35ac 100644
--- a/tcg/riscv/tcg-target.c.inc
+++ b/tcg/riscv/tcg-target.c.inc
@@ -906,14 +906,14 @@ static void tcg_out_goto(TCGContext *s, const 
tcg_insn_unit *target)
 tcg_debug_assert(ok);
 }
 
+/* We have three temps, we might as well expose them. */
+static const TCGLdstHelperParam ldst_helper_param = {
+.ntmp = 3, .tmp = { TCG_REG_TMP0, TCG_REG_TMP1, TCG_REG_TMP2 }
+};
+
 static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
 {
-MemOpIdx oi = l->oi;
-MemOp opc = get_memop(oi);
-TCGReg a0 = tcg_target_call_iarg_regs[0];
-TCGReg a1 = tcg_target_call_iarg_regs[1];
-TCGReg a2 = tcg_target_call_iarg_regs[2];
-TCGReg a3 = tcg_target_call_iarg_regs[3];
+MemOp opc = get_memop(l->oi);
 
 /* resolve label address */
 if (!reloc_sbimm12(l->label_ptr[0], tcg_splitwx_to_rx(s->code_ptr))) {
@@ -921,13 +921,9 @@ static bool tcg_out_qemu_ld_slow_path(TCGContext *s, 
TCGLabelQemuLdst *l)
 }
 
 /* call load helper */
-tcg_out_mov(s, TCG_TYPE_PTR, a0, TCG_AREG0);
-tcg_out_mov(s, TCG_TYPE_PTR, a1, l->addrlo_reg);
-tcg_out_movi(s, TCG_TYPE_PTR, a2, oi);
-tcg_out_movi(s, TCG_TYPE_PTR, a3, (tcg_target_long)l->raddr);
-
+tcg_out_ld_helper_args(s, l, &ldst_helper_param);
 tcg_out_call_int(s, qemu_ld_helpers[opc & MO_SSIZE], false);
-tcg_out_mov(s, (opc & MO_SIZE) == MO_64, l->datalo_reg, a0);
+tcg_out_ld_helper_ret(s, l, true, &ldst_helper_param);
 
 tcg_out_goto(s, l->raddr);
 return true;
@@ -935,14 +931,7 @@ static bool tcg_out_qemu_ld_slow_path(TCGContext *s, 
TCGLabelQemuLdst *l)
 
 static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
 {
-MemOpIdx oi = l->oi;
-MemOp opc = get_memop(oi);
-MemOp s_bits = opc & MO_SIZE;
-TCGReg a0 = tcg_target_call_iarg_regs[0];
-TCGReg a1 = tcg_target_call_iarg_regs[1];
-TCGReg a2 = tcg_target_call_iarg_regs[2];
-TCGReg a3 = tcg_target_call_iarg_regs[3];
-TCGReg a4 = tcg_target_call_iarg_regs[4];
+MemOp opc = get_memop(l->oi);
 
 /* resolve label address */
 if (!reloc_sbimm12(l->label_ptr[0], tcg_splitwx_to_rx(s->code_ptr))) {
@@ -950,13 +939,7 @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, 
TCGLabelQemuLdst *l)
 }
 
 /* call store helper */
-tcg_out_mov(s, TCG_TYPE_PTR, a0, TCG_AREG0);
-tcg_out_mov(s, TCG_TYPE_PTR, a1, l->addrlo_reg);
-tcg_out_movext(s, s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32, a2,
-   l->type, s_bits, l->datalo_reg);
-tcg_out_movi(s, TCG_TYPE_PTR, a3, oi);
-tcg_out_movi(s, TCG_TYPE_PTR, a4, (tcg_target_long)l->raddr);
-
+tcg_out_st_helper_args(s, l, &ldst_helper_param);
 tcg_out_call_int(s, qemu_st_helpers[opc & MO_SIZE], false);
 
 tcg_out_goto(s, l->raddr);
-- 
2.34.1

[PATCH v4 22/57] tcg/aarch64: Use full load/store helpers in user-only mode

2023-05-03 Thread Richard Henderson

Instead of using helper_unaligned_{ld,st}, use the full load/store helpers.
This will allow the fast path to increase alignment to implement atomicity
while not immediately raising an alignment exception.

Signed-off-by: Richard Henderson 
---
 tcg/aarch64/tcg-target.c.inc | 35 ---
 1 file changed, 35 deletions(-)

diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
index 43acb4fbcb..09c9ecad0f 100644
--- a/tcg/aarch64/tcg-target.c.inc
+++ b/tcg/aarch64/tcg-target.c.inc
@@ -1595,7 +1595,6 @@ typedef struct {
 TCGType index_ext;
 } HostAddress;
 
-#ifdef CONFIG_SOFTMMU
 static const TCGLdstHelperParam ldst_helper_param = {
 .ntmp = 1, .tmp = { TCG_REG_TMP }
 };
@@ -1628,40 +1627,6 @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, 
TCGLabelQemuLdst *lb)
 tcg_out_goto(s, lb->raddr);
 return true;
 }
-#else
-static void tcg_out_adr(TCGContext *s, TCGReg rd, const void *target)
-{
-ptrdiff_t offset = tcg_pcrel_diff(s, target);
-tcg_debug_assert(offset == sextract64(offset, 0, 21));
-tcg_out_insn(s, 3406, ADR, rd, offset);
-}
-
-static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l)
-{
-if (!reloc_pc19(l->label_ptr[0], tcg_splitwx_to_rx(s->code_ptr))) {
-return false;
-}
-
-tcg_out_mov(s, TCG_TYPE_TL, TCG_REG_X1, l->addrlo_reg);
-tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_X0, TCG_AREG0);
-
-/* "Tail call" to the helper, with the return address back inline. */
-tcg_out_adr(s, TCG_REG_LR, l->raddr);
-tcg_out_goto_long(s, (const void *)(l->is_ld ? helper_unaligned_ld
-: helper_unaligned_st));
-return true;
-}
-
-static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
-{
-return tcg_out_fail_alignment(s, l);
-}
-
-static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
-{
-return tcg_out_fail_alignment(s, l);
-}
-#endif /* CONFIG_SOFTMMU */
 
 /*
  * For softmmu, perform the TLB load and compare.
-- 
2.34.1

[PATCH v4 03/54] tcg/i386: Introduce HostAddress

2023-05-03 Thread Richard Henderson

Collect the 4 potential parts of the host address into a struct.
Reorg tcg_out_qemu_{ld,st}_direct to use it.
Reorg guest_base handling to use it.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 tcg/i386/tcg-target.c.inc | 165 +-
 1 file changed, 90 insertions(+), 75 deletions(-)

diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index 173f3c3172..909eecd4a3 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -1751,6 +1751,13 @@ static void tcg_out_nopn(TCGContext *s, int n)
 tcg_out8(s, 0x90);
 }
 
+typedef struct {
+TCGReg base;
+int index;
+int ofs;
+int seg;
+} HostAddress;
+
 #if defined(CONFIG_SOFTMMU)
 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
  * int mmu_idx, uintptr_t ra)
@@ -2113,17 +2120,13 @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, 
TCGLabelQemuLdst *l)
 return tcg_out_fail_alignment(s, l);
 }
 
-#if TCG_TARGET_REG_BITS == 32
-# define x86_guest_base_seg 0
-# define x86_guest_base_index   -1
-# define x86_guest_base_offset  guest_base
-#else
-static int x86_guest_base_seg;
-static int x86_guest_base_index = -1;
-static int32_t x86_guest_base_offset;
-# if defined(__x86_64__) && defined(__linux__)
-#  include <asm/prctl.h>
-#  include <sys/prctl.h>
+static HostAddress x86_guest_base = {
+.index = -1
+};
+
+#if defined(__x86_64__) && defined(__linux__)
+# include <asm/prctl.h>
+# include <sys/prctl.h>
 int arch_prctl(int code, unsigned long addr);
 static inline int setup_guest_base_seg(void)
 {
@@ -2132,8 +2135,9 @@ static inline int setup_guest_base_seg(void)
 }
 return 0;
 }
-# elif defined (__FreeBSD__) || defined (__FreeBSD_kernel__)
-#  include <machine/sysarch.h>
+#elif defined(__x86_64__) && \
+  (defined (__FreeBSD__) || defined (__FreeBSD_kernel__))
+# include <machine/sysarch.h>
 static inline int setup_guest_base_seg(void)
 {
 if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
@@ -2141,18 +2145,16 @@ static inline int setup_guest_base_seg(void)
 }
 return 0;
 }
-# else
+#else
 static inline int setup_guest_base_seg(void)
 {
 return 0;
 }
-# endif
-#endif
+#endif /* setup_guest_base_seg */
 #endif /* SOFTMMU */
 
 static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
-   TCGReg base, int index, intptr_t ofs,
-   int seg, TCGType type, MemOp memop)
+   HostAddress h, TCGType type, MemOp memop)
 {
 bool use_movbe = false;
 int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
@@ -2167,60 +2169,61 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, 
TCGReg datalo, TCGReg datahi,
 
 switch (memop & MO_SSIZE) {
 case MO_UB:
-tcg_out_modrm_sib_offset(s, OPC_MOVZBL + seg, datalo,
- base, index, 0, ofs);
+tcg_out_modrm_sib_offset(s, OPC_MOVZBL + h.seg, datalo,
+ h.base, h.index, 0, h.ofs);
 break;
 case MO_SB:
-tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + seg, datalo,
- base, index, 0, ofs);
+tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + h.seg, datalo,
+ h.base, h.index, 0, h.ofs);
 break;
 case MO_UW:
 if (use_movbe) {
 /* There is no extending movbe; only low 16-bits are modified.  */
-if (datalo != base && datalo != index) {
+if (datalo != h.base && datalo != h.index) {
 /* XOR breaks dependency chains.  */
 tgen_arithr(s, ARITH_XOR, datalo, datalo);
-tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
- datalo, base, index, 0, ofs);
+tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
+ datalo, h.base, h.index, 0, h.ofs);
 } else {
-tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
- datalo, base, index, 0, ofs);
+tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
+ datalo, h.base, h.index, 0, h.ofs);
 tcg_out_ext16u(s, datalo, datalo);
 }
 } else {
-tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
- base, index, 0, ofs);
+tcg_out_modrm_sib_offset(s, OPC_MOVZWL + h.seg, datalo,
+ h.base, h.index, 0, h.ofs);
 }
 break;
 case MO_SW:
 if (use_movbe) {
-tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
- datalo, base, index, 0, ofs);
+tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
+ datalo, h.bas

[PATCH v4 57/57] tcg/s390x: Support 128-bit load/store

2023-05-03 Thread Richard Henderson

Use LPQ/STPQ when 16-byte atomicity is required.
Note that these instructions require 16-byte alignment.

Signed-off-by: Richard Henderson 
---
 tcg/s390x/tcg-target-con-set.h |   2 +
 tcg/s390x/tcg-target.h |   2 +-
 tcg/s390x/tcg-target.c.inc | 100 -
 3 files changed, 102 insertions(+), 2 deletions(-)

diff --git a/tcg/s390x/tcg-target-con-set.h b/tcg/s390x/tcg-target-con-set.h
index ecc079bb6d..cbad91b2b5 100644
--- a/tcg/s390x/tcg-target-con-set.h
+++ b/tcg/s390x/tcg-target-con-set.h
@@ -14,6 +14,7 @@ C_O0_I2(r, r)
 C_O0_I2(r, ri)
 C_O0_I2(r, rA)
 C_O0_I2(v, r)
+C_O0_I3(o, m, r)
 C_O1_I1(r, r)
 C_O1_I1(v, r)
 C_O1_I1(v, v)
@@ -36,6 +37,7 @@ C_O1_I2(v, v, v)
 C_O1_I3(v, v, v, v)
 C_O1_I4(r, r, ri, rI, r)
 C_O1_I4(r, r, rA, rI, r)
+C_O2_I1(o, m, r)
 C_O2_I2(o, m, 0, r)
 C_O2_I2(o, m, r, r)
 C_O2_I3(o, m, 0, 1, r)
diff --git a/tcg/s390x/tcg-target.h b/tcg/s390x/tcg-target.h
index 170007bea5..ec96952172 100644
--- a/tcg/s390x/tcg-target.h
+++ b/tcg/s390x/tcg-target.h
@@ -140,7 +140,7 @@ extern uint64_t s390_facilities[3];
 #define TCG_TARGET_HAS_muluh_i64  0
 #define TCG_TARGET_HAS_mulsh_i64  0
 
-#define TCG_TARGET_HAS_qemu_ldst_i128 0
+#define TCG_TARGET_HAS_qemu_ldst_i128 1
 
 #define TCG_TARGET_HAS_v64    HAVE_FACILITY(VECTOR)
 #define TCG_TARGET_HAS_v128   HAVE_FACILITY(VECTOR)
diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
index ddd9860a6a..91fecfc51b 100644
--- a/tcg/s390x/tcg-target.c.inc
+++ b/tcg/s390x/tcg-target.c.inc
@@ -243,6 +243,7 @@ typedef enum S390Opcode {
 RXY_LLGF= 0xe316,
 RXY_LLGH= 0xe391,
 RXY_LMG = 0xeb04,
+RXY_LPQ = 0xe38f,
 RXY_LRV = 0xe31e,
 RXY_LRVG= 0xe30f,
 RXY_LRVH= 0xe31f,
@@ -253,6 +254,7 @@ typedef enum S390Opcode {
 RXY_STG = 0xe324,
 RXY_STHY= 0xe370,
 RXY_STMG= 0xeb24,
+RXY_STPQ= 0xe38e,
 RXY_STRV= 0xe33e,
 RXY_STRVG   = 0xe32f,
 RXY_STRVH   = 0xe33f,
@@ -1578,7 +1580,19 @@ typedef struct {
 
 bool tcg_target_has_memory_bswap(MemOp memop)
 {
-return true;
+MemOp atom_a, atom_u;
+
+if ((memop & MO_SIZE) <= MO_64) {
+return true;
+}
+
+/*
+ * Reject 16-byte memop with 16-byte atomicity,
+ * but do allow a pair of 64-bit operations.
+ */
+(void)atom_and_align_for_opc(tcg_ctx, &atom_a, &atom_u, memop,
+ MO_ATOM_IFALIGN, true);
+return atom_a <= MO_64;
 }
 
 static void tcg_out_qemu_ld_direct(TCGContext *s, MemOp opc, TCGReg data,
@@ -1868,6 +1882,80 @@ static void tcg_out_qemu_st(TCGContext* s, TCGReg 
data_reg, TCGReg addr_reg,
 }
 }
 
+static void tcg_out_qemu_ldst_i128(TCGContext *s, TCGReg datalo, TCGReg datahi,
+   TCGReg addr_reg, MemOpIdx oi, bool is_ld)
+{
+TCGLabel *l1 = NULL, *l2 = NULL;
+TCGLabelQemuLdst *ldst;
+HostAddress h;
+bool need_bswap;
+bool use_pair;
+S390Opcode insn;
+
+ldst = prepare_host_addr(s, &h, addr_reg, oi, is_ld);
+
+use_pair = h.atom < MO_128;
+need_bswap = get_memop(oi) & MO_BSWAP;
+
+if (!use_pair) {
+/*
+ * Atomicity requires we use LPQ.  If we've already checked for
+ * 16-byte alignment, that's all we need.  If we arrive with
+ * lesser alignment, we have determined that less than 16-byte
+ * alignment can be satisfied with two 8-byte loads.
+ */
+if (h.align < MO_128) {
+use_pair = true;
+l1 = gen_new_label();
+l2 = gen_new_label();
+
+tcg_out_insn(s, RI, TMLL, addr_reg, 15);
+tgen_branch(s, 7, l1); /* CC in {1,2,3} */
+}
+
+tcg_debug_assert(!need_bswap);
+tcg_debug_assert(datalo & 1);
+tcg_debug_assert(datahi == datalo - 1);
+insn = is_ld ? RXY_LPQ : RXY_STPQ;
+tcg_out_insn_RXY(s, insn, datahi, h.base, h.index, h.disp);
+
+if (use_pair) {
+tgen_branch(s, S390_CC_ALWAYS, l2);
+tcg_out_label(s, l1);
+}
+}
+if (use_pair) {
+TCGReg d1, d2;
+
+if (need_bswap) {
+d1 = datalo, d2 = datahi;
+insn = is_ld ? RXY_LRVG : RXY_STRVG;
+} else {
+d1 = datahi, d2 = datalo;
+insn = is_ld ? RXY_LG : RXY_STG;
+}
+
+if (h.base == d1 || h.index == d1) {
+tcg_out_insn(s, RXY, LAY, TCG_TMP0, h.base, h.index, h.disp);
+h.base = TCG_TMP0;
+h.index = TCG_REG_NONE;
+h.disp = 0;
+}
+tcg_out_insn_RXY(s, insn, d1, h.base, h.index, h.disp);
+tcg_out_insn_RXY(s, insn, d2, h.base, h.index, h.disp + 8);
+}
+if (l2) {
+tcg_out_label(s, l2);
+}
+
+if (ldst) {
+ldst->type = TCG_TYPE_I128;
+ldst->datalo_reg = datalo;
+ldst->datahi_reg = datahi;
+ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
+}
+}
+
 static

[PATCH v4 07/54] tcg/i386: Use indexed addressing for softmmu fast path

2023-05-03 Thread Richard Henderson

Since tcg_out_{ld,st}_helper_args, the slow path no longer requires
the address argument to be set up by the tlb load sequence.  Use a
plain load for the addend and indexed addressing with the original
input address register.

Signed-off-by: Richard Henderson 
---
 tcg/i386/tcg-target.c.inc | 25 ++---
 1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index 237b154194..8752968af2 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -1837,7 +1837,8 @@ static bool tcg_out_qemu_ld_slow_path(TCGContext *s, 
TCGLabelQemuLdst *l)
 tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)l->raddr, TCG_REG_ESP, ofs);
 } else {
 tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
-/* The second argument is already loaded with addrlo.  */
+tcg_out_mov(s, TCG_TYPE_TL, tcg_target_call_iarg_regs[1],
+l->addrlo_reg);
 tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi);
 tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
  (uintptr_t)l->raddr);
@@ -1910,7 +1911,8 @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, 
TCGLabelQemuLdst *l)
 tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, ofs);
 } else {
 tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
-/* The second argument is already loaded with addrlo.  */
+tcg_out_mov(s, TCG_TYPE_TL, tcg_target_call_iarg_regs[1],
+l->addrlo_reg);
 tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
 tcg_target_call_iarg_regs[2], l->datalo_reg);
 tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi);
@@ -2083,16 +2085,6 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext 
*s, HostAddress *h,
 tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw,
  TCG_REG_L1, TCG_REG_L0, cmp_ofs);
 
-/*
- * Prepare for both the fast path add of the tlb addend, and the slow
- * path function argument setup.
- */
-*h = (HostAddress) {
-.base = TCG_REG_L1,
-.index = -1
-};
-tcg_out_mov(s, ttype, h->base, addrlo);
-
 /* jne slow_path */
 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
 ldst->label_ptr[0] = s->code_ptr;
@@ -2109,10 +2101,13 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext 
*s, HostAddress *h,
 }
 
 /* TLB Hit.  */
+tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_L0,
+   offsetof(CPUTLBEntry, addend));
 
-/* add addend(TCG_REG_L0), TCG_REG_L1 */
-tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, h->base, TCG_REG_L0,
- offsetof(CPUTLBEntry, addend));
+*h = (HostAddress) {
+.base = addrlo,
+.index = TCG_REG_L0,
+};
 #else
 if (a_bits) {
 ldst = new_ldst_label(s);
-- 
2.34.1

[PATCH v4 23/57] tcg/ppc: Use full load/store helpers in user-only mode

2023-05-03 Thread Richard Henderson

Instead of using helper_unaligned_{ld,st}, use the full load/store helpers.
This will allow the fast path to increase alignment to implement atomicity
while not immediately raising an alignment exception.

Signed-off-by: Richard Henderson 
---
 tcg/ppc/tcg-target.c.inc | 44 
 1 file changed, 44 deletions(-)

diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index 0963156a78..733f67c7a5 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -1962,7 +1962,6 @@ static const uint32_t qemu_stx_opc[(MO_SIZE + MO_BSWAP) + 
1] = {
 [MO_BSWAP | MO_UQ] = STDBRX,
 };
 
-#if defined (CONFIG_SOFTMMU)
 static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg)
 {
 if (arg < 0) {
@@ -2012,49 +2011,6 @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, 
TCGLabelQemuLdst *lb)
 tcg_out_b(s, 0, lb->raddr);
 return true;
 }
-#else
-static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l)
-{
-if (!reloc_pc14(l->label_ptr[0], tcg_splitwx_to_rx(s->code_ptr))) {
-return false;
-}
-
-if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
-TCGReg arg = TCG_REG_R4;
-
-arg |= (TCG_TARGET_CALL_ARG_I64 == TCG_CALL_ARG_EVEN);
-if (l->addrlo_reg != arg) {
-tcg_out_mov(s, TCG_TYPE_I32, arg, l->addrhi_reg);
-tcg_out_mov(s, TCG_TYPE_I32, arg + 1, l->addrlo_reg);
-} else if (l->addrhi_reg != arg + 1) {
-tcg_out_mov(s, TCG_TYPE_I32, arg + 1, l->addrlo_reg);
-tcg_out_mov(s, TCG_TYPE_I32, arg, l->addrhi_reg);
-} else {
-tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_R0, arg);
-tcg_out_mov(s, TCG_TYPE_I32, arg, arg + 1);
-tcg_out_mov(s, TCG_TYPE_I32, arg + 1, TCG_REG_R0);
-}
-} else {
-tcg_out_mov(s, TCG_TYPE_TL, TCG_REG_R4, l->addrlo_reg);
-}
-tcg_out_mov(s, TCG_TYPE_TL, TCG_REG_R3, TCG_AREG0);
-
-/* "Tail call" to the helper, with the return address back inline. */
-tcg_out_call_int(s, 0, (const void *)(l->is_ld ? helper_unaligned_ld
-  : helper_unaligned_st));
-return true;
-}
-
-static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
-{
-return tcg_out_fail_alignment(s, l);
-}
-
-static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
-{
-return tcg_out_fail_alignment(s, l);
-}
-#endif /* SOFTMMU */
 
 typedef struct {
 TCGReg base;
-- 
2.34.1

[PATCH v4 12/57] tcg: Add 128-bit guest memory primitives

2023-05-03 Thread Richard Henderson

Signed-off-by: Richard Henderson 
---
 accel/tcg/tcg-runtime.h|   3 +
 include/tcg/tcg-ldst.h |   4 +
 accel/tcg/cputlb.c | 392 +
 accel/tcg/user-exec.c  |  94 ++--
 tcg/tcg-op.c   | 184 +++-
 accel/tcg/ldst_atomicity.c.inc | 189 
 6 files changed, 688 insertions(+), 178 deletions(-)

diff --git a/accel/tcg/tcg-runtime.h b/accel/tcg/tcg-runtime.h
index b8e6421c8a..d9adc646c1 100644
--- a/accel/tcg/tcg-runtime.h
+++ b/accel/tcg/tcg-runtime.h
@@ -39,6 +39,9 @@ DEF_HELPER_FLAGS_1(exit_atomic, TCG_CALL_NO_WG, noreturn, env)
 DEF_HELPER_FLAGS_3(memset, TCG_CALL_NO_RWG, ptr, ptr, int, ptr)
 #endif /* IN_HELPER_PROTO */
 
+DEF_HELPER_FLAGS_3(ld_i128, TCG_CALL_NO_WG, i128, env, tl, i32)
+DEF_HELPER_FLAGS_4(st_i128, TCG_CALL_NO_WG, void, env, tl, i128, i32)
+
 DEF_HELPER_FLAGS_5(atomic_cmpxchgb, TCG_CALL_NO_WG,
i32, env, tl, i32, i32, i32)
 DEF_HELPER_FLAGS_5(atomic_cmpxchgw_be, TCG_CALL_NO_WG,
diff --git a/include/tcg/tcg-ldst.h b/include/tcg/tcg-ldst.h
index 57fafa14b1..64f48e6990 100644
--- a/include/tcg/tcg-ldst.h
+++ b/include/tcg/tcg-ldst.h
@@ -34,6 +34,8 @@ tcg_target_ulong helper_ldul_mmu(CPUArchState *env, 
target_ulong addr,
  MemOpIdx oi, uintptr_t retaddr);
 uint64_t helper_ldq_mmu(CPUArchState *env, target_ulong addr,
 MemOpIdx oi, uintptr_t retaddr);
+Int128 helper_ld16_mmu(CPUArchState *env, target_ulong addr,
+   MemOpIdx oi, uintptr_t retaddr);
 
 /* Value sign-extended to tcg register size.  */
 tcg_target_ulong helper_ldsb_mmu(CPUArchState *env, target_ulong addr,
@@ -55,6 +57,8 @@ void helper_stl_mmu(CPUArchState *env, target_ulong addr, 
uint32_t val,
 MemOpIdx oi, uintptr_t retaddr);
 void helper_stq_mmu(CPUArchState *env, target_ulong addr, uint64_t val,
 MemOpIdx oi, uintptr_t retaddr);
+void helper_st16_mmu(CPUArchState *env, target_ulong addr, Int128 val,
+ MemOpIdx oi, uintptr_t retaddr);
 
 #ifdef CONFIG_USER_ONLY
 
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index 566cf8311b..a77b439df8 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -40,6 +40,7 @@
 #include "qemu/plugin-memory.h"
 #endif
 #include "tcg/tcg-ldst.h"
+#include "exec/helper-proto.h"
 
 /* DEBUG defines, enable DEBUG_TLB_LOG to log to the CPU_LOG_MMU target */
 /* #define DEBUG_TLB */
@@ -2161,6 +2162,31 @@ static uint64_t do_ld_whole_be8(CPUArchState *env, 
uintptr_t ra,
 return (ret_be << (p->size * 8)) | x;
 }
 
+/**
+ * do_ld_whole_be16
+ * @p: translation parameters
+ * @ret_be: accumulated data
+ *
+ * As do_ld_bytes_beN, but with one atomic load.
+ * 16 aligned bytes are guaranteed to cover the load.
+ */
+static Int128 do_ld_whole_be16(CPUArchState *env, uintptr_t ra,
+   MMULookupPageData *p, uint64_t ret_be)
+{
+int o = p->addr & 15;
+Int128 x, y = load_atomic16_or_exit(env, ra, p->haddr - o);
+int size = p->size;
+
+if (!HOST_BIG_ENDIAN) {
+y = bswap128(y);
+}
+y = int128_lshift(y, o * 8);
+y = int128_urshift(y, (16 - size) * 8);
+x = int128_make64(ret_be);
+x = int128_lshift(x, size * 8);
+return int128_or(x, y);
+}
+
 /*
  * Wrapper for the above.
  */
@@ -2205,6 +2231,59 @@ static uint64_t do_ld_beN(CPUArchState *env, 
MMULookupPageData *p,
 }
 }
 
+/*
+ * Wrapper for the above, for 8 < size < 16.
+ */
+static Int128 do_ld16_beN(CPUArchState *env, MMULookupPageData *p,
+  uint64_t a, int mmu_idx, MemOp mop, uintptr_t ra)
+{
+int size = p->size;
+uint64_t b;
+MemOp atmax;
+
+if (unlikely(p->flags & TLB_MMIO)) {
+p->size = size - 8;
+a = do_ld_mmio_beN(env, p, a, mmu_idx, MMU_DATA_LOAD, ra);
+p->addr += p->size;
+p->size = 8;
+b = do_ld_mmio_beN(env, p, 0, mmu_idx, MMU_DATA_LOAD, ra);
+} else {
+switch (mop & MO_ATOM_MASK) {
+case MO_ATOM_WITHIN16:
+/*
+ * It is a given that we cross a page and therefore there is no
+ * atomicity for the load as a whole, but there may be a subobject
+ * as defined by ATMAX which does not cross a 16-byte boundary.
+ */
+atmax = mop & MO_ATMAX_MASK;
+if (atmax != MO_ATMAX_SIZE) {
+atmax >>= MO_ATMAX_SHIFT;
+if (unlikely(size >= (1 << atmax))) {
+return do_ld_whole_be16(env, ra, p, a);
+}
+}
+/* fall through */
+case MO_ATOM_IFALIGN:
+case MO_ATOM_NONE:
+p->size = size - 8;
+a = do_ld_bytes_beN(p, a);
+b = ldq_be_p(p->haddr + size - 8);
+break;
+case MO_ATOM_SUBALIGN:
+p->size = size - 8;
+a = do_ld_parts_beN(p, a);
+p->haddr += size -

[PATCH v4 11/57] tcg/tci: Use helper_{ld,st}*_mmu for user-only

2023-05-03 Thread Richard Henderson

We can now fold these two pieces of code.

Signed-off-by: Richard Henderson 
---
 tcg/tci.c | 89 ---
 1 file changed, 89 deletions(-)

diff --git a/tcg/tci.c b/tcg/tci.c
index 5bde2e1f2e..15f2f8c463 100644
--- a/tcg/tci.c
+++ b/tcg/tci.c
@@ -292,7 +292,6 @@ static uint64_t tci_qemu_ld(CPUArchState *env, target_ulong 
taddr,
 MemOp mop = get_memop(oi);
 uintptr_t ra = (uintptr_t)tb_ptr;
 
-#ifdef CONFIG_SOFTMMU
 switch (mop & MO_SSIZE) {
 case MO_UB:
 return helper_ldub_mmu(env, taddr, oi, ra);
@@ -311,58 +310,6 @@ static uint64_t tci_qemu_ld(CPUArchState *env, 
target_ulong taddr,
 default:
 g_assert_not_reached();
 }
-#else
-void *haddr = g2h(env_cpu(env), taddr);
-unsigned a_mask = (1u << get_alignment_bits(mop)) - 1;
-uint64_t ret;
-
-set_helper_retaddr(ra);
-if (taddr & a_mask) {
-helper_unaligned_ld(env, taddr);
-}
-switch (mop & (MO_BSWAP | MO_SSIZE)) {
-case MO_UB:
-ret = ldub_p(haddr);
-break;
-case MO_SB:
-ret = ldsb_p(haddr);
-break;
-case MO_LEUW:
-ret = lduw_le_p(haddr);
-break;
-case MO_LESW:
-ret = ldsw_le_p(haddr);
-break;
-case MO_LEUL:
-ret = (uint32_t)ldl_le_p(haddr);
-break;
-case MO_LESL:
-ret = (int32_t)ldl_le_p(haddr);
-break;
-case MO_LEUQ:
-ret = ldq_le_p(haddr);
-break;
-case MO_BEUW:
-ret = lduw_be_p(haddr);
-break;
-case MO_BESW:
-ret = ldsw_be_p(haddr);
-break;
-case MO_BEUL:
-ret = (uint32_t)ldl_be_p(haddr);
-break;
-case MO_BESL:
-ret = (int32_t)ldl_be_p(haddr);
-break;
-case MO_BEUQ:
-ret = ldq_be_p(haddr);
-break;
-default:
-g_assert_not_reached();
-}
-clear_helper_retaddr();
-return ret;
-#endif
 }
 
 static void tci_qemu_st(CPUArchState *env, target_ulong taddr, uint64_t val,
@@ -371,7 +318,6 @@ static void tci_qemu_st(CPUArchState *env, target_ulong 
taddr, uint64_t val,
 MemOp mop = get_memop(oi);
 uintptr_t ra = (uintptr_t)tb_ptr;
 
-#ifdef CONFIG_SOFTMMU
 switch (mop & MO_SIZE) {
 case MO_UB:
 helper_stb_mmu(env, taddr, val, oi, ra);
@@ -388,41 +334,6 @@ static void tci_qemu_st(CPUArchState *env, target_ulong 
taddr, uint64_t val,
 default:
 g_assert_not_reached();
 }
-#else
-void *haddr = g2h(env_cpu(env), taddr);
-unsigned a_mask = (1u << get_alignment_bits(mop)) - 1;
-
-set_helper_retaddr(ra);
-if (taddr & a_mask) {
-helper_unaligned_st(env, taddr);
-}
-switch (mop & (MO_BSWAP | MO_SIZE)) {
-case MO_UB:
-stb_p(haddr, val);
-break;
-case MO_LEUW:
-stw_le_p(haddr, val);
-break;
-case MO_LEUL:
-stl_le_p(haddr, val);
-break;
-case MO_LEUQ:
-stq_le_p(haddr, val);
-break;
-case MO_BEUW:
-stw_be_p(haddr, val);
-break;
-case MO_BEUL:
-stl_be_p(haddr, val);
-break;
-case MO_BEUQ:
-stq_be_p(haddr, val);
-break;
-default:
-g_assert_not_reached();
-}
-clear_helper_retaddr();
-#endif
 }
 
 #if TCG_TARGET_REG_BITS == 64
-- 
2.34.1

[PATCH v4 25/54] tcg/s390x: Pass TCGType to tcg_out_qemu_{ld,st}

2023-05-03 Thread Richard Henderson

We need to set this in TCGLabelQemuLdst, so plumb this
all the way through from tcg_out_op.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 tcg/s390x/tcg-target.c.inc | 22 ++
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
index b399798664..e931f0cde4 100644
--- a/tcg/s390x/tcg-target.c.inc
+++ b/tcg/s390x/tcg-target.c.inc
@@ -1770,13 +1770,14 @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg 
addr_reg, MemOp opc,
 }
 
 static void add_qemu_ldst_label(TCGContext *s, bool is_ld, MemOpIdx oi,
-TCGReg data, TCGReg addr,
+TCGType type, TCGReg data, TCGReg addr,
 tcg_insn_unit *raddr, tcg_insn_unit *label_ptr)
 {
 TCGLabelQemuLdst *label = new_ldst_label(s);
 
 label->is_ld = is_ld;
 label->oi = oi;
+label->type = type;
 label->datalo_reg = data;
 label->addrlo_reg = addr;
 label->raddr = tcg_splitwx_to_rx(raddr);
@@ -1900,7 +1901,7 @@ static void tcg_prepare_user_ldst(TCGContext *s, TCGReg 
*addr_reg,
 #endif /* CONFIG_SOFTMMU */
 
 static void tcg_out_qemu_ld(TCGContext* s, TCGReg data_reg, TCGReg addr_reg,
-MemOpIdx oi)
+MemOpIdx oi, TCGType data_type)
 {
 MemOp opc = get_memop(oi);
 #ifdef CONFIG_SOFTMMU
@@ -1916,7 +1917,8 @@ static void tcg_out_qemu_ld(TCGContext* s, TCGReg 
data_reg, TCGReg addr_reg,
 
 tcg_out_qemu_ld_direct(s, opc, data_reg, base_reg, TCG_REG_R2, 0);
 
-add_qemu_ldst_label(s, 1, oi, data_reg, addr_reg, s->code_ptr, label_ptr);
+add_qemu_ldst_label(s, true, oi, data_type, data_reg, addr_reg,
+s->code_ptr, label_ptr);
 #else
 TCGReg index_reg;
 tcg_target_long disp;
@@ -1931,7 +1933,7 @@ static void tcg_out_qemu_ld(TCGContext* s, TCGReg 
data_reg, TCGReg addr_reg,
 }
 
 static void tcg_out_qemu_st(TCGContext* s, TCGReg data_reg, TCGReg addr_reg,
-MemOpIdx oi)
+MemOpIdx oi, TCGType data_type)
 {
 MemOp opc = get_memop(oi);
 #ifdef CONFIG_SOFTMMU
@@ -1947,7 +1949,8 @@ static void tcg_out_qemu_st(TCGContext* s, TCGReg 
data_reg, TCGReg addr_reg,
 
 tcg_out_qemu_st_direct(s, opc, data_reg, base_reg, TCG_REG_R2, 0);
 
-add_qemu_ldst_label(s, 0, oi, data_reg, addr_reg, s->code_ptr, label_ptr);
+add_qemu_ldst_label(s, false, oi, data_type, data_reg, addr_reg,
+s->code_ptr, label_ptr);
 #else
 TCGReg index_reg;
 tcg_target_long disp;
@@ -2307,13 +2310,16 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode 
opc,
 break;
 
 case INDEX_op_qemu_ld_i32:
-/* ??? Technically we can use a non-extending instruction.  */
+tcg_out_qemu_ld(s, args[0], args[1], args[2], TCG_TYPE_I32);
+break;
 case INDEX_op_qemu_ld_i64:
-tcg_out_qemu_ld(s, args[0], args[1], args[2]);
+tcg_out_qemu_ld(s, args[0], args[1], args[2], TCG_TYPE_I64);
 break;
 case INDEX_op_qemu_st_i32:
+tcg_out_qemu_st(s, args[0], args[1], args[2], TCG_TYPE_I32);
+break;
 case INDEX_op_qemu_st_i64:
-tcg_out_qemu_st(s, args[0], args[1], args[2]);
+tcg_out_qemu_st(s, args[0], args[1], args[2], TCG_TYPE_I64);
 break;
 
 case INDEX_op_ld16s_i64:
-- 
2.34.1

[PATCH v4 53/57] tcg/i386: Support 128-bit load/store with have_atomic16

2023-05-03 Thread Richard Henderson

Signed-off-by: Richard Henderson 
---
 tcg/i386/tcg-target.h |   3 +-
 tcg/i386/tcg-target.c.inc | 184 +-
 2 files changed, 182 insertions(+), 5 deletions(-)

diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index 943af6775e..7f69997e30 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -194,7 +194,8 @@ extern bool have_atomic16;
 #define TCG_TARGET_HAS_qemu_st8_i32 1
 #endif
 
-#define TCG_TARGET_HAS_qemu_ldst_i128   0
+#define TCG_TARGET_HAS_qemu_ldst_i128 \
+(TCG_TARGET_REG_BITS == 64 && have_atomic16)
 
 /* We do not support older SSE systems, only beginning with AVX1.  */
 #define TCG_TARGET_HAS_v64  have_avx1
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index 5c6c64c48a..a2739977a6 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -91,6 +91,8 @@ static const int tcg_target_reg_alloc_order[] = {
 #endif
 };
 
+#define TCG_TMP_VEC  TCG_REG_XMM5
+
 static const int tcg_target_call_iarg_regs[] = {
 #if TCG_TARGET_REG_BITS == 64
 #if defined(_WIN64)
@@ -347,6 +349,8 @@ static bool tcg_target_const_match(int64_t val, TCGType 
type, int ct)
 #define OPC_PCMPGTW (0x65 | P_EXT | P_DATA16)
 #define OPC_PCMPGTD (0x66 | P_EXT | P_DATA16)
 #define OPC_PCMPGTQ (0x37 | P_EXT38 | P_DATA16)
+#define OPC_PEXTRD  (0x16 | P_EXT3A | P_DATA16)
+#define OPC_PINSRD  (0x22 | P_EXT3A | P_DATA16)
 #define OPC_PMAXSB  (0x3c | P_EXT38 | P_DATA16)
 #define OPC_PMAXSW  (0xee | P_EXT | P_DATA16)
 #define OPC_PMAXSD  (0x3d | P_EXT38 | P_DATA16)
@@ -1784,7 +1788,22 @@ typedef struct {
 
 bool tcg_target_has_memory_bswap(MemOp memop)
 {
-return have_movbe;
+MemOp atom_a, atom_u;
+
+if (!have_movbe) {
+return false;
+}
+if ((memop & MO_SIZE) <= MO_64) {
+return true;
+}
+
+/*
+ * Reject 16-byte memop with 16-byte atomicity, i.e. VMOVDQA,
+ * but do allow a pair of 64-bit operations, i.e. MOVBEQ.
+ */
+(void)atom_and_align_for_opc(tcg_ctx, &atom_a, &atom_u, memop,
+ MO_ATOM_IFALIGN, true);
+return atom_a <= MO_64;
 }
 
 /*
@@ -1812,6 +1831,30 @@ static const TCGLdstHelperParam ldst_helper_param = {
 static const TCGLdstHelperParam ldst_helper_param = { };
 #endif
 
+static void tcg_out_vec_to_pair(TCGContext *s, TCGType type,
+TCGReg l, TCGReg h, TCGReg v)
+{
+int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
+
+/* vmov{d,q} %v, %l */
+tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, v, 0, l);
+/* vpextr{d,q} $1, %v, %h */
+tcg_out_vex_modrm(s, OPC_PEXTRD + rexw, v, 0, h);
+tcg_out8(s, 1);
+}
+
+static void tcg_out_pair_to_vec(TCGContext *s, TCGType type,
+TCGReg v, TCGReg l, TCGReg h)
+{
+int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
+
+/* vmov{d,q} %l, %v */
+tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, v, 0, l);
+/* vpinsr{d,q} $1, %h, %v, %v */
+tcg_out_vex_modrm(s, OPC_PINSRD + rexw, v, v, h);
+tcg_out8(s, 1);
+}
+
 /*
  * Generate code for the slow path for a load at the end of block
  */
@@ -1901,11 +1944,12 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext 
*s, HostAddress *h,
 {
 TCGLabelQemuLdst *ldst = NULL;
 MemOp opc = get_memop(oi);
-MemOp atom_u;
+MemOp atom_u, s_bits;
 unsigned a_mask;
 
+s_bits = opc & MO_SIZE;
 h->align = atom_and_align_for_opc(s, &h->atom, &atom_u, opc,
-  MO_ATOM_IFALIGN, false);
+  MO_ATOM_IFALIGN, s_bits == MO_128);
 a_mask = (1 << h->align) - 1;
 
 #ifdef CONFIG_SOFTMMU
@@ -1915,7 +1959,6 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, 
HostAddress *h,
 TCGType tlbtype = TCG_TYPE_I32;
 int trexw = 0, hrexw = 0, tlbrexw = 0;
 unsigned mem_index = get_mmuidx(oi);
-unsigned s_bits = opc & MO_SIZE;
 unsigned s_mask = (1 << s_bits) - 1;
 target_ulong tlb_mask;
 
@@ -2120,6 +2163,69 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg 
datalo, TCGReg datahi,
  h.base, h.index, 0, h.ofs + 4);
 }
 break;
+
+case MO_128:
+{
+TCGLabel *l1 = NULL, *l2 = NULL;
+bool use_pair = h.align < MO_128;
+
+tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
+
+if (!use_pair) {
+tcg_debug_assert(!use_movbe);
+/*
+ * Atomicity requires that we use VMOVDQA.
+ * If we've already checked for 16-byte alignment, that's all
+ * we need.  If we arrive here with lesser alignment, then we
+ * have determined that less than 16-byte alignment can be
+ * satisfied with two 8-byte loads.
+ */
+if (h.align < MO_128) {
+use_pair = true;
+l1 =

[PATCH v4 05/54] tcg/i386: Introduce tcg_out_testi

2023-05-03 Thread Richard Henderson

Split out a helper for choosing testb vs testl.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 tcg/i386/tcg-target.c.inc | 30 ++
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index 78160f453b..aae698121a 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -1751,6 +1751,23 @@ static void tcg_out_nopn(TCGContext *s, int n)
 tcg_out8(s, 0x90);
 }
 
+/* Test register R vs immediate bits I, setting Z flag for EQ/NE. */
+static void __attribute__((unused))
+tcg_out_testi(TCGContext *s, TCGReg r, uint32_t i)
+{
+/*
+ * This is used for testing alignment, so we can usually use testb.
+ * For i686, we have to use testl for %esi/%edi.
+ */
+if (i <= 0xff && (TCG_TARGET_REG_BITS == 64 || r < 4)) {
+tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, r);
+tcg_out8(s, i);
+} else {
+tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_TESTi, r);
+tcg_out32(s, i);
+}
+}
+
 typedef struct {
 TCGReg base;
 int index;
@@ -2051,18 +2068,7 @@ static void tcg_out_test_alignment(TCGContext *s, bool 
is_ld, TCGReg addrlo,
 unsigned a_mask = (1 << a_bits) - 1;
 TCGLabelQemuLdst *label;
 
-/*
- * We are expecting a_bits to max out at 7, so we can usually use testb.
- * For i686, we have to use testl for %esi/%edi.
- */
-if (a_mask <= 0xff && (TCG_TARGET_REG_BITS == 64 || addrlo < 4)) {
-tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, addrlo);
-tcg_out8(s, a_mask);
-} else {
-tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_TESTi, addrlo);
-tcg_out32(s, a_mask);
-}
-
+tcg_out_testi(s, addrlo, a_mask);
 /* jne slow_path */
 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
 
-- 
2.34.1

[PATCH v4 54/57] tcg/aarch64: Rename temporaries

2023-05-03 Thread Richard Henderson

We will need to allocate a second general-purpose temporary.
Rename the existing temps to add a distinguishing number.

Signed-off-by: Richard Henderson 
---
 tcg/aarch64/tcg-target.c.inc | 50 ++--
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
index 1d6d382edd..76a6bfd202 100644
--- a/tcg/aarch64/tcg-target.c.inc
+++ b/tcg/aarch64/tcg-target.c.inc
@@ -80,8 +80,8 @@ static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind 
kind, int slot)
 bool have_lse;
 bool have_lse2;
 
-#define TCG_REG_TMP TCG_REG_X30
-#define TCG_VEC_TMP TCG_REG_V31
+#define TCG_REG_TMP0 TCG_REG_X30
+#define TCG_VEC_TMP0 TCG_REG_V31
 
 #ifndef CONFIG_SOFTMMU
 /* Note that XZR cannot be encoded in the address base register slot,
@@ -998,7 +998,7 @@ static bool tcg_out_dup_vec(TCGContext *s, TCGType type, 
unsigned vece,
 static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
  TCGReg r, TCGReg base, intptr_t offset)
 {
-TCGReg temp = TCG_REG_TMP;
+TCGReg temp = TCG_REG_TMP0;
 
 if (offset < -0xff || offset > 0xff) {
 tcg_out_movi(s, TCG_TYPE_PTR, temp, offset);
@@ -1150,8 +1150,8 @@ static void tcg_out_ldst(TCGContext *s, AArch64Insn insn, 
TCGReg rd,
 }
 
 /* Worst-case scenario, move offset to temp register, use reg offset.  */
-tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP, offset);
-tcg_out_ldst_r(s, insn, rd, rn, TCG_TYPE_I64, TCG_REG_TMP);
+tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP0, offset);
+tcg_out_ldst_r(s, insn, rd, rn, TCG_TYPE_I64, TCG_REG_TMP0);
 }
 
 static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
@@ -1367,8 +1367,8 @@ static void tcg_out_call_int(TCGContext *s, const 
tcg_insn_unit *target)
 if (offset == sextract64(offset, 0, 26)) {
 tcg_out_insn(s, 3206, BL, offset);
 } else {
-tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP, (intptr_t)target);
-tcg_out_insn(s, 3207, BLR, TCG_REG_TMP);
+tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP0, (intptr_t)target);
+tcg_out_insn(s, 3207, BLR, TCG_REG_TMP0);
 }
 }
 
@@ -1505,7 +1505,7 @@ static void tcg_out_addsub2(TCGContext *s, TCGType ext, 
TCGReg rl,
 AArch64Insn insn;
 
 if (rl == ah || (!const_bh && rl == bh)) {
-rl = TCG_REG_TMP;
+rl = TCG_REG_TMP0;
 }
 
 if (const_bl) {
@@ -1522,7 +1522,7 @@ static void tcg_out_addsub2(TCGContext *s, TCGType ext, 
TCGReg rl,
possibility of adding 0+const in the low part, and the
immediate add instructions encode XSP not XZR.  Don't try
anything more elaborate here than loading another zero.  */
-al = TCG_REG_TMP;
+al = TCG_REG_TMP0;
 tcg_out_movi(s, ext, al, 0);
 }
 tcg_out_insn_3401(s, insn, ext, rl, al, bl);
@@ -1563,7 +1563,7 @@ static void tcg_out_cltz(TCGContext *s, TCGType ext, 
TCGReg d,
 {
 TCGReg a1 = a0;
 if (is_ctz) {
-a1 = TCG_REG_TMP;
+a1 = TCG_REG_TMP0;
 tcg_out_insn(s, 3507, RBIT, ext, a1, a0);
 }
 if (const_b && b == (ext ? 64 : 32)) {
@@ -1572,7 +1572,7 @@ static void tcg_out_cltz(TCGContext *s, TCGType ext, 
TCGReg d,
 AArch64Insn sel = I3506_CSEL;
 
 tcg_out_cmp(s, ext, a0, 0, 1);
-tcg_out_insn(s, 3507, CLZ, ext, TCG_REG_TMP, a1);
+tcg_out_insn(s, 3507, CLZ, ext, TCG_REG_TMP0, a1);
 
 if (const_b) {
 if (b == -1) {
@@ -1585,7 +1585,7 @@ static void tcg_out_cltz(TCGContext *s, TCGType ext, 
TCGReg d,
 b = d;
 }
 }
-tcg_out_insn_3506(s, sel, ext, d, TCG_REG_TMP, b, TCG_COND_NE);
+tcg_out_insn_3506(s, sel, ext, d, TCG_REG_TMP0, b, TCG_COND_NE);
 }
 }
 
@@ -1603,7 +1603,7 @@ bool tcg_target_has_memory_bswap(MemOp memop)
 }
 
 static const TCGLdstHelperParam ldst_helper_param = {
-.ntmp = 1, .tmp = { TCG_REG_TMP }
+.ntmp = 1, .tmp = { TCG_REG_TMP0 }
 };
 
 static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
@@ -1864,7 +1864,7 @@ static void tcg_out_goto_tb(TCGContext *s, int which)
 
 set_jmp_insn_offset(s, which);
 tcg_out32(s, I3206_B);
-tcg_out_insn(s, 3207, BR, TCG_REG_TMP);
+tcg_out_insn(s, 3207, BR, TCG_REG_TMP0);
 set_jmp_reset_offset(s, which);
 }
 
@@ -1883,7 +1883,7 @@ void tb_target_set_jmp_target(const TranslationBlock *tb, 
int n,
 ptrdiff_t i_offset = i_addr - jmp_rx;
 
 /* Note that we asserted this in range in tcg_out_goto_tb. */
-insn = deposit32(I3305_LDR | TCG_REG_TMP, 5, 19, i_offset >> 2);
+insn = deposit32(I3305_LDR | TCG_REG_TMP0, 5, 19, i_offset >> 2);
 }
 qatomic_set((uint32_t *)jmp_rw, insn);
 flush_idcache_range(jmp_rx, jmp_rw, 4);
@@ -2079,13 +2079,13 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
 
 case INDEX_op_rem_i64:
 case INDEX_op_rem_i32

[PATCH v4 50/57] tcg/s390x: Use atom_and_align_for_opc

2023-05-03 Thread Richard Henderson

Signed-off-by: Richard Henderson 
---
 tcg/s390x/tcg-target.c.inc | 14 ++
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
index 22f0206b5a..ddd9860a6a 100644
--- a/tcg/s390x/tcg-target.c.inc
+++ b/tcg/s390x/tcg-target.c.inc
@@ -1572,6 +1572,8 @@ typedef struct {
 TCGReg base;
 TCGReg index;
 int disp;
+MemOp align;
+MemOp atom;
 } HostAddress;
 
 bool tcg_target_has_memory_bswap(MemOp memop)
@@ -1733,8 +1735,12 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext 
*s, HostAddress *h,
 {
 TCGLabelQemuLdst *ldst = NULL;
 MemOp opc = get_memop(oi);
-unsigned a_bits = get_alignment_bits(opc);
-unsigned a_mask = (1u << a_bits) - 1;
+MemOp atom_u;
+unsigned a_mask;
+
+h->align = atom_and_align_for_opc(s, &h->atom, &atom_u, opc,
+  MO_ATOM_IFALIGN, false);
+a_mask = (1 << h->align) - 1;
 
 #ifdef CONFIG_SOFTMMU
 unsigned s_bits = opc & MO_SIZE;
@@ -1764,7 +1770,7 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, 
HostAddress *h,
  * bits within the address.  For unaligned access, we check that we don't
  * cross pages using the address of the last byte of the access.
  */
-a_off = (a_bits >= s_bits ? 0 : s_mask - a_mask);
+a_off = (a_mask >= s_mask ? 0 : s_mask - a_mask);
 tlb_mask = (uint64_t)TARGET_PAGE_MASK | a_mask;
 if (a_off == 0) {
 tgen_andi_risbg(s, TCG_REG_R0, addr_reg, tlb_mask);
@@ -1806,7 +1812,7 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, 
HostAddress *h,
 ldst->addrlo_reg = addr_reg;
 
 /* We are expecting a_bits to max out at 7, much lower than TMLL. */
-tcg_debug_assert(a_bits < 16);
+tcg_debug_assert(a_mask <= 0xffff);
 tcg_out_insn(s, RI, TMLL, addr_reg, a_mask);
 
 tcg_out16(s, RI_BRC | (7 << 4)); /* CC in {1,2,3} */
-- 
2.34.1

[PATCH v4 46/57] tcg/loongarch64: Use atom_and_align_for_opc

2023-05-03 Thread Richard Henderson

Signed-off-by: Richard Henderson 
---
 tcg/loongarch64/tcg-target.c.inc | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
index 62bf823084..43341524f2 100644
--- a/tcg/loongarch64/tcg-target.c.inc
+++ b/tcg/loongarch64/tcg-target.c.inc
@@ -826,6 +826,8 @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, 
TCGLabelQemuLdst *l)
 typedef struct {
 TCGReg base;
 TCGReg index;
+MemOp align;
+MemOp atom;
 } HostAddress;
 
 bool tcg_target_has_memory_bswap(MemOp memop)
@@ -845,7 +847,11 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, 
HostAddress *h,
 {
 TCGLabelQemuLdst *ldst = NULL;
 MemOp opc = get_memop(oi);
-unsigned a_bits = get_alignment_bits(opc);
+MemOp a_bits, atom_u;
+
+a_bits = atom_and_align_for_opc(s, &h->atom, &atom_u, opc,
+MO_ATOM_IFALIGN, false);
+h->align = a_bits;
 
 #ifdef CONFIG_SOFTMMU
 unsigned s_bits = opc & MO_SIZE;
-- 
2.34.1

[PATCH v4 55/57] tcg/aarch64: Support 128-bit load/store

2023-05-03 Thread Richard Henderson

Use LDXP+STXP when LSE2 is not present and 16-byte atomicity is required,
and LDP/STP otherwise.  This requires allocating a second general-purpose
temporary, as Rs cannot overlap Rn in STXP.

Signed-off-by: Richard Henderson 
---
 tcg/aarch64/tcg-target-con-set.h |   2 +
 tcg/aarch64/tcg-target.h |   2 +-
 tcg/aarch64/tcg-target.c.inc | 181 ++-
 3 files changed, 181 insertions(+), 4 deletions(-)

diff --git a/tcg/aarch64/tcg-target-con-set.h b/tcg/aarch64/tcg-target-con-set.h
index d6c6866878..74065c7098 100644
--- a/tcg/aarch64/tcg-target-con-set.h
+++ b/tcg/aarch64/tcg-target-con-set.h
@@ -14,6 +14,7 @@ C_O0_I2(lZ, l)
 C_O0_I2(r, rA)
 C_O0_I2(rZ, r)
 C_O0_I2(w, r)
+C_O0_I3(lZ, lZ, l)
 C_O1_I1(r, l)
 C_O1_I1(r, r)
 C_O1_I1(w, r)
@@ -33,4 +34,5 @@ C_O1_I2(w, w, wO)
 C_O1_I2(w, w, wZ)
 C_O1_I3(w, w, w, w)
 C_O1_I4(r, r, rA, rZ, rZ)
+C_O2_I1(r, r, l)
 C_O2_I4(r, r, rZ, rZ, rA, rMZ)
diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
index 74ee2ed255..fa6af9746f 100644
--- a/tcg/aarch64/tcg-target.h
+++ b/tcg/aarch64/tcg-target.h
@@ -129,7 +129,7 @@ extern bool have_lse2;
 #define TCG_TARGET_HAS_muluh_i64        1
 #define TCG_TARGET_HAS_mulsh_i64        1
 
-#define TCG_TARGET_HAS_qemu_ldst_i128   0
+#define TCG_TARGET_HAS_qemu_ldst_i128   1
 
 #define TCG_TARGET_HAS_v64  1
 #define TCG_TARGET_HAS_v128 1
diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
index 76a6bfd202..f1627cb96d 100644
--- a/tcg/aarch64/tcg-target.c.inc
+++ b/tcg/aarch64/tcg-target.c.inc
@@ -81,6 +81,7 @@ bool have_lse;
 bool have_lse2;
 
 #define TCG_REG_TMP0 TCG_REG_X30
+#define TCG_REG_TMP1 TCG_REG_X17
 #define TCG_VEC_TMP0 TCG_REG_V31
 
 #ifndef CONFIG_SOFTMMU
@@ -404,6 +405,10 @@ typedef enum {
 I3305_LDR_v64   = 0x5c000000,
 I3305_LDR_v128  = 0x9c000000,
 
+/* Load/store exclusive. */
+I3306_LDXP  = 0xc8600000,
+I3306_STXP  = 0xc8200000,
+
 /* Load/store register.  Described here as 3.3.12, but the helper
that emits them can transform to 3.3.10 or 3.3.13.  */
 I3312_STRB  = 0x38000000 | LDST_ST << 22 | MO_8 << 30,
@@ -468,6 +473,9 @@ typedef enum {
 I3406_ADR   = 0x10000000,
 I3406_ADRP  = 0x90000000,
 
+/* Add/subtract extended register instructions. */
+I3501_ADD   = 0x0b200000,
+
 /* Add/subtract shifted register instructions (without a shift).  */
 I3502_ADD   = 0x0b000000,
 I3502_ADDS  = 0x2b000000,
@@ -638,6 +646,12 @@ static void tcg_out_insn_3305(TCGContext *s, AArch64Insn 
insn,
 tcg_out32(s, insn | (imm19 & 0x7ffff) << 5 | rt);
 }
 
+static void tcg_out_insn_3306(TCGContext *s, AArch64Insn insn, TCGReg rs,
+  TCGReg rt, TCGReg rt2, TCGReg rn)
+{
+tcg_out32(s, insn | rs << 16 | rt2 << 10 | rn << 5 | rt);
+}
+
 static void tcg_out_insn_3201(TCGContext *s, AArch64Insn insn, TCGType ext,
   TCGReg rt, int imm19)
 {
@@ -720,6 +734,14 @@ static void tcg_out_insn_3406(TCGContext *s, AArch64Insn 
insn,
 tcg_out32(s, insn | (disp & 3) << 29 | (disp & 0x1ffffc) << (5 - 2) | rd);
 }
 
+static inline void tcg_out_insn_3501(TCGContext *s, AArch64Insn insn,
+ TCGType sf, TCGReg rd, TCGReg rn,
+ TCGReg rm, int opt, int imm3)
+{
+tcg_out32(s, insn | sf << 31 | rm << 16 | opt << 13 |
+  imm3 << 10 | rn << 5 | rd);
+}
+
 /* This function is for both 3.5.2 (Add/Subtract shifted register), for
the rare occasion when we actually want to supply a shift amount.  */
 static inline void tcg_out_insn_3502S(TCGContext *s, AArch64Insn insn,
@@ -1648,17 +1670,17 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext 
*s, HostAddress *h,
 TCGType addr_type = TARGET_LONG_BITS == 64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
 TCGLabelQemuLdst *ldst = NULL;
 MemOp opc = get_memop(oi);
-MemOp atom_u;
+MemOp atom_u, s_bits;
 unsigned a_mask;
 
+s_bits = opc & MO_SIZE;
 h->align = atom_and_align_for_opc(s, &h->atom, &atom_u, opc,
   have_lse2 ? MO_ATOM_WITHIN16
 : MO_ATOM_IFALIGN,
-  false);
+  s_bits == MO_128);
 a_mask = (1 << h->align) - 1;
 
 #ifdef CONFIG_SOFTMMU
-unsigned s_bits = opc & MO_SIZE;
 unsigned s_mask = (1u << s_bits) - 1;
 unsigned mem_index = get_mmuidx(oi);
 TCGReg x3;
@@ -1839,6 +1861,148 @@ static void tcg_out_qemu_st(TCGContext *s, TCGReg 
data_reg, TCGReg addr_reg,
 }
 }
 
+static TCGLabelQemuLdst *
+prepare_host_addr_base_only(TCGContext *s, HostAddress *h, TCGReg addr_reg,
+MemOpIdx oi, bool is_ld)
+{
+TCGLabelQemuLdst *ldst;
+
+ldst = prepare_host_addr(s, h, addr_reg, oi, true);
+
+/* Compose the final address, as LDP/STP have no indexing. */
+

[PATCH v4 31/57] tcg/sparc64: Rename tcg_out_movi_imm13 to tcg_out_movi_s13

2023-05-03 Thread Richard Henderson

Emphasize that the constant is signed.

Signed-off-by: Richard Henderson 
---
 tcg/sparc64/tcg-target.c.inc | 30 +++---
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/tcg/sparc64/tcg-target.c.inc b/tcg/sparc64/tcg-target.c.inc
index 64464ab363..2e6127d506 100644
--- a/tcg/sparc64/tcg-target.c.inc
+++ b/tcg/sparc64/tcg-target.c.inc
@@ -399,7 +399,7 @@ static void tcg_out_sethi(TCGContext *s, TCGReg ret, 
uint32_t arg)
 tcg_out32(s, SETHI | INSN_RD(ret) | ((arg & 0xfffffc00) >> 10));
 }
 
-static void tcg_out_movi_imm13(TCGContext *s, TCGReg ret, int32_t arg)
+static void tcg_out_movi_s13(TCGContext *s, TCGReg ret, int32_t arg)
 {
 tcg_out_arithi(s, ret, TCG_REG_G0, arg, ARITH_OR);
 }
@@ -408,7 +408,7 @@ static void tcg_out_movi_imm32(TCGContext *s, TCGReg ret, 
int32_t arg)
 {
 if (check_fit_i32(arg, 13)) {
 /* A 13-bit constant sign-extended to 64-bits.  */
-tcg_out_movi_imm13(s, ret, arg);
+tcg_out_movi_s13(s, ret, arg);
 } else {
 /* A 32-bit constant zero-extended to 64 bits.  */
 tcg_out_sethi(s, ret, arg);
@@ -425,15 +425,15 @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, 
TCGReg ret,
 tcg_target_long hi, lo = (int32_t)arg;
 tcg_target_long test, lsb;
 
-/* A 32-bit constant, or 32-bit zero-extended to 64-bits.  */
-if (type == TCG_TYPE_I32 || arg == (uint32_t)arg) {
-tcg_out_movi_imm32(s, ret, arg);
+/* A 13-bit constant sign-extended to 64-bits.  */
+if (check_fit_tl(arg, 13)) {
+tcg_out_movi_s13(s, ret, arg);
 return;
 }
 
-/* A 13-bit constant sign-extended to 64-bits.  */
-if (check_fit_tl(arg, 13)) {
-tcg_out_movi_imm13(s, ret, arg);
+/* A 32-bit constant, or 32-bit zero-extended to 64-bits.  */
+if (type == TCG_TYPE_I32 || arg == (uint32_t)arg) {
+tcg_out_movi_imm32(s, ret, arg);
 return;
 }
 
@@ -767,7 +767,7 @@ static void tcg_out_setcond_i32(TCGContext *s, TCGCond 
cond, TCGReg ret,
 
 default:
 tcg_out_cmp(s, c1, c2, c2const);
-tcg_out_movi_imm13(s, ret, 0);
+tcg_out_movi_s13(s, ret, 0);
 tcg_out_movcc(s, cond, MOVCC_ICC, ret, 1, 1);
 return;
 }
@@ -803,11 +803,11 @@ static void tcg_out_setcond_i64(TCGContext *s, TCGCond 
cond, TCGReg ret,
 /* For 64-bit signed comparisons vs zero, we can avoid the compare
if the input does not overlap the output.  */
 if (c2 == 0 && !is_unsigned_cond(cond) && c1 != ret) {
-tcg_out_movi_imm13(s, ret, 0);
+tcg_out_movi_s13(s, ret, 0);
 tcg_out_movr(s, cond, ret, c1, 1, 1);
 } else {
 tcg_out_cmp(s, c1, c2, c2const);
-tcg_out_movi_imm13(s, ret, 0);
+tcg_out_movi_s13(s, ret, 0);
 tcg_out_movcc(s, cond, MOVCC_XCC, ret, 1, 1);
 }
 }
@@ -844,7 +844,7 @@ static void tcg_out_addsub2_i64(TCGContext *s, TCGReg rl, 
TCGReg rh,
 if (use_vis3_instructions && !is_sub) {
 /* Note that ADDXC doesn't accept immediates.  */
 if (bhconst && bh != 0) {
-   tcg_out_movi_imm13(s, TCG_REG_T2, bh);
+   tcg_out_movi_s13(s, TCG_REG_T2, bh);
bh = TCG_REG_T2;
 }
 tcg_out_arith(s, rh, ah, bh, ARITH_ADDXC);
@@ -866,7 +866,7 @@ static void tcg_out_addsub2_i64(TCGContext *s, TCGReg rl, 
TCGReg rh,
  * so the adjustment fits 12 bits.
  */
 if (bhconst) {
-tcg_out_movi_imm13(s, TCG_REG_T2, bh + (is_sub ? -1 : 1));
+tcg_out_movi_s13(s, TCG_REG_T2, bh + (is_sub ? -1 : 1));
 } else {
 tcg_out_arithi(s, TCG_REG_T2, bh, 1,
is_sub ? ARITH_SUB : ARITH_ADD);
@@ -1036,7 +1036,7 @@ static void tcg_target_qemu_prologue(TCGContext *s)
 tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
 tcg_out_arithi(s, TCG_REG_G0, TCG_REG_I7, 8, RETURN);
 /* delay slot */
-tcg_out_movi_imm13(s, TCG_REG_O0, 0);
+tcg_out_movi_s13(s, TCG_REG_O0, 0);
 
 build_trampolines(s);
 }
@@ -1430,7 +1430,7 @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
 {
 if (check_fit_ptr(a0, 13)) {
 tcg_out_arithi(s, TCG_REG_G0, TCG_REG_I7, 8, RETURN);
-tcg_out_movi_imm13(s, TCG_REG_O0, a0);
+tcg_out_movi_s13(s, TCG_REG_O0, a0);
 return;
 } else {
 intptr_t tb_diff = tcg_tbrel_diff(s, (void *)a0);
-- 
2.34.1

[PATCH v4 16/57] accel/tcg: Add aarch64 specific support in ldst_atomicity

2023-05-03 Thread Richard Henderson

We have code in atomic128.h noting that through GCC 8, there
was no support for atomic operations on __uint128.  This has
been fixed in GCC 10.  But we can still improve over any
basic compare-and-swap loop using the ldxp/stxp instructions.

Signed-off-by: Richard Henderson 
---
 accel/tcg/ldst_atomicity.c.inc | 60 --
 1 file changed, 57 insertions(+), 3 deletions(-)

diff --git a/accel/tcg/ldst_atomicity.c.inc b/accel/tcg/ldst_atomicity.c.inc
index 07bfa5c3c8..2426b09aef 100644
--- a/accel/tcg/ldst_atomicity.c.inc
+++ b/accel/tcg/ldst_atomicity.c.inc
@@ -247,7 +247,22 @@ static Int128 load_atomic16_or_exit(CPUArchState *env, 
uintptr_t ra, void *pv)
  * In system mode all guest pages are writable, and for user-only
  * we have just checked writability.  Try cmpxchg.
  */
-#if defined(CONFIG_CMPXCHG128)
+#if defined(__aarch64__)
+/* We can do better than cmpxchg for AArch64.  */
+{
+uint64_t l, h;
+uint32_t fail;
+
+/* The load must be paired with the store to guarantee not tearing. */
+asm("0: ldxp %0, %1, %3\n\t"
+"stxp %w2, %0, %1, %3\n\t"
+"cbnz %w2, 0b"
+: "=&r"(l), "=&r"(h), "=&r"(fail) : "Q"(*p));
+
+qemu_build_assert(!HOST_BIG_ENDIAN);
+return int128_make128(l, h);
+}
+#elif defined(CONFIG_CMPXCHG128)
 /* Swap 0 with 0, with the side-effect of returning the old value. */
 {
 Int128Alias r;
@@ -743,7 +758,22 @@ store_atomic16(void *pv, Int128Alias val)
 return;
 }
 #endif
-#if defined(CONFIG_CMPXCHG128)
+#if defined(__aarch64__)
+/* We can do better than cmpxchg for AArch64.  */
+{
+uint64_t l, h, t;
+
+qemu_build_assert(!HOST_BIG_ENDIAN);
+l = int128_getlo(val.s);
+h = int128_gethi(val.s);
+
+asm("0: ldxp %0, xzr, %1\n\t"
+"stxp %w0, %2, %3, %1\n\t"
+"cbnz %w0, 0b"
+: "=&r"(t), "=Q"(*(__uint128_t *)pv) : "r"(l), "r"(h));
+return;
+}
+#elif defined(CONFIG_CMPXCHG128)
 {
 __uint128_t *pu = __builtin_assume_aligned(pv, 16);
 __uint128_t o;
@@ -841,7 +871,31 @@ static void store_atom_insert_al8(uint64_t *p, uint64_t 
val, uint64_t msk)
 static void ATTRIBUTE_ATOMIC128_OPT
 store_atom_insert_al16(Int128 *ps, Int128Alias val, Int128Alias msk)
 {
-#if defined(CONFIG_ATOMIC128)
+#if defined(__aarch64__)
+/*
+ * GCC only implements __sync* primitives for int128 on aarch64.
+ * We can do better without the barriers, and integrating the
+ * arithmetic into the load-exclusive/store-conditional pair.
+ */
+uint64_t tl, th, vl, vh, ml, mh;
+uint32_t fail;
+
+qemu_build_assert(!HOST_BIG_ENDIAN);
+vl = int128_getlo(val.s);
+vh = int128_gethi(val.s);
+ml = int128_getlo(msk.s);
+mh = int128_gethi(msk.s);
+
+asm("0: ldxp %[l], %[h], %[mem]\n\t"
+"bic %[l], %[l], %[ml]\n\t"
+"bic %[h], %[h], %[mh]\n\t"
+"orr %[l], %[l], %[vl]\n\t"
+"orr %[h], %[h], %[vh]\n\t"
+"stxp %w[f], %[l], %[h], %[mem]\n\t"
+"cbnz %w[f], 0b\n"
+: [mem] "+Q"(*ps), [f] "=&r"(fail), [l] "=&r"(tl), [h] "=&r"(th)
+: [vl] "r"(vl), [vh] "r"(vh), [ml] "r"(ml), [mh] "r"(mh));
+#elif defined(CONFIG_ATOMIC128)
 __uint128_t *pu, old, new;
 
 /* With CONFIG_ATOMIC128, we can avoid the memory barriers. */
-- 
2.34.1

[PATCH v4 41/54] tcg/ppc: Convert tcg_out_qemu_{ld,st}_slow_path

2023-05-03 Thread Richard Henderson

Use tcg_out_ld_helper_args, tcg_out_ld_helper_ret,
and tcg_out_st_helper_args.

Reviewed-by: Daniel Henrique Barboza 
Signed-off-by: Richard Henderson 
---
 tcg/ppc/tcg-target.c.inc | 88 
 1 file changed, 26 insertions(+), 62 deletions(-)

diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index 7239335bdf..042136fee7 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -2003,44 +2003,38 @@ static void * const qemu_st_helpers[(MO_SIZE | 
MO_BSWAP) + 1] = {
 [MO_BEUQ] = helper_be_stq_mmu,
 };
 
+static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg)
+{
+if (arg < 0) {
+arg = TCG_REG_TMP1;
+}
+tcg_out32(s, MFSPR | RT(arg) | LR);
+return arg;
+}
+
+/*
+ * For the purposes of ppc32 sorting 4 input registers into 4 argument
+ * registers, there is an outside chance we would require 3 temps.
+ * Because of constraints, no inputs are in r3, and env will not be
+ * placed into r3 until after the sorting is done, and is thus free.
+ */
+static const TCGLdstHelperParam ldst_helper_param = {
+.ra_gen = ldst_ra_gen,
+.ntmp = 3,
+.tmp = { TCG_REG_TMP1, TCG_REG_R0, TCG_REG_R3 }
+};
+
 static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
 {
-MemOpIdx oi = lb->oi;
-MemOp opc = get_memop(oi);
-TCGReg hi, lo, arg = TCG_REG_R3;
+MemOp opc = get_memop(lb->oi);
 
 if (!reloc_pc14(lb->label_ptr[0], tcg_splitwx_to_rx(s->code_ptr))) {
 return false;
 }
 
-tcg_out_mov(s, TCG_TYPE_PTR, arg++, TCG_AREG0);
-
-lo = lb->addrlo_reg;
-hi = lb->addrhi_reg;
-if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
-arg |= (TCG_TARGET_CALL_ARG_I64 == TCG_CALL_ARG_EVEN);
-tcg_out_mov(s, TCG_TYPE_I32, arg++, hi);
-tcg_out_mov(s, TCG_TYPE_I32, arg++, lo);
-} else {
-/* If the address needed to be zero-extended, we'll have already
-   placed it in R4.  The only remaining case is 64-bit guest.  */
-tcg_out_mov(s, TCG_TYPE_TL, arg++, lo);
-}
-
-tcg_out_movi(s, TCG_TYPE_I32, arg++, oi);
-tcg_out32(s, MFSPR | RT(arg) | LR);
-
+tcg_out_ld_helper_args(s, lb, &ldst_helper_param);
 tcg_out_call_int(s, LK, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
-
-lo = lb->datalo_reg;
-hi = lb->datahi_reg;
-if (TCG_TARGET_REG_BITS == 32 && (opc & MO_SIZE) == MO_64) {
-tcg_out_mov(s, TCG_TYPE_I32, lo, TCG_REG_R4);
-tcg_out_mov(s, TCG_TYPE_I32, hi, TCG_REG_R3);
-} else {
-tcg_out_movext(s, lb->type, lo,
-   TCG_TYPE_REG, opc & MO_SSIZE, TCG_REG_R3);
-}
+tcg_out_ld_helper_ret(s, lb, false, &ldst_helper_param);
 
 tcg_out_b(s, 0, lb->raddr);
 return true;
@@ -2048,43 +2042,13 @@ static bool tcg_out_qemu_ld_slow_path(TCGContext *s, 
TCGLabelQemuLdst *lb)
 
 static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
 {
-MemOpIdx oi = lb->oi;
-MemOp opc = get_memop(oi);
-MemOp s_bits = opc & MO_SIZE;
-TCGReg hi, lo, arg = TCG_REG_R3;
+MemOp opc = get_memop(lb->oi);
 
 if (!reloc_pc14(lb->label_ptr[0], tcg_splitwx_to_rx(s->code_ptr))) {
 return false;
 }
 
-tcg_out_mov(s, TCG_TYPE_PTR, arg++, TCG_AREG0);
-
-lo = lb->addrlo_reg;
-hi = lb->addrhi_reg;
-if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
-arg |= (TCG_TARGET_CALL_ARG_I64 == TCG_CALL_ARG_EVEN);
-tcg_out_mov(s, TCG_TYPE_I32, arg++, hi);
-tcg_out_mov(s, TCG_TYPE_I32, arg++, lo);
-} else {
-/* If the address needed to be zero-extended, we'll have already
-   placed it in R4.  The only remaining case is 64-bit guest.  */
-tcg_out_mov(s, TCG_TYPE_TL, arg++, lo);
-}
-
-lo = lb->datalo_reg;
-hi = lb->datahi_reg;
-if (TCG_TARGET_REG_BITS == 32 && s_bits == MO_64) {
-arg |= (TCG_TARGET_CALL_ARG_I64 == TCG_CALL_ARG_EVEN);
-tcg_out_mov(s, TCG_TYPE_I32, arg++, hi);
-tcg_out_mov(s, TCG_TYPE_I32, arg++, lo);
-} else {
-tcg_out_movext(s, s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32,
-   arg++, lb->type, s_bits, lo);
-}
-
-tcg_out_movi(s, TCG_TYPE_I32, arg++, oi);
-tcg_out32(s, MFSPR | RT(arg) | LR);
-
+tcg_out_st_helper_args(s, lb, &ldst_helper_param);
 tcg_out_call_int(s, LK, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
 
 tcg_out_b(s, 0, lb->raddr);
-- 
2.34.1

[PATCH v4 01/57] include/exec/memop: Add bits describing atomicity

2023-05-03 Thread Richard Henderson

These bits may be used to describe the precise atomicity
requirements of the guest, which may then be used to
constrain the methods by which it may be emulated by the host.

For instance, the AArch64 LDP (32-bit) instruction changes
semantics with ARMv8.4 LSE2, from

  MO_64 | MO_ATMAX_4 | MO_ATOM_IFALIGN
  (64-bits, single-copy atomic only on 4 byte units,
   nonatomic if not aligned by 4),

to

  MO_64 | MO_ATMAX_SIZE | MO_ATOM_WITHIN16
  (64-bits, single-copy atomic within a 16 byte block)

The former may be implemented with two 4 byte loads, or
a single 8 byte load if that happens to be efficient on
the host.  The latter may not, and may also require a
helper when misaligned.

Reviewed-by: Alex Bennée 
Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 include/exec/memop.h | 36 
 1 file changed, 36 insertions(+)

diff --git a/include/exec/memop.h b/include/exec/memop.h
index 25d027434a..04e4048f0b 100644
--- a/include/exec/memop.h
+++ b/include/exec/memop.h
@@ -81,6 +81,42 @@ typedef enum MemOp {
 MO_ALIGN_32 = 5 << MO_ASHIFT,
 MO_ALIGN_64 = 6 << MO_ASHIFT,
 
+/*
+ * MO_ATOM_* describes the atomicity requirements of the operation:
+ * MO_ATOM_IFALIGN: the operation must be single-copy atomic if and
+ *only if it is aligned; if unaligned there is no atomicity.
+ * MO_ATOM_NONE: the operation has no atomicity requirements.
+ * MO_ATOM_SUBALIGN: the operation is single-copy atomic by parts
+ *by the alignment.  E.g. if the address is 0 mod 4, then each
+ *4-byte subobject is single-copy atomic.
+ *This is the atomicity of IBM Power and S390X processors.
+ * MO_ATOM_WITHIN16: the operation is single-copy atomic, even if it
+ *is unaligned, so long as it does not cross a 16-byte boundary;
+ *if it crosses a 16-byte boundary there is no atomicity.
+ *This is the atomicity of Arm FEAT_LSE2.
+ *
+ * MO_ATMAX_* describes the maximum atomicity unit required:
+ * MO_ATMAX_SIZE: the entire operation, i.e. MO_SIZE.
+ * MO_ATMAX_[248]: units of N bytes.
+ *
+ * Note the default (i.e. 0) values are single-copy atomic to the
+ * size of the operation, if aligned.  This retains the behaviour
+ * from before these were introduced.
+ */
+MO_ATOM_SHIFT= 8,
+MO_ATOM_MASK = 0x3 << MO_ATOM_SHIFT,
+MO_ATOM_IFALIGN  = 0 << MO_ATOM_SHIFT,
+MO_ATOM_NONE = 1 << MO_ATOM_SHIFT,
+MO_ATOM_SUBALIGN = 2 << MO_ATOM_SHIFT,
+MO_ATOM_WITHIN16 = 3 << MO_ATOM_SHIFT,
+
+MO_ATMAX_SHIFT = 10,
+MO_ATMAX_MASK  = 0x3 << MO_ATMAX_SHIFT,
+MO_ATMAX_SIZE  = 0 << MO_ATMAX_SHIFT,
+MO_ATMAX_2 = 1 << MO_ATMAX_SHIFT,
+MO_ATMAX_4 = 2 << MO_ATMAX_SHIFT,
+MO_ATMAX_8 = 3 << MO_ATMAX_SHIFT,
+
 /* Combinations of the above, for ease of use.  */
 MO_UB= MO_8,
 MO_UW= MO_16,
-- 
2.34.1

[PATCH v4 44/57] tcg/aarch64: Use atom_and_align_for_opc

2023-05-03 Thread Richard Henderson

Signed-off-by: Richard Henderson 
---
 tcg/aarch64/tcg-target.c.inc | 38 +++-
 1 file changed, 20 insertions(+), 18 deletions(-)

diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
index 8e5f3d3688..1d6d382edd 100644
--- a/tcg/aarch64/tcg-target.c.inc
+++ b/tcg/aarch64/tcg-target.c.inc
@@ -1593,6 +1593,8 @@ typedef struct {
 TCGReg base;
 TCGReg index;
 TCGType index_ext;
+MemOp align;
+MemOp atom;
 } HostAddress;
 
 bool tcg_target_has_memory_bswap(MemOp memop)
@@ -1646,8 +1648,14 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext 
*s, HostAddress *h,
 TCGType addr_type = TARGET_LONG_BITS == 64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
 TCGLabelQemuLdst *ldst = NULL;
 MemOp opc = get_memop(oi);
-unsigned a_bits = get_alignment_bits(opc);
-unsigned a_mask = (1u << a_bits) - 1;
+MemOp atom_u;
+unsigned a_mask;
+
+h->align = atom_and_align_for_opc(s, &h->atom, &atom_u, opc,
+  have_lse2 ? MO_ATOM_WITHIN16
+: MO_ATOM_IFALIGN,
+  false);
+a_mask = (1 << h->align) - 1;
 
 #ifdef CONFIG_SOFTMMU
 unsigned s_bits = opc & MO_SIZE;
@@ -1693,7 +1701,7 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, 
HostAddress *h,
  * bits within the address.  For unaligned access, we check that we don't
  * cross pages using the address of the last byte of the access.
  */
-if (a_bits >= s_bits) {
+if (a_mask >= s_mask) {
 x3 = addr_reg;
 } else {
 tcg_out_insn(s, 3401, ADDI, TARGET_LONG_BITS == 64,
@@ -1713,11 +1721,9 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext 
*s, HostAddress *h,
 ldst->label_ptr[0] = s->code_ptr;
 tcg_out_insn(s, 3202, B_C, TCG_COND_NE, 0);
 
-*h = (HostAddress){
-.base = TCG_REG_X1,
-.index = addr_reg,
-.index_ext = addr_type
-};
+h->base = TCG_REG_X1,
+h->index = addr_reg;
+h->index_ext = addr_type;
 #else
 if (a_mask) {
 ldst = new_ldst_label(s);
@@ -1735,17 +1741,13 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext 
*s, HostAddress *h,
 }
 
 if (USE_GUEST_BASE) {
-*h = (HostAddress){
-.base = TCG_REG_GUEST_BASE,
-.index = addr_reg,
-.index_ext = addr_type
-};
+h->base = TCG_REG_GUEST_BASE;
+h->index = addr_reg;
+h->index_ext = addr_type;
 } else {
-*h = (HostAddress){
-.base = addr_reg,
-.index = TCG_REG_XZR,
-.index_ext = TCG_TYPE_I64
-};
+h->base = addr_reg;
+h->index = TCG_REG_XZR;
+h->index_ext = TCG_TYPE_I64;
 }
 #endif
 
-- 
2.34.1

[PATCH v4 42/57] tcg: Introduce atom_and_align_for_opc

2023-05-03 Thread Richard Henderson

Examine MemOp for atomicity and alignment, adjusting alignment
as required to implement atomicity on the host.

Signed-off-by: Richard Henderson 
---
 tcg/tcg.c | 69 +++
 1 file changed, 69 insertions(+)

diff --git a/tcg/tcg.c b/tcg/tcg.c
index 3905d3041c..2422da64ac 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -220,6 +220,11 @@ static void * const qemu_st_helpers[MO_SIZE + 1] 
__attribute__((unused)) = {
 #endif
 };
 
+static MemOp atom_and_align_for_opc(TCGContext *s, MemOp *p_atom_a,
+MemOp *p_atom_u, MemOp opc,
+MemOp host_atom, bool allow_two_ops)
+__attribute__((unused));
+
 TCGContext tcg_init_ctx;
 __thread TCGContext *tcg_ctx;
 
@@ -5123,6 +5128,70 @@ static void tcg_reg_alloc_call(TCGContext *s, TCGOp *op)
 }
 }
 
+/*
+ * Return the alignment and atomicity to use for the inline fast path
+ * for the given memory operation.  The alignment may be larger than
+ * that specified in @opc, and the correct alignment will be diagnosed
+ * by the slow path helper.
+ */
+static MemOp atom_and_align_for_opc(TCGContext *s, MemOp *p_atom_a,
+MemOp *p_atom_u, MemOp opc,
+MemOp host_atom, bool allow_two_ops)
+{
+MemOp align = get_alignment_bits(opc);
+MemOp atom, atmax, atmin, size = opc & MO_SIZE;
+
+/* When serialized, no further atomicity required.  */
+if (s->gen_tb->cflags & CF_PARALLEL) {
+atom = opc & MO_ATOM_MASK;
+} else {
+atom = MO_ATOM_NONE;
+}
+
+atmax = opc & MO_ATMAX_MASK;
+if (atmax == MO_ATMAX_SIZE) {
+atmax = size;
+} else {
+atmax = atmax >> MO_ATMAX_SHIFT;
+}
+
+switch (atom) {
+case MO_ATOM_NONE:
+/* The operation requires no specific atomicity. */
+atmax = atmin = MO_8;
+break;
+case MO_ATOM_IFALIGN:
+/* If unaligned, the subobjects are bytes. */
+atmin = MO_8;
+break;
+case MO_ATOM_WITHIN16:
+/* If unaligned, there are subobjects if atmax < size. */
+atmin = (atmax < size ? atmax : MO_8);
+atmax = size;
+break;
+case MO_ATOM_SUBALIGN:
+/* If unaligned but not odd, there are subobjects up to atmax - 1. */
+atmin = (atmax == MO_8 ? MO_8 : atmax - 1);
+break;
+default:
+g_assert_not_reached();
+}
+
+/*
+ * If there are subobjects, and the host model does not match, then we
+ * need to raise the initial alignment check.  If the backend is prepared
+ * to double-check alignment and issue two half size ops, we need not
+ * raise initial alignment beyond half.
+ */
+if (atmin > MO_8 && host_atom != atom) {
+align = MAX(align, size - allow_two_ops);
+}
+
+*p_atom_a = atmax;
+*p_atom_u = atmin;
+return align;
+}
+
 /*
  * Similarly for qemu_ld/st slow path helpers.
  * We must re-implement tcg_gen_callN and tcg_reg_alloc_call simultaneously,
-- 
2.34.1

[PATCH 09/84] tcg: Reduce copies for plugin_gen_mem_callbacks

2023-05-03 Thread Richard Henderson

We only need to make copies for loads, when the destination
overlaps the address.  For now, only eliminate the copy for
stores and 128-bit loads.

Rename plugin_prep_mem_callbacks to plugin_maybe_preserve_addr,
returning NULL if no copy is made.

Signed-off-by: Richard Henderson 
---
 tcg/tcg-op-ldst.c | 38 --
 1 file changed, 20 insertions(+), 18 deletions(-)

diff --git a/tcg/tcg-op-ldst.c b/tcg/tcg-op-ldst.c
index 17fe35b93c..cbd85f793c 100644
--- a/tcg/tcg-op-ldst.c
+++ b/tcg/tcg-op-ldst.c
@@ -114,7 +114,8 @@ static void tcg_gen_req_mo(TCGBar type)
 }
 }
 
-static inline TCGv plugin_prep_mem_callbacks(TCGv vaddr)
+/* Only required for loads, where value might overlap addr. */
+static TCGv plugin_maybe_preserve_addr(TCGv vaddr)
 {
 #ifdef CONFIG_PLUGIN
 if (tcg_ctx->plugin_insn != NULL) {
@@ -124,17 +125,20 @@ static inline TCGv plugin_prep_mem_callbacks(TCGv vaddr)
 return temp;
 }
 #endif
-return vaddr;
+return NULL;
 }
 
-static void plugin_gen_mem_callbacks(TCGv vaddr, MemOpIdx oi,
- enum qemu_plugin_mem_rw rw)
+static void
+plugin_gen_mem_callbacks(TCGv copy_addr, TCGv orig_addr, MemOpIdx oi,
+ enum qemu_plugin_mem_rw rw)
 {
 #ifdef CONFIG_PLUGIN
 if (tcg_ctx->plugin_insn != NULL) {
 qemu_plugin_meminfo_t info = make_plugin_meminfo(oi, rw);
-plugin_gen_empty_mem_callback(vaddr, info);
-tcg_temp_free(vaddr);
+plugin_gen_empty_mem_callback(copy_addr ? : orig_addr, info);
+if (copy_addr) {
+tcg_temp_free(copy_addr);
+}
 }
 #endif
 }
@@ -143,6 +147,7 @@ void tcg_gen_qemu_ld_i32(TCGv_i32 val, TCGv addr, TCGArg 
idx, MemOp memop)
 {
 MemOp orig_memop;
 MemOpIdx oi;
+TCGv copy_addr;
 
 tcg_gen_req_mo(TCG_MO_LD_LD | TCG_MO_ST_LD);
 memop = tcg_canonicalize_memop(memop, 0, 0);
@@ -157,9 +162,9 @@ void tcg_gen_qemu_ld_i32(TCGv_i32 val, TCGv addr, TCGArg 
idx, MemOp memop)
 }
 }
 
-addr = plugin_prep_mem_callbacks(addr);
+copy_addr = plugin_maybe_preserve_addr(addr);
 gen_ldst_i32(INDEX_op_qemu_ld_i32, val, addr, memop, idx);
-plugin_gen_mem_callbacks(addr, oi, QEMU_PLUGIN_MEM_R);
+plugin_gen_mem_callbacks(copy_addr, addr, oi, QEMU_PLUGIN_MEM_R);
 
 if ((orig_memop ^ memop) & MO_BSWAP) {
 switch (orig_memop & MO_SIZE) {
@@ -202,13 +207,12 @@ void tcg_gen_qemu_st_i32(TCGv_i32 val, TCGv addr, TCGArg 
idx, MemOp memop)
 memop &= ~MO_BSWAP;
 }
 
-addr = plugin_prep_mem_callbacks(addr);
 if (TCG_TARGET_HAS_qemu_st8_i32 && (memop & MO_SIZE) == MO_8) {
 gen_ldst_i32(INDEX_op_qemu_st8_i32, val, addr, memop, idx);
 } else {
 gen_ldst_i32(INDEX_op_qemu_st_i32, val, addr, memop, idx);
 }
-plugin_gen_mem_callbacks(addr, oi, QEMU_PLUGIN_MEM_W);
+plugin_gen_mem_callbacks(NULL, addr, oi, QEMU_PLUGIN_MEM_W);
 
 if (swap) {
 tcg_temp_free_i32(swap);
@@ -219,6 +223,7 @@ void tcg_gen_qemu_ld_i64(TCGv_i64 val, TCGv addr, TCGArg 
idx, MemOp memop)
 {
 MemOp orig_memop;
 MemOpIdx oi;
+TCGv copy_addr;
 
 if (TCG_TARGET_REG_BITS == 32 && (memop & MO_SIZE) < MO_64) {
 tcg_gen_qemu_ld_i32(TCGV_LOW(val), addr, idx, memop);
@@ -243,9 +248,9 @@ void tcg_gen_qemu_ld_i64(TCGv_i64 val, TCGv addr, TCGArg 
idx, MemOp memop)
 }
 }
 
-addr = plugin_prep_mem_callbacks(addr);
+copy_addr = plugin_maybe_preserve_addr(addr);
 gen_ldst_i64(INDEX_op_qemu_ld_i64, val, addr, memop, idx);
-plugin_gen_mem_callbacks(addr, oi, QEMU_PLUGIN_MEM_R);
+plugin_gen_mem_callbacks(copy_addr, addr, oi, QEMU_PLUGIN_MEM_R);
 
 if ((orig_memop ^ memop) & MO_BSWAP) {
 int flags = (orig_memop & MO_SIGN
@@ -300,9 +305,8 @@ void tcg_gen_qemu_st_i64(TCGv_i64 val, TCGv addr, TCGArg 
idx, MemOp memop)
 memop &= ~MO_BSWAP;
 }
 
-addr = plugin_prep_mem_callbacks(addr);
 gen_ldst_i64(INDEX_op_qemu_st_i64, val, addr, memop, idx);
-plugin_gen_mem_callbacks(addr, oi, QEMU_PLUGIN_MEM_W);
+plugin_gen_mem_callbacks(NULL, addr, oi, QEMU_PLUGIN_MEM_W);
 
 if (swap) {
 tcg_temp_free_i64(swap);
@@ -430,7 +434,6 @@ void tcg_gen_qemu_ld_i128(TCGv_i128 val, TCGv addr, TCGArg 
idx, MemOp memop)
 tcg_debug_assert((memop & MO_SIGN) == 0);
 
 tcg_gen_req_mo(TCG_MO_LD_LD | TCG_MO_ST_LD);
-addr = plugin_prep_mem_callbacks(addr);
 
 /* TODO: For now, force 32-bit hosts to use the helper. */
 if (TCG_TARGET_HAS_qemu_ldst_i128 && TCG_TARGET_REG_BITS == 64) {
@@ -501,7 +504,7 @@ void tcg_gen_qemu_ld_i128(TCGv_i128 val, TCGv addr, TCGArg 
idx, MemOp memop)
 maybe_free_addr64(a64);
 }
 
-plugin_gen_mem_callbacks(addr, oi, QEMU_PLUGIN_MEM_R);
+plugin_gen_mem_callbacks(NULL, addr, oi, QEMU_PLUGIN_MEM_R);
 }
 
 void tcg_gen_qemu_st_i128(TCGv_i128 val, TCGv addr, TCGArg idx, MemOp memop)
@@ -512,7 +515,6 @@ void tcg_gen_qemu_st_i128(TCGv_i128 val, T

[PATCH v4 39/57] tcg: Introduce tcg_target_has_memory_bswap

2023-05-03 Thread Richard Henderson

Replace the unparameterized TCG_TARGET_HAS_MEMORY_BSWAP macro
with a function with a memop argument.

Signed-off-by: Richard Henderson 
---
 tcg/aarch64/tcg-target.h |  1 -
 tcg/arm/tcg-target.h |  1 -
 tcg/i386/tcg-target.h|  3 ---
 tcg/loongarch64/tcg-target.h |  2 --
 tcg/mips/tcg-target.h|  2 --
 tcg/ppc/tcg-target.h |  1 -
 tcg/riscv/tcg-target.h   |  2 --
 tcg/s390x/tcg-target.h   |  2 --
 tcg/sparc64/tcg-target.h |  1 -
 tcg/tcg-internal.h   |  2 ++
 tcg/tci/tcg-target.h |  2 --
 tcg/tcg-op.c | 20 +++-
 tcg/aarch64/tcg-target.c.inc |  5 +
 tcg/arm/tcg-target.c.inc |  5 +
 tcg/i386/tcg-target.c.inc|  5 +
 tcg/loongarch64/tcg-target.c.inc |  5 +
 tcg/mips/tcg-target.c.inc|  5 +
 tcg/ppc/tcg-target.c.inc |  5 +
 tcg/riscv/tcg-target.c.inc   |  5 +
 tcg/s390x/tcg-target.c.inc   |  5 +
 tcg/sparc64/tcg-target.c.inc |  5 +
 tcg/tci/tcg-target.c.inc |  5 +
 22 files changed, 63 insertions(+), 26 deletions(-)

diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
index 3c0b0d312d..378e01d9d8 100644
--- a/tcg/aarch64/tcg-target.h
+++ b/tcg/aarch64/tcg-target.h
@@ -154,7 +154,6 @@ extern bool have_lse2;
 #define TCG_TARGET_HAS_cmpsel_vec   0
 
 #define TCG_TARGET_DEFAULT_MO (0)
-#define TCG_TARGET_HAS_MEMORY_BSWAP 0
 #define TCG_TARGET_NEED_LDST_LABELS
 #define TCG_TARGET_NEED_POOL_LABELS
 
diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
index def2a189e6..4c2d3332d5 100644
--- a/tcg/arm/tcg-target.h
+++ b/tcg/arm/tcg-target.h
@@ -150,7 +150,6 @@ extern bool use_neon_instructions;
 #define TCG_TARGET_HAS_cmpsel_vec   0
 
 #define TCG_TARGET_DEFAULT_MO (0)
-#define TCG_TARGET_HAS_MEMORY_BSWAP 0
 #define TCG_TARGET_NEED_LDST_LABELS
 #define TCG_TARGET_NEED_POOL_LABELS
 
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index 0421776cb8..8fe6958abd 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -240,9 +240,6 @@ extern bool have_atomic16;
 #include "tcg/tcg-mo.h"
 
 #define TCG_TARGET_DEFAULT_MO (TCG_MO_ALL & ~TCG_MO_ST_LD)
-
-#define TCG_TARGET_HAS_MEMORY_BSWAP  have_movbe
-
 #define TCG_TARGET_NEED_LDST_LABELS
 #define TCG_TARGET_NEED_POOL_LABELS
 
diff --git a/tcg/loongarch64/tcg-target.h b/tcg/loongarch64/tcg-target.h
index 17b8193aa5..75c3d80ed2 100644
--- a/tcg/loongarch64/tcg-target.h
+++ b/tcg/loongarch64/tcg-target.h
@@ -173,6 +173,4 @@ typedef enum {
 
 #define TCG_TARGET_NEED_LDST_LABELS
 
-#define TCG_TARGET_HAS_MEMORY_BSWAP 0
-
 #endif /* LOONGARCH_TCG_TARGET_H */
diff --git a/tcg/mips/tcg-target.h b/tcg/mips/tcg-target.h
index 42bd7fff01..47088af9cb 100644
--- a/tcg/mips/tcg-target.h
+++ b/tcg/mips/tcg-target.h
@@ -205,8 +205,6 @@ extern bool use_mips32r2_instructions;
 #endif
 
 #define TCG_TARGET_DEFAULT_MO   0
-#define TCG_TARGET_HAS_MEMORY_BSWAP 0
-
 #define TCG_TARGET_NEED_LDST_LABELS
 
 #endif
diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
index af81c5a57f..d55f0266bb 100644
--- a/tcg/ppc/tcg-target.h
+++ b/tcg/ppc/tcg-target.h
@@ -179,7 +179,6 @@ extern bool have_vsx;
 #define TCG_TARGET_HAS_cmpsel_vec   0
 
 #define TCG_TARGET_DEFAULT_MO (0)
-#define TCG_TARGET_HAS_MEMORY_BSWAP 1
 #define TCG_TARGET_NEED_LDST_LABELS
 #define TCG_TARGET_NEED_POOL_LABELS
 
diff --git a/tcg/riscv/tcg-target.h b/tcg/riscv/tcg-target.h
index dddf2486c1..dece3b3c27 100644
--- a/tcg/riscv/tcg-target.h
+++ b/tcg/riscv/tcg-target.h
@@ -168,6 +168,4 @@ typedef enum {
 #define TCG_TARGET_NEED_LDST_LABELS
 #define TCG_TARGET_NEED_POOL_LABELS
 
-#define TCG_TARGET_HAS_MEMORY_BSWAP 0
-
 #endif
diff --git a/tcg/s390x/tcg-target.h b/tcg/s390x/tcg-target.h
index a05b473117..fe05680124 100644
--- a/tcg/s390x/tcg-target.h
+++ b/tcg/s390x/tcg-target.h
@@ -172,8 +172,6 @@ extern uint64_t s390_facilities[3];
 #define TCG_TARGET_CALL_ARG_I128TCG_CALL_ARG_BY_REF
 #define TCG_TARGET_CALL_RET_I128TCG_CALL_RET_BY_REF
 
-#define TCG_TARGET_HAS_MEMORY_BSWAP   1
-
 #define TCG_TARGET_DEFAULT_MO (TCG_MO_ALL & ~TCG_MO_ST_LD)
 #define TCG_TARGET_NEED_LDST_LABELS
 #define TCG_TARGET_NEED_POOL_LABELS
diff --git a/tcg/sparc64/tcg-target.h b/tcg/sparc64/tcg-target.h
index 7434cc99d4..f6cd86975a 100644
--- a/tcg/sparc64/tcg-target.h
+++ b/tcg/sparc64/tcg-target.h
@@ -154,7 +154,6 @@ extern bool use_vis3_instructions;
 #define TCG_AREG0 TCG_REG_I0
 
 #define TCG_TARGET_DEFAULT_MO (0)
-#define TCG_TARGET_HAS_MEMORY_BSWAP 1
 #define TCG_TARGET_NEED_LDST_LABELS
 #define TCG_TARGET_NEED_POOL_LABELS
 
diff --git a/tcg/tcg-internal.h b/tcg/tcg-internal.h
index 0f1ba01a9a..67b698bd5c 100644
--- a/tcg/tcg-internal.h
+++ b/tcg/tcg-internal.h
@@ -126,4 +126,6 @@ static inline TCGv_i64 TCGV128_HIGH(TCGv_i128 t)
 return temp_tcgv_i64(tcgv_i128_temp(t) + o);
 }
 
+bool tcg_target_has_memory_bswap(MemOp memop)

[PATCH v4 40/57] tcg: Add INDEX_op_qemu_{ld,st}_i128

2023-05-03 Thread Richard Henderson

Add opcodes for backend support for 128-bit memory operations.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 include/tcg/tcg-opc.h|  8 +
 tcg/aarch64/tcg-target.h |  2 ++
 tcg/arm/tcg-target.h |  2 ++
 tcg/i386/tcg-target.h|  2 ++
 tcg/loongarch64/tcg-target.h |  1 +
 tcg/mips/tcg-target.h|  2 ++
 tcg/ppc/tcg-target.h |  2 ++
 tcg/riscv/tcg-target.h   |  2 ++
 tcg/s390x/tcg-target.h   |  2 ++
 tcg/sparc64/tcg-target.h |  2 ++
 tcg/tci/tcg-target.h |  2 ++
 tcg/tcg-op.c | 69 
 tcg/tcg.c|  4 +++
 docs/devel/tcg-ops.rst   | 11 +++---
 14 files changed, 101 insertions(+), 10 deletions(-)

diff --git a/include/tcg/tcg-opc.h b/include/tcg/tcg-opc.h
index dd444734d9..94cf7c5d6a 100644
--- a/include/tcg/tcg-opc.h
+++ b/include/tcg/tcg-opc.h
@@ -213,6 +213,14 @@ DEF(qemu_st8_i32, 0, TLADDR_ARGS + 1, 1,
 TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS |
 IMPL(TCG_TARGET_HAS_qemu_st8_i32))
 
+/* Only for 64-bit hosts at the moment. */
+DEF(qemu_ld_i128, 2, 1, 1,
+TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS | TCG_OPF_64BIT |
+IMPL(TCG_TARGET_HAS_qemu_ldst_i128))
+DEF(qemu_st_i128, 0, 3, 1,
+TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS | TCG_OPF_64BIT |
+IMPL(TCG_TARGET_HAS_qemu_ldst_i128))
+
 /* Host vector support.  */
 
 #define IMPLVEC  TCG_OPF_VECTOR | IMPL(TCG_TARGET_MAYBE_vec)
diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
index 378e01d9d8..74ee2ed255 100644
--- a/tcg/aarch64/tcg-target.h
+++ b/tcg/aarch64/tcg-target.h
@@ -129,6 +129,8 @@ extern bool have_lse2;
 #define TCG_TARGET_HAS_muluh_i641
 #define TCG_TARGET_HAS_mulsh_i641
 
+#define TCG_TARGET_HAS_qemu_ldst_i128   0
+
 #define TCG_TARGET_HAS_v64  1
 #define TCG_TARGET_HAS_v128 1
 #define TCG_TARGET_HAS_v256 0
diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
index 4c2d3332d5..65efc538f4 100644
--- a/tcg/arm/tcg-target.h
+++ b/tcg/arm/tcg-target.h
@@ -125,6 +125,8 @@ extern bool use_neon_instructions;
 #define TCG_TARGET_HAS_rem_i32  0
 #define TCG_TARGET_HAS_qemu_st8_i32 0
 
+#define TCG_TARGET_HAS_qemu_ldst_i128   0
+
 #define TCG_TARGET_HAS_v64  use_neon_instructions
 #define TCG_TARGET_HAS_v128 use_neon_instructions
 #define TCG_TARGET_HAS_v256 0
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index 8fe6958abd..943af6775e 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -194,6 +194,8 @@ extern bool have_atomic16;
 #define TCG_TARGET_HAS_qemu_st8_i32 1
 #endif
 
+#define TCG_TARGET_HAS_qemu_ldst_i128   0
+
 /* We do not support older SSE systems, only beginning with AVX1.  */
 #define TCG_TARGET_HAS_v64  have_avx1
 #define TCG_TARGET_HAS_v128 have_avx1
diff --git a/tcg/loongarch64/tcg-target.h b/tcg/loongarch64/tcg-target.h
index 75c3d80ed2..482901ac15 100644
--- a/tcg/loongarch64/tcg-target.h
+++ b/tcg/loongarch64/tcg-target.h
@@ -168,6 +168,7 @@ typedef enum {
 #define TCG_TARGET_HAS_muls2_i640
 #define TCG_TARGET_HAS_muluh_i641
 #define TCG_TARGET_HAS_mulsh_i641
+#define TCG_TARGET_HAS_qemu_ldst_i128   0
 
 #define TCG_TARGET_DEFAULT_MO (0)
 
diff --git a/tcg/mips/tcg-target.h b/tcg/mips/tcg-target.h
index 47088af9cb..7277a117ef 100644
--- a/tcg/mips/tcg-target.h
+++ b/tcg/mips/tcg-target.h
@@ -204,6 +204,8 @@ extern bool use_mips32r2_instructions;
 #define TCG_TARGET_HAS_ext16u_i64   0 /* andi rt, rs, 0xffff */
 #endif
 
+#define TCG_TARGET_HAS_qemu_ldst_i128   0
+
 #define TCG_TARGET_DEFAULT_MO   0
 #define TCG_TARGET_NEED_LDST_LABELS
 
diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
index d55f0266bb..0914380bd7 100644
--- a/tcg/ppc/tcg-target.h
+++ b/tcg/ppc/tcg-target.h
@@ -149,6 +149,8 @@ extern bool have_vsx;
 #define TCG_TARGET_HAS_mulsh_i641
 #endif
 
+#define TCG_TARGET_HAS_qemu_ldst_i128   0
+
 /*
  * While technically Altivec could support V64, it has no 64-bit store
  * instruction and substituting two 32-bit stores makes the generated
diff --git a/tcg/riscv/tcg-target.h b/tcg/riscv/tcg-target.h
index dece3b3c27..494c986b49 100644
--- a/tcg/riscv/tcg-target.h
+++ b/tcg/riscv/tcg-target.h
@@ -163,6 +163,8 @@ typedef enum {
 #define TCG_TARGET_HAS_muluh_i641
 #define TCG_TARGET_HAS_mulsh_i641
 
+#define TCG_TARGET_HAS_qemu_ldst_i128   0
+
 #define TCG_TARGET_DEFAULT_MO (0)
 
 #define TCG_TARGET_NEED_LDST_LABELS
diff --git a/tcg/s390x/tcg-target.h b/tcg/s390x/tcg-target.h
index fe05680124..170007bea5 100644
--- a/tcg/s390x/tcg-target.h
+++ b/tcg/s390x/tcg-target.h
@@ -140,6 +140,8 @@ extern uint64_t s390_facilities[3];
 #define TCG_TARGET_HAS_muluh_i64  0
 #define TCG_TARGET_HAS_mulsh_i64  0
 
+#define TCG_TARGET_HAS_qemu_ldst_i128 0
+
 #define TCG_TARGET_HAS_v64HAVE_FACILITY(VECTOR)

[PATCH 09/84] tcg: Reduce copies for plugin_gen_mem_callbacks

2023-05-03 Thread Richard Henderson

We only need to make copies for loads, when the destination
overlaps the address.  For now, only eliminate the copy for
stores and 128-bit loads.

Rename plugin_prep_mem_callbacks to plugin_maybe_preserve_addr,
returning NULL if no copy is made.

Signed-off-by: Richard Henderson 
---
 tcg/tcg-op-ldst.c | 38 --
 1 file changed, 20 insertions(+), 18 deletions(-)

diff --git a/tcg/tcg-op-ldst.c b/tcg/tcg-op-ldst.c
index 17fe35b93c..cbd85f793c 100644
--- a/tcg/tcg-op-ldst.c
+++ b/tcg/tcg-op-ldst.c
@@ -114,7 +114,8 @@ static void tcg_gen_req_mo(TCGBar type)
 }
 }
 
-static inline TCGv plugin_prep_mem_callbacks(TCGv vaddr)
+/* Only required for loads, where value might overlap addr. */
+static TCGv plugin_maybe_preserve_addr(TCGv vaddr)
 {
 #ifdef CONFIG_PLUGIN
 if (tcg_ctx->plugin_insn != NULL) {
@@ -124,17 +125,20 @@ static inline TCGv plugin_prep_mem_callbacks(TCGv vaddr)
 return temp;
 }
 #endif
-return vaddr;
+return NULL;
 }
 
-static void plugin_gen_mem_callbacks(TCGv vaddr, MemOpIdx oi,
- enum qemu_plugin_mem_rw rw)
+static void
+plugin_gen_mem_callbacks(TCGv copy_addr, TCGv orig_addr, MemOpIdx oi,
+ enum qemu_plugin_mem_rw rw)
 {
 #ifdef CONFIG_PLUGIN
 if (tcg_ctx->plugin_insn != NULL) {
 qemu_plugin_meminfo_t info = make_plugin_meminfo(oi, rw);
-plugin_gen_empty_mem_callback(vaddr, info);
-tcg_temp_free(vaddr);
+plugin_gen_empty_mem_callback(copy_addr ? : orig_addr, info);
+if (copy_addr) {
+tcg_temp_free(copy_addr);
+}
 }
 #endif
 }
@@ -143,6 +147,7 @@ void tcg_gen_qemu_ld_i32(TCGv_i32 val, TCGv addr, TCGArg 
idx, MemOp memop)
 {
 MemOp orig_memop;
 MemOpIdx oi;
+TCGv copy_addr;
 
 tcg_gen_req_mo(TCG_MO_LD_LD | TCG_MO_ST_LD);
 memop = tcg_canonicalize_memop(memop, 0, 0);
@@ -157,9 +162,9 @@ void tcg_gen_qemu_ld_i32(TCGv_i32 val, TCGv addr, TCGArg 
idx, MemOp memop)
 }
 }
 
-addr = plugin_prep_mem_callbacks(addr);
+copy_addr = plugin_maybe_preserve_addr(addr);
 gen_ldst_i32(INDEX_op_qemu_ld_i32, val, addr, memop, idx);
-plugin_gen_mem_callbacks(addr, oi, QEMU_PLUGIN_MEM_R);
+plugin_gen_mem_callbacks(copy_addr, addr, oi, QEMU_PLUGIN_MEM_R);
 
 if ((orig_memop ^ memop) & MO_BSWAP) {
 switch (orig_memop & MO_SIZE) {
@@ -202,13 +207,12 @@ void tcg_gen_qemu_st_i32(TCGv_i32 val, TCGv addr, TCGArg 
idx, MemOp memop)
 memop &= ~MO_BSWAP;
 }
 
-addr = plugin_prep_mem_callbacks(addr);
 if (TCG_TARGET_HAS_qemu_st8_i32 && (memop & MO_SIZE) == MO_8) {
 gen_ldst_i32(INDEX_op_qemu_st8_i32, val, addr, memop, idx);
 } else {
 gen_ldst_i32(INDEX_op_qemu_st_i32, val, addr, memop, idx);
 }
-plugin_gen_mem_callbacks(addr, oi, QEMU_PLUGIN_MEM_W);
+plugin_gen_mem_callbacks(NULL, addr, oi, QEMU_PLUGIN_MEM_W);
 
 if (swap) {
 tcg_temp_free_i32(swap);
@@ -219,6 +223,7 @@ void tcg_gen_qemu_ld_i64(TCGv_i64 val, TCGv addr, TCGArg 
idx, MemOp memop)
 {
 MemOp orig_memop;
 MemOpIdx oi;
+TCGv copy_addr;
 
 if (TCG_TARGET_REG_BITS == 32 && (memop & MO_SIZE) < MO_64) {
 tcg_gen_qemu_ld_i32(TCGV_LOW(val), addr, idx, memop);
@@ -243,9 +248,9 @@ void tcg_gen_qemu_ld_i64(TCGv_i64 val, TCGv addr, TCGArg 
idx, MemOp memop)
 }
 }
 
-addr = plugin_prep_mem_callbacks(addr);
+copy_addr = plugin_maybe_preserve_addr(addr);
 gen_ldst_i64(INDEX_op_qemu_ld_i64, val, addr, memop, idx);
-plugin_gen_mem_callbacks(addr, oi, QEMU_PLUGIN_MEM_R);
+plugin_gen_mem_callbacks(copy_addr, addr, oi, QEMU_PLUGIN_MEM_R);
 
 if ((orig_memop ^ memop) & MO_BSWAP) {
 int flags = (orig_memop & MO_SIGN
@@ -300,9 +305,8 @@ void tcg_gen_qemu_st_i64(TCGv_i64 val, TCGv addr, TCGArg 
idx, MemOp memop)
 memop &= ~MO_BSWAP;
 }
 
-addr = plugin_prep_mem_callbacks(addr);
 gen_ldst_i64(INDEX_op_qemu_st_i64, val, addr, memop, idx);
-plugin_gen_mem_callbacks(addr, oi, QEMU_PLUGIN_MEM_W);
+plugin_gen_mem_callbacks(NULL, addr, oi, QEMU_PLUGIN_MEM_W);
 
 if (swap) {
 tcg_temp_free_i64(swap);
@@ -430,7 +434,6 @@ void tcg_gen_qemu_ld_i128(TCGv_i128 val, TCGv addr, TCGArg 
idx, MemOp memop)
 tcg_debug_assert((memop & MO_SIGN) == 0);
 
 tcg_gen_req_mo(TCG_MO_LD_LD | TCG_MO_ST_LD);
-addr = plugin_prep_mem_callbacks(addr);
 
 /* TODO: For now, force 32-bit hosts to use the helper. */
 if (TCG_TARGET_HAS_qemu_ldst_i128 && TCG_TARGET_REG_BITS == 64) {
@@ -501,7 +504,7 @@ void tcg_gen_qemu_ld_i128(TCGv_i128 val, TCGv addr, TCGArg 
idx, MemOp memop)
 maybe_free_addr64(a64);
 }
 
-plugin_gen_mem_callbacks(addr, oi, QEMU_PLUGIN_MEM_R);
+plugin_gen_mem_callbacks(NULL, addr, oi, QEMU_PLUGIN_MEM_R);
 }
 
 void tcg_gen_qemu_st_i128(TCGv_i128 val, TCGv addr, TCGArg idx, MemOp memop)
@@ -512,7 +515,6 @@ void tcg_gen_qemu_st_i128(TCGv_i128 val, T

[PATCH 03/84] accel/tcg: Widen tcg-ldst.h addresses to uint64_t

2023-05-03 Thread Richard Henderson

Always pass the target address as uint64_t.
Adjust tcg_out_{ld,st}_helper_args to match.

Signed-off-by: Richard Henderson 
---
 include/tcg/tcg-ldst.h | 26 +-
 accel/tcg/cputlb.c | 26 +-
 accel/tcg/user-exec.c  | 26 +-
 tcg/tcg.c  | 62 --
 4 files changed, 87 insertions(+), 53 deletions(-)

diff --git a/include/tcg/tcg-ldst.h b/include/tcg/tcg-ldst.h
index 7dd57013e9..6ccfe9131d 100644
--- a/include/tcg/tcg-ldst.h
+++ b/include/tcg/tcg-ldst.h
@@ -26,38 +26,38 @@
 #define TCG_LDST_H
 
 /* Value zero-extended to tcg register size.  */
-tcg_target_ulong helper_ldub_mmu(CPUArchState *env, target_ulong addr,
+tcg_target_ulong helper_ldub_mmu(CPUArchState *env, uint64_t addr,
  MemOpIdx oi, uintptr_t retaddr);
-tcg_target_ulong helper_lduw_mmu(CPUArchState *env, target_ulong addr,
+tcg_target_ulong helper_lduw_mmu(CPUArchState *env, uint64_t addr,
  MemOpIdx oi, uintptr_t retaddr);
-tcg_target_ulong helper_ldul_mmu(CPUArchState *env, target_ulong addr,
+tcg_target_ulong helper_ldul_mmu(CPUArchState *env, uint64_t addr,
  MemOpIdx oi, uintptr_t retaddr);
-uint64_t helper_ldq_mmu(CPUArchState *env, target_ulong addr,
+uint64_t helper_ldq_mmu(CPUArchState *env, uint64_t addr,
 MemOpIdx oi, uintptr_t retaddr);
-Int128 helper_ld16_mmu(CPUArchState *env, target_ulong addr,
+Int128 helper_ld16_mmu(CPUArchState *env, uint64_t addr,
MemOpIdx oi, uintptr_t retaddr);
 
 /* Value sign-extended to tcg register size.  */
-tcg_target_ulong helper_ldsb_mmu(CPUArchState *env, target_ulong addr,
+tcg_target_ulong helper_ldsb_mmu(CPUArchState *env, uint64_t addr,
  MemOpIdx oi, uintptr_t retaddr);
-tcg_target_ulong helper_ldsw_mmu(CPUArchState *env, target_ulong addr,
+tcg_target_ulong helper_ldsw_mmu(CPUArchState *env, uint64_t addr,
  MemOpIdx oi, uintptr_t retaddr);
-tcg_target_ulong helper_ldsl_mmu(CPUArchState *env, target_ulong addr,
+tcg_target_ulong helper_ldsl_mmu(CPUArchState *env, uint64_t addr,
  MemOpIdx oi, uintptr_t retaddr);
 
 /*
  * Value extended to at least uint32_t, so that some ABIs do not require
  * zero-extension from uint8_t or uint16_t.
  */
-void helper_stb_mmu(CPUArchState *env, target_ulong addr, uint32_t val,
+void helper_stb_mmu(CPUArchState *env, uint64_t addr, uint32_t val,
 MemOpIdx oi, uintptr_t retaddr);
-void helper_stw_mmu(CPUArchState *env, target_ulong addr, uint32_t val,
+void helper_stw_mmu(CPUArchState *env, uint64_t addr, uint32_t val,
 MemOpIdx oi, uintptr_t retaddr);
-void helper_stl_mmu(CPUArchState *env, target_ulong addr, uint32_t val,
+void helper_stl_mmu(CPUArchState *env, uint64_t addr, uint32_t val,
 MemOpIdx oi, uintptr_t retaddr);
-void helper_stq_mmu(CPUArchState *env, target_ulong addr, uint64_t val,
+void helper_stq_mmu(CPUArchState *env, uint64_t addr, uint64_t val,
 MemOpIdx oi, uintptr_t retaddr);
-void helper_st16_mmu(CPUArchState *env, target_ulong addr, Int128 val,
+void helper_st16_mmu(CPUArchState *env, uint64_t addr, Int128 val,
  MemOpIdx oi, uintptr_t retaddr);
 
 #endif /* TCG_LDST_H */
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index a77b439df8..b594401267 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -2357,7 +2357,7 @@ static uint8_t do_ld1_mmu(CPUArchState *env, target_ulong 
addr, MemOpIdx oi,
 return do_ld_1(env, &l.page[0], l.mmu_idx, access_type, ra);
 }
 
-tcg_target_ulong helper_ldub_mmu(CPUArchState *env, target_ulong addr,
+tcg_target_ulong helper_ldub_mmu(CPUArchState *env, uint64_t addr,
  MemOpIdx oi, uintptr_t retaddr)
 {
 tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_8);
@@ -2388,7 +2388,7 @@ static uint16_t do_ld2_mmu(CPUArchState *env, 
target_ulong addr, MemOpIdx oi,
 return ret;
 }
 
-tcg_target_ulong helper_lduw_mmu(CPUArchState *env, target_ulong addr,
+tcg_target_ulong helper_lduw_mmu(CPUArchState *env, uint64_t addr,
  MemOpIdx oi, uintptr_t retaddr)
 {
 tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_16);
@@ -2415,7 +2415,7 @@ static uint32_t do_ld4_mmu(CPUArchState *env, 
target_ulong addr, MemOpIdx oi,
 return ret;
 }
 
-tcg_target_ulong helper_ldul_mmu(CPUArchState *env, target_ulong addr,
+tcg_target_ulong helper_ldul_mmu(CPUArchState *env, uint64_t addr,
  MemOpIdx oi, uintptr_t retaddr)
 {
 tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_32);
@@ -2442,7 +2442,7 @@ static uint64_t do_ld8_mmu(CPUArchState *env, 
target_ulong addr, MemOpIdx oi,
 return ret;
 }
 
-uint64_t helper_ldq_mmu(CPUArchState *env, target_ulong addr,
+uint64_t helper_ldq_mm

[PATCH v4 31/54] tcg: Replace REG_P with arg_slot_reg_p

2023-05-03 Thread Richard Henderson

An inline function is safer than a macro, and REG_P
was rather too generic.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 tcg/tcg-internal.h |  4 
 tcg/tcg.c  | 16 +---
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/tcg/tcg-internal.h b/tcg/tcg-internal.h
index e542a4e9b7..0f1ba01a9a 100644
--- a/tcg/tcg-internal.h
+++ b/tcg/tcg-internal.h
@@ -58,10 +58,6 @@ typedef struct TCGCallArgumentLoc {
 unsigned tmp_subindex   : 2;
 } TCGCallArgumentLoc;
 
-/* Avoid "unsigned < 0 is always false" Werror, when iarg_regs is empty. */
-#define REG_P(L) \
-((int)(L)->arg_slot < (int)ARRAY_SIZE(tcg_target_call_iarg_regs))
-
 typedef struct TCGHelperInfo {
 void *func;
 const char *name;
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 6f5daaee5f..fa28db0188 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -806,6 +806,16 @@ static void init_ffi_layouts(void)
 }
 #endif /* CONFIG_TCG_INTERPRETER */
 
+static inline bool arg_slot_reg_p(unsigned arg_slot)
+{
+/*
+ * Split the sizeof away from the comparison to avoid Werror from
+ * "unsigned < 0 is always false", when iarg_regs is empty.
+ */
+unsigned nreg = ARRAY_SIZE(tcg_target_call_iarg_regs);
+return arg_slot < nreg;
+}
+
 typedef struct TCGCumulativeArgs {
 int arg_idx;/* tcg_gen_callN args[] */
 int info_in_idx;/* TCGHelperInfo in[] */
@@ -3231,7 +3241,7 @@ liveness_pass_1(TCGContext *s)
 case TCG_CALL_ARG_NORMAL:
 case TCG_CALL_ARG_EXTEND_U:
 case TCG_CALL_ARG_EXTEND_S:
-if (REG_P(loc)) {
+if (arg_slot_reg_p(loc->arg_slot)) {
 *la_temp_pref(ts) = 0;
 break;
 }
@@ -3258,7 +3268,7 @@ liveness_pass_1(TCGContext *s)
 case TCG_CALL_ARG_NORMAL:
 case TCG_CALL_ARG_EXTEND_U:
 case TCG_CALL_ARG_EXTEND_S:
-if (REG_P(loc)) {
+if (arg_slot_reg_p(loc->arg_slot)) {
 tcg_regset_set_reg(*la_temp_pref(ts),
 tcg_target_call_iarg_regs[loc->arg_slot]);
 }
@@ -4833,7 +4843,7 @@ static void load_arg_stk(TCGContext *s, int stk_slot, 
TCGTemp *ts,
 static void load_arg_normal(TCGContext *s, const TCGCallArgumentLoc *l,
 TCGTemp *ts, TCGRegSet *allocated_regs)
 {
-if (REG_P(l)) {
+if (arg_slot_reg_p(l->arg_slot)) {
 TCGReg reg = tcg_target_call_iarg_regs[l->arg_slot];
 load_arg_reg(s, reg, ts, *allocated_regs);
 tcg_regset_set_reg(*allocated_regs, reg);
-- 
2.34.1

[PATCH 02/84] tcg: Widen gen_insn_data to uint64_t

2023-05-03 Thread Richard Henderson

We already pass uint64_t to restore_state_to_opc; this changes all
of the other uses from insn_start through the encoding to decoding.

Signed-off-by: Richard Henderson 
---
 include/tcg/tcg-op.h  | 39 +--
 include/tcg/tcg-opc.h |  2 +-
 include/tcg/tcg.h | 30 +++---
 accel/tcg/translate-all.c | 28 
 tcg/tcg.c | 18 --
 5 files changed, 45 insertions(+), 72 deletions(-)

diff --git a/include/tcg/tcg-op.h b/include/tcg/tcg-op.h
index 4401fa493c..de3b70aa84 100644
--- a/include/tcg/tcg-op.h
+++ b/include/tcg/tcg-op.h
@@ -723,48 +723,27 @@ static inline void tcg_gen_concat32_i64(TCGv_i64 ret, 
TCGv_i64 lo, TCGv_i64 hi)
 #endif
 
 #if TARGET_INSN_START_WORDS == 1
-# if TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
 static inline void tcg_gen_insn_start(target_ulong pc)
 {
-tcg_gen_op1(INDEX_op_insn_start, pc);
+TCGOp *op = tcg_emit_op(INDEX_op_insn_start, 64 / TCG_TARGET_REG_BITS);
+tcg_set_insn_start_param(op, 0, pc);
 }
-# else
-static inline void tcg_gen_insn_start(target_ulong pc)
-{
-tcg_gen_op2(INDEX_op_insn_start, (uint32_t)pc, (uint32_t)(pc >> 32));
-}
-# endif
 #elif TARGET_INSN_START_WORDS == 2
-# if TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
 static inline void tcg_gen_insn_start(target_ulong pc, target_ulong a1)
 {
-tcg_gen_op2(INDEX_op_insn_start, pc, a1);
+TCGOp *op = tcg_emit_op(INDEX_op_insn_start, 2 * 64 / TCG_TARGET_REG_BITS);
+tcg_set_insn_start_param(op, 0, pc);
+tcg_set_insn_start_param(op, 1, a1);
 }
-# else
-static inline void tcg_gen_insn_start(target_ulong pc, target_ulong a1)
-{
-tcg_gen_op4(INDEX_op_insn_start,
-(uint32_t)pc, (uint32_t)(pc >> 32),
-(uint32_t)a1, (uint32_t)(a1 >> 32));
-}
-# endif
 #elif TARGET_INSN_START_WORDS == 3
-# if TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
 static inline void tcg_gen_insn_start(target_ulong pc, target_ulong a1,
   target_ulong a2)
 {
-tcg_gen_op3(INDEX_op_insn_start, pc, a1, a2);
+TCGOp *op = tcg_emit_op(INDEX_op_insn_start, 3 * 64 / TCG_TARGET_REG_BITS);
+tcg_set_insn_start_param(op, 0, pc);
+tcg_set_insn_start_param(op, 1, a1);
+tcg_set_insn_start_param(op, 2, a2);
 }
-# else
-static inline void tcg_gen_insn_start(target_ulong pc, target_ulong a1,
-  target_ulong a2)
-{
-tcg_gen_op6(INDEX_op_insn_start,
-(uint32_t)pc, (uint32_t)(pc >> 32),
-(uint32_t)a1, (uint32_t)(a1 >> 32),
-(uint32_t)a2, (uint32_t)(a2 >> 32));
-}
-# endif
 #else
 # error "Unhandled number of operands to insn_start"
 #endif
diff --git a/include/tcg/tcg-opc.h b/include/tcg/tcg-opc.h
index 94cf7c5d6a..29216366d2 100644
--- a/include/tcg/tcg-opc.h
+++ b/include/tcg/tcg-opc.h
@@ -190,7 +190,7 @@ DEF(mulsh_i64, 1, 2, 0, IMPL64 | 
IMPL(TCG_TARGET_HAS_mulsh_i64))
 #define DATA64_ARGS  (TCG_TARGET_REG_BITS == 64 ? 1 : 2)
 
 /* QEMU specific */
-DEF(insn_start, 0, 0, TLADDR_ARGS * TARGET_INSN_START_WORDS,
+DEF(insn_start, 0, 0, DATA64_ARGS * TARGET_INSN_START_WORDS,
 TCG_OPF_NOT_PRESENT)
 DEF(exit_tb, 0, 0, 1, TCG_OPF_BB_EXIT | TCG_OPF_BB_END)
 DEF(goto_tb, 0, 0, 1, TCG_OPF_BB_EXIT | TCG_OPF_BB_END)
diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
index efbd891f87..7c6a613364 100644
--- a/include/tcg/tcg.h
+++ b/include/tcg/tcg.h
@@ -629,7 +629,7 @@ struct TCGContext {
 TCGTemp *reg_to_temp[TCG_TARGET_NB_REGS];
 
 uint16_t gen_insn_end_off[TCG_MAX_INSNS];
-target_ulong gen_insn_data[TCG_MAX_INSNS][TARGET_INSN_START_WORDS];
+uint64_t gen_insn_data[TCG_MAX_INSNS][TARGET_INSN_START_WORDS];
 
 /* Exit to translator on overflow. */
 sigjmp_buf jmp_trans;
@@ -771,24 +771,24 @@ static inline void tcg_set_insn_param(TCGOp *op, int arg, 
TCGArg v)
 op->args[arg] = v;
 }
 
-static inline target_ulong tcg_get_insn_start_param(TCGOp *op, int arg)
+static inline uint64_t tcg_get_insn_start_param(TCGOp *op, int arg)
 {
-#if TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
-return tcg_get_insn_param(op, arg);
-#else
-return tcg_get_insn_param(op, arg * 2) |
-   ((uint64_t)tcg_get_insn_param(op, arg * 2 + 1) << 32);
-#endif
+if (TCG_TARGET_REG_BITS == 64) {
+return tcg_get_insn_param(op, arg);
+} else {
+return deposit64(tcg_get_insn_param(op, arg * 2), 32, 32,
+ tcg_get_insn_param(op, arg * 2 + 1));
+}
 }
 
-static inline void tcg_set_insn_start_param(TCGOp *op, int arg, target_ulong v)
+static inline void tcg_set_insn_start_param(TCGOp *op, int arg, uint64_t v)
 {
-#if TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
-tcg_set_insn_param(op, arg, v);
-#else
-tcg_set_insn_param(op, arg * 2, v);
-tcg_set_insn_param(op, arg * 2 + 1, v >> 32);
-#endif
+if (TCG_TARGET_REG_BITS == 64) {
+tcg_set_insn_param(op, arg, v);
+} else {
+tcg_set_insn_param(op, arg

[PATCH v4 17/57] tcg/aarch64: Detect have_lse, have_lse2 for linux

2023-05-03 Thread Richard Henderson

Notice when the host has additional atomic instructions.
The new variables will also be used in generated code.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 tcg/aarch64/tcg-target.h |  3 +++
 tcg/aarch64/tcg-target.c.inc | 12 
 2 files changed, 15 insertions(+)

diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
index c0b0f614ba..3c0b0d312d 100644
--- a/tcg/aarch64/tcg-target.h
+++ b/tcg/aarch64/tcg-target.h
@@ -57,6 +57,9 @@ typedef enum {
 #define TCG_TARGET_CALL_ARG_I128TCG_CALL_ARG_EVEN
 #define TCG_TARGET_CALL_RET_I128TCG_CALL_RET_NORMAL
 
+extern bool have_lse;
+extern bool have_lse2;
+
 /* optional instructions */
 #define TCG_TARGET_HAS_div_i32  1
 #define TCG_TARGET_HAS_rem_i32  1
diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
index e6636c1f8b..fc551a3d10 100644
--- a/tcg/aarch64/tcg-target.c.inc
+++ b/tcg/aarch64/tcg-target.c.inc
@@ -13,6 +13,9 @@
 #include "../tcg-ldst.c.inc"
 #include "../tcg-pool.c.inc"
 #include "qemu/bitops.h"
+#ifdef __linux__
+#include <asm/hwcap.h>
+#endif
 
 /* We're going to re-use TCGType in setting of the SF bit, which controls
the size of the operation performed.  If we know the values match, it
@@ -71,6 +74,9 @@ static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind 
kind, int slot)
 return TCG_REG_X0 + slot;
 }
 
+bool have_lse;
+bool have_lse2;
+
 #define TCG_REG_TMP TCG_REG_X30
 #define TCG_VEC_TMP TCG_REG_V31
 
@@ -2899,6 +2905,12 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode 
op)
 
 static void tcg_target_init(TCGContext *s)
 {
+#ifdef __linux__
+unsigned long hwcap = qemu_getauxval(AT_HWCAP);
+have_lse = hwcap & HWCAP_ATOMICS;
+have_lse2 = hwcap & HWCAP_USCAT;
+#endif
+
 tcg_target_available_regs[TCG_TYPE_I32] = 0xffffffffu;
 tcg_target_available_regs[TCG_TYPE_I64] = 0xffffffffu;
 tcg_target_available_regs[TCG_TYPE_V64] = 0xffffffff00000000ull;
-- 
2.34.1

[PATCH v4 18/57] tcg/aarch64: Detect have_lse, have_lse2 for darwin

2023-05-03 Thread Richard Henderson

These features are present for Apple M1.

Tested-by: Philippe Mathieu-Daudé 
Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 tcg/aarch64/tcg-target.c.inc | 28 
 1 file changed, 28 insertions(+)

diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
index fc551a3d10..3adc5fd3a3 100644
--- a/tcg/aarch64/tcg-target.c.inc
+++ b/tcg/aarch64/tcg-target.c.inc
@@ -16,6 +16,9 @@
 #ifdef __linux__
 #include <asm/hwcap.h>
 #endif
+#ifdef CONFIG_DARWIN
+#include <sys/sysctl.h>
+#endif
 
 /* We're going to re-use TCGType in setting of the SF bit, which controls
the size of the operation performed.  If we know the values match, it
@@ -2903,6 +2906,27 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode 
op)
 }
 }
 
+#ifdef CONFIG_DARWIN
+static bool sysctl_for_bool(const char *name)
+{
+int val = 0;
+size_t len = sizeof(val);
+
+if (sysctlbyname(name, &val, &len, NULL, 0) == 0) {
+return val != 0;
+}
+
+/*
+ * We might in the future ask for properties not present in older kernels,
+ * but we're only asking about static properties, all of which
+ * should be 'int'.  So we shouldn't see ENOMEM (val too small),
+ * or any of the other more exotic errors.
+ */
+assert(errno == ENOENT);
+return false;
+}
+#endif
+
 static void tcg_target_init(TCGContext *s)
 {
 #ifdef __linux__
@@ -2910,6 +2934,10 @@ static void tcg_target_init(TCGContext *s)
 have_lse = hwcap & HWCAP_ATOMICS;
 have_lse2 = hwcap & HWCAP_USCAT;
 #endif
+#ifdef CONFIG_DARWIN
+have_lse = sysctl_for_bool("hw.optional.arm.FEAT_LSE");
+have_lse2 = sysctl_for_bool("hw.optional.arm.FEAT_LSE2");
+#endif
 
 tcg_target_available_regs[TCG_TYPE_I32] = 0xffffffffu;
 tcg_target_available_regs[TCG_TYPE_I64] = 0xffffffffu;
-- 
2.34.1

[PATCH v4 02/57] accel/tcg: Add cpu_in_serial_context

2023-05-03 Thread Richard Henderson

Like cpu_in_exclusive_context, but also true if
there is no other cpu against which we could race.

Use it in tb_flush as a direct replacement.
Use it in cpu_loop_exit_atomic to ensure that there
is no loop against cpu_exec_step_atomic.

Reviewed-by: Alex Bennée 
Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 accel/tcg/internal.h| 5 +
 accel/tcg/cpu-exec-common.c | 3 +++
 accel/tcg/tb-maint.c| 2 +-
 3 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/accel/tcg/internal.h b/accel/tcg/internal.h
index 7bb0fdbe14..8ca24420ea 100644
--- a/accel/tcg/internal.h
+++ b/accel/tcg/internal.h
@@ -64,6 +64,11 @@ static inline target_ulong log_pc(CPUState *cpu, const 
TranslationBlock *tb)
 }
 }
 
+static inline bool cpu_in_serial_context(CPUState *cs)
+{
+return !(cs->tcg_cflags & CF_PARALLEL) || cpu_in_exclusive_context(cs);
+}
+
 extern int64_t max_delay;
 extern int64_t max_advance;
 
diff --git a/accel/tcg/cpu-exec-common.c b/accel/tcg/cpu-exec-common.c
index e7962c9348..9a5fabf625 100644
--- a/accel/tcg/cpu-exec-common.c
+++ b/accel/tcg/cpu-exec-common.c
@@ -22,6 +22,7 @@
 #include "sysemu/tcg.h"
 #include "exec/exec-all.h"
 #include "qemu/plugin.h"
+#include "internal.h"
 
 bool tcg_allowed;
 
@@ -81,6 +82,8 @@ void cpu_loop_exit_restore(CPUState *cpu, uintptr_t pc)
 
 void cpu_loop_exit_atomic(CPUState *cpu, uintptr_t pc)
 {
+/* Prevent looping if already executing in a serial context. */
+g_assert(!cpu_in_serial_context(cpu));
 cpu->exception_index = EXCP_ATOMIC;
 cpu_loop_exit_restore(cpu, pc);
 }
diff --git a/accel/tcg/tb-maint.c b/accel/tcg/tb-maint.c
index cb1f806f00..7d613d36d2 100644
--- a/accel/tcg/tb-maint.c
+++ b/accel/tcg/tb-maint.c
@@ -760,7 +760,7 @@ void tb_flush(CPUState *cpu)
 if (tcg_enabled()) {
 unsigned tb_flush_count = qatomic_mb_read(&tb_ctx.tb_flush_count);
 
-if (cpu_in_exclusive_context(cpu)) {
+if (cpu_in_serial_context(cpu)) {
 do_tb_flush(cpu, RUN_ON_CPU_HOST_INT(tb_flush_count));
 } else {
 async_safe_run_on_cpu(cpu, do_tb_flush,
-- 
2.34.1

[PULL v2 02/12] accel/tcg: Uncache the host address for instruction fetch when tlb size < 1

2023-05-03 Thread Richard Henderson

From: Weiwei Li 

When a PMP entry overlaps part of the page, we'll set the tlb_size to 1, which
will make the address in the tlb entry set with TLB_INVALID_MASK, and the next
access will again go through tlb_fill.  However, this way will not work in
tb_gen_code() => get_page_addr_code_hostp(): the TLB host address will be
cached, and the following instructions can use this host address directly,
which may lead to the bypass of PMP related checks.
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1542.

Signed-off-by: Weiwei Li 
Signed-off-by: Junqiang Wang 
Reviewed-by: LIU Zhiwei 
Reviewed-by: Richard Henderson 
Message-Id: <20230422130329.23555-6-liwei...@iscas.ac.cn>
---
 accel/tcg/cputlb.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index e984a98dc4..efa0cb67c9 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -1696,6 +1696,11 @@ tb_page_addr_t get_page_addr_code_hostp(CPUArchState 
*env, target_ulong addr,
 if (p == NULL) {
 return -1;
 }
+
+if (full->lg_page_size < TARGET_PAGE_BITS) {
+return -1;
+}
+
 if (hostp) {
 *hostp = p;
 }
-- 
2.34.1

[PATCH 04/84] tcg: Widen helper_{ld,st}_i128 addresses to uint64_t

2023-05-03 Thread Richard Henderson

Always pass the target address as uint64_t.

Signed-off-by: Richard Henderson 
---
 accel/tcg/tcg-runtime.h |  4 ++--
 accel/tcg/cputlb.c  |  5 ++---
 accel/tcg/user-exec.c   |  5 ++---
 tcg/tcg-op-ldst.c   | 26 --
 4 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/accel/tcg/tcg-runtime.h b/accel/tcg/tcg-runtime.h
index d9adc646c1..0e6c5f55fd 100644
--- a/accel/tcg/tcg-runtime.h
+++ b/accel/tcg/tcg-runtime.h
@@ -39,8 +39,8 @@ DEF_HELPER_FLAGS_1(exit_atomic, TCG_CALL_NO_WG, noreturn, env)
 DEF_HELPER_FLAGS_3(memset, TCG_CALL_NO_RWG, ptr, ptr, int, ptr)
 #endif /* IN_HELPER_PROTO */
 
-DEF_HELPER_FLAGS_3(ld_i128, TCG_CALL_NO_WG, i128, env, tl, i32)
-DEF_HELPER_FLAGS_4(st_i128, TCG_CALL_NO_WG, void, env, tl, i128, i32)
+DEF_HELPER_FLAGS_3(ld_i128, TCG_CALL_NO_WG, i128, env, i64, i32)
+DEF_HELPER_FLAGS_4(st_i128, TCG_CALL_NO_WG, void, env, i64, i128, i32)
 
 DEF_HELPER_FLAGS_5(atomic_cmpxchgb, TCG_CALL_NO_WG,
i32, env, tl, i32, i32, i32)
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index b594401267..e494404a0d 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -2541,7 +2541,7 @@ Int128 helper_ld16_mmu(CPUArchState *env, uint64_t addr,
 return do_ld16_mmu(env, addr, oi, retaddr);
 }
 
-Int128 helper_ld_i128(CPUArchState *env, target_ulong addr, uint32_t oi)
+Int128 helper_ld_i128(CPUArchState *env, uint64_t addr, uint32_t oi)
 {
 return helper_ld16_mmu(env, addr, oi, GETPC());
 }
@@ -3006,8 +3006,7 @@ void helper_st16_mmu(CPUArchState *env, uint64_t addr, 
Int128 val,
 do_st16_mmu(env, addr, val, oi, retaddr);
 }
 
-void helper_st_i128(CPUArchState *env, target_ulong addr, Int128 val,
-MemOpIdx oi)
+void helper_st_i128(CPUArchState *env, uint64_t addr, Int128 val, MemOpIdx oi)
 {
 helper_st16_mmu(env, addr, val, oi, GETPC());
 }
diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
index 9a7afb6f78..36ad8284a5 100644
--- a/accel/tcg/user-exec.c
+++ b/accel/tcg/user-exec.c
@@ -1136,7 +1136,7 @@ Int128 helper_ld16_mmu(CPUArchState *env, uint64_t addr,
 return ret;
 }
 
-Int128 helper_ld_i128(CPUArchState *env, target_ulong addr, MemOpIdx oi)
+Int128 helper_ld_i128(CPUArchState *env, uint64_t addr, MemOpIdx oi)
 {
 return helper_ld16_mmu(env, addr, oi, GETPC());
 }
@@ -1343,8 +1343,7 @@ void helper_st16_mmu(CPUArchState *env, uint64_t addr, 
Int128 val,
 do_st16_he_mmu(env, addr, val, mop, ra);
 }
 
-void helper_st_i128(CPUArchState *env, target_ulong addr,
-Int128 val, MemOpIdx oi)
+void helper_st_i128(CPUArchState *env, uint64_t addr, Int128 val, MemOpIdx oi)
 {
 helper_st16_mmu(env, addr, val, oi, GETPC());
 }
diff --git a/tcg/tcg-op-ldst.c b/tcg/tcg-op-ldst.c
index a3beede2f4..0010afb1a1 100644
--- a/tcg/tcg-op-ldst.c
+++ b/tcg/tcg-op-ldst.c
@@ -404,6 +404,24 @@ static void canonicalize_memop_i128_as_i64(MemOp ret[2], 
MemOp orig)
 #define tcg_temp_ebb_new  tcg_temp_ebb_new_i32
 #endif
 
+static TCGv_i64 maybe_extend_addr64(TCGv addr)
+{
+#if TARGET_LONG_BITS == 32
+TCGv_i64 a64 = tcg_temp_ebb_new_i64();
+tcg_gen_extu_i32_i64(a64, addr);
+return a64;
+#else
+return addr;
+#endif
+}
+
+static void maybe_free_addr64(TCGv_i64 a64)
+{
+#if TARGET_LONG_BITS == 32
+tcg_temp_free_i64(a64);
+#endif
+}
+
 void tcg_gen_qemu_ld_i128(TCGv_i128 val, TCGv addr, TCGArg idx, MemOp memop)
 {
 const MemOpIdx oi = make_memop_idx(memop, idx);
@@ -478,7 +496,9 @@ void tcg_gen_qemu_ld_i128(TCGv_i128 val, TCGv addr, TCGArg 
idx, MemOp memop)
 tcg_gen_bswap64_i64(y, y);
 }
 } else {
-gen_helper_ld_i128(val, cpu_env, addr, tcg_constant_i32(oi));
+TCGv_i64 a64 = maybe_extend_addr64(addr);
+gen_helper_ld_i128(val, cpu_env, a64, tcg_constant_i32(oi));
+maybe_free_addr64(a64);
 }
 
 plugin_gen_mem_callbacks(addr, oi, QEMU_PLUGIN_MEM_R);
@@ -558,7 +578,9 @@ void tcg_gen_qemu_st_i128(TCGv_i128 val, TCGv addr, TCGArg 
idx, MemOp memop)
 }
 tcg_temp_free(addr_p8);
 } else {
-gen_helper_st_i128(cpu_env, addr, val, tcg_constant_i32(oi));
+TCGv_i64 a64 = maybe_extend_addr64(addr);
+gen_helper_st_i128(cpu_env, a64, val, tcg_constant_i32(oi));
+maybe_free_addr64(a64);
 }
 
 plugin_gen_mem_callbacks(addr, oi, QEMU_PLUGIN_MEM_W);
-- 
2.34.1

[PATCH v4 00/57] tcg: Improve atomicity support

2023-05-03 Thread Richard Henderson

v1: 
https://lore.kernel.org/qemu-devel/20221118094754.242910-1-richard.hender...@linaro.org/
v2: 
https://lore.kernel.org/qemu-devel/20230216025739.1211680-1-richard.hender...@linaro.org/
v3: 
https://lore.kernel.org/qemu-devel/20230425193146.2106111-1-richard.hender...@linaro.org/

Based-on: 20230503065729.1745843-1-richard.hender...@linaro.org
("[PATCH v4 00/54] tcg: Simplify calls to load/store helpers")

The main objective here is to support Arm FEAT_LSE2, which says that any
single memory access that does not cross a 16-byte boundary is atomic.
This is the MO_ATOM_WITHIN16 control.

While I'm touching all of this, a secondary objective is to handle the
atomicity of the IBM machines.  Both Power and s390x treat misaligned
accesses as atomic on the lsb of the pointer.  For instance, an 8-byte
access at ptr % 8 == 4 will appear as two atomic 4-byte accesses, and
ptr % 4 == 2 will appear as four 2-byte accesses.
This is the MO_ATOM_SUBALIGN control.

By default, accesses are atomic only if aligned, which is the current
behaviour of the tcg code generator (mostly, anyway, there were bugs).
This is the MO_ATOM_IFALIGN control.

Further, one can say that a large memory access is really a set of
contiguous smaller accesses, and we need not provide more atomicity
than that (modulo MO_ATOM_WITHIN16).  This is the MO_ATMAX_* control.

Changes for v4:
  - Rebase, fixing some conflicts.


r~


Richard Henderson (57):
  include/exec/memop: Add bits describing atomicity
  accel/tcg: Add cpu_in_serial_context
  accel/tcg: Introduce tlb_read_idx
  accel/tcg: Reorg system mode load helpers
  accel/tcg: Reorg system mode store helpers
  accel/tcg: Honor atomicity of loads
  accel/tcg: Honor atomicity of stores
  target/loongarch: Do not include tcg-ldst.h
  tcg: Unify helper_{be,le}_{ld,st}*
  accel/tcg: Implement helper_{ld,st}*_mmu for user-only
  tcg/tci: Use helper_{ld,st}*_mmu for user-only
  tcg: Add 128-bit guest memory primitives
  meson: Detect atomic128 support with optimization
  tcg/i386: Add have_atomic16
  accel/tcg: Use have_atomic16 in ldst_atomicity.c.inc
  accel/tcg: Add aarch64 specific support in ldst_atomicity
  tcg/aarch64: Detect have_lse, have_lse2 for linux
  tcg/aarch64: Detect have_lse, have_lse2 for darwin
  accel/tcg: Add have_lse2 support in ldst_atomicity
  tcg: Introduce TCG_OPF_TYPE_MASK
  tcg/i386: Use full load/store helpers in user-only mode
  tcg/aarch64: Use full load/store helpers in user-only mode
  tcg/ppc: Use full load/store helpers in user-only mode
  tcg/loongarch64: Use full load/store helpers in user-only mode
  tcg/riscv: Use full load/store helpers in user-only mode
  tcg/arm: Adjust constraints on qemu_ld/st
  tcg/arm: Use full load/store helpers in user-only mode
  tcg/mips: Use full load/store helpers in user-only mode
  tcg/s390x: Use full load/store helpers in user-only mode
  tcg/sparc64: Allocate %g2 as a third temporary
  tcg/sparc64: Rename tcg_out_movi_imm13 to tcg_out_movi_s13
  tcg/sparc64: Rename tcg_out_movi_imm32 to tcg_out_movi_u32
  tcg/sparc64: Split out tcg_out_movi_s32
  tcg/sparc64: Use standard slow path for softmmu
  accel/tcg: Remove helper_unaligned_{ld,st}
  tcg/loongarch64: Assert the host supports unaligned accesses
  tcg/loongarch64: Support softmmu unaligned accesses
  tcg/riscv: Support softmmu unaligned accesses
  tcg: Introduce tcg_target_has_memory_bswap
  tcg: Add INDEX_op_qemu_{ld,st}_i128
  tcg: Support TCG_TYPE_I128 in tcg_out_{ld,st}_helper_{args,ret}
  tcg: Introduce atom_and_align_for_opc
  tcg/i386: Use atom_and_align_for_opc
  tcg/aarch64: Use atom_and_align_for_opc
  tcg/arm: Use atom_and_align_for_opc
  tcg/loongarch64: Use atom_and_align_for_opc
  tcg/mips: Use atom_and_align_for_opc
  tcg/ppc: Use atom_and_align_for_opc
  tcg/riscv: Use atom_and_align_for_opc
  tcg/s390x: Use atom_and_align_for_opc
  tcg/sparc64: Use atom_and_align_for_opc
  tcg/i386: Honor 64-bit atomicity in 32-bit mode
  tcg/i386: Support 128-bit load/store with have_atomic16
  tcg/aarch64: Rename temporaries
  tcg/aarch64: Support 128-bit load/store
  tcg/ppc: Support 128-bit load/store
  tcg/s390x: Support 128-bit load/store

 accel/tcg/internal.h |5 +
 accel/tcg/tcg-runtime.h  |3 +
 include/exec/cpu-defs.h  |7 +-
 include/exec/cpu_ldst.h  |   26 +-
 include/exec/memop.h |   36 +
 include/qemu/cpuid.h |   18 +
 include/tcg/tcg-ldst.h   |   72 +-
 include/tcg/tcg-opc.h|8 +
 include/tcg/tcg.h|   22 +-
 tcg/aarch64/tcg-target-con-set.h |2 +
 tcg/aarch64/tcg-target.h |6 +-
 tcg/arm/tcg-target-con-set.h |   16 +-
 tcg/arm/tcg-target-con-str.h |5 +-
 tcg/arm/tcg-target.h |3 +-
 tcg/i386/tcg-target.h|7 +-
 tcg/loongarch64/tcg-target.h |3 +-
 tcg/mips/tcg-target.h|4 +-
 tcg/ppc/tcg-target-con-set.h |2 +
 tcg/ppc/tcg-target-con-str.h |1 +
 tcg/ppc/tcg-tar

[PATCH 15/84] tcg/tci: Eliminate TARGET_LONG_BITS, target_ulong

2023-05-03 Thread Richard Henderson

We now have the address size as part of the opcode, so
we no longer need to test TARGET_LONG_BITS.  We can use
uint64_t for target_ulong, as passed into load/store helpers.

Signed-off-by: Richard Henderson 
---
 tcg/tci.c| 61 +---
 tcg/tci/tcg-target.c.inc | 15 +-
 2 files changed, 46 insertions(+), 30 deletions(-)

diff --git a/tcg/tci.c b/tcg/tci.c
index 742c791726..bab4397bc5 100644
--- a/tcg/tci.c
+++ b/tcg/tci.c
@@ -286,7 +286,7 @@ static bool tci_compare64(uint64_t u0, uint64_t u1, TCGCond 
condition)
 return result;
 }
 
-static uint64_t tci_qemu_ld(CPUArchState *env, target_ulong taddr,
+static uint64_t tci_qemu_ld(CPUArchState *env, uint64_t taddr,
 MemOpIdx oi, const void *tb_ptr)
 {
 MemOp mop = get_memop(oi);
@@ -312,7 +312,7 @@ static uint64_t tci_qemu_ld(CPUArchState *env, target_ulong 
taddr,
 }
 }
 
-static void tci_qemu_st(CPUArchState *env, target_ulong taddr, uint64_t val,
+static void tci_qemu_st(CPUArchState *env, uint64_t taddr, uint64_t val,
 MemOpIdx oi, const void *tb_ptr)
 {
 MemOp mop = get_memop(oi);
@@ -372,10 +372,9 @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState 
*env,
 TCGReg r0, r1, r2, r3, r4, r5;
 tcg_target_ulong t1;
 TCGCond condition;
-target_ulong taddr;
 uint8_t pos, len;
 uint32_t tmp32;
-uint64_t tmp64;
+uint64_t tmp64, taddr;
 uint64_t T1, T2;
 MemOpIdx oi;
 int32_t ofs;
@@ -923,31 +922,40 @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState 
*env,
 break;
 
 case INDEX_op_qemu_ld_a32_i32:
+tci_args_rrm(insn, &r0, &r1, &oi);
+taddr = (uint32_t)regs[r1];
+goto do_ld_i32;
 case INDEX_op_qemu_ld_a64_i32:
-if (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS) {
+if (TCG_TARGET_REG_BITS == 64) {
 tci_args_rrm(insn, &r0, &r1, &oi);
 taddr = regs[r1];
 } else {
 tci_args_rrrm(insn, &r0, &r1, &r2, &oi);
 taddr = tci_uint64(regs[r2], regs[r1]);
 }
-tmp32 = tci_qemu_ld(env, taddr, oi, tb_ptr);
-regs[r0] = tmp32;
+do_ld_i32:
+regs[r0] = tci_qemu_ld(env, taddr, oi, tb_ptr);
 break;
 
 case INDEX_op_qemu_ld_a32_i64:
+if (TCG_TARGET_REG_BITS == 64) {
+tci_args_rrm(insn, &r0, &r1, &oi);
+taddr = (uint32_t)regs[r1];
+} else {
+tci_args_rrrm(insn, &r0, &r1, &r2, &oi);
+taddr = (uint32_t)regs[r2];
+}
+goto do_ld_i64;
 case INDEX_op_qemu_ld_a64_i64:
 if (TCG_TARGET_REG_BITS == 64) {
 tci_args_rrm(insn, &r0, &r1, &oi);
 taddr = regs[r1];
-} else if (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS) {
-tci_args_rrrm(insn, &r0, &r1, &r2, &oi);
-taddr = regs[r2];
 } else {
 tci_args_r(insn, &r0, &r1, &r2, &r3, &r4);
 taddr = tci_uint64(regs[r3], regs[r2]);
 oi = regs[r4];
 }
+do_ld_i64:
 tmp64 = tci_qemu_ld(env, taddr, oi, tb_ptr);
 if (TCG_TARGET_REG_BITS == 32) {
 tci_write_reg64(regs, r1, r0, tmp64);
@@ -957,35 +965,44 @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState 
*env,
 break;
 
 case INDEX_op_qemu_st_a32_i32:
+tci_args_rrm(insn, &r0, &r1, &oi);
+taddr = (uint32_t)regs[r1];
+goto do_st_i32;
 case INDEX_op_qemu_st_a64_i32:
-if (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS) {
+if (TCG_TARGET_REG_BITS == 64) {
 tci_args_rrm(insn, &r0, &r1, &oi);
 taddr = regs[r1];
 } else {
 tci_args_rrrm(insn, &r0, &r1, &r2, &oi);
 taddr = tci_uint64(regs[r2], regs[r1]);
 }
-tmp32 = regs[r0];
-tci_qemu_st(env, taddr, tmp32, oi, tb_ptr);
+do_st_i32:
+tci_qemu_st(env, taddr, regs[r0], oi, tb_ptr);
 break;
 
 case INDEX_op_qemu_st_a32_i64:
+if (TCG_TARGET_REG_BITS == 64) {
+tci_args_rrm(insn, &r0, &r1, &oi);
+tmp64 = regs[r0];
+taddr = (uint32_t)regs[r1];
+} else {
+tci_args_rrrm(insn, &r0, &r1, &r2, &oi);
+tmp64 = tci_uint64(regs[r1], regs[r0]);
+taddr = (uint32_t)regs[r2];
+}
+goto do_st_i64;
 case INDEX_op_qemu_st_a64_i64:
 if (TCG_TARGET_REG_BITS == 64) {
 tci_args_rrm(insn, &r0, &r1, &oi);
-taddr = regs[r1];
 tmp64 = regs[r0];
+taddr = regs[r1];

[PULL v2 07/12] qemu/int128: Re-shuffle Int128Alias members

2023-05-03 Thread Richard Henderson

Clang 14, with --enable-tcg-interpreter errors with

include/qemu/int128.h:487:16: error: alignment of field 'i' (128 bits)
  does not match the alignment of the first field in transparent union;
  transparent_union attribute ignored [-Werror,-Wignored-attributes]
__int128_t i;
   ^
include/qemu/int128.h:486:12: note: alignment of first field is 64 bits
Int128 s;
   ^
1 error generated.

By placing the __uint128_t member first, this is avoided.

Signed-off-by: Richard Henderson 
Reviewed-by: Alex Bennée 
Message-Id: <20230501204625.277361-1-richard.hender...@linaro.org>
---
 include/qemu/int128.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/qemu/int128.h b/include/qemu/int128.h
index f62a46b48c..9e46cfaefc 100644
--- a/include/qemu/int128.h
+++ b/include/qemu/int128.h
@@ -483,9 +483,9 @@ static inline void bswap128s(Int128 *s)
  */
 #ifdef CONFIG_INT128
 typedef union {
-Int128 s;
-__int128_t i;
 __uint128_t u;
+__int128_t i;
+Int128 s;
 } Int128Alias __attribute__((transparent_union));
 #else
 typedef Int128 Int128Alias;
-- 
2.34.1

[PATCH v4 13/57] meson: Detect atomic128 support with optimization

2023-05-03 Thread Richard Henderson

There is an edge condition prior to gcc13 for which optimization
is required to generate 16-byte atomic sequences.  Detect this.

Signed-off-by: Richard Henderson 
---
 accel/tcg/ldst_atomicity.c.inc | 38 ++---
 meson.build| 52 ++
 2 files changed, 61 insertions(+), 29 deletions(-)

diff --git a/accel/tcg/ldst_atomicity.c.inc b/accel/tcg/ldst_atomicity.c.inc
index e61121d6bf..c43f101ebe 100644
--- a/accel/tcg/ldst_atomicity.c.inc
+++ b/accel/tcg/ldst_atomicity.c.inc
@@ -16,6 +16,23 @@
 #endif
 #define HAVE_al8_fast  (ATOMIC_REG_SIZE >= 8)
 
+/*
+ * If __alignof(unsigned __int128) < 16, GCC may refuse to inline atomics
+ * that are supported by the host, e.g. s390x.  We can force the pointer to
+ * have our known alignment with __builtin_assume_aligned, however prior to
+ * GCC 13 that was only reliable with optimization enabled.  See
+ *   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107389
+ */
+#if defined(CONFIG_ATOMIC128_OPT)
+# if !defined(__OPTIMIZE__)
+#  define ATTRIBUTE_ATOMIC128_OPT  __attribute__((optimize("O1")))
+# endif
+# define CONFIG_ATOMIC128
+#endif
+#ifndef ATTRIBUTE_ATOMIC128_OPT
+# define ATTRIBUTE_ATOMIC128_OPT
+#endif
+
 #if defined(CONFIG_ATOMIC128)
 # define HAVE_al16_fasttrue
 #else
@@ -136,7 +153,8 @@ static inline uint64_t load_atomic8(void *pv)
  *
  * Atomically load 16 aligned bytes from @pv.
  */
-static inline Int128 load_atomic16(void *pv)
+static inline Int128 ATTRIBUTE_ATOMIC128_OPT
+load_atomic16(void *pv)
 {
 #ifdef CONFIG_ATOMIC128
 __uint128_t *p = __builtin_assume_aligned(pv, 16);
@@ -340,7 +358,8 @@ static uint64_t load_atom_extract_al16_or_exit(CPUArchState 
*env, uintptr_t ra,
  * cross an 16-byte boundary then the access must be 16-byte atomic,
  * otherwise the access must be 8-byte atomic.
  */
-static inline uint64_t load_atom_extract_al16_or_al8(void *pv, int s)
+static inline uint64_t ATTRIBUTE_ATOMIC128_OPT
+load_atom_extract_al16_or_al8(void *pv, int s)
 {
 #if defined(CONFIG_ATOMIC128)
 uintptr_t pi = (uintptr_t)pv;
@@ -676,28 +695,24 @@ static inline void store_atomic8(void *pv, uint64_t val)
  *
  * Atomically store 16 aligned bytes to @pv.
  */
-static inline void store_atomic16(void *pv, Int128 val)
+static inline void ATTRIBUTE_ATOMIC128_OPT
+store_atomic16(void *pv, Int128Alias val)
 {
 #if defined(CONFIG_ATOMIC128)
 __uint128_t *pu = __builtin_assume_aligned(pv, 16);
-Int128Alias new;
-
-new.s = val;
-qatomic_set__nocheck(pu, new.u);
+qatomic_set__nocheck(pu, val.u);
 #elif defined(CONFIG_CMPXCHG128)
 __uint128_t *pu = __builtin_assume_aligned(pv, 16);
 __uint128_t o;
-Int128Alias n;
 
 /*
  * Without CONFIG_ATOMIC128, __atomic_compare_exchange_n will always
  * defer to libatomic, so we must use __sync_val_compare_and_swap_16
  * and accept the sequential consistency that comes with it.
  */
-n.s = val;
 do {
 o = *pu;
-} while (!__sync_bool_compare_and_swap_16(pu, o, n.u));
+} while (!__sync_bool_compare_and_swap_16(pu, o, val.u));
 #else
 qemu_build_not_reached();
 #endif
@@ -779,7 +794,8 @@ static void store_atom_insert_al8(uint64_t *p, uint64_t 
val, uint64_t msk)
  *
  * Atomically store @val to @p masked by @msk.
  */
-static void store_atom_insert_al16(Int128 *ps, Int128Alias val, Int128Alias 
msk)
+static void ATTRIBUTE_ATOMIC128_OPT
+store_atom_insert_al16(Int128 *ps, Int128Alias val, Int128Alias msk)
 {
 #if defined(CONFIG_ATOMIC128)
 __uint128_t *pu, old, new;
diff --git a/meson.build b/meson.build
index 77d42898c8..4bbdbcef37 100644
--- a/meson.build
+++ b/meson.build
@@ -2241,23 +2241,21 @@ config_host_data.set('HAVE_BROKEN_SIZE_MAX', not 
cc.compiles('''
 return printf("%zu", SIZE_MAX);
 }''', args: ['-Werror']))
 
-atomic_test = '''
+# See if 64-bit atomic operations are supported.
+# Note that without __atomic builtins, we can only
+# assume atomic loads/stores max at pointer size.
+config_host_data.set('CONFIG_ATOMIC64', cc.links('''
   #include <stdint.h>
   int main(void)
   {
-@0@ x = 0, y = 0;
+uint64_t x = 0, y = 0;
 y = __atomic_load_n(&x, __ATOMIC_RELAXED);
 __atomic_store_n(&x, y, __ATOMIC_RELAXED);
 __atomic_compare_exchange_n(&x, &y, x, 0, __ATOMIC_RELAXED, 
__ATOMIC_RELAXED);
 __atomic_exchange_n(&x, y, __ATOMIC_RELAXED);
 __atomic_fetch_add(&x, y, __ATOMIC_RELAXED);
 return 0;
-  }'''
-
-# See if 64-bit atomic operations are supported.
-# Note that without __atomic builtins, we can only
-# assume atomic loads/stores max at pointer size.
-config_host_data.set('CONFIG_ATOMIC64', 
cc.links(atomic_test.format('uint64_t')))
+  }'''))
 
 has_int128 = cc.links('''
   __int128_t a;
@@ -2275,21 +2273,39 @@ if has_int128
   # "do we have 128-bit atomics which are handled inline and specifically not
   # via libatomic". The reason we can't use libatomic is documented in the
   # comment starting "GCC is a house divided" in in

[PATCH v4 24/54] tcg/riscv: Introduce prepare_host_addr

2023-05-03 Thread Richard Henderson

Merge tcg_out_tlb_load, add_qemu_ldst_label, tcg_out_test_alignment,
and some code that lived in both tcg_out_qemu_ld and tcg_out_qemu_st
into one function that returns TCGReg and TCGLabelQemuLdst.

Signed-off-by: Richard Henderson 
---
 tcg/riscv/tcg-target.c.inc | 253 +
 1 file changed, 114 insertions(+), 139 deletions(-)

diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
index a4cf60ca75..2b2d313fe2 100644
--- a/tcg/riscv/tcg-target.c.inc
+++ b/tcg/riscv/tcg-target.c.inc
@@ -899,10 +899,6 @@ static void * const qemu_st_helpers[MO_SIZE + 1] = {
 #endif
 };
 
-/* We expect to use a 12-bit negative offset from ENV.  */
-QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) > 0);
-QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) < -(1 << 11));
-
 static void tcg_out_goto(TCGContext *s, const tcg_insn_unit *target)
 {
 tcg_out_opc_jump(s, OPC_JAL, TCG_REG_ZERO, 0);
@@ -910,76 +906,6 @@ static void tcg_out_goto(TCGContext *s, const 
tcg_insn_unit *target)
 tcg_debug_assert(ok);
 }
 
-static TCGReg tcg_out_tlb_load(TCGContext *s, TCGReg addr, MemOpIdx oi,
-   tcg_insn_unit **label_ptr, bool is_load)
-{
-MemOp opc = get_memop(oi);
-unsigned s_bits = opc & MO_SIZE;
-unsigned a_bits = get_alignment_bits(opc);
-tcg_target_long compare_mask;
-int mem_index = get_mmuidx(oi);
-int fast_ofs = TLB_MASK_TABLE_OFS(mem_index);
-int mask_ofs = fast_ofs + offsetof(CPUTLBDescFast, mask);
-int table_ofs = fast_ofs + offsetof(CPUTLBDescFast, table);
-TCGReg mask_base = TCG_AREG0, table_base = TCG_AREG0;
-
-tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP0, mask_base, mask_ofs);
-tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, table_base, table_ofs);
-
-tcg_out_opc_imm(s, OPC_SRLI, TCG_REG_TMP2, addr,
-TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
-tcg_out_opc_reg(s, OPC_AND, TCG_REG_TMP2, TCG_REG_TMP2, TCG_REG_TMP0);
-tcg_out_opc_reg(s, OPC_ADD, TCG_REG_TMP2, TCG_REG_TMP2, TCG_REG_TMP1);
-
-/* Load the tlb comparator and the addend.  */
-tcg_out_ld(s, TCG_TYPE_TL, TCG_REG_TMP0, TCG_REG_TMP2,
-   is_load ? offsetof(CPUTLBEntry, addr_read)
-   : offsetof(CPUTLBEntry, addr_write));
-tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP2, TCG_REG_TMP2,
-   offsetof(CPUTLBEntry, addend));
-
-/* We don't support unaligned accesses. */
-if (a_bits < s_bits) {
-a_bits = s_bits;
-}
-/* Clear the non-page, non-alignment bits from the address.  */
-compare_mask = (tcg_target_long)TARGET_PAGE_MASK | ((1 << a_bits) - 1);
-if (compare_mask == sextreg(compare_mask, 0, 12)) {
-tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_TMP1, addr, compare_mask);
-} else {
-tcg_out_movi(s, TCG_TYPE_TL, TCG_REG_TMP1, compare_mask);
-tcg_out_opc_reg(s, OPC_AND, TCG_REG_TMP1, TCG_REG_TMP1, addr);
-}
-
-/* Compare masked address with the TLB entry. */
-label_ptr[0] = s->code_ptr;
-tcg_out_opc_branch(s, OPC_BNE, TCG_REG_TMP0, TCG_REG_TMP1, 0);
-
-/* TLB Hit - translate address using addend.  */
-if (TARGET_LONG_BITS == 32) {
-tcg_out_ext32u(s, TCG_REG_TMP0, addr);
-addr = TCG_REG_TMP0;
-}
-tcg_out_opc_reg(s, OPC_ADD, TCG_REG_TMP0, TCG_REG_TMP2, addr);
-return TCG_REG_TMP0;
-}
-
-static void add_qemu_ldst_label(TCGContext *s, int is_ld, MemOpIdx oi,
-TCGType data_type, TCGReg data_reg,
-TCGReg addr_reg, void *raddr,
-tcg_insn_unit **label_ptr)
-{
-TCGLabelQemuLdst *label = new_ldst_label(s);
-
-label->is_ld = is_ld;
-label->oi = oi;
-label->type = data_type;
-label->datalo_reg = data_reg;
-label->addrlo_reg = addr_reg;
-label->raddr = tcg_splitwx_to_rx(raddr);
-label->label_ptr[0] = label_ptr[0];
-}
-
 static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
 {
 MemOpIdx oi = l->oi;
@@ -1037,26 +963,6 @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, 
TCGLabelQemuLdst *l)
 return true;
 }
 #else
-
-static void tcg_out_test_alignment(TCGContext *s, bool is_ld, TCGReg addr_reg,
-   unsigned a_bits)
-{
-unsigned a_mask = (1 << a_bits) - 1;
-TCGLabelQemuLdst *l = new_ldst_label(s);
-
-l->is_ld = is_ld;
-l->addrlo_reg = addr_reg;
-
-/* We are expecting a_bits to max out at 7, so we can always use andi. */
-tcg_debug_assert(a_bits < 12);
-tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_TMP1, addr_reg, a_mask);
-
-l->label_ptr[0] = s->code_ptr;
-tcg_out_opc_branch(s, OPC_BNE, TCG_REG_TMP1, TCG_REG_ZERO, 0);
-
-l->raddr = tcg_splitwx_to_rx(s->code_ptr);
-}
-
 static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l)
 {
 /* resolve label address */
@@ -1083,9 +989,108 @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, 
TCGLabelQemuLdst *l)
 {
 return tcg_out_fail_alignment(s,

[PULL v2 04/12] qemu/host-utils.h: Add clz and ctz functions for lower-bit integers

2023-05-03 Thread Richard Henderson

From: Kiran Ostrolenk 

This is for use in the RISC-V vclz and vctz instructions (implemented in
a subsequent commit).

Signed-off-by: Kiran Ostrolenk 
Reviewed-by: Richard Henderson 
Message-Id: <20230428144757.57530-11-lawrence.hun...@codethink.co.uk>
Signed-off-by: Richard Henderson 
---
 include/qemu/host-utils.h | 54 +++
 1 file changed, 54 insertions(+)

diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
index 3ce62bf4a5..d3b4dce6a9 100644
--- a/include/qemu/host-utils.h
+++ b/include/qemu/host-utils.h
@@ -107,6 +107,36 @@ static inline uint64_t muldiv64(uint64_t a, uint32_t b, 
uint32_t c)
 }
 #endif
 
+/**
+ * clz8 - count leading zeros in an 8-bit value.
+ * @val: The value to search
+ *
+ * Returns 8 if the value is zero.  Note that the GCC builtin is
+ * undefined if the value is zero.
+ *
+ * Note that the GCC builtin will upcast its argument to an `unsigned int`
+ * so this function subtracts off the number of prepended zeroes.
+ */
+static inline int clz8(uint8_t val)
+{
+return val ? __builtin_clz(val) - 24 : 8;
+}
+
+/**
+ * clz16 - count leading zeros in a 16-bit value.
+ * @val: The value to search
+ *
+ * Returns 16 if the value is zero.  Note that the GCC builtin is
+ * undefined if the value is zero.
+ *
+ * Note that the GCC builtin will upcast its argument to an `unsigned int`
+ * so this function subtracts off the number of prepended zeroes.
+ */
+static inline int clz16(uint16_t val)
+{
+return val ? __builtin_clz(val) - 16 : 16;
+}
+
 /**
  * clz32 - count leading zeros in a 32-bit value.
  * @val: The value to search
@@ -153,6 +183,30 @@ static inline int clo64(uint64_t val)
 return clz64(~val);
 }
 
+/**
+ * ctz8 - count trailing zeros in an 8-bit value.
+ * @val: The value to search
+ *
+ * Returns 8 if the value is zero.  Note that the GCC builtin is
+ * undefined if the value is zero.
+ */
+static inline int ctz8(uint8_t val)
+{
+return val ? __builtin_ctz(val) : 8;
+}
+
+/**
+ * ctz16 - count trailing zeros in a 16-bit value.
+ * @val: The value to search
+ *
+ * Returns 16 if the value is zero.  Note that the GCC builtin is
+ * undefined if the value is zero.
+ */
+static inline int ctz16(uint16_t val)
+{
+return val ? __builtin_ctz(val) : 16;
+}
+
 /**
  * ctz32 - count trailing zeros in a 32-bit value.
  * @val: The value to search
-- 
2.34.1

[PATCH 68/84] target/arm: Tidy helpers for translation

2023-05-03 Thread Richard Henderson

Move most includes from *translate*.c to translate.h, ensuring
that we get the ordering correct.  Ensure cpu.h is first.
Use disas/disas.h instead of exec/log.h.
Drop otherwise unused includes.

Signed-off-by: Richard Henderson 
---
 target/arm/tcg/translate.h|  3 +++
 target/arm/tcg/translate-a64.c| 17 +
 target/arm/tcg/translate-m-nocp.c |  2 --
 target/arm/tcg/translate-mve.c|  3 ---
 target/arm/tcg/translate-neon.c   |  3 ---
 target/arm/tcg/translate-sme.c|  6 --
 target/arm/tcg/translate-sve.c|  9 -
 target/arm/tcg/translate-vfp.c|  3 ---
 target/arm/tcg/translate.c| 17 +
 9 files changed, 13 insertions(+), 50 deletions(-)

diff --git a/target/arm/tcg/translate.h b/target/arm/tcg/translate.h
index d1a7a829ed..9179521b88 100644
--- a/target/arm/tcg/translate.h
+++ b/target/arm/tcg/translate.h
@@ -1,6 +1,9 @@
 #ifndef TARGET_ARM_TRANSLATE_H
 #define TARGET_ARM_TRANSLATE_H
 
+#include "cpu.h"
+#include "tcg/tcg-op.h"
+#include "tcg/tcg-op-gvec.h"
 #include "exec/translator.h"
 #include "exec/helper-gen.h"
 #include "internals.h"
diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c
index 9bfea23353..6ae92b3353 100644
--- a/target/arm/tcg/translate-a64.c
+++ b/target/arm/tcg/translate-a64.c
@@ -18,20 +18,13 @@
  */
 #include "qemu/osdep.h"
 
-#include "cpu.h"
-#include "exec/exec-all.h"
-#include "tcg/tcg-op.h"
-#include "tcg/tcg-op-gvec.h"
-#include "qemu/log.h"
-#include "arm_ldst.h"
 #include "translate.h"
-#include "internals.h"
-#include "qemu/host-utils.h"
-#include "semihosting/semihost.h"
-#include "exec/log.h"
-#include "cpregs.h"
 #include "translate-a64.h"
-#include "qemu/atomic128.h"
+#include "qemu/log.h"
+#include "disas/disas.h"
+#include "arm_ldst.h"
+#include "semihosting/semihost.h"
+#include "cpregs.h"
 
 static TCGv_i64 cpu_X[32];
 static TCGv_i64 cpu_pc;
diff --git a/target/arm/tcg/translate-m-nocp.c 
b/target/arm/tcg/translate-m-nocp.c
index 9a89aab785..33f6478bb9 100644
--- a/target/arm/tcg/translate-m-nocp.c
+++ b/target/arm/tcg/translate-m-nocp.c
@@ -18,8 +18,6 @@
  */
 
 #include "qemu/osdep.h"
-#include "tcg/tcg-op.h"
-#include "tcg/tcg-op-gvec.h"
 #include "translate.h"
 #include "translate-a32.h"
 
diff --git a/target/arm/tcg/translate-mve.c b/target/arm/tcg/translate-mve.c
index 2ad3c40975..bbc7b3f4ce 100644
--- a/target/arm/tcg/translate-mve.c
+++ b/target/arm/tcg/translate-mve.c
@@ -18,9 +18,6 @@
  */
 
 #include "qemu/osdep.h"
-#include "tcg/tcg-op.h"
-#include "tcg/tcg-op-gvec.h"
-#include "exec/exec-all.h"
 #include "translate.h"
 #include "translate-a32.h"
 
diff --git a/target/arm/tcg/translate-neon.c b/target/arm/tcg/translate-neon.c
index 6fac577abd..03913de047 100644
--- a/target/arm/tcg/translate-neon.c
+++ b/target/arm/tcg/translate-neon.c
@@ -21,9 +21,6 @@
  */
 
 #include "qemu/osdep.h"
-#include "tcg/tcg-op.h"
-#include "tcg/tcg-op-gvec.h"
-#include "exec/exec-all.h"
 #include "translate.h"
 #include "translate-a32.h"
 
diff --git a/target/arm/tcg/translate-sme.c b/target/arm/tcg/translate-sme.c
index b0812d9dd6..d0054e3f77 100644
--- a/target/arm/tcg/translate-sme.c
+++ b/target/arm/tcg/translate-sme.c
@@ -18,14 +18,8 @@
  */
 
 #include "qemu/osdep.h"
-#include "cpu.h"
-#include "tcg/tcg-op.h"
-#include "tcg/tcg-op-gvec.h"
-#include "tcg/tcg-gvec-desc.h"
 #include "translate.h"
 #include "translate-a64.h"
-#include "fpu/softfloat.h"
-
 
 /*
  * Include the generated decoder.
diff --git a/target/arm/tcg/translate-sve.c b/target/arm/tcg/translate-sve.c
index 106baf311f..d9d5810dde 100644
--- a/target/arm/tcg/translate-sve.c
+++ b/target/arm/tcg/translate-sve.c
@@ -18,16 +18,7 @@
  */
 
 #include "qemu/osdep.h"
-#include "cpu.h"
-#include "exec/exec-all.h"
-#include "tcg/tcg-op.h"
-#include "tcg/tcg-op-gvec.h"
-#include "tcg/tcg-gvec-desc.h"
-#include "qemu/log.h"
-#include "arm_ldst.h"
 #include "translate.h"
-#include "internals.h"
-#include "exec/log.h"
 #include "translate-a64.h"
 #include "fpu/softfloat.h"
 
diff --git a/target/arm/tcg/translate-vfp.c b/target/arm/tcg/translate-vfp.c
index 95ac8d9db3..359b1e3e96 100644
--- a/target/arm/tcg/translate-vfp.c
+++ b/target/arm/tcg/translate-vfp.c
@@ -21,9 +21,6 @@
  */
 
 #include "qemu/osdep.h"
-#include "tcg/tcg-op.h"
-#include "tcg/tcg-op-gvec.h"
-#include "exec/exec-all.h"
 #include "translate.h"
 #include "translate-a32.h"
 
diff --git a/target/arm/tcg/translate.c b/target/arm/tcg/translate.c
index 7caf6d802d..a68d3c7f6d 100644
--- a/target/arm/tcg/translate.c
+++ b/target/arm/tcg/translate.c
@@ -20,20 +20,13 @@
  */
 #include "qemu/osdep.h"
 
-#include "cpu.h"
-#include "internals.h"
-#include "disas/disas.h"
-#include "exec/exec-all.h"
-#include "tcg/tcg-op.h"
-#include "tcg/tcg-op-gvec.h"
-#include "qemu/log.h"
-#include "qemu/bitops.h"
-#include "arm_ldst.h"
-#include "semihosting/semihost.h"
-#include "exec/log.h"
-#include "cpregs.h"
 #include "translate.h"
 #include "translate-a32.h"
+#include "qem

[PULL v2 05/12] tcg: Add tcg_gen_gvec_andcs

2023-05-03 Thread Richard Henderson

From: Nazar Kazakov 

Add tcg expander and helper functions for and-complement
vector with scalar operand.

Signed-off-by: Nazar Kazakov 
Message-Id: <20230428144757.57530-10-lawrence.hun...@codethink.co.uk>
[rth: Split out of larger patch.]
Signed-off-by: Richard Henderson 
---
 accel/tcg/tcg-runtime.h  |  1 +
 include/tcg/tcg-op-gvec.h|  2 ++
 accel/tcg/tcg-runtime-gvec.c | 11 +++
 tcg/tcg-op-gvec.c| 17 +
 4 files changed, 31 insertions(+)

diff --git a/accel/tcg/tcg-runtime.h b/accel/tcg/tcg-runtime.h
index e141a6ab24..b8e6421c8a 100644
--- a/accel/tcg/tcg-runtime.h
+++ b/accel/tcg/tcg-runtime.h
@@ -217,6 +217,7 @@ DEF_HELPER_FLAGS_4(gvec_nor, TCG_CALL_NO_RWG, void, ptr, 
ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_eqv, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 
 DEF_HELPER_FLAGS_4(gvec_ands, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_andcs, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
 DEF_HELPER_FLAGS_4(gvec_xors, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
 DEF_HELPER_FLAGS_4(gvec_ors, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
 
diff --git a/include/tcg/tcg-op-gvec.h b/include/tcg/tcg-op-gvec.h
index 28cafbcc5c..6d58683171 100644
--- a/include/tcg/tcg-op-gvec.h
+++ b/include/tcg/tcg-op-gvec.h
@@ -330,6 +330,8 @@ void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, 
uint32_t aofs,
 
 void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_andcs(unsigned vece, uint32_t dofs, uint32_t aofs,
+TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
 void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
 void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c
index ac7d28c251..97399493d5 100644
--- a/accel/tcg/tcg-runtime-gvec.c
+++ b/accel/tcg/tcg-runtime-gvec.c
@@ -550,6 +550,17 @@ void HELPER(gvec_ands)(void *d, void *a, uint64_t b, 
uint32_t desc)
 clear_high(d, oprsz, desc);
 }
 
+void HELPER(gvec_andcs)(void *d, void *a, uint64_t b, uint32_t desc)
+{
+intptr_t oprsz = simd_oprsz(desc);
+intptr_t i;
+
+for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
+*(uint64_t *)(d + i) = *(uint64_t *)(a + i) & ~b;
+}
+clear_high(d, oprsz, desc);
+}
+
 void HELPER(gvec_xors)(void *d, void *a, uint64_t b, uint32_t desc)
 {
 intptr_t oprsz = simd_oprsz(desc);
diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index 047a832f44..9c14908a46 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -2761,6 +2761,23 @@ void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, 
uint32_t aofs,
 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
 }
 
+void tcg_gen_gvec_andcs(unsigned vece, uint32_t dofs, uint32_t aofs,
+TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
+{
+static GVecGen2s g = {
+.fni8 = tcg_gen_andc_i64,
+.fniv = tcg_gen_andc_vec,
+.fno = gen_helper_gvec_andcs,
+.prefer_i64 = TCG_TARGET_REG_BITS == 64,
+.vece = MO_64
+};
+
+TCGv_i64 tmp = tcg_temp_ebb_new_i64();
+tcg_gen_dup_i64(vece, tmp, c);
+tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &g); /* tmp, not c: use the dup'd scalar */
+tcg_temp_free_i64(tmp);
+}
+
 static const GVecGen2s gop_xors = {
 .fni8 = tcg_gen_xor_i64,
 .fniv = tcg_gen_xor_vec,
-- 
2.34.1

[PATCH v4 40/54] tcg/mips: Convert tcg_out_qemu_{ld,st}_slow_path

2023-05-03 Thread Richard Henderson

Use tcg_out_ld_helper_args, tcg_out_ld_helper_ret,
and tcg_out_st_helper_args.  This allows our local
tcg_out_arg_* infrastructure to be removed.

We are no longer filling the call or return branch
delay slots, nor are we tail-calling for the store,
but this seems a small price to pay.

Signed-off-by: Richard Henderson 
---
 tcg/mips/tcg-target.c.inc | 154 ++
 1 file changed, 22 insertions(+), 132 deletions(-)

diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
index 94708e6ea7..022960d79a 100644
--- a/tcg/mips/tcg-target.c.inc
+++ b/tcg/mips/tcg-target.c.inc
@@ -1115,79 +1115,15 @@ static void * const qemu_st_helpers[(MO_SIZE | 
MO_BSWAP) + 1] = {
 [MO_BEUQ] = helper_be_stq_mmu,
 };
 
-/* Helper routines for marshalling helper function arguments into
- * the correct registers and stack.
- * I is where we want to put this argument, and is updated and returned
- * for the next call. ARG is the argument itself.
- *
- * We provide routines for arguments which are: immediate, 32 bit
- * value in register, 16 and 8 bit values in register (which must be zero
- * extended before use) and 64 bit value in a lo:hi register pair.
- */
-
-static int tcg_out_call_iarg_reg(TCGContext *s, int i, TCGReg arg)
-{
-if (i < ARRAY_SIZE(tcg_target_call_iarg_regs)) {
-tcg_out_mov(s, TCG_TYPE_REG, tcg_target_call_iarg_regs[i], arg);
-} else {
-/* For N32 and N64, the initial offset is different.  But there
-   we also have 8 argument register so we don't run out here.  */
-tcg_debug_assert(TCG_TARGET_REG_BITS == 32);
-tcg_out_st(s, TCG_TYPE_REG, arg, TCG_REG_SP, 4 * i);
-}
-return i + 1;
-}
-
-static int tcg_out_call_iarg_reg8(TCGContext *s, int i, TCGReg arg)
-{
-TCGReg tmp = TCG_TMP0;
-if (i < ARRAY_SIZE(tcg_target_call_iarg_regs)) {
-tmp = tcg_target_call_iarg_regs[i];
-}
-tcg_out_ext8u(s, tmp, arg);
-return tcg_out_call_iarg_reg(s, i, tmp);
-}
-
-static int tcg_out_call_iarg_reg16(TCGContext *s, int i, TCGReg arg)
-{
-TCGReg tmp = TCG_TMP0;
-if (i < ARRAY_SIZE(tcg_target_call_iarg_regs)) {
-tmp = tcg_target_call_iarg_regs[i];
-}
-tcg_out_opc_imm(s, OPC_ANDI, tmp, arg, 0xffff);
-return tcg_out_call_iarg_reg(s, i, tmp);
-}
-
-static int tcg_out_call_iarg_imm(TCGContext *s, int i, TCGArg arg)
-{
-TCGReg tmp = TCG_TMP0;
-if (arg == 0) {
-tmp = TCG_REG_ZERO;
-} else {
-if (i < ARRAY_SIZE(tcg_target_call_iarg_regs)) {
-tmp = tcg_target_call_iarg_regs[i];
-}
-tcg_out_movi(s, TCG_TYPE_REG, tmp, arg);
-}
-return tcg_out_call_iarg_reg(s, i, tmp);
-}
-
-static int tcg_out_call_iarg_reg2(TCGContext *s, int i, TCGReg al, TCGReg ah)
-{
-tcg_debug_assert(TCG_TARGET_REG_BITS == 32);
-i = (i + 1) & ~1;
-i = tcg_out_call_iarg_reg(s, i, (MIPS_BE ? ah : al));
-i = tcg_out_call_iarg_reg(s, i, (MIPS_BE ? al : ah));
-return i;
-}
+/* We have four temps, we might as well expose three of them. */
+static const TCGLdstHelperParam ldst_helper_param = {
+.ntmp = 3, .tmp = { TCG_TMP0, TCG_TMP1, TCG_TMP2 }
+};
 
 static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
 {
 const tcg_insn_unit *tgt_rx = tcg_splitwx_to_rx(s->code_ptr);
-MemOpIdx oi = l->oi;
-MemOp opc = get_memop(oi);
-TCGReg v0;
-int i;
+MemOp opc = get_memop(l->oi);
 
 /* resolve label address */
 if (!reloc_pc16(l->label_ptr[0], tgt_rx)
@@ -1196,29 +1132,13 @@ static bool tcg_out_qemu_ld_slow_path(TCGContext *s, 
TCGLabelQemuLdst *l)
 return false;
 }
 
-i = 1;
-if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
-i = tcg_out_call_iarg_reg2(s, i, l->addrlo_reg, l->addrhi_reg);
-} else {
-i = tcg_out_call_iarg_reg(s, i, l->addrlo_reg);
-}
-i = tcg_out_call_iarg_imm(s, i, oi);
-i = tcg_out_call_iarg_imm(s, i, (intptr_t)l->raddr);
+tcg_out_ld_helper_args(s, l, &ldst_helper_param);
+
 tcg_out_call_int(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SSIZE)], false);
 /* delay slot */
-tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
+tcg_out_nop(s);
 
-v0 = l->datalo_reg;
-if (TCG_TARGET_REG_BITS == 32 && (opc & MO_SIZE) == MO_64) {
-/* We eliminated V0 from the possible output registers, so it
-   cannot be clobbered here.  So we must move V1 first.  */
-if (MIPS_BE) {
-tcg_out_mov(s, TCG_TYPE_I32, v0, TCG_REG_V1);
-v0 = l->datahi_reg;
-} else {
-tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_V1);
-}
-}
+tcg_out_ld_helper_ret(s, l, true, &ldst_helper_param);
 
 tcg_out_opc_br(s, OPC_BEQ, TCG_REG_ZERO, TCG_REG_ZERO);
 if (!reloc_pc16(s->code_ptr - 1, l->raddr)) {
@@ -1226,22 +1146,14 @@ static bool tcg_out_qemu_ld_slow_path(TCGContext *s, 
TCGLabelQemuLdst *l)
 }
 
 /* delay slot */
-if (TCG_TARGET_R

[PATCH 42/84] tcg: Split out tcg/oversized-guest.h

2023-05-03 Thread Richard Henderson

Move a use of TARGET_LONG_BITS out of tcg/tcg.h.
Include the new file only where required.

Signed-off-by: Richard Henderson 
---
 include/exec/cpu_ldst.h   |  3 +--
 include/tcg/oversized-guest.h | 23 +++
 include/tcg/tcg.h |  9 -
 accel/tcg/cputlb.c|  1 +
 accel/tcg/tcg-all.c   |  1 +
 target/arm/ptw.c  |  1 +
 target/riscv/cpu_helper.c |  1 +
 7 files changed, 28 insertions(+), 11 deletions(-)
 create mode 100644 include/tcg/oversized-guest.h

diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
index f916a96a31..59abab7421 100644
--- a/include/exec/cpu_ldst.h
+++ b/include/exec/cpu_ldst.h
@@ -357,8 +357,7 @@ static inline void clear_helper_retaddr(void)
 
 #else
 
-/* Needed for TCG_OVERSIZED_GUEST */
-#include "tcg/tcg.h"
+#include "tcg/oversized-guest.h"
 
 static inline target_ulong tlb_read_idx(const CPUTLBEntry *entry,
 MMUAccessType access_type)
diff --git a/include/tcg/oversized-guest.h b/include/tcg/oversized-guest.h
new file mode 100644
index 0000000000..641b9749ff
--- /dev/null
+++ b/include/tcg/oversized-guest.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Define TCG_OVERSIZED_GUEST
+ * Copyright (c) 2008 Fabrice Bellard
+ */
+
+#ifndef EXEC_TCG_OVERSIZED_GUEST_H
+#define EXEC_TCG_OVERSIZED_GUEST_H
+
+#include "tcg-target-reg-bits.h"
+#include "cpu-param.h"
+
+/*
+ * Oversized TCG guests make things like MTTCG hard
+ * as we can't use atomics for cputlb updates.
+ */
+#if TARGET_LONG_BITS > TCG_TARGET_REG_BITS
+#define TCG_OVERSIZED_GUEST 1
+#else
+#define TCG_OVERSIZED_GUEST 0
+#endif
+
+#endif
diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
index 0999847b84..b3e8d78907 100644
--- a/include/tcg/tcg.h
+++ b/include/tcg/tcg.h
@@ -59,15 +59,6 @@ typedef uint64_t tcg_target_ulong;
 #error unsupported
 #endif
 
-/* Oversized TCG guests make things like MTTCG hard
- * as we can't use atomics for cputlb updates.
- */
-#if TARGET_LONG_BITS > TCG_TARGET_REG_BITS
-#define TCG_OVERSIZED_GUEST 1
-#else
-#define TCG_OVERSIZED_GUEST 0
-#endif
-
 #if TCG_TARGET_NB_REGS <= 32
 typedef uint32_t TCGRegSet;
 #elif TCG_TARGET_NB_REGS <= 64
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index 83297f9bff..7d3cd877ff 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -40,6 +40,7 @@
 #include "qemu/plugin-memory.h"
 #endif
 #include "tcg/tcg-ldst.h"
+#include "tcg/oversized-guest.h"
 #include "exec/helper-proto.h"
 
 /* DEBUG defines, enable DEBUG_TLB_LOG to log to the CPU_LOG_MMU target */
diff --git a/accel/tcg/tcg-all.c b/accel/tcg/tcg-all.c
index a831f8d7c3..02af6a2891 100644
--- a/accel/tcg/tcg-all.c
+++ b/accel/tcg/tcg-all.c
@@ -28,6 +28,7 @@
 #include "exec/replay-core.h"
 #include "sysemu/cpu-timers.h"
 #include "tcg/tcg.h"
+#include "tcg/oversized-guest.h"
 #include "qapi/error.h"
 #include "qemu/error-report.h"
 #include "qemu/accel.h"
diff --git a/target/arm/ptw.c b/target/arm/ptw.c
index 7b7ce65c7a..0926ae4c4a 100644
--- a/target/arm/ptw.c
+++ b/target/arm/ptw.c
@@ -14,6 +14,7 @@
 #include "cpu.h"
 #include "internals.h"
 #include "idau.h"
+#include "tcg/oversized-guest.h"
 
 
 typedef struct S1Translate {
diff --git a/target/riscv/cpu_helper.c b/target/riscv/cpu_helper.c
index f88c503cf4..7b9744be1e 100644
--- a/target/riscv/cpu_helper.c
+++ b/target/riscv/cpu_helper.c
@@ -30,6 +30,7 @@
 #include "sysemu/cpu-timers.h"
 #include "cpu_bits.h"
 #include "debug.h"
+#include "tcg/oversized-guest.h"
 
 int riscv_cpu_mmu_index(CPURISCVState *env, bool ifetch)
 {
-- 
2.34.1

1 2 3 4 5 6 >

1 - 100 of 529 matches

Mail list logo