qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH v2] tcg/ppc: Optimize 26-bit jumps


From: Richard Henderson
Subject: Re: [PATCH v2] tcg/ppc: Optimize 26-bit jumps
Date: Thu, 15 Sep 2022 11:38:55 +0100
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:91.0) Gecko/20100101 Thunderbird/91.11.0

On 9/14/22 19:41, Leandro Lupori wrote:
PowerPC64 processors handle direct branches better than indirect
ones, resulting in fewer stalled cycles and branch misses.

However, PPC's tb_target_set_jmp_target() was only using direct
branches for 16-bit jumps, while PowerPC64's unconditional branch
instructions are able to handle displacements of up to 26 bits.
To take advantage of this, jumps whose displacements need between
17 and 26 bits are now also converted to direct branches.

Signed-off-by: Leandro Lupori <leandro.lupori@eldorado.org.br>
---
v2: use stq to replace all instructions atomically

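The only error is a failure to align the jump sequence to 16 bytes, which the 16-byte stq store needs; the existing code only guarantees 8-byte alignment, here: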

    case INDEX_op_goto_tb:

        if (s->tb_jmp_insn_offset) {

            /* Direct jump. */

            if (TCG_TARGET_REG_BITS == 64) {

                /* Ensure the next insns are 8-byte aligned. */

                if ((uintptr_t)s->code_ptr & 7) {

                    tcg_out32(s, NOP);

                }


The rest are minor code cleanup:

+#if TCG_TARGET_REG_BITS == 64

Not needed, I think.

+static inline uint64_t make_pair(tcg_insn_unit i1, tcg_insn_unit i2)
+{
+#if HOST_BIG_ENDIAN
+    return (uint64_t)i1 << 32 | i2;
+#else
+    return (uint64_t)i2 << 32 | i1;
+#endif
+}

This is code movement, but we can now use a C "if (HOST_BIG_ENDIAN)" instead of the preprocessor #if.

+static inline void ppc64_stq(uintptr_t dst, uint64_t src0, uint64_t src1)
+{
+    asm volatile (
+        "mr  %%r6, %0\n\t"
+        "mr  %%r7, %1\n\t"
+        "stq %%r6, 0(%2)"
+        : : "r" (src0), "r" (src1), "r" (dst) : "r6", "r7", "memory");
+}
+
+#endif
+
+static inline void ppc64_replace_insns(uintptr_t rx, uintptr_t rw,
+    tcg_insn_unit *insn)
+{
+#if TCG_TARGET_REG_BITS == 64
+    uint64_t pair[2];
+
+    pair[0] = make_pair(insn[0], insn[1]);
+    if (have_isa_2_07) {
+        pair[1] = make_pair(insn[2], insn[3]);
+#if HOST_BIG_ENDIAN
+        ppc64_stq(rw, pair[0], pair[1]);
+#else
+        ppc64_stq(rw, pair[1], pair[0]);
+#endif
+        flush_idcache_range(rx, rw, 16);
+    } else {
+        qatomic_set((uint64_t *)rw, pair[0]);
+        flush_idcache_range(rx, rw, 8);
+    }
+#endif
+}

I think this would be cleaner as

/*
 * Replace two adjacent instructions with a single 64-bit store.
 *
 * @rx, @rw: the two addresses of the patched location; the store goes
 *           through @rw and both are handed to flush_idcache_range().
 * @i0, @i1: the two instructions, combined in host order by make_pair().
 *
 * Only meaningful when TCG_TARGET_REG_BITS == 64; any other build must
 * never reach this function.
 */
static inline void ppc64_replace2(uintptr_t rx, uintptr_t rw,
                                  tcg_insn_unit i0, tcg_insn_unit i1)
{
#if TCG_TARGET_REG_BITS == 64
    qatomic_set((uint64_t *)rw, make_pair(i0, i1));
    flush_idcache_range(rx, rw, 8);
#else
    qemu_build_not_reached();
#endif
}

/*
 * Replace four adjacent instructions with a single 16-byte stq store.
 *
 * @rx, @rw: the two addresses of the patched location; the store goes
 *           through @rw and both are handed to flush_idcache_range().
 * @i0..@i3: the four instructions, in program order.
 */
static inline void ppc64_replace4(uintptr_t rx, uintptr_t rw,
                                  tcg_insn_unit i0, tcg_insn_unit i1,
                                  tcg_insn_unit i2, tcg_insn_unit i3)
{
    uint64_t p[2];

    /*
     * p[0] is copied to r6 and p[1] to r7.  Match the operand order of
     * the patch under review: a big-endian host passes the (i0, i1)
     * pair first, a little-endian host passes it second.
     */
    p[!HOST_BIG_ENDIAN] = make_pair(i0, i1);
    p[HOST_BIG_ENDIAN] = make_pair(i2, i3);

    /*
     * stq is given r6 and stores the r6/r7 pair, so copy into those
     * fixed registers and tell the compiler they are clobbered.
     */
    asm("mr %%r6,%1\n\t"
        "mr %%r7,%2\n\t"
        "stq %%r6,%0"
        : "=Q"(*(__int128 *)rw) : "r"(p[0]), "r"(p[1]) : "r6", "r7");
    flush_idcache_range(rx, rw, 16);
}

  void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx,
                                uintptr_t jmp_rw, uintptr_t addr)
  {
      if (TCG_TARGET_REG_BITS == 64) {
-        tcg_insn_unit i1, i2;
+        tcg_insn_unit i[4];
          intptr_t tb_diff = addr - tc_ptr;
          intptr_t br_diff = addr - (jmp_rx + 4);
-        uint64_t pair;

Now that 64-bit is getting more complicated, let's reverse
this first IF and eliminate ppc32 first:

    if (TCG_TARGET_REG_BITS == 32) {
        stuff;
        return;
    }

    if (tb_diff == (int16_t)tb_diff) {
        i0 = ADDI...
        i1 = B...
        ppc64_replace2(jmp_rx, jmp_rw, i0, i1);
        return;
    }

    i0 = ADDIS...
    i1 = ADDI...
    if (!have_isa_2_07) {
        ppc64_replace2(jmp_rx, jmp_rw, i0, i1);
        return;
    }

    if (in_range) {
        i2 = B...
        i3 = NOP;
    } else {
        i2 = MTSPR...
        i3 = BCCTR...
    }
    ppc64_replace4(jmp_rx, jmp_rw, i0, i1, i2, i3);


r~



reply via email to

[Prev in Thread] Current Thread [Next in Thread]