qemu-arm

From: Peter Maydell
Subject: Re: [PATCH v5 11/54] accel/tcg: Add aarch64 specific support in ldst_atomicity
Date: Tue, 16 May 2023 14:29:03 +0100

On Mon, 15 May 2023 at 15:38, Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> We have code in atomic128.h noting that through GCC 8, there
> was no support for atomic operations on __uint128.  This has
> been fixed in GCC 10.  But we can still improve over any
> basic compare-and-swap loop using the ldxp/stxp instructions.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  accel/tcg/ldst_atomicity.c.inc | 60 ++++++++++++++++++++++++++++++++--
>  1 file changed, 57 insertions(+), 3 deletions(-)
>
> diff --git a/accel/tcg/ldst_atomicity.c.inc b/accel/tcg/ldst_atomicity.c.inc
> index 69c1c61997..c3b2b35823 100644
> --- a/accel/tcg/ldst_atomicity.c.inc
> +++ b/accel/tcg/ldst_atomicity.c.inc
> @@ -263,7 +263,22 @@ static Int128 load_atomic16_or_exit(CPUArchState *env, uintptr_t ra, void *pv)
>       * In system mode all guest pages are writable, and for user-only
>       * we have just checked writability.  Try cmpxchg.
>       */
> -#if defined(CONFIG_CMPXCHG128)
> +#if defined(__aarch64__)
> +    /* We can do better than cmpxchg for AArch64.  */
> +    {
> +        uint64_t l, h;
> +        uint32_t fail;
> +
> +        /* The load must be paired with the store to guarantee not tearing.  */
> +        asm("0: ldxp %0, %1, %3\n\t"
> +            "stxp %w2, %0, %1, %3\n\t"
> +            "cbnz %w2, 0b"
> +            : "=&r"(l), "=&r"(h), "=&r"(fail) : "Q"(*p));
> +
> +        qemu_build_assert(!HOST_BIG_ENDIAN);
> +        return int128_make128(l, h);
> +    }

The compiler (well, clang 11, anyway) seems able to generate equivalent
code to this inline asm:
https://godbolt.org/z/eraeTn4vs

__int128 do_load2(void *pv) {
    __int128 *p = __builtin_assume_aligned(pv, 16);
    __int128 i = __atomic_load_n(p, __ATOMIC_RELAXED);
    return i;
}

generates

do_load2:                               // @do_load2
        mov     x8, x0
.LBB0_1:                                // =>This Inner Loop Header: Depth=1
        ldxp    x0, x1, [x8]
        stxp    w9, x0, x1, [x8]
        cbnz    w9, .LBB0_1
        ret
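
The analogous 16-byte store can presumably be written the same way
(untested sketch, not part of the patch; whether clang also inlines
this __atomic_store_n as an ldxp/stxp loop rather than emitting a
libatomic call is an assumption here):

void do_store2(void *pv, __int128 val) {
    __int128 *p = __builtin_assume_aligned(pv, 16);
    /* relaxed 16-byte store; expected to lower to a ldxp/stxp loop */
    __atomic_store_n(p, val, __ATOMIC_RELAXED);
}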

thanks
-- PMM


