qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH v6 09/15] target/riscv: Add Zvkned ISA extension support


From: Max Chou
Subject: Re: [PATCH v6 09/15] target/riscv: Add Zvkned ISA extension support
Date: Thu, 29 Jun 2023 23:10:09 +0800
User-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:102.0) Gecko/20100101 Thunderbird/102.12.0

On 2023/6/28 5:07 PM, Richard Henderson wrote:

On 6/27/23 19:45, Max Chou wrote:
+#define GEN_V_UNMASKED_TRANS(NAME, CHECK, EGS)                                \ +    static bool trans_##NAME(DisasContext *s, arg_##NAME *a)                  \
+ { \
+        if (CHECK(s, a)) {                                                    \ +            TCGv_ptr rd_v, rs2_v;                                             \ +            TCGv_i32 desc;                                                    \ +            uint32_t data = 0;                                                \ +            TCGLabel *over = gen_new_label();                                 \ +            TCGLabel *vl_ok = gen_new_label();                                \ +            TCGLabel *vstart_ok = gen_new_label();                            \ +            TCGv_i32 tmp = tcg_temp_new_i32();                                \
+ \
+            /* save opcode for unwinding in case we throw an exception */     \
+ decode_save_opc(s); \
+ \
+            /* check (vl % EGS == 0) assuming it's power of 2 */              \ +            tcg_gen_trunc_tl_i32(tmp, cpu_vl);                                \ +            tcg_gen_andi_i32(tmp, tmp, EGS - 1);                              \ +            tcg_gen_brcondi_i32(TCG_COND_EQ, tmp, 0, vl_ok);                  \
+ gen_helper_restore_cpu_and_raise_exception( \
+                cpu_env, tcg_constant_i32(RISCV_EXCP_ILLEGAL_INST));          \
+ gen_set_label(vl_ok); \
+ \
+            /* check (vstart % EGS == 0) assuming it's power of 2 */          \ +            tcg_gen_trunc_tl_i32(tmp, cpu_vstart);                            \ +            tcg_gen_andi_i32(tmp, tmp, EGS - 1);                              \ +            tcg_gen_brcondi_i32(TCG_COND_EQ, tmp, 0, vstart_ok);              \
+ gen_helper_restore_cpu_and_raise_exception( \
+                cpu_env, tcg_constant_i32(RISCV_EXCP_ILLEGAL_INST));          \
+ gen_set_label(vstart_ok); \
+ \
+            tcg_gen_brcond_tl(TCG_COND_GEU, cpu_vstart, cpu_vl, over);        \

This kind of massive macro is bad style.
Much better to have a helper function and pass in gen_helper_foo as a parameter.

You can eliminate the vstart % EGS test, and the vstart < vl test, when VSTART_EQ_ZERO.
You can eliminate the vl % EGS test when VL_EQ_VLMAX.

You could move all of these tests out of line, into a helper_foo_chk() function which performs the checks and then calls helper_foo().
Hi Richard

Thank you for the suggestion.
I'll provide the v7 patch set with this suggestion.

But I have an question about the vstart < vl test.
I think that we can't eliminate the vstart < vl test when both the vstart and vl are equal to zero.
Although this situation means that the instructions will do nothing.

+#define GEN_ZVKNED_HELPER_VV(NAME, ...)                                   \ +    void HELPER(NAME)(void *vd_vptr, void *vs2_vptr, CPURISCVState *env,  \ +                      uint32_t desc)                                      \
+ { \
+        uint64_t *vd = vd_vptr;                                           \ +        uint64_t *vs2 = vs2_vptr;                                         \ +        uint32_t vl = env->vl;                                            \ +        uint32_t total_elems = vext_get_total_elems(env, desc, 4);        \ +        uint32_t vta = vext_vta(desc);                                    \
+ \
+        for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) {        \ +            AESState round_key;                                           \ +            round_key.d[HOST_BIG_ENDIAN] = cpu_to_le64(vs2[i * 2 + 0]);   \ +            round_key.d[!HOST_BIG_ENDIAN] = cpu_to_le64(vs2[i * 2 + 1]);  \ +            AESState round_state;                                         \ +            cpu_to_le64s(vd + i * 2 + 0);                                 \ +            cpu_to_le64s(vd + i * 2 + 1);                                 \ +            for (int j = 0; j < 16; j++) {                                \ +                round_state.b[j] = ((uint8_t *)(vd + i * 2))[j];          \
+ }                                                             \

I think all of this byte swapping is wrong.
With this last loop particularly being particularly silly.

You want to present the 16 bytes in *host* endian order.
Because the words are always in little-endian order (see H1 et al),
we only need to swap the words on big-endian hosts.

See 20230620110758.787479-21-richard.henderson@linaro.org/">https://lore.kernel.org/qemu-devel/20230620110758.787479-21-richard.henderson@linaro.org/
where I do exactly the same thing for ARM:

+        AESState *ad = (AESState *)(vd + i);
+        AESState *st = (AESState *)(vm + i);
+        AESState t;
+
+        /* Our uint64_t are in the wrong order for big-endian. */
+        if (HOST_BIG_ENDIAN) {
+            t.d[0] = st->d[1];
+            t.d[1] = st->d[0];
+            aesdec_IMC(&t, &t, false);
+            ad->d[0] = t.d[1];
+            ad->d[1] = t.d[0];
+        } else {
+            aesdec_IMC(ad, st, false);
+        }

+void HELPER(vaeskf1_vi)(void *vd_vptr, void *vs2_vptr, uint32_t uimm,
+                        CPURISCVState *env, uint32_t desc)
+{
+    uint32_t *vd = vd_vptr;
+    uint32_t *vs2 = vs2_vptr;
+    uint32_t vl = env->vl;
+    uint32_t total_elems = vext_get_total_elems(env, desc, 4);
+    uint32_t vta = vext_vta(desc);
+
+    uimm &= 0b1111;
+    if (uimm > 10 || uimm == 0) {
+        uimm ^= 0b1000;
+    }
+
+    for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) {
+        uint32_t rk[8];
+        static const uint32_t rcon[] = {
+            0x01000000, 0x02000000, 0x04000000, 0x08000000, 0x10000000,
+            0x20000000, 0x40000000, 0x80000000, 0x1B000000, 0x36000000,
+        };
+
+        rk[0] = bswap32(vs2[i * 4 + H4(0)]);
+        rk[1] = bswap32(vs2[i * 4 + H4(1)]);
+        rk[2] = bswap32(vs2[i * 4 + H4(2)]);
+        rk[3] = bswap32(vs2[i * 4 + H4(3)]);
+
+        rk[4] = rk[0] ^ (((uint32_t)AES_sbox[(rk[3] >> 16) & 0xff] << 24) | +                         ((uint32_t)AES_sbox[(rk[3] >> 8) & 0xff] << 16) | +                         ((uint32_t)AES_sbox[(rk[3] >> 0) & 0xff] << 8) | +                         ((uint32_t)AES_sbox[(rk[3] >> 24) & 0xff] << 0))
+                      ^ rcon[uimm - 1];
+        rk[5] = rk[1] ^ rk[4];
+        rk[6] = rk[2] ^ rk[5];
+        rk[7] = rk[3] ^ rk[6];
+
+        vd[i * 4 + H4(0)] = bswap32(rk[4]);
+        vd[i * 4 + H4(1)] = bswap32(rk[5]);
+        vd[i * 4 + H4(2)] = bswap32(rk[6]);
+        vd[i * 4 + H4(3)] = bswap32(rk[7]);
+    }
+    env->vstart = 0;
+    /* set tail elements to 1s */
+    vext_set_elems_1s(vd, vta, vl * 4, total_elems * 4);
+}

All of this byte swapping is going to be wrong for a big-endian host.
It is also a little bit silly to do for a little-endian host.

You're byte swapping uint32_t words, then extracting bytes from those words.  Just extract the exact byte you require from the original input, using the H1() macro, and now you have correct code for both big- and little-endian hosts.


r~

I'll fix these byte swapping issues in the v7 patch set.

Thank you :)
Max



reply via email to

[Prev in Thread] Current Thread [Next in Thread]