qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH 05/12] target/ppc: Move VPRTYB[WDQ] to decodetree and use gvec


From: Lucas Mateus Castro(alqotel)
Subject: [PATCH 05/12] target/ppc: Move VPRTYB[WDQ] to decodetree and use gvec
Date: Fri, 23 Sep 2022 18:47:47 -0300

From: "Lucas Mateus Castro (alqotel)" <lucas.araujo@eldorado.org.br>

Moved VPRTYBW and VPRTYBD to use gvec and both of them and VPRTYBQ to
decodetree.

vprtybw:
rept    loop    master             patch
8       12500   0,01215900         0,00705600 (-42.0%)
25      4000    0,01198700         0,00574400 (-52.1%)
100     1000    0,01307800         0,00692200 (-47.1%)
500     200     0,01794800         0,01558800 (-13.1%)
2500    40      0,04028200         0,05400800 (+34.1%)
8000    12      0,10127300         0,16744700 (+65.3%)

vprtybd:
rept    loop    master             patch
8       12500   0,00757400         0,00791600 (+4.5%)
25      4000    0,00651300         0,00673700 (+3.4%)
100     1000    0,00713400         0,00837700 (+17.4%)
500     200     0,01195400         0,01937400 (+62.1%)
2500    40      0,03478600         0,07005500 (+101.4%)
8000    12      0,09539600         0,21013500 (+120.3%)

vprtybq:
rept    loop    master             patch
8       12500   0,00065540         0,00066440 (+1.4%)
25      4000    0,00057720         0,00059850 (+3.7%)
100     1000    0,00066400         0,00069360 (+4.5%)
500     200     0,00115170         0,00127360 (+10.6%)
2500    40      0,00341890         0,00391550 (+14.5%)
8000    12      0,00951220         0,01111480 (+16.8%)

I wasn't expecting such a performance lost in both VPRTYBD and VPRTYBQ,
I'm not sure if it's worth to move those instructions. Comparing the 
assembly of the helper with the TCGop they are pretty similar, so
I'm not sure why vprtybd took so much more time.

Signed-off-by: Lucas Mateus Castro (alqotel) <lucas.araujo@eldorado.org.br>
---
 target/ppc/helper.h                 |  6 ++--
 target/ppc/insn32.decode            |  4 +++
 target/ppc/int_helper.c             |  6 ++--
 target/ppc/translate/vmx-impl.c.inc | 55 +++++++++++++++++++++++++++--
 target/ppc/translate/vmx-ops.c.inc  |  3 --
 5 files changed, 62 insertions(+), 12 deletions(-)

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index feccf30bcb..6a43e32ad3 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -194,9 +194,9 @@ DEF_HELPER_FLAGS_3(vsro, TCG_CALL_NO_RWG, void, avr, avr, 
avr)
 DEF_HELPER_FLAGS_3(vsrv, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_3(vslv, TCG_CALL_NO_RWG, void, avr, avr, avr)
 DEF_HELPER_FLAGS_4(VADDCUW, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
-DEF_HELPER_FLAGS_2(vprtybw, TCG_CALL_NO_RWG, void, avr, avr)
-DEF_HELPER_FLAGS_2(vprtybd, TCG_CALL_NO_RWG, void, avr, avr)
-DEF_HELPER_FLAGS_2(vprtybq, TCG_CALL_NO_RWG, void, avr, avr)
+DEF_HELPER_FLAGS_3(VPRTYBW, TCG_CALL_NO_RWG, void, avr, avr, i32)
+DEF_HELPER_FLAGS_3(VPRTYBD, TCG_CALL_NO_RWG, void, avr, avr, i32)
+DEF_HELPER_FLAGS_3(VPRTYBQ, TCG_CALL_NO_RWG, void, avr, avr, i32)
 DEF_HELPER_FLAGS_4(VSUBCUW, TCG_CALL_NO_RWG, void, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_5(vaddsbs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
 DEF_HELPER_FLAGS_5(vaddshs, TCG_CALL_NO_RWG, void, avr, avr, avr, avr, i32)
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 2658dd3395..aa4968e6b9 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -529,6 +529,10 @@ VCTZDM          000100 ..... ..... ..... 11111000100    @VX
 VPDEPD          000100 ..... ..... ..... 10111001101    @VX
 VPEXTD          000100 ..... ..... ..... 10110001101    @VX
 
+VPRTYBD         000100 ..... 01001 ..... 11000000010    @VX_tb
+VPRTYBQ         000100 ..... 01010 ..... 11000000010    @VX_tb
+VPRTYBW         000100 ..... 01000 ..... 11000000010    @VX_tb
+
 ## Vector Permute and Formatting Instruction
 
 VEXTDUBVLX      000100 ..... ..... ..... ..... 011000   @VA
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 338ebced22..64b2d44a66 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -502,7 +502,7 @@ void helper_VADDCUW(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t 
*b, uint32_t v)
 }
 
 /* vprtybw */
-void helper_vprtybw(ppc_avr_t *r, ppc_avr_t *b)
+void helper_VPRTYBW(ppc_avr_t *r, ppc_avr_t *b, uint32_t v)
 {
     int i;
     for (i = 0; i < ARRAY_SIZE(r->u32); i++) {
@@ -513,7 +513,7 @@ void helper_vprtybw(ppc_avr_t *r, ppc_avr_t *b)
 }
 
 /* vprtybd */
-void helper_vprtybd(ppc_avr_t *r, ppc_avr_t *b)
+void helper_VPRTYBD(ppc_avr_t *r, ppc_avr_t *b, uint32_t v)
 {
     int i;
     for (i = 0; i < ARRAY_SIZE(r->u64); i++) {
@@ -525,7 +525,7 @@ void helper_vprtybd(ppc_avr_t *r, ppc_avr_t *b)
 }
 
 /* vprtybq */
-void helper_vprtybq(ppc_avr_t *r, ppc_avr_t *b)
+void helper_VPRTYBQ(ppc_avr_t *r, ppc_avr_t *b, uint32_t v)
 {
     uint64_t res = b->u64[0] ^ b->u64[1];
     res ^= res >> 32;
diff --git a/target/ppc/translate/vmx-impl.c.inc 
b/target/ppc/translate/vmx-impl.c.inc
index 3f614097ac..06d91d1304 100644
--- a/target/ppc/translate/vmx-impl.c.inc
+++ b/target/ppc/translate/vmx-impl.c.inc
@@ -1659,9 +1659,58 @@ GEN_VXFORM_NOA_ENV(vrfim, 5, 11);
 GEN_VXFORM_NOA_ENV(vrfin, 5, 8);
 GEN_VXFORM_NOA_ENV(vrfip, 5, 10);
 GEN_VXFORM_NOA_ENV(vrfiz, 5, 9);
-GEN_VXFORM_NOA(vprtybw, 1, 24);
-GEN_VXFORM_NOA(vprtybd, 1, 24);
-GEN_VXFORM_NOA(vprtybq, 1, 24);
+
+static void gen_vprtyb(unsigned vece, TCGv_vec t, TCGv_vec b)
+{
+    int i;
+    TCGv_vec tmp = tcg_temp_new_vec_matching(b);
+    /* MO_32 is 2, so 2 iteractions for MO_32 and 3 for MO_64 */
+    for (i = 0; i < vece; i++) {
+        tcg_gen_shri_vec(vece, tmp, b, (4 << (vece - i)));
+        tcg_gen_xor_vec(vece, b, tmp, b);
+    }
+    tcg_gen_dupi_vec(vece, tmp, 1);
+    tcg_gen_and_vec(vece, t, b, tmp);
+    tcg_temp_free_vec(tmp);
+}
+
+static bool do_vx_vprtyb(DisasContext *ctx, arg_VX_tb *a, unsigned vece)
+{
+    static const TCGOpcode vecop_list[] = {
+        INDEX_op_shri_vec, 0
+    };
+
+    static const GVecGen2 op[] = {
+        {
+            .fniv = gen_vprtyb,
+            .fno = gen_helper_VPRTYBW,
+            .opt_opc = vecop_list,
+            .vece = MO_32
+        },
+        {
+            .fniv = gen_vprtyb,
+            .fno = gen_helper_VPRTYBD,
+            .opt_opc = vecop_list,
+            .vece = MO_64
+        },
+        {
+            .fno = gen_helper_VPRTYBQ,
+            .vece = MO_128
+        },
+    };
+
+    REQUIRE_INSNS_FLAGS2(ctx, ISA300);
+    REQUIRE_VECTOR(ctx);
+
+    tcg_gen_gvec_2(avr_full_offset(a->vrt), avr_full_offset(a->vrb),
+                   16, 16, &op[vece - MO_32]);
+
+    return true;
+}
+
+TRANS(VPRTYBW, do_vx_vprtyb, MO_32)
+TRANS(VPRTYBD, do_vx_vprtyb, MO_64)
+TRANS(VPRTYBQ, do_vx_vprtyb, MO_128)
 
 static void gen_vsplt(DisasContext *ctx, int vece)
 {
diff --git a/target/ppc/translate/vmx-ops.c.inc 
b/target/ppc/translate/vmx-ops.c.inc
index 27908533dd..46a620a232 100644
--- a/target/ppc/translate/vmx-ops.c.inc
+++ b/target/ppc/translate/vmx-ops.c.inc
@@ -106,9 +106,6 @@ GEN_VXFORM_300(vsrv, 2, 28),
 GEN_VXFORM_300(vslv, 2, 29),
 GEN_VXFORM(vslo, 6, 16),
 GEN_VXFORM(vsro, 6, 17),
-GEN_HANDLER_E_2(vprtybw, 0x4, 0x1, 0x18, 8, 0, PPC_NONE, PPC2_ISA300),
-GEN_HANDLER_E_2(vprtybd, 0x4, 0x1, 0x18, 9, 0, PPC_NONE, PPC2_ISA300),
-GEN_HANDLER_E_2(vprtybq, 0x4, 0x1, 0x18, 10, 0, PPC_NONE, PPC2_ISA300),
 
 GEN_VXFORM(xpnd04_1, 0, 22),
 GEN_VXFORM_300(bcdsr, 0, 23),
-- 
2.31.1




reply via email to

[Prev in Thread] Current Thread [Next in Thread]