[PULL 34/47] target/arm: Implement fp16 for Neon pairwise fp ops

qemu-devel

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PULL 34/47] target/arm: Implement fp16 for Neon pairwise fp ops

From:	Peter Maydell
Subject:	[PULL 34/47] target/arm: Implement fp16 for Neon pairwise fp ops
Date:	Tue, 1 Sep 2020 16:18:10 +0100

Convert the Neon pairwise fp ops to use a single gvic-style
helper to do the full operation instead of one helper call
for each 32-bit part. This allows us to use the same
framework to implement the fp16.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20200828183354.27913-36-peter.maydell@linaro.org
---
 target/arm/helper.h             |  7 +++++
 target/arm/vec_helper.c         | 45 +++++++++++++++++++++++++++++++++
 target/arm/translate-neon.c.inc | 42 ++++++++++++------------------
 3 files changed, 68 insertions(+), 26 deletions(-)

diff --git a/target/arm/helper.h b/target/arm/helper.h
index 5a716498913..f1f33c696d9 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -607,6 +607,13 @@ DEF_HELPER_FLAGS_5(gvec_fcmlas_idx, TCG_CALL_NO_RWG,
 DEF_HELPER_FLAGS_5(gvec_fcmlad, TCG_CALL_NO_RWG,
                    void, ptr, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_5(neon_paddh, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(neon_pmaxh, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(neon_pminh, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(neon_padds, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(neon_pmaxs, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(neon_pmins, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_4(gvec_frecpe_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_frecpe_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_frecpe_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
index e5bb5e395cb..46623d401e7 100644
--- a/target/arm/vec_helper.c
+++ b/target/arm/vec_helper.c
@@ -1771,3 +1771,48 @@ DO_ABA(gvec_uaba_s, uint32_t)
 DO_ABA(gvec_uaba_d, uint64_t)
 
 #undef DO_ABA
+
+#define DO_NEON_PAIRWISE(NAME, OP)                                      \
+    void HELPER(NAME##s)(void *vd, void *vn, void *vm,                  \
+                         void *stat, uint32_t oprsz)                    \
+    {                                                                   \
+        float_status *fpst = stat;                                      \
+        float32 *d = vd;                                                \
+        float32 *n = vn;                                                \
+        float32 *m = vm;                                                \
+        float32 r0, r1;                                                 \
+                                                                        \
+        /* Read all inputs before writing outputs in case vm == vd */   \
+        r0 = float32_##OP(n[H4(0)], n[H4(1)], fpst);                    \
+        r1 = float32_##OP(m[H4(0)], m[H4(1)], fpst);                    \
+                                                                        \
+        d[H4(0)] = r0;                                                  \
+        d[H4(1)] = r1;                                                  \
+    }                                                                   \
+                                                                        \
+    void HELPER(NAME##h)(void *vd, void *vn, void *vm,                  \
+                         void *stat, uint32_t oprsz)                    \
+    {                                                                   \
+        float_status *fpst = stat;                                      \
+        float16 *d = vd;                                                \
+        float16 *n = vn;                                                \
+        float16 *m = vm;                                                \
+        float16 r0, r1, r2, r3;                                         \
+                                                                        \
+        /* Read all inputs before writing outputs in case vm == vd */   \
+        r0 = float16_##OP(n[H2(0)], n[H2(1)], fpst);                    \
+        r1 = float16_##OP(n[H2(2)], n[H2(3)], fpst);                    \
+        r2 = float16_##OP(m[H2(0)], m[H2(1)], fpst);                    \
+        r3 = float16_##OP(m[H2(2)], m[H2(3)], fpst);                    \
+                                                                        \
+        d[H4(0)] = r0;                                                  \
+        d[H4(1)] = r1;                                                  \
+        d[H4(2)] = r2;                                                  \
+        d[H4(3)] = r3;                                                  \
+    }
+
+DO_NEON_PAIRWISE(neon_padd, add)
+DO_NEON_PAIRWISE(neon_pmax, max)
+DO_NEON_PAIRWISE(neon_pmin, min)
+
+#undef DO_NEON_PAIRWISE
diff --git a/target/arm/translate-neon.c.inc b/target/arm/translate-neon.c.inc
index 4c6d63be3d3..d13075986e5 100644
--- a/target/arm/translate-neon.c.inc
+++ b/target/arm/translate-neon.c.inc
@@ -1112,10 +1112,10 @@ static bool trans_VMINNM_fp_3s(DisasContext *s, 
arg_3same *a)
     return do_3same(s, a, gen_VMINNM_fp32_3s);
 }
 
-static bool do_3same_fp_pair(DisasContext *s, arg_3same *a, VFPGen3OpSPFn *fn)
+static bool do_3same_fp_pair(DisasContext *s, arg_3same *a,
+                             gen_helper_gvec_3_ptr *fn)
 {
-    /* FP operations handled pairwise 32 bits at a time */
-    TCGv_i32 tmp, tmp2, tmp3;
+    /* FP pairwise operations */
     TCGv_ptr fpstatus;
 
     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
@@ -1134,26 +1134,14 @@ static bool do_3same_fp_pair(DisasContext *s, arg_3same 
*a, VFPGen3OpSPFn *fn)
 
     assert(a->q == 0); /* enforced by decode patterns */
 
-    /*
-     * Note that we have to be careful not to clobber the source operands
-     * in the "vm == vd" case by storing the result of the first pass too
-     * early. Since Q is 0 there are always just two passes, so instead
-     * of a complicated loop over each pass we just unroll.
-     */
-    fpstatus = fpstatus_ptr(FPST_STD);
-    tmp = neon_load_reg(a->vn, 0);
-    tmp2 = neon_load_reg(a->vn, 1);
-    fn(tmp, tmp, tmp2, fpstatus);
-    tcg_temp_free_i32(tmp2);
 
-    tmp3 = neon_load_reg(a->vm, 0);
-    tmp2 = neon_load_reg(a->vm, 1);
-    fn(tmp3, tmp3, tmp2, fpstatus);
-    tcg_temp_free_i32(tmp2);
+    fpstatus = fpstatus_ptr(a->size != 0 ? FPST_STD_F16 : FPST_STD);
+    tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
+                       vfp_reg_offset(1, a->vn),
+                       vfp_reg_offset(1, a->vm),
+                       fpstatus, 8, 8, 0, fn);
     tcg_temp_free_ptr(fpstatus);
 
-    neon_store_reg(a->vd, 0, tmp);
-    neon_store_reg(a->vd, 1, tmp3);
     return true;
 }
 
@@ -1165,15 +1153,17 @@ static bool do_3same_fp_pair(DisasContext *s, arg_3same 
*a, VFPGen3OpSPFn *fn)
     static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \
     {                                                               \
         if (a->size != 0) {                                         \
-            /* TODO fp16 support */                                 \
-            return false;                                           \
+            if (!dc_isar_feature(aa32_fp16_arith, s)) {             \
+                return false;                                       \
+            }                                                       \
+            return do_3same_fp_pair(s, a, FUNC##h);                 \
         }                                                           \
-        return do_3same_fp_pair(s, a, FUNC);                        \
+        return do_3same_fp_pair(s, a, FUNC##s);                     \
     }
 
-DO_3S_FP_PAIR(VPADD, gen_helper_vfp_adds)
-DO_3S_FP_PAIR(VPMAX, gen_helper_vfp_maxs)
-DO_3S_FP_PAIR(VPMIN, gen_helper_vfp_mins)
+DO_3S_FP_PAIR(VPADD, gen_helper_neon_padd)
+DO_3S_FP_PAIR(VPMAX, gen_helper_neon_pmax)
+DO_3S_FP_PAIR(VPMIN, gen_helper_neon_pmin)
 
 static bool do_vector_2sh(DisasContext *s, arg_2reg_shift *a, GVecGen2iFn *fn)
 {
-- 
2.20.1

[Prev in Thread]

Current Thread

[Next in Thread]

[PULL 23/47] target/arm: Implement fp16 for Neon VRECPE, VRSQRTE using gvec, (continued)
- [PULL 23/47] target/arm: Implement fp16 for Neon VRECPE, VRSQRTE using gvec, Peter Maydell, 2020/09/01
- [PULL 24/47] target/arm: Implement fp16 for Neon VABS, VNEG of floats, Peter Maydell, 2020/09/01
- [PULL 25/47] target/arm: Implement fp16 for VCEQ, VCGE, VCGT comparisons, Peter Maydell, 2020/09/01
- [PULL 27/47] target/arm: Implement fp16 for Neon VMAX, VMIN, Peter Maydell, 2020/09/01
- [PULL 28/47] target/arm: Implement fp16 for Neon VMAXNM, VMINNM, Peter Maydell, 2020/09/01
- [PULL 29/47] target/arm: Implement fp16 for Neon VMLA, VMLS operations, Peter Maydell, 2020/09/01
- [PULL 30/47] target/arm: Implement fp16 for Neon VFMA, VMFS, Peter Maydell, 2020/09/01
- [PULL 26/47] target/arm: Implement fp16 for VACGE, VACGT, Peter Maydell, 2020/09/01
- [PULL 31/47] target/arm: Implement fp16 for Neon fp compare-vs-0, Peter Maydell, 2020/09/01
- [PULL 33/47] target/arm: Implement fp16 for Neon VRSQRTS, Peter Maydell, 2020/09/01
- [PULL 34/47] target/arm: Implement fp16 for Neon pairwise fp ops, Peter Maydell <=
- [PULL 35/47] target/arm: Implement fp16 for Neon float-integer VCVT, Peter Maydell, 2020/09/01
- [PULL 32/47] target/arm: Implement fp16 for Neon VRECPS, Peter Maydell, 2020/09/01
- [PULL 36/47] target/arm: Convert Neon VCVT fixed-point to gvec, Peter Maydell, 2020/09/01
- [PULL 37/47] target/arm: Implement fp16 for Neon VCVT fixed-point, Peter Maydell, 2020/09/01
- [PULL 39/47] target/arm: Implement fp16 for Neon VRINT-with-specified-rounding-mode, Peter Maydell, 2020/09/01
- [PULL 40/47] target/arm: Implement fp16 for Neon VRINTX, Peter Maydell, 2020/09/01
- [PULL 38/47] target/arm: Implement fp16 for Neon VCVT with rounding modes, Peter Maydell, 2020/09/01
- [PULL 41/47] target/arm/vec_helper: Handle oprsz less than 16 bytes in indexed operations, Peter Maydell, 2020/09/01
- [PULL 42/47] target/arm/vec_helper: Add gvec fp indexed multiply-and-add operations, Peter Maydell, 2020/09/01
- [PULL 43/47] target/arm: Implement fp16 for Neon VMUL, VMLA, VMLS, Peter Maydell, 2020/09/01

Prev by Date: [PULL 33/47] target/arm: Implement fp16 for Neon VRSQRTS
Next by Date: [PULL 35/47] target/arm: Implement fp16 for Neon float-integer VCVT
Previous by thread: [PULL 33/47] target/arm: Implement fp16 for Neon VRSQRTS
Next by thread: [PULL 35/47] target/arm: Implement fp16 for Neon float-integer VCVT
Index(es):
- Date
- Thread