[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH 07/10] target/arm: Convert Neon 2-reg-scalar long multiplies to d
From: |
Peter Maydell |
Subject: |
[PATCH 07/10] target/arm: Convert Neon 2-reg-scalar long multiplies to decodetree |
Date: |
Thu, 11 Jun 2020 15:45:26 +0100 |
Convert the Neon 2-reg-scalar long multiplies to decodetree.
These are the last instructions in the group.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
target/arm/neon-dp.decode | 18 ++++
target/arm/translate-neon.inc.c | 163 ++++++++++++++++++++++++++++
target/arm/translate.c | 182 ++------------------------------
3 files changed, 187 insertions(+), 176 deletions(-)
diff --git a/target/arm/neon-dp.decode b/target/arm/neon-dp.decode
index 7b069abe628..9ff182f56d7 100644
--- a/target/arm/neon-dp.decode
+++ b/target/arm/neon-dp.decode
@@ -476,16 +476,34 @@ Vimm_1r 1111 001 . 1 . 000 ... .... cmode:4 0 .
op:1 1 .... @1reg_imm
@2scalar .... ... q:1 . . size:2 .... .... .... . . . . .... \
&2scalar vm=%vm_dp vn=%vn_dp vd=%vd_dp
+ # For the 'long' ops the Q bit is part of insn decode
+ @2scalar_q0 .... ... . . . size:2 .... .... .... . . . . .... \
+ &2scalar vm=%vm_dp vn=%vn_dp vd=%vd_dp q=0
VMLA_2sc 1111 001 . 1 . .. .... .... 0000 . 1 . 0 .... @2scalar
VMLA_F_2sc 1111 001 . 1 . .. .... .... 0001 . 1 . 0 .... @2scalar
+ VMLAL_S_2sc 1111 001 0 1 . .. .... .... 0010 . 1 . 0 .... @2scalar_q0
+ VMLAL_U_2sc 1111 001 1 1 . .. .... .... 0010 . 1 . 0 .... @2scalar_q0
+
+ VQDMLAL_2sc 1111 001 0 1 . .. .... .... 0011 . 1 . 0 .... @2scalar_q0
+
VMLS_2sc 1111 001 . 1 . .. .... .... 0100 . 1 . 0 .... @2scalar
VMLS_F_2sc 1111 001 . 1 . .. .... .... 0101 . 1 . 0 .... @2scalar
+ VMLSL_S_2sc 1111 001 0 1 . .. .... .... 0110 . 1 . 0 .... @2scalar_q0
+ VMLSL_U_2sc 1111 001 1 1 . .. .... .... 0110 . 1 . 0 .... @2scalar_q0
+
+ VQDMLSL_2sc 1111 001 0 1 . .. .... .... 0111 . 1 . 0 .... @2scalar_q0
+
VMUL_2sc 1111 001 . 1 . .. .... .... 1000 . 1 . 0 .... @2scalar
VMUL_F_2sc 1111 001 . 1 . .. .... .... 1001 . 1 . 0 .... @2scalar
+ VMULL_S_2sc 1111 001 0 1 . .. .... .... 1010 . 1 . 0 .... @2scalar_q0
+ VMULL_U_2sc 1111 001 1 1 . .. .... .... 1010 . 1 . 0 .... @2scalar_q0
+
+ VQDMULL_2sc 1111 001 0 1 . .. .... .... 1011 . 1 . 0 .... @2scalar_q0
+
VQDMULH_2sc 1111 001 . 1 . .. .... .... 1100 . 1 . 0 .... @2scalar
VQRDMULH_2sc 1111 001 . 1 . .. .... .... 1101 . 1 . 0 .... @2scalar
diff --git a/target/arm/translate-neon.inc.c b/target/arm/translate-neon.inc.c
index 19344ac331d..f054b567c8f 100644
--- a/target/arm/translate-neon.inc.c
+++ b/target/arm/translate-neon.inc.c
@@ -2649,3 +2649,166 @@ static bool trans_VQRDMLSH_2sc(DisasContext *s,
arg_2scalar *a)
};
return do_vqrdmlah_2sc(s, a, opfn[a->size]);
}
+
+static bool do_2scalar_long(DisasContext *s, arg_2scalar *a,
+ NeonGenTwoOpWidenFn *opfn,
+ NeonGenTwo64OpFn *accfn)
+{
+ /*
+ * Two registers and a scalar, long operations: perform an
+ * operation on the input elements and the scalar which produces
+ * a double-width result, and then possibly perform an accumulation
+ * operation of that result into the destination.
+ */
+ TCGv_i32 scalar, rn;
+ TCGv_i64 rn0_64, rn1_64;
+
+ if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) &&
+ ((a->vd | a->vn | a->vm) & 0x10)) {
+ return false;
+ }
+
+ if (!opfn) {
+ /* Bad size (including size == 3, which is a different insn group) */
+ return false;
+ }
+
+ if (a->vd & 1) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ scalar = neon_get_scalar(a->size, a->vm);
+
+ /* Load all inputs before writing any outputs, in case of overlap */
+ rn = neon_load_reg(a->vn, 0);
+ rn0_64 = tcg_temp_new_i64();
+ opfn(rn0_64, rn, scalar);
+ tcg_temp_free_i32(rn);
+
+ rn = neon_load_reg(a->vn, 1);
+ rn1_64 = tcg_temp_new_i64();
+ opfn(rn1_64, rn, scalar);
+ tcg_temp_free_i32(rn);
+ tcg_temp_free_i32(scalar);
+
+ if (accfn) {
+ TCGv_i64 t64 = tcg_temp_new_i64();
+ neon_load_reg64(t64, a->vd);
+ accfn(t64, t64, rn0_64);
+ neon_store_reg64(t64, a->vd);
+ neon_load_reg64(t64, a->vd + 1);
+ accfn(t64, t64, rn1_64);
+ neon_store_reg64(t64, a->vd + 1);
+ tcg_temp_free_i64(t64);
+ } else {
+ neon_store_reg64(rn0_64, a->vd);
+ neon_store_reg64(rn1_64, a->vd + 1);
+ }
+ tcg_temp_free_i64(rn0_64);
+ tcg_temp_free_i64(rn1_64);
+ return true;
+}
+
+static bool trans_VMULL_S_2sc(DisasContext *s, arg_2scalar *a)
+{
+ static NeonGenTwoOpWidenFn * const opfn[] = {
+ NULL,
+ gen_helper_neon_mull_s16,
+ gen_mull_s32,
+ NULL,
+ };
+
+ return do_2scalar_long(s, a, opfn[a->size], NULL);
+}
+
+static bool trans_VMULL_U_2sc(DisasContext *s, arg_2scalar *a)
+{
+ static NeonGenTwoOpWidenFn * const opfn[] = {
+ NULL,
+ gen_helper_neon_mull_u16,
+ gen_mull_u32,
+ NULL,
+ };
+
+ return do_2scalar_long(s, a, opfn[a->size], NULL);
+}
+
+#define DO_VMLAL_2SC(INSN, MULL, ACC) \
+ static bool trans_##INSN##_2sc(DisasContext *s, arg_2scalar *a) \
+ { \
+ static NeonGenTwoOpWidenFn * const opfn[] = { \
+ NULL, \
+ gen_helper_neon_##MULL##16, \
+ gen_##MULL##32, \
+ NULL, \
+ }; \
+ static NeonGenTwo64OpFn * const accfn[] = { \
+ NULL, \
+ gen_helper_neon_##ACC##l_u32, \
+ tcg_gen_##ACC##_i64, \
+ NULL, \
+ }; \
+ return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]); \
+ }
+
+DO_VMLAL_2SC(VMLAL_S, mull_s, add)
+DO_VMLAL_2SC(VMLAL_U, mull_u, add)
+DO_VMLAL_2SC(VMLSL_S, mull_s, sub)
+DO_VMLAL_2SC(VMLSL_U, mull_u, sub)
+
+static bool trans_VQDMULL_2sc(DisasContext *s, arg_2scalar *a)
+{
+ static NeonGenTwoOpWidenFn * const opfn[] = {
+ NULL,
+ gen_VQDMULL_16,
+ gen_VQDMULL_32,
+ NULL,
+ };
+
+ return do_2scalar_long(s, a, opfn[a->size], NULL);
+}
+
+static bool trans_VQDMLAL_2sc(DisasContext *s, arg_2scalar *a)
+{
+ static NeonGenTwoOpWidenFn * const opfn[] = {
+ NULL,
+ gen_VQDMULL_16,
+ gen_VQDMULL_32,
+ NULL,
+ };
+ static NeonGenTwo64OpFn * const accfn[] = {
+ NULL,
+ gen_VQDMLAL_acc_16,
+ gen_VQDMLAL_acc_32,
+ NULL,
+ };
+
+ return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
+}
+
+static bool trans_VQDMLSL_2sc(DisasContext *s, arg_2scalar *a)
+{
+ static NeonGenTwoOpWidenFn * const opfn[] = {
+ NULL,
+ gen_VQDMULL_16,
+ gen_VQDMULL_32,
+ NULL,
+ };
+ static NeonGenTwo64OpFn * const accfn[] = {
+ NULL,
+ gen_VQDMLSL_acc_16,
+ gen_VQDMLSL_acc_32,
+ NULL,
+ };
+
+ return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
+}
diff --git a/target/arm/translate.c b/target/arm/translate.c
index f0db029f66d..4d39bbf035b 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -377,43 +377,6 @@ static void gen_revsh(TCGv_i32 dest, TCGv_i32 var)
tcg_gen_ext16s_i32(dest, var);
}
-/* 32x32->64 multiply. Marks inputs as dead. */
-static TCGv_i64 gen_mulu_i64_i32(TCGv_i32 a, TCGv_i32 b)
-{
- TCGv_i32 lo = tcg_temp_new_i32();
- TCGv_i32 hi = tcg_temp_new_i32();
- TCGv_i64 ret;
-
- tcg_gen_mulu2_i32(lo, hi, a, b);
- tcg_temp_free_i32(a);
- tcg_temp_free_i32(b);
-
- ret = tcg_temp_new_i64();
- tcg_gen_concat_i32_i64(ret, lo, hi);
- tcg_temp_free_i32(lo);
- tcg_temp_free_i32(hi);
-
- return ret;
-}
-
-static TCGv_i64 gen_muls_i64_i32(TCGv_i32 a, TCGv_i32 b)
-{
- TCGv_i32 lo = tcg_temp_new_i32();
- TCGv_i32 hi = tcg_temp_new_i32();
- TCGv_i64 ret;
-
- tcg_gen_muls2_i32(lo, hi, a, b);
- tcg_temp_free_i32(a);
- tcg_temp_free_i32(b);
-
- ret = tcg_temp_new_i64();
- tcg_gen_concat_i32_i64(ret, lo, hi);
- tcg_temp_free_i32(lo);
- tcg_temp_free_i32(hi);
-
- return ret;
-}
-
/* Swap low and high halfwords. */
static void gen_swap_half(TCGv_i32 var)
{
@@ -3164,58 +3127,6 @@ static inline void gen_neon_addl(int size)
}
}
-static inline void gen_neon_negl(TCGv_i64 var, int size)
-{
- switch (size) {
- case 0: gen_helper_neon_negl_u16(var, var); break;
- case 1: gen_helper_neon_negl_u32(var, var); break;
- case 2:
- tcg_gen_neg_i64(var, var);
- break;
- default: abort();
- }
-}
-
-static inline void gen_neon_addl_saturate(TCGv_i64 op0, TCGv_i64 op1, int size)
-{
- switch (size) {
- case 1: gen_helper_neon_addl_saturate_s32(op0, cpu_env, op0, op1); break;
- case 2: gen_helper_neon_addl_saturate_s64(op0, cpu_env, op0, op1); break;
- default: abort();
- }
-}
-
-static inline void gen_neon_mull(TCGv_i64 dest, TCGv_i32 a, TCGv_i32 b,
- int size, int u)
-{
- TCGv_i64 tmp;
-
- switch ((size << 1) | u) {
- case 0: gen_helper_neon_mull_s8(dest, a, b); break;
- case 1: gen_helper_neon_mull_u8(dest, a, b); break;
- case 2: gen_helper_neon_mull_s16(dest, a, b); break;
- case 3: gen_helper_neon_mull_u16(dest, a, b); break;
- case 4:
- tmp = gen_muls_i64_i32(a, b);
- tcg_gen_mov_i64(dest, tmp);
- tcg_temp_free_i64(tmp);
- break;
- case 5:
- tmp = gen_mulu_i64_i32(a, b);
- tcg_gen_mov_i64(dest, tmp);
- tcg_temp_free_i64(tmp);
- break;
- default: abort();
- }
-
- /* gen_helper_neon_mull_[su]{8|16} do not free their parameters.
- Don't forget to clean them now. */
- if (size < 2) {
- tcg_temp_free_i32(a);
- tcg_temp_free_i32(b);
- }
-}
-
static void gen_neon_narrow_op(int op, int u, int size,
TCGv_i32 dest, TCGv_i64 src)
{
@@ -5120,7 +5031,7 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t
insn)
int u;
int vec_size;
uint32_t imm;
- TCGv_i32 tmp, tmp2, tmp3, tmp4, tmp5;
+ TCGv_i32 tmp, tmp2, tmp3, tmp5;
TCGv_ptr ptr1;
TCGv_i64 tmp64;
@@ -5158,92 +5069,11 @@ static int disas_neon_data_insn(DisasContext *s,
uint32_t insn)
return 1;
} else { /* (insn & 0x00800010 == 0x00800000) */
if (size != 3) {
- op = (insn >> 8) & 0xf;
- if ((insn & (1 << 6)) == 0) {
- /* Three registers of different lengths: handled by decodetree
*/
- return 1;
- } else {
- /* Two registers and a scalar. NB that for ops of this form
- * the ARM ARM labels bit 24 as Q, but it is in our variable
- * 'u', not 'q'.
- */
- if (size == 0) {
- return 1;
- }
- switch (op) {
- case 0: /* Integer VMLA scalar */
- case 4: /* Integer VMLS scalar */
- case 8: /* Integer VMUL scalar */
- case 1: /* Float VMLA scalar */
- case 5: /* Floating point VMLS scalar */
- case 9: /* Floating point VMUL scalar */
- case 12: /* VQDMULH scalar */
- case 13: /* VQRDMULH scalar */
- case 14: /* VQRDMLAH scalar */
- case 15: /* VQRDMLSH scalar */
- return 1; /* handled by decodetree */
-
- case 3: /* VQDMLAL scalar */
- case 7: /* VQDMLSL scalar */
- case 11: /* VQDMULL scalar */
- if (u == 1) {
- return 1;
- }
- /* fall through */
- case 2: /* VMLAL sclar */
- case 6: /* VMLSL scalar */
- case 10: /* VMULL scalar */
- if (rd & 1) {
- return 1;
- }
- tmp2 = neon_get_scalar(size, rm);
- /* We need a copy of tmp2 because gen_neon_mull
- * deletes it during pass 0. */
- tmp4 = tcg_temp_new_i32();
- tcg_gen_mov_i32(tmp4, tmp2);
- tmp3 = neon_load_reg(rn, 1);
-
- for (pass = 0; pass < 2; pass++) {
- if (pass == 0) {
- tmp = neon_load_reg(rn, 0);
- } else {
- tmp = tmp3;
- tmp2 = tmp4;
- }
- gen_neon_mull(cpu_V0, tmp, tmp2, size, u);
- if (op != 11) {
- neon_load_reg64(cpu_V1, rd + pass);
- }
- switch (op) {
- case 6:
- gen_neon_negl(cpu_V0, size);
- /* Fall through */
- case 2:
- gen_neon_addl(size);
- break;
- case 3: case 7:
- gen_neon_addl_saturate(cpu_V0, cpu_V0, size);
- if (op == 7) {
- gen_neon_negl(cpu_V0, size);
- }
- gen_neon_addl_saturate(cpu_V0, cpu_V1, size);
- break;
- case 10:
- /* no-op */
- break;
- case 11:
- gen_neon_addl_saturate(cpu_V0, cpu_V0, size);
- break;
- default:
- abort();
- }
- neon_store_reg64(cpu_V0, rd + pass);
- }
- break;
- default:
- g_assert_not_reached();
- }
- }
+ /*
+ * Three registers of different lengths, or two registers and
+ * a scalar: handled by decodetree
+ */
+ return 1;
} else { /* size == 3 */
if (!u) {
/* Extract. */
--
2.20.1
- Re: [PATCH 01/10] target/arm: Add 'static' and 'const' annotations to VSHLL function arrays, (continued)
- [PATCH 03/10] target/arm: Convert Neon 2-reg-scalar integer multiplies to decodetree, Peter Maydell, 2020/06/11
- [PATCH 06/10] target/arm: Convert Neon 2-reg-scalar VQRDMLAH, VQRDMLSH to decodetree, Peter Maydell, 2020/06/11
- [PATCH 09/10] target/arm: Convert Neon VTBL, VTBX to decodetree, Peter Maydell, 2020/06/11
- [PATCH 08/10] target/arm: Convert Neon VEXT to decodetree, Peter Maydell, 2020/06/11
- [PATCH 07/10] target/arm: Convert Neon 2-reg-scalar long multiplies to decodetree,
Peter Maydell <=
- [PATCH 10/10] target/arm: Convert Neon VDUP (scalar) to decodetree, Peter Maydell, 2020/06/11
- Re: [PATCH 00/10] target/arm: Convert 2-reg-scalar to decodetree, no-reply, 2020/06/11
- Re: [PATCH 00/10] target/arm: Convert 2-reg-scalar to decodetree, no-reply, 2020/06/11