[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [PATCH v5 24/60] target/riscv: vector single-width averaging add and subtract
From: |
Richard Henderson |
Subject: |
Re: [PATCH v5 24/60] target/riscv: vector single-width averaging add and subtract |
Date: |
Sat, 14 Mar 2020 18:00:58 -0700 |
User-agent: |
Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Thunderbird/68.4.1 |
On 3/14/20 4:12 PM, LIU Zhiwei wrote:
> I am not sure whether I get it. In my opinion, the code should be modified
> like
>
> /* Averaging signed 8-bit add, round-to-nearest-up (vxrm == 0):
>  * widen to 16 bits so a + b cannot overflow, halve, and add back
>  * the shifted-out low bit as the rounding increment. */
> static inline int8_t aadd8_rnu(CPURISCVState *env, int8_t a, int8_t b)
> {
> int16_t res = (int16_t)a + (int16_t)b;
> uint8_t round = res & 0x1;
> res = (res >> 1) + round;
> return res;
> }
>
> /* Averaging signed 8-bit add, round-to-nearest-even (vxrm == 1):
>  * the shifted-out bit (bit 0) set means an exact tie; increment
>  * only when bit 1 is also set, i.e. when truncation would leave
>  * an odd result, so ties round to even. */
> static inline int8_t aadd8_rne(CPURISCVState *env, int8_t a, int8_t b)
> {
> int16_t res = (int16_t)a + (int16_t)b;
> uint8_t round = ((res & 0x3) == 0x3);
> res = (res >> 1) + round;
> return res;
> }
>
> /* Averaging signed 8-bit add, round-down (vxrm == 2): the
>  * arithmetic right shift alone truncates toward minus infinity,
>  * so no rounding increment is needed.  env is unused but kept so
>  * all four variants share one function-pointer type. */
> static inline int8_t aadd8_rdn(CPURISCVState *env, int8_t a, int8_t b)
> {
> int16_t res = (int16_t)a + (int16_t)b;
> res = (res >> 1);
> return res;
> }
>
> /* Averaging signed 8-bit add, round-to-odd (vxrm == 3): when the
>  * shifted-out bit is 1 and the truncated result is even
>  * ((res & 0x3) == 0x1), increment to make the result odd ("jam"
>  * rounding). */
> static inline int8_t aadd8_rod(CPURISCVState *env, int8_t a, int8_t b)
> {
> int16_t res = (int16_t)a + (int16_t)b;
> uint8_t round = ((res & 0x3) == 0x1);
> res = (res >> 1) + round;
> return res;
> }
>
> /* Expand one byte-element helper per rounding mode. */
> RVVCALL(OPIVV2_ENV, vaadd_vv_b_rnu, OP_SSS_B, H1, H1, H1, aadd8_rnu)
> RVVCALL(OPIVV2_ENV, vaadd_vv_b_rne, OP_SSS_B, H1, H1, H1, aadd8_rne)
> RVVCALL(OPIVV2_ENV, vaadd_vv_b_rdn, OP_SSS_B, H1, H1, H1, aadd8_rdn)
> RVVCALL(OPIVV2_ENV, vaadd_vv_b_rod, OP_SSS_B, H1, H1, H1, aadd8_rod)
>
> /* Shared vector-vector loop: apply fn to every element index below
>  * vl, skipping elements whose mask bit in v0 is clear when the
>  * instruction is masked (vm == 0), then clear the tail elements
>  * from vl up to vlmax.
>  * NOTE(review): after the loop i always equals vl (continue does
>  * not stop the loop), so "i != 0" is effectively a vl != 0 guard
>  * on the tail clear. */
> void do_vext_vv_env(void *vd, void *v0, void *vs1,
> void *vs2, CPURISCVState *env, uint32_t desc,
> uint32_t esz, uint32_t dsz,
> opivv2_fn *fn, clear_fn *clearfn)
> {
> uint32_t vlmax = vext_maxsz(desc) / esz;
> uint32_t mlen = vext_mlen(desc);
> uint32_t vm = vext_vm(desc);
> uint32_t vl = env->vl;
> uint32_t i;
> for (i = 0; i < vl; i++) {
> if (!vm && !vext_elem_mask(v0, mlen, i)) {
> continue;
> }
> fn(vd, vs1, vs2, i, env);
> }
> if (i != 0) {
> clear_fn(vd, vl, vl * dsz, vlmax * dsz);
> }
> }
>
> /* Per-helper expansion: select the element function matching the
>  * dynamic rounding mode in env->vxrm, then run the common loop. */
> #define GEN_VEXT_VV_ENV(NAME, ESZ, DSZ, CLEAR_FN) \
> void HELPER(NAME)(void *vd, void *v0, void *vs1, \
> void *vs2, CPURISCVState *env, \
> uint32_t desc) \
> { \
> static opivv2_fn *fns[4] = { \
> NAME##_rnu, NAME##_rne, \
> NAME##_rdn, NAME##_rod \
> }; \
> /* fix: the ';' after the initializer was missing, and a void \
>  * helper must not 'return' a void expression (C11 6.8.6.4). */ \
> do_vext_vv_env(vd, v0, vs1, vs2, env, desc, \
> ESZ, DSZ, fns[env->vxrm], \
> CLEAR_FN); \
> }
>
> Is it true?
While that does look good for this case, there are many other uses of
get_round(), and it may not be quite as simple there.
My suggestion was
/* Averaging signed add shared by the sub-64-bit element widths
 * (presumably OP_SSS_B/H sign-extend narrower elements before the
 * call -- see the RVVCALL expansions below): compute the sum in
 * 64 bits so it cannot overflow, then halve with the rounding
 * increment chosen by vxrm via get_round(). */
static inline int32_t aadd32(int vxrm, int32_t a, int32_t b)
{
int64_t res = (int64_t)a + b;
uint8_t round = get_round(vxrm, res, 1);
return (res >> 1) + round;
}
/* 64-bit averaging add.  a + b can wrap past the int64_t range;
 * 'over' is INT64_MIN exactly when signed overflow occurred (the
 * result's sign differs from both operands' signs), and XOR-ing it
 * into res >> 1 restores the true bit 63 of the 65-bit sum.
 * NOTE(review): this relies on wrapping signed addition being
 * defined (QEMU builds with -fwrapv) -- confirm for this tree. */
static inline int64_t aadd64(int vxrm, int64_t a, int64_t b)
{
int64_t res = a + b;
uint8_t round = get_round(vxrm, res, 1);
int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
/* With signed overflow, bit 64 is inverse of bit 63. */
return ((res >> 1) ^ over) + round;
}
/* aadd32 serves the 8/16/32-bit element widths; aadd64 keeps the
 * extra precision needed for 64-bit elements. */
RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
/* Inner loop: vxrm arrives as a compile-time constant from every
 * call site in vext_vv_rm_2, so after inlining get_round() folds
 * down to the single rounding mode being executed. */
static inline void
vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
uint32_t vl, uint32_t vm, uint32_t mlen, int vxrm,
opivv2_rm_fn *fn)
{
for (uint32_t i = 0; i < vl; i++) {
if (!vm && !vext_elem_mask(v0, mlen, i)) {
continue;
}
fn(vd, vs1, vs2, i, vxrm);
}
}
/* Outer dispatcher: switch once on the runtime rounding mode
 * env->vxrm and call vext_vv_rm_1 with a constant vxrm, so each
 * case inlines a mode-specialized copy of the loop; finally clear
 * the tail elements from vl up to vlmax.  vl == 0 returns early
 * without touching the destination. */
static inline void
vext_vv_rm_2(void *vd, void *v0, void *vs1,
void *vs2, CPURISCVState *env, uint32_t desc,
uint32_t esz, uint32_t dsz,
opivv2_rm_fn *fn, clear_fn *clearfn)
{
uint32_t vlmax = vext_maxsz(desc) / esz;
uint32_t mlen = vext_mlen(desc);
uint32_t vm = vext_vm(desc);
uint32_t vl = env->vl;
if (vl == 0) {
return;
}
switch (env->vxrm) {
case 0: /* rnu */
vext_vv_rm_1(vd, v0, vs1, vs2,
vl, vm, mlen, 0, fn);
break;
case 1: /* rne */
vext_vv_rm_1(vd, v0, vs1, vs2,
vl, vm, mlen, 1, fn);
break;
case 2: /* rdn */
vext_vv_rm_1(vd, v0, vs1, vs2,
vl, vm, mlen, 2, fn);
break;
default: /* rod */
vext_vv_rm_1(vd, v0, vs1, vs2,
vl, vm, mlen, 3, fn);
break;
}
clear_fn(vd, vl, vl * dsz, vlmax * dsz);
}
From vext_vv_rm_2, a constant is passed down through all of the inline functions, so
that a constant arrives in get_round() at the bottom of the call chain. At
which point all of the expressions get folded by the compiler and we *should*
get very similar generated code as to what you have above.
r~
[PATCH v5 25/60] target/riscv: vector single-width fractional multiply with rounding and saturation, LIU Zhiwei, 2020/03/12
[PATCH v5 26/60] target/riscv: vector widening saturating scaled multiply-add, LIU Zhiwei, 2020/03/12
[PATCH v5 27/60] target/riscv: vector single-width scaling shift instructions, LIU Zhiwei, 2020/03/12
[PATCH v5 28/60] target/riscv: vector narrowing fixed-point clip instructions, LIU Zhiwei, 2020/03/12