From: Richard Henderson
Subject: Re: [Qemu-devel] [PATCH 09/10 v11] target-tilegx: Generate tcg instructions to finish "Hello world"
Date: Mon, 01 Jun 2015 11:40:29 -0700
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:31.0) Gecko/20100101 Thunderbird/31.7.0
First, what happened to the decoding skeleton patch? You seem to have merged
it with patch 9 here. That said, see the bottom of this message.
On 05/30/2015 02:18 PM, Chen Gang wrote:
> +/* mfspr can be only in X1 pipe, so it doesn't need to be bufferd */
> +static void gen_mfspr(struct DisasContext *dc, uint8_t rdst, uint16_t imm14)
I'm not keen on this as a comment.  Clearly it could be buffered, with what is
implemented here now.  But there are plenty of SPRs which produce side
effects, and *cannot* be buffered.
Adjust the comment to

  /* Many SPR reads have side effects and cannot be buffered.  However, they
     are all in the X1 pipe, which we are executing last, therefore we need
     not do additional buffering.  */
> +/* mtspr can be only in X1 pipe, so it doesn't need to be bufferd */
Same, but s/reads/writes/.
> +#if 1
Do not include this.
> +/*
> + * uint64_t output = 0;
> + * uint32_t counter;
> + * for (counter = 0; counter < (WORD_SIZE / BYTE_SIZE); counter++)
> + * {
> + * int8_t srca = getByte (rf[SrcA], counter);
> + * int8_t srcb = signExtend8 (Imm8);
> + * output = setByte (output, counter, ((srca == srcb) ? 1 : 0));
> + * }
> + * rf[Dest] = output;
> + */
> +static void gen_v1cmpeqi(struct DisasContext *dc,
> + uint8_t rdst, uint8_t rsrc, int8_t imm8)
Pass in the condition to use, since you'll eventually need to implement
v1cmpltsi, v1cmpltui.
> +static void gen_v1cmpeq(struct DisasContext *dc,
> + uint8_t rdst, uint8_t rsrc, uint8_t rsrcb)
Likewise for v1cmples, v1cmpleu, v1cmplts, v1cmpltu, v1cmpne.
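Something along these lines, perhaps.  This is only a sketch -- gen_v1cmpi and
its signature are hypothetical, reusing dest_gr/load_gr from your patch, and
the unsigned conditions would zero-extend the byte instead:

  static void gen_v1cmpi(struct DisasContext *dc, uint8_t rdst, uint8_t rsrc,
                         int8_t imm8, TCGCond cond, const char *mnemonic)
  {
      TCGv vdst = dest_gr(dc, rdst);
      TCGv vsrc = load_gr(dc, rsrc);
      TCGv tmp = tcg_temp_new_i64();
      int i;

      qemu_log_mask(CPU_LOG_TB_IN_ASM, "%s r%d, r%d, %d\n",
                    mnemonic, rdst, rsrc, imm8);

      tcg_gen_movi_i64(vdst, 0);
      for (i = 0; i < 8; i++) {
          /* Sign-extend byte i of the source into tmp.  */
          tcg_gen_shli_i64(tmp, vsrc, 56 - i * 8);
          tcg_gen_sari_i64(tmp, tmp, 56);
          /* Compare against the immediate and deposit the 0/1 result.  */
          tcg_gen_setcondi_i64(cond, tmp, tmp, imm8);
          tcg_gen_deposit_i64(vdst, vdst, tmp, i * 8, 8);
      }
      tcg_temp_free_i64(tmp);
  }

The register-register form is the same loop, with setcond against a second
extracted byte instead of setcondi against the immediate.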
> + tcg_gen_movi_i64(vdst, 0); /* or Assertion `ts->val_type ==
> TEMP_VAL_REG' */
These comments are unnecessary. Of course it's illegal to use an uninitialized
temporary.
> +static void gen_v4int_l(struct DisasContext *dc,
> + uint8_t rdst, uint8_t rsrc, uint8_t rsrcb)
> +{
> + qemu_log_mask(CPU_LOG_TB_IN_ASM, "v4int_l r%d, r%d, r%d\n",
> + rdst, rsrc, rsrcb);
> + tcg_gen_deposit_i64(dest_gr(dc, rdst), load_gr(dc, rsrc),
> + load_gr(dc, rsrcb), 0, 32);
This is incorrect. This produces { A1, B0 }, not { A0, B0 }.
As I said, you did want "32, 32" as the field insert, but you have the source
operands in the wrong order.
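I.e., something like this (a sketch): keep B's low word as the base and insert
A's low word at bit 32:

  tcg_gen_deposit_i64(dest_gr(dc, rdst), load_gr(dc, rsrcb),
                      load_gr(dc, rsrc), 32, 32);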
> +static void gen_addx(struct DisasContext *dc,
> + uint8_t rdst, uint8_t rsrc, uint8_t rsrcb)
> +{
> + TCGv vdst = dest_gr(dc, rdst);
> +
> + /* High bits have no effect with low bits, so addx and addxsc are
> merged. */
> + qemu_log_mask(CPU_LOG_TB_IN_ASM, "addx(sc) r%d, r%d, r%d\n",
> + rdst, rsrc, rsrcb);
Um, no, addxsc does signed saturation before sign extension.
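For comparison, a minimal sketch of a separate gen_addxsc -- the function name
is hypothetical, and clamping via movcond is just one way to implement the
saturation:

  static void gen_addxsc(struct DisasContext *dc,
                         uint8_t rdst, uint8_t rsrc, uint8_t rsrcb)
  {
      TCGv vdst = dest_gr(dc, rdst);
      TCGv t0 = tcg_temp_new_i64();
      TCGv t1 = tcg_temp_new_i64();

      qemu_log_mask(CPU_LOG_TB_IN_ASM, "addxsc r%d, r%d, r%d\n",
                    rdst, rsrc, rsrcb);

      /* Widen both 32-bit inputs and add; the 64-bit sum cannot overflow.  */
      tcg_gen_ext32s_i64(t0, load_gr(dc, rsrc));
      tcg_gen_ext32s_i64(t1, load_gr(dc, rsrcb));
      tcg_gen_add_i64(t0, t0, t1);

      /* Clamp to the signed 32-bit range; the clamped value is then already
         sign-extended, which is the final extension addxsc requires.  */
      tcg_gen_movi_i64(t1, INT32_MAX);
      tcg_gen_movcond_i64(TCG_COND_GT, t0, t0, t1, t1, t0);
      tcg_gen_movi_i64(t1, INT32_MIN);
      tcg_gen_movcond_i64(TCG_COND_LT, t0, t0, t1, t1, t0);

      tcg_gen_mov_i64(vdst, t0);
      tcg_temp_free_i64(t0);
      tcg_temp_free_i64(t1);
  }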
> +static void gen_mul_u_u(struct DisasContext *dc,
> + uint8_t rdst, uint8_t rsrc, uint8_t rsrcb,
> + int8 high, int8 highb, int8 add, const char *code)
A better name for this function is warranted, since it does much more than
mul_u_u. The add parameter should be type bool.
Given the existence of mul_hs_hs, mul_hu_ls, etc, you're probably better off
passing in extraction functions. E.g.
  static void ext32s_high(TCGv d, TCGv s)
  {
      tcg_gen_sari_i64(d, s, 32);
  }

  static void ext32u_high(TCGv d, TCGv s)
  {
      tcg_gen_shri_i64(d, s, 32);
  }

  gen_mul(dc, rdst, rsrc, rsrcb, ext32s_high, ext32s_high,
          false, "mul_hs_hs");
  gen_mul(dc, rdst, rsrc, rsrcb, ext32s_high, ext32u_high,
          false, "mul_hs_hu");
  gen_mul(dc, rdst, rsrc, rsrcb, ext32s_high, tcg_gen_ext32s_i64,
          false, "mul_hs_ls");
  gen_mul(dc, rdst, rsrc, rsrcb, ext32s_high, tcg_gen_ext32u_i64,
          false, "mul_hs_lu");
  gen_mul(dc, rdst, rsrc, rsrcb, ext32u_high, ext32u_high,
          false, "mul_hu_hu");
  gen_mul(dc, rdst, rsrc, rsrcb, ext32u_high, tcg_gen_ext32s_i64,
          false, "mul_hu_ls");
  gen_mul(dc, rdst, rsrc, rsrcb, ext32u_high, tcg_gen_ext32u_i64,
          false, "mul_hu_lu");
  gen_mul(dc, rdst, rsrc, rsrcb, tcg_gen_ext32s_i64, tcg_gen_ext32s_i64,
          false, "mul_ls_ls");
  gen_mul(dc, rdst, rsrc, rsrcb, tcg_gen_ext32s_i64, tcg_gen_ext32u_i64,
          false, "mul_ls_lu");
  gen_mul(dc, rdst, rsrc, rsrcb, tcg_gen_ext32u_i64, tcg_gen_ext32u_i64,
          false, "mul_lu_lu");
and of course the same for the mula insns with true instead of false for the
"add" parameter.
> +static void gen_shladd(struct DisasContext *dc,
> + uint8_t rdst, uint8_t rsrc, uint8_t rsrcb,
> + uint8_t shift, uint8_t cast)
cast should be bool.
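I.e. (sketch):

  static void gen_shladd(struct DisasContext *dc,
                         uint8_t rdst, uint8_t rsrc, uint8_t rsrcb,
                         uint8_t shift, bool cast)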
> +static void gen_dblalign(struct DisasContext *dc,
> + uint8_t rdst, uint8_t rsrc, uint8_t rsrcb)
> +{
> + TCGv vdst = dest_gr(dc, rdst);
> + TCGv mask = tcg_temp_new_i64();
> + TCGv tmp = tcg_temp_new_i64();
> +
> + qemu_log_mask(CPU_LOG_TB_IN_ASM, "dblalign r%d, r%d, r%d\n",
> + rdst, rsrc, rsrcb);
> +
> + tcg_gen_andi_i64(mask, load_gr(dc, rsrcb), 7);
> + tcg_gen_muli_i64(mask, mask, 8);
tcg_gen_shli_i64(mask, mask, 3);
> + tcg_gen_shr_i64(vdst, load_gr(dc, rdst), mask);
> +
> + tcg_gen_movi_i64(tmp, 64);
> + tcg_gen_sub_i64(mask, tmp, mask);
> + tcg_gen_shl_i64(mask, load_gr(dc, rsrc), mask);
> +
> + tcg_gen_or_i64(vdst, vdst, mask);
Does not produce the correct results for mask == 0.
What you want is when mask == 0, you shift A by 64 bits, i.e. produce a zero.
But you can't do that in TCG (or C for that matter). Best is to do two shifts:
  tcg_gen_xori_i64(mask, mask, 63);  /* compute 1's complement of the shift */
  tcg_gen_shl_i64(mask, load_gr(dc, rsrc), mask);
  tcg_gen_shli_i64(mask, mask, 1);   /* one more to produce 2's complement */
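Putting that together with the shli fix above, the whole sequence might read
(a sketch, with the same dest_gr/load_gr helpers as in your patch):

  tcg_gen_andi_i64(mask, load_gr(dc, rsrcb), 7);
  tcg_gen_shli_i64(mask, mask, 3);               /* byte count -> bit count */
  tcg_gen_shr_i64(vdst, load_gr(dc, rdst), mask);

  /* Shift A left by (64 - count) bits as two shifts totalling 64, so that
     count == 0 correctly produces zero.  Here 63 ^ count == 63 - count,
     since count is a multiple of 8.  */
  tcg_gen_xori_i64(mask, mask, 63);
  tcg_gen_shl_i64(mask, load_gr(dc, rsrc), mask);
  tcg_gen_shli_i64(mask, mask, 1);

  tcg_gen_or_i64(vdst, vdst, mask);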
> +static void gen_ld_add(struct DisasContext *dc,
> + uint8_t rdst, uint8_t rsrc, int8_t imm8,
> + TCGMemOp ops, const char *code)
> +{
> + qemu_log_mask(CPU_LOG_TB_IN_ASM, "%s r%d, r%d, %d\n",
> + code, rdst, rsrc, imm8);
> +
> + tcg_gen_qemu_ld_i64(dest_gr(dc, rdst), load_gr(dc, rsrc),
> + MMU_USER_IDX, ops);
> + /*
> + * Each pipe only have one temp val which is already used, and it is only
> + * for pipe X1, so can use real register
> + */
> + if (rsrc < TILEGX_R_COUNT) {
> + tcg_gen_addi_i64(cpu_regs[rsrc], load_gr(dc, rsrc), imm8);
> + }
> +}
This is a poor comment. Clearly each pipe can have two outputs, so this
limitation is simply of your own design.
Further, the < TILEGX_R_COUNT restriction is also incorrect.  True, you don't
actually implement the top 7 special registers, but that doesn't matter; you
should still be incrementing them.
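E.g. (a sketch, assuming dest_gr can buffer a second per-pipe output, special
registers included):

  tcg_gen_qemu_ld_i64(dest_gr(dc, rdst), load_gr(dc, rsrc),
                      MMU_USER_IDX, ops);
  tcg_gen_addi_i64(dest_gr(dc, rsrc), load_gr(dc, rsrc), imm8);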
> +
> + return;
Do not add bare return statements at the ends of functions.
> +static int gen_blb(struct DisasContext *dc, uint8_t rsrc, int32_t off,
> + TCGCond cond, const char *code)
Unused return value. What were you intending?
> +static void decode_rrr_1_opcode_y0(struct DisasContext *dc,
> + tilegx_bundle_bits bundle)
> +{
> + uint8_t rsrc = get_SrcA_Y0(bundle);
> + uint8_t rsrcb = get_SrcB_Y0(bundle);
> + uint8_t rdst = get_Dest_Y0(bundle);
> +
> + switch (get_RRROpcodeExtension_Y0(bundle)) {
> + case UNARY_RRR_1_OPCODE_Y0:
> + switch (get_UnaryOpcodeExtension_Y0(bundle)) {
> + case CNTLZ_UNARY_OPCODE_Y0:
> + gen_cntlz(dc, rdst, rsrc);
> + return;
> + case CNTTZ_UNARY_OPCODE_Y0:
> + gen_cnttz(dc, rdst, rsrc);
> + return;
> + case NOP_UNARY_OPCODE_Y0:
> + case FNOP_UNARY_OPCODE_Y0:
> + if (!rsrc && !rdst) {
> + qemu_log_mask(CPU_LOG_TB_IN_ASM, "(f)nop\n");
> + return;
> + }
> + break;
> + case FSINGLE_PACK1_UNARY_OPCODE_Y0:
> + case PCNT_UNARY_OPCODE_Y0:
> + case REVBITS_UNARY_OPCODE_Y0:
> + case REVBYTES_UNARY_OPCODE_Y0:
> + case TBLIDXB0_UNARY_OPCODE_Y0:
> + case TBLIDXB1_UNARY_OPCODE_Y0:
> + case TBLIDXB2_UNARY_OPCODE_Y0:
> + case TBLIDXB3_UNARY_OPCODE_Y0:
> + default:
> + break;
> + }
> + break;
> + case SHL1ADD_RRR_1_OPCODE_Y0:
> + gen_shladd(dc, rdst, rsrc, rsrcb, 1, 0);
> + return;
> + case SHL2ADD_RRR_1_OPCODE_Y0:
> + gen_shladd(dc, rdst, rsrc, rsrcb, 2, 0);
> + return;
> + case SHL3ADD_RRR_1_OPCODE_Y0:
> + gen_shladd(dc, rdst, rsrc, rsrcb, 3, 0);
> + return;
> + default:
> + break;
> + }
> +
> + qemu_log_mask(LOG_UNIMP, "UNIMP rrr_1_opcode_y0, [" FMT64X "]\n",
> bundle);
> +}
> +
I can't help thinking, as I read all of these decode functions, that it would
be better if the output disassembly, i.e. qemu_log_mask(CPU_LOG_TB_IN_ASM, *),
were to happen here, instead of being spread across 99 other functions.
This has a side effect of reducing many of your functions to a single
statement, invoking another tcg generator, at which point it's worth inlining
them.
For example:
  static void decode_rrr_1_unary_y0(struct DisasContext *dc,
                                    tilegx_bundle_bits bundle,
                                    uint8_t rdst, uint8_t rsrc)
  {
      unsigned ext = get_UnaryOpcodeExtension_Y0(bundle);
      const char *mnemonic;
      TCGv vdst, vsrc;

      if (ext == NOP_UNARY_OPCODE_Y0 || ext == FNOP_UNARY_OPCODE_Y0) {
          if (rsrc != 0 || rdst != 0) {
              goto unimplemented;
          }
          qemu_log_mask(CPU_LOG_TB_IN_ASM, "(f)nop\n");
          return;
      }

      vdst = dest_gr(dc, rdst);
      vsrc = load_gr(dc, rsrc);

      switch (ext) {
      case CNTLZ_UNARY_OPCODE_Y0:
          gen_helper_cntlz(vdst, vsrc);
          mnemonic = "cntlz";
          break;
      case CNTTZ_UNARY_OPCODE_Y0:
          gen_helper_cnttz(vdst, vsrc);
          mnemonic = "cnttz";
          break;
      case FSINGLE_PACK1_UNARY_OPCODE_Y0:
      case PCNT_UNARY_OPCODE_Y0:
      case REVBITS_UNARY_OPCODE_Y0:
      case REVBYTES_UNARY_OPCODE_Y0:
      case TBLIDXB0_UNARY_OPCODE_Y0:
      case TBLIDXB1_UNARY_OPCODE_Y0:
      case TBLIDXB2_UNARY_OPCODE_Y0:
      case TBLIDXB3_UNARY_OPCODE_Y0:
      default:
      unimplemented:
          qemu_log_mask(LOG_UNIMP, "UNIMP rrr_1_unary_y0, [" FMT64X "]\n",
                        bundle);
          dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
          return;
      }

      qemu_log_mask(CPU_LOG_TB_IN_ASM, "%s r%d,r%d\n", mnemonic, rdst, rsrc);
  }

  static void decode_rrr_1_opcode_y0(struct DisasContext *dc,
                                     tilegx_bundle_bits bundle)
  {
      unsigned ext = get_RRROpcodeExtension_Y0(bundle);
      uint8_t rsrca = get_SrcA_Y0(bundle);
      uint8_t rsrcb = get_SrcB_Y0(bundle);
      uint8_t rdst = get_Dest_Y0(bundle);
      const char *mnemonic;
      TCGv vdst, vsrca, vsrcb;

      if (ext == UNARY_RRR_1_OPCODE_Y0) {
          decode_rrr_1_unary_y0(dc, bundle, rdst, rsrca);
          return;
      }

      vdst = dest_gr(dc, rdst);
      vsrca = load_gr(dc, rsrca);
      vsrcb = load_gr(dc, rsrcb);

      switch (ext) {
      case SHL1ADD_RRR_1_OPCODE_Y0:
          gen_shladd(vdst, vsrca, vsrcb, 1, false);
          mnemonic = "shl1add";
          break;
      case SHL2ADD_RRR_1_OPCODE_Y0:
          gen_shladd(vdst, vsrca, vsrcb, 2, false);
          mnemonic = "shl2add";
          break;
      case SHL3ADD_RRR_1_OPCODE_Y0:
          gen_shladd(vdst, vsrca, vsrcb, 3, false);
          mnemonic = "shl3add";
          break;
      default:
          qemu_log_mask(LOG_UNIMP, "UNIMP rrr_1_opcode_y0, [" FMT64X "]\n",
                        bundle);
          dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
          return;
      }

      qemu_log_mask(CPU_LOG_TB_IN_ASM, "%s r%d,r%d,r%d\n",
                    mnemonic, rdst, rsrca, rsrcb);
  }
r~