On 8/27/19 2:37 AM, Stefan Brankovic wrote:
+    for (i = 0; i < 4; i++) {
+        switch (i) {
+        case 0:
+            /*
+             * Get high doubleword of vA to perform 6-5-5 pack of pixels
+             * 1 and 2.
+             */
+            get_avr64(avr, VA, true);
+            tcg_gen_movi_i64(result, 0x0ULL);
+            break;
+        case 1:
+            /*
+             * Get low doubleword of vA to perform 6-5-5 pack of pixels
+             * 3 and 4.
+             */
+            get_avr64(avr, VA, false);
+            break;
+        case 2:
+            /*
+             * Get high doubleword of vB to perform 6-5-5 pack of pixels
+             * 5 and 6.
+             */
+            get_avr64(avr, VB, true);
+            tcg_gen_movi_i64(result, 0x0ULL);
+            break;
+        case 3:
+            /*
+             * Get low doubleword of vB to perform 6-5-5 pack of pixels
+             * 7 and 8.
+             */
+            get_avr64(avr, VB, false);
+            break;
+        }
+        /* Perform the packing for 2 pixels (each iteration packs 1). */
+        tcg_gen_movi_i64(tmp, 0x0ULL);
+        for (j = 0; j < 2; j++) {
+            tcg_gen_shri_i64(shifted, avr, (j * 16 + 3));
+            tcg_gen_andi_i64(shifted, shifted, mask1 << (j * 16));
+            tcg_gen_or_i64(tmp, tmp, shifted);
+
+            tcg_gen_shri_i64(shifted, avr, (j * 16 + 6));
+            tcg_gen_andi_i64(shifted, shifted, mask2 << (j * 16));
+            tcg_gen_or_i64(tmp, tmp, shifted);
+
+            tcg_gen_shri_i64(shifted, avr, (j * 16 + 9));
+            tcg_gen_andi_i64(shifted, shifted, mask3 << (j * 16));
+            tcg_gen_or_i64(tmp, tmp, shifted);
+        }
+        if ((i == 0) || (i == 2)) {
+            tcg_gen_shli_i64(tmp, tmp, 32);
+        }
+        tcg_gen_or_i64(result, result, tmp);
+        if (i == 1) {
+            /* Place packed pixels 1:4 to high doubleword of vD. */
+            tcg_gen_mov_i64(result1, result);
+        }
+        if (i == 3) {
+            /* Place packed pixels 5:8 to low doubleword of vD. */
+            tcg_gen_mov_i64(result2, result);
+        }
+    }
+    set_avr64(VT, result1, true);
+    set_avr64(VT, result2, false);
I really have a hard time believing that it is worthwhile to inline all of this
code. By my count this is 82 non-move opcodes (4 loads from get_avr64, 4 x 18
shift/and/or ops in the inner loops, 4 merging ors, and 2 shifts). That is a
*lot* of inline expansion.
However, I can well imagine that the existing out-of-line helper is less than
optimal.
-void helper_vpkpx(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
-{
-    int i, j;
-    ppc_avr_t result;
-#if defined(HOST_WORDS_BIGENDIAN)
-    const ppc_avr_t *x[2] = { a, b };
-#else
-    const ppc_avr_t *x[2] = { b, a };
-#endif
-
-    VECTOR_FOR_INORDER_I(i, u64) {
-        VECTOR_FOR_INORDER_I(j, u32) {
-            uint32_t e = x[i]->u32[j];
Double indirect loads?
-
-            result.u16[4 * i + j] = (((e >> 9) & 0xfc00) |
-                                     ((e >> 6) & 0x3e0) |
-                                     ((e >> 3) & 0x1f));
Store to temporary ...
-        }
-    }
-    *r = result;
... and then copy?
Try replacing the existing helper with something like the following.
r~
static inline uint64_t pkpx_1(uint64_t a, int shr, int shl)
{
    /* 6-5-5 pack of the 32-bit pixel at bit SHR, placed at bit SHL. */
    uint64_t r;

    r = ((a >> (shr + 9)) & 0xfc00) << shl;
    r |= ((a >> (shr + 6)) & 0x3e0) << shl;
    r |= ((a >> (shr + 3)) & 0x1f) << shl;
    return r;
}

static inline uint64_t pkpx_2(uint64_t ah, uint64_t al)
{
    return pkpx_1(ah, 32, 48)
         | pkpx_1(ah, 0, 32)
         | pkpx_1(al, 32, 16)
         | pkpx_1(al, 0, 0);
}

void helper_vpkpx(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
{
    uint64_t rh = pkpx_2(a->VsrD(0), a->VsrD(1));
    uint64_t rl = pkpx_2(b->VsrD(0), b->VsrD(1));

    r->VsrD(0) = rh;
    r->VsrD(1) = rl;
}