faster memcpy for P4

bug-glibc

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

faster memcpy for P4

From:	Folkert van Heusden
Subject:	faster memcpy for P4
Date:	Thu, 10 Aug 2006 14:04:44 +0200
User-agent:	Mutt/1.5.12-2006-07-14

Hi,

I did some optimisation of WORD_COPY_FWD in ./sysdeps/i386/memcopy.h
and got it 8.4% faster (on a Pentium 4) compared to the i586 routine.
How can I proceed for merging this into glibc (if it is at all
appreciated)?

/* Copy as many whole 32-byte chunks as fit in NBYTES from SRC_BP to
   DST_BP, moving forward, using 32-bit loads and stores (i386 GCC
   inline assembly).

   dst_bp, src_bp  In/out: each is advanced by 32 for every chunk copied.
   nbytes_left     Out: NBYTES minus the number of bytes copied.
   nbytes          In: byte count; if it is below 32 nothing is copied
                   and NBYTES_LEFT receives NBYTES unchanged.

   Each iteration performs two rounds of four loads followed by four
   stores, using %eax, %edx, %ebx and %ecx as scratch.  All four are
   therefore listed as clobbers; "memory" is listed because the buffers
   are accessed through the pointer operands rather than being named as
   memory operands.

   The pointers are bumped with leal, which leaves the flags alone, so
   the closing jns still tests the result of the subl at the top of the
   loop body.

   NOTE(review): three "r" operands plus four clobbered registers use up
   every i386 general-purpose register except %esp, so this presumably
   needs the frame pointer to be omitted, and older compilers may refuse
   a "bx" clobber in PIC code -- confirm against the intended build
   flags.  */
#define WORD_COPY_FWD(dst_bp, src_bp, nbytes_left, nbytes)              \
  do                                                                    \
    {                                                                   \
      asm volatile ("subl       $32,%2\n"                               \
                    "js         2f\n"       /* fewer than 32 bytes */   \
                    "movl       0(%0),%%edx\n"  /* alloc dest line */   \
                    "1:\n"                                              \
                    "movl       28(%0),%%eax\n" /* alloc dest line */   \
                    "subl       $32,%2\n"       /* decr loop count */   \
                    "movl       0(%1),%%eax\n"  /* load bytes 0-15 */   \
                    "movl       4(%1),%%edx\n"                          \
                    "movl       8(%1),%%ebx\n"                          \
                    "movl       12(%1),%%ecx\n"                         \
                    "movl       %%eax,0(%0)\n"  /* store bytes 0-15 */  \
                    "movl       %%edx,4(%0)\n"                          \
                    "movl       %%ebx,8(%0)\n"                          \
                    "movl       %%ecx,12(%0)\n"                         \
                    "movl       16(%1),%%eax\n" /* load bytes 16-31 */  \
                    "movl       20(%1),%%edx\n"                         \
                    "movl       24(%1),%%ecx\n"                         \
                    "movl       28(%1),%%ebx\n"                         \
                    "movl       %%eax,16(%0)\n" /* store bytes 16-31 */ \
                    "movl       %%edx,20(%0)\n"                         \
                    "movl       %%ecx,24(%0)\n"                         \
                    "movl       %%ebx,28(%0)\n"                         \
                    "leal       32(%1),%1\n"    /* update src ptr */    \
                    "leal       32(%0),%0\n"    /* update dst ptr */    \
                    "jns        1b\n"       /* flags from subl above */ \
                    "2: addl    $32,%2" :   /* undo the extra subl */   \
                    "=r" (dst_bp), "=r" (src_bp), "=r" (nbytes_left) :  \
                    "0" (dst_bp), "1" (src_bp), "2" (nbytes) :          \
                    "ax", "bx", "cx", "dx", "cc", "memory");            \
    } while (0)

As you can see, I doubled the number of parallel loads. I did this because
the Pentium 4 has 4 pipelines (while the i586 has only 2).


Folkert van Heusden

-- 
www.vanheusden.com/multitail - multitail is tail on steroids. multiple
               windows, filtering, coloring, anything you can think of
----------------------------------------------------------------------
Phone: +31-6-41278122, PGP-key: 1F28D8AE, www.vanheusden.com

[Prev in Thread]

Current Thread

[Next in Thread]

faster memcpy for P4, Folkert van Heusden <=