bug-gzip
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

bug#41535: [PATCH] performance optimization for aarch64


From: Li Qiang
Subject: bug#41535: [PATCH] performance optimization for aarch64
Date: Sat, 30 May 2020 17:17:49 +0800
User-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Thunderbird/68.8.1


在 2020/5/26 10:39, l00374334 写道:
> From: liqiang <liqiang64@huawei.com>
> 
> By analyzing the compression and decompression process of gzip, I found 
> 
> that the CRC32 and longest_match functions are significant hot spots.
> 
> 
> 
> On the aarch64 architecture, we can optimize the efficiency of CRC32 
> 
> through the interface provided by the NEON instruction set (about 12x 
> 
> faster on aarch64), and optimize the performance of random-access code 
> 
> through prefetch instructions (about a 5%~8% improvement). In some 
> 
> compression scenarios, loop unrolling can also yield a certain 
> 
> performance improvement (about 10%).
> 
> 
> 
> Modified by Li Qiang.
> 
> ---
>  configure | 14 ++++++++++++++
>  deflate.c | 30 +++++++++++++++++++++++++++++-
>  util.c    | 45 +++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 88 insertions(+), 1 deletion(-)
> 
> diff --git a/configure b/configure
> index cab3daf..dc80cb6 100644
> --- a/configure
> +++ b/configure
> @@ -14555,6 +14555,20 @@ rm -f core conftest.err conftest.$ac_objext 
> conftest.$ac_ext
>             ;;
>  
>           arm* | aarch64 )
> +           cat confdefs.h - <<_ACEOF >conftest.$ac_ext
> +/* end confdefs.h.  */
> +#if defined __ARM_NEON__ || defined __ARM_NEON
> +                   int ok;
> +                  #else
> +                   error fail
> +                  #endif
> +
> +_ACEOF
> +if ac_fn_c_try_compile "$LINENO"
> +then :
> +  CFLAGS="$CFLAGS -march=armv8-a+crc"
> +fi
> +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
>             # Assume arm with EABI.
>             # On arm64 systems, the C compiler may be generating code in one 
> of
>             # these ABIs:
> diff --git a/deflate.c b/deflate.c
> index 9d379e9..ee77ffd 100644
> --- a/deflate.c
> +++ b/deflate.c
> @@ -378,6 +378,9 @@ longest_match(IPos cur_match)
>      register int len;                           /* length of current match */
> 
>      int best_len = prev_length;                 /* best match length so far 
> */
> 
>      IPos limit = strstart > (IPos)MAX_DIST ? strstart - (IPos)MAX_DIST : NIL;
> 
> +#ifdef __aarch64__
> 
> +    IPos next_match;
> 
> +#endif
> 
>      /* Stop when cur_match becomes <= limit. To simplify the code,
> 
>       * we prevent matches with the string of window index 0.
> 
>       */
> 
> @@ -411,6 +414,10 @@ longest_match(IPos cur_match)
>      do {
> 
>          Assert(cur_match < strstart, "no future");
> 
>          match = window + cur_match;
> 
> +#ifdef __aarch64__
> 
> +        next_match = prev[cur_match & WMASK];
> 
> +        __asm__("PRFM PLDL1STRM, [%0]"::"r"(&(prev[next_match & WMASK])));
> 
> +#endif
> 
>  
> 
>          /* Skip to next match if the match length cannot increase
> 
>           * or if the match length is less than 2:
> 
> @@ -488,8 +495,14 @@ longest_match(IPos cur_match)
>              scan_end   = scan[best_len];
> 
>  #endif
> 
>          }
> 
> -    } while ((cur_match = prev[cur_match & WMASK]) > limit
> 
> +    }
> 
> +#ifdef __aarch64__
> 
> +    while ((cur_match = next_match) > limit
> 
> +             && --chain_length != 0);
> 
> +#else
> 
> +    while ((cur_match = prev[cur_match & WMASK]) > limit
> 
>               && --chain_length != 0);
> 
> +#endif
> 
>  
> 
>      return best_len;
> 
>  }
> 
> @@ -777,7 +790,22 @@ deflate (int pack_level)
>              lookahead -= prev_length-1;
> 
>              prev_length -= 2;
> 
>              RSYNC_ROLL(strstart, prev_length+1);
> 
> +            while (prev_length >= 4) {
> 
> +                /* After actual verification, expanding this loop
> 
> +                 * can improve its performance in certain scenarios.
> 
> +                 */
> 
> +                prev_length -= 4;
> 
> +                strstart++;
> 
> +                INSERT_STRING(strstart, hash_head);
> 
> +                strstart++;
> 
> +                INSERT_STRING(strstart, hash_head);
> 
> +                strstart++;
> 
> +                INSERT_STRING(strstart, hash_head);
> 
> +                strstart++;
> 
> +                INSERT_STRING(strstart, hash_head);
> 
> +            }
> 
>              do {
> 
> +                if (prev_length == 0) break;
> 
>                  strstart++;
> 
>                  INSERT_STRING(strstart, hash_head);
> 
>                  /* strstart never exceeds WSIZE-MAX_MATCH, so there are
> 
> diff --git a/util.c b/util.c
> index 0a0fc21..c9f0e52 100644
> --- a/util.c
> +++ b/util.c
> @@ -38,6 +38,12 @@
>  
> 
>  static int write_buffer (int, voidp, unsigned int);
> 
>  
> 
> +#if defined __ARM_NEON__ || defined __ARM_NEON
> 
> +#define CRC32D(crc, val) __asm__("crc32x %w[c], %w[c], 
> %x[v]":[c]"+r"(crc):[v]"r"(val))
> 
> +#define CRC32W(crc, val) __asm__("crc32w %w[c], %w[c], 
> %w[v]":[c]"+r"(crc):[v]"r"(val))
> 
> +#define CRC32H(crc, val) __asm__("crc32h %w[c], %w[c], 
> %w[v]":[c]"+r"(crc):[v]"r"(val))
> 
> +#define CRC32B(crc, val) __asm__("crc32b %w[c], %w[c], 
> %w[v]":[c]"+r"(crc):[v]"r"(val))
> 
> +#else
> 
>  /* ========================================================================
> 
>   * Table of CRC-32's of all single-byte values (made by makecrc.c)
> 
>   */
> 
> @@ -95,6 +101,7 @@ static const ulg crc_32_tab[] = {
>    0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL,
> 
>    0x2d02ef8dL
> 
>  };
> 
> +#endif
> 
>  
> 
>  /* Shift register contents.  */
> 
>  static ulg crc = 0xffffffffL;
> 
> @@ -132,6 +139,43 @@ ulg updcrc(s, n)
>      const uch *s;           /* pointer to bytes to pump through */
> 
>      unsigned n;             /* number of bytes in s[] */
> 
>  {
> 
> +#if defined __ARM_NEON__ || defined __ARM_NEON
> 
> +    register ulg c;
> 
> +    static ulg crc = (ulg)0xffffffffL;
> 
> +    register const uint8_t  *buf1;
> 
> +    register const uint16_t *buf2;
> 
> +    register const uint32_t *buf4;
> 
> +    register const uint64_t *buf8;
> 
> +    int64_t length = (int64_t)n;
> 
> +    buf8 = (const  uint64_t *)(const void *)s;
> 
> +
> 
> +    if (s == NULL) {
> 
> +        c = 0xffffffffL;
> 
> +    } else {
> 
> +        c = crc;
> 
> +        while(length >= sizeof(uint64_t)) {
> 
> +            CRC32D(c, *buf8++);
> 
> +            length -= sizeof(uint64_t);
> 
> +        }
> 
> +        buf4 = (const uint32_t *)(const void *)buf8;
> 
> +        if (length >= sizeof(uint32_t)) {
> 
> +            CRC32W(c, *buf4++);
> 
> +            length -= sizeof(uint32_t);
> 
> +        }
> 
> +        buf2 = (const uint16_t *)(const void *)buf4;
> 
> +        if(length >= sizeof(uint16_t)) {
> 
> +            CRC32H(c, *buf2++);
> 
> +            length -= sizeof(uint16_t);
> 
> +        }
> 
> +        buf1 = (const uint8_t *)(const void *)buf2;
> 
> +        if (length >= sizeof(uint8_t)) {
> 
> +            CRC32B(c, *buf1);
> 
> +            length -= sizeof(uint8_t);
> 
> +        }
> 
> +    }
> 
> +    crc = c;
> 
> +    return (c ^ 0xffffffffL);
> 
> +#else
> 
>      register ulg c;         /* temporary variable */
> 
>  
> 
>      if (s == NULL) {
> 
> @@ -144,6 +188,7 @@ ulg updcrc(s, n)
>      }
> 
>      crc = c;
> 
>      return c ^ 0xffffffffL;       /* (instead of ~c for 64-bit machines) */
> 
> +#endif
> 
>  }
> 
>  
> 
>  /* Return a current CRC value.  */
> 

Please allow me to show a set of actual test data for this patch.

First, I built an unmodified binary, "gzip-1.10", from the gzip-1.10
source code, and then built an optimized binary, "gzip-optimized",
after applying my optimization patch.

Next, I used the gzip-1.10 version to measure the compression and
decompression times on some **XML** files:
[XML]# time ./gzip-1.10 *.xml

real    0m5.099s
user    0m4.384s
sys     0m0.176s
[XML]# time ./gzip-1.10 -d *.gz

real    0m2.173s
user    0m1.821s
sys     0m0.348s

Then I used the optimized version for comparison:
[XML]# time ./gzip-optimized *.xml

real    0m2.785s
user    0m2.576s
sys     0m0.204s
[XML]# time ./gzip-optimized -d *.gz

real    0m0.497s
user    0m0.176s
sys     0m0.320s


The next test subject is a large **log** file:
[LOG]# time ./gzip-1.10 *.log

real    0m8.883s
user    0m8.652s
sys     0m0.217s
[LOG]# time ./gzip-1.10 -d *.gz

real    0m3.049s
user    0m2.604s
sys     0m0.439s

I again used the optimized version for comparison:
[LOG]# time ./gzip-optimized *.log

real    0m6.882s
user    0m6.607s
sys     0m0.264s
[LOG]# time ./gzip-optimized -d *.gz

real    0m1.054s
user    0m0.622s
sys     0m0.431s

The above experimental data are from the aarch64 platform.

-- 
Best regards,
Li Qiang






reply via email to

[Prev in Thread] Current Thread [Next in Thread]