bug-gzip
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

bug#41535: [PATCH] performance optimization for aarch64


From: l00374334
Subject: bug#41535: [PATCH] performance optimization for aarch64
Date: Tue, 26 May 2020 10:39:40 +0800

From: liqiang <address@hidden>

By profiling the compression and decompression paths of gzip, I found 
that the CRC32 computation and the longest_match function are the main 
hot spots.

On the aarch64 architecture, we can speed up crc32 by using the hardware 
CRC32 instructions (enabled by this patch when the NEON feature macros 
are defined; about 12x faster on aarch64), and improve the performance of 
the random-access code in longest_match with prefetch instructions (about 
5%~8% improvement). In some compression scenarios, unrolling the 
string-insertion loop in deflate also gives a further performance 
improvement (about 10%).

Modified by Li Qiang.

---
 configure | 14 ++++++++++++++
 deflate.c | 30 +++++++++++++++++++++++++++++-
 util.c    | 45 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 88 insertions(+), 1 deletion(-)

diff --git a/configure b/configure
index cab3daf..dc80cb6 100644
--- a/configure
+++ b/configure
@@ -14555,6 +14555,20 @@ rm -f core conftest.err conftest.$ac_objext 
conftest.$ac_ext
            ;;
 
          arm* | aarch64 )
+           cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#if defined __ARM_NEON__ || defined __ARM_NEON
+                   int ok;
+                  #else
+                   error fail
+                  #endif
+
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"
+then :
+  CFLAGS="$CFLAGS -march=armv8-a+crc"
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
            # Assume arm with EABI.
            # On arm64 systems, the C compiler may be generating code in one of
            # these ABIs:
diff --git a/deflate.c b/deflate.c
index 9d379e9..ee77ffd 100644
--- a/deflate.c
+++ b/deflate.c
@@ -378,6 +378,9 @@ longest_match(IPos cur_match)
     register int len;                           /* length of current match */
     int best_len = prev_length;                 /* best match length so far */
     IPos limit = strstart > (IPos)MAX_DIST ? strstart - (IPos)MAX_DIST : NIL;
+#ifdef __aarch64__
+    IPos next_match;
+#endif
     /* Stop when cur_match becomes <= limit. To simplify the code,
      * we prevent matches with the string of window index 0.
      */
@@ -411,6 +414,10 @@ longest_match(IPos cur_match)
     do {
         Assert(cur_match < strstart, "no future");
         match = window + cur_match;
+#ifdef __aarch64__
+        next_match = prev[cur_match & WMASK];
+        __asm__("PRFM PLDL1STRM, [%0]"::"r"(&(prev[next_match & WMASK])));
+#endif
 
         /* Skip to next match if the match length cannot increase
          * or if the match length is less than 2:
@@ -488,8 +495,14 @@ longest_match(IPos cur_match)
             scan_end   = scan[best_len];
 #endif
         }
-    } while ((cur_match = prev[cur_match & WMASK]) > limit
+    }
+#ifdef __aarch64__
+    while ((cur_match = next_match) > limit
+             && --chain_length != 0);
+#else
+    while ((cur_match = prev[cur_match & WMASK]) > limit
              && --chain_length != 0);
+#endif
 
     return best_len;
 }
@@ -777,7 +790,22 @@ deflate (int pack_level)
             lookahead -= prev_length-1;
             prev_length -= 2;
             RSYNC_ROLL(strstart, prev_length+1);
+            while (prev_length >= 4) {
+                /* After actual verification, expanding this loop
+                 * can improve its performance in certain scenarios.
+                 */
+                prev_length -= 4;
+                strstart++;
+                INSERT_STRING(strstart, hash_head);
+                strstart++;
+                INSERT_STRING(strstart, hash_head);
+                strstart++;
+                INSERT_STRING(strstart, hash_head);
+                strstart++;
+                INSERT_STRING(strstart, hash_head);
+            }
             do {
+                if (prev_length == 0) break;
                 strstart++;
                 INSERT_STRING(strstart, hash_head);
                 /* strstart never exceeds WSIZE-MAX_MATCH, so there are
diff --git a/util.c b/util.c
index 0a0fc21..c9f0e52 100644
--- a/util.c
+++ b/util.c
@@ -38,6 +38,12 @@
 
 static int write_buffer (int, voidp, unsigned int);
 
+#if defined __ARM_NEON__ || defined __ARM_NEON
+#define CRC32D(crc, val) __asm__("crc32x %w[c], %w[c], 
%x[v]":[c]"+r"(crc):[v]"r"(val))
+#define CRC32W(crc, val) __asm__("crc32w %w[c], %w[c], 
%w[v]":[c]"+r"(crc):[v]"r"(val))
+#define CRC32H(crc, val) __asm__("crc32h %w[c], %w[c], 
%w[v]":[c]"+r"(crc):[v]"r"(val))
+#define CRC32B(crc, val) __asm__("crc32b %w[c], %w[c], 
%w[v]":[c]"+r"(crc):[v]"r"(val))
+#else
 /* ========================================================================
  * Table of CRC-32's of all single-byte values (made by makecrc.c)
  */
@@ -95,6 +101,7 @@ static const ulg crc_32_tab[] = {
   0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL,
   0x2d02ef8dL
 };
+#endif
 
 /* Shift register contents.  */
 static ulg crc = 0xffffffffL;
@@ -132,6 +139,43 @@ ulg updcrc(s, n)
     const uch *s;           /* pointer to bytes to pump through */
     unsigned n;             /* number of bytes in s[] */
 {
+#if defined __ARM_NEON__ || defined __ARM_NEON
+    register ulg c;
+    static ulg crc = (ulg)0xffffffffL;
+    register const uint8_t  *buf1;
+    register const uint16_t *buf2;
+    register const uint32_t *buf4;
+    register const uint64_t *buf8;
+    int64_t length = (int64_t)n;
+    buf8 = (const  uint64_t *)(const void *)s;
+
+    if (s == NULL) {
+        c = 0xffffffffL;
+    } else {
+        c = crc;
+        while(length >= sizeof(uint64_t)) {
+            CRC32D(c, *buf8++);
+            length -= sizeof(uint64_t);
+        }
+        buf4 = (const uint32_t *)(const void *)buf8;
+        if (length >= sizeof(uint32_t)) {
+            CRC32W(c, *buf4++);
+            length -= sizeof(uint32_t);
+        }
+        buf2 = (const uint16_t *)(const void *)buf4;
+        if(length >= sizeof(uint16_t)) {
+            CRC32H(c, *buf2++);
+            length -= sizeof(uint16_t);
+        }
+        buf1 = (const uint8_t *)(const void *)buf2;
+        if (length >= sizeof(uint8_t)) {
+            CRC32B(c, *buf1);
+            length -= sizeof(uint8_t);
+        }
+    }
+    crc = c;
+    return (c ^ 0xffffffffL);
+#else
     register ulg c;         /* temporary variable */
 
     if (s == NULL) {
@@ -144,6 +188,7 @@ ulg updcrc(s, n)
     }
     crc = c;
     return c ^ 0xffffffffL;       /* (instead of ~c for 64-bit machines) */
+#endif
 }
 
 /* Return a current CRC value.  */
-- 
2.17.1







reply via email to

[Prev in Thread] Current Thread [Next in Thread]