[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH 1/5] optimize checking for globs
From: |
Paolo Bonzini |
Subject: |
[PATCH 1/5] optimize checking for globs |
Date: |
Wed, 2 Nov 2016 17:24:14 +0100 |
This removes the hotspot in parse_file_seq's call to strpbrk, by using
SSE2 vector instructions. The resulting speedup on QEMU's noop build
is around 6% (15.4 seconds to 14.5).
The code is roughly based on GCC's similar optimizations in the lexer.
* read-opt.c: New.
* read.c (parse_file_seq): Use needs_glob instead of strpbrk.
* Makefile.am (make_SOURCES): Add read-opt.c.
* Makefile.in: Regenerate.
---
(I also had an SSE4.2 version that gave another 1-2% improvement,
but it fails some tests and I also don't feel like adding a lot
of code to detect the instruction set. All x86-64 machines have
SSE2, so this provides the biggest bang for the buck).
Makefile.am | 6 ++---
Makefile.in | 19 ++++++-------
makeint.h | 1 +
read-opt.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
read.c | 2 +-
5 files changed, 103 insertions(+), 13 deletions(-)
create mode 100644 read-opt.c
diff --git a/Makefile.am b/Makefile.am
index c88c465..ef5e1f9 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -44,9 +44,9 @@ endif
make_SOURCES = ar.c arscan.c commands.c default.c dir.c expand.c file.c \
function.c getopt.c getopt1.c guile.c implicit.c job.c load.c \
- loadapi.c main.c misc.c $(ossrc) output.c read.c remake.c \
- rule.c signame.c strcache.c variable.c version.c vpath.c \
- hash.c $(remote)
+ loadapi.c main.c misc.c $(ossrc) output.c read.c read-opt.c \
+ remake.c rule.c signame.c strcache.c variable.c version.c \
+ vpath.c hash.c $(remote)
EXTRA_make_SOURCES = vmsjobs.c remote-stub.c remote-cstms.c
diff --git a/Makefile.in b/Makefile.in
index 67b7616..52c854c 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -144,8 +144,8 @@ loadavg_DEPENDENCIES =
am__make_SOURCES_DIST = ar.c arscan.c commands.c default.c dir.c \
expand.c file.c function.c getopt.c getopt1.c guile.c \
implicit.c job.c load.c loadapi.c main.c misc.c posixos.c \
- output.c read.c remake.c rule.c signame.c strcache.c \
- variable.c version.c vpath.c hash.c remote-stub.c \
+ output.c read.c read-opt.c remake.c rule.c signame.c \
+ strcache.c variable.c version.c vpath.c hash.c remote-stub.c \
remote-cstms.c
@address@hidden = posixos.$(OBJEXT)
@address@hidden = remote-stub.$(OBJEXT)
@@ -156,10 +156,10 @@ am_make_OBJECTS = ar.$(OBJEXT) arscan.$(OBJEXT)
commands.$(OBJEXT) \
getopt1.$(OBJEXT) guile.$(OBJEXT) implicit.$(OBJEXT) \
job.$(OBJEXT) load.$(OBJEXT) loadapi.$(OBJEXT) main.$(OBJEXT) \
misc.$(OBJEXT) $(am__objects_1) output.$(OBJEXT) \
- read.$(OBJEXT) remake.$(OBJEXT) rule.$(OBJEXT) \
- signame.$(OBJEXT) strcache.$(OBJEXT) variable.$(OBJEXT) \
- version.$(OBJEXT) vpath.$(OBJEXT) hash.$(OBJEXT) \
- $(am__objects_2)
+ read.$(OBJEXT) read-opt.$(OBJEXT) remake.$(OBJEXT) \
+ rule.$(OBJEXT) signame.$(OBJEXT) strcache.$(OBJEXT) \
+ variable.$(OBJEXT) version.$(OBJEXT) vpath.$(OBJEXT) \
+ hash.$(OBJEXT) $(am__objects_2)
make_OBJECTS = $(am_make_OBJECTS)
am__DEPENDENCIES_1 =
@address@hidden = $(am__DEPENDENCIES_1)
@@ -473,9 +473,9 @@ include_HEADERS = gnumake.h
@address@hidden = remote-cstms.c
make_SOURCES = ar.c arscan.c commands.c default.c dir.c expand.c file.c \
function.c getopt.c getopt1.c guile.c implicit.c job.c load.c \
- loadapi.c main.c misc.c $(ossrc) output.c read.c remake.c \
- rule.c signame.c strcache.c variable.c version.c vpath.c \
- hash.c $(remote)
+ loadapi.c main.c misc.c $(ossrc) output.c read.c read-opt.c \
+ remake.c rule.c signame.c strcache.c variable.c version.c \
+ vpath.c hash.c $(remote)
EXTRA_make_SOURCES = vmsjobs.c remote-stub.c remote-cstms.c
noinst_HEADERS = commands.h dep.h filedef.h job.h makeint.h rule.h variable.h \
@@ -684,6 +684,7 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @address@hidden/$(DEPDIR)/address@hidden@
@AMDEP_TRUE@@am__include@ @address@hidden/$(DEPDIR)/address@hidden@
@AMDEP_TRUE@@am__include@ @address@hidden/$(DEPDIR)/address@hidden@
address@hidden@@am__include@ @address@hidden/$(DEPDIR)/address@hidden@
@AMDEP_TRUE@@am__include@ @address@hidden/$(DEPDIR)/address@hidden@
@AMDEP_TRUE@@am__include@ @address@hidden/$(DEPDIR)/address@hidden@
@AMDEP_TRUE@@am__include@ @address@hidden/$(DEPDIR)/address@hidden@
diff --git a/makeint.h b/makeint.h
index 8e0ae6c..ef66312 100644
--- a/makeint.h
+++ b/makeint.h
@@ -712,6 +712,7 @@ void unblock_remote_children (void);
int remote_kill (int id, int sig);
void print_variable_data_base (void);
void print_vpath_data_base (void);
+int needs_glob(const char *);
extern char *starting_directory;
extern unsigned int makelevel;
diff --git a/read-opt.c b/read-opt.c
new file mode 100644
index 0000000..6deb446
--- /dev/null
+++ b/read-opt.c
@@ -0,0 +1,88 @@
+/* Vectorized function for fast parsing of filenames for GNU Make.
+Copyright (C) 2016 Free Software Foundation, Inc.
+This file is part of GNU Make.
+
+GNU Make is free software; you can redistribute it and/or modify it under the
+terms of the GNU General Public License as published by the Free Software
+Foundation; either version 3 of the License, or (at your option) any later
+version.
+
+GNU Make is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
+A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program. If not, see <http://www.gnu.org/licenses/>. */
+
+#include "makeint.h"
+
+#ifdef __SSE2__
+/* Return nonzero if S contains a glob character ('?', '*' or '[') before
+   its terminating NUL.  Scans aligned 16-byte blocks with SSE2 compares.  */
+int needs_glob(const char *s)
+{
+  static const char repl_chars[4][16] __attribute__((aligned(16))) = {
+    { '?', '?', '?', '?', '?', '?', '?', '?',
+      '?', '?', '?', '?', '?', '?', '?', '?' },
+    { '*', '*', '*', '*', '*', '*', '*', '*',
+      '*', '*', '*', '*', '*', '*', '*', '*' },
+    { '[', '[', '[', '[', '[', '[', '[', '[',
+      '[', '[', '[', '[', '[', '[', '[', '[' },
+  };  /* Row 3 is implicitly zero-filled; it is used to match NUL.  */
+  typedef char v16qi __attribute__ ((__vector_size__ (16)));
+
+  const v16qi repl_qm = *(const v16qi *)repl_chars[0];
+  const v16qi repl_st = *(const v16qi *)repl_chars[1];
+  const v16qi repl_br = *(const v16qi *)repl_chars[2];
+  const v16qi repl_nul = *(const v16qi *)repl_chars[3];
+
+  unsigned int misalign, found, mask, done;
+  const v16qi *p;
+  v16qi data, t, n;
+
+  /* Align the source pointer down to a 16-byte boundary.  */
+  misalign = (uintptr_t)s & 15;
+  p = (const v16qi *)((uintptr_t)s & -16);
+  data = *p;
+
+  /* Create a mask for the bytes that are valid within the first
+     16-byte block.  The idea here is that the AND with the mask
+     within the loop is "free", since we need some AND or TEST
+     insn in order to set the flags for the branch anyway.  */
+  mask = -1u << misalign;
+
+  /* Main loop processing 16 bytes at a time.  */
+  goto start;
+  do
+    {
+      data = *++p;
+      mask = -1;
+
+    start:
+      n = __builtin_ia32_pcmpeqb128(data, repl_nul);
+      t = __builtin_ia32_pcmpeqb128(data, repl_qm);
+      t |= __builtin_ia32_pcmpeqb128(data, repl_st);
+      t |= __builtin_ia32_pcmpeqb128(data, repl_br);
+      t |= n;
+      found = __builtin_ia32_pmovmskb128 (t);
+      found &= mask;
+    }
+  while (!found);
+
+  /* FOUND contains 1 in bits for which we matched a relevant character or
+     NUL.  DONE contains 1 in bits for which we matched a NUL; it is ANDed
+     with MASK so NULs lying before S in the first block are ignored.  */
+  done = __builtin_ia32_pmovmskb128 (n) & mask;
+  /* Set to 1 the bit of the first NUL and all higher bits, i.e. the NUL
+     and every character after it, then discard those matches.  */
+  done |= -done;
+  found &= ~done;
+  return found > 0;
+}
+
+#else
+int needs_glob(const char *s)
+{
+  return strpbrk (s, "?*[") != NULL; /* Nonzero iff S has a glob char.  */
+}
+#endif
diff --git a/read.c b/read.c
index b870aa8..0883100 100644
--- a/read.c
+++ b/read.c
@@ -3268,7 +3268,7 @@ parse_file_seq (char **stringp, unsigned int size, int
stopmap,
#endif /* !NO_ARCHIVES */
/* glob() is expensive: don't call it unless we need to. */
- if (NONE_SET (flags, PARSEFS_EXISTS) && strpbrk (name, "?*[") == NULL)
+ if (NONE_SET (flags, PARSEFS_EXISTS) && !needs_glob (name))
{
globme = 0;
i = 1;
--
2.7.4
- [PATCH 0/5] Miscellaneous speed up patches, Paolo Bonzini, 2016/11/02
- [PATCH 1/5] optimize checking for globs,
Paolo Bonzini <=
- [PATCH 3/5] use jhash for STRING_N_HASH, Paolo Bonzini, 2016/11/02
- [PATCH 5/5] speedup parsing of functions, Paolo Bonzini, 2016/11/02
- [PATCH 4/5] remove MAP_PERCENT, Paolo Bonzini, 2016/11/02
- [PATCH 2/5] use strchr for simple case of find_char_unquote, Paolo Bonzini, 2016/11/02
- Re: [PATCH 0/5] Miscellaneous speed up patches, Paolo Bonzini, 2016/11/11