grep-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[RFC PATCH v3] pcre: migrate to pcre2


From: Carlo Marcelo Arenas Belón
Subject: [RFC PATCH v3] pcre: migrate to pcre2
Date: Fri, 15 Oct 2021 02:40:51 -0700

Mostly a bug-for-bug translation of the original code to the PCRE2 API,
but includes a couple of fixes that were in the original proposal and that
might be worth doing in independent patches (likely as post fixes).

The code could still do with some optimizations, and looks a little ugly (ex:
the way sub is handled and some of the probably unnecessary backward
compatibility).  The API changes the signedness of some types and
therefore some ugly casts were needed, but I have tried to keep the changes
to a minimum to avoid churn and help with review.  More will likely be
required to make sure no odd truncations happen.

Includes backward compatibility and builds with 10.00, but using 10.34 or
newer is recommended; indeed, some tests will fail because of UTF-8
issues unless PCRE2 has support for PCRE2_MATCH_INVALID_UTF.

Performance seems equivalent, and it also seems functionally complete,
but I would like to get some feedback on the code first, before considering
it ready for wider testing (AKA not an RFC).

Signed-off-by: Carlo Marcelo Arenas Belón <carenas@gmail.com>
---
 configure.ac             |   2 +-
 m4/{pcre.m4 => pcre2.m4} |  23 +++--
 src/pcresearch.c         | 218 +++++++++++++++++++++------------------
 tests/filename-lineno.pl |   4 +-
 4 files changed, 133 insertions(+), 114 deletions(-)
 rename m4/{pcre.m4 => pcre2.m4} (67%)

diff --git a/configure.ac b/configure.ac
index c49ec4a..9291cee 100644
--- a/configure.ac
+++ b/configure.ac
@@ -197,7 +197,7 @@ if test "$ac_use_included_regex" = no; then
   AC_MSG_WARN([Included lib/regex.c not used])
 fi
 
-gl_FUNC_PCRE
+gl_FUNC_PCRE2
 AM_CONDITIONAL([USE_PCRE], [test $use_pcre = yes])
 
 case $host_os in
diff --git a/m4/pcre.m4 b/m4/pcre2.m4
similarity index 67%
rename from m4/pcre.m4
rename to m4/pcre2.m4
index 78b7fda..7970c4e 100644
--- a/m4/pcre.m4
+++ b/m4/pcre2.m4
@@ -1,15 +1,15 @@
-# pcre.m4 - check for libpcre support
+# pcre2.m4 - check for libpcre2 support
 
 # Copyright (C) 2010-2021 Free Software Foundation, Inc.
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
 # with or without modifications, as long as this notice is preserved.
 
-AC_DEFUN([gl_FUNC_PCRE],
+AC_DEFUN([gl_FUNC_PCRE2],
 [
   AC_ARG_ENABLE([perl-regexp],
     AS_HELP_STRING([--disable-perl-regexp],
-                   [disable perl-regexp (pcre) support]),
+                   [disable perl-regexp (pcre2) support]),
     [case $enableval in
        yes|no) test_pcre=$enableval;;
        *) AC_MSG_ERROR([invalid value $enableval for --disable-perl-regexp]);;
@@ -21,24 +21,25 @@ AC_DEFUN([gl_FUNC_PCRE],
   use_pcre=no
 
   if test $test_pcre != no; then
-    PKG_CHECK_MODULES([PCRE], [libpcre], [], [: ${PCRE_LIBS=-lpcre}])
+    PKG_CHECK_MODULES([PCRE], [libpcre2-8], [], [: ${PCRE_LIBS=-lpcre2-8}])
 
-    AC_CACHE_CHECK([for pcre_compile], [pcre_cv_have_pcre_compile],
+    AC_CACHE_CHECK([for pcre2_compile], [pcre_cv_have_pcre2_compile],
       [pcre_saved_CFLAGS=$CFLAGS
        pcre_saved_LIBS=$LIBS
        CFLAGS="$CFLAGS $PCRE_CFLAGS"
        LIBS="$PCRE_LIBS $LIBS"
        AC_LINK_IFELSE(
-         [AC_LANG_PROGRAM([[#include <pcre.h>
+         [AC_LANG_PROGRAM([[#define PCRE2_CODE_UNIT_WIDTH 8
+                            #include <pcre2.h>
                           ]],
-            [[pcre *p = pcre_compile (0, 0, 0, 0, 0);
+            [[pcre2_code *p = pcre2_compile (0, 0, 0, 0, 0, 0);
               return !p;]])],
-         [pcre_cv_have_pcre_compile=yes],
-         [pcre_cv_have_pcre_compile=no])
+         [pcre_cv_have_pcre2_compile=yes],
+         [pcre_cv_have_pcre2_compile=no])
        CFLAGS=$pcre_saved_CFLAGS
        LIBS=$pcre_saved_LIBS])
 
-    if test "$pcre_cv_have_pcre_compile" = yes; then
+    if test "$pcre_cv_have_pcre2_compile" = yes; then
       use_pcre=yes
     elif test $test_pcre = maybe; then
       AC_MSG_WARN([AC_PACKAGE_NAME will be built without pcre support.])
@@ -50,7 +51,7 @@ AC_DEFUN([gl_FUNC_PCRE],
   if test $use_pcre = yes; then
     AC_DEFINE([HAVE_LIBPCRE], [1],
       [Define to 1 if you have the Perl Compatible Regular Expressions
-       library (-lpcre).])
+       library (-lpcre2).])
   else
     PCRE_CFLAGS=
     PCRE_LIBS=
diff --git a/src/pcresearch.c b/src/pcresearch.c
index 3bdaee9..28a9ac7 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -17,94 +17,87 @@
    02110-1301, USA.  */
 
 /* Written August 1992 by Mike Haertel. */
+/* Updated for PCRE2 by Carlo Arenas. */
 
 #include <config.h>
 #include "search.h"
 #include "die.h"
 
-#include <pcre.h>
+#define PCRE2_CODE_UNIT_WIDTH 8
+#include <pcre2.h>
 
-/* This must be at least 2; everything after that is for performance
-   in pcre_exec.  */
-enum { NSUB = 300 };
-
-#ifndef PCRE_EXTRA_MATCH_LIMIT_RECURSION
-# define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0
-#endif
-#ifndef PCRE_STUDY_JIT_COMPILE
-# define PCRE_STUDY_JIT_COMPILE 0
+/* 10.30 obsoletes and replaces RECURSIONLIMIT  */
+#ifndef PCRE2_ERROR_DEPTHLIMIT
+#define PCRE2_ERROR_DEPTHLIMIT PCRE2_ERROR_RECURSIONLIMIT
+#define PCRE2_CONFIG_DEPTHLIMIT PCRE2_CONFIG_RECURSIONLIMIT
 #endif
-#ifndef PCRE_STUDY_EXTRA_NEEDED
-# define PCRE_STUDY_EXTRA_NEEDED 0
+#ifndef PCRE2_SIZE_MAX
+#define PCRE2_SIZE_MAX SIZE_MAX
 #endif
 
 struct pcre_comp
 {
   /* Compiled internal form of a Perl regular expression.  */
-  pcre *cre;
-
-  /* Additional information about the pattern.  */
-  pcre_extra *extra;
-
-#if PCRE_STUDY_JIT_COMPILE
-  /* The JIT stack and its maximum size.  */
-  pcre_jit_stack *jit_stack;
-  int jit_stack_size;
-#endif
-
+  pcre2_code *cre;
+  pcre2_match_context *mcontext;
+  pcre2_match_data *data;
   /* Table, indexed by ! (flag & PCRE_NOTBOL), of whether the empty
      string matches when that flag is used.  */
   int empty_match[2];
 };
 
-
 /* Match the already-compiled PCRE pattern against the data in SUBJECT,
    of size SEARCH_BYTES and starting with offset SEARCH_OFFSET, with
-   options OPTIONS, and storing resulting matches into SUB.  Return
-   the (nonnegative) match location or a (negative) error number.  */
+   options OPTIONS.
+   Return the (nonnegative) match count or a (negative) error number.  */
 static int
 jit_exec (struct pcre_comp *pc, char const *subject, int search_bytes,
-          int search_offset, int options, int *sub)
+          int search_offset, uint32_t options, PCRE2_SIZE **sub)
 {
   while (true)
     {
-      int e = pcre_exec (pc->cre, pc->extra, subject, search_bytes,
-                         search_offset, options, sub, NSUB);
+      bool dynamic_jit_stack = false;
+      int e = pcre2_match (pc->cre, (PCRE2_SPTR8)subject, search_bytes,
+                         search_offset, options, pc->data, pc->mcontext);
+
+      if (e == PCRE2_ERROR_JIT_STACKLIMIT && dynamic_jit_stack)
+        return e;
 
-#if PCRE_STUDY_JIT_COMPILE
-      if (e == PCRE_ERROR_JIT_STACKLIMIT
-          && 0 < pc->jit_stack_size && pc->jit_stack_size <= INT_MAX / 2)
+      if (e == PCRE2_ERROR_JIT_STACKLIMIT)
         {
-          int old_size = pc->jit_stack_size;
-          int new_size = pc->jit_stack_size = old_size * 2;
-          if (pc->jit_stack)
-            pcre_jit_stack_free (pc->jit_stack);
-          pc->jit_stack = pcre_jit_stack_alloc (old_size, new_size);
-          if (!pc->jit_stack)
+          /* The PCRE documentation says that a 32 KiB stack is the default.  
*/
+          pcre2_jit_stack *s = pcre2_jit_stack_create (64 << 10, INT_MAX / 2,
+                                                       NULL);
+
+          if (!pc->mcontext)
+            pc->mcontext = pcre2_match_context_create (NULL);
+
+          if (!pc->mcontext || !s)
             die (EXIT_TROUBLE, 0,
                  _("failed to allocate memory for the PCRE JIT stack"));
-          pcre_assign_jit_stack (pc->extra, NULL, pc->jit_stack);
+          pcre2_jit_stack_assign (pc->mcontext, NULL, s);
+          dynamic_jit_stack = true;
           continue;
         }
-#endif
+      if (e == PCRE2_ERROR_DEPTHLIMIT)
+        {
+          uint32_t lim;
+          pcre2_config (PCRE2_CONFIG_DEPTHLIMIT, &lim);
+          if (lim > INT32_MAX)
+            return e;
+
+          lim *= 2;
+          if (!pc->mcontext)
+            pc->mcontext = pcre2_match_context_create (NULL);
 
-#if PCRE_EXTRA_MATCH_LIMIT_RECURSION
-      if (e == PCRE_ERROR_RECURSIONLIMIT
-          && (PCRE_STUDY_EXTRA_NEEDED || pc->extra))
+          /* FIXME: obsoleted since 10.30  */
+          pcre2_set_recursion_limit (pc->mcontext, lim);
+          continue;
+        }
+      if (e > 0)
         {
-          unsigned long lim
-            = (pc->extra->flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION
-               ? pc->extra->match_limit_recursion
-               : 0);
-          if (lim <= ULONG_MAX / 2)
-            {
-              pc->extra->match_limit_recursion = lim ? 2 * lim : (1 << 24) - 1;
-              pc->extra->flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION;
-              continue;
-            }
+          *sub = pcre2_get_ovector_pointer(pc->data);
         }
-#endif
-
       return e;
     }
 }
@@ -115,27 +108,36 @@ jit_exec (struct pcre_comp *pc, char const *subject, int 
search_bytes,
 void *
 Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact)
 {
-  int e;
-  char const *ep;
+  PCRE2_SIZE e;
+  int ec;
+  PCRE2_UCHAR8 ep[128];
   static char const wprefix[] = "(?<!\\w)(?:";
   static char const wsuffix[] = ")(?!\\w)";
   static char const xprefix[] = "^(?:";
   static char const xsuffix[] = ")$";
   int fix_len_max = MAX (sizeof wprefix - 1 + sizeof wsuffix - 1,
                          sizeof xprefix - 1 + sizeof xsuffix - 1);
-  char *re = xnmalloc (4, size + (fix_len_max + 4 - 1) / 4);
-  int flags = PCRE_DOLLAR_ENDONLY | (match_icase ? PCRE_CASELESS : 0);
+  unsigned char *re = xnmalloc (4, size + (fix_len_max + 4 - 1) / 4);
+  uint32_t flags = PCRE2_DOLLAR_ENDONLY | (match_icase ? PCRE2_CASELESS : 0);
   char *patlim = pattern + size;
-  char *n = re;
+  char *n = (char *)re;
   char const *p;
   char const *pnul;
   struct pcre_comp *pc = xcalloc (1, sizeof (*pc));
+  pcre2_compile_context *ccontext = NULL;
 
   if (localeinfo.multibyte)
     {
       if (! localeinfo.using_utf8)
         die (EXIT_TROUBLE, 0, _("-P supports only unibyte and UTF-8 locales"));
-      flags |= PCRE_UTF8;
+      flags |= PCRE2_UTF;
+#ifdef PCRE2_NEVER_BACKSLASH_C
+      flags |= PCRE2_NEVER_BACKSLASH_C;
+#endif
+#ifdef PCRE2_MATCH_INVALID_UTF
+      /* Requires at least 10.34  */
+      flags |= PCRE2_MATCH_INVALID_UTF;
+#endif
     }
 
   /* FIXME: Remove this restriction.  */
@@ -145,8 +147,11 @@ Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, 
bool exact)
   *n = '\0';
   if (match_words)
     strcpy (n, wprefix);
+#ifndef PCRE2_EXTRA_MATCH_LINE
   if (match_lines)
     strcpy (n, xprefix);
+#endif
+
   n += strlen (n);
 
   /* The PCRE interface doesn't allow NUL bytes in the pattern, so
@@ -171,33 +176,35 @@ Pcompile (char *pattern, idx_t size, reg_syntax_t 
ignored, bool exact)
   if (match_words)
     strcpy (n, wsuffix);
   if (match_lines)
+#ifndef PCRE2_EXTRA_MATCH_LINE
     strcpy (n, xsuffix);
+#else
+    {
+      ccontext = pcre2_compile_context_create(NULL);
+      uint32_t extra_options = PCRE2_EXTRA_MATCH_LINE;
 
-  pc->cre = pcre_compile (re, flags, &ep, &e, pcre_maketables ());
-  if (!pc->cre)
-    die (EXIT_TROUBLE, 0, "%s", ep);
+      pcre2_set_compile_extra_options(ccontext, extra_options);
+    }
+#endif
 
-  int pcre_study_flags = PCRE_STUDY_EXTRA_NEEDED | PCRE_STUDY_JIT_COMPILE;
-  pc->extra = pcre_study (pc->cre, pcre_study_flags, &ep);
-  if (ep)
-    die (EXIT_TROUBLE, 0, "%s", ep);
+  pc->cre = pcre2_compile (re, PCRE2_ZERO_TERMINATED, flags, &ec, &e, 
ccontext);
+  if (!pc->cre)
+    {
+      pcre2_get_error_message (ec, ep, 128);
+      die (EXIT_TROUBLE, 0, "%s", ep);
+    }
 
-#if PCRE_STUDY_JIT_COMPILE
-  if (pcre_fullinfo (pc->cre, pc->extra, PCRE_INFO_JIT, &e))
-    die (EXIT_TROUBLE, 0, _("internal error (should never happen)"));
+  pc->data = pcre2_match_data_create_from_pattern (pc->cre, NULL);
 
-  /* The PCRE documentation says that a 32 KiB stack is the default.  */
-  if (e)
-    pc->jit_stack_size = 32 << 10;
-#endif
+  ec = pcre2_jit_compile (pc->cre, PCRE2_JIT_COMPLETE);
+  if (ec && ec != PCRE2_ERROR_JIT_BADOPTION && ec != PCRE2_ERROR_NOMEMORY)
+    die (EXIT_TROUBLE, 0, _("JIT internal error: %d"), ec);
 
   free (re);
 
-  int sub[NSUB];
-  pc->empty_match[false] = pcre_exec (pc->cre, pc->extra, "", 0, 0,
-                                      PCRE_NOTBOL, sub, NSUB);
-  pc->empty_match[true] = pcre_exec (pc->cre, pc->extra, "", 0, 0, 0, sub,
-                                     NSUB);
+  PCRE2_SIZE *sub;
+  pc->empty_match[false] = jit_exec (pc, "", 0, 0, PCRE2_NOTBOL, &sub);
+  pc->empty_match[true] = jit_exec (pc, "", 0, 0, 0, &sub);
 
   return pc;
 }
@@ -206,15 +213,15 @@ ptrdiff_t
 Pexecute (void *vcp, char const *buf, idx_t size, idx_t *match_size,
           char const *start_ptr)
 {
-  int sub[NSUB];
+  PCRE2_SIZE *sub;
   char const *p = start_ptr ? start_ptr : buf;
   bool bol = p[-1] == eolbyte;
   char const *line_start = buf;
-  int e = PCRE_ERROR_NOMATCH;
+  int e = PCRE2_ERROR_NOMATCH;
   char const *line_end;
   struct pcre_comp *pc = vcp;
 
-  /* The search address to pass to pcre_exec.  This is the start of
+  /* The search address to pass to PCRE.  This is the start of
      the buffer, or just past the most-recently discovered encoding
      error or line end.  */
   char const *subject = buf;
@@ -226,14 +233,14 @@ Pexecute (void *vcp, char const *buf, idx_t size, idx_t 
*match_size,
          better and the correctness issues were too puzzling.  See
          Bug#22655.  */
       line_end = rawmemchr (p, eolbyte);
-      if (INT_MAX < line_end - p)
+      if (PCRE2_SIZE_MAX < line_end - p)
         die (EXIT_TROUBLE, 0, _("exceeded PCRE's line length limit"));
 
       for (;;)
         {
           /* Skip past bytes that are easily determined to be encoding
              errors, treating them as data that cannot match.  This is
-             faster than having pcre_exec check them.  */
+             faster than having PCRE check them.  */
           while (localeinfo.sbclen[to_uchar (*p)] == -1)
             {
               p++;
@@ -244,9 +251,10 @@ Pexecute (void *vcp, char const *buf, idx_t size, idx_t 
*match_size,
           int search_offset = p - subject;
 
           /* Check for an empty match; this is faster than letting
-             pcre_exec do it.  */
+             PCRE do it.  */
           if (p == line_end)
             {
+              sub = xnmalloc(2, sizeof (*sub));
               sub[0] = sub[1] = search_offset;
               e = pc->empty_match[bol];
               break;
@@ -254,12 +262,13 @@ Pexecute (void *vcp, char const *buf, idx_t size, idx_t 
*match_size,
 
           int options = 0;
           if (!bol)
-            options |= PCRE_NOTBOL;
+            options |= PCRE2_NOTBOL;
 
           e = jit_exec (pc, subject, line_end - subject, search_offset,
-                        options, sub);
-          if (e != PCRE_ERROR_BADUTF8)
+                        options, &sub);
+          if (PCRE2_ERROR_UTF8_ERR1 <= e || e < PCRE2_ERROR_UTF8_ERR21)
             break;
+
           int valid_bytes = sub[0];
 
           if (search_offset <= valid_bytes)
@@ -275,9 +284,9 @@ Pexecute (void *vcp, char const *buf, idx_t size, idx_t 
*match_size,
                 }
               else
                 e = jit_exec (pc, subject, valid_bytes, search_offset,
-                              options | PCRE_NO_UTF8_CHECK | PCRE_NOTEOL, sub);
+                              options | PCRE2_NOTEOL, &sub);
 
-              if (e != PCRE_ERROR_NOMATCH)
+              if (e != PCRE2_ERROR_NOMATCH)
                 break;
 
               /* Treat the encoding error as data that cannot match.  */
@@ -288,7 +297,7 @@ Pexecute (void *vcp, char const *buf, idx_t size, idx_t 
*match_size,
           subject += valid_bytes + 1;
         }
 
-      if (e != PCRE_ERROR_NOMATCH)
+      if (e != PCRE2_ERROR_NOMATCH)
         break;
       bol = true;
       p = subject = line_start = line_end + 1;
@@ -299,26 +308,35 @@ Pexecute (void *vcp, char const *buf, idx_t size, idx_t 
*match_size,
     {
       switch (e)
         {
-        case PCRE_ERROR_NOMATCH:
+        case PCRE2_ERROR_NOMATCH:
           break;
 
-        case PCRE_ERROR_NOMEMORY:
+        case PCRE2_ERROR_NOMEMORY:
           die (EXIT_TROUBLE, 0, _("%s: memory exhausted"), input_filename ());
 
-#if PCRE_STUDY_JIT_COMPILE
-        case PCRE_ERROR_JIT_STACKLIMIT:
+        case PCRE2_ERROR_JIT_STACKLIMIT:
           die (EXIT_TROUBLE, 0, _("%s: exhausted PCRE JIT stack"),
                input_filename ());
-#endif
 
-        case PCRE_ERROR_MATCHLIMIT:
+        case PCRE2_ERROR_MATCHLIMIT:
           die (EXIT_TROUBLE, 0, _("%s: exceeded PCRE's backtracking limit"),
                input_filename ());
 
-        case PCRE_ERROR_RECURSIONLIMIT:
-          die (EXIT_TROUBLE, 0, _("%s: exceeded PCRE's recursion limit"),
+        case PCRE2_ERROR_DEPTHLIMIT:
+          die (EXIT_TROUBLE, 0,
+               _("%s: exceeded PCRE's nested backtracking limit"),
                input_filename ());
 
+        case PCRE2_ERROR_RECURSELOOP:
+          die (EXIT_TROUBLE, 0, _("%s: PCRE detected recurse loop"),
+               input_filename ());
+
+#ifdef PCRE2_ERROR_HEAPLIMIT
+        case PCRE2_ERROR_HEAPLIMIT:
+          die (EXIT_TROUBLE, 0, _("%s: exceeded PCRE's heap limit"),
+               input_filename ());
+#endif
+
         default:
           /* For now, we lump all remaining PCRE failures into this basket.
              If anyone cares to provide sample grep usage that can trigger
diff --git a/tests/filename-lineno.pl b/tests/filename-lineno.pl
index 1e84b45..1ff3d6a 100755
--- a/tests/filename-lineno.pl
+++ b/tests/filename-lineno.pl
@@ -101,13 +101,13 @@ my @Tests =
    ],
    ['invalid-re-P-paren', '-P ")"', {EXIT=>2},
     {ERR => $ENV{PCRE_WORKS} == 1
-       ? "$prog: unmatched parentheses\n"
+       ? "$prog: unmatched closing parenthesis\n"
        : $no_pcre
     },
    ],
    ['invalid-re-P-star-paren', '-P "a.*)"', {EXIT=>2},
     {ERR => $ENV{PCRE_WORKS} == 1
-       ? "$prog: unmatched parentheses\n"
+       ? "$prog: unmatched closing parenthesis\n"
        : $no_pcre
     },
    ],
-- 
2.33.0.1155.gbdb71ac078




reply via email to

[Prev in Thread] Current Thread [Next in Thread]