bug-gnu-emacs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

bug#58558: 29.0.50; re-search-forward is slow in some buffers


From: Stefan Monnier
Subject: bug#58558: 29.0.50; re-search-forward is slow in some buffers
Date: Thu, 13 Apr 2023 00:33:52 -0400
User-agent: Gnus/5.13 (Gnus v5.13)

>> For the former, we could probably extend the `b_property` and
>> `e_property` fields of `gl_state` (which hold charpos) to also store
>> their bytepos equivalent, which should significantly reduce the number
>> of conversions between bytepos and charpos.
> I.e. something like the patch below (which passes all tests except for
> `test/src/comp-tests` for a reason that completely escapes me).

Found the culprit!
The patch below passes `make check`.


        Stefan
diff --git a/src/fns.c b/src/fns.c
index e92ef7e4c81..591b00103da 100644
--- a/src/fns.c
+++ b/src/fns.c
@@ -1194,6 +1194,8 @@ string_char_to_byte (Lisp_Object string, ptrdiff_t 
char_index)
   if (best_above == best_above_byte)
     return char_index;
 
+  eassert (char_index >= 0 && char_index <= best_above);
+
   if (BASE_EQ (string, string_char_byte_cache_string))
     {
       if (string_char_byte_cache_charpos < char_index)
@@ -1254,6 +1256,8 @@ string_byte_to_char (Lisp_Object string, ptrdiff_t 
byte_index)
   if (best_above == best_above_byte)
     return byte_index;
 
+  eassert (byte_index >= 0 && byte_index <= best_above_byte);
+
   if (BASE_EQ (string, string_char_byte_cache_string))
     {
       if (string_char_byte_cache_bytepos < byte_index)
diff --git a/src/regex-emacs.c b/src/regex-emacs.c
index 746779490ad..f75f805cd9c 100644
--- a/src/regex-emacs.c
+++ b/src/regex-emacs.c
@@ -3979,7 +3979,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
 
   /* Prevent shrinking and relocation of buffer text if GC happens
      while we are inside this function.  The calls to
-     UPDATE_SYNTAX_TABLE_* macros can call Lisp (via
+     RE_UPDATE_SYNTAX_TABLE_* macros can call Lisp (via
      `internal--syntax-propertize`); these calls are careful to defend against
      buffer modifications, but even with no modifications, the buffer text may
      be relocated during GC by `compact_buffer` which would invalidate
@@ -4792,12 +4792,11 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
                int s1, s2;
                int dummy;
                 ptrdiff_t offset = POINTER_TO_OFFSET (d);
-                ptrdiff_t charpos = RE_SYNTAX_TABLE_BYTE_TO_CHAR (offset) - 1;
-               UPDATE_SYNTAX_TABLE (charpos);
+               RE_UPDATE_SYNTAX_TABLE_BEFORE (offset);
                GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
                nchars++;
                s1 = SYNTAX (c1);
-               UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
+               RE_UPDATE_SYNTAX_TABLE_FORWARD (offset);
                PREFETCH_NOLIMIT ();
                GET_CHAR_AFTER (c2, d, dummy);
                nchars++;
@@ -4832,8 +4831,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
              int s1, s2;
              int dummy;
              ptrdiff_t offset = POINTER_TO_OFFSET (d);
-             ptrdiff_t charpos = RE_SYNTAX_TABLE_BYTE_TO_CHAR (offset);
-             UPDATE_SYNTAX_TABLE (charpos);
+             RE_UPDATE_SYNTAX_TABLE (offset);
              PREFETCH ();
              GET_CHAR_AFTER (c2, d, dummy);
              nchars++;
@@ -4848,7 +4846,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
                {
                  GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
                  nchars++;
-                 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
+                 RE_UPDATE_SYNTAX_TABLE_BACKWARD_BEFORE (offset);
                  s1 = SYNTAX (c1);
 
                  /* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2)
@@ -4875,8 +4873,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
              int s1, s2;
              int dummy;
               ptrdiff_t offset = POINTER_TO_OFFSET (d);
-              ptrdiff_t charpos = RE_SYNTAX_TABLE_BYTE_TO_CHAR (offset) - 1;
-             UPDATE_SYNTAX_TABLE (charpos);
+             RE_UPDATE_SYNTAX_TABLE_BEFORE (offset);
              GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
              nchars++;
              s1 = SYNTAX (c1);
@@ -4891,13 +4888,13 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
                  PREFETCH_NOLIMIT ();
                  GET_CHAR_AFTER (c2, d, dummy);
                  nchars++;
-                  UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
+                  RE_UPDATE_SYNTAX_TABLE_FORWARD (offset);
                  s2 = SYNTAX (c2);
 
                  /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2)
                     returns 0.  */
                  if ((s2 == Sword) && !WORD_BOUNDARY_P (c1, c2))
-         goto fail;
+                   goto fail;
                }
            }
          break;
@@ -4917,8 +4914,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
              int c1, c2;
              int s1, s2;
              ptrdiff_t offset = POINTER_TO_OFFSET (d);
-             ptrdiff_t charpos = RE_SYNTAX_TABLE_BYTE_TO_CHAR (offset);
-             UPDATE_SYNTAX_TABLE (charpos);
+             RE_UPDATE_SYNTAX_TABLE (offset);
              PREFETCH ();
              c2 = RE_STRING_CHAR (d, target_multibyte);
              nchars++;
@@ -4933,7 +4929,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
                {
                  GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
                  nchars++;
-                 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
+                 RE_UPDATE_SYNTAX_TABLE_BACKWARD_BEFORE (offset);
                  s1 = SYNTAX (c1);
 
                  /* ... and S1 is Sword or Ssymbol.  */
@@ -4958,8 +4954,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
              int c1, c2;
              int s1, s2;
               ptrdiff_t offset = POINTER_TO_OFFSET (d);
-              ptrdiff_t charpos = RE_SYNTAX_TABLE_BYTE_TO_CHAR (offset) - 1;
-             UPDATE_SYNTAX_TABLE (charpos);
+             RE_UPDATE_SYNTAX_TABLE_BEFORE (offset);
              GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
              nchars++;
              s1 = SYNTAX (c1);
@@ -4974,7 +4969,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
                  PREFETCH_NOLIMIT ();
                  c2 = RE_STRING_CHAR (d, target_multibyte);
                  nchars++;
-                 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
+                 RE_UPDATE_SYNTAX_TABLE_FORWARD (offset);
                  s2 = SYNTAX (c2);
 
                  /* ... and S2 is Sword or Ssymbol.  */
@@ -4994,8 +4989,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
            PREFETCH ();
            {
              ptrdiff_t offset = POINTER_TO_OFFSET (d);
-             ptrdiff_t pos1 = RE_SYNTAX_TABLE_BYTE_TO_CHAR (offset);
-             UPDATE_SYNTAX_TABLE (pos1);
+             RE_UPDATE_SYNTAX_TABLE (offset);
            }
            {
              int len;
diff --git a/src/syntax.c b/src/syntax.c
index e9e04e2d638..fbd08c74092 100644
--- a/src/syntax.c
+++ b/src/syntax.c
@@ -250,6 +250,8 @@ SETUP_SYNTAX_TABLE (ptrdiff_t from, ptrdiff_t count)
   gl_state.b_property = BEGV;
   gl_state.e_property = ZV + 1;
   gl_state.object = Qnil;
+  gl_state.b_re_byte = -1;
+  gl_state.e_re_byte = -1;
   if (parse_sexp_lookup_properties)
     {
       if (count > 0)
@@ -265,14 +267,15 @@ SETUP_SYNTAX_TABLE (ptrdiff_t from, ptrdiff_t count)
 /* Same as above, but in OBJECT.  If OBJECT is nil, use current buffer.
    If it is t (which is only used in fast_c_string_match_ignore_case),
    ignore properties altogether.
-   FROMBYTE is an regexp-byteoffset.  */
+   FROMBYTE is a regexp-byteoffset.  */
 
 void
-RE_SETUP_SYNTAX_TABLE_FOR_OBJECT (Lisp_Object object,
-                                 ptrdiff_t frombyte)
+RE_SETUP_SYNTAX_TABLE_FOR_OBJECT (Lisp_Object object, ptrdiff_t frombyte)
 {
   SETUP_BUFFER_SYNTAX_TABLE ();
   gl_state.object = object;
+  gl_state.b_re_byte = -1;
+  gl_state.e_re_byte = -1;
   if (BUFFERP (gl_state.object))
     {
       struct buffer *buf = XBUFFER (gl_state.object);
@@ -282,21 +285,25 @@ RE_SETUP_SYNTAX_TABLE_FOR_OBJECT (Lisp_Object object,
   else if (NILP (gl_state.object))
     {
       gl_state.b_property = BEG;
-      gl_state.e_property = ZV; /* FIXME: Why not +1 like in 
SETUP_SYNTAX_TABLE? */
+      gl_state.e_property = ZV;
     }
   else if (EQ (gl_state.object, Qt))
     {
       gl_state.b_property = 0;
-      gl_state.e_property = PTRDIFF_MAX;
+      /* -1 so we can do +1 in `re_update_byteoffsets`.  */
+      gl_state.e_property = PTRDIFF_MAX - 1;
     }
   else
     {
       gl_state.b_property = 0;
-      gl_state.e_property = 1 + SCHARS (gl_state.object);
+      gl_state.e_property = SCHARS (gl_state.object);
     }
   if (parse_sexp_lookup_properties)
-    update_syntax_table (RE_SYNTAX_TABLE_BYTE_TO_CHAR (frombyte),
-                        1, 1, gl_state.object);
+    {
+      update_syntax_table (RE_SYNTAX_TABLE_BYTE_TO_CHAR (frombyte),
+                          1, 1, gl_state.object);
+      re_update_byteoffsets ();
+    }
 }
 
 /* Update gl_state to an appropriate interval which contains CHARPOS.  The
diff --git a/src/syntax.h b/src/syntax.h
index 01982be25a0..420ba8f31dc 100644
--- a/src/syntax.h
+++ b/src/syntax.h
@@ -66,7 +66,7 @@ #define Vstandard_syntax_table BVAR (&buffer_defaults, 
syntax_table)
 struct gl_state_s
 {
   Lisp_Object object;                  /* The object we are scanning.  */
-  ptrdiff_t start;                     /* Where to stop.  */
+  ptrdiff_t start;                     /* Where to stop(?FIXME?).  */
   ptrdiff_t stop;                      /* Where to stop.  */
   bool use_global;                     /* Whether to use global_code
                                           or c_s_t.  */
@@ -85,6 +85,11 @@ #define Vstandard_syntax_table BVAR (&buffer_defaults, 
syntax_table)
                                           and possibly at the
                                           intervals too, depending
                                           on:  */
+  /* The regexp engine prefers byteoffsets over char positions, so
+     store those to try and reduce the number of byte<->char conversions.
+     This is only kept uptodate when used from the regexp engine.  */
+  ptrdiff_t b_re_byte;      /* First byteoffset where c_s_t is valid.  */
+  ptrdiff_t e_re_byte;      /* First byteoffset where c_s_t is not valid.  */
 };
 
 extern struct gl_state_s gl_state;
@@ -145,19 +150,14 @@ SYNTAX (int c)
 
 extern unsigned char const syntax_spec_code[0400];
 
-/* Convert the regexp's BYTEOFFSET into a character position,
-   for the object recorded in gl_state with RE_SETUP_SYNTAX_TABLE_FOR_OBJECT.
-
-   The value is meant for use in code that does nothing when
-   parse_sexp_lookup_properties is false, so return 0 in that case,
-   for speed.  */
+/* Convert the BYTEOFFSET into a character position, for the object
+   recorded in gl_state with RE_SETUP_SYNTAX_TABLE_FOR_OBJECT.  */
 
 INLINE ptrdiff_t
 RE_SYNTAX_TABLE_BYTE_TO_CHAR (ptrdiff_t byteoffset)
 {
-  return (! parse_sexp_lookup_properties
-         ? 0
-         : STRINGP (gl_state.object)
+  eassert (parse_sexp_lookup_properties);
+  return (STRINGP (gl_state.object)
          ? string_byte_to_char (gl_state.object, byteoffset)
          : BUFFERP (gl_state.object)
          ? ((buf_bytepos_to_charpos
@@ -168,6 +168,44 @@ RE_SYNTAX_TABLE_BYTE_TO_CHAR (ptrdiff_t byteoffset)
          : byteoffset);
 }
 
+INLINE ptrdiff_t
+RE_SYNTAX_TABLE_CHAR_TO_BYTE (ptrdiff_t charpos)
+{
+  eassert (parse_sexp_lookup_properties);
+  return (STRINGP (gl_state.object)
+         ? string_char_to_byte (gl_state.object, charpos)
+         : BUFFERP (gl_state.object)
+         ? ((buf_charpos_to_bytepos
+             (XBUFFER (gl_state.object), charpos)
+             - BUF_BEGV_BYTE (XBUFFER (gl_state.object))))
+         : NILP (gl_state.object)
+         ? CHAR_TO_BYTE (charpos) - BEGV_BYTE
+         : charpos);
+}
+
+static void re_update_byteoffsets (void)
+{
+  gl_state.b_re_byte = RE_SYNTAX_TABLE_CHAR_TO_BYTE (gl_state.b_property);
+  eassert (gl_state.b_property
+           == RE_SYNTAX_TABLE_BYTE_TO_CHAR (gl_state.b_re_byte));
+  /* `e_property` is often set to EOB+1 (or to some value
+     much further than `stop` in narrowed buffers).  */
+  gl_state.e_re_byte
+    = gl_state.e_property > gl_state.stop
+      ? 1 + RE_SYNTAX_TABLE_CHAR_TO_BYTE (gl_state.stop)
+      : RE_SYNTAX_TABLE_CHAR_TO_BYTE (gl_state.e_property);
+  eassert (gl_state.e_property > gl_state.stop
+           ? gl_state.e_property
+             >= 1 + RE_SYNTAX_TABLE_BYTE_TO_CHAR (gl_state.e_re_byte - 1)
+           : gl_state.e_property
+             == RE_SYNTAX_TABLE_BYTE_TO_CHAR (gl_state.e_re_byte));
+}
+
+/* The regexp-engine doesn't keep track of char positions, but instead
+   uses byteoffsets, so `syntax.c` uses `UPDATE_SYNTAX_TABLE_*` functions,
+   passing them `charpos`s whereas `regexp.c` uses `RE_UPDATE_SYNTAX_TABLE_*`
+   functions, passing them byteoffsets.  */
+
 /* Make syntax table state (gl_state) good for CHARPOS, assuming it is
    currently good for a position before CHARPOS.  */
 
@@ -178,6 +216,36 @@ UPDATE_SYNTAX_TABLE_FORWARD (ptrdiff_t charpos)
     update_syntax_table_forward (charpos, false, gl_state.object);
 }
 
+INLINE void
+RE_UPDATE_SYNTAX_TABLE_FORWARD (ptrdiff_t byteoffset)
+{ /* Performs just-in-time syntax-propertization.  */
+  if (!parse_sexp_lookup_properties)
+    return;
+  eassert (gl_state.e_re_byte >= 0); /* gl_state.b_re_byte can be negative.  */
+  if (byteoffset >= gl_state.e_re_byte)
+    {
+      ptrdiff_t charpos = RE_SYNTAX_TABLE_BYTE_TO_CHAR (byteoffset);
+      eassert (charpos >= gl_state.e_property);
+      UPDATE_SYNTAX_TABLE_FORWARD (charpos);
+      re_update_byteoffsets ();
+    }
+}
+
+INLINE void
+RE_UPDATE_SYNTAX_TABLE_FORWARD_BEFORE (ptrdiff_t byteoffset)
+{ /* Performs just-in-time syntax-propertization.  */
+  if (!parse_sexp_lookup_properties)
+    return;
+  eassert (gl_state.e_re_byte >= 0); /* gl_state.b_re_byte can be negative.  */
+  if (byteoffset > gl_state.e_re_byte)
+    {
+      ptrdiff_t charpos = RE_SYNTAX_TABLE_BYTE_TO_CHAR (byteoffset) - 1;
+      eassert (charpos >= gl_state.e_property);
+      UPDATE_SYNTAX_TABLE_FORWARD (charpos);
+      re_update_byteoffsets ();
+    }
+}
+
 /* Make syntax table state (gl_state) good for CHARPOS, assuming it is
    currently good for a position after CHARPOS.  */
 
@@ -188,6 +256,36 @@ UPDATE_SYNTAX_TABLE_BACKWARD (ptrdiff_t charpos)
     update_syntax_table (charpos, -1, false, gl_state.object);
 }
 
+INLINE void
+RE_UPDATE_SYNTAX_TABLE_BACKWARD (ptrdiff_t byteoffset)
+{
+  if (!parse_sexp_lookup_properties)
+    return;
+  eassert (gl_state.e_re_byte >= 0); /* gl_state.b_re_byte can be negative.  */
+  if (byteoffset < gl_state.b_re_byte)
+    {
+      ptrdiff_t charpos = RE_SYNTAX_TABLE_BYTE_TO_CHAR (byteoffset);
+      eassert (charpos < gl_state.b_property);
+      UPDATE_SYNTAX_TABLE_FORWARD (charpos);
+      re_update_byteoffsets ();
+    }
+}
+
+INLINE void
+RE_UPDATE_SYNTAX_TABLE_BACKWARD_BEFORE (ptrdiff_t byteoffset)
+{
+  if (!parse_sexp_lookup_properties)
+    return;
+  eassert (gl_state.e_re_byte >= 0); /* gl_state.b_re_byte can be negative.  */
+  if (byteoffset <= gl_state.b_re_byte)
+    {
+      ptrdiff_t charpos = RE_SYNTAX_TABLE_BYTE_TO_CHAR (byteoffset);
+      eassert (charpos <= gl_state.b_property);
+      UPDATE_SYNTAX_TABLE_FORWARD (charpos - 1);
+      re_update_byteoffsets ();
+    }
+}
+
 /* Make syntax table good for CHARPOS.  */
 
 INLINE void
@@ -197,6 +295,20 @@ UPDATE_SYNTAX_TABLE (ptrdiff_t charpos)
   UPDATE_SYNTAX_TABLE_FORWARD (charpos);
 }
 
+INLINE void
+RE_UPDATE_SYNTAX_TABLE (ptrdiff_t byteoffset)
+{
+  RE_UPDATE_SYNTAX_TABLE_BACKWARD (byteoffset);
+  RE_UPDATE_SYNTAX_TABLE_FORWARD  (byteoffset);
+}
+
+INLINE void
+RE_UPDATE_SYNTAX_TABLE_BEFORE (ptrdiff_t byteoffset)
+{
+  RE_UPDATE_SYNTAX_TABLE_BACKWARD_BEFORE (byteoffset);
+  RE_UPDATE_SYNTAX_TABLE_FORWARD_BEFORE  (byteoffset);
+}
+
 /* Set up the buffer-global syntax table.  */
 
 INLINE void

reply via email to

[Prev in Thread] Current Thread [Next in Thread]