emacs-diffs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Emacs-diffs] master a1f76ad: Correct regexp matching of raw bytes


From: Mattias Engdegård
Subject: [Emacs-diffs] master a1f76ad: Correct regexp matching of raw bytes
Date: Fri, 28 Jun 2019 11:30:26 -0400 (EDT)

branch: master
commit a1f76adfb03c23bb4242928e8efe6193c301f0c1
Author: Mattias Engdegård <address@hidden>
Commit: Mattias Engdegård <address@hidden>

    Correct regexp matching of raw bytes
    
    Make regexp matching of raw bytes work in all combinations of unibyte
    and multibyte patterns and targets, as exact strings and in character
    alternatives (bug#3687).
    
    * src/regex-emacs.c (analyze_first):
    Include raw byte in fastmap when pattern is a multibyte exact string.
    Include leading byte in fastmap for raw bytes in character alternatives.
    (re_match_2_internal):
    Decrement the byte count by the number of bytes in the pattern character,
    not 1.
    * test/src/regex-emacs-tests.el (regexp-unibyte-unibyte)
    (regexp-multibyte-unibyte, regexp-unibyte-multibyte)
    (regexp-multibyte-multibyte): New tests.
---
 src/regex-emacs.c             |  24 +++++++--
 test/src/regex-emacs-tests.el | 120 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 140 insertions(+), 4 deletions(-)

diff --git a/src/regex-emacs.c b/src/regex-emacs.c
index c353a78..5887eaa 100644
--- a/src/regex-emacs.c
+++ b/src/regex-emacs.c
@@ -2794,6 +2794,7 @@ static int
 analyze_first (re_char *p, re_char *pend, char *fastmap, bool multibyte)
 {
   int j, k;
+  int nbits;
   bool not;
 
   /* If all elements for base leading-codes in fastmap is set, this
@@ -2854,7 +2855,14 @@ analyze_first (re_char *p, re_char *pend, char *fastmap, bool multibyte)
                 each byte is a character.  Thus, this works in both
                 cases. */
              fastmap[p[1]] = 1;
-             if (! multibyte)
+             if (multibyte)
+               {
+                 /* Cover the case of matching a raw char in a
+                    multibyte regexp against unibyte.  */
+                 if (CHAR_BYTE8_HEAD_P (p[1]))
+                   fastmap[CHAR_TO_BYTE8 (STRING_CHAR (p + 1))] = 1;
+               }
+             else
                {
                  /* For the case of matching this unibyte regex
                     against multibyte, we must set a leading code of
@@ -2886,11 +2894,18 @@ analyze_first (re_char *p, re_char *pend, char *fastmap, bool multibyte)
        case charset:
          if (!fastmap) break;
          not = (re_opcode_t) *(p - 1) == charset_not;
-         for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
-              j >= 0; j--)
+         nbits = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
+         p++;
+         for (j = 0; j < nbits; j++)
            if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
              fastmap[j] = 1;
 
+         /* To match raw bytes (in the 80..ff range) against multibyte
+            strings, add their leading bytes to the fastmap.  */
+         for (j = 0x80; j < nbits; j++)
+           if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
+             fastmap[CHAR_LEADING_CODE (BYTE8_TO_CHAR (j))] = 1;
+
          if (/* Any leading code can possibly start a character
                 which doesn't match the specified set of characters.  */
              not
@@ -4251,8 +4266,9 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
                  }
                p += pat_charlen;
                d++;
+               mcnt -= pat_charlen;
              }
-           while (--mcnt);
+           while (mcnt > 0);
 
          break;
 
diff --git a/test/src/regex-emacs-tests.el b/test/src/regex-emacs-tests.el
index 0ae50c9..50ed3e8 100644
--- a/test/src/regex-emacs-tests.el
+++ b/test/src/regex-emacs-tests.el
@@ -683,4 +683,124 @@ This evaluates the TESTS test cases from glibc."
   (should-not (string-match "\\`x\\{65535\\}" (make-string 65534 ?x)))
   (should-error (string-match "\\`x\\{65536\\}" "X") :type 'invalid-regexp))
 
+(ert-deftest regexp-unibyte-unibyte ()
+  "Test matching a unibyte regexp against a unibyte string."
+  ;; Sanity check
+  (should-not (multibyte-string-p "ab"))
+  (should-not (multibyte-string-p "\xff"))
+  ;; ASCII
+  (should (string-match "a[b]" "ab"))
+  ;; Raw
+  (should (string-match "\xf1" "\xf1"))
+  (should-not (string-match "\xf1" "\xc1\xb1"))
+  ;; Raw, char alt
+  (should (string-match "[\xf1]" "\xf1"))
+  (should-not (string-match "[\xf1]" "\xc1\xb1"))
+  ;; Raw range
+  (should (string-match "[\x82-\xd3]" "\xbb"))
+  (should-not (string-match "[\x82-\xd3]" "a"))
+  (should-not (string-match "[\x82-\xd3]" "\x81"))
+  (should-not (string-match "[\x82-\xd3]" "\xd4"))
+  ;; ASCII-raw range
+  (should (string-match "[f-\xd3]" "q"))
+  (should (string-match "[f-\xd3]" "\xbb"))
+  (should-not (string-match "[f-\xd3]" "e"))
+  (should-not (string-match "[f-\xd3]" "\xd4")))
+
+(ert-deftest regexp-multibyte-multibyte ()
+  "Test matching a multibyte regexp against a multibyte string."
+  ;; Sanity check
+  (should (multibyte-string-p "åü"))
+  ;; ASCII
+  (should (string-match (string-to-multibyte "a[b]")
+                        (string-to-multibyte "ab")))
+  ;; Unicode
+  (should (string-match "å[ü]z" "åüz"))
+  (should-not (string-match "ü" (string-to-multibyte "\xc3\xbc")))
+  ;; Raw
+  (should (string-match (string-to-multibyte "\xf1")
+                        (string-to-multibyte "\xf1")))
+  (should-not (string-match (string-to-multibyte "\xf1")
+                            (string-to-multibyte "\xc1\xb1")))
+  (should-not (string-match (string-to-multibyte "\xc1\xb1")
+                            (string-to-multibyte "\xf1")))
+  ;; Raw, char alt
+  (should (string-match (string-to-multibyte "[\xf1]")
+                        (string-to-multibyte "\xf1")))
+  ;; Raw range
+  (should (string-match (string-to-multibyte "[\x82-\xd3]")
+                        (string-to-multibyte "\xbb")))
+  (should-not (string-match (string-to-multibyte "[\x82-\xd3]") "a"))
+  (should-not (string-match (string-to-multibyte "[\x82-\xd3]") "Å"))
+  (should-not (string-match (string-to-multibyte "[\x82-\xd3]") "ü"))
+  (should-not (string-match (string-to-multibyte "[\x82-\xd3]") "\x81"))
+  (should-not (string-match (string-to-multibyte "[\x82-\xd3]") "\xd4"))
+  ;; ASCII-raw range: should exclude U+0100..U+10FFFF
+  (should (string-match (string-to-multibyte "[f-\xd3]")
+                        (string-to-multibyte "q")))
+  (should (string-match (string-to-multibyte "[f-\xd3]")
+                        (string-to-multibyte "\xbb")))
+  (should-not (string-match (string-to-multibyte "[f-\xd3]") "e"))
+  (should-not (string-match (string-to-multibyte "[f-\xd3]") "Å"))
+  (should-not (string-match (string-to-multibyte "[f-\xd3]") "ü"))
+  (should-not (string-match (string-to-multibyte "[f-\xd3]") "\xd4"))
+  ;; Unicode-raw range: should be empty
+  (should-not (string-match "[å-\xd3]" "å"))
+  (should-not (string-match "[å-\xd3]" (string-to-multibyte "\xd3")))
+  (should-not (string-match "[å-\xd3]" (string-to-multibyte "\xbb")))
+  (should-not (string-match "[å-\xd3]" "ü"))
+  ;; No equivalence between raw bytes and latin-1
+  (should-not (string-match "å" (string-to-multibyte "\xe5")))
+  (should-not (string-match "[å]" (string-to-multibyte "\xe5")))
+  (should-not (string-match "\xe5" "å"))
+  (should-not (string-match "[\xe5]" "å")))
+
+(ert-deftest regexp-unibyte-multibyte ()
+  "Test matching a unibyte regexp against a multibyte string."
+  ;; ASCII
+  (should (string-match "a[b]" (string-to-multibyte "ab")))
+  ;; Unicode
+  (should (string-match "a.[^b]c" (string-to-multibyte "aåüc")))
+  ;; Raw
+  (should (string-match "\xf1" (string-to-multibyte "\xf1")))
+  (should-not (string-match "\xc1\xb1" (string-to-multibyte "\xf1")))
+  ;; Raw, char alt
+  (should (string-match "[\xf1]" (string-to-multibyte "\xf1")))
+  (should-not (string-match "[\xc1][\xb1]" (string-to-multibyte "\xf1")))
+  ;; ASCII-raw range: should exclude U+0100..U+10FFFF
+  (should (string-match "[f-\xd3]" (string-to-multibyte "q")))
+  (should (string-match "[f-\xd3]" (string-to-multibyte "\xbb")))
+  (should-not (string-match "[f-\xd3]" "e"))
+  (should-not (string-match "[f-\xd3]" "Å"))
+  (should-not (string-match "[f-\xd3]" "ü"))
+  (should-not (string-match "[f-\xd3]" "\xd4"))
+  ;; No equivalence between raw bytes and latin-1
+  (should-not (string-match "\xe5" "å"))
+  (should-not (string-match "[\xe5]" "å")))
+
+(ert-deftest regexp-multibyte-unibyte ()
+  "Test matching a multibyte regexp against a unibyte string."
+  ;; ASCII
+  (should (string-match (string-to-multibyte "a[b]") "ab"))
+  ;; Unicode
+  (should (string-match "a[^ü]c" "abc"))
+  (should-not (string-match "ü" "\xc3\xbc"))
+  ;; Raw
+  (should (string-match (string-to-multibyte "\xf1") "\xf1"))
+  (should-not (string-match (string-to-multibyte "\xf1") "\xc1\xb1"))
+  ;; Raw, char alt
+  (should (string-match (string-to-multibyte "[\xf1]") "\xf1"))
+  (should-not (string-match (string-to-multibyte "[\xf1]") "\xc1\xb1"))
+  ;; ASCII-raw range: should exclude U+0100..U+10FFFF
+  (should (string-match (string-to-multibyte "[f-\xd3]") "q"))
+  (should (string-match (string-to-multibyte "[f-\xd3]") "\xbb"))
+  (should-not (string-match (string-to-multibyte "[f-\xd3]") "e"))
+  (should-not (string-match (string-to-multibyte "[f-\xd3]") "\xd4"))
+  ;; Unicode-raw range: should be empty
+  (should-not (string-match "[å-\xd3]" "\xd3"))
+  (should-not (string-match "[å-\xd3]" "\xbb"))
+  ;; No equivalence between raw bytes and latin-1
+  (should-not (string-match "å" "\xe5"))
+  (should-not (string-match "[å]" "\xe5")))
+
 ;;; regex-emacs-tests.el ends here



reply via email to

[Prev in Thread] Current Thread [Next in Thread]