qemacs-commit
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Qemacs-commit] qemacs buffer.c qe.c qe.h


From: Charlie Gordon
Subject: [Qemacs-commit] qemacs buffer.c qe.c qe.h
Date: Sun, 16 Aug 2015 23:15:02 +0000

CVSROOT:        /sources/qemacs
Module name:    qemacs
Changes by:     Charlie Gordon <chqrlie>        15/08/16 23:15:02

Modified files:
        .              : buffer.c qe.c qe.h 

Log message:
        search: another 35% speed improvement on large files
        - improve utf8 handling in eb_prevc() using eb_read_one_byte()
        - add utf8_is_trailing_byte(c) to standardize testing for UTF-8 trailing (continuation) bytes
        - improve hex searching by 50% using eb_read_one_byte()
        - improve regular searching by 35% by removing extra calls to eb_nextc()

CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/qemacs/buffer.c?cvsroot=qemacs&r1=1.85&r2=1.86
http://cvs.savannah.gnu.org/viewcvs/qemacs/qe.c?cvsroot=qemacs&r1=1.200&r2=1.201
http://cvs.savannah.gnu.org/viewcvs/qemacs/qe.h?cvsroot=qemacs&r1=1.197&r2=1.198

Patches:
Index: buffer.c
===================================================================
RCS file: /sources/qemacs/qemacs/buffer.c,v
retrieving revision 1.85
retrieving revision 1.86
diff -u -b -r1.85 -r1.86
--- buffer.c    16 Aug 2015 19:22:01 -0000      1.85
+++ buffer.c    16 Aug 2015 23:15:01 -0000      1.86
@@ -1214,6 +1214,7 @@
         if (ch == ESCAPE_CHAR) {
             eb_read(b, offset - 1, buf, MAX_CHAR_BYTES);
             b->charset_state.p = buf;
+            /* XXX: incorrect behaviour on ill encoded utf8 sequences */
             ch = b->charset_state.decode_func(&b->charset_state);
             offset += (b->charset_state.p - buf) - 1;
         }
@@ -1297,27 +1298,36 @@
         offset = 0;
         ch = '\n';
     } else {
-        /* XXX: it cannot be generic here. Should use the
-           line/column system to be really generic */
-        char_size = b->charset_state.char_size;
-        offset -= char_size;
-        q = buf + sizeof(buf) - char_size;
-        eb_read(b, offset, q, char_size);
         if (b->charset == &charset_utf8) {
-            while (*q >= 0x80 && *q < 0xc0) {
-                if (offset == 0 || q == buf) {
-                    /* error: take only previous byte */
-                    offset += buf - 1 - q;
+            char_size = 1;
+            offset -= 1;
+            ch = eb_read_one_byte(b, offset);
+            if (utf8_is_trailing_byte(ch)) {
+                int offset1 = offset;
+                q = buf + sizeof(buf);
+                *--q = ch;
+                while (utf8_is_trailing_byte(ch) && offset > 0 && q > buf) {
+                    offset -= 1;
+                    *--q = ch = eb_read_one_byte(b, offset);
+                }
+                if (ch >= 0xc0) {
+                    ch = utf8_decode((const char **)(void *)&q);
+                }
+                if (q != buf + sizeof(buf)) {
+                    /* decoding error: only take the last byte */
+                    offset = offset1;
                     ch = buf[sizeof(buf) - 1];
-                    goto the_end;
                 }
-                offset--;
-                q--;
-                eb_read(b, offset, q, 1);
             }
-            ch = utf8_decode((const char **)(void *)&q);
         } else {
-            /* CG: this only works for stateless charsets */
+            /* XXX: this only works for stateless charsets.
+             * it would fail for utf-16 and east-asian encodings.
+             * Should use the line/column system to be really generic
+             */
+            char_size = b->charset_state.char_size;
+            offset -= char_size;
+            q = buf + sizeof(buf) - char_size;
+            eb_read(b, offset, q, char_size);
             b->charset_state.p = q;
             ch = b->charset_state.decode_func(&b->charset_state);
         }
@@ -1340,7 +1350,6 @@
             }
         }
     }
- the_end:
     *prev_ptr = offset;
     return ch;
 }
@@ -1482,7 +1491,7 @@
             /* Round offset down to character boundary */
             u8 buf[1];
             while (offset > 0 && eb_read(b, offset, buf, 1) == 1 &&
-                   (buf[0] & 0xC0) == 0x80) {
+                   utf8_is_trailing_byte(buf[0])) {
                 /* backtrack over trailing bytes */
                 offset--;
             }

Index: qe.c
===================================================================
RCS file: /sources/qemacs/qemacs/qe.c,v
retrieving revision 1.200
retrieving revision 1.201
diff -u -b -r1.200 -r1.201
--- qe.c        16 Aug 2015 18:01:06 -0000      1.200
+++ qe.c        16 Aug 2015 23:15:01 -0000      1.201
@@ -2251,9 +2251,8 @@
 {
     char buf[256];
     buf_t outbuf, *out;
-    unsigned char cc;
     int line_num, col_num;
-    int c, c2, offset1, offset2, off;
+    int c, c2, cc, offset1, offset2, off;
 
     out = buf_init(&outbuf, buf, sizeof(buf));
     if (s->offset < s->b->total_size) {
@@ -2295,11 +2294,11 @@
 
         /* Display buffer bytes if char is encoded */
         off = s->offset;
-        eb_read(s->b, off++, &cc, 1);
+        cc = eb_read_one_byte(s->b, off++);
         if (cc != c || c2 || off != offset2) {
             buf_printf(out, " [%02X", cc);
             while (off < offset2) {
-                eb_read(s->b, off++, &cc, 1);
+                cc = eb_read_one_byte(s->b, off++);
                 buf_printf(out, " %02X", cc);
             }
             buf_put_byte(out, ']');
@@ -6472,7 +6471,7 @@
                      int *found_offset, int *found_end)
 {
     int total_size = b->total_size;
-    int c, c2, offset = start_offset, offset1, offset2, pos;
+    int c, c2, offset = start_offset, offset1, offset2, offset3, pos;
 
     if (len == 0)
         return 0;
@@ -6504,19 +6503,16 @@
             if (offset >= total_size)
                 return 0;
 
-            if ((offset & 0x1ffff) == 0) {
-                /* check for search abort every 128k */
+            if ((offset & 0xfffff) == 0) {
+                /* check for search abort every megabyte */
                 if (abort_func && abort_func(abort_opaque))
                     return -1;
             }
 
             pos = 0;
             for (offset2 = offset; offset2 < total_size;) {
-                u8 data[1];
-
                 /* CG: Should bufferize a bit ? */
-                eb_read(b, offset2++, data, 1);
-                c = data[0];
+                c = eb_read_one_byte(b, offset2++);
                 c2 = buf[pos++];
                 if (c != c2)
                     break;
@@ -6530,33 +6526,35 @@
         }
     }
 
-    for (;; (void)(dir >= 0 && eb_nextc(b, offset, &offset))) {
+    for (offset1 = offset;;) {
         if (dir < 0) {
             if (offset == 0)
                 return 0;
             eb_prevc(b, offset, &offset);
-        }
+        } else {
+            offset = offset1;
         if (offset >= total_size)
             return 0;
-
-        if ((offset & 0x1ffff) == 0) {
-            /* check for search abort every 128k */
+        }
+        if ((offset & 0xfffff) == 0) {
+            /* check for search abort every megabyte */
             if (abort_func && abort_func(abort_opaque))
                 return -1;
         }
 
         if (flags & SEARCH_FLAG_WORD) {
             /* check for start of word */
-            c = eb_prevc(b, offset, &offset1);
+            c = eb_prevc(b, offset, &offset3);
             if (qe_isword(c))
                 continue;
         }
 
-        pos = 0;
-        offset2 = offset;
-        while (offset2 < total_size) {
             /* CG: XXX: Should use buffer specific accelerator */
-            c = eb_nextc(b, offset2, &offset2);
+        /* Get first char separately to compute offset1 */
+        c = eb_nextc(b, offset, &offset1);
+
+        pos = 0;
+        for (offset2 = offset1;;) {
             c2 = buf[pos++];
             if (flags & SEARCH_FLAG_IGNORECASE) {
                 if (qe_toupper(c) != qe_toupper(c2))
@@ -6568,7 +6566,7 @@
             if (pos >= len) {
                 if (flags & SEARCH_FLAG_WORD) {
                     /* check for end of word */
-                    c = eb_nextc(b, offset2, &offset1);
+                    c = eb_nextc(b, offset2, &offset3);
                     if (qe_isword(c))
                         break;
                 }
@@ -6578,6 +6576,9 @@
                     return 1;
                 }
             }
+            if (offset2 >= total_size)
+                break;
+            c = eb_nextc(b, offset2, &offset2);
         }
     }
 }

Index: qe.h
===================================================================
RCS file: /sources/qemacs/qemacs/qe.h,v
retrieving revision 1.197
retrieving revision 1.198
diff -u -b -r1.197 -r1.198
--- qe.h        16 Aug 2015 19:22:01 -0000      1.197
+++ qe.h        16 Aug 2015 23:15:02 -0000      1.198
@@ -601,6 +601,7 @@
 void qe_register_charset(QECharset *charset);
 
 extern unsigned char utf8_length[256];
+static inline int utf8_is_trailing_byte(int c) { return (c & 0xC0) == 0x80; }
 int utf8_encode(char *q, int c);
 int utf8_decode(const char **pp);
 int utf8_to_unicode(unsigned int *dest, int dest_length, const char *str);



reply via email to

[Prev in Thread] Current Thread [Next in Thread]