bug-gnu-utils
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

gawk: improvements for IGNORECASE and FS/RS


From: Aharon Robbins
Subject: gawk: improvements for IGNORECASE and FS/RS
Date: Tue, 15 Oct 2002 18:34:18 +0200

Greetings.

It turns out that gawk has both some memory leaks, and some gross
inefficiencies when IGNORECASE is toggled a lot for each record.
This is particularly apparent when using FS as a regex to split records.
The following script, applied to a series of files with about 1,790,000
lines, locked up my CPU under stock gawk 3.1.1:

        BEGIN { FS = "[ \t]+" }

        {
                junk = NF
                IGNORECASE = ! IGNORECASE
        }

        END { print NR }

With the patch below applied, gawk finished in under 2 minutes.

Enjoy,

Arnold Robbins
--------------------------------------------------------------------------
Tue Oct 15 14:18:53 2002  Arnold D. Robbins   <address@hidden>

        * eval.c (set_IGNORECASE): Call set_RS() instead of
        set_FS_if_not_FIELDWIDTHS().  The former calls the latter
        for us, and also makes IGNORECASE affect RS like it's supposed to.
        * field.c (FS_re_yes_case, FS_re_no_case): New variables.
        (set_FS): Smarten up routine to not recompile FS_regexp if all
        that's changed is IGNORECASE or if switching back to FS from
        FIELDWIDTHS.  Significant speed-up for cases where IGNORECASE
        is assigned to for every record.
        * io.c (RS_re_yes_case, RS_re_no_case): New variables.
        (set_RS): Changes similar to those made in set_FS().  In particular,
        changing IGNORECASE now affects record splitting too.
        * re.c (refree): Set rp->pat.translate to NULL. It comes
        from casetable and shouldn't be freed. (Strictly necessary
        only for old regex, but a good idea anyway).
        Also, call regfree(& rp->pat) instead of manually free()ing
        things, since there's dynamically allocated stuff hiding in
        the buffer. Avoids a memory leak.

diff -cr gawk-3.1.1/eval.c gawk-foo/eval.c
*** gawk-3.1.1/eval.c   Tue Apr 16 14:41:14 2002
--- gawk-foo/eval.c     Tue Oct 15 13:55:02 2002
***************
*** 1884,1890 ****
                IGNORECASE = (force_number(IGNORECASE_node->var_value) != 0.0);
        else
                IGNORECASE = FALSE;             /* shouldn't happen */
!       set_FS_if_not_FIELDWIDTHS();
  }
  
  /* set_BINMODE --- set translation mode (OS/2, DOS, others) */
--- 1884,1891 ----
                IGNORECASE = (force_number(IGNORECASE_node->var_value) != 0.0);
        else
                IGNORECASE = FALSE;             /* shouldn't happen */
! 
!       set_RS();       /* set_RS() calls set_FS_if_not_FIELDWIDTHS() for us */
  }
  
  /* set_BINMODE --- set translation mode (OS/2, DOS, others) */
diff -cr gawk-3.1.1/field.c gawk-foo/field.c
*** gawk-3.1.1/field.c  Tue Apr 16 14:57:25 2002
--- gawk-foo/field.c    Tue Oct 15 12:14:35 2002
***************
*** 60,65 ****
--- 60,67 ----
  NODE **fields_arr;            /* array of pointers to the field nodes */
  int field0_valid;             /* $(>0) has not been changed yet */
  int default_FS;                       /* TRUE when FS == " " */
+ Regexp *FS_re_yes_case = NULL;
+ Regexp *FS_re_no_case = NULL;
  Regexp *FS_regexp = NULL;
  static NODE *Null_field = NULL;
  
***************
*** 964,969 ****
--- 966,972 ----
        NODE *fs;
        static NODE *save_fs = NULL;
        static NODE *save_rs = NULL;
+       int remake_re = TRUE;
  
        /*
         * If changing the way fields are split, obey least-suprise
***************
*** 972,992 ****
        if (fields_arr != NULL)
                (void) get_field(HUGE - 1, 0);
  
!       if (! (save_fs && cmp_nodes(FS_node->var_value, save_fs) == 0
!               && save_rs && cmp_nodes(RS_node->var_value, save_rs) == 0)) {
!               unref(save_fs);
!               save_fs = dupnode(FS_node->var_value);
!               unref(save_rs);
!               save_rs = dupnode(RS_node->var_value);
!               resave_fs = TRUE;
!               if (FS_regexp) {
!                       refree(FS_regexp);
!                       FS_regexp = NULL;
                }
        }
        buf[0] = '\0';
        default_FS = FALSE;
        fs = force_string(FS_node->var_value);
        if (! do_traditional && fs->stlen == 0) {
                static short warned = FALSE;
  
--- 975,1012 ----
        if (fields_arr != NULL)
                (void) get_field(HUGE - 1, 0);
  
!       /* It's possible that only IGNORECASE changed, or FS = FS */
!       if (save_fs && cmp_nodes(FS_node->var_value, save_fs) == 0
!               && save_rs && cmp_nodes(RS_node->var_value, save_rs) == 0) {
!               if (FS_regexp != NULL)
!                       FS_regexp = (IGNORECASE ? FS_re_no_case : 
FS_re_yes_case);
! 
!               /* FS = FS */
!               if (! using_FIELDWIDTHS())
!                       return;
!               else {
!                       remake_re = FALSE;
!                       goto choose_fs_function;
                }
        }
+ 
+       unref(save_fs);
+       save_fs = dupnode(FS_node->var_value);
+       unref(save_rs);
+       save_rs = dupnode(RS_node->var_value);
+       resave_fs = TRUE;
+       if (FS_regexp != NULL) {
+               refree(FS_re_yes_case);
+               refree(FS_re_no_case);
+               FS_re_yes_case = FS_re_no_case = FS_regexp = NULL;
+       }
+ 
+ 
+ choose_fs_function:
        buf[0] = '\0';
        default_FS = FALSE;
        fs = force_string(FS_node->var_value);
+ 
        if (! do_traditional && fs->stlen == 0) {
                static short warned = FALSE;
  
***************
*** 1024,1036 ****
                                sprintf(buf, "[%c]", fs->stptr[0]);
                }
        }
!       if (buf[0] != '\0') {
!               FS_regexp = make_regexp(buf, strlen(buf), IGNORECASE, TRUE);
!               parse_field = re_parse_field;
!       } else if (parse_field == re_parse_field) {
!               FS_regexp = make_regexp(fs->stptr, fs->stlen, IGNORECASE, TRUE);
!       } else
!               FS_regexp = NULL;
  
        update_PROCINFO("FS", "FS");
  }
--- 1044,1067 ----
                                sprintf(buf, "[%c]", fs->stptr[0]);
                }
        }
!       if (remake_re) {
!               if (FS_regexp != NULL) {
!                       refree(FS_re_yes_case);
!                       refree(FS_re_no_case);
!                       FS_re_yes_case = FS_re_no_case = FS_regexp = NULL;
!               }
!               if (buf[0] != '\0') {
!                       FS_re_yes_case = make_regexp(buf, strlen(buf), FALSE, 
TRUE);
!                       FS_re_no_case = make_regexp(buf, strlen(buf), TRUE, 
TRUE);
!                       FS_regexp = (IGNORECASE ? FS_re_no_case : 
FS_re_yes_case);
!                       parse_field = re_parse_field;
!               } else if (parse_field == re_parse_field) {
!                       FS_re_yes_case = make_regexp(fs->stptr, fs->stlen, 
FALSE, TRUE);
!                       FS_re_no_case = make_regexp(fs->stptr, fs->stlen, TRUE, 
TRUE);
!                       FS_regexp = (IGNORECASE ? FS_re_no_case : 
FS_re_yes_case);
!               } else
!                       FS_re_yes_case = FS_re_no_case = FS_regexp = NULL;
!       }
  
        update_PROCINFO("FS", "FS");
  }
diff -cr gawk-3.1.1/io.c gawk-foo/io.c
*** gawk-3.1.1/io.c     Tue Apr 16 14:57:44 2002
--- gawk-foo/io.c       Tue Oct 15 11:57:14 2002
***************
*** 122,127 ****
--- 122,129 ----
  
  static struct redirect *red_head = NULL;
  static NODE *RS;
+ static Regexp *RS_re_yes_case;
+ static Regexp *RS_re_no_case;
  static Regexp *RS_regexp;
  
  int RS_is_null;
***************
*** 2500,2521 ****
  {
        static NODE *save_rs = NULL;
  
!       if (save_rs && cmp_nodes(RS_node->var_value, save_rs) == 0)
                return;
        unref(save_rs);
        save_rs = dupnode(RS_node->var_value);
        RS_is_null = FALSE;
        RS = force_string(RS_node->var_value);
        if (RS_regexp != NULL) {
!               refree(RS_regexp);
!               RS_regexp = NULL;
        }
        if (RS->stlen == 0)
                RS_is_null = TRUE;
        else if (RS->stlen > 1) {
                static int warned = FALSE;
  
!               RS_regexp = make_regexp(RS->stptr, RS->stlen, IGNORECASE, TRUE);
  
                if (do_lint && ! warned) {
                        lintwarn(_("multicharacter value of `RS' is a gawk 
extension"));
--- 2502,2528 ----
  {
        static NODE *save_rs = NULL;
  
!       if (save_rs && cmp_nodes(RS_node->var_value, save_rs) == 0) {
!               RS_regexp = (IGNORECASE ? RS_re_no_case : RS_re_yes_case);
                return;
+       }
        unref(save_rs);
        save_rs = dupnode(RS_node->var_value);
        RS_is_null = FALSE;
        RS = force_string(RS_node->var_value);
        if (RS_regexp != NULL) {
!               refree(RS_re_yes_case);
!               refree(RS_re_no_case);
!               RS_re_yes_case = RS_re_no_case = RS_regexp = NULL;
        }
        if (RS->stlen == 0)
                RS_is_null = TRUE;
        else if (RS->stlen > 1) {
                static int warned = FALSE;
  
!               RS_re_yes_case = make_regexp(RS->stptr, RS->stlen, FALSE, TRUE);
!               RS_re_no_case = make_regexp(RS->stptr, RS->stlen, TRUE, TRUE);
!               RS_regexp = (IGNORECASE ? RS_re_no_case : RS_re_yes_case);
  
                if (do_lint && ! warned) {
                        lintwarn(_("multicharacter value of `RS' is a gawk 
extension"));
diff -cr gawk-3.1.1/re.c gawk-foo/re.c
*** gawk-3.1.1/re.c     Tue Apr 16 14:58:58 2002
--- gawk-foo/re.c       Tue Oct 15 11:42:31 2002
***************
*** 221,228 ****
  void
  refree(Regexp *rp)
  {
!       free(rp->pat.buffer);
!       free(rp->pat.fastmap);
        if (rp->regs.start)
                free(rp->regs.start);
        if (rp->regs.end)
--- 221,230 ----
  void
  refree(Regexp *rp)
  {
!       /* this isn't malloced, don't let regfree free it. */
!       rp->pat.translate = NULL;
! 
!       regfree(& rp->pat);
        if (rp->regs.start)
                free(rp->regs.start);
        if (rp->regs.end)




reply via email to

[Prev in Thread] Current Thread [Next in Thread]