[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
gawk: improvements for IGNORECASE and FS/RS
From: Aharon Robbins
Subject: gawk: improvements for IGNORECASE and FS/RS
Date: Tue, 15 Oct 2002 18:34:18 +0200
Greetings.
It turns out that gawk has both some memory leaks, and some gross
inefficiencies when IGNORECASE is toggled a lot for each record.
This is particularly apparent when using FS as a regex to split records.
The following script, applied to a series of files with about 1,790,000
lines, locked up my CPU under stock gawk 3.1.1:
# Split fields on runs of spaces and/or tabs, so FS is used as a regex.
BEGIN { FS = "[ \t]+" }
{
# Reference NF; presumably this is what forces the record to be split -- confirm.
junk = NF
# Flip IGNORECASE on every record.
IGNORECASE = ! IGNORECASE
}
# Print the total number of records read.
END { print NR }
With the patch below applied, gawk finished in under 2 minutes.
Enjoy,
Arnold Robbins
--------------------------------------------------------------------------
Tue Oct 15 14:18:53 2002 Arnold D. Robbins <address@hidden>
* eval.c (set_IGNORECASE): Call set_RS() instead of
set_FS_if_not_FIELDWIDTHS(). The former calls the latter
for us, and also makes IGNORECASE affect RS like it's supposed to.
* field.c (FS_re_yes_case, FS_re_no_case): New variables.
(set_FS): Smarten up routine to not recompile FS_regexp if all
that's changed is IGNORECASE or if switching back to FS from
FIELDWIDTHS. Significant speed-up for cases where IGNORECASE
is assigned to for every record.
* io.c (RS_re_yes_case, RS_re_no_case): New variables.
(set_RS): Changes similar to those in set_FS(). In particular,
IGNORECASE changing now affects record splitting too.
* re.c (refree): Set rp->pat.translate to NULL. It comes
from casetable and shouldn't be freed. (Strictly necessary
only for old regex, but a good idea anyway).
Also, call regfree(& rp->pat) instead of manually free()ing
things, since there's dynamically allocated stuff hiding in
the buffer. Avoids a memory leak.
diff -cr gawk-3.1.1/eval.c gawk-foo/eval.c
*** gawk-3.1.1/eval.c Tue Apr 16 14:41:14 2002
--- gawk-foo/eval.c Tue Oct 15 13:55:02 2002
***************
*** 1884,1890 ****
IGNORECASE = (force_number(IGNORECASE_node->var_value) != 0.0);
else
IGNORECASE = FALSE; /* shouldn't happen */
! set_FS_if_not_FIELDWIDTHS();
}
/* set_BINMODE --- set translation mode (OS/2, DOS, others) */
--- 1884,1891 ----
IGNORECASE = (force_number(IGNORECASE_node->var_value) != 0.0);
else
IGNORECASE = FALSE; /* shouldn't happen */
!
! set_RS(); /* set_RS() calls set_FS_if_not_FIELDWIDTHS() for us */
}
/* set_BINMODE --- set translation mode (OS/2, DOS, others) */
diff -cr gawk-3.1.1/field.c gawk-foo/field.c
*** gawk-3.1.1/field.c Tue Apr 16 14:57:25 2002
--- gawk-foo/field.c Tue Oct 15 12:14:35 2002
***************
*** 60,65 ****
--- 60,67 ----
NODE **fields_arr; /* array of pointers to the field nodes */
int field0_valid; /* $(>0) has not been changed yet */
int default_FS; /* TRUE when FS == " " */
+ Regexp *FS_re_yes_case = NULL;
+ Regexp *FS_re_no_case = NULL;
Regexp *FS_regexp = NULL;
static NODE *Null_field = NULL;
***************
*** 964,969 ****
--- 966,972 ----
NODE *fs;
static NODE *save_fs = NULL;
static NODE *save_rs = NULL;
+ int remake_re = TRUE;
/*
* If changing the way fields are split, obey least-suprise
***************
*** 972,992 ****
if (fields_arr != NULL)
(void) get_field(HUGE - 1, 0);
! if (! (save_fs && cmp_nodes(FS_node->var_value, save_fs) == 0
! && save_rs && cmp_nodes(RS_node->var_value, save_rs) == 0)) {
! unref(save_fs);
! save_fs = dupnode(FS_node->var_value);
! unref(save_rs);
! save_rs = dupnode(RS_node->var_value);
! resave_fs = TRUE;
! if (FS_regexp) {
! refree(FS_regexp);
! FS_regexp = NULL;
}
}
buf[0] = '\0';
default_FS = FALSE;
fs = force_string(FS_node->var_value);
if (! do_traditional && fs->stlen == 0) {
static short warned = FALSE;
--- 975,1012 ----
if (fields_arr != NULL)
(void) get_field(HUGE - 1, 0);
! /* It's possible that only IGNORECASE changed, or FS = FS */
! if (save_fs && cmp_nodes(FS_node->var_value, save_fs) == 0
! && save_rs && cmp_nodes(RS_node->var_value, save_rs) == 0) {
! if (FS_regexp != NULL)
! FS_regexp = (IGNORECASE ? FS_re_no_case :
FS_re_yes_case);
!
! /* FS = FS */
! if (! using_FIELDWIDTHS())
! return;
! else {
! remake_re = FALSE;
! goto choose_fs_function;
}
}
+
+ unref(save_fs);
+ save_fs = dupnode(FS_node->var_value);
+ unref(save_rs);
+ save_rs = dupnode(RS_node->var_value);
+ resave_fs = TRUE;
+ if (FS_regexp != NULL) {
+ refree(FS_re_yes_case);
+ refree(FS_re_no_case);
+ FS_re_yes_case = FS_re_no_case = FS_regexp = NULL;
+ }
+
+
+ choose_fs_function:
buf[0] = '\0';
default_FS = FALSE;
fs = force_string(FS_node->var_value);
+
if (! do_traditional && fs->stlen == 0) {
static short warned = FALSE;
***************
*** 1024,1036 ****
sprintf(buf, "[%c]", fs->stptr[0]);
}
}
! if (buf[0] != '\0') {
! FS_regexp = make_regexp(buf, strlen(buf), IGNORECASE, TRUE);
! parse_field = re_parse_field;
! } else if (parse_field == re_parse_field) {
! FS_regexp = make_regexp(fs->stptr, fs->stlen, IGNORECASE, TRUE);
! } else
! FS_regexp = NULL;
update_PROCINFO("FS", "FS");
}
--- 1044,1067 ----
sprintf(buf, "[%c]", fs->stptr[0]);
}
}
! if (remake_re) {
! if (FS_regexp != NULL) {
! refree(FS_re_yes_case);
! refree(FS_re_no_case);
! FS_re_yes_case = FS_re_no_case = FS_regexp = NULL;
! }
! if (buf[0] != '\0') {
! FS_re_yes_case = make_regexp(buf, strlen(buf), FALSE,
TRUE);
! FS_re_no_case = make_regexp(buf, strlen(buf), TRUE,
TRUE);
! FS_regexp = (IGNORECASE ? FS_re_no_case :
FS_re_yes_case);
! parse_field = re_parse_field;
! } else if (parse_field == re_parse_field) {
! FS_re_yes_case = make_regexp(fs->stptr, fs->stlen,
FALSE, TRUE);
! FS_re_no_case = make_regexp(fs->stptr, fs->stlen, TRUE,
TRUE);
! FS_regexp = (IGNORECASE ? FS_re_no_case :
FS_re_yes_case);
! } else
! FS_re_yes_case = FS_re_no_case = FS_regexp = NULL;
! }
update_PROCINFO("FS", "FS");
}
diff -cr gawk-3.1.1/io.c gawk-foo/io.c
*** gawk-3.1.1/io.c Tue Apr 16 14:57:44 2002
--- gawk-foo/io.c Tue Oct 15 11:57:14 2002
***************
*** 122,127 ****
--- 122,129 ----
static struct redirect *red_head = NULL;
static NODE *RS;
+ static Regexp *RS_re_yes_case;
+ static Regexp *RS_re_no_case;
static Regexp *RS_regexp;
int RS_is_null;
***************
*** 2500,2521 ****
{
static NODE *save_rs = NULL;
! if (save_rs && cmp_nodes(RS_node->var_value, save_rs) == 0)
return;
unref(save_rs);
save_rs = dupnode(RS_node->var_value);
RS_is_null = FALSE;
RS = force_string(RS_node->var_value);
if (RS_regexp != NULL) {
! refree(RS_regexp);
! RS_regexp = NULL;
}
if (RS->stlen == 0)
RS_is_null = TRUE;
else if (RS->stlen > 1) {
static int warned = FALSE;
! RS_regexp = make_regexp(RS->stptr, RS->stlen, IGNORECASE, TRUE);
if (do_lint && ! warned) {
lintwarn(_("multicharacter value of `RS' is a gawk
extension"));
--- 2502,2528 ----
{
static NODE *save_rs = NULL;
! if (save_rs && cmp_nodes(RS_node->var_value, save_rs) == 0) {
! RS_regexp = (IGNORECASE ? RS_re_no_case : RS_re_yes_case);
return;
+ }
unref(save_rs);
save_rs = dupnode(RS_node->var_value);
RS_is_null = FALSE;
RS = force_string(RS_node->var_value);
if (RS_regexp != NULL) {
! refree(RS_re_yes_case);
! refree(RS_re_no_case);
! RS_re_yes_case = RS_re_no_case = RS_regexp = NULL;
}
if (RS->stlen == 0)
RS_is_null = TRUE;
else if (RS->stlen > 1) {
static int warned = FALSE;
! RS_re_yes_case = make_regexp(RS->stptr, RS->stlen, FALSE, TRUE);
! RS_re_no_case = make_regexp(RS->stptr, RS->stlen, TRUE, TRUE);
! RS_regexp = (IGNORECASE ? RS_re_no_case : RS_re_yes_case);
if (do_lint && ! warned) {
lintwarn(_("multicharacter value of `RS' is a gawk
extension"));
diff -cr gawk-3.1.1/re.c gawk-foo/re.c
*** gawk-3.1.1/re.c Tue Apr 16 14:58:58 2002
--- gawk-foo/re.c Tue Oct 15 11:42:31 2002
***************
*** 221,228 ****
void
refree(Regexp *rp)
{
! free(rp->pat.buffer);
! free(rp->pat.fastmap);
if (rp->regs.start)
free(rp->regs.start);
if (rp->regs.end)
--- 221,230 ----
void
refree(Regexp *rp)
{
! /* this isn't malloced, don't let regfree free it. */
! rp->pat.translate = NULL;
!
! regfree(& rp->pat);
if (rp->regs.start)
free(rp->regs.start);
if (rp->regs.end)
- gawk: improvements for IGNORECASE and FS/RS,
Aharon Robbins <=