[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH 13/15] Reduce trie->accepting to a flag
From: |
Nick Cleaton |
Subject: |
[PATCH 13/15] Reduce trie->accepting to a flag |
Date: |
Sat, 2 Oct 2010 07:07:44 +0100 |
The use pattern of kwset in grep is such that an int trie->accepting
value isn't really needed - a flag is enough.
---
src/cpsort.c | 31 +++++++++++++++++--------------
src/cpsort.h | 7 ++++---
src/dfasearch.c | 24 +++++++++---------------
src/kwsearch.c | 2 +-
src/kwset.c | 18 ++++++++++--------
src/kwset.h | 9 +++++----
6 files changed, 46 insertions(+), 45 deletions(-)
diff --git a/src/cpsort.c b/src/cpsort.c
index 263d895..fc561f6 100644
--- a/src/cpsort.c
+++ b/src/cpsort.c
@@ -165,13 +165,13 @@ merge_blocks (struct mergeblock *start)
{
if (b_len == a_elt->suf_len)
{
- /* Duplicate. Keep the lower index value. */
- if (b_elt->index < a_elt->index)
- a_elt->index = b_elt->index;
+ /* Duplicate. Prefer a flag=0 word. */
+ if (b_elt->flag < a_elt->flag)
+ a_elt->flag = b_elt->flag;
/* Advance b, to skip the duplicate. */
b += b_elt++->suf_len;
- if (b_elt->index < 0)
+ if (b_elt->flag < 0)
{
b_elt = a_elt;
break;
@@ -203,7 +203,7 @@ merge_blocks (struct mergeblock *start)
}
/* Copy metadata from a to dest. */
- dest_md->index = a_elt->index;
+ dest_md->flag = a_elt->flag;
dest_md->suf_len = a_elt->suf_len - (cpl_a_dest - a_elt->cpl);
dest_md->cpl = cpl_a_dest;
dest_md++;
@@ -213,14 +213,14 @@ merge_blocks (struct mergeblock *start)
cpl_a_dest = a_elt->cpl;
cpl_b_dest = cpl_ab;
- if (a_elt->index < 0)
+ if (a_elt->flag < 0)
{
memcpy (dest, dc_base, a - dc_base);
dest += a - dc_base;
dc_base = b + (cpl_b_dest - b_elt->cpl);
- dest_md->index = b_elt->index;
+ dest_md->flag = b_elt->flag;
dest_md->cpl = cpl_b_dest;
dest_md->suf_len = b_elt->suf_len - (cpl_b_dest - b_elt->cpl);
dest_md++;
@@ -241,13 +241,13 @@ merge_blocks (struct mergeblock *start)
}
/* Copy remaining metadata elts from input b. */
- while (b_elt->index != -1)
+ while (b_elt->flag != -1)
{
- dest_md->index = b_elt->index;
+ dest_md->flag = b_elt->flag;
dest_md->cpl = b_elt->cpl;
dest_md++->suf_len = b_elt++->suf_len;
}
- dest_md->index = -1; /* Terminate the vector. */
+ dest_md->flag = -1; /* Terminate the vector. */
/* Store the combination in the first block, remove the second. */
free (start->suffixes);
@@ -304,7 +304,7 @@ flush_sorted_group (cpsort_t c)
return _("memory exhausted");
/* Terminate the vector of cpsort_md structs. */
- c->psg.strs[c->psg.count].index = -1;
+ c->psg.strs[c->psg.count].flag = -1;
/* Move the data to the mergeblock. */
mb->str_count = c->psg.count;
@@ -342,7 +342,7 @@ flush_sorted_group (cpsort_t c)
/* Add the given string to the cpsort. Return NULL for success, or an
error message. */
const char *
-cpsort_incr (cpsort_t c, char const *str, size_t len)
+cpsort_incr (cpsort_t c, char const *str, size_t len, int flag)
{
char const *err;
unsigned char const *ustr = str;
@@ -361,7 +361,10 @@ cpsort_incr (cpsort_t c, char const *str, size_t len)
{
if (c->psg.last_len == len && c->psg.count != 0)
{
- /* Duplicate, discard. */
+ /* Duplicate, discard. Keep the flag=0 instance if there
+ is one. */
+ if (!flag)
+ c->psg.strs[c->psg.count-1].flag = 0;
c->str_count++;
return NULL;
}
@@ -390,7 +393,7 @@ cpsort_incr (cpsort_t c, char const *str, size_t len)
c->psg.last_len = len;
c->psg.strs[c->psg.count].cpl = cpl;
c->psg.strs[c->psg.count].suf_len = len - cpl;
- c->psg.strs[c->psg.count].index = c->str_count++;
+ c->psg.strs[c->psg.count].flag = flag ? 1 : 0;
c->psg.count++;
memcpy (c->psg.suffixes + c->psg.tot_suf_len, str + cpl, len - cpl);
c->psg.tot_suf_len += len - cpl;
diff --git a/src/cpsort.h b/src/cpsort.h
index 87a5f03..e3f7cd4 100644
--- a/src/cpsort.h
+++ b/src/cpsort.h
@@ -37,7 +37,7 @@
/* Meta-data for one string in a sorted dump of a cpsort. */
struct cpsort_md
{
- int index; /* Index number of this string. */
+ int flag; /* The flag for this string, 0 or 1. */
int cpl; /* Common Prefix Length with previous. */
int suf_len; /* Length, excluding the common prefix. */
};
@@ -59,8 +59,9 @@ typedef struct cpsort *cpsort_t;
extern cpsort_t cpsort_alloc (void);
/* Add the given string to the cpsort. Return NULL for success, or an
- error message. */
-extern const char *cpsort_incr (cpsort_t, char const *, size_t);
+ error message. The int value is the flag value to assign to this
+ string, 0 or 1. */
+extern const char *cpsort_incr (cpsort_t, char const *, size_t, int);
/* Put a sorted dump of the cpsort in the specified cpcsort_dump struct.
Return NULL for success, or an error message. */
diff --git a/src/dfasearch.c b/src/dfasearch.c
index 780891f..446a9e4 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -68,13 +68,8 @@ dfawarn (char const *mesg)
dfaerror (mesg);
}
-/* Number of compiled fixed strings known to exactly match the regexp.
- If kwsexec returns < kwset_exact_matches, then we don't need to
- call the regexp matcher at all. */
-static int kwset_exact_matches;
-
static char const *
-kwsincr_case (const char *must)
+kwsincr_case (const char *must, int flag)
{
const char *buf;
size_t n;
@@ -86,7 +81,7 @@ kwsincr_case (const char *must)
else
#endif
buf = must;
- return kwsincr (kwset, buf, n);
+ return kwsincr (kwset, buf, n, flag);
}
/* If the DFA turns out to have some set of fixed strings one of
@@ -104,23 +99,22 @@ kwsmusts (void)
{
kwsinit (&kwset);
/* First, we compile in the substrings known to be exact
- matches. The kwset matcher will return the index
- of the matching string that it chooses. */
+ matches. We tell kwset to set the flag to 0 for these
+ strings. */
for (; dm; dm = dm->next)
{
if (!dm->exact)
continue;
- ++kwset_exact_matches;
- if ((err = kwsincr_case (dm->must)) != NULL)
+ if ((err = kwsincr_case (dm->must, 0)) != NULL)
error (EXIT_TROUBLE, 0, "%s", err);
}
- /* Now, we compile the substrings that will require
- the use of the regexp matcher. */
+ /* Now, we compile the substrings that will require the use
+ * of the regexp matcher. Flag=1 in kwset for these. */
for (dm = dfamusts (dfa); dm; dm = dm->next)
{
if (dm->exact)
continue;
- if ((err = kwsincr_case (dm->must)) != NULL)
+ if ((err = kwsincr_case (dm->must, 1)) != NULL)
error (EXIT_TROUBLE, 0, "%s", err);
}
if ((err = kwsprep (kwset)) != NULL)
@@ -261,7 +255,7 @@ EGexecute (char const *buf, size_t size, size_t *match_size,
match = beg;
while (beg > buf && beg[-1] != eol)
--beg;
- if (kwsm.index < kwset_exact_matches)
+ if (!kwsm.flag)
{
#if MBS_SUPPORT
if (mb_start < beg)
diff --git a/src/kwsearch.c b/src/kwsearch.c
index 9244a8a..f15a6b1 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -66,7 +66,7 @@ Fcompile (char const *pattern, size_t size)
#endif
}
- if ((err = kwsincr (kwset, beg, end - beg)) != NULL)
+ if ((err = kwsincr (kwset, beg, end - beg, 0)) != NULL)
error (EXIT_TROUBLE, 0, "%s", err);
beg = lim;
}
diff --git a/src/kwset.c b/src/kwset.c
index e23cbca..aa75fbd 100644
--- a/src/kwset.c
+++ b/src/kwset.c
@@ -71,7 +71,7 @@ struct trie
rd;
unsigned char label; /* Label on the incoming edge. */
unsigned char is_lastkid; /* True for the last of a group of siblings. */
- unsigned int accepting; /* Word index of accepted word, or zero. */
+ unsigned char accepting; /* Flag value of accepted word, or zero. */
struct trie *links; /* Tree of edges leaving this node. */
int shift; /* Shift function for search failures. */
};
@@ -105,6 +105,7 @@ struct kwset
struct trie *next_free_node; /* Next unallocated trie in node_storage. */
struct broodnote *bn_cursor; /* A cursor for walking the broodnotes. */
struct trie *next_prealloced; /* Linked list of pre-allocated trie nodes. */
+ int first_string_flag; /* Flag value on first kwsincr() call. */
};
/* Allocate and initialize a keyword set object, returning an opaque
@@ -144,7 +145,7 @@ kwsalloc (char const *trans)
/* Add the given string to the contents of the keyword set. Return NULL
for success, an error message otherwise. */
const char *
-kwsincr (kwset_t kws, char const *text, size_t len)
+kwsincr (kwset_t kws, char const *text, size_t len, int flag)
{
char const *err, *end;
char *dest;
@@ -163,10 +164,11 @@ kwsincr (kwset_t kws, char const *text, size_t len)
while (text < end)
*dest++ = kws->trans ? kws->trans[U (*--end)] : *--end;
- if ((err = cpsort_incr (kws->cps, kws->kwbuf, len)))
+ if ((err = cpsort_incr (kws->cps, kws->kwbuf, len, flag ? 1 : 0)))
return err;
- ++kws->words;
+ if (!kws->words++)
+ kws->first_string_flag = flag ? 1 : 0;
/* Keep track of the longest and shortest string of the keyword set. */
if ((int)len < kws->mind)
@@ -358,8 +360,8 @@ install_suffix (kwset_t kws, struct cpsort_md *md, char
const *text,
}
/* Mark the node we finally reached as accepting, encoding the
- index number of this word in the keyword set. */
- trie->accepting = 1 + 2 * md->index;
+ flag value for this keyword. */
+ trie->accepting = 1 + 2 * md->flag;
trie->links = NULL;
}
@@ -874,7 +876,7 @@ cwexec (kwset_t kws, char const *text, size_t len, struct
kwsmatch *kwsmatch)
d = 1;
}
- kwsmatch->index = accept->accepting / 2;
+ kwsmatch->flag = accept->accepting / 2;
kwsmatch->offset[0] = mch - text;
kwsmatch->size[0] = mch_len;
@@ -895,7 +897,7 @@ kwsexec (kwset_t kws, char const *text, size_t size, struct
kwsmatch *kwsmatch)
size_t ret = bmexec (kws, text, size);
if (ret != (size_t) -1)
{
- kwsmatch->index = 0;
+ kwsmatch->flag = kwset->first_string_flag;
kwsmatch->offset[0] = ret;
kwsmatch->size[0] = kwset->mind;
}
diff --git a/src/kwset.h b/src/kwset.h
index 35caa41..6acb1c1 100644
--- a/src/kwset.h
+++ b/src/kwset.h
@@ -23,7 +23,7 @@
struct kwsmatch
{
- int index; /* Index number of matching keyword. */
+ int flag; /* Flag value for the matching keyword. */
size_t offset[1]; /* Offset of each submatch. */
size_t size[1]; /* Length of each submatch. */
};
@@ -40,9 +40,10 @@ typedef struct kwset *kwset_t;
extern kwset_t kwsalloc (char const *);
/* Incrementally extend the keyword set to include the given string.
- Return NULL for success, or an error message. Remember an index
- number for each keyword included in the set. */
-extern const char *kwsincr (kwset_t, char const *, size_t);
+ Return NULL for success, or an error message. The final arg is
+ a flag that determines whether or not kwsmatch.flag should be set
+ for matches on this keyword. */
+extern const char *kwsincr (kwset_t, char const *, size_t, int);
/* When the keyword set has been completely built, prepare it for
use. Return NULL for success, or an error message. */
--
1.5.6.3
- [PATCH 03/15] Lay out the trie more systematically, (continued)
- [PATCH 03/15] Lay out the trie more systematically, Nick Cleaton, 2010/10/02
- [PATCH 04/15] Lay out the trees more systematically, Nick Cleaton, 2010/10/02
- [PATCH 05/15] Avoid using llink and rlink during prep, Nick Cleaton, 2010/10/02
- [PATCH 06/15] Defer llink/rlink plumbing to the end of prep, Nick Cleaton, 2010/10/02
- [PATCH 07/15] Share storage between llink and fail, Nick Cleaton, 2010/10/02
- [PATCH 08/15] Avoid using trie->depth during matching, Nick Cleaton, 2010/10/02
- [PATCH 09/15] Share storage between rlink and depth, Nick Cleaton, 2010/10/02
- [PATCH 10/15] Eliminate the trie->parent field, Nick Cleaton, 2010/10/02
- [PATCH 11/15] Eliminate the trie->next field, Nick Cleaton, 2010/10/02
- [PATCH 12/15] Eliminate the trie->maxshift field, Nick Cleaton, 2010/10/02
- [PATCH 13/15] Reduce trie->accepting to a flag,
Nick Cleaton <=
- [PATCH 14/15] Merge flags into a trie->flags field, Nick Cleaton, 2010/10/02
- [PATCH 15/15] Make trie->shift an unsigned short, Nick Cleaton, 2010/10/02
- Re: Proof of concept: kwset running 3 times smaller, Nick Cleaton, 2010/10/30