[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH 05/15] Avoid using llink and rlink during prep
From: |
Nick Cleaton |
Subject: |
[PATCH 05/15] Avoid using llink and rlink during prep |
Date: |
Sat, 2 Oct 2010 07:07:44 +0100 |
Laying groundwork for struct trie size reductions based on sharing
storage between fields used only at prep time and fields used only
at match time:
Don't make any use of the trie->llink and trie->rlink pointers during
the trie preparation process.
---
src/kwset.c | 144 ++++++++++++++++++++++++-----------------------------------
1 files changed, 59 insertions(+), 85 deletions(-)
diff --git a/src/kwset.c b/src/kwset.c
index 079c28a..f36f036 100644
--- a/src/kwset.c
+++ b/src/kwset.c
@@ -60,6 +60,7 @@ struct trie
struct trie *llink; /* Left tree link (to smaller labels). */
struct trie *rlink; /* Right tree link (to larger labels). */
unsigned char label; /* Label on the incoming edge. */
+ unsigned char is_lastkid; /* True for the last of a group of siblings. */
unsigned int accepting; /* Word index of accepted word, or zero. */
struct trie *links; /* Tree of edges leaving this node. */
struct trie *parent; /* Parent of this node. */
@@ -254,6 +255,7 @@ alloc_and_plumb_siblings (struct kwset *kwset, int count,
int depth)
t->rlink = alloc_and_plumb_siblings (kwset, rcount, depth);
t->depth = depth;
+ t->is_lastkid = 0;
t->links = kwset->next_prealloced;
kwset->next_prealloced = t;
t->llink = alloc_and_plumb_siblings (kwset, lcount, depth);
@@ -263,8 +265,8 @@ alloc_and_plumb_siblings (struct kwset *kwset, int count,
int depth)
/* Used to allocate each trie node from the node vector when building the
trie from the sorted reversed keyword list, and to plumb the new node
- into the trie. Responsible for setting 'llink' and 'rlink', and for
- updating 'links' in the parent node. */
+ into the trie. Responsible for setting 'llink', 'rlink' and 'is_lastkid',
+ and for updating 'links' in the parent node. */
static struct trie *
allocate_and_plumb_trie_node (struct kwset *kwset, int index, int depth,
struct trie *parent)
@@ -284,6 +286,7 @@ allocate_and_plumb_trie_node (struct kwset *kwset, int
index, int depth,
parent->links = alloc_and_plumb_siblings (kwset,
kwset->bn_cursor->count,
depth);
+ kwset->next_free_node[-1].is_lastkid = 1;
kwset->bn_cursor++;
}
@@ -299,6 +302,7 @@ allocate_and_plumb_trie_node (struct kwset *kwset, int
index, int depth,
/* This node is an only child. */
parent->links = kwset->next_free_node++;
parent->links->llink = parent->links->rlink = NULL;
+ parent->links->is_lastkid = 1;
return parent->links;
}
@@ -365,6 +369,7 @@ build_trie_from_sorted_keywords (struct kwset *kwset)
kwset->trie->fail = NULL;
kwset->trie->depth = 0;
kwset->trie->shift = 0;
+ kwset->trie->is_lastkid = 1;
/* Set up a vector by depth of the last trie node visited. */
MALLOC (kwset->prev_kw_tries, kwset->maxd + 1);
@@ -393,97 +398,54 @@ build_trie_from_sorted_keywords (struct kwset *kwset)
return NULL;
}
-/* Enqueue the trie nodes referenced from the given tree in the
- given queue. */
+/* Compute the Aho-Corasick failure function for the given trie node,
+ given the failure function for its parent as well as a last resort
+ failure node. */
static void
-enqueue (struct trie *tree, struct trie **last)
-{
- if (!tree)
- return;
- enqueue(tree->llink, last);
- enqueue(tree->rlink, last);
- (*last) = (*last)->next = tree;
-}
-
-/* Compute the Aho-Corasick failure function for the trie nodes referenced
- from the given tree, given the failure function for their parent as
- well as a last resort failure node. */
-static void
-treefails (struct trie *tree, struct trie const *fail,
- struct trie *recourse)
+triefail (struct trie *trie, struct trie const *fail, struct trie *recourse)
{
struct trie *link;
- if (!tree)
- return;
-
- treefails(tree->llink, fail, recourse);
- treefails(tree->rlink, fail, recourse);
-
/* Find, in the chain of fails going back to the root, the first
node that has a descendent on the current label. */
while (fail)
{
- link = fail->links;
- while (link && tree->label != link->label)
- if (tree->label < link->label)
- link = link->llink;
- else
- link = link->rlink;
- if (link)
+ if ((link = fail->links))
{
- tree->fail = link;
- return;
+ do
+ {
+ if (trie->label == link->label)
+ {
+ trie->fail = link;
+ return;
+ }
+ }
+ while (!link++->is_lastkid);
}
fail = fail->fail;
}
- tree->fail = recourse;
-}
-
-/* Set delta entries for the links of the given tree such that
- the preexisting delta value is larger than the current depth. */
-static void
-treedelta (struct trie const *tree,
- unsigned int depth,
- unsigned char delta[])
-{
- if (!tree)
- return;
- treedelta(tree->llink, depth, delta);
- treedelta(tree->rlink, depth, delta);
- if (depth < delta[tree->label])
- delta[tree->label] = depth;
+ trie->fail = recourse;
}
/* Return true if A has every label in B. */
static int
hasevery (struct trie const *a, struct trie const *b)
{
- if (!b)
- return 1;
- if (!hasevery(a, b->llink))
- return 0;
- if (!hasevery(a, b->rlink))
- return 0;
- while (a && b->label != a->label)
- if (b->label < a->label)
- a = a->llink;
- else
- a = a->rlink;
- return !!a;
-}
+ unsigned char a_has[NCHAR];
+ int i;
-/* Compute a vector, indexed by character code, of the trie nodes
- referenced from the given tree. */
-static void
-treenext (struct trie *tree, struct trie *next[])
-{
- if (!tree)
- return;
- treenext(tree->llink, next);
- treenext(tree->rlink, next);
- next[tree->label] = tree;
+ memset (a_has, 0, NCHAR);
+ do
+ a_has[a->label] = 1;
+ while (!a++->is_lastkid);
+
+ do
+ if (!a_has[b->label])
+ return 0;
+ while (!b++->is_lastkid);
+
+ return 1;
}
/* Compute the shift for each trie node, as well as the delta
@@ -530,7 +492,7 @@ kwsprep (kwset_t kws)
else
{
struct trie *fail;
- struct trie *last, *next[NCHAR];
+ struct trie *last, *next[NCHAR], *child;
if ((err = build_trie_from_sorted_keywords (kwset)))
return err;
@@ -542,17 +504,26 @@ kwsprep (kwset_t kws)
computing the delta table, failure function, and shift function. */
for (curr = last = kwset->trie; curr; curr = curr->next)
{
- /* Enqueue the immediate descendents in the level order queue. */
- enqueue(curr->links, &last);
-
curr->shift = kwset->mind;
curr->maxshift = kwset->mind;
- /* Update the delta table for the descendents of this node. */
- treedelta(curr->links, curr->depth, delta);
+ /* Loop over the immediate descendents of this node. */
+ if ((child = curr->links))
+ {
+ do
+ {
+ /* Enqueue the descendent in the level order queue. */
+ last = last->next = child;
- /* Compute the failure function for the decendents of this node. */
- treefails(curr->links, curr->fail, kwset->trie);
+ /* Update the delta table. */
+ if (curr->depth < delta[child->label])
+ delta[child->label] = curr->depth;
+
+ /* Compute the failure function for this node. */
+ triefail (child, curr->fail, kwset->trie);
+ }
+ while (!child++->is_lastkid);
+ }
/* Update the shifts at each node in the current node's chain
of fails back to the root. */
@@ -561,8 +532,8 @@ kwsprep (kwset_t kws)
/* If the current node has some outgoing edge that the fail
doesn't, then the shift at the fail should be no larger
than the difference of their depths. */
- if (!hasevery(fail->links, curr->links))
- if (curr->depth - fail->depth < fail->shift)
+ if (curr->links && curr->depth - fail->depth < fail->shift)
+ if (!fail->links || !hasevery(fail->links, curr->links))
fail->shift = curr->depth - fail->depth;
/* If the current node is accepting then the shift at the
@@ -587,7 +558,10 @@ kwsprep (kwset_t kws)
from the root node. */
for (i = 0; i < NCHAR; ++i)
next[i] = NULL;
- treenext(kwset->trie->links, next);
+ if ((curr = kwset->trie->links))
+ do
+ next[curr->label] = curr;
+ while (!curr++->is_lastkid);
if ((trans = kwset->trans) != NULL)
for (i = 0; i < NCHAR; ++i)
--
1.5.6.3
- Proof of concept: kwset running 3 times smaller, Nick Cleaton, 2010/10/02
- [PATCH 01/15] Merge trie and tree into a single struct, Nick Cleaton, 2010/10/02
- [PATCH 02/15] Sort the keywords before building the trie, Nick Cleaton, 2010/10/02
- [PATCH 03/15] Lay out the trie more systematically, Nick Cleaton, 2010/10/02
- [PATCH 04/15] Lay out the trees more systematically, Nick Cleaton, 2010/10/02
- [PATCH 05/15] Avoid using llink and rlink during prep,
Nick Cleaton <=
- [PATCH 06/15] Defer llink/rlink plumbing to the end of prep, Nick Cleaton, 2010/10/02
- [PATCH 07/15] Share storage between llink and fail, Nick Cleaton, 2010/10/02
- [PATCH 08/15] Avoid using trie->depth during matching, Nick Cleaton, 2010/10/02
- [PATCH 09/15] Share storage between rlink and depth, Nick Cleaton, 2010/10/02
- [PATCH 10/15] Eliminate the trie->parent field, Nick Cleaton, 2010/10/02
- [PATCH 11/15] Eliminate the trie->next field, Nick Cleaton, 2010/10/02
- [PATCH 12/15] Eliminate the trie->maxshift field, Nick Cleaton, 2010/10/02
- [PATCH 13/15] Reduce trie->accepting to a flag, Nick Cleaton, 2010/10/02
- [PATCH 14/15] Merge flags into a trie->flags field, Nick Cleaton, 2010/10/02
- [PATCH 15/15] Make trie->shift an unsigned short, Nick Cleaton, 2010/10/02