[elpa] externals/trie 3a734c3 077/111: Implement trie-fuzzy-match and tr

emacs-elpa-diffs
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[elpa] externals/trie 3a734c3 077/111: Implement trie-fuzzy-match and tr

From:	Stefan Monnier
Subject:	[elpa] externals/trie 3a734c3 077/111: Implement trie-fuzzy-match and trie-fuzzy-complete functions.
Date:	Mon, 14 Dec 2020 11:35:24 -0500 (EST)
branch: externals/trie
commit 3a734c3fe82f4e19c467f9b165f675f8e3af5a15
Author: Toby S. Cubitt <toby-predictive@dr-qubit.org>
Commit: Toby S. Cubitt <toby-predictive@dr-qubit.org>

    Implement trie-fuzzy-match and trie-fuzzy-complete functions.
    
    Searches a trie for matches or completions within a given Lewenstein 
distance
    of a string.
---
 trie.el | 528 ++++++++++++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 415 insertions(+), 113 deletions(-)

diff --git a/trie.el b/trie.el
index 33d72b3..0cf1dd8 100644
--- a/trie.el
+++ b/trie.el
@@ -48,9 +48,9 @@
 ;; `trie-stack', you can create an object that allows the contents of the trie
 ;; to be used like a stack, useful for building other algorithms on top of
 ;; tries; `trie-stack-pop' pops elements off the stack one-by-one, in
-;; "lexical" order, whilst `trie-stack-push' pushes things onto the
+;; "lexicographic" order, whilst `trie-stack-push' pushes things onto the
 ;; stack. Similarly, `trie-complete-stack', and `trie-regexp-stack' create
-;; "lexically-ordered" stacks of query results.
+;; "lexicographicly-ordered" stacks of query results.
 ;;
 ;; Note that there are two uses for a trie: as a lookup table, in which case
 ;; only the presence or absence of a key in the trie is significant, or as an
@@ -145,6 +145,7 @@
 
 
 
+
 ;;; ================================================================
 ;;;                   Pre-defined trie types
 
@@ -167,6 +168,7 @@
 
 
 
+
 ;;; ================================================================
 ;;;           Internal utility functions and macros
 
@@ -203,17 +205,17 @@
    (:constructor trie--create-custom
                 (comparison-function
                  &key
-                 (createfun 'avl-tree-create-bare)
-                 (insertfun 'avl-tree-enter)
-                 (deletefun 'avl-tree-delete)
-                 (lookupfun 'avl-tree-member)
-                 (mapfun 'avl-tree-mapc)
-                 (emptyfun 'avl-tree-empty)
-                 (stack-createfun 'avl-tree-stack)
-                 (stack-popfun 'avl-tree-stack-pop)
-                 (stack-emptyfun 'avl-tree-stack-empty-p)
-                 (transform-for-print 'trie--avl-transform-for-print)
-                 (transform-from-read 'trie--avl-transform-from-read)
+                 (createfun #'avl-tree-create-bare)
+                 (insertfun #'avl-tree-enter)
+                 (deletefun #'avl-tree-delete)
+                 (lookupfun #'avl-tree-member)
+                 (mapfun #'avl-tree-mapc)
+                 (emptyfun #'avl-tree-empty)
+                 (stack-createfun #'avl-tree-stack)
+                 (stack-popfun #'avl-tree-stack-pop)
+                 (stack-emptyfun #'avl-tree-stack-empty-p)
+                 (transform-for-print #'trie--avl-transform-for-print)
+                 (transform-from-read #'trie--avl-transform-from-read)
                  &aux
                  (cmpfun (trie--wrap-cmpfun comparison-function))
                  (root (trie--node-create-root createfun cmpfun))
@@ -257,15 +259,16 @@
 (if (trie-lexical-binding-p)
     (defun trie--construct-equality-function (comparison-function)
       (lambda (a b)
-        (and (not (funcall comparison-function a b))
-             (not (funcall comparison-function b a)))))
+        (not (or (funcall comparison-function a b)
+                 (funcall comparison-function b a)))))
   (defun trie--construct-equality-function (comparison-function)
     `(lambda (a b)
-       (and (not (,comparison-function a b))
-           (not (,comparison-function b a))))))
+       (not (or (,comparison-function a b)
+               (,comparison-function b a))))))
 
 
 
+
 ;;; ----------------------------------------------------------------
 ;;;          Functions and macros for handling a trie node.
 
@@ -310,10 +313,9 @@
 (defun trie--node-find (node seq lookupfun)
   ;; Returns the node below NODE corresponding to SEQ, or nil if none
   ;; found.
-  (let ((len (length seq))
-       (i -1))
+  (let ((i -1))
     ;; descend trie until we find SEQ or run out of trie
-    (while (and node (< (incf i) len))
+    (while (and node (< (incf i) (length seq)))
       (setq node
            (funcall lookupfun
                     (trie--node-subtree node)
@@ -339,6 +341,7 @@
 
 
 
+
 ;;; ----------------------------------------------------------------
 ;;;              print/read transformation functions
 
@@ -385,6 +388,7 @@
 
 
 
+
 ;;; ----------------------------------------------------------------
 ;;;                Replacements for CL functions
 
@@ -419,7 +423,7 @@ If START or END is negative, it counts from the end."
 (defun trie--position (item list)
   "Find the first occurrence of ITEM in LIST.
 Return the index of the matching item, or nil of not found.
-Comparison is done with 'equal."
+Comparison is done with `equal'."
   (let ((i 0))
     (catch 'found
       (while (progn
@@ -441,13 +445,13 @@ Comparison is done with 'equal."
   "Concatenate SEQ and SEQUENCES, and make the result the same
 type of sequence as SEQ."
   (cond
-   ((stringp seq) (apply 'concat  seq sequences))
-   ((vectorp seq) (apply 'vconcat seq sequences))
-   ((listp seq)          (apply 'append  seq sequences))))
-
+   ((stringp seq) (apply #'concat  seq sequences))
+   ((vectorp seq) (apply #'vconcat seq sequences))
+   ((listp seq)          (apply #'append  seq sequences))))
 
 
 
+
 ;;; ================================================================
 ;;;                     Basic trie operations
 
@@ -647,6 +651,7 @@ reversed if REVERSE is non-nil."
 
 
 
+
 ;; ----------------------------------------------------------------
 ;;                        Inserting data
 
@@ -705,6 +710,7 @@ bind any variables with names commencing \"--\"."
 
 
 
+
 ;; ----------------------------------------------------------------
 ;;                        Deleting data
 
@@ -819,7 +825,7 @@ also `trie-member-p', which does this for you.)"
 
 
 
-
+
 ;;; ================================================================
 ;;;                      Mapping over tries
 
@@ -960,7 +966,7 @@ trie, and its associated data.
 
 Optional argument TYPE (one of the symbols vector, lisp or
 string; defaults to vector) sets the type of sequence passed to
-FUNCTION. If TYPE is 'string, it must be possible to apply the
+FUNCTION. If TYPE is string, it must be possible to apply the
 function `string' to the individual elements of key sequences
 stored in TRIE.
 
@@ -1007,7 +1013,7 @@ Note that if you don't care about the order in which 
FUNCTION is
 applied, just that the resulting list is in the correct order,
 then
 
-  (trie-mapf function 'cons trie type (not reverse))
+  (trie-mapf function #'cons trie type (not reverse))
 
 is more efficient.
 
@@ -1016,11 +1022,11 @@ bind any variables with names commencing \"--\"."
   ;; convert from print-form if necessary
   (trie-transform-from-read-warn trie)
   ;; map FUNCTION over TRIE and accumulate in a list
-  (nreverse (trie-mapf function 'cons trie type reverse)))
-
+  (nreverse (trie-mapf function #'cons trie type reverse)))
 
 
 
+
 ;;; ================================================================
 ;;;                    Using tries as stacks
 
@@ -1038,7 +1044,7 @@ bind any variables with names commencing \"--\"."
              (stack-createfun (trie--stack-createfun trie))
              (stack-popfun (trie--stack-popfun trie))
              (stack-emptyfun (trie--stack-emptyfun trie))
-             (repopulatefun 'trie--stack-repopulate)
+             (repopulatefun #'trie--stack-repopulate)
              (store
               (if (trie-empty trie)
                   nil
@@ -1067,7 +1073,7 @@ bind any variables with names commencing \"--\"."
              (stack-createfun (trie--stack-createfun trie))
              (stack-popfun (trie--stack-popfun trie))
              (stack-emptyfun (trie--stack-emptyfun trie))
-             (repopulatefun 'trie--stack-repopulate)
+             (repopulatefun #'trie--stack-repopulate)
              (store (trie--completion-stack-construct-store
                      trie prefix reverse))
              (pushed '())
@@ -1083,7 +1089,7 @@ bind any variables with names commencing \"--\"."
              (stack-createfun (trie--stack-createfun trie))
              (stack-popfun (trie--stack-popfun trie))
              (stack-emptyfun (trie--stack-emptyfun trie))
-             (repopulatefun 'trie--regexp-stack-repopulate)
+             (repopulatefun #'trie--regexp-stack-repopulate)
              (store (trie--regexp-stack-construct-store
                      trie regexp reverse))
              (pushed '())
@@ -1097,24 +1103,27 @@ bind any variables with names commencing \"--\"."
 (defun trie-stack (trie &optional type reverse)
   "Return an object that allows TRIE to be accessed as a stack.
 
-The stack is sorted in \"lexical\" order, i.e. the order defined
-by the trie's comparison function, or in reverse order if REVERSE
-is non-nil. Calling `trie-stack-pop' pops the top element (a key
-and its associated data) from the stack.
+The stack is sorted in \"lexicographic\" order, i.e. the order
+defined by the trie's comparison function, or in reverse order if
+REVERSE is non-nil. Calling `trie-stack-pop' pops the top element
+\(a cons cell containing a key and its associated data\) from the
+stack.
 
-Optional argument TYPE (one of the symbols vector, lisp or
-string) sets the type of sequence used for the keys.
+Optional argument TYPE \(one of the symbols vector, lisp or
+string\) sets the type of sequence used for the keys. \(If TYPE
+is string, it must be possible to apply `string' to individual
+elements of TRIE keys.\)
 
 Note that any modification to TRIE *immediately* invalidates all
-trie-stacks created before the modification (in particular,
-calling `trie-stack-pop' will give unpredictable results).
+trie-stacks created before the modification \(in particular,
+calling `trie-stack-pop' will give unpredictable results\).
 
 Operations on trie-stacks are significantly more efficient than
 constructing a real stack from the trie and using standard stack
 functions. As such, they can be useful in implementing efficient
-algorithms on tries. However, in cases where mapping functions
+algorithms over tries. However, in cases where mapping functions
 `trie-mapc', `trie-mapcar' or `trie-mapf' would be sufficient, it
-is better to use one of those instead."
+may be better to use one of those instead."
   ;; convert trie from print-form if necessary
   (trie-transform-from-read-warn trie)
   ;; if stack functions aren't defined for trie type, throw error
@@ -1213,22 +1222,21 @@ element stored in the trie.)"
 
 
 
-
+
 ;; ================================================================
 ;;                   Query-building utility macros
 
 ;; Implementation Note
 ;; -------------------
-;; For queries ranked in anything other than lexical order, we use a
-;; partial heap-sort to find the k=MAXNUM highest ranked matches among
-;; the n possibile matches. This has worst-case time complexity
-;; O(n log k), and is both simple and elegant. An optimal algorithm
-;; (e.g. partial quick-sort discarding the irrelevant partition at each
-;; step) would have complexity O(n + k log k), but is probably not worth
-;; the extra coding effort, and would have worse space complexity unless
-;; coded to work "in-place", which would be highly non-trivial. (I
-;; haven't done any benchmarking, though, so feel free to do so and let
-;; me know the results!)
+;; For queries ranked in anything other than lexicographic order, we use a
+;; partial heap-sort to find the k=MAXNUM highest ranked matches among the n
+;; possibile matches. This has worst-case time complexity O(n log k), and is
+;; both simple and elegant. An optimal algorithm (e.g. partial quick-sort
+;; discarding the irrelevant partition at each step) would have complexity O(n
+;; + k log k), but is probably not worth the extra coding effort, and would
+;; have worse space complexity unless coded to work "in-place", which would be
+;; highly non-trivial. (I haven't done any benchmarking, though, so feel free
+;; to do so and let me know the results!)
 
 (defun trie--construct-accumulator (maxnum filter resultfun)
   ;; Does what it says on the tin! | sed -e 's/tin/macro name/'
@@ -1242,7 +1250,7 @@ element stored in the trie.)"
              (cons (funcall resultfun seq data)
                    (aref trie--accumulate 0)))
        (and (>= (length (aref trie--accumulate 0)) maxnum)
-            (throw 'trie-accumulate--done nil)))))
+            (throw 'trie--accumulate-done nil)))))
    ;; filter, maxnum, !resultfun
    ((and filter maxnum (not resultfun))
     (lambda (seq data)
@@ -1251,7 +1259,7 @@ element stored in the trie.)"
              (cons (cons seq data)
                    (aref trie--accumulate 0)))
        (and (>= (length (aref trie--accumulate 0)) maxnum)
-            (throw 'trie-accumulate--done nil)))))
+            (throw 'trie--accumulate-done nil)))))
    ;; filter, !maxnum, resultfun
    ((and filter (not maxnum) resultfun)
     (lambda (seq data)
@@ -1273,7 +1281,7 @@ element stored in the trie.)"
            (cons (funcall resultfun seq data)
                  (aref trie--accumulate 0)))
       (and (>= (length (aref trie--accumulate 0)) maxnum)
-          (throw 'trie-accumulate--done nil))))
+          (throw 'trie--accumulate-done nil))))
    ;; !filter, maxnum, !resultfun
    ((and (not filter) maxnum (not resultfun))
     (lambda (seq data)
@@ -1281,7 +1289,7 @@ element stored in the trie.)"
            (cons (cons seq data)
                  (aref trie--accumulate 0)))
       (and (>= (length (aref trie--accumulate 0)) maxnum)
-          (throw 'trie-accumulate--done nil))))
+          (throw 'trie--accumulate-done nil))))
    ;; !filter, !maxnum, resultfun
    ((and (not filter) (not maxnum) resultfun)
     (lambda (seq data)
@@ -1329,18 +1337,18 @@ element stored in the trie.)"
 
 (defmacro trie--accumulate-results
   (rankfun maxnum reverse filter resultfun accfun duplicates &rest body)
-  ;; Accumulate results of running BODY code, and return them in
-  ;; appropriate order. BODY should call ACCFUN to accumulate a result,
-  ;; passing it two arguments: a trie data node, and the corresponding
-  ;; sequence. BODY can throw 'trie-accumulate--done to terminate the
-  ;; accumulation and return the results. A non-null DUPLICATES flag
-  ;; signals that the accumulated results might contain duplicates,
-  ;; which should be deleted. Note that DUPLICATES is ignored if RANKFUN
-  ;; is null. The other arguments should be passed straight through from
-  ;; the query function.
-
-  ;; rename functions to help avoid dynamic-scoping bugs
-  ;; FIXME: not needed with lexical scoping
+  ;; Accumulate results of running BODY code, and return them in appropriate
+  ;; order. BODY should call ACCFUN to accumulate a result, passing it two
+  ;; arguments: a trie key and its associated data. BODY can throw
+  ;; trie--accumulate-done to terminate the accumulation and return the
+  ;; results. A non-null DUPLICATES flag signals that the accumulated results
+  ;; might contain duplicates, which should be deleted. Note that DUPLICATES
+  ;; is ignored if RANKFUN is null, and that duplicates *do* count towards
+  ;; MAXNUM. The remaining arguments have the usual meanings, and should be
+  ;; passed straight through from the query function's arguments.
+
+  ;; rename functions to help avoid dynamic-scoping bugs FIXME: not needed
+  ;; with lexical scoping
   `(let* ((--trie-accumulate--rankfun ,rankfun)
          (--trie-accumulate--filter ,filter)
          (--trie-accumulate--resultfun ,resultfun)
@@ -1365,7 +1373,7 @@ element stored in the trie.)"
              --trie-accumulate--resultfun))))
 
      ;; accumulate results
-     (catch 'trie-accumulate--done ,@body)
+     (catch 'trie--accumulate-done ,@body)
 
      ;; return list of results
      (cond
@@ -1390,14 +1398,14 @@ element stored in the trie.)"
                     results))))
         results))
 
-      ;; for lexical query, reverse result list if MAXNUM supplied
+      ;; for lexicographic query, reverse result list if MAXNUM supplied
       (,maxnum (nreverse (aref trie--accumulate 0)))
       ;; otherwise, just return list
       (t (aref trie--accumulate 0)))))
 
 
 
-
+
 ;; ================================================================
 ;;                          Completing
 
@@ -1405,10 +1413,10 @@ element stored in the trie.)"
   (trie prefix &optional rankfun maxnum reverse filter resultfun)
   "Return an alist containing all completions of PREFIX in TRIE
 along with their associated data, in the order defined by
-RANKFUN, defaulting to \"lexical\" order (i.e. the order defined
-by the trie's comparison function). If REVERSE is non-nil, the
-completions are sorted in the reverse order. Returns nil if no
-completions are found.
+RANKFUN, defaulting to \"lexicographic\" order \(i.e. the order
+defined by the trie's comparison function\). If REVERSE is
+non-nil, the completions are sorted in the reverse order. Returns
+nil if no completions are found.
 
 PREFIX must be a sequence (vector, list or string) containing
 elements of the type used to reference data in the trie. (If
@@ -1437,7 +1445,7 @@ included in the results, and does not count towards 
MAXNUM.
 
 RESULTFUN defines a function used to process results before
 adding them to the final result list. If specified, it should
-accept two arguments: a key and its associated data. It's return
+accept two arguments: a key and its associated data. Its return
 value is what gets added to the final result list, instead of the
 default key-data cons cell."
 
@@ -1451,7 +1459,7 @@ default key-data cons cell."
   (if (or (atom prefix)
          (and (listp prefix) (not (sequencep (car prefix)))))
       (setq prefix (list prefix))
-    ;; sort list of prefixes if sorting completions lexically
+    ;; sort list of prefixes if sorting completions lexicographicly
     (when (null rankfun)
       (setq prefix
            (sort prefix (trie-construct-sortfun
@@ -1479,30 +1487,31 @@ default key-data cons cell."
   "Return an object that allows completions of PREFIX to be accessed
 as if they were a stack.
 
-The stack is sorted in \"lexical\" order, i.e. the order defined
-by TRIE's comparison function, or in reverse order if REVERSE is
-non-nil. Calling `trie-stack-pop' pops the top element (a key and
-its associated data) from the stack.
+The stack is sorted in \"lexicographic\" order, i.e. the order
+defined by TRIE's comparison function, or in reverse order if
+REVERSE is non-nil. Calling `trie-stack-pop' pops the top element
+\(a cons cell containing the next completion and its associated
+data\) from the stack.
 
 PREFIX must be a sequence (vector, list or string) that forms the
-initial part of a TRIE key, or a list of such sequences. (If
+initial part of a TRIE key, or a list of such sequences. \(If
 PREFIX is a string, it must be possible to apply `string' to
-individual elements of TRIE keys.)  The completions returned in
-the alist will be sequences of the same type as KEY. If PREFIX is
-a list of sequences, completions of all sequences in the list are
-included in the stack. All sequences in the list must be of the
-same type.
+individual elements of TRIE keys.\) The completions returned by
+`trie-stack-pop' will be sequences of the same type as KEY. If
+PREFIX is a list of sequences, they must all be of the same
+type. In this case, completions of all sequences in the list are
+included in the stack.
 
 Note that any modification to TRIE *immediately* invalidates all
-trie-stacks created before the modification (in particular,
-calling `trie-stack-pop' will give unpredictable results).
+trie-stacks created before the modification \(in particular,
+calling `trie-stack-pop' will give unpredictable results\).
 
 Operations on trie-stacks are significantly more efficient than
 constructing a real stack from completions of PREFIX in TRIE and
 using standard stack functions. As such, they can be useful in
-implementing efficient algorithms on tries. However, in cases
-where `trie-complete' or `trie-complete-ordered' is sufficient,
-it is better to use one of those instead."
+implementing efficient algorithms over tries. However, in cases
+where `trie-complete' is sufficient, it is better to use that
+instead."
   ;; convert trie from print-form if necessary
   (trie-transform-from-read-warn trie)
   ;; if stack functions aren't defined for trie type, throw error
@@ -1541,7 +1550,7 @@ it is better to use one of those instead."
 
 
 
-
+
 ;; ================================================================
 ;;                        Regexp search
 
@@ -1549,10 +1558,10 @@ it is better to use one of those instead."
   (trie regexp &optional rankfun maxnum reverse filter resultfun)
   "Return an alist containing all matches for REGEXP in TRIE
 along with their associated data, in the order defined by
-RANKFUN, defauling to \"lexical\" order (i.e. the order defined
-by the trie's comparison function).  If REVERSE is non-nil, the
-results are sorted in the reverse order. Returns nil if no
-results are found.
+RANKFUN, defaulting to \"lexicographic\" order \(i.e. the order
+defined by the trie's comparison function\). If REVERSE is
+non-nil, the results are sorted in the reverse order. Returns nil
+if no results are found.
 
 REGEXP is a regular expression, but it need not necessarily be a
 string. It must be a sequence (vector, list, or string) whose
@@ -1597,7 +1606,7 @@ results, and does not count towards MAXNUM.
 
 RESULTFUN defines a function used to process results before
 adding them to the final result list. If specified, it should
-accept two arguments: a key and its associated data. It's return
+accept two arguments: a key and its associated data. Its return
 value is what gets added to the final result list, instead of the
 default key-data cons cell."
 
@@ -1629,8 +1638,8 @@ default key-data cons cell."
              (funcall --trie-regexp-search--rankfun a b))))
 
     ;; accumulate results
-    (trie--accumulate-results rankfun maxnum reverse
-                             filter resultfun accumulator nil
+    (trie--accumulate-results
+     rankfun maxnum reverse filter resultfun accumulator nil
      (trie--do-regexp-search
       (trie--root trie)
       (tNFA-from-regexp regexp :test (trie--construct-equality-function
@@ -1736,19 +1745,23 @@ default key-data cons cell."
   "Return an object that allows matches to REGEXP to be accessed
 as if they were a stack.
 
-The stack is sorted in \"lexical\" order, i.e. the order defined
-by TRIE's comparison function, or in reverse order if REVERSE is
-non-nil. Calling `trie-stack-pop' pops the top element (a cons
-cell containing a key and its associated data) from the stack.
+The stack is sorted in \"lexicographic\" order, i.e. the order
+defined by TRIE's comparison function, or in reverse order if
+REVERSE is non-nil. Calling `trie-stack-pop' pops the top element
+\(a cons cell containing a key and its associated data\) from the
+stack.
 
 REGEXP is a regular expression, but it need not necessarily be a
-string. It must be a sequence (vector, list of string) whose
-elements are either elements of the same type as elements of the
-trie keys (which behave as literals in the regexp), or any of the
-usual regexp special characters and backslash constructs. If
-REGEXP is a string, it must be possible to apply `string' to
+string. It must be a sequence \(vector, list or string\) whose
+elements either have the same type as elements of the trie keys
+\(which behave as literals in the regexp\), or are any of the
+usual regexp special characters \(character type\) or backslash
+constructs \(string type\).
+
+If REGEXP is a string, it must be possible to apply `string' to
 individual elements of the keys stored in the trie. The matches
-returned in the alist will be sequences of the same type as KEY.
+returned by `trie-stack-pop' will be sequences of the same type
+as KEY.
 
 Back-references and non-greedy postfix operators are *not*
 supported, and the matches are always anchored, so `$' and `^'
@@ -1756,7 +1769,7 @@ lose their special meanings.
 
 If the regexp contains any non-shy grouping constructs, subgroup
 match data is included in the results. In this case, the car of
-each match (as returned by a call to `trie-stack-pop' is no
+each match \(as returned by a call to `trie-stack-pop'\) is no
 longer just a key. Instead, it is a list whose first element is
 the matching key, and whose remaining elements are cons cells
 whose cars and cdrs give the start and end indices of the
@@ -1878,6 +1891,295 @@ elements that matched the corresponding groups, in 
order."
 
 
 
+
+;; ================================================================
+;;                        Fuzzy matching
+
+;; Implementation Note
+;; -------------------
+;; The standard dynamical-programming solution to computing Lewenstein
+;; distance constructs a table of Lewenstein distances to successive prefixes
+;; of the target string, row-by-row. Our trie search algorithms are based on
+;; constructing the next row of this table as we (recursively) descend the
+;; trie. Since the each row only depends on entries in the previous row, we
+;; only need to pass a single row of the table down the recursion stack. (A
+;; nice description of this algorithm can be found at
+;; http://stevehanov.ca/blog/index.php?id=114.)
+;;
+;; I haven't benchmarked this (let me know the results if you do!), but it
+;; seems clear that this algorithm will be much faster than constructing a
+;; Lewenstein automata and stepping through it as we descend the trie
+;; (similarly to regexp searches, cf. `trie-regexp-match'.)
+
+
+(defun trie-fuzzy-match
+  (trie string distance &optional rankfun maxnum reverse filter resultfun)
+  "Return matches for STRING in TRIE within Lewenstein DISTANCE
+\(edit distance\) of STRING along with their associated data, in
+the order defined by RANKFUN, defaulting to \"lexicographic\"
+order \(i.e. the order defined by the trie's comparison
+function\). If REVERSE is non-nil, the results are sorted in the
+reverse order. Returns nil if no results are found.
+
+STRING is a sequence (vector, list or string), whose elements are
+of the same type as elements of the trie keys. If STRING is a
+string, it must be possible to apply `string' to individual
+elements of the keys stored in the trie. The matches returned in
+the alist will be sequences of the same type as STRING.
+
+DISTANCE must be an integer.
+
+The optional integer argument MAXNUM limits the results to the
+first MAXNUM matches. Otherwise, all matches are returned.
+
+RANKFUN overrides the default ordering of the results. If it is
+`t', matches are instead ordered by increasing Lewenstein
+distance of their prefix \(with same-distance matches ordered
+lexicographically\).
+
+If RANKFUN is a function, it must accept two arguments, both of
+the form:
+
+    (KEY DIST . DATA)
+
+where KEY is a key from the trie, DIST is its Lewenstein
+distances from STRING, and DATA is its associated data. RANKFUN
+should return non-nil if first argument is ranked strictly higher
+than the second, nil otherwise.
+
+The FILTER argument sets a filter function for the matches. If
+supplied, it is called for each possible match with two
+arguments: a KEY and a (DIST . DATA) cons cell. If the filter
+function returns nil, the match is not included in the results,
+and does not count towards MAXNUM.
+
+RESULTFUN defines a function used to process results before
+adding them to the final result list. If specified, it should
+accept two arguments: a KEY and a (DIST . DATA) cons cell. Its
+return value is what gets added to the final result list, instead
+of the default key-dist-data list."
+
+  ;; convert trie from print-form if necessary
+  (trie-transform-from-read-warn trie)
+
+  ;; construct rankfun to sort by Lewenstein distance if requested
+  (when (eq rankfun t)
+    (setq rankfun `(lambda (a b)
+                    (cond
+                    ((< (cadr a) (cadr b)) t)
+                    ((> (cadr a) (cadr b)) nil)
+                    (t ,(trie-construct-sortfun
+                         (trie--comparison-function trie))
+                       (car a) (car b))))))
+
+  ;; accumulate results
+  (trie--accumulate-results
+   rankfun maxnum reverse filter resultfun accumulator nil
+   (funcall (trie--mapfun trie)
+           (lambda (node)
+             (trie--do-fuzzy-match
+              node
+              (apply #'vector (number-sequence 0 (length string)))
+              (cond ((stringp string) "") ((listp string) ()) (t []))
+              ;; FIXME: Would it pay to replace these arguments with
+              ;;        dynamically-scoped variables, to save stack space?
+              string distance (if maxnum reverse (not reverse))
+              (trie--comparison-function trie)
+              (trie--construct-equality-function
+               (trie--comparison-function trie))
+              (trie--lookupfun trie)
+              (trie--mapfun trie)
+              accumulator))
+           (trie--node-subtree (trie--root trie))
+           (if maxnum reverse (not reverse)))))
+
+
+(defun trie--do-fuzzy-match (node row seq string distance reverse
+                            cmpfun equalfun lookupfun mapfun accumulator)
+  ;; Search everything below NODE for matches within Lewenstein distance
+  ;; DISTANCE of STRING. ROW is the previous row of the Lewenstein table. SEQ
+  ;; is the sequence corresponding to NODE. If COMPLETE is non-nil, return
+  ;; completions of matches, otherwise return matches themselves. Remaining
+  ;; arguments are corresponding trie functions.
+
+  ;; if we're at a data node and SEQ is within DISTANCE of STRING (i.e. last
+  ;; entry of row is <= DISTANCE), accumulate result
+  (if (trie--node-data-p node)
+      (when (<= (aref row (1- (length row))) distance)
+       (funcall accumulator
+                seq (cons (aref row (1- (length row)))
+                          (trie--node-data node))))
+
+    ;; build next row of Lewenstein table
+    (let ((next-row (make-vector (length row) nil)))
+      (let ((i 0) inscost delcost subcost)
+       (aset next-row 0 (1+ (aref row 0)))
+       (while (< (incf i) (length row))
+         (setq inscost (1+ (aref next-row (1- i)))
+               delcost (1+ (aref row i))
+               subcost (if (funcall equalfun
+                                    (trie--node-split node)
+                                    (elt string (1- i)))
+                           (aref row (1- i))
+                         (1+ (aref row (1- i)))))
+         (aset next-row i (min inscost delcost subcost))))
+      (setq row next-row))
+    (setq seq (trie--seq-append seq (trie--node-split node)))
+
+    ;; as long as some row entry is < DISTANCE, recursively search below NODE
+    (when (< (apply #'min (append row nil)) distance)
+      (funcall mapfun
+              (lambda (n)
+                (trie--do-fuzzy-match
+                 n row seq string distance reverse
+                 cmpfun equalfun lookupfun mapfun accumulator))
+              (trie--node-subtree node)
+              reverse))))
+
+
+
+(defun trie-fuzzy-complete
+  (trie prefix distance &optional rankfun maxnum reverse filter resultfun)
+  "Return matches for PREFIX in TRIE within Lewenstein DISTANCE
+\(edit distance\) of PREFIX along with their associated data, in
+the order defined by RANKFUN, defaulting to \"lexicographic\"
+order \(i.e. the order defined by the trie's comparison
+function\). If REVERSE is non-nil, the results are sorted in the
+reverse order. Returns nil if no results are found.
+
+PREFIX is a sequence (vector, list or string), whose elements are
+of the same type as elements of the trie keys. If PREFIX is a
+string, it must be possible to apply `string' to individual
+elements of the keys stored in the trie. The matches returned in
+the alist will be sequences of the same type as PREFIX.
+
+DISTANCE must be an integer.
+
+The optional integer argument MAXNUM limits the results to the
+first MAXNUM matches. Otherwise, all matches are returned.
+
+RANKFUN overrides the default ordering of the results. If it is
+`t', matches are instead ordered by increasing Lewenstein
+distance of their prefix \(with same-distance matches ordered
+lexicographically\).
+
+If RANKFUN is a function, it must accepts two arguments, both of
+the form:
+
+    (KEY DIST . DATA)
+
+where KEY is a key from the trie, DIST is its Lewenstein
+distances from PREFIX, and DATA is its associated data. RANKFUN
+should return non-nil if first argument is ranked strictly higher
+than the second, nil otherwise.
+
+The FILTER argument sets a filter function for the matches. If
+supplied, it is called for each possible match with two
+arguments: a KEY and a (DIST . DATA) cons cell. If the filter
+function returns nil, the match is not included in the results,
+and does not count towards MAXNUM.
+
+RESULTFUN defines a function used to process results before
+adding them to the final result list. If specified, it should
+accept two arguments: a KEY and a (DIST . DATA) cons cell. Its
+return value is what gets added to the final result list, instead
+of the default key-dist-data list."
+
+  ;; convert trie from print-form if necessary
+  (trie-transform-from-read-warn trie)
+
+  ;; construct rankfun to sort by Lewenstein distance if requested
+  (when (eq rankfun t)
+    (setq rankfun `(lambda (a b)
+                    (cond
+                    ((< (cadr a) (cadr b)) t)
+                    ((> (cadr a) (cadr b)) nil)
+                    (t ,(trie-construct-sortfun
+                         (trie--comparison-function trie))
+                       (car a) (car b))))))
+
+  ;; accumulate results
+  (trie--accumulate-results
+   rankfun maxnum reverse filter resultfun accumulator nil
+   (funcall (trie--mapfun trie)
+           (lambda (node)
+             (trie--do-fuzzy-complete
+              node
+              (apply #'vector (number-sequence 0 (length prefix)))
+              (cond ((stringp prefix) "") ((listp prefix) ()) (t []))
+              (length prefix)
+              ;; FIXME: Would it pay to replace these arguments with
+              ;;        dynamically-scoped variables, to save stack space?
+              prefix distance (if maxnum reverse (not reverse))
+              (trie--comparison-function trie)
+              (trie--construct-equality-function
+               (trie--comparison-function trie))
+              (trie--lookupfun trie)
+              (trie--mapfun trie)
+              accumulator))
+           (trie--node-subtree (trie--root trie))
+           (if maxnum reverse (not reverse)))))
+
+
+(defun trie--do-fuzzy-complete (node row seq pfxcost prefix distance reverse
+                               cmpfun equalfun lookupfun mapfun accumulator)
+  ;; Search everything below NODE for completions of prefixes within
+  ;; Lewenstein distance DISTANCE of PREFIX. ROW is the previous row of the
+  ;; Lewenstein table. SEQ is the sequence corresponding to NODE. PFXCOST is
+  ;; minimum distance of any prefix of seq. Remaining arguments are
+  ;; corresponding trie functions.
+
+  ;; if we're at a data node and SEQ is within DISTANCE of STRING (i.e. last
+  ;; entry of row is <= DISTANCE), accumulate result
+  (if (trie--node-data-p node)
+      (when (<= (aref row (1- (length row))) distance)
+       (funcall accumulator
+                seq (cons (aref row (1- (length row)))
+                          (trie--node-data node))))
+
+    ;; build next row of Lewenstein table
+    (let ((next-row (make-vector (length row) nil)))
+      (let ((i 0) inscost delcost subcost)
+       (aset next-row 0 (1+ (aref row 0)))
+       (while (< (incf i) (length row))
+         (setq inscost (1+ (aref next-row (1- i)))
+               delcost (1+ (aref row i))
+               subcost (if (funcall equalfun
+                                    (trie--node-split node)
+                                    (elt prefix (1- i)))
+                           (aref row (1- i))
+                         (1+ (aref row (1- i)))))
+         (aset next-row i (min inscost delcost subcost))))
+      (setq row next-row))
+    (setq seq (trie--seq-append seq (trie--node-split node)))
+    (setq pfxcost (min pfxcost (aref row (1- (length row)))))
+
+    ;; as long as some row entry is < DISTANCE, recursively search below NODE
+    (if (< (apply #'min (append row nil)) distance)
+       (funcall mapfun
+                (lambda (n)
+                  (trie--do-fuzzy-complete
+                   n row seq pfxcost prefix distance reverse
+                   cmpfun equalfun lookupfun mapfun accumulator))
+                (trie--node-subtree node)
+                reverse)
+    ;; otherwise, accumulate all results below node
+    (if (<= (aref row (1- (length row))) distance)
+       (trie--mapc
+        (lambda (n s)
+          (funcall accumulator
+                   s (cons (aref row (1- (length row)))
+                           (trie--node-data n))))
+        mapfun node seq reverse)
+      ))))
+
+
+
+
+
+
+
+
 ;; ----------------------------------------------------------------
 ;;            Pretty-print tries during edebug
[Prev in Thread]
Current Thread
[Next in Thread]
[elpa] externals/trie 2a9d7ec 099/111: Port efficiency improvements to trie-fuzzy-match., (continued)
- [elpa] externals/trie 2a9d7ec 099/111: Port efficiency improvements to trie-fuzzy-match., Stefan Monnier, 2020/12/14
- [elpa] externals/trie a2554d6 094/111: Fix function symbol quoting., Stefan Monnier, 2020/12/14
- [elpa] externals/trie c6ddbb9 096/111: Bump version numbers., Stefan Monnier, 2020/12/14
- [elpa] externals/trie 94a1a86 087/111: Bump version numbers since we've added iterator generators., Stefan Monnier, 2020/12/14
- [elpa] externals/trie 4001f61 097/111: Fix corresponding bug in trie-fuzzy-complete-stack., Stefan Monnier, 2020/12/14
- [elpa] externals/trie 91d299c 104/111: Pretty-print trie nodes in edebug., Stefan Monnier, 2020/12/14
- [elpa] externals/trie fc9b218 032/111: Removed support for non-terminal * wildcards, Stefan Monnier, 2020/12/14
- [elpa] externals/trie 5a064c0 092/111: Fix bug in trie-delete return value., Stefan Monnier, 2020/12/14
- [elpa] externals/trie 9f49d95 086/111: Implement iterator generators on collection data structures., Stefan Monnier, 2020/12/14
- [elpa] externals/trie 2957aec 103/111: Fix bugs in trie-fuzzy-match/complete., Stefan Monnier, 2020/12/14
- [elpa] externals/trie 3a734c3 077/111: Implement trie-fuzzy-match and trie-fuzzy-complete functions., Stefan Monnier <=
- [elpa] externals/trie 9259d51 088/111: Improve edebug pretty-printing., Stefan Monnier, 2020/12/14
- [elpa] externals/trie 81899c0 110/111: * packages/trie/trie.el (trie--if-lexical-binding): Simplify, Stefan Monnier, 2020/12/14
- [elpa] externals/trie 31c4ac2 024/111: Implemented trie-wildcard-stacks!, Stefan Monnier, 2020/12/14
- [elpa] externals/trie a438b01 090/111: Fix bugs in lexical binding support(?), Stefan Monnier, 2020/12/14
- [elpa] externals/trie ee4b459 106/111: Allow pruning of trie branches in queries., Stefan Monnier, 2020/12/14
- [elpa] externals/trie 929cb78 101/111: Rename to trie--map-internal to clarify not for public use., Stefan Monnier, 2020/12/14
- [elpa] externals/trie 14c4bec 109/111: Fix lexical binding bugs., Stefan Monnier, 2020/12/14
Prev by Date: [elpa] externals/trie 2957aec 103/111: Fix bugs in trie-fuzzy-match/complete.
Next by Date: [elpa] externals/trie 9259d51 088/111: Improve edebug pretty-printing.
Previous by thread: [elpa] externals/trie 2957aec 103/111: Fix bugs in trie-fuzzy-match/complete.
Next by thread: [elpa] externals/trie 9259d51 088/111: Improve edebug pretty-printing.
Index(es):
- Date
- Thread