>From e154cabb2171e16319866c5fce6ee3b50b73da0a Mon Sep 17 00:00:00 2001 From: Peter Bex Date: Sun, 22 Jun 2014 18:02:33 +0200 Subject: [PATCH] Fix #878 which was indeed a bug, caused by an incorrect hand-rolled optimisation. This adds some more "integration" test cases so that we can verify more easily that the combination of make-kmp-restart-vector and kmp-step is looping through the pattern correctly as it advances through the search string. The optimisation was due to a mistaken reading of the reference implementation: the pattern was indexed as pat[k+1] at the j=-1 case and as pat[k] at the pat[k]=pat[j+start] case, but the optimisation changed the code to use pat[k] in both cases. --- NEWS | 1 + srfi-13.scm | 43 +++++++++++++++-------------- tests/srfi-13-tests.scm | 70 +++++++++++++++++++++++++++++++++++++++-------- 3 files changed, 82 insertions(+), 32 deletions(-) diff --git a/NEWS b/NEWS index a9ded9e..693ee40 100644 --- a/NEWS +++ b/NEWS @@ -6,6 +6,7 @@ - Unit tcp now implicitly depends on ports instead of extras. This may break programs which don't use modules and forgot to require extras but use procedures from it. + - Fixed bug in make-kmp-restart-vector from SRFI-13. - Unit lolevel: - Restore long-lost but still documented "vector-like?" procedure (#983) diff --git a/srfi-13.scm b/srfi-13.scm index 65b748f..dec54b2 100644 --- a/srfi-13.scm +++ b/srfi-13.scm @@ -1404,35 +1404,36 @@ ((c= char=?) rest) ; (procedure? c=)) (receive (rest2 start end) (string-parse-start+end make-kmp-restart-vector pattern rest) (let* ((rvlen (- end start)) - (rv (make-vector rvlen -1))) + (rv (make-vector rvlen -1))) (if (> rvlen 0) (let ((rvlen-1 (- rvlen 1)) (c0 (string-ref pattern start))) ;; Here's the main loop. We have set rv[0] ... rv[i]. ;; K = I + START -- it is the corresponding index into PATTERN. - (let lp1 ((i 0) (j -1) (k start)) + (let lp1 ((i 0) (j -1) (k start)) (if (< i rvlen-1) - (let ((ck (string-ref pattern k))) - ;; lp2 invariant: - ;; pat[(k-j) .. k-1] matches pat[start .. start+j-1] - ;; or j = -1. - (let lp2 ((j j)) - - (cond ((= j -1) - (let ((i1 (+ i 1))) - (vector-set! rv i1 (if (c= ck c0) -1 0)) - (lp1 i1 0 (+ k 1)))) - - ;; pat[(k-j) .. k] matches pat[start..start+j]. - ((c= ck (string-ref pattern (+ j start))) - (let* ((i1 (+ 1 i)) - (j1 (+ 1 j))) - (vector-set! rv i1 j1) - (lp1 i1 j1 (+ k 1)))) - - (else (lp2 (vector-ref rv j)))))))))) + ;; lp2 invariant: + ;; pat[(k-j) .. k-1] matches pat[start .. start+j-1] + ;; or j = -1. + (let lp2 ((j j)) + + (cond ((= j -1) + (let ((i1 (+ i 1)) + (ck+1 (string-ref pattern (add1 k)))) + (vector-set! rv i1 (if (c= ck+1 c0) -1 0)) + (lp1 i1 0 (+ k 1)))) + + ;; pat[(k-j) .. k] matches pat[start..start+j]. + ((c= (string-ref pattern k) + (string-ref pattern (+ j start))) + (let* ((i1 (+ 1 i)) + (j1 (+ 1 j))) + (vector-set! rv i1 j1) + (lp1 i1 j1 (+ k 1)))) + + (else (lp2 (vector-ref rv j))))))))) rv)))) diff --git a/tests/srfi-13-tests.scm b/tests/srfi-13-tests.scm index 1262b82..5016b9c 100644 --- a/tests/srfi-13-tests.scm +++ b/tests/srfi-13-tests.scm @@ -607,17 +607,65 @@ (test "make-kmp-restart-vector" '#(-1) (make-kmp-restart-vector "a")) -;;; The following two tests for make-kmp-restart-vector are -;;; intentionally commented (see http://bugs.call-cc.org/ticket/878) -;;; -- mario - -; This seems right to me, but is it? -; (test "make-kmp-restart-vector" '#(-1 0) (make-kmp-restart-vector "ab")) - -; The following is from an example in the code, but I expect it is not right. -; (test "make-kmp-restart-vector" '#(-1 0 0 -1 1 2) (make-kmp-restart-vector "abdabx")) - - +(test "make-kmp-restart-vector" '#(-1 0) (make-kmp-restart-vector "ab")) + +; The following is from an example in the code. It is the "optimised" +; version; it's also valid to return #(-1 0 0 0 1 2), but that will +; needlessly check the "a" twice before giving up. +(test "make-kmp-restart-vector" + '#(-1 0 0 -1 1 2) + (make-kmp-restart-vector "abdabx")) + +;; Each entry in kmp-cases is a pattern, a string to match against and +;; the expected run of the algorithm through the positions in the +;; pattern. So for example 0 1 2 means it looks at position 0 first, +;; then at 1 and then at 2. +;; +;; This is easy to verify in simple cases; If there's a shared +;; substring and matching fails, you try matching again starting at +;; the end of the shared substring, otherwise you rewind. For more +;; complex cases, it's increasingly difficult for humans to verify :) +(define kmp-cases + '(("abc" "xx" #f 0 0) + ("abc" "abc" #t 0 1 2) + ("abcd" "abc" #f 0 1 2) + ("abc" "abcd" #t 0 1 2) + ("abc" "aabc" #t 0 1 1 2) + ("ab" "aa" #f 0 1) + ("ab" "aab" #t 0 1 1) + ("abdabx" "abdbbabda" #f 0 1 2 3 0 0 1 2 3) + ("aabc" "axaabc" #t 0 1 0 1 2 3) + ("aabac" "aabaabac" #t 0 1 2 3 4 2 3 4))) + +(for-each + (lambda (test-case) + (let* ((pat (car test-case)) + (n (string-length pat)) + (str (cadr test-case)) + (match? (caddr test-case)) + (steps (cdddr test-case)) + (rv (make-kmp-restart-vector pat))) + (call-with-input-string + str + (lambda (p) + (let lp ((i 0) + (step 0) + (steps steps)) + (cond + ((or (= i n) (eof-object? (peek-char p))) + (test-assert (sprintf "KMP match? ~S, case: ~S" match? test-case) + (eq? (= i n) match?)) + (test-assert (sprintf "KMP empty remaining steps: ~S, case: ~S" + steps test-case) + (null? steps))) + (else + (let ((new-i (kmp-step pat rv (read-char p) i char=? 0)) + (expected-i (and (not (null? steps)) (car steps)))) + (test (sprintf "KMP step ~S (exp: ~S, act: ~S), case: ~S" + step expected-i i test-case) + expected-i i) + (lp new-i (add1 step) (cdr steps)))))))))) + kmp-cases) ; FIXME! Implement tests for these: ; string-kmp-partial-search -- 1.7.10.4