guile-commits
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Guile-commits] branch master updated: Handle CRLF and Unicode line endi


From: Mike Gran
Subject: [Guile-commits] branch master updated: Handle CRLF and Unicode line endings in read-line
Date: Thu, 11 Mar 2021 22:48:35 -0500

This is an automated email from the git hooks/post-receive script.

mike121 pushed a commit to branch master
in repository guile.

The following commit(s) were added to refs/heads/master by this push:
     new 0f983e3  Handle CRLF and Unicode line endings in read-line
0f983e3 is described below

commit 0f983e3db0c43ad7c89f57ea84f792ede373ba0c
Author: Mike Gran <spk121@yahoo.com>
AuthorDate: Thu Mar 11 19:42:33 2021 -0800

    Handle CRLF and Unicode line endings in read-line
    
    * libguile/rdelim.c (scm_read_line): handle CRLF, LS and PS
    * module/ice-9/suspendable-ports.scm (read-line): handle CRLF, LS, and PS
    * module/web/http.scm (read-header-line): take advantage of CRLF in 
read-line
       (read-header): don't need to test for \return
    * test-suite/tests/rdelim.test: new tests for read-line CRLF, LS and PS
    * doc/ref/api-io.texi: update doc for read-line
---
 doc/ref/api-io.texi                | 10 +++--
 libguile/rdelim.c                  | 47 ++++++++++++++++++-----
 module/ice-9/suspendable-ports.scm | 77 ++++++++++++++++++++++++++++++++++++--
 module/web/http.scm                | 14 +++----
 test-suite/tests/rdelim.test       | 42 ++++++++++++++++++++-
 5 files changed, 165 insertions(+), 25 deletions(-)

diff --git a/doc/ref/api-io.texi b/doc/ref/api-io.texi
index 777f282..2345f04 100644
--- a/doc/ref/api-io.texi
+++ b/doc/ref/api-io.texi
@@ -755,8 +755,10 @@ a specified set of characters.
 
 @deffn {Scheme Procedure} read-line [port] [handle-delim]
 Return a line of text from @var{port} if specified, otherwise from the
-value returned by @code{(current-input-port)}.  Under Unix, a line of text
-is terminated by the first end-of-line character or by end-of-file.
+value returned by @code{(current-input-port)}.  Under Unix, a line of
+text is terminated by the first end-of-line character or by end-of-file.
+The end-of-line characters handled are newline, carriage return plus
+newline, or the Unicode line or paragraph separators.
 
 If @var{handle-delim} is specified, it should be one of the following
 symbols:
@@ -771,7 +773,9 @@ Append the terminating delimiter (if any) to the returned 
string.
 Push the terminating delimiter (if any) back on to the port.
 @item split
 Return a pair containing the string read from the port and the
-terminating delimiter or end-of-file object.
+terminating delimiter or end-of-file object.  The delimiter will either
+be a single character for newline or the Unicode line or paragraph
+separators, or it will be the string @code{"\r\n"}.
 @end table
 @end deffn
 
diff --git a/libguile/rdelim.c b/libguile/rdelim.c
index 4a0b209..c1b9202 100644
--- a/libguile/rdelim.c
+++ b/libguile/rdelim.c
@@ -112,10 +112,11 @@ SCM_DEFINE (scm_read_delimited_x, "%read-delimited!", 3, 
3, 0,
 
 SCM_DEFINE (scm_read_line, "%read-line", 0, 1, 0, 
             (SCM port),
-           "Read a newline-terminated line from @var{port}, allocating storage 
as\n"
-           "necessary.  The newline terminator (if any) is removed from the 
string,\n"
+           "Read a line from @var{port}, allocating storage as necessary.\n"
+           "The terminator (if any) is removed from the string,\n"
            "and a pair consisting of the line and its delimiter is returned.  
The\n"
-           "delimiter may be either a newline or the @var{eof-object}; if\n"
+           "delimiter may be either a newline, return + newline, the Unicode\n"
+            "line or paragraph separators, or the @var{eof-object}; if\n"
            "@code{%read-line} is called at the end of file, it returns the 
pair\n"
            "@code{(#<eof> . #<eof>)}.")
 #define FUNC_NAME s_scm_read_line
@@ -127,6 +128,7 @@ SCM_DEFINE (scm_read_line, "%read-line", 0, 1, 0,
   SCM line, strings, result;
   scm_t_wchar buf[LINE_BUFFER_SIZE], delim;
   size_t index;
+  int cr = 0;
 
   if (SCM_UNBNDP (port))
     port = scm_current_input_port ();
@@ -152,12 +154,24 @@ SCM_DEFINE (scm_read_line, "%read-line", 0, 1, 0,
          buf[index] = scm_getc (port);
          switch (buf[index])
            {
-           case EOF:
            case '\n':
              delim = buf[index];
-             break;
+              break;
+
+            case EOF:
+            case 0x2028:        /* U+2028 LINE SEPARATOR */
+            case 0x2029:        /* U+2029 PARAGRAPH SEPARATOR */
+              cr = 0;
+              delim = buf[index];
+              break;
+
+            case '\r':
+              cr = 1;
+              index ++;
+              break;
 
            default:
+              cr = 0;
              index++;
            }
        }
@@ -165,20 +179,33 @@ SCM_DEFINE (scm_read_line, "%read-line", 0, 1, 0,
   while (delim == 0);
 
   if (SCM_LIKELY (scm_is_false (strings)))
-    /* The fast path.  */
-    line = scm_from_utf32_stringn (buf, index);
+    {
+      /* The fast path.  */
+      if (cr)
+        line = scm_from_utf32_stringn (buf, index - 1);
+      else
+        line = scm_from_utf32_stringn (buf, index);
+    }
   else
     {
       /* Aggregate the intermediary results.  */
-      strings = scm_cons (scm_from_utf32_stringn (buf, index), strings);
+      if (cr)
+        strings = scm_cons (scm_from_utf32_stringn (buf, index - 1), strings);
+      else
+        strings = scm_cons (scm_from_utf32_stringn (buf, index), strings);
       line = scm_string_concatenate (scm_reverse (strings));
     }
 
   if (delim == EOF && scm_i_string_length (line) == 0)
     result = scm_cons (SCM_EOF_VAL, SCM_EOF_VAL);
   else
-    result = scm_cons (line,
-                      delim == EOF ? SCM_EOF_VAL : SCM_MAKE_CHAR (delim));
+    {
+      if (cr)
+        result = scm_cons (line, scm_from_latin1_string("\r\n"));
+      else
+        result = scm_cons (line,
+                           delim == EOF ? SCM_EOF_VAL : SCM_MAKE_CHAR (delim));
+    }
 
   return result;
 #undef LINE_BUFFER_SIZE
diff --git a/module/ice-9/suspendable-ports.scm 
b/module/ice-9/suspendable-ports.scm
index f5f005c..ba8d225 100644
--- a/module/ice-9/suspendable-ports.scm
+++ b/module/ice-9/suspendable-ports.scm
@@ -1,5 +1,5 @@
 ;;; Ports, implemented in Scheme
-;;; Copyright (C) 2016, 2019 Free Software Foundation, Inc.
+;;; Copyright (C) 2016, 2018, 2021 Free Software Foundation, Inc.
 ;;;
 ;;; This library is free software: you can redistribute it and/or modify
 ;;; it under the terms of the GNU Lesser General Public License as
@@ -689,10 +689,81 @@
 
 (define* (read-line #:optional (port (current-input-port))
                     (handle-delim 'trim))
-  (read-delimited "\n" port handle-delim))
+  (let* ((line/delim   (%read-line port))
+        (line          (car line/delim))
+        (delim         (cdr line/delim)))
+    (case handle-delim
+      ((trim) line)
+      ((split) line/delim)
+      ((concat) (if (and (string? line) (char? delim))
+                   (string-append line (string delim))
+                   line))
+      ((peek) (if (char? delim)
+                 (unread-char delim port))
+             line)
+      (else
+       (error "unexpected handle-delim value: " handle-delim)))))
 
 (define* (%read-line port)
-  (read-line port 'split))
+  (let ((LINE_BUFFER_SIZE 256))
+    (let ((strings #f)
+          (result #f)
+          (buf (make-string LINE_BUFFER_SIZE #\nul))
+          (delim #f)
+          (index 0)
+          (cr #f)
+          (go #t))
+      (cond
+       ((not (input-port? port))
+        (error "Not an input port." port))
+       (else
+        (while go
+               (cond
+                ((>= index LINE_BUFFER_SIZE)
+                 (set! strings (cons (substring buf 0 index)
+                                     (or strings '())))
+                 (set! index 0))
+                (else
+                 (let ((c (read-char port)))
+                   (cond
+                    ((or (eof-object? c)
+                         (char=? c #\x2028)  ; U+2028 LINE SEPARATOR
+                         (char=? c #\x2029)) ; U+2029 PARAGRAPH SEPARATOR
+                     (set! cr #f)
+                     (set! delim c))
+                    ((char=? c #\newline)
+                     (set! delim c))
+                    ((char=? c #\return)
+                     (set! cr #t)
+                     (string-set! buf index c)
+                     (set! index (1+ index)))
+                    (else
+                     (set! cr #f)
+                     (string-set! buf index c)
+                     (set! index (1+ index)))))))
+
+               (if (or (eof-object? delim)
+                       (char? delim))
+                   (set! go #f)))
+      (let ((line (if (not strings)
+                      ;; A short string.
+                      (if cr
+                          (substring buf 0 (1- index))
+                          (substring buf 0 index))
+                      ;; Else, aggregate the intermediary results.
+                      (begin
+                        (if cr
+                            (set! strings (cons (substring buf 0 (1- index)) 
strings))
+                            (set! strings (cons (substring buf 0 index) 
strings)))
+                        (apply string-append (reverse strings))))))
+
+        (if (and (eof-object? delim)
+                 (zero? (string-length line)))
+            (cons the-eof-object the-eof-object)
+            ;; Else
+            (if cr
+                (cons line "\r\n")
+                (cons line delim)))))))))
 
 (define* (put-string port str #:optional (start 0)
                      (count (- (string-length str) start)))
diff --git a/module/web/http.scm b/module/web/http.scm
index 4276e17..32a3093 100644
--- a/module/web/http.scm
+++ b/module/web/http.scm
@@ -157,13 +157,12 @@ The default writer will call ‘put-string’."
 Raise a 'bad-header' exception if the line does not end in CRLF or LF,
 or if EOF is reached."
   (match (%read-line port)
+    (((? string? line) . "\r\n")
+     line)
     (((? string? line) . #\newline)
-     ;; '%read-line' does not consider #\return a delimiter; so if it's
-     ;; there, remove it.  We are more tolerant than the RFC in that we
-     ;; tolerate LF-only endings.
-     (if (string-suffix? "\r" line)
-         (string-drop-right line 1)
-         line))
+     ;; We are more tolerant than the RFC in that we tolerate LF-only
+     ;; endings.
+     line)
     ((line . _)                                ;EOF or missing delimiter
      (bad-header 'read-header-line line))))
 
@@ -184,8 +183,7 @@ was known but the value was invalid.
 Returns the end-of-file object for both values if the end of the message
 body was reached (i.e., a blank line)."
   (let ((line (read-header-line port)))
-    (if (or (string-null? line)
-            (string=? line "\r"))
+    (if (string-null? line)
         (values *eof* *eof*)
         (let* ((delim (or (string-index line #\:)
                           (bad-header '%read line)))
diff --git a/test-suite/tests/rdelim.test b/test-suite/tests/rdelim.test
index 3aaa0b2..1060d5c 100644
--- a/test-suite/tests/rdelim.test
+++ b/test-suite/tests/rdelim.test
@@ -1,7 +1,7 @@
 ;;;; rdelim.test --- Delimited I/O.      -*- mode: scheme; coding: utf-8; -*-
 ;;;; Ludovic Courtès <ludo@gnu.org>
 ;;;;
-;;;;   Copyright (C) 2011, 2013, 2014 Free Software Foundation, Inc.
+;;;;   Copyright (C) 2011, 2013, 2014, 2021 Free Software Foundation, Inc.
 ;;;;
 ;;;; This library is free software; you can redistribute it and/or
 ;;;; modify it under the terms of the GNU Lesser General Public
@@ -62,6 +62,46 @@
                          (read-line p 'split)))
            (eof-object? (read-line p)))))
 
+  (pass-if "two lines, split, CRLF"
+    (let* ((s "foo\r\nbar\r\n")
+           (p (open-input-string s)))
+      (and (equal? '(("foo" . "\r\n")
+                     ("bar" . "\r\n"))
+                   (list (read-line p 'split)
+                         (read-line p 'split)))
+           (eof-object? (read-line p)))))
+
+  (pass-if "two long lines, split, CRLF"
+    ;; Must be longer than 256 codepoints
+    (let* ((text0 "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
+           (text1 (string-append text0 text0 text0 text0 text0))
+           (text2 (string-append text1 "\r\n" text1 "\r\n")))
+      (let* ((s text2)
+             (p (open-input-string s)))
+        (and (equal? `((,text1 . "\r\n")
+                       (,text1 . "\r\n"))
+                     (list (read-line p 'split)
+                           (read-line p 'split)))
+             (eof-object? (read-line p))))))
+
+  (pass-if "two lines, split, LS"
+    (let* ((s "foo\u2028bar\u2028")
+           (p (open-input-string s)))
+      (and (equal? '(("foo" . #\x2028)
+                     ("bar" . #\x2028))
+                   (list (read-line p 'split)
+                         (read-line p 'split)))
+           (eof-object? (read-line p)))))
+
+  (pass-if "two lines, split, PS"
+    (let* ((s "foo\u2029bar\u2029")
+           (p (open-input-string s)))
+      (and (equal? '(("foo" . #\x2029)
+                     ("bar" . #\x2029))
+                   (list (read-line p 'split)
+                         (read-line p 'split)))
+           (eof-object? (read-line p)))))
+
   (pass-if "two Greek lines, trim"
     (let* ((s "λαμβδα\nμυ\n")
            (p (open-input-string s)))



reply via email to

[Prev in Thread] Current Thread [Next in Thread]