gawk-diffs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[SCM] gawk branch, gawk-5.1-stable, updated. gawk-4.1.0-4132-g6a34364


From: Arnold Robbins
Subject: [SCM] gawk branch, gawk-5.1-stable, updated. gawk-4.1.0-4132-g6a34364
Date: Thu, 1 Oct 2020 09:49:06 -0400 (EDT)

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "gawk".

The branch, gawk-5.1-stable has been updated
       via  6a34364cde8eec7df0dd9f1de005babea18e45ec (commit)
      from  2ab1c82b4097cff8763d1ed63be6478edf55eb54 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.sv.gnu.org/cgit/gawk.git/commit/?id=6a34364cde8eec7df0dd9f1de005babea18e45ec

commit 6a34364cde8eec7df0dd9f1de005babea18e45ec
Author: Arnold D. Robbins <arnold@skeeve.com>
Date:   Thu Oct 1 16:48:51 2020 +0300

    Rewrite split program.

diff --git a/awklib/eg/prog/split.awk b/awklib/eg/prog/split.awk
index 9239a6c..6e0ac16 100644
--- a/awklib/eg/prog/split.awk
+++ b/awklib/eg/prog/split.awk
@@ -1,56 +1,142 @@
 # split.awk --- do split in awk
 #
-# Requires ord() and chr() library functions
+# Requires getopt() library function.
 #
 # Arnold Robbins, arnold@skeeve.com, Public Domain
 # May 1993
 # Revised slightly, May 2014
+# Rewritten September 2020
 
-# usage: split [-count] [file] [outname]
-
+function usage()
+{
+    print("usage: split [-l count]  [-a suffix-len] [file [outname]]") > 
"/dev/stderr"
+    print("       split [-b N[k|m]] [-a suffix-len] [file [outname]]") > 
"/dev/stderr"
+    exit 1
+}
 BEGIN {
-    outfile = "x"    # default
-    count = 1000
-    if (ARGC > 4)
-        usage()
-
-    i = 1
-    if (i in ARGV && ARGV[i] ~ /^-[[:digit:]]+$/) {
-        count = -ARGV[i]
-        ARGV[i] = ""
-        i++
+    # Set defaults:
+    Suffix_length = 2
+    Line_count = 1000
+    Byte_count = 0
+    Outfile = "x"
+
+    parse_arguments()
+
+    init_suffix_data()
+
+    Output = (Outfile compute_suffix())
+}
+function parse_arguments(   i, c, l, modifier)
+{
+    while ((c = getopt(ARGC, ARGV, "a:b:l:")) != -1) {
+        if (c == "a")
+            Suffix_length = Optarg + 0
+        else if (c == "b") {
+            Byte_count = Optarg + 0
+            Line_count = 0
+
+            l = length(Optarg)
+            modifier = substr(Optarg, l, 1)
+            if (modifier == "k")
+                Byte_count *= 1024
+            else if (modifier == "m")
+                Byte_count *= 1024 * 1024
+        } else if (c == "l") {
+            Line_count = Optarg + 0
+            Byte_count = 0
+        } else
+            usage()
     }
-    # test argv in case reading from stdin instead of file
-    if (i in ARGV)
-        i++    # skip datafile name
-    if (i in ARGV) {
-        outfile = ARGV[i]
+
+    # Clear out options
+    for (i = 1; i < Optind; i++)
         ARGV[i] = ""
+
+    # Check for filename
+    if (ARGV[Optind]) {
+        Optind++
+
+       # Check for different prefix
+        if (ARGV[Optind]) {
+            Outfile = ARGV[Optind]
+            ARGV[Optind] = ""
+
+            if (++Optind < ARGC)
+                usage()
+        }
     }
-    s1 = s2 = "a"
-    out = (outfile s1 s2)
 }
+function compute_suffix(    i, result, letters)
 {
-    if (++tcount > count) {
-        close(out)
-        if (s2 == "z") {
-            if (s1 == "z") {
-                printf("split: %s is too large to split\n",
-                       FILENAME) > "/dev/stderr"
-                exit 1
-            }
-            s1 = chr(ord(s1) + 1)
-            s2 = "a"
-        }
-        else
-            s2 = chr(ord(s2) + 1)
-        out = (outfile s1 s2)
-        tcount = 1
+    # Logical step 3
+    if (Reached_last) {
+        printf("split: too many files!\n") > "/dev/stderr"
+        exit 1
+    } else if (on_last_file())
+        Reached_last = 1    # fail when wrapping after 'zzz'
+
+    # Logical step 1
+    result = ""
+    letters = "abcdefghijklmnopqrstuvwxyz"
+    for (i = 1; i <= Suffix_length; i++)
+        result = result substr(letters, Suffix_ind[i], 1)
+
+    # Logical step 2
+    for (i = Suffix_length; i >= 1; i--) {
+        if (++Suffix_ind[i] > 26) {
+            Suffix_ind[i] = 1
+        } else
+            break
     }
-    print > out
+
+    return result
 }
-function usage()
+function init_suffix_data(  i)
 {
-    print("usage: split [-num] [file] [outname]") > "/dev/stderr"
-    exit 1
+    for (i = 1; i <= Suffix_length; i++)
+        Suffix_ind[i] = 1
+
+    Reached_last = 0
+}
+function on_last_file(  i, on_last)
+{
+    on_last = 1
+    for (i = 1; i <= Suffix_length; i++) {
+        on_last = on_last && (Suffix_ind[i] == 26)
+    }
+
+    return on_last
+}
+Line_count > 0 {
+    if (++tcount > Line_count) {
+        close(Output)
+        Output = (Outfile compute_suffix())
+        tcount = 1
+    }
+    print > Output
+}
+Byte_count > 0 {
+    # `+ 1' is for the final newline
+    if (tcount + length($0) + 1 > Byte_count) { # would overflow
+        # compute leading bytes
+        leading_bytes = Byte_count - tcount
+
+        # write leading bytes
+        printf("%s", substr($0, 1, leading_bytes)) > Output
+
+        # close old file, open new file
+        close(Output)
+        Output = (Outfile compute_suffix())
+
+        # set up first bytes for new file
+        $0 = substr($0, leading_bytes + 1)  # trailing bytes
+        tcount = 0
+    }
+
+    # write full record or trailing bytes
+    tcount += length($0) + 1
+    print > Output
+}
+END {
+    close(Output)
 }
diff --git a/doc/ChangeLog b/doc/ChangeLog
index bb6aa39..e648631 100644
--- a/doc/ChangeLog
+++ b/doc/ChangeLog
@@ -1,3 +1,9 @@
+2020-10-01         Arnold D. Robbins     <arnold@skeeve.com>
+
+       * gawktexi.in (Split Program): Rewrite split to be POSIX
+       compliant. Update all the prose.
+       * wordlist: Update.
+
 2020-09-24         Arnold D. Robbins     <arnold@skeeve.com>
 
        * gawktexi.in: Fix a spelling error.
diff --git a/doc/gawk.info b/doc/gawk.info
index a3c9a80..5523f24 100644
--- a/doc/gawk.info
+++ b/doc/gawk.info
@@ -18418,103 +18418,281 @@ File: gawk.info,  Node: Split Program,  Next: Tee 
Program,  Prev: Id Program,  U
 11.2.4 Splitting a Large File into Pieces
 -----------------------------------------
 
-The 'split' program splits large text files into smaller pieces.  Usage
-is as follows:(1)
+The 'split' utility splits large text files into smaller pieces.  The
+usage follows the POSIX standard for 'split' and is as follows:
 
-     'split' ['-COUNT'] [FILE] [PREFIX]
+     'split' ['-l' COUNT] ['-a' SUFFIX-LEN] [FILE [OUTNAME]]
+     'split' '-b' N['k'|'m'] ['-a' SUFFIX-LEN] [FILE [OUTNAME]]
 
    By default, the output files are named 'xaa', 'xab', and so on.  Each
 file has 1,000 lines in it, with the likely exception of the last file.
-To change the number of lines in each file, supply a number on the
-command line preceded with a minus sign (e.g., '-500' for files with 500
-lines in them instead of 1,000).  To change the names of the output
-files to something like 'myfileaa', 'myfileab', and so on, supply an
-additional argument that specifies the file name prefix.
-
-   Here is a version of 'split' in 'awk'.  It uses the 'ord()' and
-'chr()' functions presented in *note Ordinal Functions::.
-
-   The program first sets its defaults, and then tests to make sure
-there are not too many arguments.  It then looks at each argument in
-turn.  The first argument could be a minus sign followed by a number.
-If it is, this happens to look like a negative number, so it is made
-positive, and that is the count of lines.  The data file name is skipped
-over and the final argument is used as the prefix for the output file
-names:
+
+   The 'split' program has evolved over time, and the current POSIX
+version is more complicated than the original Unix version.  The options
+and what they do are as follows:
+
+'-a' SUFFIX-LEN
+     Use SUFFIX-LEN characters for the suffix.  For example, if
+     SUFFIX-LEN is four, the output files would range from 'xaaaa' to
+     'xzzzz'.
+
+'-b' N['k'|'m']
+     Instead of each file containing a specified number of lines, each
+     file should have (at most) N bytes.  Supplying a trailing 'k'
+     multiplies N by 1,024, yielding kilobytes.  Supplying a trailing
+     'm' multiplies N by 1,048,576 (1,024 * 1,024) yielding megabytes.
+     (This option is mutually exclusive with '-l').
+
+'-l' COUNT
+     Each file should have at most COUNT lines, instead of the default
+     1,000.  (This option is mutually exclusive with '-b').
+
+   If supplied, FILE is the input file to read.  Otherwise standard
+input is processed.  If supplied, OUTNAME is the leading prefix to use
+for file names, instead of 'x'.
+
+   In order to use the '-b' option, 'gawk' should be invoked with its
+'-b' option (*note Options::), or with the environment variable 'LC_ALL'
+set to 'C', so that each input byte is treated as a separate
+character.(1)
+
+   Here is an implementation of 'split' in 'awk'.  It uses the
+'getopt()' function presented in *note Getopt Function::.
+
+   The program begins with a standard descriptive comment and then a
+'usage()' function describing the options:
 
      # split.awk --- do split in awk
      #
-     # Requires ord() and chr() library functions
-     # usage: split [-count] [file] [outname]
+     # Requires getopt() library function.
+     function usage()
+     {
+         print("usage: split [-l count]  [-a suffix-len] [file [outname]]") > 
"/dev/stderr"
+         print("       split [-b N[k|m]] [-a suffix-len] [file [outname]]") > 
"/dev/stderr"
+         exit 1
+     }
+
+   Next, in a 'BEGIN' rule we set the default values and parse the
+arguments.  After that we initialize the data structures used to cycle
+the suffix from 'aa...' to 'zz...'.  Finally we set the name of the
+first output file:
 
      BEGIN {
-         outfile = "x"    # default
-         count = 1000
-         if (ARGC > 4)
-             usage()
+         # Set defaults:
+         Suffix_length = 2
+         Line_count = 1000
+         Byte_count = 0
+         Outfile = "x"
 
-         i = 1
-         if (i in ARGV && ARGV[i] ~ /^-[[:digit:]]+$/) {
-             count = -ARGV[i]
-             ARGV[i] = ""
-             i++
+         parse_arguments()
+
+         init_suffix_data()
+
+         Output = (Outfile compute_suffix())
+     }
+
+   Parsing the arguments is straightforward.  The program follows our
+convention (*note Library Names::) of having important global variables
+start with an uppercase letter:
+
+     function parse_arguments(   i, c, l, modifier)
+     {
+         while ((c = getopt(ARGC, ARGV, "a:b:l:")) != -1) {
+             if (c == "a")
+                 Suffix_length = Optarg + 0
+             else if (c == "b") {
+                 Byte_count = Optarg + 0
+                 Line_count = 0
+
+                 l = length(Optarg)
+                 modifier = substr(Optarg, l, 1)
+                 if (modifier == "k")
+                     Byte_count *= 1024
+                 else if (modifier == "m")
+                     Byte_count *= 1024 * 1024
+             } else if (c == "l") {
+                 Line_count = Optarg + 0
+                 Byte_count = 0
+             } else
+                 usage()
          }
-         # test argv in case reading from stdin instead of file
-         if (i in ARGV)
-             i++    # skip datafile name
-         if (i in ARGV) {
-             outfile = ARGV[i]
+
+         # Clear out options
+         for (i = 1; i < Optind; i++)
              ARGV[i] = ""
+
+         # Check for filename
+         if (ARGV[Optind]) {
+             Optind++
+
+             # Check for different prefix
+             if (ARGV[Optind]) {
+                 Outfile = ARGV[Optind]
+                 ARGV[Optind] = ""
+
+                 if (++Optind < ARGC)
+                     usage()
+             }
+         }
+     }
+
+   Managing the file name suffix is interesting.  Given a suffix of
+length three, say, the values go from 'aaa', 'aab', 'aac' and so on, all
+the way to 'zzx', 'zzy', and finally 'zzz'.  There are two important
+aspects to this:
+
+   * We have to be able to easily generate these suffixes, and in
+     particular easily handle "rolling over"; for example, going from
+     'abz' to 'aca'.
+
+   * We have to tell when we've finished with the last file, so that if
+     we still have more input data we can print an error message and
+     exit.  The trick is to handle this _after_ using the last suffix,
+     and not when the final suffix is created.
+
+   The computation is handled by 'compute_suffix()'.  This function is
+called every time a new file is opened.
+
+   The flow here is messy, because we want to generate 'zzzz' (say), and
+use it, and only produce an error after all the file name suffixes have
+been used up.  The logical steps are as follows:
+
+  1. Generate the suffix, saving the value in 'result' to return.  To do
+     this, the supplementary array 'Suffix_ind' contains one element for
+     each letter in the suffix.  Each element ranges from 1 to 26,
+     acting as the index into a string containing all the lowercase
+     letters of the English alphabet.  It is initialized by
+     'init_suffix_data()'.  'result' is built up one letter at a time,
+     using each 'substr()'.
+
+  2. Prepare the data structures for the next time 'compute_suffix()' is
+     called.  To do this, we loop over 'Suffix_ind', _backwards_.  If
+     the current element is less than 26, it's incremented and the loop
+     breaks ('abq' goes to 'abr').  Otherwise, the element is reset to
+     one and we move down the list ('abz' to 'aca').  Thus, the
+     'Suffix_ind' array is always "one step ahead" of the actual file
+     name suffix to be returned.
+
+  3. Check if we've gone past the limit of possible filenames.  If
+     'Reached_last' is true, print a message and exit.  Otherwise, check
+     if 'Suffix_ind' describes a suffix where all the letters are 'z'.
+     If that's the case we're about to return the final suffix.  If so,
+     we set 'Reached_last' to true so that the _next_ call to
+     'compute_suffix()' will cause a failure.
+
+   Physically, the steps in the function occur in the order 3, 1, 2:
+
+     function compute_suffix(    i, result, letters)
+     {
+         # Logical step 3
+         if (Reached_last) {
+             printf("split: too many files!\n") > "/dev/stderr"
+             exit 1
+         } else if (on_last_file())
+             Reached_last = 1    # fail when wrapping after 'zzz'
+
+         # Logical step 1
+         result = ""
+         letters = "abcdefghijklmnopqrstuvwxyz"
+         for (i = 1; i <= Suffix_length; i++)
+             result = result substr(letters, Suffix_ind[i], 1)
+
+         # Logical step 2
+         for (i = Suffix_length; i >= 1; i--) {
+             if (++Suffix_ind[i] > 26) {
+                 Suffix_ind[i] = 1
+             } else
+                 break
          }
-         s1 = s2 = "a"
-         out = (outfile s1 s2)
+
+         return result
      }
 
-   The next rule does most of the work.  'tcount' (temporary count)
-tracks how many lines have been printed to the output file so far.  If
-it is greater than 'count', it is time to close the current file and
-start a new one.  's1' and 's2' track the current suffixes for the file
-name.  If they are both 'z', the file is just too big.  Otherwise, 's1'
-moves to the next letter in the alphabet and 's2' starts over again at
-'a':
+   The 'Suffix_ind' array and 'Reached_last' are initialized by
+'init_suffix_data()':
 
+     function init_suffix_data(  i)
      {
-         if (++tcount > count) {
-             close(out)
-             if (s2 == "z") {
-                 if (s1 == "z") {
-                     printf("split: %s is too large to split\n",
-                            FILENAME) > "/dev/stderr"
-                     exit 1
-                 }
-                 s1 = chr(ord(s1) + 1)
-                 s2 = "a"
-             }
-             else
-                 s2 = chr(ord(s2) + 1)
-             out = (outfile s1 s2)
+         for (i = 1; i <= Suffix_length; i++)
+             Suffix_ind[i] = 1
+
+         Reached_last = 0
+     }
+
+   The function 'on_last_file()' returns true if 'Suffix_ind' describes
+a suffix where all the letters are 'z' by checking that all the elements
+in the array are equal to 26:
+
+     function on_last_file(  i, on_last)
+     {
+         on_last = 1
+         for (i = 1; i <= Suffix_length; i++) {
+             on_last = on_last && (Suffix_ind[i] == 26)
+         }
+
+         return on_last
+     }
+
+   The actual work of splitting the input file is done by the next two
+rules.  Since splitting by line count and splitting by byte count are
+mutually exclusive, we simply use two separate rules, one for when
+'Line_count' is greater than zero, and another for when 'Byte_count' is
+greater than zero.
+
+   The variable 'tcount' counts how many lines have been processed so
+far.  When it exceeds 'Line_count', it's time to close the previous file
+and switch to a new one:
+
+     Line_count > 0 {
+         if (++tcount > Line_count) {
+             close(Output)
+             Output = (Outfile compute_suffix())
              tcount = 1
          }
-         print > out
+         print > Output
      }
 
-The 'usage()' function simply prints an error message and exits:
+   The rule for handling bytes is more complicated.  Since lines most
+likely vary in length, the 'Byte_count' boundary may be hit in the
+middle of an input record.  In that case, 'split' has to write enough of
+the first bytes of the input record to finish up 'Byte_count' bytes,
+close the file, open a new file, and write the rest of the record to the
+new file.  The logic here does all that:
 
-     function usage()
-     {
-         print("usage: split [-num] [file] [outname]") > "/dev/stderr"
-         exit 1
+     Byte_count > 0 {
+         # `+ 1' is for the final newline
+         if (tcount + length($0) + 1 > Byte_count) { # would overflow
+             # compute leading bytes
+             leading_bytes = Byte_count - tcount
+
+             # write leading bytes
+             printf("%s", substr($0, 1, leading_bytes)) > Output
+
+             # close old file, open new file
+             close(Output)
+             Output = (Outfile compute_suffix())
+
+             # set up first bytes for new file
+             $0 = substr($0, leading_bytes + 1)  # trailing bytes
+             tcount = 0
+         }
+
+         # write full record or trailing bytes
+         tcount += length($0) + 1
+         print > Output
      }
 
-   This program is a bit sloppy; it relies on 'awk' to automatically
-close the last file instead of doing it in an 'END' rule.  It also
-assumes that letters are contiguous in the character set, which isn't
-true for EBCDIC systems.
+   Finally, the 'END' rule cleans up by closing the last output file:
+
+     END {
+         close(Output)
+     }
 
    ---------- Footnotes ----------
 
-   (1) This is the traditional usage.  The POSIX usage is different, but
-not relevant for what the program aims to demonstrate.
+   (1) Using '-b' twice requires separating 'gawk''s options from those
+of the program.  For example: 'gawk -f getopt.awk -f split.awk -b -- -b
+42m large-file.txt split-'.
 
 
 File: gawk.info,  Node: Tee Program,  Next: Uniq Program,  Prev: Split 
Program,  Up: Clones
@@ -37254,7 +37432,7 @@ Index
 * split string into array:               String Functions.    (line 303)
 * split utility:                         Split Program.       (line   6)
 * split() function, array elements, deleting: Delete.         (line  61)
-* split.awk program:                     Split Program.       (line  30)
+* split.awk program:                     Split Program.       (line  50)
 * sprintf:                               OFMT.                (line  15)
 * sprintf <1>:                           String Functions.    (line 395)
 * sprintf() function, print/printf statements and: Round Function.
@@ -37958,270 +38136,270 @@ Node: Cut Program728246
 Node: Egrep Program738175
 Node: Id Program747186
 Node: Split Program757133
-Ref: Split Program-Footnote-1760591
-Node: Tee Program760720
-Node: Uniq Program763510
-Node: Wc Program771074
-Ref: Wc Program-Footnote-1775329
-Node: Miscellaneous Programs775423
-Node: Dupword Program776636
-Node: Alarm Program778666
-Node: Translate Program783521
-Ref: Translate Program-Footnote-1788086
-Node: Labels Program788356
-Ref: Labels Program-Footnote-1791707
-Node: Word Sorting791791
-Node: History Sorting795863
-Node: Extract Program798088
-Node: Simple Sed806142
-Node: Igawk Program809216
-Ref: Igawk Program-Footnote-1823547
-Ref: Igawk Program-Footnote-2823749
-Ref: Igawk Program-Footnote-3823871
-Node: Anagram Program823986
-Node: Signature Program827048
-Node: Programs Summary828295
-Node: Programs Exercises829509
-Ref: Programs Exercises-Footnote-1833639
-Node: Advanced Features833725
-Node: Nondecimal Data835715
-Node: Array Sorting837306
-Node: Controlling Array Traversal838006
-Ref: Controlling Array Traversal-Footnote-1846374
-Node: Array Sorting Functions846492
-Ref: Array Sorting Functions-Footnote-1851583
-Node: Two-way I/O851779
-Ref: Two-way I/O-Footnote-1859500
-Ref: Two-way I/O-Footnote-2859687
-Node: TCP/IP Networking859769
-Node: Profiling862887
-Node: Advanced Features Summary872201
-Node: Internationalization874045
-Node: I18N and L10N875525
-Node: Explaining gettext876212
-Ref: Explaining gettext-Footnote-1882104
-Ref: Explaining gettext-Footnote-2882289
-Node: Programmer i18n882454
-Ref: Programmer i18n-Footnote-1887403
-Node: Translator i18n887452
-Node: String Extraction888246
-Ref: String Extraction-Footnote-1889378
-Node: Printf Ordering889464
-Ref: Printf Ordering-Footnote-1892250
-Node: I18N Portability892314
-Ref: I18N Portability-Footnote-1894770
-Node: I18N Example894833
-Ref: I18N Example-Footnote-1898108
-Ref: I18N Example-Footnote-2898181
-Node: Gawk I18N898290
-Node: I18N Summary898939
-Node: Debugger900280
-Node: Debugging901280
-Node: Debugging Concepts901721
-Node: Debugging Terms903530
-Node: Awk Debugging906105
-Ref: Awk Debugging-Footnote-1907050
-Node: Sample Debugging Session907182
-Node: Debugger Invocation907716
-Node: Finding The Bug909102
-Node: List of Debugger Commands915576
-Node: Breakpoint Control916909
-Node: Debugger Execution Control920603
-Node: Viewing And Changing Data923965
-Node: Execution Stack927506
-Node: Debugger Info929143
-Node: Miscellaneous Debugger Commands933214
-Node: Readline Support938276
-Node: Limitations939172
-Node: Debugging Summary941726
-Node: Namespaces943005
-Node: Global Namespace944116
-Node: Qualified Names945514
-Node: Default Namespace946513
-Node: Changing The Namespace947254
-Node: Naming Rules948868
-Node: Internal Name Management950716
-Node: Namespace Example951758
-Node: Namespace And Features954320
-Node: Namespace Summary955755
-Node: Arbitrary Precision Arithmetic957232
-Node: Computer Arithmetic958719
-Ref: table-numeric-ranges962485
-Ref: table-floating-point-ranges962978
-Ref: Computer Arithmetic-Footnote-1963636
-Node: Math Definitions963693
-Ref: table-ieee-formats967009
-Ref: Math Definitions-Footnote-1967612
-Node: MPFR features967717
-Node: FP Math Caution969435
-Ref: FP Math Caution-Footnote-1970507
-Node: Inexactness of computations970876
-Node: Inexact representation971836
-Node: Comparing FP Values973196
-Node: Errors accumulate974437
-Node: Getting Accuracy975870
-Node: Try To Round978580
-Node: Setting precision979479
-Ref: table-predefined-precision-strings980176
-Node: Setting the rounding mode982006
-Ref: table-gawk-rounding-modes982380
-Ref: Setting the rounding mode-Footnote-1986311
-Node: Arbitrary Precision Integers986490
-Ref: Arbitrary Precision Integers-Footnote-1989665
-Node: Checking for MPFR989814
-Node: POSIX Floating Point Problems991288
-Ref: POSIX Floating Point Problems-Footnote-1995573
-Node: Floating point summary995611
-Node: Dynamic Extensions997801
-Node: Extension Intro999354
-Node: Plugin License1000620
-Node: Extension Mechanism Outline1001417
-Ref: figure-load-extension1001856
-Ref: figure-register-new-function1003421
-Ref: figure-call-new-function1004513
-Node: Extension API Description1006575
-Node: Extension API Functions Introduction1008288
-Ref: table-api-std-headers1010124
-Node: General Data Types1014373
-Ref: General Data Types-Footnote-11023003
-Node: Memory Allocation Functions1023302
-Ref: Memory Allocation Functions-Footnote-11027803
-Node: Constructor Functions1027902
-Node: API Ownership of MPFR and GMP Values1031368
-Node: Registration Functions1032681
-Node: Extension Functions1033381
-Node: Exit Callback Functions1038703
-Node: Extension Version String1039953
-Node: Input Parsers1040616
-Node: Output Wrappers1053337
-Node: Two-way processors1057849
-Node: Printing Messages1060114
-Ref: Printing Messages-Footnote-11061285
-Node: Updating ERRNO1061438
-Node: Requesting Values1062177
-Ref: table-value-types-returned1062914
-Node: Accessing Parameters1063850
-Node: Symbol Table Access1065087
-Node: Symbol table by name1065599
-Ref: Symbol table by name-Footnote-11068623
-Node: Symbol table by cookie1068751
-Ref: Symbol table by cookie-Footnote-11072936
-Node: Cached values1073000
-Ref: Cached values-Footnote-11076536
-Node: Array Manipulation1076689
-Ref: Array Manipulation-Footnote-11077780
-Node: Array Data Types1077817
-Ref: Array Data Types-Footnote-11080475
-Node: Array Functions1080567
-Node: Flattening Arrays1085065
-Node: Creating Arrays1092041
-Node: Redirection API1096808
-Node: Extension API Variables1099641
-Node: Extension Versioning1100352
-Ref: gawk-api-version1100781
-Node: Extension GMP/MPFR Versioning1102512
-Node: Extension API Informational Variables1104140
-Node: Extension API Boilerplate1105213
-Node: Changes from API V11109187
-Node: Finding Extensions1110759
-Node: Extension Example1111318
-Node: Internal File Description1112116
-Node: Internal File Ops1116196
-Ref: Internal File Ops-Footnote-11127546
-Node: Using Internal File Ops1127686
-Ref: Using Internal File Ops-Footnote-11130069
-Node: Extension Samples1130343
-Node: Extension Sample File Functions1131872
-Node: Extension Sample Fnmatch1139521
-Node: Extension Sample Fork1141008
-Node: Extension Sample Inplace1142226
-Node: Extension Sample Ord1145851
-Node: Extension Sample Readdir1146687
-Ref: table-readdir-file-types1147576
-Node: Extension Sample Revout1148643
-Node: Extension Sample Rev2way1149232
-Node: Extension Sample Read write array1149972
-Node: Extension Sample Readfile1151914
-Node: Extension Sample Time1153009
-Node: Extension Sample API Tests1154761
-Node: gawkextlib1155253
-Node: Extension summary1158171
-Node: Extension Exercises1161873
-Node: Language History1163115
-Node: V7/SVR3.11164771
-Node: SVR41166923
-Node: POSIX1168357
-Node: BTL1169738
-Node: POSIX/GNU1170467
-Node: Feature History1176245
-Node: Common Extensions1192564
-Node: Ranges and Locales1193847
-Ref: Ranges and Locales-Footnote-11198463
-Ref: Ranges and Locales-Footnote-21198490
-Ref: Ranges and Locales-Footnote-31198725
-Node: Contributors1198948
-Node: History summary1204945
-Node: Installation1206325
-Node: Gawk Distribution1207269
-Node: Getting1207753
-Node: Extracting1208716
-Node: Distribution contents1210354
-Node: Unix Installation1216834
-Node: Quick Installation1217516
-Node: Shell Startup Files1219930
-Node: Additional Configuration Options1221019
-Node: Configuration Philosophy1223334
-Node: Non-Unix Installation1225703
-Node: PC Installation1226163
-Node: PC Binary Installation1227001
-Node: PC Compiling1227436
-Node: PC Using1228553
-Node: Cygwin1232106
-Node: MSYS1233330
-Node: VMS Installation1233932
-Node: VMS Compilation1234723
-Ref: VMS Compilation-Footnote-11235952
-Node: VMS Dynamic Extensions1236010
-Node: VMS Installation Details1237695
-Node: VMS Running1239948
-Node: VMS GNV1244227
-Node: VMS Old Gawk1244962
-Node: Bugs1245433
-Node: Bug address1246096
-Node: Usenet1249078
-Node: Maintainers1250082
-Node: Other Versions1251267
-Node: Installation summary1258355
-Node: Notes1259564
-Node: Compatibility Mode1260358
-Node: Additions1261140
-Node: Accessing The Source1262065
-Node: Adding Code1263502
-Node: New Ports1269721
-Node: Derived Files1274096
-Ref: Derived Files-Footnote-11279756
-Ref: Derived Files-Footnote-21279791
-Ref: Derived Files-Footnote-31280389
-Node: Future Extensions1280503
-Node: Implementation Limitations1281161
-Node: Extension Design1282371
-Node: Old Extension Problems1283515
-Ref: Old Extension Problems-Footnote-11285033
-Node: Extension New Mechanism Goals1285090
-Ref: Extension New Mechanism Goals-Footnote-11288454
-Node: Extension Other Design Decisions1288643
-Node: Extension Future Growth1290756
-Node: Notes summary1291362
-Node: Basic Concepts1292520
-Node: Basic High Level1293201
-Ref: figure-general-flow1293483
-Ref: figure-process-flow1294168
-Ref: Basic High Level-Footnote-11297469
-Node: Basic Data Typing1297654
-Node: Glossary1300982
-Node: Copying1332867
-Node: GNU Free Documentation License1370410
-Node: Index1395530
+Ref: Split Program-Footnote-1766905
+Node: Tee Program767078
+Node: Uniq Program769868
+Node: Wc Program777432
+Ref: Wc Program-Footnote-1781687
+Node: Miscellaneous Programs781781
+Node: Dupword Program782994
+Node: Alarm Program785024
+Node: Translate Program789879
+Ref: Translate Program-Footnote-1794444
+Node: Labels Program794714
+Ref: Labels Program-Footnote-1798065
+Node: Word Sorting798149
+Node: History Sorting802221
+Node: Extract Program804446
+Node: Simple Sed812500
+Node: Igawk Program815574
+Ref: Igawk Program-Footnote-1829905
+Ref: Igawk Program-Footnote-2830107
+Ref: Igawk Program-Footnote-3830229
+Node: Anagram Program830344
+Node: Signature Program833406
+Node: Programs Summary834653
+Node: Programs Exercises835867
+Ref: Programs Exercises-Footnote-1839997
+Node: Advanced Features840083
+Node: Nondecimal Data842073
+Node: Array Sorting843664
+Node: Controlling Array Traversal844364
+Ref: Controlling Array Traversal-Footnote-1852732
+Node: Array Sorting Functions852850
+Ref: Array Sorting Functions-Footnote-1857941
+Node: Two-way I/O858137
+Ref: Two-way I/O-Footnote-1865858
+Ref: Two-way I/O-Footnote-2866045
+Node: TCP/IP Networking866127
+Node: Profiling869245
+Node: Advanced Features Summary878559
+Node: Internationalization880403
+Node: I18N and L10N881883
+Node: Explaining gettext882570
+Ref: Explaining gettext-Footnote-1888462
+Ref: Explaining gettext-Footnote-2888647
+Node: Programmer i18n888812
+Ref: Programmer i18n-Footnote-1893761
+Node: Translator i18n893810
+Node: String Extraction894604
+Ref: String Extraction-Footnote-1895736
+Node: Printf Ordering895822
+Ref: Printf Ordering-Footnote-1898608
+Node: I18N Portability898672
+Ref: I18N Portability-Footnote-1901128
+Node: I18N Example901191
+Ref: I18N Example-Footnote-1904466
+Ref: I18N Example-Footnote-2904539
+Node: Gawk I18N904648
+Node: I18N Summary905297
+Node: Debugger906638
+Node: Debugging907638
+Node: Debugging Concepts908079
+Node: Debugging Terms909888
+Node: Awk Debugging912463
+Ref: Awk Debugging-Footnote-1913408
+Node: Sample Debugging Session913540
+Node: Debugger Invocation914074
+Node: Finding The Bug915460
+Node: List of Debugger Commands921934
+Node: Breakpoint Control923267
+Node: Debugger Execution Control926961
+Node: Viewing And Changing Data930323
+Node: Execution Stack933864
+Node: Debugger Info935501
+Node: Miscellaneous Debugger Commands939572
+Node: Readline Support944634
+Node: Limitations945530
+Node: Debugging Summary948084
+Node: Namespaces949363
+Node: Global Namespace950474
+Node: Qualified Names951872
+Node: Default Namespace952871
+Node: Changing The Namespace953612
+Node: Naming Rules955226
+Node: Internal Name Management957074
+Node: Namespace Example958116
+Node: Namespace And Features960678
+Node: Namespace Summary962113
+Node: Arbitrary Precision Arithmetic963590
+Node: Computer Arithmetic965077
+Ref: table-numeric-ranges968843
+Ref: table-floating-point-ranges969336
+Ref: Computer Arithmetic-Footnote-1969994
+Node: Math Definitions970051
+Ref: table-ieee-formats973367
+Ref: Math Definitions-Footnote-1973970
+Node: MPFR features974075
+Node: FP Math Caution975793
+Ref: FP Math Caution-Footnote-1976865
+Node: Inexactness of computations977234
+Node: Inexact representation978194
+Node: Comparing FP Values979554
+Node: Errors accumulate980795
+Node: Getting Accuracy982228
+Node: Try To Round984938
+Node: Setting precision985837
+Ref: table-predefined-precision-strings986534
+Node: Setting the rounding mode988364
+Ref: table-gawk-rounding-modes988738
+Ref: Setting the rounding mode-Footnote-1992669
+Node: Arbitrary Precision Integers992848
+Ref: Arbitrary Precision Integers-Footnote-1996023
+Node: Checking for MPFR996172
+Node: POSIX Floating Point Problems997646
+Ref: POSIX Floating Point Problems-Footnote-11001931
+Node: Floating point summary1001969
+Node: Dynamic Extensions1004159
+Node: Extension Intro1005712
+Node: Plugin License1006978
+Node: Extension Mechanism Outline1007775
+Ref: figure-load-extension1008214
+Ref: figure-register-new-function1009779
+Ref: figure-call-new-function1010871
+Node: Extension API Description1012933
+Node: Extension API Functions Introduction1014646
+Ref: table-api-std-headers1016482
+Node: General Data Types1020731
+Ref: General Data Types-Footnote-11029361
+Node: Memory Allocation Functions1029660
+Ref: Memory Allocation Functions-Footnote-11034161
+Node: Constructor Functions1034260
+Node: API Ownership of MPFR and GMP Values1037726
+Node: Registration Functions1039039
+Node: Extension Functions1039739
+Node: Exit Callback Functions1045061
+Node: Extension Version String1046311
+Node: Input Parsers1046974
+Node: Output Wrappers1059695
+Node: Two-way processors1064207
+Node: Printing Messages1066472
+Ref: Printing Messages-Footnote-11067643
+Node: Updating ERRNO1067796
+Node: Requesting Values1068535
+Ref: table-value-types-returned1069272
+Node: Accessing Parameters1070208
+Node: Symbol Table Access1071445
+Node: Symbol table by name1071957
+Ref: Symbol table by name-Footnote-11074981
+Node: Symbol table by cookie1075109
+Ref: Symbol table by cookie-Footnote-11079294
+Node: Cached values1079358
+Ref: Cached values-Footnote-11082894
+Node: Array Manipulation1083047
+Ref: Array Manipulation-Footnote-11084138
+Node: Array Data Types1084175
+Ref: Array Data Types-Footnote-11086833
+Node: Array Functions1086925
+Node: Flattening Arrays1091423
+Node: Creating Arrays1098399
+Node: Redirection API1103166
+Node: Extension API Variables1105999
+Node: Extension Versioning1106710
+Ref: gawk-api-version1107139
+Node: Extension GMP/MPFR Versioning1108870
+Node: Extension API Informational Variables1110498
+Node: Extension API Boilerplate1111571
+Node: Changes from API V11115545
+Node: Finding Extensions1117117
+Node: Extension Example1117676
+Node: Internal File Description1118474
+Node: Internal File Ops1122554
+Ref: Internal File Ops-Footnote-11133904
+Node: Using Internal File Ops1134044
+Ref: Using Internal File Ops-Footnote-11136427
+Node: Extension Samples1136701
+Node: Extension Sample File Functions1138230
+Node: Extension Sample Fnmatch1145879
+Node: Extension Sample Fork1147366
+Node: Extension Sample Inplace1148584
+Node: Extension Sample Ord1152209
+Node: Extension Sample Readdir1153045
+Ref: table-readdir-file-types1153934
+Node: Extension Sample Revout1155001
+Node: Extension Sample Rev2way1155590
+Node: Extension Sample Read write array1156330
+Node: Extension Sample Readfile1158272
+Node: Extension Sample Time1159367
+Node: Extension Sample API Tests1161119
+Node: gawkextlib1161611
+Node: Extension summary1164529
+Node: Extension Exercises1168231
+Node: Language History1169473
+Node: V7/SVR3.11171129
+Node: SVR41173281
+Node: POSIX1174715
+Node: BTL1176096
+Node: POSIX/GNU1176825
+Node: Feature History1182603
+Node: Common Extensions1198922
+Node: Ranges and Locales1200205
+Ref: Ranges and Locales-Footnote-11204821
+Ref: Ranges and Locales-Footnote-21204848
+Ref: Ranges and Locales-Footnote-31205083
+Node: Contributors1205306
+Node: History summary1211303
+Node: Installation1212683
+Node: Gawk Distribution1213627
+Node: Getting1214111
+Node: Extracting1215074
+Node: Distribution contents1216712
+Node: Unix Installation1223192
+Node: Quick Installation1223874
+Node: Shell Startup Files1226288
+Node: Additional Configuration Options1227377
+Node: Configuration Philosophy1229692
+Node: Non-Unix Installation1232061
+Node: PC Installation1232521
+Node: PC Binary Installation1233359
+Node: PC Compiling1233794
+Node: PC Using1234911
+Node: Cygwin1238464
+Node: MSYS1239688
+Node: VMS Installation1240290
+Node: VMS Compilation1241081
+Ref: VMS Compilation-Footnote-11242310
+Node: VMS Dynamic Extensions1242368
+Node: VMS Installation Details1244053
+Node: VMS Running1246306
+Node: VMS GNV1250585
+Node: VMS Old Gawk1251320
+Node: Bugs1251791
+Node: Bug address1252454
+Node: Usenet1255436
+Node: Maintainers1256440
+Node: Other Versions1257625
+Node: Installation summary1264713
+Node: Notes1265922
+Node: Compatibility Mode1266716
+Node: Additions1267498
+Node: Accessing The Source1268423
+Node: Adding Code1269860
+Node: New Ports1276079
+Node: Derived Files1280454
+Ref: Derived Files-Footnote-11286114
+Ref: Derived Files-Footnote-21286149
+Ref: Derived Files-Footnote-31286747
+Node: Future Extensions1286861
+Node: Implementation Limitations1287519
+Node: Extension Design1288729
+Node: Old Extension Problems1289873
+Ref: Old Extension Problems-Footnote-11291391
+Node: Extension New Mechanism Goals1291448
+Ref: Extension New Mechanism Goals-Footnote-11294812
+Node: Extension Other Design Decisions1295001
+Node: Extension Future Growth1297114
+Node: Notes summary1297720
+Node: Basic Concepts1298878
+Node: Basic High Level1299559
+Ref: figure-general-flow1299841
+Ref: figure-process-flow1300526
+Ref: Basic High Level-Footnote-11303827
+Node: Basic Data Typing1304012
+Node: Glossary1307340
+Node: Copying1339225
+Node: GNU Free Documentation License1376768
+Node: Index1401888
 
 End Tag Table
 
diff --git a/doc/gawk.texi b/doc/gawk.texi
index 90146b9..4f3f67d 100644
--- a/doc/gawk.texi
+++ b/doc/gawk.texi
@@ -26013,45 +26013,64 @@ so that the rest of the code will work as expected:
 @node Split Program
 @subsection Splitting a Large File into Pieces
 
-@c FIXME: One day, update to current POSIX version of split
-
 @cindex files @subentry splitting
 @cindex @code{split} utility
-The @command{split} program splits large text files into smaller pieces.
-Usage is as follows:@footnote{This is the traditional usage. The
-POSIX usage is different, but not relevant for what the program
-aims to demonstrate.}
+The @command{split} utility splits large text files into smaller pieces.
+The usage follows the POSIX standard for @command{split} and is as follows:
 
 @display
-@command{split} [@code{-@var{count}}] [@var{file}] [@var{prefix}]
+@command{split} [@option{-l} @var{count}] [@option{-a} @var{suffix-len}] 
[@var{file} [@var{outname}]]
+@command{split} @option{-b} @var{N}[@code{k}|@code{m}] [@option{-a} 
@var{suffix-len}] [@var{file} [@var{outname}]]
 @end display
 
-By default,
-the output files are named @file{xaa}, @file{xab}, and so on. Each file has
-1,000 lines in it, with the likely exception of the last file. To change the
-number of lines in each file, supply a number on the command line
-preceded with a minus sign (e.g., @samp{-500} for files with 500 lines in them
-instead of 1,000).  To change the names of the output files to something like
-@file{myfileaa}, @file{myfileab}, and so on, supply an additional
-argument that specifies the @value{FN} prefix.
-
-Here is a version of @command{split} in @command{awk}. It uses the
-@code{ord()} and @code{chr()} functions presented in
-@ref{Ordinal Functions}.
-
-The program first sets its defaults, and then tests to make sure there are
-not too many arguments.  It then looks at each argument in turn.  The
-first argument could be a minus sign followed by a number. If it is, this 
happens
-to look like a negative number, so it is made positive, and that is the
-count of lines.  The @value{DF} name is skipped over and the final argument
-is used as the prefix for the output @value{FN}s:
+By default, the output files are named @file{xaa}, @file{xab}, and so
+on. Each file has 1,000 lines in it, with the likely exception of the
+last file.
+
+The @command{split} program has evolved over time, and the current POSIX
+version is more complicated than the original Unix version.  The options
+and what they do are as follows:
+
+@table @asis
+@item @option{-a} @var{suffix-len}
+Use @var{suffix-len} characters for the suffix. For example, if 
@var{suffix-len}
+is four, the output files would range from @file{xaaaa} to @file{xzzzz}.
+
+@item @option{-b} @var{N}[@code{k}|@code{m}]
+Instead of each file containing a specified number of lines, each file
+should have (at most) @var{N} bytes.  Supplying a trailing @samp{k}
+multiplies @var{N} by 1,024, yielding kilobytes.  Supplying a trailing
+@samp{m} multiplies @var{N} by 1,048,576 (@math{1,024 @value{TIMES} 1,024})
+yielding megabytes.  (This option is mutually exclusive with @option{-l}).
+
+@item @option{-l} @var{count}
+Each file should have at most @var{count} lines, instead of the default
+1,000.  (This option is mutually exclusive with @option{-b}).
+@end table
+
+If supplied, @var{file} is the input file to read. Otherwise standard
+input is processed.  If supplied, @var{outname} is the leading prefix
+to use for @value{FN}s, instead of @samp{x}.
+
+In order to use the @option{-b} option, @command{gawk} should be invoked
+with its @option{-b} option (@pxref{Options}), or with the environment
+variable @env{LC_ALL} set to @samp{C}, so that each input byte is treated
+as a separate character.@footnote{Using @option{-b} twice requires
+separating @command{gawk}'s options from those of the program.  For example:
+@samp{gawk -f getopt.awk -f split.awk -b -- -b 42m large-file.txt split-}.}
+
+Here is an implementation of @command{split} in @command{awk}. It uses the
+@code{getopt()} function presented in @ref{Getopt Function}.
+
+The program begins with a standard descriptive comment and then
+a @code{usage()} function describing the options:
 
 @cindex @code{split.awk} program
 @example
 @c file eg/prog/split.awk
 # split.awk --- do split in awk
 #
-# Requires ord() and chr() library functions
+# Requires getopt() library function.
 @c endfile
 @ignore
 @c file eg/prog/split.awk
@@ -26059,100 +26078,277 @@ is used as the prefix for the output @value{FN}s:
 # Arnold Robbins, arnold@@skeeve.com, Public Domain
 # May 1993
 # Revised slightly, May 2014
+# Rewritten September 2020
 
 @c endfile
 @end ignore
 @c file eg/prog/split.awk
-# usage: split [-count] [file] [outname]
+function usage()
+@{
+    print("usage: split [-l count]  [-a suffix-len] [file [outname]]") > 
"/dev/stderr"
+    print("       split [-b N[k|m]] [-a suffix-len] [file [outname]]") > 
"/dev/stderr"
+    exit 1
+@}
+@c endfile
+@end example
+
+Next, in a @code{BEGIN} rule we set the default values and parse the arguments.
+After that we initialize the data structures used to cycle the suffix
+from @samp{aa@dots{}} to @samp{zz@dots{}}. Finally we set the name of
+the first output file:
 
+@example
+@c file eg/prog/split.awk
 BEGIN @{
-    outfile = "x"    # default
-    count = 1000
-    if (ARGC > 4)
-        usage()
+    # Set defaults:
+    Suffix_length = 2
+    Line_count = 1000
+    Byte_count = 0
+    Outfile = "x"
 
-    i = 1
-    if (i in ARGV && ARGV[i] ~ /^-[[:digit:]]+$/) @{
-        count = -ARGV[i]
-        ARGV[i] = ""
-        i++
+    parse_arguments()
+
+    init_suffix_data()
+
+    Output = (Outfile compute_suffix())
+@}
+@c endfile
+@end example
+
+Parsing the arguments is straightforward.  The program follows our
+convention (@pxref{Library Names}) of having important global variables
+start with an uppercase letter:
+
+@example
+@c file eg/prog/split.awk
+function parse_arguments(   i, c, l, modifier)
+@{
+    while ((c = getopt(ARGC, ARGV, "a:b:l:")) != -1) @{
+        if (c == "a")
+            Suffix_length = Optarg + 0
+        else if (c == "b") @{
+            Byte_count = Optarg + 0
+            Line_count = 0
+
+            l = length(Optarg)
+            modifier = substr(Optarg, l, 1)
+            if (modifier == "k")
+                Byte_count *= 1024
+            else if (modifier == "m")
+                Byte_count *= 1024 * 1024
+        @} else if (c == "l") @{
+            Line_count = Optarg + 0
+            Byte_count = 0
+        @} else
+            usage()
     @}
-    # test argv in case reading from stdin instead of file
-    if (i in ARGV)
-        i++    # skip datafile name
-@group
-    if (i in ARGV) @{
-        outfile = ARGV[i]
+
+    # Clear out options
+    for (i = 1; i < Optind; i++)
         ARGV[i] = ""
+
+    # Check for filename
+    if (ARGV[Optind]) @{
+        Optind++
+
+        # Check for different prefix
+        if (ARGV[Optind]) @{
+            Outfile = ARGV[Optind]
+            ARGV[Optind] = ""
+
+            if (++Optind < ARGC)
+                usage()
+        @}
     @}
-@end group
-@group
-    s1 = s2 = "a"
-    out = (outfile s1 s2)
 @}
-@end group
 @c endfile
 @end example
 
-The next rule does most of the work. @code{tcount} (temporary count) tracks
-how many lines have been printed to the output file so far. If it is greater
-than @code{count}, it is time to close the current file and start a new one.
-@code{s1} and @code{s2} track the current suffixes for the @value{FN}. If
-they are both @samp{z}, the file is just too big.  Otherwise, @code{s1}
-moves to the next letter in the alphabet and @code{s2} starts over again at
-@samp{a}:
+Managing the @value{FN} suffix is interesting.
+Given a suffix of length three, say, the values go from
+@samp{aaa}, @samp{aab}, @samp{aac} and so on, all the way to
+@samp{zzx}, @samp{zzy}, and finally @samp{zzz}.
+There are two important aspects to this:
+
+@itemize @bullet
+@item
+We have to be
+able to easily generate these suffixes, and in particular
+easily handle ``rolling over''; for example, going from
+@samp{abz} to @samp{aca}.
+
+@item
+We have to tell when we've finished with the last file,
+so that if we still have more input data we can print an
+error message and exit. The trick is to handle this @emph{after}
+using the last suffix, and not when the final suffix is created.
+@end itemize
+
+The computation is handled by @code{compute_suffix()}.
+This function is called every time a new file is opened.
+
+The flow here is messy, because we want to generate @samp{zzzz} (say),
+and use it, and only produce an error after all the @value{FN}
+suffixes have been used up. The logical steps are as follows:
+
+@enumerate 1
+@item
+Generate the suffix, saving the value in @code{result} to return.
+To do this, the supplementary array @code{Suffix_ind} contains one
+element for each letter in the suffix.  Each element ranges from 1 to
+26, acting as the index into a string containing all the lowercase
+letters of the English alphabet.
+It is initialized by @code{init_suffix_data()}.
+@code{result} is built up one letter at a time, using @code{substr()}.
+
+@item
+Prepare the data structures for the next time @code{compute_suffix()}
+is called. To do this, we loop over @code{Suffix_ind}, @emph{backwards}.
+If the current element is less than 26, it's incremented and the loop
+breaks (@samp{abq} goes to @samp{abr}). Otherwise, the element is
+reset to one and we move down the list (@samp{abz} to @samp{aca}).
+Thus, the @code{Suffix_ind} array is always ``one step ahead'' of the actual
+@value{FN} suffix to be returned.
+
+@item
+Check if we've gone past the limit of possible filenames.
+If @code{Reached_last} is true, print a message and exit. Otherwise,
+check if @code{Suffix_ind} describes a suffix where all the letters are
+@samp{z}. If that's the case we're about to return the final suffix. If
+so, we set @code{Reached_last} to true so that the @emph{next} call to
+@code{compute_suffix()} will cause a failure.
+@end enumerate
+
+Physically, the steps in the function occur in the order 3, 1, 2:
 
-@c else on separate line here for page breaking
 @example
 @c file eg/prog/split.awk
+function compute_suffix(    i, result, letters)
 @{
-    if (++tcount > count) @{
-        close(out)
-        if (s2 == "z") @{
-            if (s1 == "z") @{
-                printf("split: %s is too large to split\n",
-                       FILENAME) > "/dev/stderr"
-                exit 1
-            @}
-            s1 = chr(ord(s1) + 1)
-            s2 = "a"
-        @}
-@group
-        else
-            s2 = chr(ord(s2) + 1)
-@end group
-        out = (outfile s1 s2)
-        tcount = 1
+    # Logical step 3
+    if (Reached_last) @{
+        printf("split: too many files!\n") > "/dev/stderr"
+        exit 1
+    @} else if (on_last_file())
+        Reached_last = 1    # fail when wrapping after 'zzz'
+
+    # Logical step 1
+    result = ""
+    letters = "abcdefghijklmnopqrstuvwxyz"
+    for (i = 1; i <= Suffix_length; i++)
+        result = result substr(letters, Suffix_ind[i], 1)
+
+    # Logical step 2
+    for (i = Suffix_length; i >= 1; i--) @{
+        if (++Suffix_ind[i] > 26) @{
+            Suffix_ind[i] = 1
+        @} else
+            break
     @}
-    print > out
+
+    return result
 @}
 @c endfile
 @end example
 
-@noindent
-The @code{usage()} function simply prints an error message and exits:
+The @code{Suffix_ind} array and @code{Reached_last} are initialized
+by @code{init_suffix_data()}:
 
 @example
 @c file eg/prog/split.awk
-function usage()
+function init_suffix_data(  i)
 @{
-    print("usage: split [-num] [file] [outname]") > "/dev/stderr"
-    exit 1
+    for (i = 1; i <= Suffix_length; i++)
+        Suffix_ind[i] = 1
+
+    Reached_last = 0
 @}
 @c endfile
 @end example
 
-This program is a bit sloppy; it relies on @command{awk} to automatically 
close the last file
-instead of doing it in an @code{END} rule.
-It also assumes that letters are contiguous in the character set,
-which isn't true for EBCDIC systems.
+The function @code{on_last_file()} returns true if @code{Suffix_ind} describes
+a suffix where all the letters are @samp{z} by checking that all the elements
+in the array are equal to 26:
 
-@ifset FOR_PRINT
-You might want to consider how to eliminate the use of
-@code{ord()} and @code{chr()}; this can be done in such a
-way as to solve the EBCDIC issue as well.
-@end ifset
+@example
+@c file eg/prog/split.awk
+function on_last_file(  i, on_last)
+@{
+    on_last = 1
+    for (i = 1; i <= Suffix_length; i++) @{
+        on_last = on_last && (Suffix_ind[i] == 26)
+    @}
+
+    return on_last
+@}
+@c endfile
+@end example
+
+The actual work of splitting the input file is done by the next two rules.
+Since splitting by line count and splitting by byte count are mutually
+exclusive, we simply use two separate rules, one for when @code{Line_count}
+is greater than zero, and another for when @code{Byte_count} is greater than 
zero.
+
+The variable @code{tcount} counts how many lines have been processed so far.
+When it exceeds @code{Line_count}, it's time to close the previous file and
+switch to a new one:
+
+@example
+@c file eg/prog/split.awk
+Line_count > 0 @{
+    if (++tcount > Line_count) @{
+        close(Output)
+        Output = (Outfile compute_suffix())
+        tcount = 1
+    @}
+    print > Output
+@}
+@c endfile
+@end example
+
+The rule for handling bytes is more complicated.  Since lines most likely
+vary in length, the @code{Byte_count} boundary may be hit in the middle of
+an input record.  In that case, @command{split} has to write enough of the
+first bytes of the input record to finish up @code{Byte_count} bytes, close
+the file, open a new file, and write the rest of the record to the new file.
+The logic here does all that:
+
+@example
+@c file eg/prog/split.awk
+Byte_count > 0 @{
+    # `+ 1' is for the final newline
+    if (tcount + length($0) + 1 > Byte_count) @{ # would overflow
+        # compute leading bytes
+        leading_bytes = Byte_count - tcount
+
+        # write leading bytes
+        printf("%s", substr($0, 1, leading_bytes)) > Output
+
+        # close old file, open new file
+        close(Output)
+        Output = (Outfile compute_suffix())
+
+        # set up first bytes for new file
+        $0 = substr($0, leading_bytes + 1)  # trailing bytes
+        tcount = 0
+    @}
+
+    # write full record or trailing bytes
+    tcount += length($0) + 1
+    print > Output
+@}
+@c endfile
+@end example
 
+Finally, the @code{END} rule cleans up by closing the last output file:
+
+@example
+@c file eg/prog/split.awk
+END @{
+    close(Output)
+@}
+@c endfile
+@end example
 
 @node Tee Program
 @subsection Duplicating Output into Multiple Files
diff --git a/doc/gawktexi.in b/doc/gawktexi.in
index ae1d0bc..f77d071 100644
--- a/doc/gawktexi.in
+++ b/doc/gawktexi.in
@@ -25023,45 +25023,64 @@ so that the rest of the code will work as expected:
 @node Split Program
 @subsection Splitting a Large File into Pieces
 
-@c FIXME: One day, update to current POSIX version of split
-
 @cindex files @subentry splitting
 @cindex @code{split} utility
-The @command{split} program splits large text files into smaller pieces.
-Usage is as follows:@footnote{This is the traditional usage. The
-POSIX usage is different, but not relevant for what the program
-aims to demonstrate.}
+The @command{split} utility splits large text files into smaller pieces.
+The usage follows the POSIX standard for @command{split} and is as follows:
 
 @display
-@command{split} [@code{-@var{count}}] [@var{file}] [@var{prefix}]
+@command{split} [@option{-l} @var{count}] [@option{-a} @var{suffix-len}] 
[@var{file} [@var{outname}]]
+@command{split} @option{-b} @var{N}[@code{k}|@code{m}] [@option{-a} 
@var{suffix-len}] [@var{file} [@var{outname}]]
 @end display
 
-By default,
-the output files are named @file{xaa}, @file{xab}, and so on. Each file has
-1,000 lines in it, with the likely exception of the last file. To change the
-number of lines in each file, supply a number on the command line
-preceded with a minus sign (e.g., @samp{-500} for files with 500 lines in them
-instead of 1,000).  To change the names of the output files to something like
-@file{myfileaa}, @file{myfileab}, and so on, supply an additional
-argument that specifies the @value{FN} prefix.
-
-Here is a version of @command{split} in @command{awk}. It uses the
-@code{ord()} and @code{chr()} functions presented in
-@ref{Ordinal Functions}.
-
-The program first sets its defaults, and then tests to make sure there are
-not too many arguments.  It then looks at each argument in turn.  The
-first argument could be a minus sign followed by a number. If it is, this 
happens
-to look like a negative number, so it is made positive, and that is the
-count of lines.  The @value{DF} name is skipped over and the final argument
-is used as the prefix for the output @value{FN}s:
+By default, the output files are named @file{xaa}, @file{xab}, and so
+on. Each file has 1,000 lines in it, with the likely exception of the
+last file.
+
+The @command{split} program has evolved over time, and the current POSIX
+version is more complicated than the original Unix version.  The options
+and what they do are as follows:
+
+@table @asis
+@item @option{-a} @var{suffix-len}
+Use @var{suffix-len} characters for the suffix. For example, if 
@var{suffix-len}
+is four, the output files would range from @file{xaaaa} to @file{xzzzz}.
+
+@item @option{-b} @var{N}[@code{k}|@code{m}]
+Instead of each file containing a specified number of lines, each file
+should have (at most) @var{N} bytes.  Supplying a trailing @samp{k}
+multiplies @var{N} by 1,024, yielding kilobytes.  Supplying a trailing
+@samp{m} multiplies @var{N} by 1,048,576 (@math{1,024 @value{TIMES} 1,024})
+yielding megabytes.  (This option is mutually exclusive with @option{-l}).
+
+@item @option{-l} @var{count}
+Each file should have at most @var{count} lines, instead of the default
+1,000.  (This option is mutually exclusive with @option{-b}).
+@end table
+
+If supplied, @var{file} is the input file to read. Otherwise standard
+input is processed.  If supplied, @var{outname} is the leading prefix
+to use for @value{FN}s, instead of @samp{x}.
+
+In order to use the @option{-b} option, @command{gawk} should be invoked
+with its @option{-b} option (@pxref{Options}), or with the environment
+variable @env{LC_ALL} set to @samp{C}, so that each input byte is treated
+as a separate character.@footnote{Using @option{-b} twice requires
+separating @command{gawk}'s options from those of the program.  For example:
+@samp{gawk -f getopt.awk -f split.awk -b -- -b 42m large-file.txt split-}.}
+
+Here is an implementation of @command{split} in @command{awk}. It uses the
+@code{getopt()} function presented in @ref{Getopt Function}.
+
+The program begins with a standard descriptive comment and then
+a @code{usage()} function describing the options:
 
 @cindex @code{split.awk} program
 @example
 @c file eg/prog/split.awk
 # split.awk --- do split in awk
 #
-# Requires ord() and chr() library functions
+# Requires getopt() library function.
 @c endfile
 @ignore
 @c file eg/prog/split.awk
@@ -25069,100 +25088,277 @@ is used as the prefix for the output @value{FN}s:
 # Arnold Robbins, arnold@@skeeve.com, Public Domain
 # May 1993
 # Revised slightly, May 2014
+# Rewritten September 2020
 
 @c endfile
 @end ignore
 @c file eg/prog/split.awk
-# usage: split [-count] [file] [outname]
+function usage()
+@{
+    print("usage: split [-l count]  [-a suffix-len] [file [outname]]") > 
"/dev/stderr"
+    print("       split [-b N[k|m]] [-a suffix-len] [file [outname]]") > 
"/dev/stderr"
+    exit 1
+@}
+@c endfile
+@end example
+
+Next, in a @code{BEGIN} rule we set the default values and parse the arguments.
+After that we initialize the data structures used to cycle the suffix
+from @samp{aa@dots{}} to @samp{zz@dots{}}. Finally we set the name of
+the first output file:
 
+@example
+@c file eg/prog/split.awk
 BEGIN @{
-    outfile = "x"    # default
-    count = 1000
-    if (ARGC > 4)
-        usage()
+    # Set defaults:
+    Suffix_length = 2
+    Line_count = 1000
+    Byte_count = 0
+    Outfile = "x"
 
-    i = 1
-    if (i in ARGV && ARGV[i] ~ /^-[[:digit:]]+$/) @{
-        count = -ARGV[i]
-        ARGV[i] = ""
-        i++
+    parse_arguments()
+
+    init_suffix_data()
+
+    Output = (Outfile compute_suffix())
+@}
+@c endfile
+@end example
+
+Parsing the arguments is straightforward.  The program follows our
+convention (@pxref{Library Names}) of having important global variables
+start with an uppercase letter:
+
+@example
+@c file eg/prog/split.awk
+function parse_arguments(   i, c, l, modifier)
+@{
+    while ((c = getopt(ARGC, ARGV, "a:b:l:")) != -1) @{
+        if (c == "a")
+            Suffix_length = Optarg + 0
+        else if (c == "b") @{
+            Byte_count = Optarg + 0
+            Line_count = 0
+
+            l = length(Optarg)
+            modifier = substr(Optarg, l, 1)
+            if (modifier == "k")
+                Byte_count *= 1024
+            else if (modifier == "m")
+                Byte_count *= 1024 * 1024
+        @} else if (c == "l") @{
+            Line_count = Optarg + 0
+            Byte_count = 0
+        @} else
+            usage()
     @}
-    # test argv in case reading from stdin instead of file
-    if (i in ARGV)
-        i++    # skip datafile name
-@group
-    if (i in ARGV) @{
-        outfile = ARGV[i]
+
+    # Clear out options
+    for (i = 1; i < Optind; i++)
         ARGV[i] = ""
+
+    # Check for filename
+    if (ARGV[Optind]) @{
+        Optind++
+
+        # Check for different prefix
+        if (ARGV[Optind]) @{
+            Outfile = ARGV[Optind]
+            ARGV[Optind] = ""
+
+            if (++Optind < ARGC)
+                usage()
+        @}
     @}
-@end group
-@group
-    s1 = s2 = "a"
-    out = (outfile s1 s2)
 @}
-@end group
 @c endfile
 @end example
 
-The next rule does most of the work. @code{tcount} (temporary count) tracks
-how many lines have been printed to the output file so far. If it is greater
-than @code{count}, it is time to close the current file and start a new one.
-@code{s1} and @code{s2} track the current suffixes for the @value{FN}. If
-they are both @samp{z}, the file is just too big.  Otherwise, @code{s1}
-moves to the next letter in the alphabet and @code{s2} starts over again at
-@samp{a}:
+Managing the @value{FN} suffix is interesting.
+Given a suffix of length three, say, the values go from
+@samp{aaa}, @samp{aab}, @samp{aac} and so on, all the way to
+@samp{zzx}, @samp{zzy}, and finally @samp{zzz}.
+There are two important aspects to this:
+
+@itemize @bullet
+@item
+We have to be
+able to easily generate these suffixes, and in particular
+easily handle ``rolling over''; for example, going from
+@samp{abz} to @samp{aca}.
+
+@item
+We have to tell when we've finished with the last file,
+so that if we still have more input data we can print an
+error message and exit. The trick is to handle this @emph{after}
+using the last suffix, and not when the final suffix is created.
+@end itemize
+
+The computation is handled by @code{compute_suffix()}.
+This function is called every time a new file is opened.
+
+The flow here is messy, because we want to generate @samp{zzzz} (say),
+and use it, and only produce an error after all the @value{FN}
+suffixes have been used up. The logical steps are as follows:
+
+@enumerate 1
+@item
+Generate the suffix, saving the value in @code{result} to return.
+To do this, the supplementary array @code{Suffix_ind} contains one
+element for each letter in the suffix.  Each element ranges from 1 to
+26, acting as the index into a string containing all the lowercase
+letters of the English alphabet.
+It is initialized by @code{init_suffix_data()}.
+@code{result} is built up one letter at a time, using @code{substr()}.
+
+@item
+Prepare the data structures for the next time @code{compute_suffix()}
+is called. To do this, we loop over @code{Suffix_ind}, @emph{backwards}.
+If the current element is less than 26, it's incremented and the loop
+breaks (@samp{abq} goes to @samp{abr}). Otherwise, the element is
+reset to one and we move down the list (@samp{abz} to @samp{aca}).
+Thus, the @code{Suffix_ind} array is always ``one step ahead'' of the actual
+@value{FN} suffix to be returned.
+
+@item
+Check if we've gone past the limit of possible filenames.
+If @code{Reached_last} is true, print a message and exit. Otherwise,
+check if @code{Suffix_ind} describes a suffix where all the letters are
+@samp{z}. If that's the case we're about to return the final suffix. If
+so, we set @code{Reached_last} to true so that the @emph{next} call to
+@code{compute_suffix()} will cause a failure.
+@end enumerate
+
+Physically, the steps in the function occur in the order 3, 1, 2:
 
-@c else on separate line here for page breaking
 @example
 @c file eg/prog/split.awk
+function compute_suffix(    i, result, letters)
 @{
-    if (++tcount > count) @{
-        close(out)
-        if (s2 == "z") @{
-            if (s1 == "z") @{
-                printf("split: %s is too large to split\n",
-                       FILENAME) > "/dev/stderr"
-                exit 1
-            @}
-            s1 = chr(ord(s1) + 1)
-            s2 = "a"
-        @}
-@group
-        else
-            s2 = chr(ord(s2) + 1)
-@end group
-        out = (outfile s1 s2)
-        tcount = 1
+    # Logical step 3
+    if (Reached_last) @{
+        printf("split: too many files!\n") > "/dev/stderr"
+        exit 1
+    @} else if (on_last_file())
+        Reached_last = 1    # fail when wrapping after 'zzz'
+
+    # Logical step 1
+    result = ""
+    letters = "abcdefghijklmnopqrstuvwxyz"
+    for (i = 1; i <= Suffix_length; i++)
+        result = result substr(letters, Suffix_ind[i], 1)
+
+    # Logical step 2
+    for (i = Suffix_length; i >= 1; i--) @{
+        if (++Suffix_ind[i] > 26) @{
+            Suffix_ind[i] = 1
+        @} else
+            break
     @}
-    print > out
+
+    return result
 @}
 @c endfile
 @end example
 
-@noindent
-The @code{usage()} function simply prints an error message and exits:
+The @code{Suffix_ind} array and @code{Reached_last} are initialized
+by @code{init_suffix_data()}:
 
 @example
 @c file eg/prog/split.awk
-function usage()
+function init_suffix_data(  i)
 @{
-    print("usage: split [-num] [file] [outname]") > "/dev/stderr"
-    exit 1
+    for (i = 1; i <= Suffix_length; i++)
+        Suffix_ind[i] = 1
+
+    Reached_last = 0
 @}
 @c endfile
 @end example
 
-This program is a bit sloppy; it relies on @command{awk} to automatically close the last file
-instead of doing it in an @code{END} rule.
-It also assumes that letters are contiguous in the character set,
-which isn't true for EBCDIC systems.
+The function @code{on_last_file()} returns true if @code{Suffix_ind} describes
+a suffix where all the letters are @samp{z} by checking that all the elements
+in the array are equal to 26:
 
-@ifset FOR_PRINT
-You might want to consider how to eliminate the use of
-@code{ord()} and @code{chr()}; this can be done in such a
-way as to solve the EBCDIC issue as well.
-@end ifset
+@example
+@c file eg/prog/split.awk
+function on_last_file(  i, on_last)
+@{
+    on_last = 1
+    for (i = 1; i <= Suffix_length; i++) @{
+        on_last = on_last && (Suffix_ind[i] == 26)
+    @}
+
+    return on_last
+@}
+@c endfile
+@end example
+
+The actual work of splitting the input file is done by the next two rules.
+Since splitting by line count and splitting by byte count are mutually
+exclusive, we simply use two separate rules, one for when @code{Line_count}
+is greater than zero, and another for when @code{Byte_count} is greater than zero.
+
+The variable @code{tcount} counts how many lines have been processed so far.
+When it exceeds @code{Line_count}, it's time to close the previous file and
+switch to a new one:
+
+@example
+@c file eg/prog/split.awk
+Line_count > 0 @{
+    if (++tcount > Line_count) @{
+        close(Output)
+        Output = (Outfile compute_suffix())
+        tcount = 1
+    @}
+    print > Output
+@}
+@c endfile
+@end example
+
+The rule for handling bytes is more complicated.  Since lines most likely
+vary in length, the @code{Byte_count} boundary may be hit in the middle of
+an input record.  In that case, @command{split} has to write enough of the
+first bytes of the input record to finish up @code{Byte_count} bytes, close
+the file, open a new file, and write the rest of the record to the new file.
+The logic here does all that:
+
+@example
+@c file eg/prog/split.awk
+Byte_count > 0 @{
+    # `+ 1' is for the final newline
+    if (tcount + length($0) + 1 > Byte_count) @{ # would overflow
+        # compute leading bytes
+        leading_bytes = Byte_count - tcount
+
+        # write leading bytes
+        printf("%s", substr($0, 1, leading_bytes)) > Output
+
+        # close old file, open new file
+        close(Output)
+        Output = (Outfile compute_suffix())
+
+        # set up first bytes for new file
+        $0 = substr($0, leading_bytes + 1)  # trailing bytes
+        tcount = 0
+    @}
+
+    # write full record or trailing bytes
+    tcount += length($0) + 1
+    print > Output
+@}
+@c endfile
+@end example
 
+Finally, the @code{END} rule cleans up by closing the last output file:
+
+@example
+@c file eg/prog/split.awk
+END @{
+    close(Output)
+@}
+@c endfile
+@end example
 
 @node Tee Program
 @subsection Duplicating Output into Multiple Files
diff --git a/doc/wordlist b/doc/wordlist
index d3aea0b..40f55da 100644
--- a/doc/wordlist
+++ b/doc/wordlist
@@ -358,6 +358,7 @@ Oram
 Ord
 Ormos
 Ou
+Outfile
 Oy
 PASSWD
 PATCHLEVEL
@@ -517,11 +518,13 @@ aB
 aCa
 aIt
 aSAgbfocFVtguG
+aa
 aaa
 aaaabcd
 aab
 aabbb
 aabbbccccddd
+aac
 aaccdd
 ab
 abCDEf
@@ -534,9 +537,13 @@ abcdefg
 abcdefghi
 abcdefghijklmnopqrstuvwxyz
 abcdxyz
+abq
+abr
 abs
 abx
+abz
 ac
+aca
 acbfoo
 aclocal
 addrs
@@ -1768,6 +1775,7 @@ xDeadBeef
 xFOO
 xX
 xaa
+xaaaa
 xab
 xbd
 xdeadBEEF
@@ -1796,6 +1804,7 @@ xxx
 xyz
 xyzzy
 xz
+xzzzz
 ya
 yabber
 yballs
@@ -1809,5 +1818,9 @@ zbcom
 zerofile
 zodiacusque
 zsh
+zz
+zzx
+zzy
 zzz
+zzzz
 zzzzzz

-----------------------------------------------------------------------

Summary of changes:
 awklib/eg/prog/split.awk | 164 ++++++---
 doc/ChangeLog            |   6 +
 doc/gawk.info            | 848 ++++++++++++++++++++++++++++-------------------
 doc/gawk.texi            | 372 ++++++++++++++++-----
 doc/gawktexi.in          | 372 ++++++++++++++++-----
 doc/wordlist             |  13 +
 6 files changed, 1225 insertions(+), 550 deletions(-)


hooks/post-receive
-- 
gawk



reply via email to

[Prev in Thread] Current Thread [Next in Thread]