bug-coreutils
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH] split: --chunks option


From: Chen Guo
Subject: Re: [PATCH] split: --chunks option
Date: Wed, 16 Dec 2009 06:21:36 -0800 (PST)

Hi all,

> Pádraig Brady  writes:

> 
> > For reference bash and ksh support ansi c quoting like $'\0'
> > so you could specify -t $'\0'.

Since this works only in bash and ksh, I'm guessing it'd be best to write
our own parsing, to support something like -t\n or -t\0 for shells that
don't support ANSI C quoting?

At any rate, I figure it's about time I shared what I have so far, in
case you guys want to look at it or play around with it. This is
everything, minus parsing issues (i.e. -t is implemented, but depends
on shell parsing) and minus handling of non-seekable files.

From 8296bb34e51b93a339c51cdf16160cf3e9753d2b Mon Sep 17 00:00:00 2001
From: Chen Guo <address@hidden>
Date: Wed, 16 Dec 2009 15:07:38 +0100
Subject: [PATCH] Split: extend --bytes and --lines to divide file into N roughly
equal pieces, or to extract Kth of N said pieces
Add --number option (equivalent to --bytes=/N and --bytes=K/N)
Add -t option to specify delineation character

---
src/split.c |  285 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
1 files changed, 275 insertions(+), 10 deletions(-)

diff --git a/src/split.c b/src/split.c
index d1a0e0d..98a4e0d 100644
--- a/src/split.c
+++ b/src/split.c
@@ -72,6 +72,9 @@ static int output_desc;
    output file is opened. */
static bool verbose;

+/* End of line character */
+static char eol;
+
/* For long options that have no equivalent short option, use a
    non-character as a pseudo short option, starting with CHAR_MAX + 1.  */
enum
@@ -81,9 +84,11 @@ enum

static struct option const longopts[] =
{
+  {"break", required_argument, NULL, 't'},
   {"bytes", required_argument, NULL, 'b'},
   {"lines", required_argument, NULL, 'l'},
   {"line-bytes", required_argument, NULL, 'C'},
+  {"number", required_argument, NULL, 'n'},
   {"suffix-length", required_argument, NULL, 'a'},
   {"numeric-suffixes", no_argument, NULL, 'd'},
   {"verbose", no_argument, NULL, VERBOSE_OPTION},
@@ -116,9 +121,18 @@ Mandatory arguments to long options are mandatory for short options too.\n\
       fprintf (stdout, _("\
   -a, --suffix-length=N   use suffixes of length N (default %d)\n\
   -b, --bytes=SIZE        put SIZE bytes per output file\n\
+  -b, --bytes=/N          generate N output files\n\
+  -b, --bytes=K/N         print Kth of N chunks of file\n\
   -C, --line-bytes=SIZE   put at most SIZE bytes of lines per output file\n\
   -d, --numeric-suffixes  use numeric suffixes instead of alphabetic\n\
   -l, --lines=NUMBER      put NUMBER lines per output file\n\
+  -l, --lines=/N          generate N newline delineated output files\n\
+  -l, --lines=K/N         print Kth of N newline delineated chunks\n\
+  -n, --number=N          same as --bytes=/N\n\
+  -n, --number=K/N        same as --bytes=K/N\n\
+  -t                      specify delineation character. This will also\n\
+                            convert options such as -b to their delineated\n\
+                            equivalent (-l or -C, depending on context)\n\
"), DEFAULT_SUFFIX_LENGTH);
       fputs (_("\
       --verbose           print a diagnostic just before each\n\
@@ -218,13 +232,14 @@ cwrite (bool new_file_flag, const char *bp, size_t bytes)
    Use buffer BUF, whose size is BUFSIZE.  */

static void
-bytes_split (uintmax_t n_bytes, char *buf, size_t bufsize)
+bytes_split (uintmax_t n_bytes, char *buf, size_t bufsize, uintmax_t max_files)
{
   size_t n_read;
   bool new_file_flag = true;
   size_t to_read;
   uintmax_t to_write = n_bytes;
   char *bp_out;
+  uintmax_t opened = 1;

   do
     {
@@ -251,7 +266,8 @@ bytes_split (uintmax_t n_bytes, char *buf, size_t bufsize)
               cwrite (new_file_flag, bp_out, w);
               bp_out += w;
               to_read -= w;
-              new_file_flag = true;
+              new_file_flag = (opened++ < max_files || !max_files)?
+                              true : false;
               to_write = n_bytes;
             }
         }
@@ -277,10 +293,10 @@ lines_split (uintmax_t n_lines, char *buf, size_t bufsize)
         error (EXIT_FAILURE, errno, "%s", infile);
       bp = bp_out = buf;
       eob = bp + n_read;
-      *eob = '\n';
+      *eob = eol;
       for (;;)
         {
-          bp = memchr (bp, '\n', eob - bp + 1);
+          bp = memchr (bp, eol, eob - bp + 1);
           if (bp == eob)
             {
               if (eob != bp_out) /* do not write 0 bytes! */
@@ -323,7 +339,6 @@ line_bytes_split (size_t n_bytes)
   do
     {
       /* Fill up the full buffer size from the input file.  */
-
       n_read = full_read (STDIN_FILENO, buf + n_buffered, n_bytes - n_buffered);
       if (n_read == SAFE_READ_ERROR)
         error (EXIT_FAILURE, errno, "%s", infile);
@@ -340,7 +355,7 @@ line_bytes_split (size_t n_bytes)
       bp = buf + n_buffered;
       if (n_buffered == n_bytes)
         {
-          while (bp > buf && bp[-1] != '\n')
+          while (bp > buf && bp[-1] != eol)
             bp--;
         }

@@ -362,6 +377,164 @@ line_bytes_split (size_t n_bytes)
   free (buf);
}

+/* Split into NUMBER newline delineated chunks. */
+
+static void
+line_chunk_split (uintmax_t number, char *buf, size_t bufsize, size_t file_size)
+{
+  size_t n_read;
+  uintmax_t chunk_no = 1;
+  off_t chunk_end = file_size / number - 1;
+  off_t offset = 0;
+  bool new_file_flag = true;
+  char *bp, *bp_out, *eob;
+
+  while (offset < file_size)
+    {
+      bp = bp_out = buf;
+      n_read = full_read (STDIN_FILENO, buf, bufsize);
+      if (n_read == SAFE_READ_ERROR)
+        error (EXIT_FAILURE, errno, "%s", infile);
+      eob = buf + n_read;
+
+      while (1)
+        {
+          char *seek = (offset < chunk_end)? bp + chunk_end - offset : bp;
+          if (seek > eob)
+            seek = eob;
+          bp_out = memchr (seek, eol, eob - seek);
+          if (!bp_out)
+            {
+              cwrite (new_file_flag, bp, eob - bp);
+              new_file_flag = false;
+              offset += eob - bp;
+              break;
+           }
+          else
+            bp_out++;
+
+          cwrite (new_file_flag, bp, bp_out - bp);
+          chunk_end = (++chunk_no < number)?
+                  chunk_end + file_size / number : file_size;
+      new_file_flag = true;
+          offset += bp_out - bp;
+          bp = bp_out;
+          /* A line could have been so long that it skipped
+             entire chunks. */
+          while (chunk_end < offset)
+            {
+              chunk_end += file_size / number;
+              chunk_no++;
+              /* Create blank file: this ensures NUMBER files are
+                 created. */
+              cwrite (true, bp, 0);
+            }
+        }
+    }
+}
+
+/* If file is seekable, extract Nth of TOTAL chunks.
+   FIXME: Handle non-seekable files.  */
+
+static void
+byte_chunk_extract (uintmax_t n, uintmax_t total, char *buf, size_t bufsize,
+                    size_t file_size)
+{
+  off_t start = (n == 0)? 0 : (n - 1) * (file_size / total);
+  off_t end = (n == total)? file_size : n * (file_size / total);
+  ssize_t n_read;
+  size_t n_write;
+
+  while (1)
+    {
+      n_read = pread (STDIN_FILENO, buf, bufsize, start);
+      if (n_read < 0)
+        error (EXIT_FAILURE, errno, "%s", infile);
+      n_write = (start + n_read <= end)? n_read : end - start;
+      if (write (STDOUT_FILENO, buf, n_write) != n_write)
+    error (EXIT_FAILURE, errno, "output error");
+      start += n_read;
+      if (end <= start)
+        return;
+    }
+}
+
+/* If file is seekable, extract lines whose first byte begins in the Nth of
+   TOTAL chunks.
+   FIXME: Handle non-seekable files.  */
+
+static void
+line_chunk_extract (uintmax_t n, uintmax_t total, char* buf, size_t bufsize,
+                    size_t file_size)
+{
+  ssize_t n_read;
+  bool end_of_chunk = false;
+  bool skip = true;
+  char *bp = buf, *bp_out = buf, *eob;
+  off_t start;
+  off_t end;
+
+   /* For n != 1, start reading 1 byte before nth chunk of file. This is to
+     detect if the first byte of chunk is the first byte of a line. */
+   if (n == 1)
+    {
+      start = 0;
+      skip = false;
+    }
+  else
+    start = (n - 1) * (file_size / total) - 1;
+  end = (n == total)? file_size - 1 : n * (file_size / total) - 1;
+
+  while (1)
+    {
+      n_read = pread (STDIN_FILENO, buf, bufsize, start);
+      if (n_read < 0)
+        error (EXIT_FAILURE, errno, "%s", infile);
+      bp = buf;
+      bp_out = buf + n_read;
+      eob = bp_out;
+
+      /* Unless n == 1, skip past the first eol character
+         encountered. */
+      if (skip)
+        {
+          bp = memchr (buf, eol, n_read);
+          if (bp)
+            {
+              if (bp - buf >= end - start)
+                return;
+              bp++;
+              skip = false;
+            }
+          else
+            if (start + n_read < end)
+              {
+                start += n_read;
+                continue;
+              }
+            else
+              return;
+        }
+      if (start + n_read >= end && end == file_size -1)
+        end_of_chunk = true;
+      else if (start + n_read >= end)
+        {
+          char *base = (buf + end - start < buf)? buf : buf + end - start;
+          bp_out = memchr (base, eol, eob - base);
+          if (bp_out)
+            {
+              bp_out++;
+              end_of_chunk = true;
+            }
+        }
+      start += n_read;
+      if (write (STDOUT_FILENO, bp, bp_out - bp) != bp_out - bp)
+        error (EXIT_FAILURE, errno, "output error");
+      if (end_of_chunk)
+        return;
+    }
+}
+
#define FAIL_ONLY_ONE_WAY()                    \
   do                                \
     {                                \
@@ -370,21 +543,47 @@ line_bytes_split (size_t n_bytes)
     }                                \
   while (0)

+/* Parse K/N syntax of chunk options. */
+
+static inline void
+chunk_parse (uintmax_t *m_units, uintmax_t *n_units, char *slash)
+{
+  *slash = '\0';
+  if (slash != optarg
+      && xstrtoumax (optarg, NULL, 10, m_units, "") != LONGINT_OK
+      || SIZE_MAX < *m_units)
+    {
+      error (0, 0, _("%s: invalid chunk number"), optarg);
+      usage (EXIT_FAILURE);
+    }
+  if (xstrtoumax (++slash, NULL, 10, n_units, "") != LONGINT_OK
+      || *n_units == 0 || *n_units < *m_units || SIZE_MAX < *n_units)
+    {
+      error (0, 0, _("%s: invalid number of total chunks"), slash);
+      usage (EXIT_FAILURE);
+    }
+}
+
int
main (int argc, char **argv)
{
   struct stat stat_buf;
   enum
     {
-      type_undef, type_bytes, type_byteslines, type_lines, type_digits
+      type_undef, type_bytes, type_byteslines, type_lines, type_digits,
+      type_chunk_bytes, type_chunk_eol
     } split_type = type_undef;
   size_t in_blk_size;        /* optimal block size of input file device */
   char *buf;            /* file i/o buffer */
   size_t page_size = getpagesize ();
+  uintmax_t m_units = 0;
   uintmax_t n_units;
   static char const multipliers[] = "bEGKkMmPTYZ0";
   int c;
   int digits_optind = 0;
+  size_t file_size;
+  char *slash;
+  bool eol_char = false;

   initialize_main (&argc, &argv);
   set_program_name (argv[0]);
@@ -404,7 +603,7 @@ main (int argc, char **argv)
       /* This is the argv-index of the option we will read next.  */
       int this_optind = optind ? optind : 1;

-      c = getopt_long (argc, argv, "0123456789C:a:b:dl:", longopts, NULL);
+      c = getopt_long (argc, argv, "0123456789C:a:b:c:dl:n:t:", longopts, NULL);
       if (c == -1)
         break;

@@ -426,6 +625,13 @@ main (int argc, char **argv)
         case 'b':
           if (split_type != type_undef)
             FAIL_ONLY_ONE_WAY ();
+          slash = strchr (optarg, '/');
+          if (slash)
+            {
+              split_type = type_chunk_bytes;
+              chunk_parse (&m_units, &n_units, slash);
+              break;
+            }
           split_type = type_bytes;
           if (xstrtoumax (optarg, NULL, 10, &n_units, multipliers) != LONGINT_OK
               || n_units == 0)
@@ -438,6 +644,13 @@ main (int argc, char **argv)
         case 'l':
           if (split_type != type_undef)
             FAIL_ONLY_ONE_WAY ();
+          slash = strchr (optarg, '/');
+          if (slash)
+            {
+              split_type = type_chunk_eol;
+              chunk_parse (&m_units, &n_units, slash);
+              break;
+            }
           split_type = type_lines;
           if (xstrtoumax (optarg, NULL, 10, &n_units, "") != LONGINT_OK
               || n_units == 0)
@@ -447,6 +660,24 @@ main (int argc, char **argv)
             }
           break;

+        case 'n':
+          if (split_type != type_undef)
+            FAIL_ONLY_ONE_WAY ();
+          split_type = type_chunk_bytes;
+          slash = strchr (optarg, '/');
+          if (slash)
+            {
+              chunk_parse (&m_units, &n_units, slash);
+              break;
+            }
+          if (xstrtoumax (optarg, NULL, 10, &n_units, "") != LONGINT_OK
+              || n_units == 0 || SIZE_MAX < n_units)
+            {
+              error (0, 0, _("%s: invalid number of total chunks"), optarg);
+              usage (EXIT_FAILURE);
+            }
+          break;
+
         case 'C':
           if (split_type != type_undef)
             FAIL_ONLY_ONE_WAY ();
@@ -492,6 +723,11 @@ main (int argc, char **argv)
           suffix_alphabet = "0123456789";
           break;

+        case 't':
+          eol = *optarg;
+          eol_char = true;
+          break;
+
         case VERBOSE_OPTION:
           verbose = true;
           break;
@@ -505,6 +741,17 @@ main (int argc, char **argv)
         }
     }

+  /* Default eol to \n if none specified. */
+  if (!eol_char)
+    eol = '\n';
+  else
+    {
+      if (split_type == type_chunk_bytes)
+        split_type = type_chunk_eol;
+      if (split_type == type_bytes)
+        split_type = type_byteslines;
+    }
+
   /* Handle default case.  */
   if (split_type == type_undef)
     {
@@ -546,10 +793,14 @@ main (int argc, char **argv)
   output_desc = -1;

   /* Get the optimal block size of input device and make a buffer.  */
-
   if (fstat (STDIN_FILENO, &stat_buf) != 0)
     error (EXIT_FAILURE, errno, "%s", infile);
   in_blk_size = io_blksize (stat_buf);
+  file_size = stat_buf.st_size;
+
+  if (split_type == type_chunk_bytes || split_type == type_chunk_eol)
+    if (file_size < n_units)
+      error (EXIT_FAILURE, errno, "number of chunks exceed file size");

   buf = ptr_align (xmalloc (in_blk_size + 1 + page_size - 1), page_size);

@@ -561,13 +812,27 @@ main (int argc, char **argv)
       break;

     case type_bytes:
-      bytes_split (n_units, buf, in_blk_size);
+      bytes_split (n_units, buf, in_blk_size, 0);
       break;

     case type_byteslines:
       line_bytes_split (n_units);
       break;

+    case type_chunk_bytes:
+      if (m_units == 0)
+        bytes_split (file_size / n_units, buf, in_blk_size, n_units);
+      else
+        byte_chunk_extract (m_units, n_units, buf, in_blk_size, file_size);
+      break;
+
+    case type_chunk_eol:
+      if (m_units == 0)
+        line_chunk_split (n_units, buf, in_blk_size, file_size);
+      else
+        line_chunk_extract (m_units, n_units, buf, in_blk_size, file_size);
+      break;
+
     default:
       abort ();
     }
-- 
1.6.3.3




reply via email to

[Prev in Thread] Current Thread [Next in Thread]