[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [PATCH] split: --chunks option
From: |
Chen Guo |
Subject: |
Re: [PATCH] split: --chunks option |
Date: |
Wed, 16 Dec 2009 06:21:36 -0800 (PST) |
Hi all,
> Pádraig Brady writes:
>
> > For reference bash and ksh support ansi c quoting like $'\0'
> > so you could specify -t $'\0'.
Since this is only bash and ksh, I'm guessing it'd be best to write our
own parsing, to support something like -t\n or -t\0 for shells that don't
support ansi C quoting?
At any rate, I figure it's about time I shared what I have so far, in
case you guys want to look at it or play around with it. This is
everything, minus parsing issues (i.e. -t is implemented, but depends
on shell parsing) and minus handling of non-seekable files.
From 8296bb34e51b93a339c51cdf16160cf3e9753d2b Mon Sep 17 00:00:00 2001
From: Chen Guo <address@hidden>
Date: Wed, 16 Dec 2009 15:07:38 +0100
Subject: [PATCH] Split: extend --bytes and --lines to divide file into N roughly
equal pieces, or to extract Kth of N said pieces
Add --number option (equivalent to --bytes=/N and --bytes=K/N).
Add -t option to specify delineation character.
---
src/split.c | 285 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
1 files changed, 275 insertions(+), 10 deletions(-)
diff --git a/src/split.c b/src/split.c
index d1a0e0d..98a4e0d 100644
--- a/src/split.c
+++ b/src/split.c
@@ -72,6 +72,9 @@ static int output_desc;
output file is opened. */
static bool verbose;
+/* End of line character */
+static char eol;
+
/* For long options that have no equivalent short option, use a
non-character as a pseudo short option, starting with CHAR_MAX + 1. */
enum
@@ -81,9 +84,11 @@ enum
static struct option const longopts[] =
{
+ {"break", required_argument, NULL, 't'},
{"bytes", required_argument, NULL, 'b'},
{"lines", required_argument, NULL, 'l'},
{"line-bytes", required_argument, NULL, 'C'},
+ {"number", required_argument, NULL, 'n'},
{"suffix-length", required_argument, NULL, 'a'},
{"numeric-suffixes", no_argument, NULL, 'd'},
{"verbose", no_argument, NULL, VERBOSE_OPTION},
@@ -116,9 +121,18 @@ Mandatory arguments to long options are mandatory for
short options too.\n\
fprintf (stdout, _("\
-a, --suffix-length=N use suffixes of length N (default %d)\n\
-b, --bytes=SIZE put SIZE bytes per output file\n\
+ -b, --bytes=/N generate N output files\n\
+ -b, --bytes=K/N print Kth of N chunks of file\n\
-C, --line-bytes=SIZE put at most SIZE bytes of lines per output file\n\
-d, --numeric-suffixes use numeric suffixes instead of alphabetic\n\
-l, --lines=NUMBER put NUMBER lines per output file\n\
+ -l, --lines=/N generate N newline delineated output files\n\
+ -l, --lines=K/N print Kth of N newline delineated chunks\n\
+ -n, --number=N same as --bytes=/N\n\
+ -n, --number=K/N same as --bytes=K/N\n\
+ -t specify delineation character. This will also\n\
+ convert options such as -b to their delineated\n\
+ equivalent (-l or -C, depending on context)\n\
"), DEFAULT_SUFFIX_LENGTH);
fputs (_("\
--verbose print a diagnostic just before each\n\
@@ -218,13 +232,14 @@ cwrite (bool new_file_flag, const char *bp, size_t bytes)
Use buffer BUF, whose size is BUFSIZE. */
static void
-bytes_split (uintmax_t n_bytes, char *buf, size_t bufsize)
+bytes_split (uintmax_t n_bytes, char *buf, size_t bufsize, uintmax_t max_files)
{
size_t n_read;
bool new_file_flag = true;
size_t to_read;
uintmax_t to_write = n_bytes;
char *bp_out;
+ uintmax_t opened = 1;
do
{
@@ -251,7 +266,8 @@ bytes_split (uintmax_t n_bytes, char *buf, size_t bufsize)
cwrite (new_file_flag, bp_out, w);
bp_out += w;
to_read -= w;
- new_file_flag = true;
+ new_file_flag = (opened++ < max_files || !max_files)?
+ true : false;
to_write = n_bytes;
}
}
@@ -277,10 +293,10 @@ lines_split (uintmax_t n_lines, char *buf, size_t bufsize)
error (EXIT_FAILURE, errno, "%s", infile);
bp = bp_out = buf;
eob = bp + n_read;
- *eob = '\n';
+ *eob = eol;
for (;;)
{
- bp = memchr (bp, '\n', eob - bp + 1);
+ bp = memchr (bp, eol, eob - bp + 1);
if (bp == eob)
{
if (eob != bp_out) /* do not write 0 bytes! */
@@ -323,7 +339,6 @@ line_bytes_split (size_t n_bytes)
do
{
/* Fill up the full buffer size from the input file. */
-
n_read = full_read (STDIN_FILENO, buf + n_buffered, n_bytes -
n_buffered);
if (n_read == SAFE_READ_ERROR)
error (EXIT_FAILURE, errno, "%s", infile);
@@ -340,7 +355,7 @@ line_bytes_split (size_t n_bytes)
bp = buf + n_buffered;
if (n_buffered == n_bytes)
{
- while (bp > buf && bp[-1] != '\n')
+ while (bp > buf && bp[-1] != eol)
bp--;
}
@@ -362,6 +377,164 @@ line_bytes_split (size_t n_bytes)
free (buf);
}
+/* Split into NUMBER newline delineated chunks.
+   More precisely, chunks end just after an occurrence of the file-scope
+   `eol' character, so a line is never divided between two output files.
+   Input is read from standard input into BUF, whose size is BUFSIZE.
+   FILE_SIZE is the total input size in bytes; the nominal chunk
+   boundaries fall at multiples of FILE_SIZE / NUMBER.
+   NOTE(review): if full_read returns 0 before FILE_SIZE bytes have been
+   consumed, OFFSET does not advance and the outer loop appears not to
+   terminate -- confirm.  */
+
+static void
+line_chunk_split (uintmax_t number, char *buf, size_t bufsize, size_t
file_size)
+{
+ size_t n_read;
+ uintmax_t chunk_no = 1; /* 1-based index of the chunk being written */
+ off_t chunk_end = file_size / number - 1; /* nominal end offset of the current chunk */
+ off_t offset = 0; /* count of input bytes passed to cwrite so far */
+ bool new_file_flag = true;
+ char *bp, *bp_out, *eob;
+
+ while (offset < file_size)
+ {
+ bp = bp_out = buf;
+ n_read = full_read (STDIN_FILENO, buf, bufsize);
+ if (n_read == SAFE_READ_ERROR)
+ error (EXIT_FAILURE, errno, "%s", infile);
+ eob = buf + n_read;
+
+ while (1)
+ {
+ char *seek = (offset < chunk_end)? bp + chunk_end - offset : bp; /* where the search for the closing eol starts */
+ if (seek > eob) /* NOTE(review): seek may already point past the buffer here -- confirm this is safe */
+ seek = eob;
+ bp_out = memchr (seek, eol, eob - seek);
+ if (!bp_out) /* no eol in the rest of this buffer: flush it and read more */
+ {
+ cwrite (new_file_flag, bp, eob - bp);
+ new_file_flag = false; /* the next buffer continues the same chunk */
+ offset += eob - bp;
+ break;
+ }
+ else
+ bp_out++; /* the eol character itself belongs to this chunk */
+
+ cwrite (new_file_flag, bp, bp_out - bp);
+ chunk_end = (++chunk_no < number)?
+ chunk_end + file_size / number : file_size;
+ new_file_flag = true;
+ offset += bp_out - bp;
+ bp = bp_out;
+ /* A line could have been so long that it skipped
+ entire chunks. */
+ while (chunk_end < offset)
+ {
+ chunk_end += file_size / number;
+ chunk_no++;
+ /* Create blank file: this ensures NUMBER files are
+ created. */
+ cwrite (true, bp, 0);
+ }
+ }
+ }
+}
+
+/* If file is seekable, extract Nth of TOTAL chunks.
+   Bytes from offset START up to (but not including) END of standard
+   input are copied to standard output, using pread so the read position
+   is given explicitly.  Each chunk is FILE_SIZE / TOTAL bytes long,
+   except that the last one (N == TOTAL) extends to FILE_SIZE.
+   BUF, whose size is BUFSIZE, is the transfer buffer.
+   NOTE(review): a pread that returns 0 before END is reached leaves
+   START unchanged, so the loop would apparently not terminate -- confirm.
+ FIXME: Handle non-seekable files. */
+
+static void
+byte_chunk_extract (uintmax_t n, uintmax_t total, char *buf, size_t bufsize,
+ size_t file_size)
+{
+ off_t start = (n == 0)? 0 : (n - 1) * (file_size / total);
+ off_t end = (n == total)? file_size : n * (file_size / total);
+ ssize_t n_read;
+ size_t n_write;
+
+ while (1)
+ {
+ n_read = pread (STDIN_FILENO, buf, bufsize, start);
+ if (n_read < 0)
+ error (EXIT_FAILURE, errno, "%s", infile);
+ n_write = (start + n_read <= end)? n_read : end - start; /* never write past END */
+ if (write (STDOUT_FILENO, buf, n_write) != n_write)
+ error (EXIT_FAILURE, errno, "output error");
+ start += n_read;
+ if (end <= start)
+ return;
+ }
+}
+
+/* If file is seekable, extract lines whose first byte begins in the Nth of
+ TOTAL chunks.
+   Lines are delimited by the file-scope `eol' character.  Input is read
+   from standard input with pread into BUF, whose size is BUFSIZE, and the
+   selected lines are written to standard output.  FILE_SIZE is the input
+   size in bytes; chunk boundaries fall at multiples of FILE_SIZE / TOTAL.
+ FIXME: Handle non-seekable files. */
+
+static void
+line_chunk_extract (uintmax_t n, uintmax_t total, char* buf, size_t bufsize,
+ size_t file_size)
+{
+ ssize_t n_read;
+ bool end_of_chunk = false; /* set once the final byte to output has been located */
+ bool skip = true; /* true while the leading partial line must still be discarded */
+ char *bp = buf, *bp_out = buf, *eob;
+ off_t start;
+ off_t end;
+
+ /* For n != 1, start reading 1 byte before nth chunk of file. This is to
+ detect if the first byte of chunk is the first byte of a line. */
+ if (n == 1)
+ {
+ start = 0;
+ skip = false;
+ }
+ else
+ start = (n - 1) * (file_size / total) - 1;
+ end = (n == total)? file_size - 1 : n * (file_size / total) - 1;
+
+ while (1)
+ {
+ n_read = pread (STDIN_FILENO, buf, bufsize, start);
+ if (n_read < 0)
+ error (EXIT_FAILURE, errno, "%s", infile);
+ bp = buf;
+ bp_out = buf + n_read; /* by default output runs to the end of what was read */
+ eob = bp_out;
+
+ /* Unless n == 1, skip past the first eol character
+ encountered. */
+ if (skip)
+ {
+ bp = memchr (buf, eol, n_read);
+ if (bp)
+ {
+ if (bp - buf >= end - start) /* first eol is at or beyond END: no line starts in this chunk */
+ return;
+ bp++;
+ skip = false;
+ }
+ else
+ if (start + n_read < end) /* no eol yet and still inside the chunk: read on */
+ {
+ start += n_read;
+ continue;
+ }
+ else
+ return;
+ }
+ if (start + n_read >= end && end == file_size -1) /* last chunk: everything read is output */
+ end_of_chunk = true;
+ else if (start + n_read >= end)
+ {
+ char *base = (buf + end - start < buf)? buf : buf + end - start; /* search from END's position, or from buf if END precedes it */
+ bp_out = memchr (base, eol, eob - base);
+ if (bp_out)
+ {
+ bp_out++; /* output includes the closing eol */
+ end_of_chunk = true;
+ }
+ }
+ start += n_read;
+ if (write (STDOUT_FILENO, bp, bp_out - bp) != bp_out - bp)
+ error (EXIT_FAILURE, errno, "output error");
+ if (end_of_chunk)
+ return;
+ }
+}
+
#define FAIL_ONLY_ONE_WAY() \
do \
{ \
@@ -370,21 +543,47 @@ line_bytes_split (size_t n_bytes)
} \
while (0)
+/* Parse K/N syntax of chunk options.
+   SLASH points at the '/' inside optarg; it is overwritten with NUL so
+   that optarg holds only the K part.  K is stored in *M_UNITS, which is
+   left untouched when the argument begins with '/'; N is stored in
+   *N_UNITS.  K must not exceed SIZE_MAX; N must be nonzero, at least K,
+   and at most SIZE_MAX.  On any violation a diagnostic is issued with
+   error and usage (EXIT_FAILURE) is called.  */
+
+static inline void
+chunk_parse (uintmax_t *m_units, uintmax_t *n_units, char *slash)
+{
+ *slash = '\0';
+ if (slash != optarg
+ && xstrtoumax (optarg, NULL, 10, m_units, "") != LONGINT_OK
+ || SIZE_MAX < *m_units) /* && binds tighter than ||, so this test also runs when K is absent */
+ {
+ error (0, 0, _("%s: invalid chunk number"), optarg);
+ usage (EXIT_FAILURE);
+ }
+ if (xstrtoumax (++slash, NULL, 10, n_units, "") != LONGINT_OK /* slash now points at the N part */
+ || *n_units == 0 || *n_units < *m_units || SIZE_MAX < *n_units)
+ {
+ error (0, 0, _("%s: invalid number of total chunks"), slash);
+ usage (EXIT_FAILURE);
+ }
+}
+
int
main (int argc, char **argv)
{
struct stat stat_buf;
enum
{
- type_undef, type_bytes, type_byteslines, type_lines, type_digits
+ type_undef, type_bytes, type_byteslines, type_lines, type_digits,
+ type_chunk_bytes, type_chunk_eol
} split_type = type_undef;
size_t in_blk_size; /* optimal block size of input file device */
char *buf; /* file i/o buffer */
size_t page_size = getpagesize ();
+ uintmax_t m_units = 0;
uintmax_t n_units;
static char const multipliers[] = "bEGKkMmPTYZ0";
int c;
int digits_optind = 0;
+ size_t file_size;
+ char *slash;
+ bool eol_char = false;
initialize_main (&argc, &argv);
set_program_name (argv[0]);
@@ -404,7 +603,7 @@ main (int argc, char **argv)
/* This is the argv-index of the option we will read next. */
int this_optind = optind ? optind : 1;
- c = getopt_long (argc, argv, "0123456789C:a:b:dl:", longopts, NULL);
+ c = getopt_long (argc, argv, "0123456789C:a:b:c:dl:n:t:", longopts,
NULL);
if (c == -1)
break;
@@ -426,6 +625,13 @@ main (int argc, char **argv)
case 'b':
if (split_type != type_undef)
FAIL_ONLY_ONE_WAY ();
+ slash = strchr (optarg, '/');
+ if (slash)
+ {
+ split_type = type_chunk_bytes;
+ chunk_parse (&m_units, &n_units, slash);
+ break;
+ }
split_type = type_bytes;
if (xstrtoumax (optarg, NULL, 10, &n_units, multipliers) !=
LONGINT_OK
|| n_units == 0)
@@ -438,6 +644,13 @@ main (int argc, char **argv)
case 'l':
if (split_type != type_undef)
FAIL_ONLY_ONE_WAY ();
+ slash = strchr (optarg, '/');
+ if (slash)
+ {
+ split_type = type_chunk_eol;
+ chunk_parse (&m_units, &n_units, slash);
+ break;
+ }
split_type = type_lines;
if (xstrtoumax (optarg, NULL, 10, &n_units, "") != LONGINT_OK
|| n_units == 0)
@@ -447,6 +660,24 @@ main (int argc, char **argv)
}
break;
+ case 'n':
+ if (split_type != type_undef)
+ FAIL_ONLY_ONE_WAY ();
+ split_type = type_chunk_bytes;
+ slash = strchr (optarg, '/');
+ if (slash)
+ {
+ chunk_parse (&m_units, &n_units, slash);
+ break;
+ }
+ if (xstrtoumax (optarg, NULL, 10, &n_units, "") != LONGINT_OK
+ || n_units == 0 || SIZE_MAX < n_units)
+ {
+ error (0, 0, _("%s: invalid number of total chunks"), optarg);
+ usage (EXIT_FAILURE);
+ }
+ break;
+
case 'C':
if (split_type != type_undef)
FAIL_ONLY_ONE_WAY ();
@@ -492,6 +723,11 @@ main (int argc, char **argv)
suffix_alphabet = "0123456789";
break;
+ case 't':
+ eol = *optarg;
+ eol_char = true;
+ break;
+
case VERBOSE_OPTION:
verbose = true;
break;
@@ -505,6 +741,17 @@ main (int argc, char **argv)
}
}
+ /* Default eol to \n if none specified. */
+ if (!eol_char)
+ eol = '\n';
+ else
+ {
+ if (split_type == type_chunk_bytes)
+ split_type = type_chunk_eol;
+ if (split_type == type_bytes)
+ split_type = type_byteslines;
+ }
+
/* Handle default case. */
if (split_type == type_undef)
{
@@ -546,10 +793,14 @@ main (int argc, char **argv)
output_desc = -1;
/* Get the optimal block size of input device and make a buffer. */
-
if (fstat (STDIN_FILENO, &stat_buf) != 0)
error (EXIT_FAILURE, errno, "%s", infile);
in_blk_size = io_blksize (stat_buf);
+ file_size = stat_buf.st_size;
+
+ if (split_type == type_chunk_bytes || split_type == type_chunk_eol)
+ if (file_size < n_units)
+ error (EXIT_FAILURE, errno, "number of chunks exceed file size");
buf = ptr_align (xmalloc (in_blk_size + 1 + page_size - 1), page_size);
@@ -561,13 +812,27 @@ main (int argc, char **argv)
break;
case type_bytes:
- bytes_split (n_units, buf, in_blk_size);
+ bytes_split (n_units, buf, in_blk_size, 0);
break;
case type_byteslines:
line_bytes_split (n_units);
break;
+ case type_chunk_bytes:
+ if (m_units == 0)
+ bytes_split (file_size / n_units, buf, in_blk_size, n_units);
+ else
+ byte_chunk_extract (m_units, n_units, buf, in_blk_size, file_size);
+ break;
+
+ case type_chunk_eol:
+ if (m_units == 0)
+ line_chunk_split (n_units, buf, in_blk_size, file_size);
+ else
+ line_chunk_extract (m_units, n_units, buf, in_blk_size, file_size);
+ break;
+
default:
abort ();
}
--
1.6.3.3
- Re: [PATCH] split: --chunks option, Chen Guo, 2009/12/14
- Re: [PATCH] split: --chunks option, Pádraig Brady, 2009/12/14
- Re: [PATCH] split: --chunks option, Jim Meyering, 2009/12/15
- Re: [PATCH] split: --chunks option, Chen Guo, 2009/12/15
- Re: [PATCH] split: --chunks option, Jim Meyering, 2009/12/15
- Re: [PATCH] split: --chunks option, Pádraig Brady, 2009/12/15
- Re: [PATCH] split: --chunks option, Chen Guo, 2009/12/15
- Re: [PATCH] split: --chunks option, Pádraig Brady, 2009/12/15
- Re: [PATCH] split: --chunks option, Andreas Schwab, 2009/12/16
- Re: [PATCH] split: --chunks option,
Chen Guo <=
- Re: [PATCH] split: --chunks option, Chen Guo, 2009/12/20
- Re: [PATCH] split: --chunks option, Andreas Schwab, 2009/12/20