[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Feature request: gzip/bzip support for split
From: |
Chandrakumar Muthaiah |
Subject: |
Feature request: gzip/bzip support for split |
Date: |
Thu, 29 Jan 2009 16:46:02 -0500 |
User-agent: |
Thunderbird 2.0.0.19 (X11/20090105) |
I would like to propose a feature that lets split compress each output file
with gzip or bzip2 as it is written, and I am including a patch that implements it.
I believe this will be really useful when dealing with
very large files. I hope it will also be useful to other people out there.
Below is the patch for coreutils 6.9
--- split.c 2007-03-18 17:36:43.000000000 -0400
+++ ../../coreutils-6/src/split.c 2009-01-28 22:26:45.000000000 -0500
@@ -75,6 +76,9 @@
output file is opened. */
static bool verbose;
+/* Compression applied to each output file: 0 = none, 1 = gzip, 2 = bzip2. */
+static int zipoutfile = 0;
+
/* For long options that have no equivalent short option, use a
non-character as a pseudo short option, starting with CHAR_MAX + 1. */
enum
@@ -82,6 +86,14 @@
VERBOSE_OPTION = CHAR_MAX + 1
};
+struct strvars /* one-member wrapper around a constant string */
+{
+ const char *val;
+};
+
+static struct strvars const zsuffixes[] = {{""}, {".gz"}, {".bz2"}}; /* output-name suffix, indexed by zipoutfile (0 none, 1 gzip, 2 bzip2) */
+static struct strvars const zipcmds[] = {{""}, {"gzip"}, {"bzip2"}}; /* compressor command, indexed the same way as zsuffixes */
+
static struct option const longopts[] =
{
{"bytes", required_argument, NULL, 'b'},
@@ -90,6 +102,8 @@
{"suffix-length", required_argument, NULL, 'a'},
{"numeric-suffixes", no_argument, NULL, 'd'},
{"verbose", no_argument, NULL, VERBOSE_OPTION},
+ {"gzip", no_argument, NULL, 'z'},
+ {"bzip2", no_argument, NULL, 'j'},
{GETOPT_HELP_OPTION_DECL},
{GETOPT_VERSION_OPTION_DECL},
{NULL, 0, NULL, 0}
@@ -122,6 +136,8 @@
-C, --line-bytes=SIZE put at most SIZE bytes of lines per output
file\n\
-d, --numeric-suffixes use numeric suffixes instead of alphabetic\n\
-l, --lines=NUMBER put NUMBER lines per output file\n\
+ -z, --gzip gzip output files\n\
+ -j, --bzip2 bzip2 output files\n\
"), DEFAULT_SUFFIX_LENGTH);
fputs (_("\
--verbose print a diagnostic to standard error just\n\
@@ -194,21 +210,136 @@
}
}
-/* Write BYTES bytes at BP to an output file.
- If NEW_FILE_FLAG is true, open the next output file.
- Otherwise add to the same output file already in use. */
+/* Opens a new fd based on the file type selection
+ */
-static void
-cwrite (bool new_file_flag, const char *bp, size_t bytes)
+/* Fallbacks only: do not redefine these when the system or project headers
+   already provide them.  */
+#ifndef EXIT_FAILURE
+# define EXIT_FAILURE 1
+#endif
+#ifndef O_BINARY
+# define O_BINARY 0
+#endif
+
+/* Run COMMAND through "/bin/sh -c" with one end of a new pipe attached to
+   its standard input or standard output, and return the other end as a
+   file descriptor.  MODE must contain exactly one of 'r' (read the
+   command's stdout) or 'w' (write to the command's stdin), and may also
+   contain 'e' to set FD_CLOEXEC on the returned descriptor.
+   Return -1 with errno set on failure.  The child process is not waited
+   for here.  */
+static int
+fdpopen ( const char *command, const char *mode)
{
- if (new_file_flag)
+  int pipe_fds[2];
+  int parent_end, child_end;
+  pid_t child_pid;
+  int saved_errno;
+
+  int do_read = 0;
+  int do_write = 0;
+  int do_cloexec = 0;
+
+  for (; *mode != '\0'; mode++)
+    {
+      switch (*mode)
+        {
+        case 'r':
+          do_read = 1;
+          break;
+        case 'w':
+          do_write = 1;
+          break;
+        case 'e':
+          do_cloexec = 1;
+          break;
+        default:
+          errno = EINVAL;
+          return -1;
+        }
+    }
+
+  /* Exactly one direction must be requested.  */
+  if (do_read == do_write)
+    {
+      errno = EINVAL;
+      return -1;
+    }
+
+  if (pipe (pipe_fds) < 0)
+    return -1;
+
+  parent_end = pipe_fds[do_read ? 0 : 1];
+  child_end = pipe_fds[do_read ? 1 : 0];
+
+  child_pid = fork ();
+
+  if (child_pid == 0)
{
+      /* Descriptor the command sees: stdout when we read, stdin when we write.  */
+      int child_std_end = do_read ? 1 : 0;
+
+      close (parent_end);
+
+      if (child_end != child_std_end)
+        {
+          /* Never exec the command on the wrong descriptor.  */
+          if (dup2 (child_end, child_std_end) < 0)
+            _exit (127);
+          close (child_end);
+        }
+
+      execl ("/bin/sh", "sh", "-c", command, (char *) 0);
+      _exit (127);
+    }
+
+  if (child_pid < 0)
+    {
+      /* Keep fork's errno; close may overwrite it.  */
+      saved_errno = errno;
+      close (child_end);
+      close (parent_end);
+      errno = saved_errno;
+      return -1;
+    }
+
+  close (child_end);
+
+  if (do_cloexec && fcntl (parent_end, F_SETFD, FD_CLOEXEC) < 0)
+    {
+      saved_errno = errno;
+      close (parent_end);
+      errno = saved_errno;
+      return -1;
+    }
+
+  return parent_end;
+}
+
+/* Start the compressor selected by zipoutfile and make output_desc the
+   write end of a pipe to it.  The command has the form
+   "gzip > '/1/2/3/4/5/outputfile.gz'": the compressor reads the pipe and
+   the shell redirects its output to OUTFILE plus the matching suffix.
+   Exits via error () if the pipe or child cannot be created.
+   NOTE(review): the child is never waited for, so its exit status is not
+   checked -- confirm whether callers need that.  */
+static void
+new_fd_pipe (void)
+{
+  const char *zipcmd = zipcmds[zipoutfile].val;
+  const char *zsuf = zsuffixes[zipoutfile].val;
+
+  size_t cmdlen = strlen (zipcmd);
+  size_t namelen = strlen (outfile);
+  size_t suflen = strlen (zsuf);
+  size_t nquotes = 0;
+  size_t i;
+  char *command;
+  char *p;
+
+  /* The name is handed to the shell inside single quotes, so each embedded
+     single quote must be written as '\'' (three extra bytes).  */
+  for (i = 0; i < namelen; i++)
+    if (outfile[i] == '\'')
+      nquotes++;
+
+  /* ZIPCMD + " > '" + quoted name + suffix + "'" + NUL.  */
+  command = xmalloc (cmdlen + 4 + namelen + 3 * nquotes + suflen + 1 + 1);
+  p = command;
+
+  memcpy (p, zipcmd, cmdlen);
+  p += cmdlen;
+  memcpy (p, " > '", 4);
+  p += 4;
+  for (i = 0; i < namelen; i++)
+    {
+      if (outfile[i] == '\'')
+        {
+          memcpy (p, "'\\''", 4);
+          p += 4;
+        }
+      else
+        *p++ = outfile[i];
+    }
+  memcpy (p, zsuf, suflen);
+  p += suflen;
+  *p++ = '\'';
+  /* Terminate the command buffer itself (not OUTFILE).  */
+  *p = '\0';
+
+  if (verbose)
+    fprintf (stderr, _("creating file %s\n"), quote (command));
+
+  output_desc = fdpopen (command, "we");
+
+  if (output_desc < 0)
+    error (EXIT_FAILURE, errno, "%s", command);
+
+  /* The shell received its own copy of the string at exec time.  */
+  free (command);
+}
+
+static void
+new_fd_file()
+{
if (output_desc >= 0 && close (output_desc) < 0)
error (EXIT_FAILURE, errno, "%s", outfile);
next_file_name ();
+
+ if(!zipoutfile)
+ {
if (verbose)
fprintf (stderr, _("creating file %s\n"), quote (outfile));
+
output_desc = open (outfile,
O_WRONLY | O_CREAT | O_TRUNC | O_BINARY,
(S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP
@@ -216,6 +347,29 @@
if (output_desc < 0)
error (EXIT_FAILURE, errno, "%s", outfile);
}
+ else
+ {
+ new_fd_pipe();
+ }
+}
+
+/* Advance to the next output when NEW_FILE_FLAG is set; otherwise leave
+   the current output descriptor alone.  */
+static void
+new_fd(bool new_file_flag)
+{
+  if (new_file_flag)
+    new_fd_file();
+}
+
+/* Write BYTES bytes at BP to an output file.
+ If NEW_FILE_FLAG is true, open the next output file.
+ Otherwise add to the same output file already in use. */
+
+static void
+cwrite (bool new_file_flag, const char *bp, size_t bytes)
+{
+ new_fd(new_file_flag); /* does nothing unless NEW_FILE_FLAG is set; then the current output is closed and the next file or compressor pipe is opened */
if (full_write (output_desc, bp, bytes) != bytes) /* anything short of BYTES is reported through error () */
error (EXIT_FAILURE, errno, "%s", outfile);
}
@@ -405,7 +559,7 @@
/* This is the argv-index of the option we will read next. */
int this_optind = optind ? optind : 1;
- c = getopt_long (argc, argv, "0123456789C:a:b:dl:", longopts, NULL);
+ c = getopt_long (argc, argv, "0123456789C:a:b:dl:zj", longopts,
NULL);
if (c == -1)
break;
@@ -489,6 +643,14 @@
}
break;
+ case 'z':
+ zipoutfile = 1;
+ break;
+
+ case 'j':
+ zipoutfile = 2;
+ break;
+
case 'd':
suffix_alphabet = "0123456789";
break;
- Feature request: gzip/bzip support for split,
Chandrakumar Muthaiah <=