From d5d70d63eea4110dde8e84b81c8877b97b73ecab Mon Sep 17 00:00:00 2001 From: Leslie P. Polzer Date: Mon, 24 Mar 2008 00:13:23 +0000 Subject: [PATCH] Merge Leslie Polzer's SOC 2007 changes for better ARG_MAX support To: address@hidden 2008-03-23 Leslie Polzer Merge Leslie Polzer's ARG_MAX enhancements to xargs which were produced as part of the Google Summer of Code 2007. These changes fix Savannah bug #22708. * find/pred.c (launch): The struct buildcmd_control* argument is no longer const. * find/defs.h: Likewise * lib/buildcmd.c (bc_do_insert): The struct buildcmd_control* argument is no longer const. (bc_push_arg): Likewise. (cb_exec_noop): Likewise. (get_stringv_len): New function; measures the length of a NULL-terminated argv sequence. (bc_do_exec): Rename from do_exec, and make global. Modify the function to react to exec failing with E2BIG by trying again with fewer arguments. * xargs/xargs.1: Mention that xargs automatically adopts to the situation where exec fails with E2BIG. * xargs/xargs.c: (parent): New variable, the PID of the parent process. (main): initialise the variable 'parent'. Don't fail immediately due to lack of space when the environment is large. Call xargs_do_exec via bc_do_exec. (xargs_do_exec): The struct buildcmd_control* argument is no longer const. When exec fails in the child, communicate the errno value back to the parent through a pipe which is close-on-exec; this allows us to accurately determine the cause of the failure and also to distinguish exec failures from all possible exit codes in the child. (wait_for_proc): If the utility cannot be found or cannot be run, we now find out about this by reading an errno value from the pipe, so this means that exit codes 126 and 127 in the child no longer have a special interpretation. * NEWS: mention these changes. --- NEWS | 11 +++ find/defs.h | 2 +- find/pred.c | 2 +- import-gnulib.config | 1 + lib/buildcmd.c | 157 ++++++++++++++++++++++++++---------------- lib/buildcmd.h | 11 ++- xargs/xargs.1 | 2 + xargs/xargs.c | 190 +++++++++++++++++++++++++++++++++++++------------ 8 files changed, 264 insertions(+), 112 deletions(-) diff --git a/NEWS b/NEWS index 3994c59..e42174c 100644 --- a/NEWS +++ b/NEWS @@ -2,6 +2,12 @@ GNU findutils NEWS - User visible changes. -*- outline -*- (allout) * Major changes in release 4.5.0-CVS +** Enhancements + +If xargs find that exec fails because the argument size limit it +calculated is larger than the system's actual maximum, it now adapts +by passing fewer arguments (as opposed to failing). + ** Performance changes The default optimisation level for find is now -O2 instead of -O0, @@ -21,6 +27,11 @@ operations needed to make a test are. ** Bug Fixes +#22708: Exit status 126 and 127 from the utility invoked from xargs +now makes xargs return 123, meaning that exit status values 126 and +127 now unambigously mean that the utility could not be run or could +not be found, respectively. + #15472: Error messages that print ino_t values are no longer truncated on platforms with 64-bit ino_t. diff --git a/find/defs.h b/find/defs.h index 1708d83..6dd4a30 100644 --- a/find/defs.h +++ b/find/defs.h @@ -462,7 +462,7 @@ PREDICATEFUNCTION pred_xtype; -int launch PARAMS((const struct buildcmd_control *ctl, +int launch PARAMS((struct buildcmd_control *ctl, struct buildcmd_state *buildstate)); diff --git a/find/pred.c b/find/pred.c index d758276..345fa95 100644 --- a/find/pred.c +++ b/find/pred.c @@ -1916,7 +1916,7 @@ prep_child_for_exec (boolean close_stdin, int dir_fd) int -launch (const struct buildcmd_control *ctl, +launch (struct buildcmd_control *ctl, struct buildcmd_state *buildstate) { int wait_status; diff --git a/import-gnulib.config b/import-gnulib.config index 287c2d8..9f163e9 100644 --- a/import-gnulib.config +++ b/import-gnulib.config @@ -92,3 +92,4 @@ xstrtol xstrtoumax yesno ' + diff --git a/lib/buildcmd.c b/lib/buildcmd.c index 6c6a9c4..c3f9c92 100644 --- a/lib/buildcmd.c +++ b/lib/buildcmd.c @@ -16,29 +16,6 @@ along with this program. If not, see . */ -/* - XXX_SOC: - - One of the aspects of the SOC project is to adapt this module. - This module currently makes an initial guess at two things: - - buildcmd_control->arg_max (The most characters we can fit in) - buildcmd_control->max_arg_count (most args) - - The nature of the SOC task is to adjust these values when exec fails. - Optionally (if we have the time) we can make the software adjust them - when exec succeeds. If we do the latter, we need to ensure we don't - get into some state where we are sitting just below the limit and - keep trying to extend, because that would lead to every other exec - failing. - - If our initial guess is successful, there is no pressing need really to - increase our guess. Indeed, if we are beign called by xargs (as opposed - to find) th user may have specified a limit with "-s" and we should not - exceed it. -*/ - - #include # ifndef PARAMS @@ -128,7 +105,7 @@ extern char **environ; Those restrictions do not exist here. */ void -bc_do_insert (const struct buildcmd_control *ctl, +bc_do_insert (struct buildcmd_control *ctl, struct buildcmd_state *state, char *arg, size_t arglen, const char *prefix, size_t pfxlen, @@ -203,29 +180,103 @@ bc_do_insert (const struct buildcmd_control *ctl, initial_args); } -static -void do_exec(const struct buildcmd_control *ctl, - struct buildcmd_state *state) +/* get the length of a zero-terminated string array */ +static unsigned int +get_stringv_len(char** sv) +{ + char** p = sv; + + while (*p++); + + return (p - sv - 1); +} + + +void +bc_do_exec(struct buildcmd_control *ctl, + struct buildcmd_state *state) { - /* XXX_SOC: + bc_push_arg (ctl, state, + (char *) NULL, 0, + NULL, 0, + false); /* Null terminate the arg list. */ - Here we are calling the user's function. Currently there is no - way for it to report that the argument list was too long. We - should introduce an externally callable function that allows them - to report this. + /* save original argv data so we can free the memory later */ + int argc_orig = state->cmd_argc; + char** argv_orig = state->cmd_argv; - If the callee does report that the exec failed, we need to retry - the exec with a shorter argument list. Once we have reduced the - argument list to the point where the exec can succeed, we need to - preserve the list of arguments we couldn't exec this time. + if ((ctl->exec_callback)(ctl, state)) + goto fin; - This also means that the control argument here probably needs not - to be const (since the limits are in the control arg). + /* got E2BIG, adjust arguments */ + char** initial_args = xmalloc(ctl->initial_argc * sizeof(char*)); + int i; + for (i=0; iinitial_argc; ++i) + initial_args[i] = argv_orig[i]; - The caller's only requirement on do_exec is that it should - free up enough room for at least one argument. - */ - (ctl->exec_callback)(ctl, state); + state->cmd_argc -= ctl->initial_argc; + state->cmd_argv += ctl->initial_argc; + + + int pos; + int done = 0; /* number of argv elements we have relayed successfully */ + + int argc_current = state->cmd_argc; + + pos = -1; /* array offset from the right end */ + + for (;;) + { + int divider = argc_current+pos; + char* swapped_out = state->cmd_argv[divider]; + state->cmd_argv[divider] = NULL; + state->cmd_argc = get_stringv_len (state->cmd_argv); + + if (state->cmd_argc < 1) + error(1, 0, "can't call exec() due to argument size restrictions"); + + /* prepend initial args */ + state->cmd_argv -= ctl->initial_argc; + state->cmd_argc += ctl->initial_argc; + for (i=0; iinitial_argc; ++i) + state->cmd_argv[i] = initial_args[i]; + + ++state->cmd_argc; /* include trailing NULL */ + int r = (ctl->exec_callback)(ctl, state); + state->cmd_argv += ctl->initial_argc; + state->cmd_argc -= ctl->initial_argc; + --state->cmd_argc; /* exclude trailing NULL again */ + + if (r) + { + /* success */ + done += state->cmd_argc; + + /* check whether we have any left */ + if (argc_orig - done > ctl->initial_argc) + { + state->cmd_argv[divider] = swapped_out; + state->cmd_argv += divider; + + argc_current -= state->cmd_argc; + pos = -1; + } + else + goto fin; + } + else + { + /* failure, make smaller */ + state->cmd_argv[divider] = swapped_out; + --pos; + } + } + + +fin: + state->cmd_argv = argv_orig; + + bc_clear_args(ctl, state); } @@ -265,7 +316,7 @@ bc_argc_limit_reached(int initial_args, * for greater clarity. */ void -bc_push_arg (const struct buildcmd_control *ctl, +bc_push_arg (struct buildcmd_control *ctl, struct buildcmd_state *state, const char *arg, size_t len, const char *prefix, size_t pfxlen, @@ -278,11 +329,6 @@ bc_push_arg (const struct buildcmd_control *ctl, if (arg) { - /* XXX_SOC: if do_exec() is only guaranteeed to free up one - * argument, this if statement may need to become a while loop. - * If it becomes a while loop, it needs not to be an infinite - * loop... - */ if (state->cmd_argv_chars + len > ctl->arg_max) { if (initial_args || state->cmd_argc == ctl->initial_argc) @@ -292,17 +338,10 @@ bc_push_arg (const struct buildcmd_control *ctl, || (ctl->exit_if_size_exceeded && (ctl->lines_per_exec || ctl->args_per_exec))) error (1, 0, _("argument list too long")); - do_exec (ctl, state); + bc_do_exec (ctl, state); } - /* XXX_SOC: this if may also need to become a while loop. In - fact perhaps it is best to factor this out into a separate - function which ceeps calling the exec handler until there is - space for our next argument. Each exec will free one argc - "slot" so the main thing to worry about repeated exec calls - for would be total argument length. - */ if (bc_argc_limit_reached(initial_args, ctl, state)) - do_exec (ctl, state); + bc_do_exec (ctl, state); } if (state->cmd_argc >= state->cmd_argv_alloc) @@ -341,7 +380,7 @@ bc_push_arg (const struct buildcmd_control *ctl, * actually calls bc_push_arg(ctl, state, NULL, 0, false). */ if (bc_argc_limit_reached(initial_args, ctl, state)) - do_exec (ctl, state); + bc_do_exec (ctl, state); } /* If this is an initial argument, set the high-water mark. */ @@ -420,7 +459,7 @@ bc_get_arg_max(void) } -static int cb_exec_noop(const struct buildcmd_control *ctl, +static int cb_exec_noop(struct buildcmd_control *ctl, struct buildcmd_state *state) { /* does nothing. */ diff --git a/lib/buildcmd.h b/lib/buildcmd.h index e295852..f9f8f64 100644 --- a/lib/buildcmd.h +++ b/lib/buildcmd.h @@ -22,7 +22,7 @@ struct buildcmd_state { - /* Number of valid elements in `cmd_argv'. */ + /* Number of valid elements in `cmd_argv', including terminating NULL. */ int cmd_argc; /* 0 */ /* The list of args being built. */ @@ -87,7 +87,7 @@ struct buildcmd_control int initial_argc; /* 0 */ /* exec callback. */ - int (*exec_callback)(const struct buildcmd_control *, struct buildcmd_state *); + int (*exec_callback)(struct buildcmd_control *, struct buildcmd_state *); /* If nonzero, the maximum number of nonblank lines from stdin to use per command line. */ @@ -107,14 +107,17 @@ enum BC_INIT_STATUS extern size_t bc_size_of_environment (void); -extern void bc_do_insert (const struct buildcmd_control *ctl, +extern void bc_do_insert (struct buildcmd_control *ctl, struct buildcmd_state *state, char *arg, size_t arglen, const char *prefix, size_t pfxlen, const char *linebuf, size_t lblen, int initial_args); -extern void bc_push_arg (const struct buildcmd_control *ctl, +extern void bc_do_exec (struct buildcmd_control *ctl, + struct buildcmd_state *state); + +extern void bc_push_arg (struct buildcmd_control *ctl, struct buildcmd_state *state, const char *arg, size_t len, const char *prefix, size_t pfxlen, diff --git a/xargs/xargs.1 b/xargs/xargs.1 index 413b55e..8f3811f 100644 --- a/xargs/xargs.1 +++ b/xargs/xargs.1 @@ -246,6 +246,8 @@ and is calculated as the argument length limit for exec, less the size of your environment, less 2048 bytes of headroom. If this value is more than 128KiB, 128Kib is used as the default value; otherwise, the default value is the maximum. 1KiB is 1024 bytes. +.B xargs +automatically adapts to tighter constraints. .TP .PD 0 .B \-\-verbose diff --git a/xargs/xargs.c b/xargs/xargs.c index 1365a71..ac13989 100644 --- a/xargs/xargs.c +++ b/xargs/xargs.c @@ -220,6 +220,9 @@ static pid_t *pids = NULL; /* The number of allocated elements in `pids'. */ static size_t pids_alloc = 0u; +/* Process ID of the parent xarsg process. */ +static pid_t parent; + /* Exit status; nonzero if any child process exited with a status of 1-125. */ static volatile int child_error = 0; @@ -264,7 +267,7 @@ static int read_line PARAMS ((void)); static int read_string PARAMS ((void)); static boolean print_args PARAMS ((boolean ask)); /* static void do_exec PARAMS ((void)); */ -static int xargs_do_exec (const struct buildcmd_control *cl, struct buildcmd_state *state); +static int xargs_do_exec (struct buildcmd_control *cl, struct buildcmd_state *state); static void exec_if_possible PARAMS ((void)); static void add_proc PARAMS ((pid_t pid)); static void wait_for_proc PARAMS ((boolean all, unsigned int minreap)); @@ -413,6 +416,7 @@ main (int argc, char **argv) enum { XARGS_POSIX_HEADROOM = 2048u }; program_name = argv[0]; + parent = getpid(); original_exit_value = 0; #ifdef HAVE_SETLOCALE @@ -423,10 +427,6 @@ main (int argc, char **argv) atexit (close_stdin); atexit (wait_for_proc_all); - /* xargs is required by POSIX to allow 2048 bytes of headroom - * for extra environment variables (that perhaps the utliity might - * want to set before execing something else). - */ bcstatus = bc_init_controlinfo(&bc_ctl, XARGS_POSIX_HEADROOM); /* The bc_init_controlinfo call may have determined that the @@ -443,27 +443,13 @@ main (int argc, char **argv) { act_on_init_result = fail_due_to_env_size; } - else if (BC_INIT_CANNOT_ACCOMODATE_HEADROOM == bcstatus) - { - /* All POSIX systems are required to support ARG_MAX of at least - * 4096. For everything to work the total of (command line + - * headroom + environment) must fit into this. POSIX requires - * that we use a headroom of 2048 bytes. The user is in control - * of the size of the environment. - * - * In general if bc_init_controlinfo() returns - * BC_INIT_CANNOT_ACCOMODATE_HEADROOM, its caller can try again - * with a smaller headroom. However, in the case of xargs, this - * would not be POSIX-compliant. - */ - act_on_init_result = fail_due_to_env_size; - } else { /* IEEE Std 1003.1, 2003 specifies that the combined argument and * environment list shall not exceed {ARG_MAX}-2048 bytes. It also * specifies that it shall be at least LINE_MAX. */ + assert(bc_ctl.arg_max <= (ARG_MAX-2048)); #ifdef _SC_ARG_MAX long val = sysconf(_SC_ARG_MAX); if (val > 0) @@ -689,12 +675,10 @@ main (int argc, char **argv) _("Your environment variables take up %lu bytes\n"), (unsigned long)bc_size_of_environment()); fprintf(stderr, - _("POSIX upper limit on argument length (this system): %lu\n"), + _("POSIX lower and upper limits on argument length: %lu, %lu\n"), + (unsigned long)bc_ctl.posix_arg_size_min, (unsigned long)bc_ctl.posix_arg_size_max); fprintf(stderr, - _("POSIX smallest allowable upper limit on argument length (all systems): %lu\n"), - (unsigned long)bc_ctl.posix_arg_size_min); - fprintf(stderr, _("Maximum length of command we could actually use: %ld\n"), (unsigned long)(bc_ctl.posix_arg_size_max - bc_size_of_environment())); @@ -737,19 +721,21 @@ main (int argc, char **argv) initial_args = false; bc_ctl.initial_argc = bc_state.cmd_argc; bc_state.cmd_initial_argv_chars = bc_state.cmd_argv_chars; + bc_ctl.initial_argc = bc_state.cmd_argc; + /*fprintf(stderr, "setting initial_argc=%d\n", bc_state.cmd_initial_argc);*/ while ((*read_args) () != -1) if (bc_ctl.lines_per_exec && lineno >= bc_ctl.lines_per_exec) { - xargs_do_exec (&bc_ctl, &bc_state); + bc_do_exec (&bc_ctl, &bc_state); lineno = 0; } /* SYSV xargs seems to do at least one exec, even if the input is empty. */ if (bc_state.cmd_argc != bc_ctl.initial_argc - || (always_run_command && !procs_executed)) - xargs_do_exec (&bc_ctl, &bc_state); + || (always_run_command && procs_executed==0)) + bc_do_exec (&bc_ctl, &bc_state); } else @@ -780,7 +766,7 @@ main (int argc, char **argv) NULL, 0, linebuf, len, initial_args); - xargs_do_exec (&bc_ctl, &bc_state); + bc_do_exec (&bc_ctl, &bc_state); } } @@ -1089,21 +1075,24 @@ prep_child_for_exec (void) /* Execute the command that has been built in `cmd_argv'. This may involve - waiting for processes that were previously executed. */ - + waiting for processes that were previously executed. + + There are a number of cases where we want to terminate the current (child) + process. We do this by calling _exit() rather than exit() in order to + avoid the invocation of wait_for_proc_all(), which was registered by the parent + as an atexit() function. +*/ static int -xargs_do_exec (const struct buildcmd_control *ctl, struct buildcmd_state *state) +xargs_do_exec (struct buildcmd_control *ctl, struct buildcmd_state *state) { pid_t child; + int fd[2]; + int buf; + int r; (void) ctl; (void) state; - bc_push_arg (&bc_ctl, &bc_state, - (char *) NULL, 0, - NULL, 0, - false); /* Null terminate the arg list. */ - if (!query_before_executing || print_args (true)) { if (proc_max) @@ -1126,6 +1115,10 @@ xargs_do_exec (const struct buildcmd_control *ctl, struct buildcmd_state *state) */ wait_for_proc (false, 0u); + if (pipe(fd)) + error (1, errno, "could not create pipe before fork"); + fcntl(fd[1], F_SETFD, FD_CLOEXEC); + /* If we run out of processes, wait for a child to return and try again. */ while ((child = fork ()) < 0 && errno == EAGAIN && procs_executing) @@ -1137,16 +1130,113 @@ xargs_do_exec (const struct buildcmd_control *ctl, struct buildcmd_state *state) error (1, errno, _("cannot fork")); case 0: /* Child. */ - prep_child_for_exec(); - execvp (bc_state.cmd_argv[0], bc_state.cmd_argv); - error (0, errno, "%s", bc_state.cmd_argv[0]); - _exit (errno == ENOENT ? 127 : 126); + { + close(fd[0]); + child_error = 0; + + prep_child_for_exec(); + + execvp (bc_state.cmd_argv[0], bc_state.cmd_argv); + if (errno) + { + /* Write errno value to parent. We do this even if + * the error was not E2BIG, because we need to + * distinguish successful execs from unsuccessful + * ones. The reason we need to do this is to know + * whether to reap the child here (preventing the + * exit status processing in wait_for_proc() from + * changing the value of child_error) or leave it + * for wait_for_proc() to handle. We need to have + * wait_for_proc() handle the exit values from the + * utility if we run it, for POSIX compliance on the + * handling of exit values. + */ + write(fd[1], &errno, sizeof(int)); + } + + close(fd[1]); + if (E2BIG != errno) + { + error (0, errno, "%s", bc_state.cmd_argv[0]); + /* The actual value returned here should be irrelevant, + * because the parent will test our value of errno. + */ + _exit (errno == ENOENT ? 127 : 126); + + } /*NOTREACHED*/ - } - add_proc (child); - } + } /* child */ - bc_clear_args(&bc_ctl, &bc_state); + default: + { + /* Parent */ + close(fd[1]); + } + + } /* switch(child) */ + /*fprintf(stderr, "forked child (bc_state.cmd_argc=%d) -> ", bc_state.cmd_argc);*/ + + switch (r = read(fd[0], &buf, sizeof(int))) + { + case -1: + { + close(fd[0]); + error(0, errno, "errno-buffer read failed in xargs_do_exec (BUG?)"); + break; + } + + case sizeof(int): + { + /* Failure */ + int childstatus; + + close(fd[0]); + + /* we know the child is about to exit, so wait for that. + * We have to do this so that wait_for_proc() does not + * change the value of child_error on the basis of the + * return value -- since in this case we did not launch + * the utility. + * + * We do the wait before deciding if we failed in order to + * avoid creating a zombie, even briefly. + */ + waitpid(child, &childstatus, 0); + + + if (E2BIG == buf) + { + return 0; /* Failure; caller should pass fewer args */ + } + else if (ENOENT == buf) + { + exit(127); /* command cannot be found */ + } + else + { + exit(126); /* command cannot be run */ + } + break; + } + + case 0: + { + /* Failed to read data from pipe; the exec must have + * succeeded. We call add_proc only in this case, + * because it increments procs_executing, and we only + * want to do that if we didn't already wait for the + * child. + */ + add_proc (child); + break; + } + default: + { + error(1, errno, "read returned unexpected value %d! BUG?", r); + } + } /* switch on bytes read */ + close(fd[0]); + } return 1; /* Success */ } @@ -1158,7 +1248,7 @@ exec_if_possible (void) if (bc_ctl.replace_pat || initial_args || bc_state.cmd_argc == bc_ctl.initial_argc || bc_ctl.exit_if_size_exceeded) return; - xargs_do_exec (&bc_ctl, &bc_state); + bc_do_exec (&bc_ctl, &bc_state); } /* Add the process with id PID to the list of processes that have @@ -1269,8 +1359,6 @@ wait_for_proc (boolean all, unsigned int minreap) procs_executing--; reaped++; - if (WEXITSTATUS (status) == 126 || WEXITSTATUS (status) == 127) - exit (WEXITSTATUS (status)); /* Can't find or run the command. */ if (WEXITSTATUS (status) == 255) error (124, 0, _("%s: exited with status 255; aborting"), bc_state.cmd_argv[0]); if (WIFSTOPPED (status)) @@ -1289,6 +1377,14 @@ wait_for_proc_all (void) { static boolean waiting = false; + /* This function was registered by the original, parent, process. + * The child processes must not call exit() to terminate, since this + * will mean that their exit status will be inappropriately changed. + * Instead the child processes should call _exit(). If someone + * forgets this rule, you may see the following assert() fail. + */ + assert(getpid() == parent); + if (waiting) return; -- 1.5.3.8