diff --git a/locate/locate.c b/locate/locate.c index a55d807..2661be6 100644 --- a/locate/locate.c +++ b/locate/locate.c @@ -315,6 +315,7 @@ struct locate_stats uintmax_t compressed_bytes; uintmax_t total_filename_count; uintmax_t total_filename_length; + uintmax_t total_chars_reused; uintmax_t whitespace_count; uintmax_t newline_count; uintmax_t highbit_filename_count; @@ -322,18 +323,34 @@ struct locate_stats static struct locate_stats statistics; +struct substring_match +{ + bool last_result; + bool narrow; + size_t last_match_byte_offset; + uintmax_t sequence_number; + const char *pattern; + size_t pattern_len; + char * (*matcher)(const char *haystack, const char *needle); +}; + struct regular_expression { struct re_pattern_buffer regex; /* for --regex */ }; +typedef int locate_count; + struct process_data { int c; /* An input byte. */ char itemcount; /* Indicates we're at the beginning of an slocate db. */ - int count; /* The length of the prefix shared with the previous database entry. */ + uintmax_t file_name_count; /* How many file names we saw */ + locate_count count; /* The length of the prefix shared with the previous database entry. */ int len; + int previous_len; + size_t unchanged_bytes; char *original_filename; /* The current input database entry. */ size_t pathsize; /* Amount allocated for it. */ char *munged_filename; /* path or basename(path) */ @@ -500,6 +517,8 @@ visit_old_format(struct process_data *procdata, void *context) if (EOF == procdata->c) return VISIT_ABORT; + procdata->unchanged_bytes = 0u; + /* Get the offset in the path where this path info starts. */ if (procdata->c == LOCATEDB_OLD_ESCAPE) { @@ -522,6 +541,10 @@ visit_old_format(struct process_data *procdata, void *context) procdata->count += (procdata->c - LOCATEDB_OLD_OFFSET); assert(procdata->count >= 0); } + /* Keep running statistics about how many characters are the same + * between consecutive pathnames. + */ + statistics.total_chars_reused += procdata->count; /* Overlay the old path with the remainder of the new. Read * more data until we get to the next filename. @@ -553,52 +576,55 @@ visit_old_format(struct process_data *procdata, void *context) */ extend (procdata, i, 1u); procdata->original_filename[i] = 0; + procdata->previous_len = procdata->len; + ++(procdata->file_name_count); procdata->len = i; procdata->munged_filename = procdata->original_filename; return VISIT_CONTINUE; } -static int -visit_locate02_format(struct process_data *procdata, void *context) + +static locate_count +read_locate02_itemcount(struct process_data *procdata) { - register char *s; - int nread; - (void) context; + locate_count newcount = procdata->count; if (procdata->slocatedb_format) { if (procdata->itemcount == 0) { ungetc(procdata->c, procdata->fp); - procdata->count = 0; + newcount = 0; + procdata->unchanged_bytes = 0u; procdata->len = 0; + procdata->previous_len = 0; } else if (procdata->itemcount == 1) { - procdata->count = procdata->len-1; + newcount = procdata->len-1; } else { if (procdata->c == LOCATEDB_ESCAPE) - procdata->count += (short)get_short (procdata->fp); + newcount = procdata->count + (short)get_short (procdata->fp); else if (procdata->c > 127) - procdata->count += procdata->c - 256; + newcount = procdata->count + procdata->c - 256; else - procdata->count += procdata->c; + newcount = procdata->count + procdata->c; } } else { if (procdata->c == LOCATEDB_ESCAPE) - procdata->count += (short)get_short (procdata->fp); + newcount = procdata->count + (short)get_short (procdata->fp); else if (procdata->c > 127) - procdata->count += procdata->c - 256; + newcount = procdata->count + procdata->c - 256; else - procdata->count += procdata->c; + newcount = procdata->count + procdata->c; } - if (procdata->count > procdata->len || procdata->count < 0) + if (newcount > procdata->len || newcount < 0) { /* This should not happen generally , but since we're * reading in data which is outside our control, we @@ -607,15 +633,46 @@ visit_locate02_format(struct process_data *procdata, void *context) error(1, 0, _("locate database %s is corrupt or invalid"), quotearg_n_style(0, locale_quoting_style, procdata->dbfile)); } - + + /* Keep running statistics about how many characters are the same + * between consecutive pathnames. + */ + statistics.total_chars_reused += newcount; + + return newcount; +} + + +static int +visit_locate02_format(struct process_data *procdata, void *context) +{ + register char *s; + int nread; + size_t new_len; + + (void) context; + locate_count newcount, oldcount; + + procdata->previous_len = procdata->len; + + oldcount = procdata->count; + newcount = read_locate02_itemcount(procdata); + /* Overlay the old path with the remainder of the new. */ nread = locate_read_str (&procdata->original_filename, &procdata->pathsize, - procdata->fp, 0, procdata->count); + procdata->fp, 0, newcount); + procdata->unchanged_bytes = newcount; + if (nread < 0) - return VISIT_ABORT; + { + procdata->count = newcount; + return VISIT_ABORT; + } + procdata->c = getc (procdata->fp); - procdata->len = procdata->count + nread; + procdata->len = newcount + nread; + s = procdata->original_filename + procdata->len - 1; /* Move to the last char in path. */ assert (s[0] != '\0'); assert (s[1] == '\0'); /* Our terminator. */ @@ -632,7 +689,8 @@ visit_locate02_format(struct process_data *procdata, void *context) } } - + procdata->count = newcount; + ++(procdata->file_name_count); return VISIT_CONTINUE; } @@ -734,50 +792,109 @@ visit_non_existing_nofollow(struct process_data *procdata, void *context) } } -static int -visit_substring_match_nocasefold_wide(struct process_data *procdata, void *context) -{ - const char *pattern = context; - - if (NULL != mbsstr(procdata->munged_filename, pattern)) - return VISIT_ACCEPTED; - else - return VISIT_REJECTED; -} - -static int -visit_substring_match_nocasefold_narrow(struct process_data *procdata, void *context) -{ - const char *pattern = context; - assert(MB_CUR_MAX == 1); - if (NULL != strstr(procdata->munged_filename, pattern)) - return VISIT_ACCEPTED; - else - return VISIT_REJECTED; -} - -static int -visit_substring_match_casefold_wide(struct process_data *procdata, void *context) +static int +visit_substring_match(struct process_data *procdata, + void *context) { - const char *pattern = context; + struct substring_match *pm = context; + const char *pattern = pm->pattern; + const char *match_pos; + size_t start_offset; + + bool match; - if (NULL != mbscasestr(procdata->munged_filename, pattern)) - return VISIT_ACCEPTED; + start_offset = 0u; + if (procdata->file_name_count == pm->sequence_number+1) + { + if (pm->last_result) + { + /* Previous match passed. If the leading bytes that did not + * match last time are unchanged, there is no point + * searching them again. + */ + if (pm->last_match_byte_offset < procdata->unchanged_bytes) + { + start_offset = pm->last_match_byte_offset; + } + else + { + /* We would like to estimate a start position + * for our search, but since the last unchanged + * byte may be the first byte in a multibyte sequence, + * I know of no way to do that. + * + * In any case, the last unchanged byte may be a partial + * match which is now completed by the new bytes, and so + * making this case work also means being able to move + * left enough characters to catch the full match; so even if + * we can figure out how to resynchronise near a point, the + * appropriate point is before the last unchanged byte anyway. + */ + start_offset = 0u; + } + } + else + { + /* We know there is no match between 0 and the + * previous length. But the last part of the + * previous string may be a partial match. + * + * We can move backward in unibyte strings, but + * not multibyte ones. + */ + if (pm->narrow) + { + if (procdata->previous_len < pm->pattern_len) + { + start_offset = 0u; + } + else + { + if (procdata->previous_len > pm->pattern_len) + { + start_offset = procdata->previous_len - pm->pattern_len; + if (start_offset < procdata->unchanged_bytes) + start_offset = procdata->unchanged_bytes; + } + else + { + start_offset = 0u; + } + } + } + else + { + /* I don't know a good way to move backwards in a + * multibyte string. + */ + start_offset = 0u; + } + } + } else - return VISIT_REJECTED; -} - - -static int -visit_substring_match_casefold_narrow(struct process_data *procdata, void *context) -{ - const char *pattern = context; + { + /* This matcher didn't get offered a chance last time, so + * procdata->unchanged_bytes may in fact be greater than the + * offset of the first byte that changed since the last time we + * were called for this substring_match instance. + */ + start_offset = 0u; + } - assert(MB_CUR_MAX == 1); - if (NULL != strcasestr(procdata->munged_filename, pattern)) - return VISIT_ACCEPTED; + pm->sequence_number = procdata->file_name_count; + match_pos = (pm->matcher) (&procdata->munged_filename[start_offset], pattern); + if (match_pos) + { + pm->last_match_byte_offset = match_pos - pattern; + pm->last_result = true; + return VISIT_ACCEPTED; + } else - return VISIT_REJECTED; + { + pm->last_match_byte_offset = 0u; + pm->last_result = false; + return VISIT_REJECTED; + } } @@ -913,6 +1030,10 @@ print_stats(int argc, size_t database_file_size) printf(_("\n\tand %s contain characters with the high bit set.\n"), human_readable (statistics.highbit_filename_count, hbuf, human_ceiling, 1, 1)); + printf(_("%s characters were unchanged between successive records.\n"), + human_readable (statistics.total_chars_reused, + hbuf, human_ceiling, 1, 1)); + if (!argc) { @@ -1034,6 +1155,23 @@ i_am_little_endian(void) return u.ui == 1; } +static struct substring_match* +new_substring_match(const char *pattern, + char* (*matcher)(const char*,const char*), + bool narrow) +{ + struct substring_match *p = malloc (sizeof(struct substring_match)); + if (p) + { + p->last_result = true; + p->last_match_byte_offset = 0u; + p->pattern = pattern; + p->pattern_len = strlen(pattern); + p->narrow = narrow; + p->matcher = matcher; + } + return p; +} @@ -1082,6 +1220,7 @@ search_one_database (int argc, oldformat = 0; procdata.endian_state = GetwordEndianStateInitial; procdata.len = procdata.count = 0; + procdata.unchanged_bytes = 0u; procdata.slocatedb_format = 0; procdata.itemcount = 0; @@ -1256,26 +1395,19 @@ search_one_database (int argc, * traditional behaviour. * James Youngman */ - visitfunc matcher; - if (1 == MB_CUR_MAX) + char* (*matcher)(const char*, const char*); + const bool narrow = (1 == MB_CUR_MAX); + if (narrow) { - /* As an optimisation, use a strstr() matcher if we are - * in a unibyte locale. This can give a x2 speedup in - * the C locale. Some light testing reveals that - * glibc's strstr() is somewhere around 40% faster than - * gnulib's, so we just use strstr(). - */ - matcher = ignore_case ? - visit_substring_match_casefold_narrow : - visit_substring_match_nocasefold_narrow; + /* Not usng a multibyte encoding. */ + matcher = ignore_case ? strcasestr : strstr; } else { - matcher = ignore_case ? - visit_substring_match_casefold_wide : - visit_substring_match_nocasefold_wide; + matcher = ignore_case ? mbscasestr : mbsstr; } - add_visitor(matcher, pathpart); + add_visitor(visit_substring_match, + new_substring_match(pathpart, matcher, narrow)); } } @@ -1770,6 +1902,7 @@ dolocate (int argc, char **argv, int secure_db_fd) statistics.compressed_bytes = statistics.total_filename_count = statistics.total_filename_length = + statistics.total_chars_reused = statistics.whitespace_count = statistics.newline_count = statistics.highbit_filename_count = 0u;