# # # patch "file_io.cc" # from [1c79107b3af55f17b13b5dae75d1e5dc3448e59f] # to [d09e05619e36cf4ed969a596b897b92a9a9050b1] # ============================================================ --- file_io.cc 1c79107b3af55f17b13b5dae75d1e5dc3448e59f +++ file_io.cc d09e05619e36cf4ed969a596b897b92a9a9050b1 @@ -133,39 +133,37 @@ directory_empty(any_path const & path) return true; } -static bool did_char_is_binary_init; -static bool char_is_binary[256]; +// This is not the greatest heuristic ever; it just looks for characters in +// the original ASCII control code range (00 - 1f, 7f) that are not white +// space (09 - 0D) or BEL/BS (07 - 08). But then, GNU diff just looks for NULs. We could do +// better if this was detecting character encoding (because then we could +// report wide encodings as such instead of treating them as binary) but the +// application proper isn't set up for that. +// +// Everything >= 128 *can* be a valid text character, depending on encoding, +// even in the 80 - 9F region that Unicode reserves for yet more useless +// control characters. +// +// N.B. the obvious algorithmic version of the inner loop here +// u8 c = s[i]; +// if (c <= 0x06 || (c >= 0x0E && c < 0x20) || c == 0x7F) +// return true; +// is about twice as slow on current hardware (Intel Core2 quad). -static void -set_char_is_binary(char c, bool is_binary) -{ - char_is_binary[static_cast(c)] = is_binary; -} - -static void -init_char_is_binary() -{ - // these do not occur in ASCII text files - // FIXME: this heuristic is (a) crap and (b) hardcoded. fix both these. 
- // Should be calling a lua hook here that can use set_char_is_binary() - // That will at least fix (b) - string nontext_chars("\x01\x02\x03\x04\x05\x06\x0e\x0f" - "\x10\x11\x12\x13\x14\x15\x16\x17\x18" - "\x19\x1a\x1c\x1d\x1e\x1f"); - set_char_is_binary('\0', true); - for(size_t i = 0; i < nontext_chars.size(); ++i) - { - set_char_is_binary(nontext_chars[i], true); - } -} - bool guess_binary(string const & s) { - if (did_char_is_binary_init == false) - { - init_char_is_binary(); - did_char_is_binary_init = true; - } + static const bool char_is_binary[256] = { + //_0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F + 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, // 0_ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1_ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2_ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 3_ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 4_ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 5_ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 6_ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, // 7_ + 0 // 8+ + }; for (size_t i = 0; i < s.size(); ++i) {