#
# Summary: make utf8_to_system(std::string const &, std::string &) fast by
# skipping charset_convert() when the system charset is UTF-8, or when it
# is an ASCII extension and the input is pure 7-bit ASCII.  The charset
# detection helpers are moved up in transforms.cc (they previously lived
# further down under filesystem_* names) and renamed system_charset_*.
#
# Review corrections applied to the added lines of this patch:
#   - system_charset_is_utf8() now calls system_charset_is_utf8_impl();
#     it called filesystem_is_utf8_impl(), which this same patch removes.
#   - the two fast paths in utf8_to_system() now do "ext = utf"; they
#     assigned to the const input parameter ("utf = ext").
#   - fixed the "at at time" typo in the added is_all_ascii() comment.
#
# NOTE(review): the "to" hash recorded for transforms.cc below was taken
# from the uncorrected content and must be regenerated.
# NOTE(review): any remaining users of the removed filesystem_* helpers
# outside the hunks shown here would need renaming too -- confirm.
#
# patch "ChangeLog"
#  from [7d20d582c8d113321864da68087143fd5300f126]
#    to [67e45834f2d1b3f5d62ba648c23f7ab624f8f6df]
#
# patch "transforms.cc"
#  from [ae9ac59d6edadaf20c783491e293aa1404266aca]
#    to [b86e7728a3cdded2731266cafc985a11ce3c0654]
#
========================================================================
--- ChangeLog	7d20d582c8d113321864da68087143fd5300f126
+++ ChangeLog	67e45834f2d1b3f5d62ba648c23f7ab624f8f6df
@@ -1,8 +1,12 @@
 2005-08-23  Nathaniel Smith
 
+	* transforms.cc (utf8_to_system): Make fast.
+
+2005-08-23  Nathaniel Smith
+
 	* transforms.hh (utf8_to_system): Add a string->string version.
 	* transforms.cc (utf8_to_system): Implement it.
-	
+
 2005-08-23  Nathaniel Smith
 
 	* paths.cc (localized_path_str): New function.
========================================================================
--- transforms.cc	ae9ac59d6edadaf20c783491e293aa1404266aca
+++ transforms.cc	b86e7728a3cdded2731266cafc985a11ce3c0654
@@ -534,7 +534,6 @@
     }
 }
 
-
 void
 system_to_utf8(external const & ext, utf8 & utf)
 {
@@ -543,10 +542,72 @@
   utf = out;
 }
 
+// Lots of gunk to avoid charset conversion as much as possible.  Running
+// iconv over every element of every path in a 30,000 file manifest takes
+// multiple seconds, which then is a minimum bound on pretty much any
+// operation we do...
+static inline bool
+system_charset_is_utf8_impl()
+{
+  std::string lc_encoding = lowercase(system_charset());
+  return (lc_encoding == "utf-8"
+          || lc_encoding == "utf_8"
+          || lc_encoding == "utf8");
+}
+
+static inline bool
+system_charset_is_utf8()
+{
+  static bool it_is = system_charset_is_utf8_impl();
+  return it_is;
+}
+
+static inline bool
+system_charset_is_ascii_extension_impl()
+{
+  if (system_charset_is_utf8())
+    return true;
+  std::string lc_encoding = lowercase(system_charset());
+  // if your character set is identical to ascii in the lower 7 bits, then add
+  // it here for a speed boost.
+  return (lc_encoding.find("ascii") != std::string::npos
+          || lc_encoding.find("8859") != std::string::npos
+          || lc_encoding.find("ansi_x3.4") != std::string::npos
+          // http://www.cs.mcgill.ca/~aelias4/encodings.html -- "EUC (Extended
+          // Unix Code) is a simple and clean encoding, standard on Unix
+          // systems.... It is backwards-compatible with ASCII (i.e. valid
+          // ASCII implies valid EUC)."
+          || lc_encoding.find("euc") != std::string::npos);
+}
+
+static inline bool
+system_charset_is_ascii_extension()
+{
+  static bool it_is = system_charset_is_ascii_extension_impl();
+  return it_is;
+}
+
+inline static bool
+is_all_ascii(string const & utf)
+{
+  // could speed this up by vectorization -- mask against 0x80808080,
+  // process a whole word at a time...
+  for (std::string::const_iterator i = utf.begin(); i != utf.end(); ++i)
+    if (0x80 & *i)
+      return false;
+  return true;
+}
+
 void
 utf8_to_system(std::string const & utf, std::string & ext)
 {
-  charset_convert("UTF-8", system_charset(), utf, ext);
+  if (system_charset_is_utf8())
+    ext = utf;
+  else if (system_charset_is_ascii_extension()
+           && is_all_ascii(utf))
+    ext = utf;
+  else
+    charset_convert("UTF-8", system_charset(), utf, ext);
 }
 
 void
@@ -605,62 +666,6 @@
   free(out);
 }
 
-// Lots of gunk to avoid charset conversion as much as possible.  Running
-// iconv over every element of every path in a 30,000 file manifest takes
-// multiple seconds, which then is a minimum bound on pretty much any
-// operation we do...
-static inline bool
-filesystem_is_utf8_impl()
-{
-  std::string lc_encoding = lowercase(system_charset());
-  return (lc_encoding == "utf-8"
-          || lc_encoding == "utf_8"
-          || lc_encoding == "utf8");
-}
-
-static inline bool
-filesystem_is_utf8()
-{
-  static bool it_is = filesystem_is_utf8_impl();
-  return it_is;
-}
-
-static inline bool
-filesystem_is_ascii_extension_impl()
-{
-  if (filesystem_is_utf8())
-    return true;
-  std::string lc_encoding = lowercase(system_charset());
-  // if your character set is identical to ascii in the lower 7 bits, then add
-  // it here for a speed boost.
-  return (lc_encoding.find("ascii") != std::string::npos
-          || lc_encoding.find("8859") != std::string::npos
-          || lc_encoding.find("ansi_x3.4") != std::string::npos
-          // http://www.cs.mcgill.ca/~aelias4/encodings.html -- "EUC (Extended
-          // Unix Code) is a simple and clean encoding, standard on Unix
-          // systems.... It is backwards-compatible with ASCII (i.e. valid
-          // ASCII implies valid EUC)."
-          || lc_encoding.find("euc") != std::string::npos);
-}
-
-static inline bool
-filesystem_is_ascii_extension()
-{
-  static bool it_is = filesystem_is_ascii_extension_impl();
-  return it_is;
-}
-
-inline static bool
-is_all_ascii(string const & utf)
-{
-  // could speed this up by vectorization -- mask against 0x80808080,
-  // process a whole word at at time...
-  for (std::string::const_iterator i = utf.begin(); i != utf.end(); ++i)
-    if (0x80 & *i)
-      return false;
-  return true;
-}
-
 inline static fs::path
 localized_impl(string const & utf)
 {