# # # add_file "charset.cc" # content [e954d3206a9df54b32bea7aba29d723fbad74f14] # # add_file "charset.hh" # content [e4f776a4afa20b3a770e0a6ac87824ebd24232ef] # # add_file "simplestring_xform.cc" # content [b6d21fa059da420562b8dfea0e833b1806d7766d] # # add_file "simplestring_xform.hh" # content [5226320784043474c9f5fd799b8e86ca2f070d8d] # # patch "ChangeLog" # from [b7eab83ddef0e1e1aa9ce27433aefe5ef7c1767a] # to [9d48a6925823f964e9a54d74c749a8f36aa0b010] # # patch "Makefile.am" # from [9e4f81a1650f6b7ea78fc91412facc9085d17370] # to [dff9212d58d7f5530cd3474b073ebb03114922f4] # # patch "annotate.cc" # from [714dd862061320e7a6b98231a9fece85fb777f6d] # to [5f284a607f1ef3dd5a7e192a58c396ce135ae9ff] # # patch "app_state.cc" # from [659c5d285188926cc6837777a4e8ba037bb8dc92] # to [6a8be1d3051e8e84464730d9d4f60cc9abca098d] # # patch "cert.cc" # from [49898212ea84352348d3de005ec0f64aedacec6d] # to [90434ca979245b8d440c3daead3d8ad5120e27ee] # # patch "cmd_db.cc" # from [f52e5e7e9d01d7cc025ed7dc398e8b108952a159] # to [cef42bd1ec6ec18b7e37bfbd70d1d9decee2d679] # # patch "cmd_diff_log.cc" # from [a831b231bb020f181f60425bd0bca65624c140c5] # to [e4ee11b436926500bb79a452886c3570dad608af] # # patch "cmd_files.cc" # from [180ee596a5ca3008d4ba94215d821e769f81a6b9] # to [6979ad01ab19f55ea0fa0a119cef5c4b7a6c4bdc] # # patch "cmd_key_cert.cc" # from [4791bf13e667065cab77e9e26b33c0ae2b255b7c] # to [6b1f09475de549a193b7604f4512b0a4e6a5ff2e] # # patch "cmd_list.cc" # from [a7051aab9b0e1dcb9d6880c12c41c1df3abf309e] # to [5fcd71a963f52bf864f3a8f392c8d9ae328ac953] # # patch "commands.cc" # from [54fcd00817ebb45cbb14262b8538da0ff5a19c01] # to [a02d61685a3807368a2e1ec1fed79ea5f342ea79] # # patch "diff_patch.cc" # from [d33f922b61cd32299ba5adad92c746680b392cc8] # to [aa0b86873084492d817470968a14537d56168bce] # # patch "file_io.cc" # from [c697f41e7ea39c7ba830a3662f9750449402f5fd] # to [76f4b114209f9e82c86a7af82cea0f36a75bf447] # # patch "keys.cc" # from [4c974aa7fc542e4b9d3361a185f0ee29291af931] # to [ab0e49cae3664e3aa8f52c652e0825dea72de281] # # patch "monotone.cc" # from [cf9b8d637091861448eb03a8c358a8708c27aebb] # to [7ce7990a7e4ebb99f1980ccdeca7a6e1b297ea7b] # # patch "packet.cc" # from [0403b0840c9d378669cdebee37b9bd5247e0f4b7] # to [130498edfbbd9c527c89e6e6c5c2d8d6193d3590] # # patch "paths.cc" # from [375da7ea4ea6a8a047824de80c1c5a5f3daef181] # to [5680d2c2726d5894fe21cc4c7dd9f149ba1efb32] # # patch "revision.cc" # from [30726868211c9c820002c3e281a5728c4a640ac6] # to [e41a1f765fa1a8d68288a62f74da3215864ada44] # # patch "roster.cc" # from [a57e3ffdc43ec0089cbcf72ad5d8165945cbcdad] # to [0aa31818d4a5c43136c8b6fe3ff2e0ba9f3aa751] # # patch "sanity.cc" # from [113604698c798bcaea7d7d4fb42417090fe4cec1] # to [d10487eb6479f58640483c72bb8c207903b0ade1] # # patch "transforms.cc" # from [d3970881bad05091164e06c3186db95beebf6533] # to [079c3479e99dda5711ca148a67c93f756641b0d1] # # patch "transforms.hh" # from [0ab0d07b7a5b8d05c7c64a9770429435d667475e] # to [72398b1a08314a36352c7216fff9f6e2c00f74b3] # # patch "ui.cc" # from [1d1fcfee81b62f2698d9deb380c49f9c29765a67] # to [180abc6a7a5e1e7c552abc05d8d1806aebcae786] # # patch "unit_tests.cc" # from [b9f978cb102ef892f9c08f757e2150107e1e792d] # to [9b3d3fd11212c735e093314f79e5fbd76977ebab] # # patch "unit_tests.hh" # from [0b883587fecefa652cf6bcc11c45dd326ad2a181] # to [dd69dcf3b16174632d8e4ef123ae6404208fc81e] # # patch "work.cc" # from [7968939058440542a070522a04e9f36e6711f318] # to [53ac432f1fc3241c5e2b3141bbbe01fe96ce7401] # ============================================================ --- charset.cc e954d3206a9df54b32bea7aba29d723fbad74f14 +++ charset.cc e954d3206a9df54b32bea7aba29d723fbad74f14 @@ -0,0 +1,853 @@ +#include + +#include + +#include "charset.hh" +#include "sanity.hh" +#include "simplestring_xform.hh" +#include "numeric_vocab.hh" + +#include "idna/idna.h" +#include "idna/stringprep.h" + +using std::string; +using std::vector; + +// general character code conversion routines + +static string +system_charset() +{ + char const * locale_charset_name = stringprep_locale_charset (); + I(locale_charset_name != NULL); + string sys_charset(locale_charset_name); + return sys_charset; +} + +void +charset_convert(string const & src_charset, + string const & dst_charset, + string const & src, + string & dst) +{ + if (src_charset == dst_charset) + dst = src; + else + { + L(FL("converting %d bytes from %s to %s\n") % src.size() + % src_charset % dst_charset); + char * converted = stringprep_convert(src.c_str(), + dst_charset.c_str(), + src_charset.c_str()); + E(converted != NULL, + F("failed to convert string from %s to %s: '%s'") + % src_charset % dst_charset % src); + dst = string(converted); + free(converted); + } +} + +void +system_to_utf8(external const & ext, utf8 & utf) +{ + string out; + charset_convert(system_charset(), "UTF-8", ext(), out); + I(utf8_validate(out)); + utf = out; +} + +size_t +display_width(utf8 const & utf) +{ + std::string const & u = utf(); + size_t sz = 0; + std::string::const_iterator i = u.begin(); + while (i != u.end()) + { + if (UNLIKELY(static_cast(*i) & static_cast(0x80))) + { + // A UTF-8 escape: consume the full escape. + ++i; + ++sz; + while (i != u.end() + && (static_cast(*i) & static_cast(0x80)) + && (!(static_cast(*i) & static_cast(0x40)))) + ++i; + } + else + { + // An ASCII-like character in the range 0..0x7F. + ++i; + ++sz; + } + } + return sz; +} + +// Lots of gunk to avoid charset conversion as much as possible. Running +// iconv over every element of every path in a 30,000 file manifest takes +// multiple seconds, which then is a minimum bound on pretty much any +// operation we do... +static inline bool +system_charset_is_utf8_impl() +{ + std::string lc_encoding = lowercase(system_charset()); + return (lc_encoding == "utf-8" + || lc_encoding == "utf_8" + || lc_encoding == "utf8"); +} + +static inline bool +system_charset_is_utf8() +{ + static bool it_is = system_charset_is_utf8_impl(); + return it_is; +} + +static inline bool +system_charset_is_ascii_extension_impl() +{ + if (system_charset_is_utf8()) + return true; + std::string lc_encoding = lowercase(system_charset()); + // if your character set is identical to ascii in the lower 7 bits, then add + // it here for a speed boost. + return (lc_encoding.find("ascii") != std::string::npos + || lc_encoding.find("8859") != std::string::npos + || lc_encoding.find("ansi_x3.4") != std::string::npos + // http://www.cs.mcgill.ca/~aelias4/encodings.html -- "EUC (Extended + // Unix Code) is a simple and clean encoding, standard on Unix + // systems.... It is backwards-compatible with ASCII (i.e. valid + // ASCII implies valid EUC)." + || lc_encoding.find("euc") != std::string::npos); +} + +static inline bool +system_charset_is_ascii_extension() +{ + static bool it_is = system_charset_is_ascii_extension_impl(); + return it_is; +} + +inline static bool +is_all_ascii(string const & utf) +{ + // could speed this up by vectorization -- mask against 0x80808080, + // process a whole word at at time... + for (std::string::const_iterator i = utf.begin(); i != utf.end(); ++i) + if (0x80 & *i) + return false; + return true; +} + +// this function must be fast. do not make it slow. +void +utf8_to_system(utf8 const & utf, std::string & ext) +{ + if (system_charset_is_utf8()) + ext = utf(); + else if (system_charset_is_ascii_extension() + && is_all_ascii(utf())) + ext = utf(); + else + charset_convert("UTF-8", system_charset(), utf(), ext); +} + +void +utf8_to_system(utf8 const & utf, external & ext) +{ + string out; + utf8_to_system(utf, out); + ext = out; +} + +// utf8_validate and the helper functions is_valid_unicode_char and +// utf8_consume_continuation_char g_utf8_validate and supporting functions +// from the file gutf8.c of the GLib library. + +static bool +is_valid_unicode_char(u32 c) +{ + return (c < 0x110000 && + ((c & 0xfffff800) != 0xd800) && + (c < 0xfdd0 || c > 0xfdef) && + (c & 0xfffe) != 0xfffe); +} + +static bool +utf8_consume_continuation_char(u8 c, u32 & val) +{ + if ((c & 0xc0) != 0x80) + return false; + val <<= 6; + val |= c & 0x3f; + return true; +} + +bool +utf8_validate(utf8 const & utf) +{ + std::string::size_type left = utf().size(); + u32 min, val; + + for (std::string::const_iterator i = utf().begin(); + i != utf().end(); ++i, --left) + { + u8 c = *i; + if (c < 128) + continue; + if ((c & 0xe0) == 0xc0) + { + if (left < 2) + return false; + if ((c & 0x1e) == 0) + return false; + ++i; --left; c = *i; + if ((c & 0xc0) != 0x80) + return false; + } + else + { + if ((c & 0xf0) == 0xe0) + { + if (left < 3) + return false; + min = 1 << 11; + val = c & 0x0f; + goto two_remaining; + } + else if ((c & 0xf8) == 0xf0) + { + if (left < 4) + return false; + min = 1 << 16; + val = c & 0x07; + } + else + return false; + ++i; --left; c = *i; + if (!utf8_consume_continuation_char(c, val)) + return false; +two_remaining: + ++i; --left; c = *i; + if (!utf8_consume_continuation_char(c, val)) + return false; + ++i; --left; c = *i; + if (!utf8_consume_continuation_char(c, val)) + return false; + if (val < min) + return false; + if (!is_valid_unicode_char(val)) + return false; + } + } + return true; +} + +static string +decode_idna_error(int err) +{ + switch (static_cast(err)) + { + case IDNA_STRINGPREP_ERROR: return "stringprep error"; break; + case IDNA_PUNYCODE_ERROR: return "punycode error"; break; + case IDNA_CONTAINS_NON_LDH: return "non-LDH characters"; break; + case IDNA_CONTAINS_MINUS: return "leading / trailing hyphen-minus character"; break; + case IDNA_INVALID_LENGTH: return "invalid length (output must be between 1 and 63 chars)"; break; + case IDNA_NO_ACE_PREFIX: return "no ace prefix"; break; + case IDNA_ROUNDTRIP_VERIFY_ERROR: return "roundtrip verify error"; break; + case IDNA_CONTAINS_ACE_PREFIX: return "contains ACE prefix (\"xn--\")"; break; + case IDNA_ICONV_ERROR: return "iconv error"; break; + case IDNA_MALLOC_ERROR: return "malloc error"; break; + default: return "unknown error"; break; + } + return "unknown error"; +} + +void +ace_to_utf8(ace const & a, utf8 & utf) +{ + char *out = NULL; + L(FL("converting %d bytes from IDNA ACE to UTF-8\n") % a().size()); + int res = idna_to_unicode_8z8z(a().c_str(), &out, IDNA_USE_STD3_ASCII_RULES); + N(res == IDNA_SUCCESS || res == IDNA_NO_ACE_PREFIX, + F("error converting %d UTF-8 bytes to IDNA ACE: %s") + % a().size() + % decode_idna_error(res)); + utf = string(out); + free(out); +} + +void +utf8_to_ace(utf8 const & utf, ace & a) +{ + char *out = NULL; + L(FL("converting %d bytes from UTF-8 to IDNA ACE\n") % utf().size()); + int res = idna_to_ascii_8z(utf().c_str(), &out, IDNA_USE_STD3_ASCII_RULES); + N(res == IDNA_SUCCESS, + F("error converting %d UTF-8 bytes to IDNA ACE: %s") + % utf().size() + % decode_idna_error(res)); + a = string(out); + free(out); +} + +void +internalize_cert_name(utf8 const & utf, cert_name & c) +{ + ace a; + utf8_to_ace(utf, a); + c = a(); +} + +void +internalize_cert_name(external const & ext, cert_name & c) +{ + utf8 utf; + system_to_utf8(ext(), utf); + internalize_cert_name(utf, c); +} + +void +externalize_cert_name(cert_name const & c, utf8 & utf) +{ + ace_to_utf8(ace(c()), utf); +} + +void +externalize_cert_name(cert_name const & c, external & ext) +{ + utf8 utf; + externalize_cert_name(c, utf); + utf8_to_system(utf, ext); +} + +void +internalize_rsa_keypair_id(utf8 const & utf, rsa_keypair_id & key) +{ + string tmp; + typedef boost::tokenizer > + tokenizer; + boost::char_separator sep("", ".@", boost::keep_empty_tokens); + tokenizer tokens(utf(), sep); + bool in_domain = false; + for(tokenizer::iterator i = tokens.begin(); i != tokens.end(); ++i) + { + if (!in_domain || *i == "." || *i == "@") + tmp += *i; + else + { + ace a; + utf8_to_ace(*i, a); + tmp += a(); + } + if (*i == "@") + in_domain = true; + } + key = tmp; +} + +void +internalize_rsa_keypair_id(external const & ext, rsa_keypair_id & key) +{ + utf8 utf; + system_to_utf8(ext, utf); + internalize_rsa_keypair_id(utf, key); +} + +void +externalize_rsa_keypair_id(rsa_keypair_id const & key, utf8 & utf) +{ + string tmp; + typedef boost::tokenizer > + tokenizer; + boost::char_separator sep("", ".@", boost::keep_empty_tokens); + tokenizer tokens(key(), sep); + bool in_domain = false; + for(tokenizer::iterator i = tokens.begin(); i != tokens.end(); ++i) + { + if (!in_domain || *i == "." || *i == "@") + tmp += *i; + else + { + ace a(*i); + utf8 u; + ace_to_utf8(a, u); + tmp += u(); + } + if (*i == "@") + in_domain = true; + } + utf = tmp; +} + +void +externalize_rsa_keypair_id(rsa_keypair_id const & key, external & ext) +{ + utf8 utf; + externalize_rsa_keypair_id(key, utf); + utf8_to_system(utf, ext); +} + +void +internalize_var_domain(utf8 const & utf, var_domain & d) +{ + ace a; + utf8_to_ace(utf, a); + d = a(); +} + +void +internalize_var_domain(external const & ext, var_domain & d) +{ + utf8 utf; + system_to_utf8(ext(), utf); + internalize_var_domain(utf, d); +} + +void +externalize_var_domain(var_domain const & d, utf8 & utf) +{ + ace_to_utf8(ace(d()), utf); +} + +void +externalize_var_domain(var_domain const & d, external & ext) +{ + utf8 utf; + externalize_var_domain(d, utf); + utf8_to_system(utf, ext); +} + + +#ifdef BUILD_UNIT_TESTS +#include "unit_tests.hh" +#include + +#define IDNA_ACE_PREFIX "xn--" +#define IDNA_SUCCESS 0 + +struct +idna +{ + char *name; + size_t inlen; + uint32_t in[100]; + char *out; + int allowunassigned; + int usestd3asciirules; + int toasciirc; + int tounicoderc; +} idna_vec[] = + { + { + "Arabic (Egyptian)", 17, + { + 0x0644, 0x064A, 0x0647, 0x0645, 0x0627, 0x0628, 0x062A, 0x0643, + 0x0644, 0x0645, 0x0648, 0x0634, 0x0639, 0x0631, 0x0628, 0x064A, + 0x061F}, + IDNA_ACE_PREFIX "egbpdaj6bu4bxfgehfvwxn", 0, 0, IDNA_SUCCESS, + IDNA_SUCCESS}, + { + "Chinese (simplified)", 9, + { + 0x4ED6, 0x4EEC, 0x4E3A, 0x4EC0, 0x4E48, 0x4E0D, 0x8BF4, 0x4E2D, 0x6587}, + IDNA_ACE_PREFIX "ihqwcrb4cv8a8dqg056pqjye", 0, 0, IDNA_SUCCESS, + IDNA_SUCCESS}, + { + "Chinese (traditional)", 9, + { + 0x4ED6, 0x5011, 0x7232, 0x4EC0, 0x9EBD, 0x4E0D, 0x8AAA, 0x4E2D, 0x6587}, + IDNA_ACE_PREFIX "ihqwctvzc91f659drss3x8bo0yb", 0, 0, IDNA_SUCCESS, + IDNA_SUCCESS}, + { + "Czech", 22, + { + 0x0050, 0x0072, 0x006F, 0x010D, 0x0070, 0x0072, 0x006F, 0x0073, + 0x0074, 0x011B, 0x006E, 0x0065, 0x006D, 0x006C, 0x0075, 0x0076, + 0x00ED, 0x010D, 0x0065, 0x0073, 0x006B, 0x0079}, + IDNA_ACE_PREFIX "Proprostnemluvesky-uyb24dma41a", 0, 0, IDNA_SUCCESS, + IDNA_SUCCESS}, + { + "Hebrew", 22, + { + 0x05DC, 0x05DE, 0x05D4, 0x05D4, 0x05DD, 0x05E4, 0x05E9, 0x05D5, + 0x05D8, 0x05DC, 0x05D0, 0x05DE, 0x05D3, 0x05D1, 0x05E8, 0x05D9, + 0x05DD, 0x05E2, 0x05D1, 0x05E8, 0x05D9, 0x05EA}, + IDNA_ACE_PREFIX "4dbcagdahymbxekheh6e0a7fei0b", 0, 0, IDNA_SUCCESS, + IDNA_SUCCESS}, + { + "Hindi (Devanagari)", 30, + { + 0x092F, 0x0939, 0x0932, 0x094B, 0x0917, 0x0939, 0x093F, 0x0928, + 0x094D, 0x0926, 0x0940, 0x0915, 0x094D, 0x092F, 0x094B, 0x0902, + 0x0928, 0x0939, 0x0940, 0x0902, 0x092C, 0x094B, 0x0932, 0x0938, + 0x0915, 0x0924, 0x0947, 0x0939, 0x0948, 0x0902}, + IDNA_ACE_PREFIX "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd", 0, 0, + IDNA_SUCCESS}, + { + "Japanese (kanji and hiragana)", 18, + { + 0x306A, 0x305C, 0x307F, 0x3093, 0x306A, 0x65E5, 0x672C, 0x8A9E, + 0x3092, 0x8A71, 0x3057, 0x3066, 0x304F, 0x308C, 0x306A, 0x3044, + 0x306E, 0x304B}, + IDNA_ACE_PREFIX "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa", 0, 0, + IDNA_SUCCESS}, + { + "Russian (Cyrillic)", 28, + { + 0x043F, 0x043E, 0x0447, 0x0435, 0x043C, 0x0443, 0x0436, 0x0435, + 0x043E, 0x043D, 0x0438, 0x043D, 0x0435, 0x0433, 0x043E, 0x0432, + 0x043E, 0x0440, 0x044F, 0x0442, 0x043F, 0x043E, 0x0440, 0x0443, + 0x0441, 0x0441, 0x043A, 0x0438}, + IDNA_ACE_PREFIX "b1abfaaepdrnnbgefbadotcwatmq2g4l", 0, 0, + IDNA_SUCCESS, IDNA_SUCCESS}, + { + "Spanish", 40, + { + 0x0050, 0x006F, 0x0072, 0x0071, 0x0075, 0x00E9, 0x006E, 0x006F, + 0x0070, 0x0075, 0x0065, 0x0064, 0x0065, 0x006E, 0x0073, 0x0069, + 0x006D, 0x0070, 0x006C, 0x0065, 0x006D, 0x0065, 0x006E, 0x0074, + 0x0065, 0x0068, 0x0061, 0x0062, 0x006C, 0x0061, 0x0072, 0x0065, + 0x006E, 0x0045, 0x0073, 0x0070, 0x0061, 0x00F1, 0x006F, 0x006C}, + IDNA_ACE_PREFIX "PorqunopuedensimplementehablarenEspaol-fmd56a", 0, 0, + IDNA_SUCCESS}, + { + "Vietnamese", 31, + { + 0x0054, 0x1EA1, 0x0069, 0x0073, 0x0061, 0x006F, 0x0068, 0x1ECD, + 0x006B, 0x0068, 0x00F4, 0x006E, 0x0067, 0x0074, 0x0068, 0x1EC3, + 0x0063, 0x0068, 0x1EC9, 0x006E, 0x00F3, 0x0069, 0x0074, 0x0069, + 0x1EBF, 0x006E, 0x0067, 0x0056, 0x0069, 0x1EC7, 0x0074}, + IDNA_ACE_PREFIX "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g", 0, 0, + IDNA_SUCCESS}, + { + "Japanese", 8, + { + 0x0033, 0x5E74, 0x0042, 0x7D44, 0x91D1, 0x516B, 0x5148, 0x751F}, + IDNA_ACE_PREFIX "3B-ww4c5e180e575a65lsy2b", 0, 0, IDNA_SUCCESS, + IDNA_SUCCESS}, + { + "Japanese", 24, + { + 0x5B89, 0x5BA4, 0x5948, 0x7F8E, 0x6075, 0x002D, 0x0077, 0x0069, + 0x0074, 0x0068, 0x002D, 0x0053, 0x0055, 0x0050, 0x0045, 0x0052, + 0x002D, 0x004D, 0x004F, 0x004E, 0x004B, 0x0045, 0x0059, 0x0053}, + IDNA_ACE_PREFIX "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n", 0, 0, + IDNA_SUCCESS}, + { + "Japanese", 25, + { + 0x0048, 0x0065, 0x006C, 0x006C, 0x006F, 0x002D, 0x0041, 0x006E, + 0x006F, 0x0074, 0x0068, 0x0065, 0x0072, 0x002D, 0x0057, 0x0061, + 0x0079, 0x002D, 0x305D, 0x308C, 0x305E, 0x308C, 0x306E, 0x5834, + 0x6240}, + IDNA_ACE_PREFIX "Hello-Another-Way--fc4qua05auwb3674vfr0b", 0, 0, + IDNA_SUCCESS}, + { + "Japanese", 8, + { + 0x3072, 0x3068, 0x3064, 0x5C4B, 0x6839, 0x306E, 0x4E0B, 0x0032}, + IDNA_ACE_PREFIX "2-u9tlzr9756bt3uc0v", 0, 0, IDNA_SUCCESS, + IDNA_SUCCESS}, + { + "Japanese", 13, + { + 0x004D, 0x0061, 0x006A, 0x0069, 0x3067, 0x004B, 0x006F, 0x0069, + 0x3059, 0x308B, 0x0035, 0x79D2, 0x524D}, + IDNA_ACE_PREFIX "MajiKoi5-783gue6qz075azm5e", 0, 0, IDNA_SUCCESS, + IDNA_SUCCESS}, + { + "Japanese", 9, + { + 0x30D1, 0x30D5, 0x30A3, 0x30FC, 0x0064, 0x0065, 0x30EB, 0x30F3, 0x30D0}, + IDNA_ACE_PREFIX "de-jg4avhby1noc0d", 0, 0, IDNA_SUCCESS, IDNA_SUCCESS}, + { + "Japanese", 7, + { + 0x305D, 0x306E, 0x30B9, 0x30D4, 0x30FC, 0x30C9, 0x3067}, + IDNA_ACE_PREFIX "d9juau41awczczp", 0, 0, IDNA_SUCCESS, IDNA_SUCCESS}, + { + "Greek", 8, + {0x03b5, 0x03bb, 0x03bb, 0x03b7, 0x03bd, 0x03b9, 0x03ba, 0x03ac}, + IDNA_ACE_PREFIX "hxargifdar", 0, 0, IDNA_SUCCESS, IDNA_SUCCESS}, + { + "Maltese (Malti)", 10, + {0x0062, 0x006f, 0x006e, 0x0121, 0x0075, 0x0073, 0x0061, 0x0127, + 0x0127, 0x0061}, + IDNA_ACE_PREFIX "bonusaa-5bb1da", 0, 0, IDNA_SUCCESS, IDNA_SUCCESS}, + { + "Russian (Cyrillic)", 28, + {0x043f, 0x043e, 0x0447, 0x0435, 0x043c, 0x0443, 0x0436, 0x0435, + 0x043e, 0x043d, 0x0438, 0x043d, 0x0435, 0x0433, 0x043e, 0x0432, + 0x043e, 0x0440, 0x044f, 0x0442, 0x043f, 0x043e, 0x0440, 0x0443, + 0x0441, 0x0441, 0x043a, 0x0438}, + IDNA_ACE_PREFIX "b1abfaaepdrnnbgefbadotcwatmq2g4l", 0, 0, + IDNA_SUCCESS, IDNA_SUCCESS}, + }; + +static void +check_idna_encoding() +{ + putenv("CHARSET=UTF-8"); + + for (size_t i = 0; i < sizeof(idna_vec) / sizeof(struct idna); ++i) + { + BOOST_CHECKPOINT("IDNA language: " + string(idna_vec[i].name)); + + size_t p, q; + char *uc = stringprep_ucs4_to_utf8(idna_vec[i].in, + idna_vec[i].inlen, + &p, &q); + utf8 utf = string(uc); + utf8 tutf; + free(uc); + + ace a = string(idna_vec[i].out); + ace tace; + utf8_to_ace(utf, tace); + L(boost::format("ACE-encoded %s: '%s'\n") % idna_vec[i].name % tace()); + BOOST_CHECK(lowercase(a()) == lowercase(tace())); + ace_to_utf8(a, tutf); + BOOST_CHECK(lowercase(utf()) == lowercase(tutf())); + } +} + +static void encode_test() +{ + check_idna_encoding(); +} + +static void utf8_validation_test() +{ + // these tests are based on the tests from the file utf8-validate.c of the + // GLib library, and also include sequences from Markus Kuhn's UTF-8 + // example files. + const char* good_strings[] = { + "this is a valid but boring ASCII string", + "\x28\x28\x56\xe2\x8d\xb3\x56\x29\x3d\xe2\x8d\xb3\xe2\x8d\xb4\x56\x29\x2f\x56\xe2\x86\x90\x2c\x56\x20\x20\x20\x20\xe2\x8c\xb7\xe2\x86\x90\xe2\x8d\xb3\xe2\x86\x92\xe2\x8d\xb4\xe2\x88\x86\xe2\x88\x87\xe2\x8a\x83\xe2\x80\xbe\xe2\x8d\x8e\xe2\x8d\x95\xe2\x8c\x88", + "\xe2\x80\x98\x73\x69\x6e\x67\x6c\x65\xe2\x80\x99\x20\x61\x6e\x64\x20\xe2\x80\x9c\x64\x6f\x75\x62\x6c\x65\xe2\x80\x9d\x20\x71\x75\x6f\x74\x65\x73", + "\xe2\x80\xa2\x20\x43\x75\x72\x6c\x79\x20\x61\x70\x6f\x73\x74\x72\x6f\x70\x68\x65\x73\x3a\x20\xe2\x80\x9c\x57\x65\xe2\x80\x99\x76\x65\x20\x62\x65\x65\x6e\x20\x68\x65\x72\x65\xe2\x80\x9d", + "\xe2\x80\x9a\x64\x65\x75\x74\x73\x63\x68\x65\xe2\x80\x98\x20\xe2\x80\x9e\x41\x6e\x66\xc3\xbc\x68\x72\x75\x6e\x67\x73\x7a\x65\x69\x63\x68\x65\x6e\xe2\x80\x9c", + "\xe2\x80\xa0\x2c\x20\xe2\x80\xa1\x2c\x20\xe2\x80\xb0\x2c\x20\xe2\x80\xa2\x2c\x20\x33\xe2\x80\x93\x34\x2c\x20\xe2\x80\x94\x2c\x20\xe2\x88\x92\x35\x2f\x2b\x35\x2c\x20\xe2\x84\xa2\x2c\x20\xe2\x80\xa6", + "\xc2\xa9\xc2\xa9\xc2\xa9", + "\xe2\x89\xa0\xe2\x89\xa0", + "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5", + "\x00", + "\xc2\x80", + "\xe0\xa0\x80", + "\xf0\x90\x80\x80", + "\x7f", + "\xdf\xbf", + "\xed\x9f\xbf", + "\xee\x80\x80", + "\xef\xbf\xbd", + 0 + }; + const char* bad_strings[] = { + "\xf8\x88\x80\x80\x80", + "\xfc\x84\x80\x80\x80\x80", + "\xef\xbf\xbf", + "\xf7\xbf\xbf\xbf", + "\xfb\xbf\xbf\xbf\xbf", + "\xfd\xbf\xbf\xbf\xbf\xbf", + "\xf4\x8f\xbf\xbf", + "\xf4\x90\x80\x80", + "\x80", + "\xbf", + "\x80\xbf", + "\x80\xbf\x80", + "\x80\xbf\x80\xbf", + "\x80\xbf\x80\xbf\x80", + "\x80\xbf\x80\xbf\x80\xbf", + "\x80\xbf\x80\xbf\x80\xbf\x80", + "\x80", + "\x81", + "\x82", + "\x83", + "\x84", + "\x85", + "\x86", + "\x87", + "\x88", + "\x89", + "\x8a", + "\x8b", + "\x8c", + "\x8d", + "\x8e", + "\x8f", + "\x90", + "\x91", + "\x92", + "\x93", + "\x94", + "\x95", + "\x96", + "\x97", + "\x98", + "\x99", + "\x9a", + "\x9b", + "\x9c", + "\x9d", + "\x9e", + "\x9f", + "\xa0", + "\xa1", + "\xa2", + "\xa3", + "\xa4", + "\xa5", + "\xa6", + "\xa7", + "\xa8", + "\xa9", + "\xaa", + "\xab", + "\xac", + "\xad", + "\xae", + "\xaf", + "\xb0", + "\xb1", + "\xb2", + "\xb3", + "\xb4", + "\xb5", + "\xb6", + "\xb7", + "\xb8", + "\xb9", + "\xba", + "\xbb", + "\xbc", + "\xbd", + "\xbe", + "\xbf", + "\xc0\x20", + "\xc1\x20", + "\xc2\x20", + "\xc3\x20", + "\xc4\x20", + "\xc5\x20", + "\xc6\x20", + "\xc7\x20", + "\xc8\x20", + "\xc9\x20", + "\xca\x20", + "\xcb\x20", + "\xcc\x20", + "\xcd\x20", + "\xce\x20", + "\xcf\x20", + "\xd0\x20", + "\xd1\x20", + "\xd2\x20", + "\xd3\x20", + "\xd4\x20", + "\xd5\x20", + "\xd6\x20", + "\xd7\x20", + "\xd8\x20", + "\xd9\x20", + "\xda\x20", + "\xdb\x20", + "\xdc\x20", + "\xdd\x20", + "\xde\x20", + "\xdf\x20", + "\xe0\x20", + "\xe1\x20", + "\xe2\x20", + "\xe3\x20", + "\xe4\x20", + "\xe5\x20", + "\xe6\x20", + "\xe7\x20", + "\xe8\x20", + "\xe9\x20", + "\xea\x20", + "\xeb\x20", + "\xec\x20", + "\xed\x20", + "\xee\x20", + "\xef\x20", + "\xf0\x20", + "\xf1\x20", + "\xf2\x20", + "\xf3\x20", + "\xf4\x20", + "\xf5\x20", + "\xf6\x20", + "\xf7\x20", + "\xf8\x20", + "\xf9\x20", + "\xfa\x20", + "\xfb\x20", + "\xfc\x20", + "\xfd\x20", + "\x20\xc0", + "\x20\xe0\x80", + "\x20\xf0\x80\x80", + "\x20\xf8\x80\x80\x80", + "\x20\xfc\x80\x80\x80\x80", + "\x20\xdf", + "\x20\xef\xbf", + "\x20\xf7\xbf\xbf", + "\x20\xfb\xbf\xbf\xbf", + "\x20\xfd\xbf\xbf\xbf\xbf", + "\x20\xfe\x20", + "\x20\xff\x20", + "\x20\xc0\xaf\x20", + "\x20\xe0\x80\xaf\x20", + "\x20\xf0\x80\x80\xaf\x20", + "\x20\xf8\x80\x80\x80\xaf\x20", + "\x20\xfc\x80\x80\x80\x80\xaf\x20", + "\x20\xc1\xbf\x20", + "\x20\xe0\x9f\xbf\x20", + "\x20\xf0\x8f\xbf\xbf\x20", + "\x20\xf8\x87\xbf\xbf\xbf\x20", + "\x20\xfc\x83\xbf\xbf\xbf\xbf\x20", + "\x20\xc0\x80\x20", + "\x20\xe0\x80\x80\x20", + "\x20\xf0\x80\x80\x80\x20", + "\x20\xf8\x80\x80\x80\x80\x20", + "\x20\xfc\x80\x80\x80\x80\x80\x20", + "\x20\xed\xa0\x80\x20", + "\x20\xed\xad\xbf\x20", + "\x20\xed\xae\x80\x20", + "\x20\xed\xaf\xbf\x20", + "\x20\xed\xb0\x80\x20", + "\x20\xed\xbe\x80\x20", + "\x20\xed\xbf\xbf\x20", + "\x20\xed\xa0\x80\xed\xb0\x80\x20", + "\x20\xed\xa0\x80\xed\xbf\xbf\x20", + "\x20\xed\xad\xbf\xed\xb0\x80\x20", + "\x20\xed\xad\xbf\xed\xbf\xbf\x20", + "\x20\xed\xae\x80\xed\xb0\x80\x20", + "\x20\xed\xae\x80\xed\xbf\xbf\x20", + "\x20\xed\xaf\xbf\xed\xb0\x80\x20", + "\x20\xed\xaf\xbf\xed\xbf\xbf\x20", + "\x20\xef\xbf\xbe\x20", + "\x20\xef\xbf\xbf\x20", + 0 + }; + + for (int i = 0; good_strings[i]; ++i) + BOOST_CHECK(utf8_validate(string(good_strings[i])) == true); + + for (int i = 0; bad_strings[i]; ++i) + BOOST_CHECK(utf8_validate(string(bad_strings[i])) == false); +} + + +void +add_charset_tests(test_suite * suite) +{ + I(suite); + suite->add(BOOST_TEST_CASE(&encode_test)); + suite->add(BOOST_TEST_CASE(&utf8_validation_test)); +} + +#endif // BUILD_UNIT_TESTS ============================================================ --- charset.hh e4f776a4afa20b3a770e0a6ac87824ebd24232ef +++ charset.hh e4f776a4afa20b3a770e0a6ac87824ebd24232ef @@ -0,0 +1,33 @@ +#ifndef __CHARSET_HH__ +#define __CHARSET_HH__ + +#include "vocab.hh" + +// charset conversions +void charset_convert(std::string const & src_charset, std::string const & dst_charset, + std::string const & src, std::string & dst); +void system_to_utf8(external const & system, utf8 & utf); +void utf8_to_system(utf8 const & utf, external & system); +void utf8_to_system(utf8 const & utf, std::string & system); +void ace_to_utf8(ace const & ac, utf8 & utf); +void utf8_to_ace(utf8 const & utf, ace & a); +bool utf8_validate(utf8 const & utf); + +// returns length in characters (not bytes) +size_t display_width(utf8 const & utf); + +// specific internal / external conversions for various vocab terms +void internalize_cert_name(utf8 const & utf, cert_name & c); +void internalize_cert_name(external const & ext, cert_name & c); +void externalize_cert_name(cert_name const & c, utf8 & utf); +void externalize_cert_name(cert_name const & c, external & ext); +void internalize_rsa_keypair_id(utf8 const & utf, rsa_keypair_id & key); +void internalize_rsa_keypair_id(external const & ext, rsa_keypair_id & key); +void externalize_rsa_keypair_id(rsa_keypair_id const & key, utf8 & utf); +void externalize_rsa_keypair_id(rsa_keypair_id const & key, external & ext); +void internalize_var_domain(utf8 const & utf, var_domain & d); +void internalize_var_domain(external const & ext, var_domain & d); +void externalize_var_domain(var_domain const & d, utf8 & utf); +void externalize_var_domain(var_domain const & d, external & ext); + +#endif ============================================================ --- simplestring_xform.cc b6d21fa059da420562b8dfea0e833b1806d7766d +++ simplestring_xform.cc b6d21fa059da420562b8dfea0e833b1806d7766d @@ -0,0 +1,256 @@ +#include "simplestring_xform.hh" +#include "sanity.hh" +#include "constants.hh" + +#include + +using std::string; +using std::vector; +using std::ostringstream; +using std::ostream_iterator; + +struct +lowerize +{ + char operator()(char const & c) const + { + return ::tolower(static_cast(c)); + } +}; + +string +lowercase(string const & in) +{ + string n(in); + transform(n.begin(), n.end(), n.begin(), lowerize()); + return n; +} + +struct +upperize +{ + char operator()(char const & c) const + { + return ::toupper(static_cast(c)); + } +}; + +string +uppercase(string const & in) +{ + string n(in); + transform(n.begin(), n.end(), n.begin(), upperize()); + return n; +} + +void split_into_lines(std::string const & in, + std::string const & encoding, + std::vector & out) +{ + std::string lc_encoding = lowercase(encoding); + out.clear(); + + // note: this function does not handle ISO-2022-X, Shift-JIS, and + // probably a good deal of other encodings as well. please expand + // the logic here if you can work out an easy way of doing line + // breaking on these encodings. currently it's just designed to + // work with charsets in which 0x0a / 0x0d are *always* \n and \r + // respectively. + // + // as far as I know, this covers the EUC, ISO-8859-X, GB, Big5, KOI, + // ASCII, and UTF-8 families of encodings. + + if (lc_encoding == constants::default_encoding + || lc_encoding.find("ascii") != std::string::npos + || lc_encoding.find("8859") != std::string::npos + || lc_encoding.find("euc") != std::string::npos + || lc_encoding.find("koi") != std::string::npos + || lc_encoding.find("gb") != std::string::npos + || lc_encoding == "utf-8" + || lc_encoding == "utf_8" + || lc_encoding == "utf8") + { + std::string::size_type begin = 0; + std::string::size_type end = in.find_first_of("\r\n", begin); + + while (end != std::string::npos && end >= begin) + { + out.push_back(in.substr(begin, end-begin)); + if (in.at(end) == '\r' + && in.size() > end+1 + && in.at(end+1) == '\n') + begin = end + 2; + else + begin = end + 1; + if (begin >= in.size()) + break; + end = in.find_first_of("\r\n", begin); + } + if (begin < in.size()) + out.push_back(in.substr(begin, in.size() - begin)); + } + else + { + out.push_back(in); + } +} + + +void +split_into_lines(string const & in, + vector & out) +{ + split_into_lines(in, constants::default_encoding, out); +} + +void +join_lines(vector const & in, + string & out, + string const & linesep) +{ + ostringstream oss; + copy(in.begin(), in.end(), ostream_iterator(oss, linesep.c_str())); + out = oss.str(); +} + +void +join_lines(vector const & in, + string & out) +{ + join_lines(in, out, "\n"); +} + +void +prefix_lines_with(string const & prefix, string const & lines, string & out) +{ + std::vector msgs; + split_into_lines(lines, msgs); + + ostringstream oss; + for (std::vector::const_iterator i = msgs.begin(); + i != msgs.end();) + { + oss << prefix << *i; + i++; + if (i != msgs.end()) + oss << "\n"; + } + + out = oss.str(); +} + +string +remove_ws(string const & s) +{ + string tmp; + tmp.reserve(s.size()); + for (string::const_iterator i = s.begin(); + i != s.end(); ++i) + { + switch (*i) + { + case '\n': + case '\r': + case '\t': + case ' ': + break; + default: + tmp += *i; + break; + } + } + return tmp; +} + +string +trim_ws(string const & s) +{ + string tmp = s; + string::size_type pos = tmp.find_last_not_of("\n\r\t "); + if (pos < string::npos) + tmp.erase(++pos); + pos = tmp.find_first_not_of("\n\r\t "); + if (pos < string::npos) + tmp = tmp.substr(pos); + return tmp; +} + +void +line_end_convert(string const & linesep, string const & src, string & dst) +{ + string linesep_str("\n"); + if (linesep == "CR" || linesep == "\r") + linesep_str = "\r"; + else if (linesep == "CRLF" || linesep == "\r\n") + linesep_str = "\r\n"; + else if (linesep == "LF"|| linesep == "\n") + linesep_str = "\n"; + + L(FL("doing linesep conversion to %s\n") % linesep); + vector tmp; + split_into_lines(src, tmp); + join_lines(tmp, dst, linesep_str); + if (src.size() >= linesep.size() && + (src.compare(src.size() - linesep.size(), linesep.size(), linesep) == 0)) + dst += linesep_str; +} + + +#ifdef BUILD_UNIT_TESTS +#include "unit_tests.hh" +#include + +static void +caseconv_test() +{ + BOOST_CHECK(uppercase("hello") == "HELLO"); + BOOST_CHECK(uppercase("heLlO") == "HELLO"); + BOOST_CHECK(lowercase("POODLE DAY") == "poodle day"); + BOOST_CHECK(lowercase("PooDLe DaY") == "poodle day"); + BOOST_CHECK(uppercase("address@hidden&*()") == "address@hidden&*()"); + BOOST_CHECK(lowercase("address@hidden&*()") == "address@hidden&*()"); +} + +static void +join_lines_test() +{ + vector strs; + string joined; + + strs.clear(); + join_lines(strs, joined); + BOOST_CHECK(joined == ""); + + strs.push_back("hi"); + join_lines(strs, joined); + BOOST_CHECK(joined == "hi\n"); + + strs.push_back("there"); + join_lines(strs, joined); + BOOST_CHECK(joined == "hi\nthere\n"); + + strs.push_back("user"); + join_lines(strs, joined); + BOOST_CHECK(joined == "hi\nthere\nuser\n"); +} + +static void +strip_ws_test() +{ + BOOST_CHECK(trim_ws("\n leading space") == "leading space"); + BOOST_CHECK(trim_ws("trailing space \n") == "trailing space"); + BOOST_CHECK(trim_ws("\t\n both \r \n\r\n") == "both"); + BOOST_CHECK(remove_ws(" I like going\tfor walks\n ") + == "Ilikegoingforwalks"); +} + +void +add_simplestring_xform_tests(test_suite * suite) +{ + I(suite); + suite->add(BOOST_TEST_CASE(&caseconv_test)); + suite->add(BOOST_TEST_CASE(&join_lines_test)); + suite->add(BOOST_TEST_CASE(&strip_ws_test)); +} + +#endif // BUILD_UNIT_TESTS ============================================================ --- simplestring_xform.hh 5226320784043474c9f5fd799b8e86ca2f070d8d +++ simplestring_xform.hh 5226320784043474c9f5fd799b8e86ca2f070d8d @@ -0,0 +1,37 @@ +#ifndef __SIMPLESTRING_XFORM_HH__ +#define __SIMPLESTRING_XFORM_HH__ + +#include +#include + +std::string uppercase(std::string const & in); +std::string lowercase(std::string const & in); + +void split_into_lines(std::string const & in, + std::vector & out); + +void split_into_lines(std::string const & in, + std::string const & encoding, + std::vector & out); + +void join_lines(std::vector const & in, + std::string & out, + std::string const & linesep); + +void join_lines(std::vector const & in, + std::string & out); + +void prefix_lines_with(std::string const & prefix, + std::string const & lines, + std::string & out); + +// remove all whitespace +std::string remove_ws(std::string const & s); + +// remove leading and trailing whitespace +std::string trim_ws(std::string const & s); + +// line-ending conversion +void line_end_convert(std::string const & linesep, std::string const & src, std::string & dst); + +#endif ============================================================ --- ChangeLog b7eab83ddef0e1e1aa9ce27433aefe5ef7c1767a +++ ChangeLog 9d48a6925823f964e9a54d74c749a8f36aa0b010 @@ -1,5 +1,14 @@ 2006-05-22 Timothy Brownawell + * transforms.{cc,hh} charset.{cc,hh} simplestring_xform.{cc,hh}: + Split up transforms.cc . Hex, base64, gzip, and ident stay in + transforms. Simple string stuff (no botan or idna) goes in + simplestring_xform. Charset stuff goes in charset. + * Makefile.am unit_tests.{cc,hh}: Adjust for having new files. + * 20 others: Adjust includes. + +2006-05-22 Timothy Brownawell + * work.{cc,hh} constants.{cc,hh}: Move attribute strings to constants. * transforms.cc: Include constants.hh instead of work.hh . * annotate.cc diff_patch.cc crypto_tests.cc: Fix includes. ============================================================ --- Makefile.am 9e4f81a1650f6b7ea78fc91412facc9085d17370 +++ Makefile.am dff9212d58d7f5530cd3474b073ebb03114922f4 @@ -11,7 +11,8 @@ commands.cc commands.hh $(CMD_SOURCES) \ diff_patch.cc diff_patch.hh \ lua.cc lua.hh lua_hooks.cc lua_hooks.hh \ - transforms.cc transforms.hh \ + transforms.cc transforms.hh charset.cc charset.hh \ + simplestring_xform.cc simplestring_xform.hh \ update.cc update.hh \ work.cc work.hh \ cert.cc cert.hh \ ============================================================ --- annotate.cc 714dd862061320e7a6b98231a9fece85fb777f6d +++ annotate.cc 5f284a607f1ef3dd5a7e192a58c396ce135ae9ff @@ -21,6 +21,7 @@ #include "revision.hh" #include "sanity.hh" #include "transforms.hh" +#include "simplestring_xform.hh" #include "vocab.hh" #include "cert.hh" #include "ui.hh" ============================================================ --- app_state.cc 659c5d285188926cc6837777a4e8ba037bb8dc92 +++ app_state.cc 6a8be1d3051e8e84464730d9d4f60cc9abca098d @@ -13,7 +13,7 @@ #include "database.hh" #include "file_io.hh" #include "sanity.hh" -#include "transforms.hh" +#include "charset.hh" #include "work.hh" #include "platform.hh" ============================================================ --- cert.cc 49898212ea84352348d3de005ec0f64aedacec6d +++ cert.cc 90434ca979245b8d440c3daead3d8ad5120e27ee @@ -13,6 +13,7 @@ #include "netio.hh" #include "sanity.hh" #include "transforms.hh" +#include "simplestring_xform.hh" #include "ui.hh" #include "options.hh" #include "revision.hh" ============================================================ --- cmd_db.cc f52e5e7e9d01d7cc025ed7dc398e8b108952a159 +++ cmd_db.cc cef42bd1ec6ec18b7e37bfbd70d1d9decee2d679 @@ -1,7 +1,7 @@ #include "cmd.hh" #include "revision.hh" #include "database_check.hh" -#include "transforms.hh" +#include "charset.hh" #include using std::cin; ============================================================ --- cmd_diff_log.cc a831b231bb020f181f60425bd0bca65624c140c5 +++ cmd_diff_log.cc e4ee11b436926500bb79a452886c3570dad608af @@ -1,7 +1,8 @@ #include "cmd.hh" #include "diff_patch.hh" #include "revision.hh" #include "transforms.hh" +#include "simplestring_xform.hh" #include "restrictions.hh" using std::set; ============================================================ --- cmd_files.cc 180ee596a5ca3008d4ba94215d821e769f81a6b9 +++ cmd_files.cc 6979ad01ab19f55ea0fa0a119cef5c4b7a6c4bdc @@ -1,6 +1,7 @@ #include "cmd.hh" #include "transforms.hh" +#include "simplestring_xform.hh" #include "packet.hh" #include "annotate.hh" #include "diff_patch.hh" ============================================================ --- cmd_key_cert.cc 4791bf13e667065cab77e9e26b33c0ae2b255b7c +++ cmd_key_cert.cc 6b1f09475de549a193b7604f4512b0a4e6a5ff2e @@ -1,8 +1,9 @@ #include "cmd.hh" #include "keys.hh" #include "cert.hh" #include "transforms.hh" +#include "charset.hh" #include "packet.hh" #include ============================================================ --- cmd_list.cc a7051aab9b0e1dcb9d6880c12c41c1df3abf309e +++ cmd_list.cc 5fcd71a963f52bf864f3a8f392c8d9ae328ac953 @@ -1,9 +1,11 @@ #include "cmd.hh" #include "globish.hh" #include "restrictions.hh" #include "revision.hh" #include "transforms.hh" +#include "simplestring_xform.hh" +#include "charset.hh" #include "database.hh" #include "ui.hh" #include "keys.hh" ============================================================ --- commands.cc 54fcd00817ebb45cbb14262b8538da0ff5a19c01 +++ commands.cc a02d61685a3807368a2e1ec1fed79ea5f342ea79 @@ -9,6 +9,8 @@ #include #include "transforms.hh" +#include "simplestring_xform.hh" +#include "charset.hh" #include "inodeprint.hh" #include "cert.hh" ============================================================ --- diff_patch.cc d33f922b61cd32299ba5adad92c746680b392cc8 +++ diff_patch.cc aa0b86873084492d817470968a14537d56168bce @@ -23,6 +23,7 @@ #include "safe_map.hh" #include "sanity.hh" #include "transforms.hh" +#include "simplestring_xform.hh" #include "vocab.hh" #include "revision.hh" #include "constants.hh" ============================================================ --- file_io.cc c697f41e7ea39c7ba830a3662f9750449402f5fd +++ file_io.cc 76f4b114209f9e82c86a7af82cea0f36a75bf447 @@ -17,6 +17,8 @@ #include "lua_hooks.hh" #include "sanity.hh" #include "transforms.hh" +#include "simplestring_xform.hh" +#include "charset.hh" #include "platform.hh" // this file deals with talking to the filesystem, loading and ============================================================ --- keys.cc 4c974aa7fc542e4b9d3361a185f0ee29291af931 +++ keys.cc ab0e49cae3664e3aa8f52c652e0825dea72de281 @@ -18,6 +18,7 @@ #include "platform.hh" #include "safe_map.hh" #include "transforms.hh" +#include "simplestring_xform.hh" #include "sanity.hh" #include "ui.hh" #include "cert.hh" ============================================================ --- monotone.cc cf9b8d637091861448eb03a8c358a8708c27aebb +++ monotone.cc 7ce7990a7e4ebb99f1980ccdeca7a6e1b297ea7b @@ -29,7 +29,7 @@ #include "sanity.hh" #include "cleanup.hh" #include "file_io.hh" -#include "transforms.hh" +#include "charset.hh" #include "ui.hh" #include "mt_version.hh" #include "options.hh" ============================================================ --- packet.cc 0403b0840c9d378669cdebee37b9bd5247e0f4b7 +++ packet.cc 130498edfbbd9c527c89e6e6c5c2d8d6193d3590 @@ -16,6 +16,7 @@ #include "revision.hh" #include "sanity.hh" #include "transforms.hh" +#include "simplestring_xform.hh" #include "keys.hh" #include "cert.hh" ============================================================ --- paths.cc 375da7ea4ea6a8a047824de80c1c5a5f3daef181 +++ paths.cc 5680d2c2726d5894fe21cc4c7dd9f149ba1efb32 @@ -16,7 +16,8 @@ #include "platform.hh" #include "sanity.hh" #include "interner.hh" -#include "transforms.hh" +#include "charset.hh" +#include "simplestring_xform.hh" // some structure to ensure we aren't doing anything broken when resolving // filenames. the idea is to make sure ============================================================ --- revision.cc 30726868211c9c820002c3e281a5728c4a640ac6 +++ revision.cc e41a1f765fa1a8d68288a62f74da3215864ada44 @@ -34,6 +34,7 @@ #include "revision.hh" #include "sanity.hh" #include "transforms.hh" +#include "simplestring_xform.hh" #include "ui.hh" #include "vocab.hh" #include "safe_map.hh" ============================================================ --- roster.cc a57e3ffdc43ec0089cbcf72ad5d8165945cbcdad +++ roster.cc 0aa31818d4a5c43136c8b6fe3ff2e0ba9f3aa751 @@ -19,6 +19,7 @@ #include "revision.hh" #include "vocab.hh" #include "transforms.hh" +#include "simplestring_xform.hh" #include "parallel_iter.hh" #include "restrictions.hh" #include "safe_map.hh" ============================================================ --- sanity.cc 113604698c798bcaea7d7d4fb42417090fe4cec1 +++ sanity.cc d10487eb6479f58640483c72bb8c207903b0ade1 @@ -19,7 +19,7 @@ #include "constants.hh" #include "platform.hh" #include "sanity.hh" -#include "transforms.hh" +#include "simplestring_xform.hh" #include "ui.hh" using namespace std; ============================================================ --- transforms.cc d3970881bad05091164e06c3186db95beebf6533 +++ transforms.cc 079c3479e99dda5711ca148a67c93f756641b0d1 @@ -20,13 +20,11 @@ #include "botan/gzip.h" #include "botan/sha160.h" -#include "idna/idna.h" -#include "idna/stringprep.h" - #include "cleanup.hh" #include "constants.hh" #include "sanity.hh" #include "transforms.hh" +#include "simplestring_xform.hh" #include "vocab.hh" #include "xdelta.hh" @@ -151,40 +149,6 @@ } } -struct -lowerize -{ - char operator()(char const & c) const - { - return ::tolower(static_cast(c)); - } -}; - -string -lowercase(string const & in) -{ - string n(in); - transform(n.begin(), n.end(), n.begin(), lowerize()); - return n; -} - -struct -upperize -{ - char operator()(char const & c) const - { - return ::toupper(static_cast(c)); - } -}; - -string -uppercase(string const & in) -{ - string n(in); - transform(n.begin(), n.end(), n.begin(), upperize()); - return n; -} - template void pack(T const & in, base64< gzip > & out) { @@ -279,139 +243,7 @@ ident = tmp; } -void split_into_lines(std::string const & in, - std::string const & encoding, - std::vector & out) -{ - std::string lc_encoding = lowercase(encoding); - out.clear(); - - // note: this function does not handle ISO-2022-X, Shift-JIS, and - // probably a good deal of other encodings as well. please expand - // the logic here if you can work out an easy way of doing line - // breaking on these encodings. currently it's just designed to - // work with charsets in which 0x0a / 0x0d are *always* \n and \r - // respectively. - // - // as far as I know, this covers the EUC, ISO-8859-X, GB, Big5, KOI, - // ASCII, and UTF-8 families of encodings. - - if (lc_encoding == constants::default_encoding - || lc_encoding.find("ascii") != std::string::npos - || lc_encoding.find("8859") != std::string::npos - || lc_encoding.find("euc") != std::string::npos - || lc_encoding.find("koi") != std::string::npos - || lc_encoding.find("gb") != std::string::npos - || lc_encoding == "utf-8" - || lc_encoding == "utf_8" - || lc_encoding == "utf8") - { - std::string::size_type begin = 0; - std::string::size_type end = in.find_first_of("\r\n", begin); - - while (end != std::string::npos && end >= begin) - { - out.push_back(in.substr(begin, end-begin)); - if (in.at(end) == '\r' - && in.size() > end+1 - && in.at(end+1) == '\n') - begin = end + 2; - else - begin = end + 1; - if (begin >= in.size()) - break; - end = in.find_first_of("\r\n", begin); - } - if (begin < in.size()) - out.push_back(in.substr(begin, in.size() - begin)); - } - else - { - out.push_back(in); - } -} - - -void -split_into_lines(string const & in, - vector & out) -{ - split_into_lines(in, constants::default_encoding, out); -} - -void -join_lines(vector const & in, - string & out, - string const & linesep) -{ - ostringstream oss; - copy(in.begin(), in.end(), ostream_iterator(oss, linesep.c_str())); - out = oss.str(); -} - -void -join_lines(vector const & in, - string & out) -{ - join_lines(in, out, "\n"); -} - -void -prefix_lines_with(string const & prefix, string const & lines, string & out) -{ - std::vector msgs; - split_into_lines(lines, msgs); - - ostringstream oss; - for (std::vector::const_iterator i = msgs.begin(); - i != msgs.end();) - { - oss << prefix << *i; - i++; - if (i != msgs.end()) - oss << endl; - } - - out = oss.str(); -} - string -remove_ws(string const & s) -{ - string tmp; - tmp.reserve(s.size()); - for (string::const_iterator i = s.begin(); - i != s.end(); ++i) - { - switch (*i) - { - case '\n': - case '\r': - case '\t': - case ' ': - break; - default: - tmp += *i; - break; - } - } - return tmp; -} - -string -trim_ws(string const & s) -{ - string tmp = s; - string::size_type pos = tmp.find_last_not_of("\n\r\t "); - if (pos < string::npos) - tmp.erase(++pos); - pos = tmp.find_first_not_of("\n\r\t "); - if (pos < string::npos) - tmp = tmp.substr(pos); - return tmp; -} - -string canonical_base64(string const & s) { return xform @@ -419,433 +251,6 @@ } -// general character code conversion routines - -static string -system_charset() -{ - char const * locale_charset_name = stringprep_locale_charset (); - I(locale_charset_name != NULL); - string sys_charset(locale_charset_name); - return sys_charset; -} - -void -charset_convert(string const & src_charset, - string const & dst_charset, - string const & src, - string & dst) -{ - if (src_charset == dst_charset) - dst = src; - else - { - L(FL("converting %d bytes from %s to %s\n") % src.size() - % src_charset % dst_charset); - char * converted = stringprep_convert(src.c_str(), - dst_charset.c_str(), - src_charset.c_str()); - E(converted != NULL, - F("failed to convert string from %s to %s: '%s'") - % src_charset % dst_charset % src); - dst = string(converted); - free(converted); - } -} - -void -system_to_utf8(external const & ext, utf8 & utf) -{ - string out; - charset_convert(system_charset(), "UTF-8", ext(), out); - I(utf8_validate(out)); - utf = out; -} - -size_t -display_width(utf8 const & utf) -{ - std::string const & u = utf(); - size_t sz = 0; - std::string::const_iterator i = u.begin(); - while (i != u.end()) - { - if (UNLIKELY(static_cast(*i) & static_cast(0x80))) - { - // A UTF-8 escape: consume the full escape. - ++i; - ++sz; - while (i != u.end() - && (static_cast(*i) & static_cast(0x80)) - && (!(static_cast(*i) & static_cast(0x40)))) - ++i; - } - else - { - // An ASCII-like character in the range 0..0x7F. - ++i; - ++sz; - } - } - return sz; -} - -// Lots of gunk to avoid charset conversion as much as possible. Running -// iconv over every element of every path in a 30,000 file manifest takes -// multiple seconds, which then is a minimum bound on pretty much any -// operation we do... -static inline bool -system_charset_is_utf8_impl() -{ - std::string lc_encoding = lowercase(system_charset()); - return (lc_encoding == "utf-8" - || lc_encoding == "utf_8" - || lc_encoding == "utf8"); -} - -static inline bool -system_charset_is_utf8() -{ - static bool it_is = system_charset_is_utf8_impl(); - return it_is; -} - -static inline bool -system_charset_is_ascii_extension_impl() -{ - if (system_charset_is_utf8()) - return true; - std::string lc_encoding = lowercase(system_charset()); - // if your character set is identical to ascii in the lower 7 bits, then add - // it here for a speed boost. - return (lc_encoding.find("ascii") != std::string::npos - || lc_encoding.find("8859") != std::string::npos - || lc_encoding.find("ansi_x3.4") != std::string::npos - // http://www.cs.mcgill.ca/~aelias4/encodings.html -- "EUC (Extended - // Unix Code) is a simple and clean encoding, standard on Unix - // systems.... It is backwards-compatible with ASCII (i.e. valid - // ASCII implies valid EUC)." - || lc_encoding.find("euc") != std::string::npos); -} - -static inline bool -system_charset_is_ascii_extension() -{ - static bool it_is = system_charset_is_ascii_extension_impl(); - return it_is; -} - -inline static bool -is_all_ascii(string const & utf) -{ - // could speed this up by vectorization -- mask against 0x80808080, - // process a whole word at at time... - for (std::string::const_iterator i = utf.begin(); i != utf.end(); ++i) - if (0x80 & *i) - return false; - return true; -} - -// this function must be fast. do not make it slow. -void -utf8_to_system(utf8 const & utf, std::string & ext) -{ - if (system_charset_is_utf8()) - ext = utf(); - else if (system_charset_is_ascii_extension() - && is_all_ascii(utf())) - ext = utf(); - else - charset_convert("UTF-8", system_charset(), utf(), ext); -} - -void -utf8_to_system(utf8 const & utf, external & ext) -{ - string out; - utf8_to_system(utf, out); - ext = out; -} - -// utf8_validate and the helper functions is_valid_unicode_char and -// utf8_consume_continuation_char g_utf8_validate and supporting functions -// from the file gutf8.c of the GLib library. - -static bool -is_valid_unicode_char(u32 c) -{ - return (c < 0x110000 && - ((c & 0xfffff800) != 0xd800) && - (c < 0xfdd0 || c > 0xfdef) && - (c & 0xfffe) != 0xfffe); -} - -static bool -utf8_consume_continuation_char(u8 c, u32 & val) -{ - if ((c & 0xc0) != 0x80) - return false; - val <<= 6; - val |= c & 0x3f; - return true; -} - -bool -utf8_validate(utf8 const & utf) -{ - std::string::size_type left = utf().size(); - u32 min, val; - - for (std::string::const_iterator i = utf().begin(); - i != utf().end(); ++i, --left) - { - u8 c = *i; - if (c < 128) - continue; - if ((c & 0xe0) == 0xc0) - { - if (left < 2) - return false; - if ((c & 0x1e) == 0) - return false; - ++i; --left; c = *i; - if ((c & 0xc0) != 0x80) - return false; - } - else - { - if ((c & 0xf0) == 0xe0) - { - if (left < 3) - return false; - min = 1 << 11; - val = c & 0x0f; - goto two_remaining; - } - else if ((c & 0xf8) == 0xf0) - { - if (left < 4) - return false; - min = 1 << 16; - val = c & 0x07; - } - else - return false; - ++i; --left; c = *i; - if (!utf8_consume_continuation_char(c, val)) - return false; -two_remaining: - ++i; --left; c = *i; - if (!utf8_consume_continuation_char(c, val)) - return false; - ++i; --left; c = *i; - if (!utf8_consume_continuation_char(c, val)) - return false; - if (val < min) - return false; - if (!is_valid_unicode_char(val)) - return false; - } - } - return true; -} - -static string -decode_idna_error(int err) -{ - switch (static_cast(err)) - { - case IDNA_STRINGPREP_ERROR: return "stringprep error"; break; - case IDNA_PUNYCODE_ERROR: return "punycode error"; break; - case IDNA_CONTAINS_NON_LDH: return "non-LDH characters"; break; - case IDNA_CONTAINS_MINUS: return "leading / trailing hyphen-minus character"; break; - case IDNA_INVALID_LENGTH: return "invalid length (output must be between 1 and 63 chars)"; break; - case IDNA_NO_ACE_PREFIX: return "no ace prefix"; break; - case IDNA_ROUNDTRIP_VERIFY_ERROR: return "roundtrip verify error"; break; - case IDNA_CONTAINS_ACE_PREFIX: return "contains ACE prefix (\"xn--\")"; break; - case IDNA_ICONV_ERROR: return "iconv error"; break; - case IDNA_MALLOC_ERROR: return "malloc error"; break; - default: return "unknown error"; break; - } - return "unknown error"; -} - -void -ace_to_utf8(ace const & a, utf8 & utf) -{ - char *out = NULL; - L(FL("converting %d bytes from IDNA ACE to UTF-8\n") % a().size()); - int res = idna_to_unicode_8z8z(a().c_str(), &out, IDNA_USE_STD3_ASCII_RULES); - N(res == IDNA_SUCCESS || res == IDNA_NO_ACE_PREFIX, - F("error converting %d UTF-8 bytes to IDNA ACE: %s") - % a().size() - % decode_idna_error(res)); - utf = string(out); - free(out); -} - -void -utf8_to_ace(utf8 const & utf, ace & a) -{ - char *out = NULL; - L(FL("converting %d bytes from UTF-8 to IDNA ACE\n") % utf().size()); - int res = idna_to_ascii_8z(utf().c_str(), &out, IDNA_USE_STD3_ASCII_RULES); - N(res == IDNA_SUCCESS, - F("error converting %d UTF-8 bytes to IDNA ACE: %s") - % utf().size() - % decode_idna_error(res)); - a = string(out); - free(out); -} - -void -internalize_cert_name(utf8 const & utf, cert_name & c) -{ - ace a; - utf8_to_ace(utf, a); - c = a(); -} - -void -internalize_cert_name(external const & ext, cert_name & c) -{ - utf8 utf; - system_to_utf8(ext(), utf); - internalize_cert_name(utf, c); -} - -void -externalize_cert_name(cert_name const & c, utf8 & utf) -{ - ace_to_utf8(ace(c()), utf); -} - -void -externalize_cert_name(cert_name const & c, external & ext) -{ - utf8 utf; - externalize_cert_name(c, utf); - utf8_to_system(utf, ext); -} - -void -internalize_rsa_keypair_id(utf8 const & utf, rsa_keypair_id & key) -{ - string tmp; - typedef boost::tokenizer > - tokenizer; - boost::char_separator sep("", ".@", boost::keep_empty_tokens); - tokenizer tokens(utf(), sep); - bool in_domain = false; - for(tokenizer::iterator i = tokens.begin(); i != tokens.end(); ++i) - { - if (!in_domain || *i == "." || *i == "@") - tmp += *i; - else - { - ace a; - utf8_to_ace(*i, a); - tmp += a(); - } - if (*i == "@") - in_domain = true; - } - key = tmp; -} - -void -internalize_rsa_keypair_id(external const & ext, rsa_keypair_id & key) -{ - utf8 utf; - system_to_utf8(ext, utf); - internalize_rsa_keypair_id(utf, key); -} - -void -externalize_rsa_keypair_id(rsa_keypair_id const & key, utf8 & utf) -{ - string tmp; - typedef boost::tokenizer > - tokenizer; - boost::char_separator sep("", ".@", boost::keep_empty_tokens); - tokenizer tokens(key(), sep); - bool in_domain = false; - for(tokenizer::iterator i = tokens.begin(); i != tokens.end(); ++i) - { - if (!in_domain || *i == "." || *i == "@") - tmp += *i; - else - { - ace a(*i); - utf8 u; - ace_to_utf8(a, u); - tmp += u(); - } - if (*i == "@") - in_domain = true; - } - utf = tmp; -} - -void -externalize_rsa_keypair_id(rsa_keypair_id const & key, external & ext) -{ - utf8 utf; - externalize_rsa_keypair_id(key, utf); - utf8_to_system(utf, ext); -} - -void -internalize_var_domain(utf8 const & utf, var_domain & d) -{ - ace a; - utf8_to_ace(utf, a); - d = a(); -} - -void -internalize_var_domain(external const & ext, var_domain & d) -{ - utf8 utf; - system_to_utf8(ext(), utf); - internalize_var_domain(utf, d); -} - -void -externalize_var_domain(var_domain const & d, utf8 & utf) -{ - ace_to_utf8(ace(d()), utf); -} - -void -externalize_var_domain(var_domain const & d, external & ext) -{ - utf8 utf; - externalize_var_domain(d, utf); - utf8_to_system(utf, ext); -} - -void -line_end_convert(string const & linesep, string const & src, string & dst) -{ - string linesep_str("\n"); - if (linesep == "CR" || linesep == "\r") - linesep_str = "\r"; - else if (linesep == "CRLF" || linesep == "\r\n") - linesep_str = "\r\n"; - else if (linesep == "LF"|| linesep == "\n") - linesep_str = "\n"; - - L(FL("doing linesep conversion to %s\n") % linesep); - vector tmp; - split_into_lines(src, tmp); - join_lines(tmp, dst, linesep_str); - if (src.size() >= linesep.size() && - (src.compare(src.size() - linesep.size(), linesep.size(), linesep) == 0)) - dst += linesep_str; -} - - #ifdef BUILD_UNIT_TESTS #include "unit_tests.hh" #include @@ -887,467 +292,6 @@ BOOST_CHECK(output() == ident); } -static void -caseconv_test() -{ - BOOST_CHECK(uppercase("hello") == "HELLO"); - BOOST_CHECK(uppercase("heLlO") == "HELLO"); - BOOST_CHECK(lowercase("POODLE DAY") == "poodle day"); - BOOST_CHECK(lowercase("PooDLe DaY") == "poodle day"); - BOOST_CHECK(uppercase("address@hidden&*()") == "address@hidden&*()"); - BOOST_CHECK(lowercase("address@hidden&*()") == "address@hidden&*()"); -} - -static void -join_lines_test() -{ - vector strs; - string joined; - - strs.clear(); - join_lines(strs, joined); - BOOST_CHECK(joined == ""); - - strs.push_back("hi"); - join_lines(strs, joined); - BOOST_CHECK(joined == "hi\n"); - - strs.push_back("there"); - join_lines(strs, joined); - BOOST_CHECK(joined == "hi\nthere\n"); - - strs.push_back("user"); - join_lines(strs, joined); - BOOST_CHECK(joined == "hi\nthere\nuser\n"); -} - -static void -strip_ws_test() -{ - BOOST_CHECK(trim_ws("\n leading space") == "leading space"); - BOOST_CHECK(trim_ws("trailing space \n") == "trailing space"); - BOOST_CHECK(trim_ws("\t\n both \r \n\r\n") == "both"); - BOOST_CHECK(remove_ws(" I like going\tfor walks\n ") - == "Ilikegoingforwalks"); -} - -#define IDNA_ACE_PREFIX "xn--" -#define IDNA_SUCCESS 0 - -struct -idna -{ - char *name; - size_t inlen; - uint32_t in[100]; - char *out; - int allowunassigned; - int usestd3asciirules; - int toasciirc; - int tounicoderc; -} idna_vec[] = - { - { - "Arabic (Egyptian)", 17, - { - 0x0644, 0x064A, 0x0647, 0x0645, 0x0627, 0x0628, 0x062A, 0x0643, - 0x0644, 0x0645, 0x0648, 0x0634, 0x0639, 0x0631, 0x0628, 0x064A, - 0x061F}, - IDNA_ACE_PREFIX "egbpdaj6bu4bxfgehfvwxn", 0, 0, IDNA_SUCCESS, - IDNA_SUCCESS}, - { - "Chinese (simplified)", 9, - { - 0x4ED6, 0x4EEC, 0x4E3A, 0x4EC0, 0x4E48, 0x4E0D, 0x8BF4, 0x4E2D, 0x6587}, - IDNA_ACE_PREFIX "ihqwcrb4cv8a8dqg056pqjye", 0, 0, IDNA_SUCCESS, - IDNA_SUCCESS}, - { - "Chinese (traditional)", 9, - { - 0x4ED6, 0x5011, 0x7232, 0x4EC0, 0x9EBD, 0x4E0D, 0x8AAA, 0x4E2D, 0x6587}, - IDNA_ACE_PREFIX "ihqwctvzc91f659drss3x8bo0yb", 0, 0, IDNA_SUCCESS, - IDNA_SUCCESS}, - { - "Czech", 22, - { - 0x0050, 0x0072, 0x006F, 0x010D, 0x0070, 0x0072, 0x006F, 0x0073, - 0x0074, 0x011B, 0x006E, 0x0065, 0x006D, 0x006C, 0x0075, 0x0076, - 0x00ED, 0x010D, 0x0065, 0x0073, 0x006B, 0x0079}, - IDNA_ACE_PREFIX "Proprostnemluvesky-uyb24dma41a", 0, 0, IDNA_SUCCESS, - IDNA_SUCCESS}, - { - "Hebrew", 22, - { - 0x05DC, 0x05DE, 0x05D4, 0x05D4, 0x05DD, 0x05E4, 0x05E9, 0x05D5, - 0x05D8, 0x05DC, 0x05D0, 0x05DE, 0x05D3, 0x05D1, 0x05E8, 0x05D9, - 0x05DD, 0x05E2, 0x05D1, 0x05E8, 0x05D9, 0x05EA}, - IDNA_ACE_PREFIX "4dbcagdahymbxekheh6e0a7fei0b", 0, 0, IDNA_SUCCESS, - IDNA_SUCCESS}, - { - "Hindi (Devanagari)", 30, - { - 0x092F, 0x0939, 0x0932, 0x094B, 0x0917, 0x0939, 0x093F, 0x0928, - 0x094D, 0x0926, 0x0940, 0x0915, 0x094D, 0x092F, 0x094B, 0x0902, - 0x0928, 0x0939, 0x0940, 0x0902, 0x092C, 0x094B, 0x0932, 0x0938, - 0x0915, 0x0924, 0x0947, 0x0939, 0x0948, 0x0902}, - IDNA_ACE_PREFIX "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd", 0, 0, - IDNA_SUCCESS}, - { - "Japanese (kanji and hiragana)", 18, - { - 0x306A, 0x305C, 0x307F, 0x3093, 0x306A, 0x65E5, 0x672C, 0x8A9E, - 0x3092, 0x8A71, 0x3057, 0x3066, 0x304F, 0x308C, 0x306A, 0x3044, - 0x306E, 0x304B}, - IDNA_ACE_PREFIX "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa", 0, 0, - IDNA_SUCCESS}, - { - "Russian (Cyrillic)", 28, - { - 0x043F, 0x043E, 0x0447, 0x0435, 0x043C, 0x0443, 0x0436, 0x0435, - 0x043E, 0x043D, 0x0438, 0x043D, 0x0435, 0x0433, 0x043E, 0x0432, - 0x043E, 0x0440, 0x044F, 0x0442, 0x043F, 0x043E, 0x0440, 0x0443, - 0x0441, 0x0441, 0x043A, 0x0438}, - IDNA_ACE_PREFIX "b1abfaaepdrnnbgefbadotcwatmq2g4l", 0, 0, - IDNA_SUCCESS, IDNA_SUCCESS}, - { - "Spanish", 40, - { - 0x0050, 0x006F, 0x0072, 0x0071, 0x0075, 0x00E9, 0x006E, 0x006F, - 0x0070, 0x0075, 0x0065, 0x0064, 0x0065, 0x006E, 0x0073, 0x0069, - 0x006D, 0x0070, 0x006C, 0x0065, 0x006D, 0x0065, 0x006E, 0x0074, - 0x0065, 0x0068, 0x0061, 0x0062, 0x006C, 0x0061, 0x0072, 0x0065, - 0x006E, 0x0045, 0x0073, 0x0070, 0x0061, 0x00F1, 0x006F, 0x006C}, - IDNA_ACE_PREFIX "PorqunopuedensimplementehablarenEspaol-fmd56a", 0, 0, - IDNA_SUCCESS}, - { - "Vietnamese", 31, - { - 0x0054, 0x1EA1, 0x0069, 0x0073, 0x0061, 0x006F, 0x0068, 0x1ECD, - 0x006B, 0x0068, 0x00F4, 0x006E, 0x0067, 0x0074, 0x0068, 0x1EC3, - 0x0063, 0x0068, 0x1EC9, 0x006E, 0x00F3, 0x0069, 0x0074, 0x0069, - 0x1EBF, 0x006E, 0x0067, 0x0056, 0x0069, 0x1EC7, 0x0074}, - IDNA_ACE_PREFIX "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g", 0, 0, - IDNA_SUCCESS}, - { - "Japanese", 8, - { - 0x0033, 0x5E74, 0x0042, 0x7D44, 0x91D1, 0x516B, 0x5148, 0x751F}, - IDNA_ACE_PREFIX "3B-ww4c5e180e575a65lsy2b", 0, 0, IDNA_SUCCESS, - IDNA_SUCCESS}, - { - "Japanese", 24, - { - 0x5B89, 0x5BA4, 0x5948, 0x7F8E, 0x6075, 0x002D, 0x0077, 0x0069, - 0x0074, 0x0068, 0x002D, 0x0053, 0x0055, 0x0050, 0x0045, 0x0052, - 0x002D, 0x004D, 0x004F, 0x004E, 0x004B, 0x0045, 0x0059, 0x0053}, - IDNA_ACE_PREFIX "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n", 0, 0, - IDNA_SUCCESS}, - { - "Japanese", 25, - { - 0x0048, 0x0065, 0x006C, 0x006C, 0x006F, 0x002D, 0x0041, 0x006E, - 0x006F, 0x0074, 0x0068, 0x0065, 0x0072, 0x002D, 0x0057, 0x0061, - 0x0079, 0x002D, 0x305D, 0x308C, 0x305E, 0x308C, 0x306E, 0x5834, - 0x6240}, - IDNA_ACE_PREFIX "Hello-Another-Way--fc4qua05auwb3674vfr0b", 0, 0, - IDNA_SUCCESS}, - { - "Japanese", 8, - { - 0x3072, 0x3068, 0x3064, 0x5C4B, 0x6839, 0x306E, 0x4E0B, 0x0032}, - IDNA_ACE_PREFIX "2-u9tlzr9756bt3uc0v", 0, 0, IDNA_SUCCESS, - IDNA_SUCCESS}, - { - "Japanese", 13, - { - 0x004D, 0x0061, 0x006A, 0x0069, 0x3067, 0x004B, 0x006F, 0x0069, - 0x3059, 0x308B, 0x0035, 0x79D2, 0x524D}, - IDNA_ACE_PREFIX "MajiKoi5-783gue6qz075azm5e", 0, 0, IDNA_SUCCESS, - IDNA_SUCCESS}, - { - "Japanese", 9, - { - 0x30D1, 0x30D5, 0x30A3, 0x30FC, 0x0064, 0x0065, 0x30EB, 0x30F3, 0x30D0}, - IDNA_ACE_PREFIX "de-jg4avhby1noc0d", 0, 0, IDNA_SUCCESS, IDNA_SUCCESS}, - { - "Japanese", 7, - { - 0x305D, 0x306E, 0x30B9, 0x30D4, 0x30FC, 0x30C9, 0x3067}, - IDNA_ACE_PREFIX "d9juau41awczczp", 0, 0, IDNA_SUCCESS, IDNA_SUCCESS}, - { - "Greek", 8, - {0x03b5, 0x03bb, 0x03bb, 0x03b7, 0x03bd, 0x03b9, 0x03ba, 0x03ac}, - IDNA_ACE_PREFIX "hxargifdar", 0, 0, IDNA_SUCCESS, IDNA_SUCCESS}, - { - "Maltese (Malti)", 10, - {0x0062, 0x006f, 0x006e, 0x0121, 0x0075, 0x0073, 0x0061, 0x0127, - 0x0127, 0x0061}, - IDNA_ACE_PREFIX "bonusaa-5bb1da", 0, 0, IDNA_SUCCESS, IDNA_SUCCESS}, - { - "Russian (Cyrillic)", 28, - {0x043f, 0x043e, 0x0447, 0x0435, 0x043c, 0x0443, 0x0436, 0x0435, - 0x043e, 0x043d, 0x0438, 0x043d, 0x0435, 0x0433, 0x043e, 0x0432, - 0x043e, 0x0440, 0x044f, 0x0442, 0x043f, 0x043e, 0x0440, 0x0443, - 0x0441, 0x0441, 0x043a, 0x0438}, - IDNA_ACE_PREFIX "b1abfaaepdrnnbgefbadotcwatmq2g4l", 0, 0, - IDNA_SUCCESS, IDNA_SUCCESS}, - }; - -static void -check_idna_encoding() -{ - putenv("CHARSET=UTF-8"); - - for (size_t i = 0; i < sizeof(idna_vec) / sizeof(struct idna); ++i) - { - BOOST_CHECKPOINT("IDNA language: " + string(idna_vec[i].name)); - - size_t p, q; - char *uc = stringprep_ucs4_to_utf8(idna_vec[i].in, - idna_vec[i].inlen, - &p, &q); - utf8 utf = string(uc); - utf8 tutf; - free(uc); - - ace a = string(idna_vec[i].out); - ace tace; - utf8_to_ace(utf, tace); - L(boost::format("ACE-encoded %s: '%s'\n") % idna_vec[i].name % tace()); - BOOST_CHECK(lowercase(a()) == lowercase(tace())); - ace_to_utf8(a, tutf); - BOOST_CHECK(lowercase(utf()) == lowercase(tutf())); - } -} - -static void encode_test() -{ - check_idna_encoding(); -} - -static void utf8_validation_test() -{ - // these tests are based on the tests from the file utf8-validate.c of the - // GLib library, and also include sequences from Markus Kuhn's UTF-8 - // example files. - const char* good_strings[] = { - "this is a valid but boring ASCII string", - "\x28\x28\x56\xe2\x8d\xb3\x56\x29\x3d\xe2\x8d\xb3\xe2\x8d\xb4\x56\x29\x2f\x56\xe2\x86\x90\x2c\x56\x20\x20\x20\x20\xe2\x8c\xb7\xe2\x86\x90\xe2\x8d\xb3\xe2\x86\x92\xe2\x8d\xb4\xe2\x88\x86\xe2\x88\x87\xe2\x8a\x83\xe2\x80\xbe\xe2\x8d\x8e\xe2\x8d\x95\xe2\x8c\x88", - "\xe2\x80\x98\x73\x69\x6e\x67\x6c\x65\xe2\x80\x99\x20\x61\x6e\x64\x20\xe2\x80\x9c\x64\x6f\x75\x62\x6c\x65\xe2\x80\x9d\x20\x71\x75\x6f\x74\x65\x73", - "\xe2\x80\xa2\x20\x43\x75\x72\x6c\x79\x20\x61\x70\x6f\x73\x74\x72\x6f\x70\x68\x65\x73\x3a\x20\xe2\x80\x9c\x57\x65\xe2\x80\x99\x76\x65\x20\x62\x65\x65\x6e\x20\x68\x65\x72\x65\xe2\x80\x9d", - "\xe2\x80\x9a\x64\x65\x75\x74\x73\x63\x68\x65\xe2\x80\x98\x20\xe2\x80\x9e\x41\x6e\x66\xc3\xbc\x68\x72\x75\x6e\x67\x73\x7a\x65\x69\x63\x68\x65\x6e\xe2\x80\x9c", - "\xe2\x80\xa0\x2c\x20\xe2\x80\xa1\x2c\x20\xe2\x80\xb0\x2c\x20\xe2\x80\xa2\x2c\x20\x33\xe2\x80\x93\x34\x2c\x20\xe2\x80\x94\x2c\x20\xe2\x88\x92\x35\x2f\x2b\x35\x2c\x20\xe2\x84\xa2\x2c\x20\xe2\x80\xa6", - "\xc2\xa9\xc2\xa9\xc2\xa9", - "\xe2\x89\xa0\xe2\x89\xa0", - "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5", - "\x00", - "\xc2\x80", - "\xe0\xa0\x80", - "\xf0\x90\x80\x80", - "\x7f", - "\xdf\xbf", - "\xed\x9f\xbf", - "\xee\x80\x80", - "\xef\xbf\xbd", - 0 - }; - const char* bad_strings[] = { - "\xf8\x88\x80\x80\x80", - "\xfc\x84\x80\x80\x80\x80", - "\xef\xbf\xbf", - "\xf7\xbf\xbf\xbf", - "\xfb\xbf\xbf\xbf\xbf", - "\xfd\xbf\xbf\xbf\xbf\xbf", - "\xf4\x8f\xbf\xbf", - "\xf4\x90\x80\x80", - "\x80", - "\xbf", - "\x80\xbf", - "\x80\xbf\x80", - "\x80\xbf\x80\xbf", - "\x80\xbf\x80\xbf\x80", - "\x80\xbf\x80\xbf\x80\xbf", - "\x80\xbf\x80\xbf\x80\xbf\x80", - "\x80", - "\x81", - "\x82", - "\x83", - "\x84", - "\x85", - "\x86", - "\x87", - "\x88", - "\x89", - "\x8a", - "\x8b", - "\x8c", - "\x8d", - "\x8e", - "\x8f", - "\x90", - "\x91", - "\x92", - "\x93", - "\x94", - "\x95", - "\x96", - "\x97", - "\x98", - "\x99", - "\x9a", - "\x9b", - "\x9c", - "\x9d", - "\x9e", - "\x9f", - "\xa0", - "\xa1", - "\xa2", - "\xa3", - "\xa4", - "\xa5", - "\xa6", - "\xa7", - "\xa8", - "\xa9", - "\xaa", - "\xab", - "\xac", - "\xad", - "\xae", - "\xaf", - "\xb0", - "\xb1", - "\xb2", - "\xb3", - "\xb4", - "\xb5", - "\xb6", - "\xb7", - "\xb8", - "\xb9", - "\xba", - "\xbb", - "\xbc", - "\xbd", - "\xbe", - "\xbf", - "\xc0\x20", - "\xc1\x20", - "\xc2\x20", - "\xc3\x20", - "\xc4\x20", - "\xc5\x20", - "\xc6\x20", - "\xc7\x20", - "\xc8\x20", - "\xc9\x20", - "\xca\x20", - "\xcb\x20", - "\xcc\x20", - "\xcd\x20", - "\xce\x20", - "\xcf\x20", - "\xd0\x20", - "\xd1\x20", - "\xd2\x20", - "\xd3\x20", - "\xd4\x20", - "\xd5\x20", - "\xd6\x20", - "\xd7\x20", - "\xd8\x20", - "\xd9\x20", - "\xda\x20", - "\xdb\x20", - "\xdc\x20", - "\xdd\x20", - "\xde\x20", - "\xdf\x20", - "\xe0\x20", - "\xe1\x20", - "\xe2\x20", - "\xe3\x20", - "\xe4\x20", - "\xe5\x20", - "\xe6\x20", - "\xe7\x20", - "\xe8\x20", - "\xe9\x20", - "\xea\x20", - "\xeb\x20", - "\xec\x20", - "\xed\x20", - "\xee\x20", - "\xef\x20", - "\xf0\x20", - "\xf1\x20", - "\xf2\x20", - "\xf3\x20", - "\xf4\x20", - "\xf5\x20", - "\xf6\x20", - "\xf7\x20", - "\xf8\x20", - "\xf9\x20", - "\xfa\x20", - "\xfb\x20", - "\xfc\x20", - "\xfd\x20", - "\x20\xc0", - "\x20\xe0\x80", - "\x20\xf0\x80\x80", - "\x20\xf8\x80\x80\x80", - "\x20\xfc\x80\x80\x80\x80", - "\x20\xdf", - "\x20\xef\xbf", - "\x20\xf7\xbf\xbf", - "\x20\xfb\xbf\xbf\xbf", - "\x20\xfd\xbf\xbf\xbf\xbf", - "\x20\xfe\x20", - "\x20\xff\x20", - "\x20\xc0\xaf\x20", - "\x20\xe0\x80\xaf\x20", - "\x20\xf0\x80\x80\xaf\x20", - "\x20\xf8\x80\x80\x80\xaf\x20", - "\x20\xfc\x80\x80\x80\x80\xaf\x20", - "\x20\xc1\xbf\x20", - "\x20\xe0\x9f\xbf\x20", - "\x20\xf0\x8f\xbf\xbf\x20", - "\x20\xf8\x87\xbf\xbf\xbf\x20", - "\x20\xfc\x83\xbf\xbf\xbf\xbf\x20", - "\x20\xc0\x80\x20", - "\x20\xe0\x80\x80\x20", - "\x20\xf0\x80\x80\x80\x20", - "\x20\xf8\x80\x80\x80\x80\x20", - "\x20\xfc\x80\x80\x80\x80\x80\x20", - "\x20\xed\xa0\x80\x20", - "\x20\xed\xad\xbf\x20", - "\x20\xed\xae\x80\x20", - "\x20\xed\xaf\xbf\x20", - "\x20\xed\xb0\x80\x20", - "\x20\xed\xbe\x80\x20", - "\x20\xed\xbf\xbf\x20", - "\x20\xed\xa0\x80\xed\xb0\x80\x20", - "\x20\xed\xa0\x80\xed\xbf\xbf\x20", - "\x20\xed\xad\xbf\xed\xb0\x80\x20", - "\x20\xed\xad\xbf\xed\xbf\xbf\x20", - "\x20\xed\xae\x80\xed\xb0\x80\x20", - "\x20\xed\xae\x80\xed\xbf\xbf\x20", - "\x20\xed\xaf\xbf\xed\xb0\x80\x20", - "\x20\xed\xaf\xbf\xed\xbf\xbf\x20", - "\x20\xef\xbf\xbe\x20", - "\x20\xef\xbf\xbf\x20", - 0 - }; - - for (int i = 0; good_strings[i]; ++i) - BOOST_CHECK(utf8_validate(string(good_strings[i])) == true); - - for (int i = 0; bad_strings[i]; ++i) - BOOST_CHECK(utf8_validate(string(bad_strings[i])) == false); -} - void add_transform_tests(test_suite * suite) { @@ -1355,11 +299,6 @@ suite->add(BOOST_TEST_CASE(&enc_test)); suite->add(BOOST_TEST_CASE(&rdiff_test)); suite->add(BOOST_TEST_CASE(&calculate_ident_test)); - suite->add(BOOST_TEST_CASE(&caseconv_test)); - suite->add(BOOST_TEST_CASE(&join_lines_test)); - suite->add(BOOST_TEST_CASE(&strip_ws_test)); - suite->add(BOOST_TEST_CASE(&encode_test)); - suite->add(BOOST_TEST_CASE(&utf8_validation_test)); } #endif // BUILD_UNIT_TESTS ============================================================ --- transforms.hh 0ab0d07b7a5b8d05c7c64a9770429435d667475e +++ transforms.hh 72398b1a08314a36352c7216fff9f6e2c00f74b3 @@ -51,9 +51,6 @@ // hex encoding -std::string uppercase(std::string const & in); -std::string lowercase(std::string const & in); - std::string encode_hexenc(std::string const & in); std::string decode_hexenc(std::string const & in); @@ -122,63 +119,8 @@ void calculate_ident(revision_data const & dat, revision_id & ident); - - -void split_into_lines(std::string const & in, - std::vector & out); - -void split_into_lines(std::string const & in, - std::string const & encoding, - std::vector & out); - -void join_lines(std::vector const & in, - std::string & out, - std::string const & linesep); - -void join_lines(std::vector const & in, - std::string & out); - -void prefix_lines_with(std::string const & prefix, - std::string const & lines, - std::string & out); - -// remove all whitespace -std::string remove_ws(std::string const & s); - -// remove leading and trailing whitespace -std::string trim_ws(std::string const & s); - // canonicalize base64 encoding std::string canonical_base64(std::string const & s); -// charset conversions -void charset_convert(std::string const & src_charset, std::string const & dst_charset, - std::string const & src, std::string & dst); -void system_to_utf8(external const & system, utf8 & utf); -void utf8_to_system(utf8 const & utf, external & system); -void utf8_to_system(utf8 const & utf, std::string & system); -void ace_to_utf8(ace const & ac, utf8 & utf); -void utf8_to_ace(utf8 const & utf, ace & a); -bool utf8_validate(utf8 const & utf); -// returns length in characters (not bytes) -size_t display_width(utf8 const & utf); - -// specific internal / external conversions for various vocab terms -void internalize_cert_name(utf8 const & utf, cert_name & c); -void internalize_cert_name(external const & ext, cert_name & c); -void externalize_cert_name(cert_name const & c, utf8 & utf); -void externalize_cert_name(cert_name const & c, external & ext); -void internalize_rsa_keypair_id(utf8 const & utf, rsa_keypair_id & key); -void internalize_rsa_keypair_id(external const & ext, rsa_keypair_id & key); -void externalize_rsa_keypair_id(rsa_keypair_id const & key, utf8 & utf); -void externalize_rsa_keypair_id(rsa_keypair_id const & key, external & ext); -void internalize_var_domain(utf8 const & utf, var_domain & d); -void internalize_var_domain(external const & ext, var_domain & d); -void externalize_var_domain(var_domain const & d, utf8 & utf); -void externalize_var_domain(var_domain const & d, external & ext); - -// line-ending conversion -void line_end_convert(std::string const & linesep, std::string const & src, std::string & dst); - #endif // __TRANSFORMS_HH__ ============================================================ --- ui.cc 1d1fcfee81b62f2698d9deb380c49f9c29765a67 +++ ui.cc 180abc6a7a5e1e7c552abc05d8d1806aebcae786 @@ -13,7 +13,8 @@ #include "platform.hh" #include "sanity.hh" #include "ui.hh" -#include "transforms.hh" +#include "charset.hh" +#include "simplestring_xform.hh" #include "constants.hh" #include ============================================================ --- unit_tests.cc b9f978cb102ef892f9c08f757e2150107e1e792d +++ unit_tests.cc 9b3d3fd11212c735e093314f79e5fbd76977ebab @@ -53,6 +53,12 @@ if (t.empty() || t.find("transform") != t.end()) add_transform_tests(suite); + if (t.empty() || t.find("charset") != t.end()) + add_charset_tests(suite); + + if (t.empty() || t.find("simplestring_xform") != t.end()) + add_simplestring_xform_tests(suite); + if (t.empty() || t.find("vocab") != t.end()) add_vocab_tests(suite); ============================================================ --- unit_tests.hh 0b883587fecefa652cf6bcc11c45dd326ad2a181 +++ unit_tests.hh dd69dcf3b16174632d8e4ef123ae6404208fc81e @@ -25,6 +25,8 @@ void add_file_io_tests(test_suite * suite); void add_key_tests(test_suite * suite); void add_transform_tests(test_suite * suite); +void add_charset_tests(test_suite * suite); +void add_simplestring_xform_tests(test_suite * suite); void add_vocab_tests(test_suite * suite); void add_cset_tests(test_suite * suite); void add_revision_tests(test_suite * suite); ============================================================ --- work.cc 7968939058440542a070522a04e9f36e6711f318 +++ work.cc 53ac432f1fc3241c5e2b3141bbbe01fe96ce7401 @@ -18,7 +18,7 @@ #include "restrictions.hh" #include "sanity.hh" #include "safe_map.hh" -#include "transforms.hh" +#include "simplestring_xform.hh" #include "vocab.hh" #include "work.hh" #include "revision.hh"