#
#
# patch "ChangeLog"
#  from [541a833618bb71df0c59a2f71b62271edd8c08fd]
#    to [e740b27fde448d4ea8815b882e631795d31ace3a]
#
# patch "paths.cc"
#  from [05c27908836d28a468070bb8e7a808417b8401f1]
#    to [5862b10a890a78c64c21e4c84e10acdab5a9cc70]
#
# patch "transforms.cc"
#  from [1f17068bcfa0609f9962200eb33920168f10e7be]
#    to [d530e6715819b76c4173dc4afc3d3341da60b0a0]
#
# patch "transforms.hh"
#  from [20649bff0b21dec65062baa1de4b11ab1f05d950]
#    to [b7c20f2b92989c820febdb61bb3b55cb6887964a]
#
============================================================
--- ChangeLog 541a833618bb71df0c59a2f71b62271edd8c08fd
+++ ChangeLog e740b27fde448d4ea8815b882e631795d31ace3a
@@ -1,3 +1,11 @@
+2006-03-01  Matthew Gregan
+
+        * transforms.cc, transforms.hh: Add utf8_validate function and
+        some validity unit tests.
+        (system_to_utf8): Check that conversion has resulted in valid
+        UTF-8.
+        * paths.cc (file_path::file_path): Check that path is valid UTF-8.
+
 2006-02-28  Matthew Gregan
 
         * tests/t_mt_ignore.at: Fix test on MinGW/Win32.
============================================================
--- paths.cc 05c27908836d28a468070bb8e7a808417b8401f1
+++ paths.cc 5862b10a890a78c64c21e4c84e10acdab5a9cc70
@@ -172,6 +172,7 @@
 file_path::file_path(file_path::source_type type,
                      std::string const & path)
 {
+  I(utf8_validate(path));
   switch (type)
     {
     case internal:
============================================================
--- transforms.cc 1f17068bcfa0609f9962200eb33920168f10e7be
+++ transforms.cc d530e6715819b76c4173dc4afc3d3341da60b0a0
@@ -486,6 +486,7 @@
 {
   string out;
   charset_convert(system_charset(), "UTF-8", ext(), out);
+  I(utf8_validate(out));
   utf = out;
 }
 
@@ -594,6 +595,111 @@
   ext = out;
 }
 
+// utf8_validate and the helper functions is_valid_unicode_char and
+// utf8_consume_continuation_char are based on g_utf8_validate and
+// supporting functions from the file gutf8.c of the GLib library.
+//
+// Decoded values are held in an unsigned long, which is at least 32
+// bits wide, rather than a wchar_t: wchar_t is only 16 bits wide on
+// Win32, where the values above 0xffff produced by four-byte sequences
+// would be truncated before the overlong and range checks run.
+
+// Returns true if c is at most 0x10ffff, is not a surrogate
+// (0xd800-0xdfff), is not in 0xfdd0-0xfdef, and does not end in 0xfffe
+// or 0xffff.
+static bool
+is_valid_unicode_char(unsigned long c)
+{
+  return (c < 0x110000 &&
+          ((c & 0xfffff800) != 0xd800) &&
+          (c < 0xfdd0 || c > 0xfdef) &&
+          (c & 0xfffe) != 0xfffe);
+}
+
+// Folds the continuation byte c into val.  Returns false and leaves val
+// unchanged if c is not of the form 10xxxxxx; otherwise shifts val left
+// six bits, ors in the low six bits of c, and returns true.
+static bool
+utf8_consume_continuation_char(unsigned char c, unsigned long & val)
+{
+  if ((c & 0xc0) != 0x80)
+    return false;
+  val <<= 6;
+  val |= c & 0x3f;
+  return true;
+}
+
+// Returns true if utf consists only of well-formed UTF-8 sequences of one
+// to four bytes.  Truncated sequences, stray continuation bytes, overlong
+// encodings and values rejected by is_valid_unicode_char all make the
+// result false.
+bool
+utf8_validate(utf8 const & utf)
+{
+  // bytes not yet consumed, including the one i currently points at
+  std::string::size_type left = utf().size();
+  // min: smallest value the current sequence length may legitimately
+  // encode (anything lower is overlong); val: the value decoded so far
+  unsigned long min = 0, val = 0;
+
+  for (std::string::const_iterator i = utf().begin();
+       i != utf().end(); ++i, --left)
+    {
+      unsigned char c = *i;
+      if (c < 128)
+        continue;
+      if ((c & 0xe0) == 0xc0)
+        {
+          // two-byte sequence
+          if (left < 2)
+            return false;
+          // lead bytes 0xc0 and 0xc1 could only encode values below 0x80
+          if ((c & 0x1e) == 0)
+            return false;
+          ++i; --left; c = *i;
+          if ((c & 0xc0) != 0x80)
+            return false;
+        }
+      else
+        {
+          if ((c & 0xf0) == 0xe0)
+            {
+              // three-byte sequence: only two continuation bytes follow
+              if (left < 3)
+                return false;
+              min = 1 << 11;
+              val = c & 0x0f;
+              goto two_remaining;
+            }
+          else if ((c & 0xf8) == 0xf0)
+            {
+              // four-byte sequence
+              if (left < 4)
+                return false;
+              min = 1 << 16;
+              val = c & 0x07;
+            }
+          else
+            return false;
+          ++i; --left; c = *i;
+          if (!utf8_consume_continuation_char(c, val))
+            return false;
+two_remaining:
+          ++i; --left; c = *i;
+          if (!utf8_consume_continuation_char(c, val))
+            return false;
+          ++i; --left; c = *i;
+          if (!utf8_consume_continuation_char(c, val))
+            return false;
+          if (val < min)
+            return false;
+          if (!is_valid_unicode_char(val))
+            return false;
+        }
+    }
+  return true;
+}
+
 static string
 decode_idna_error(int err)
 {
@@ -1069,6 +1175,229 @@
   check_idna_encoding();
 }
 
+static void utf8_validation_test()
+{
+  // these tests are based on the tests from the file utf8-validate.c of the
+  // GLib library, and also include sequences from Markus Kuhn's UTF-8
+  // example files.
+  const char* good_strings[] = {
+    "this is a valid but boring ASCII string",
+    "\x28\x28\x56\xe2\x8d\xb3\x56\x29\x3d\xe2\x8d\xb3\xe2\x8d\xb4\x56\x29\x2f\x56\xe2\x86\x90\x2c\x56\x20\x20\x20\x20\xe2\x8c\xb7\xe2\x86\x90\xe2\x8d\xb3\xe2\x86\x92\xe2\x8d\xb4\xe2\x88\x86\xe2\x88\x87\xe2\x8a\x83\xe2\x80\xbe\xe2\x8d\x8e\xe2\x8d\x95\xe2\x8c\x88",
+    "\xe2\x80\x98\x73\x69\x6e\x67\x6c\x65\xe2\x80\x99\x20\x61\x6e\x64\x20\xe2\x80\x9c\x64\x6f\x75\x62\x6c\x65\xe2\x80\x9d\x20\x71\x75\x6f\x74\x65\x73",
+    "\xe2\x80\xa2\x20\x43\x75\x72\x6c\x79\x20\x61\x70\x6f\x73\x74\x72\x6f\x70\x68\x65\x73\x3a\x20\xe2\x80\x9c\x57\x65\xe2\x80\x99\x76\x65\x20\x62\x65\x65\x6e\x20\x68\x65\x72\x65\xe2\x80\x9d",
+    "\xe2\x80\x9a\x64\x65\x75\x74\x73\x63\x68\x65\xe2\x80\x98\x20\xe2\x80\x9e\x41\x6e\x66\xc3\xbc\x68\x72\x75\x6e\x67\x73\x7a\x65\x69\x63\x68\x65\x6e\xe2\x80\x9c",
+    "\xe2\x80\xa0\x2c\x20\xe2\x80\xa1\x2c\x20\xe2\x80\xb0\x2c\x20\xe2\x80\xa2\x2c\x20\x33\xe2\x80\x93\x34\x2c\x20\xe2\x80\x94\x2c\x20\xe2\x88\x92\x35\x2f\x2b\x35\x2c\x20\xe2\x84\xa2\x2c\x20\xe2\x80\xa6",
+    "\xc2\xa9\xc2\xa9\xc2\xa9",
+    "\xe2\x89\xa0\xe2\x89\xa0",
+    "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5",
+    "\x00", // string(const char*) stops at the NUL, so this checks ""
+    "\xc2\x80",
+    "\xe0\xa0\x80",
+    "\xf0\x90\x80\x80",
+    "\x7f",
+    "\xdf\xbf",
+    "\xed\x9f\xbf",
+    "\xee\x80\x80",
+    "\xef\xbf\xbd",
+    0
+  };
+  const char* bad_strings[] = {
+    "\xf8\x88\x80\x80\x80",
+    "\xfc\x84\x80\x80\x80\x80",
+    "\xef\xbf\xbf",
+    "\xf7\xbf\xbf\xbf",
+    "\xfb\xbf\xbf\xbf\xbf",
+    "\xfd\xbf\xbf\xbf\xbf\xbf",
+    "\xf4\x8f\xbf\xbf",
+    "\xf4\x90\x80\x80",
+    "\x80",
+    "\xbf",
+    "\x80\xbf",
+    "\x80\xbf\x80",
+    "\x80\xbf\x80\xbf",
+    "\x80\xbf\x80\xbf\x80",
+    "\x80\xbf\x80\xbf\x80\xbf",
+    "\x80\xbf\x80\xbf\x80\xbf\x80",
+    "\x80",
+    "\x81",
+    "\x82",
+    "\x83",
+    "\x84",
+    "\x85",
+    "\x86",
+    "\x87",
+    "\x88",
+    "\x89",
+    "\x8a",
+    "\x8b",
+    "\x8c",
+    "\x8d",
+    "\x8e",
+    "\x8f",
+    "\x90",
+    "\x91",
+    "\x92",
+    "\x93",
+    "\x94",
+    "\x95",
+    "\x96",
+    "\x97",
+    "\x98",
+    "\x99",
+    "\x9a",
+    "\x9b",
+    "\x9c",
+    "\x9d",
+    "\x9e",
+    "\x9f",
+    "\xa0",
+    "\xa1",
+    "\xa2",
+    "\xa3",
+    "\xa4",
+    "\xa5",
+    "\xa6",
+    "\xa7",
+    "\xa8",
+    "\xa9",
+    "\xaa",
+    "\xab",
+    "\xac",
+    "\xad",
+    "\xae",
+    "\xaf",
+    "\xb0",
+    "\xb1",
+    "\xb2",
+    "\xb3",
+    "\xb4",
+    "\xb5",
+    "\xb6",
+    "\xb7",
+    "\xb8",
+    "\xb9",
+    "\xba",
+    "\xbb",
+    "\xbc",
+    "\xbd",
+    "\xbe",
+    "\xbf",
+    "\xc0\x20",
+    "\xc1\x20",
+    "\xc2\x20",
+    "\xc3\x20",
+    "\xc4\x20",
+    "\xc5\x20",
+    "\xc6\x20",
+    "\xc7\x20",
+    "\xc8\x20",
+    "\xc9\x20",
+    "\xca\x20",
+    "\xcb\x20",
+    "\xcc\x20",
+    "\xcd\x20",
+    "\xce\x20",
+    "\xcf\x20",
+    "\xd0\x20",
+    "\xd1\x20",
+    "\xd2\x20",
+    "\xd3\x20",
+    "\xd4\x20",
+    "\xd5\x20",
+    "\xd6\x20",
+    "\xd7\x20",
+    "\xd8\x20",
+    "\xd9\x20",
+    "\xda\x20",
+    "\xdb\x20",
+    "\xdc\x20",
+    "\xdd\x20",
+    "\xde\x20",
+    "\xdf\x20",
+    "\xe0\x20",
+    "\xe1\x20",
+    "\xe2\x20",
+    "\xe3\x20",
+    "\xe4\x20",
+    "\xe5\x20",
+    "\xe6\x20",
+    "\xe7\x20",
+    "\xe8\x20",
+    "\xe9\x20",
+    "\xea\x20",
+    "\xeb\x20",
+    "\xec\x20",
+    "\xed\x20",
+    "\xee\x20",
+    "\xef\x20",
+    "\xf0\x20",
+    "\xf1\x20",
+    "\xf2\x20",
+    "\xf3\x20",
+    "\xf4\x20",
+    "\xf5\x20",
+    "\xf6\x20",
+    "\xf7\x20",
+    "\xf8\x20",
+    "\xf9\x20",
+    "\xfa\x20",
+    "\xfb\x20",
+    "\xfc\x20",
+    "\xfd\x20",
+    "\x20\xc0",
+    "\x20\xe0\x80",
+    "\x20\xf0\x80\x80",
+    "\x20\xf8\x80\x80\x80",
+    "\x20\xfc\x80\x80\x80\x80",
+    "\x20\xdf",
+    "\x20\xef\xbf",
+    "\x20\xf7\xbf\xbf",
+    "\x20\xfb\xbf\xbf\xbf",
+    "\x20\xfd\xbf\xbf\xbf\xbf",
+    "\x20\xfe\x20",
+    "\x20\xff\x20",
+    "\x20\xc0\xaf\x20",
+    "\x20\xe0\x80\xaf\x20",
+    "\x20\xf0\x80\x80\xaf\x20",
+    "\x20\xf8\x80\x80\x80\xaf\x20",
+    "\x20\xfc\x80\x80\x80\x80\xaf\x20",
+    "\x20\xc1\xbf\x20",
+    "\x20\xe0\x9f\xbf\x20",
+    "\x20\xf0\x8f\xbf\xbf\x20",
+    "\x20\xf8\x87\xbf\xbf\xbf\x20",
+    "\x20\xfc\x83\xbf\xbf\xbf\xbf\x20",
+    "\x20\xc0\x80\x20",
+    "\x20\xe0\x80\x80\x20",
+    "\x20\xf0\x80\x80\x80\x20",
+    "\x20\xf8\x80\x80\x80\x80\x20",
+    "\x20\xfc\x80\x80\x80\x80\x80\x20",
+    "\x20\xed\xa0\x80\x20",
+    "\x20\xed\xad\xbf\x20",
+    "\x20\xed\xae\x80\x20",
+    "\x20\xed\xaf\xbf\x20",
+    "\x20\xed\xb0\x80\x20",
+    "\x20\xed\xbe\x80\x20",
+    "\x20\xed\xbf\xbf\x20",
+    "\x20\xed\xa0\x80\xed\xb0\x80\x20",
+    "\x20\xed\xa0\x80\xed\xbf\xbf\x20",
+    "\x20\xed\xad\xbf\xed\xb0\x80\x20",
+    "\x20\xed\xad\xbf\xed\xbf\xbf\x20",
+    "\x20\xed\xae\x80\xed\xb0\x80\x20",
+    "\x20\xed\xae\x80\xed\xbf\xbf\x20",
+    "\x20\xed\xaf\xbf\xed\xb0\x80\x20",
+    "\x20\xed\xaf\xbf\xed\xbf\xbf\x20",
+    "\x20\xef\xbf\xbe\x20",
+    "\x20\xef\xbf\xbf\x20",
+    0
+  };
+
+  for (int i = 0; good_strings[i]; ++i)
+    BOOST_CHECK(utf8_validate(string(good_strings[i])) == true);
+
+  for (int i = 0; bad_strings[i]; ++i)
+    BOOST_CHECK(utf8_validate(string(bad_strings[i])) == false);
+}
+
 void
 add_transform_tests(test_suite * suite)
 {
@@ -1080,6 +1409,7 @@
   suite->add(BOOST_TEST_CASE(&join_lines_test));
   suite->add(BOOST_TEST_CASE(&strip_ws_test));
   suite->add(BOOST_TEST_CASE(&encode_test));
+  suite->add(BOOST_TEST_CASE(&utf8_validation_test));
 }
 
 #endif // BUILD_UNIT_TESTS
============================================================
--- transforms.hh 20649bff0b21dec65062baa1de4b11ab1f05d950
+++ transforms.hh b7c20f2b92989c820febdb61bb3b55cb6887964a
@@ -176,6 +176,7 @@
 void utf8_to_system(utf8 const & utf, std::string & system);
 void ace_to_utf8(ace const & ac, utf8 & utf);
 void utf8_to_ace(utf8 const & utf, ace & a);
+bool utf8_validate(utf8 const & utf);
 
 // returns length in characters (not bytes)
 size_t display_width(utf8 const & utf);