[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Pingus-CVS] r3976 - trunk/pingus/src
From: grumbel at BerliOS
Subject: [Pingus-CVS] r3976 - trunk/pingus/src
Date: Thu, 26 Feb 2009 15:54:37 +0100
Author: grumbel
Date: 2009-02-26 15:54:37 +0100 (Thu, 26 Feb 2009)
New Revision: 3976
Modified:
trunk/pingus/src/string_format.cpp
trunk/pingus/src/utf8_iterator.cpp
trunk/pingus/src/utf8_iterator.hpp
Log:
Some more cleanup of the UTF-8 stuff
Modified: trunk/pingus/src/string_format.cpp
===================================================================
--- trunk/pingus/src/string_format.cpp 2009-02-26 14:07:49 UTC (rev 3975)
+++ trunk/pingus/src/string_format.cpp 2009-02-26 14:54:37 UTC (rev 3976)
@@ -68,15 +68,17 @@
{
std::string text = StringFormat::normalize(text_);
- std::string::const_iterator beg = text.begin();
+ UTF8::iterator beg(text);
int line_width = 0;
std::ostringstream out;
- for(std::string::const_iterator it = beg; it != text.end(); it = UTF8::advance(it))
+
+ for(UTF8::iterator it(text); !it.done(); ++it)
{
- std::string word(beg, UTF8::advance(it));
+ std::string word = UTF8::substr(beg, it+1);
int word_width = font.get_width(word);
-
- if (UTF8::is_linebreak_character(UTF8::decode_utf8(std::string(it, const_cast<const std::string&>(text).end()))))
+
+ // This is pretty ugly and not fast
+ if (UTF8::is_linebreak_character(*it))
{
if ((line_width + word_width) > width)
{
@@ -89,12 +91,11 @@
line_width += word_width;
}
- beg = UTF8::advance(it);
+ beg = it+1;
}
}
- std::string::const_iterator end = text.end();
- out << std::string(beg, end);
+ out << UTF8::substr(beg, UTF8::iterator(text, text.end()));
return out.str();
}
Modified: trunk/pingus/src/utf8_iterator.cpp
===================================================================
--- trunk/pingus/src/utf8_iterator.cpp 2009-02-26 14:07:49 UTC (rev 3975)
+++ trunk/pingus/src/utf8_iterator.cpp 2009-02-26 14:54:37 UTC (rev 3976)
@@ -18,6 +18,9 @@
#include <iostream>
#include <stdexcept>
#include "utf8_iterator.hpp"
+
+/** Replacement character for invalid UTF-8 sequences */
+static const uint32_t INVALID_UTF8_SEQUENCE = 0xFFFD;
bool
UTF8::is_linebreak_character(uint32_t unicode)
@@ -35,7 +38,8 @@
std::string::size_type
UTF8::length(const std::string& str)
{
- // FIXME: Doesn't check if UTF8 sequence is valid
+ // Not checking for valid UTF-8 sequences should be ok, since
+ // incorrect ones are a character too.
std::string::size_type len = 0;
for(std::string::const_iterator i = str.begin(); i != str.end(); ++i)
{
@@ -50,6 +54,13 @@
}
std::string
+UTF8::substr(const iterator& first, const iterator& last)
+{
+ return first.get_string().substr(first.get_index(),
+ last.get_index() - first.get_index());
+}
+
+std::string
UTF8::substr(const std::string& text, std::string::size_type pos,
std::string::size_type n)
{
std::string::const_iterator beg_it = UTF8::advance(text.begin(), pos);
@@ -127,6 +138,7 @@
{
// 0xxx.xxxx: 1 byte sequence
p+=1;
+
return c1;
}
else if ((c1 & 0340) == 0300)
@@ -136,6 +148,7 @@
uint32_t c2 = (unsigned char) text[p+1];
if (!has_multibyte_mark(c2)) throw std::runtime_error("Malformed utf-8 sequence");
p+=2;
+
return (c1 & 0037) << 6 | (c2 & 0077);
}
else if ((c1 & 0360) == 0340)
@@ -147,6 +160,7 @@
if (!has_multibyte_mark(c2)) throw std::runtime_error("Malformed utf-8 sequence");
if (!has_multibyte_mark(c3)) throw std::runtime_error("Malformed utf-8 sequence");
p+=3;
+
return (c1 & 0017) << 12 | (c2 & 0077) << 6 | (c3 & 0077);
}
else if ((c1 & 0370) == 0360)
@@ -160,6 +174,7 @@
if (!has_multibyte_mark(c3)) throw std::runtime_error("Malformed utf-8 sequence");
if (!has_multibyte_mark(c4)) throw std::runtime_error("Malformed utf-8 sequence");
p+=4;
+
return (c1 & 0007) << 18 | (c2 & 0077) << 12 | (c3 & 0077) << 6 | (c4 & 0077);
}
else
@@ -170,32 +185,66 @@
// FIXME: Get rid of exceptions in this code
UTF8::iterator::iterator(const std::string& text_)
- : text(text_),
- pos(0)
+ : text(&text_),
+ pos(0),
+ idx(0)
{
- try {
- chr = decode_utf8(text, pos);
- } catch (std::exception) {
- std::cout << "Malformed utf-8 sequence beginning with " << *((uint32_t*)(text.c_str() + pos)) << " found " << std::endl;
- chr = 0;
- }
+ try
+ {
+ chr = decode_utf8(*text, pos);
+ }
+ catch (std::exception)
+ {
+ std::cout << "Malformed utf-8 sequence beginning with " << *((uint32_t*)(text->c_str() + pos)) << " found " << std::endl;
+ chr = INVALID_UTF8_SEQUENCE;
+ }
}
+UTF8::iterator::iterator(const std::string& text_, const std::string::iterator it)
+ : text(&text_),
+ pos(it - text->begin()),
+ idx(pos)
+{
+ try
+ {
+ chr = decode_utf8(*text, pos);
+ }
+ catch (std::exception)
+ {
+ std::cout << "Malformed utf-8 sequence beginning with " << *((uint32_t*)(text->c_str() + pos)) << " found " << std::endl;
+ chr = INVALID_UTF8_SEQUENCE;
+ }
+}
+
bool
UTF8::iterator::done() const
{
- return pos > text.size();
+ return pos > text->size();
}
+UTF8::iterator
+UTF8::iterator::operator+(int n)
+{
+ UTF8::iterator it = *this;
+ for(int i = 0; i < n; ++i)
+ ++it;
+ return it;
+}
+
UTF8::iterator&
-UTF8::iterator::operator++() {
- try {
- chr = decode_utf8(text, pos);
- } catch (std::exception) {
- std::cout << "Malformed utf-8 sequence beginning with " << *((uint32_t*)(text.c_str() + pos)) << " found " << std::endl;
- chr = 0;
- ++pos;
- }
+UTF8::iterator::operator++()
+{
+ try
+ {
+ idx = pos;
+ chr = decode_utf8(*text, pos);
+ }
+ catch (std::exception)
+ {
+ std::cout << "Malformed utf-8 sequence beginning with " << *((uint32_t*)(text->c_str() + pos)) << " found " << std::endl;
+ chr = INVALID_UTF8_SEQUENCE;
+ ++pos;
+ }
return *this;
}
Modified: trunk/pingus/src/utf8_iterator.hpp
===================================================================
--- trunk/pingus/src/utf8_iterator.hpp 2009-02-26 14:07:49 UTC (rev 3975)
+++ trunk/pingus/src/utf8_iterator.hpp 2009-02-26 14:54:37 UTC (rev 3976)
@@ -24,11 +24,42 @@
class UTF8
{
public:
+ class iterator
+ {
+ private:
+ const std::string* text;
+
+ /** Position of the next Unicode character after \a chr */
+ std::string::size_type pos;
+
+ /** Position of \a chr */
+ std::string::size_type idx;
+
+ /** Current Unicode character */
+ uint32_t chr;
+
+ public:
+ /** Create a UTF8 iterator, note that \a text is stored as
+ pointer, thus it must remain valid for the lifetime of the
+ iterator. */
+ iterator(const std::string& text);
+ iterator(const std::string& text, std::string::iterator it);
+
+ bool done() const;
+ iterator& operator++();
+ iterator operator+(int n);
+ uint32_t operator*() const;
+
+ std::string::size_type get_index() const { return idx; }
+ const std::string& get_string() const { return *text; }
+ };
+
/**
* Returns the number of characters in a UTF-8 string
*/
static std::string::size_type length(const std::string& str);
+ static std::string substr(const iterator& first, const iterator& last);
static std::string substr(const std::string& text, std::string::size_type
pos, std::string::size_type n);
static std::string::const_iterator advance(std::string::const_iterator it,
std::string::size_type n = 1);
@@ -52,20 +83,6 @@
static uint32_t decode_utf8(const std::string& text, size_t& p);
static uint32_t decode_utf8(const std::string& text);
-
- class iterator
- {
- private:
- const std::string& text;
- std::string::size_type pos;
- uint32_t chr;
-
- public:
- iterator(const std::string& text_);
- bool done() const;
- iterator& operator++();
- uint32_t operator*() const;
- };
};
#endif
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- [Pingus-CVS] r3976 - trunk/pingus/src,
grumbel at BerliOS <=