[Pingus-CVS] r3976 - trunk/pingus/src

pingus-cvs
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Pingus-CVS] r3976 - trunk/pingus/src

From:	grumbel at BerliOS
Subject:	[Pingus-CVS] r3976 - trunk/pingus/src
Date:	Thu, 26 Feb 2009 15:54:37 +0100
Author: grumbel
Date: 2009-02-26 15:54:37 +0100 (Thu, 26 Feb 2009)
New Revision: 3976

Modified:
   trunk/pingus/src/string_format.cpp
   trunk/pingus/src/utf8_iterator.cpp
   trunk/pingus/src/utf8_iterator.hpp
Log:
Some more cleanup of the UTF-8 stuff

Modified: trunk/pingus/src/string_format.cpp
===================================================================
--- trunk/pingus/src/string_format.cpp  2009-02-26 14:07:49 UTC (rev 3975)
+++ trunk/pingus/src/string_format.cpp  2009-02-26 14:54:37 UTC (rev 3976)
@@ -68,15 +68,17 @@
 {
   std::string text = StringFormat::normalize(text_);
 
-  std::string::const_iterator beg = text.begin();
+  UTF8::iterator beg(text);
   int line_width = 0;
   std::ostringstream out;
-  for(std::string::const_iterator it = beg; it != text.end(); it = UTF8::advance(it))
+  
+  for(UTF8::iterator it(text); !it.done(); ++it)
     {
-      std::string word(beg, UTF8::advance(it));
+      std::string word = UTF8::substr(beg, it+1);
       int word_width = font.get_width(word);
-      
-      if (UTF8::is_linebreak_character(UTF8::decode_utf8(std::string(it, const_cast<const std::string&>(text).end()))))
+
+      // This is pretty ugly and not fast
+      if (UTF8::is_linebreak_character(*it))
         {
           if ((line_width + word_width) > width)
             {
@@ -89,12 +91,11 @@
               line_width += word_width;
             }
           
-          beg = UTF8::advance(it);
+          beg = it+1;
         }
     }
   
-  std::string::const_iterator end = text.end();
-  out << std::string(beg, end);
+  out << UTF8::substr(beg, UTF8::iterator(text, text.end()));
 
   return out.str();
 }

Modified: trunk/pingus/src/utf8_iterator.cpp
===================================================================
--- trunk/pingus/src/utf8_iterator.cpp  2009-02-26 14:07:49 UTC (rev 3975)
+++ trunk/pingus/src/utf8_iterator.cpp  2009-02-26 14:54:37 UTC (rev 3976)
@@ -18,6 +18,9 @@
 #include <iostream>
 #include <stdexcept>
 #include "utf8_iterator.hpp"
+
+/** Replacement character for invalid UTF-8 sequences */
+static const uint32_t INVALID_UTF8_SEQUENCE = 0xFFFD;
 
 bool
 UTF8::is_linebreak_character(uint32_t unicode)
@@ -35,7 +38,8 @@
 std::string::size_type
 UTF8::length(const std::string& str)
 {
-  // FIXME: Doesn't check if UTF8 sequence is valid
+  // Not checking for valid UTF-8 sequences should be ok, since
+  // incorrect ones are a character too.
   std::string::size_type len = 0;  
   for(std::string::const_iterator i = str.begin(); i != str.end(); ++i)
     {
@@ -50,6 +54,13 @@
 }
 
 std::string
+UTF8::substr(const iterator& first, const iterator& last)
+{
+  return first.get_string().substr(first.get_index(), 
+                                   last.get_index() - first.get_index());
+}
+
+std::string
 UTF8::substr(const std::string& text, std::string::size_type pos, std::string::size_type n)
 {
   std::string::const_iterator beg_it = UTF8::advance(text.begin(), pos);
@@ -127,6 +138,7 @@
     {
       // 0xxx.xxxx: 1 byte sequence
       p+=1;
+
       return c1;
     }
   else if ((c1 & 0340) == 0300) 
@@ -136,6 +148,7 @@
       uint32_t c2 = (unsigned char) text[p+1];
       if (!has_multibyte_mark(c2)) throw std::runtime_error("Malformed utf-8 sequence");
       p+=2;
+
       return (c1 & 0037) << 6 | (c2 & 0077);
     }
   else if ((c1 & 0360) == 0340) 
@@ -147,6 +160,7 @@
       if (!has_multibyte_mark(c2)) throw std::runtime_error("Malformed utf-8 sequence");
       if (!has_multibyte_mark(c3)) throw std::runtime_error("Malformed utf-8 sequence");
       p+=3;
+
       return (c1 & 0017) << 12 | (c2 & 0077) << 6 | (c3 & 0077);
     }
   else if ((c1 & 0370) == 0360) 
@@ -160,6 +174,7 @@
       if (!has_multibyte_mark(c3)) throw std::runtime_error("Malformed utf-8 sequence");
       if (!has_multibyte_mark(c4)) throw std::runtime_error("Malformed utf-8 sequence");
       p+=4;
+
       return (c1 & 0007) << 18 | (c2 & 0077) << 12 | (c3 & 0077) << 6 | (c4 & 0077);
     }
   else
@@ -170,32 +185,66 @@
 
 // FIXME: Get rid of exceptions in this code
 UTF8::iterator::iterator(const std::string& text_)
-  : text(text_),
-    pos(0)
+  : text(&text_),
+    pos(0),
+    idx(0)
 {
-  try {
-    chr = decode_utf8(text, pos);
-  } catch (std::exception) {
-    std::cout << "Malformed utf-8 sequence beginning with " << *((uint32_t*)(text.c_str() + pos)) << " found " << std::endl;
-    chr = 0;
-  }
+  try 
+    {
+      chr = decode_utf8(*text, pos);
+    } 
+  catch (std::exception) 
+    {
+      std::cout << "Malformed utf-8 sequence beginning with " << *((uint32_t*)(text->c_str() + pos)) << " found " << std::endl;
+      chr = INVALID_UTF8_SEQUENCE;
+    }
 }
 
+UTF8::iterator::iterator(const std::string& text_, const std::string::iterator it)
+  : text(&text_),
+    pos(it - text->begin()),
+    idx(pos)
+{
+  try 
+    {
+      chr = decode_utf8(*text, pos);
+    } 
+  catch (std::exception) 
+    {
+      std::cout << "Malformed utf-8 sequence beginning with " << *((uint32_t*)(text->c_str() + pos)) << " found " << std::endl;
+      chr = INVALID_UTF8_SEQUENCE;
+    }
+}
+
 bool
 UTF8::iterator::done() const
 {
-  return pos > text.size();
+  return pos > text->size();
 }
 
+UTF8::iterator
+UTF8::iterator::operator+(int n)
+{
+  UTF8::iterator it = *this;
+  for(int i = 0; i < n; ++i)
+    ++it;
+  return it;
+}
+
 UTF8::iterator&
-UTF8::iterator::operator++() {
-  try {
-    chr = decode_utf8(text, pos);
-  } catch (std::exception) {
-    std::cout << "Malformed utf-8 sequence beginning with " << *((uint32_t*)(text.c_str() + pos)) << " found " << std::endl;
-    chr = 0;
-    ++pos;
-  }
+UTF8::iterator::operator++() 
+{
+  try 
+    {
+      idx = pos;
+      chr = decode_utf8(*text, pos);
+    } 
+  catch (std::exception) 
+    {
+      std::cout << "Malformed utf-8 sequence beginning with " << *((uint32_t*)(text->c_str() + pos)) << " found " << std::endl;
+      chr = INVALID_UTF8_SEQUENCE;
+      ++pos;
+    }
 
   return *this;
 }

Modified: trunk/pingus/src/utf8_iterator.hpp
===================================================================
--- trunk/pingus/src/utf8_iterator.hpp  2009-02-26 14:07:49 UTC (rev 3975)
+++ trunk/pingus/src/utf8_iterator.hpp  2009-02-26 14:54:37 UTC (rev 3976)
@@ -24,11 +24,42 @@
 class UTF8
 {
 public:
+  class iterator
+  {
+  private:
+    const std::string* text;
+    
+    /** Position of the next Unicode character after \a chr */
+    std::string::size_type pos;
+    
+    /** Position of \a chr */
+    std::string::size_type idx;
+
+    /** Current Unicode character */
+    uint32_t chr;
+  
+  public:
+    /** Create a UTF8 iterator, note that \a text is stored as
+        pointer, thus it must remain valid for the lifetime of the
+        iterator. */
+    iterator(const std::string& text);
+    iterator(const std::string& text, std::string::iterator it);
+
+    bool done() const;
+    iterator& operator++();
+    iterator operator+(int n);
+    uint32_t operator*() const;
+    
+    std::string::size_type get_index() const { return idx; }
+    const std::string& get_string() const { return *text; }
+  };
+
   /** 
    * Returns the number of characters in a UTF-8 string 
    */
   static std::string::size_type length(const std::string& str);
 
+  static std::string substr(const iterator& first, const iterator& last);
   static std::string substr(const std::string& text, std::string::size_type pos, std::string::size_type n);
   static std::string::const_iterator advance(std::string::const_iterator it, std::string::size_type n = 1);
 
@@ -52,20 +83,6 @@
   static uint32_t decode_utf8(const std::string& text, size_t& p);
 
   static uint32_t decode_utf8(const std::string& text);
-
-  class iterator
-  {
-  private:
-    const std::string&     text;
-    std::string::size_type pos;
-    uint32_t chr;
-  
-  public:
-    iterator(const std::string& text_);
-    bool done() const;
-    iterator& operator++();
-    uint32_t operator*() const;
-  };
 };
 
 #endif
[Prev in Thread]
Current Thread
[Next in Thread]
[Pingus-CVS] r3976 - trunk/pingus/src, grumbel at BerliOS <=
Prev by Date: [Pingus-CVS] r3975 - trunk/pingus/src
Next by Date: [Pingus-CVS] r3977 - trunk/pingus/src
Previous by thread: [Pingus-CVS] r3975 - trunk/pingus/src
Next by thread: [Pingus-CVS] r3977 - trunk/pingus/src
Index(es):
- Date
- Thread