complete class UTF8.cpp

5df72886 · Dick Hollenbeck · 2f327f06 · 5df72886 · 5df72886 · 5df72886
Commit 5df72886 authored Dec 08, 2013 by Dick Hollenbeck
Show whitespace changes
Inline Side-by-side

Showing with 176 additions and 22 deletions

stroke_font.cpp common/gal/stroke_font.cpp +2 -1

UTF8.cpp tools/UTF8.cpp +170 -19

make-UTF8.sh tools/make-UTF8.sh +4 -2

No files found.
--- a/common/gal/stroke_font.cpp
+++ b/common/gal/stroke_font.cpp
@@ -249,7 +249,8 @@ void STROKE_FONT::drawSingleLineText( const wxString& aText )
        // (textSize.x)
        xOffset = textSize.x;
        glyphSize.x = -m_glyphSize.x;
-    } else
+    }
+    else
    {
        xOffset = 0.0;
    }

--- a/tools/UTF8.cpp
+++ b/tools/UTF8.cpp
@@ -10,6 +10,15 @@
 * is an 8 bit std::string that is assuredly encoded in UTF8, and supplies special
 * conversion support to and from wxString, and has iteration over unicode characters.
 *
+ * <p>I've been careful to supply only conversion facilities and not try
+ * to duplicate wxString() with many member functions.  In the end it is
+ * to be a std::string.  There are multiple ways to create text into a std::string
+ * without the need of member functions, e.g. std::ostringstream.
+ *
+ * <p>Because this class uses no virtuals, it should be possible to cast any
+ * std::string into a UTF8 using this kind of cast: (UTF8 &) without construction
+ * or copying being the effect of the cast.
+ *
 * @author Dick Hollenbeck
 */
 class UTF8 : public std::string
@@ -25,6 +34,9 @@ public:
    {
    }

+    /// For use with _() function on wx 2.8:
+    UTF8( const wchar_t* txt );
+
    explicit UTF8( const std::string& o ) :
        std::string( o )
    {
@@ -54,25 +66,20 @@ public:

    /**
     * Function uni_forward
-     * advances over a UTF8 encoded multibyte character, capturing the unicode
-     * character as it goes, and returning the number of bytes consumed.
+     * advances over a single UTF8 encoded multibyte character, capturing the
+     * unicode character as it goes, and returning the number of bytes consumed.
     *
-     * @param aSequence is the UTF8 byte sequence.
-     * @param aResult is where to put the unicode character.
+     * @param aSequence is the UTF8 byte sequence, must be aligned on start of character.
+     * @param aResult is where to put the unicode character, and may be NULL if no interest.
+     * @return int - the count of bytes consumed.
     */
-    static int uni_forward( unsigned char* aSequence, unsigned* aResult )
-    {
-        // @todo: have this read UTF8 characters into result, not bytes.
-        // What's here now is scaffolding, reading single byte characters only.
-        *aResult = *aSequence;
-        return 1;
-    }
+    static int uni_forward( unsigned char* aSequence, unsigned* aResult = NULL );

    /**
     * class uni_iter
     * is a non-mutable iterator that walks through code points in the UTF8 encoded
     * string.  The normal ++(), ++(int), ->(), and *() operators are all supported and
-     * they return a unsigned holding the unicode character appropriate for respective
+     * they return an unsigned holding the unicode character appropriate for respective
     * operation.
     */
    class uni_iter
@@ -81,10 +88,11 @@ public:

        unsigned char* it;

+        // private constructor.
        uni_iter( const char* start ) :
            it( (unsigned char*) start )
        {
-            assert( sizeof(unsigned) >= 4 );
+            // for the human: assert( sizeof(unsigned) >= 4 );
        }

    public:
@@ -94,10 +102,10 @@ public:
        {
            unsigned    result;

-            // advance, and toss the result
-            it += uni_forward( it, &result );
+            // advance over current, and toss the unicode result
+            it += uni_forward( it );

-            // get the next result, but do not advance:
+            // get the next unicode result, but do not advance:
            uni_forward( it, &result );
            return result;
        }
@@ -173,15 +181,21 @@ wxString wxFunctionTaking_wxString( const wxString& wx )
 int main()
 {
    std::string str = "input";
+
+    UTF8        u0 = L"wide string";
    UTF8        u1 = "initial";
    wxString    wx = wxT( "input2" );

+    printf( "u0:'%s'\n", u0.c_str() );
    printf( "u1:'%s'\n", u1.c_str() );

    u1 = str;

    wxString    wx2 = u1;

+    // force a std::string into a UTF8, then into a wxString, then copy construct:
+    wxString    wx3 = (UTF8&) u1;
+
    UTF8        u2 = wx2;

    u2 += 'X';
@@ -196,7 +210,7 @@ int main()
    printf( "result:'%s'\n", result.c_str() );

    // test the unicode iterator:
-    for( UTF8::uni_iter it = u2.ubegin();  it != u2.uend();  )
+    for( UTF8::uni_iter it = u2.ubegin();  it < u2.uend();  )
    {
        // test post-increment:
        printf( " _%c_", it++ );
@@ -211,8 +225,13 @@ int main()
 }


-// These to go into a library *.cpp, they are not inlined so that code space
-// is saved creating the intermediate objects and referencing wxConvUTF8.
+/*
+
+    These are to go into a library *.cpp.  They are not inlined so that significant
+    code space is saved by encapsulating the creation of intermediate objects
+    and referencing wxConvUTF8.
+
+*/


 UTF8::UTF8( const wxString& o ) :
@@ -232,3 +251,135 @@ UTF8& UTF8::operator=( const wxString& o )
    std::string::operator=( (const char*) o.utf8_str() );
    return *this;
 }
+
+
+static const unsigned char utf8_len[256] = {
+    // Maps a UTF8 lead byte to its sequence length; zero means illegal lead
+    // byte (RFC 3629).  Index with (byte - 0x80): the 00-7F rows are disabled.
+    /*
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00-0F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70-7F
+    */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80-8F
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0-BF
+    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0-C1 + C2-CF
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0-DF
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0-EF
+    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  // F0-F4 + F5-FF
+};
+
+
+#ifndef THROW_IO_ERROR
+ #define THROW_IO_ERROR(x)      // nothing
+#endif
+
+// There is no wxWidgets function that does this, because wchar_t is 16 bits
+// on Windows, where wx wants to encode the output in UTF16.
+
+// Decodes the one UTF8 character starting at aSequence.  Returns the count of
+// bytes consumed, always >= 1 so that iterating callers make progress even
+// when THROW_IO_ERROR() is compiled out; malformed input then yields U+FFFD.
+int UTF8::uni_forward( unsigned char* aSequence, unsigned* aResult )
+{
+    unsigned ch = *aSequence;
+
+    if( ch < 0x80 )     // single byte character, no table lookup needed
+    {
+        if( aResult )
+            *aResult = ch;
+        return 1;
+    }
+
+    unsigned char* s = aSequence;
+    int len = utf8_len[ *s - 0x80  /* top half of table is missing */ ];
+
+    switch( len )
+    {
+    default:
+    case 0:
+        THROW_IO_ERROR( "invalid start byte" );
+        break;
+
+    case 2:
+        if( ( s[1] & 0xc0 ) != 0x80 )
+        {
+            THROW_IO_ERROR( "invalid continuation byte" );
+            len = 0;    // reached only when THROW_IO_ERROR() is a no-op
+            break;
+        }
+        ch = ((s[0] & 0x1f) << 6) + ((s[1] & 0x3f) << 0);
+        assert( ch > 0x007F && ch <= 0x07FF );
+        break;
+
+    case 3:
+        if( (s[1] & 0xc0) != 0x80 ||
+            (s[2] & 0xc0) != 0x80 ||
+            (s[0] == 0xE0 && s[1] < 0xA0)
+            // || (s[0] == 0xED && s[1] > 0x9F)
+        )
+        {
+            THROW_IO_ERROR( "invalid continuation byte" );
+            len = 0;
+            break;
+        }
+        ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + ((s[2] & 0x3f) << 0);
+        assert( ch > 0x07FF && ch <= 0xFFFF );
+        break;
+
+    case 4:
+        if( (s[1] & 0xc0) != 0x80 ||
+            (s[2] & 0xc0) != 0x80 ||
+            (s[3] & 0xc0) != 0x80 ||
+            (s[0] == 0xF0 && s[1] < 0x90) ||
+            (s[0] == 0xF4 && s[1] > 0x8F) )
+        {
+            THROW_IO_ERROR( "invalid continuation byte" );
+            len = 0;
+            break;
+        }
+        ch = ((s[0] & 0x7)  << 18) + ((s[1] & 0x3f) << 12) +
+             ((s[2] & 0x3f) << 6 ) + ((s[3] & 0x3f) << 0 );
+        assert( ch > 0xFFFF && ch <= 0x10ffff );
+        break;
+    }
+
+    if( len == 0 )      // malformed: skip one byte and report U+FFFD
+    {
+        ch  = 0xFFFD;
+        len = 1;
+    }
+
+    if( aResult )
+        *aResult = ch;
+    return len;
+}
+
+
+UTF8::UTF8( const wchar_t* txt ) :
+    // size initial string safely large enough, then shrink to known size later.
+    std::string( wcslen( txt ) * 4, 0 )
+{
+    /*
+        "this" string was sized to hold the worst case UTF8 encoded byte
+        sequence, and was initialized with all nul bytes. Overwrite some of
+        those nuls, then resize, shrinking down to actual size.
+
+        Use the wx 2.8 function, not new FromWChar(). It knows about wchar_t
+        possibly being 16 bits wide on Windows and holding UTF16 input.
+
+        WC2MB() reports failure as (size_t) -1.  Feeding that to resize()
+        would request an absurd length, so yield an empty string instead.
+    */
+    size_t sz = wxConvUTF8.WC2MB( (char*) data(), txt, size() );
+
+    resize( sz != (size_t) -1 ? sz : 0 );
+}
+
--- a/tools/make-UTF8.sh
+++ b/tools/make-UTF8.sh
+
+
 WXCONFIG=wx-config
-INCLUDE=/usr/include/wx-2.8
+#WXCONFIG=/opt/wx2.9/bin/wx-config

-g++ -I $INCLUDE $($WXCONFIG --cppflags) UTF8.cpp -o test  $($WXCONFIG --libs)
+g++ -g $($WXCONFIG --cppflags) UTF8.cpp -o test  $($WXCONFIG --libs)