Commit 5df72886 authored by Dick Hollenbeck's avatar Dick Hollenbeck

complete class UTF8.cpp

parent 2f327f06
......@@ -249,7 +249,8 @@ void STROKE_FONT::drawSingleLineText( const wxString& aText )
// (textSize.x)
xOffset = textSize.x;
glyphSize.x = -m_glyphSize.x;
} else
}
else
{
xOffset = 0.0;
}
......
......@@ -10,6 +10,15 @@
* is an 8 bit std::string that is assuredly encoded in UTF8, and supplies special
* conversion support to and from wxString, and has iteration over unicode characters.
*
* <p>I've been careful to supply only conversion facillities and not try
* and duplicate wxString() with many member functions. In the end it is
* to be a std::string. There are multiple ways to create text into a std::string
* without the need of member functions. std::ostringstream.
*
* <p>Because this class used no virtuals, it should be possible to cast any
* std::string into a UTF8 using this kind of cast: (UTF8 &) without construction
* or copying being the effect of the cast.
*
* @author Dick Hollenbeck
*/
class UTF8 : public std::string
......@@ -25,6 +34,9 @@ public:
{
}
/// For use with _() function on wx 2.8:
UTF8( const wchar_t* txt );
explicit UTF8( const std::string& o ) :
std::string( o )
{
......@@ -54,25 +66,20 @@ public:
/**
* Function uni_forward
* advances over a UTF8 encoded multibyte character, capturing the unicode
* character as it goes, and returning the number of bytes consumed.
* advances over a single UTF8 encoded multibyte character, capturing the
* unicode character as it goes, and returning the number of bytes consumed.
*
* @param aSequence is the UTF8 byte sequence.
* @param aResult is where to put the unicode character.
* @param aSequence is the UTF8 byte sequence, must be aligned on start of character.
* @param aResult is where to put the unicode character, and may be NULL if no interest.
* @return int - the count of bytes consumed.
*/
static int uni_forward( unsigned char* aSequence, unsigned* aResult )
{
// @todo: have this read UTF8 characters into result, not bytes.
// What's here now is scaffolding, reading single byte characters only.
*aResult = *aSequence;
return 1;
}
static int uni_forward( unsigned char* aSequence, unsigned* aResult = NULL );
/**
* class uni_iter
* is a non-mutable iterator that walks through code points in the UTF8 encoded
* string. The normal ++(), ++(int), ->(), and *() operators are all supported and
* they return a unsigned holding the unicode character appropriate for respective
* they return an unsigned holding the unicode character appropriate for respective
* operation.
*/
class uni_iter
......@@ -81,10 +88,11 @@ public:
unsigned char* it;
// private constructor.
uni_iter( const char* start ) :
it( (unsigned char*) start )
{
assert( sizeof(unsigned) >= 4 );
// for the human: assert( sizeof(unsigned) >= 4 );
}
public:
......@@ -94,10 +102,10 @@ public:
{
unsigned result;
// advance, and toss the result
it += uni_forward( it, &result );
// advance over current, and toss the unicode result
it += uni_forward( it );
// get the next result, but do not advance:
// get the next unicode result, but do not advance:
uni_forward( it, &result );
return result;
}
......@@ -173,15 +181,21 @@ wxString wxFunctionTaking_wxString( const wxString& wx )
int main()
{
std::string str = "input";
UTF8 u0 = L"wide string";
UTF8 u1 = "initial";
wxString wx = wxT( "input2" );
printf( "u0:'%s'\n", u0.c_str() );
printf( "u1:'%s'\n", u1.c_str() );
u1 = str;
wxString wx2 = u1;
// force a std::string into a UTF8, then into a wxString, then copy construct:
wxString wx3 = (UTF8&) u1;
UTF8 u2 = wx2;
u2 += 'X';
......@@ -196,7 +210,7 @@ int main()
printf( "result:'%s'\n", result.c_str() );
// test the unicode iterator:
for( UTF8::uni_iter it = u2.ubegin(); it != u2.uend(); )
for( UTF8::uni_iter it = u2.ubegin(); it < u2.uend(); )
{
// test post-increment:
printf( " _%c_", it++ );
......@@ -211,8 +225,13 @@ int main()
}
// These to go into a library *.cpp, they are not inlined so that code space
// is saved creating the intermediate objects and referencing wxConvUTF8.
/*
These to go into a library *.cpp, they are not inlined so that significant
code space is saved by encapsulating the creation of intermediate objects
and referencing wxConvUTF8.
*/
UTF8::UTF8( const wxString& o ) :
......@@ -232,3 +251,135 @@ UTF8& UTF8::operator=( const wxString& o )
std::string::operator=( (const char*) o.utf8_str() );
return *this;
}
static const unsigned char utf8_len[256] = {
// Map encoded prefix byte to sequence length. Zero means
// illegal prefix. See RFC 3629 for details
/*
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00-0F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70-7F
*/
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80-8F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0-BF
0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0-C1 + C2-CF
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0-DF
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0-EF
4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F0-F4 + F5-FF
};
#ifndef THROW_IO_ERROR
#define THROW_IO_ERROR(x) // nothing
#endif
// There is no wxWidgets function that does this, because wchar_t is 16 bits
// on windows and wx wants to encode the output in UTF16 for such.
int UTF8::uni_forward( unsigned char* aSequence, unsigned* aResult )
{
unsigned ch = *aSequence;
if( ch < 0x80 )
{
if( aResult )
*aResult = ch;
return 1;
}
unsigned char* s = aSequence;
int len = utf8_len[ *s - 0x80 /* top half of table is missing */ ];
switch( len )
{
default:
case 0:
THROW_IO_ERROR( "invalid start byte" );
break;
case 2:
if( ( s[1] & 0xc0 ) != 0x80 )
{
THROW_IO_ERROR( "invalid continuation byte" );
}
ch = ((s[0] & 0x1f) << 6) +
((s[1] & 0x3f) << 0);
assert( ch > 0x007F && ch <= 0x07FF );
break;
case 3:
if( (s[1] & 0xc0) != 0x80 ||
(s[2] & 0xc0) != 0x80 ||
(s[0] == 0xE0 && s[1] < 0xA0)
// || (s[0] == 0xED && s[1] > 0x9F)
)
{
THROW_IO_ERROR( "invalid continuation byte" );
}
ch = ((s[0] & 0x0f) << 12) +
((s[1] & 0x3f) << 6 ) +
((s[2] & 0x3f) << 0 );
assert( ch > 0x07FF && ch <= 0xFFFF );
break;
case 4:
if( (s[1] & 0xc0) != 0x80 ||
(s[2] & 0xc0) != 0x80 ||
(s[3] & 0xc0) != 0x80 ||
(s[0] == 0xF0 && s[1] < 0x90) ||
(s[0] == 0xF4 && s[1] > 0x8F) )
{
THROW_IO_ERROR( "invalid continuation byte" );
}
ch = ((s[0] & 0x7) << 18) +
((s[1] & 0x3f) << 12) +
((s[2] & 0x3f) << 6 ) +
((s[3] & 0x3f) << 0 );
assert( ch > 0xFFFF && ch <= 0x10ffff );
break;
}
if( aResult )
{
*aResult = ch;
}
return len;
}
UTF8::UTF8( const wchar_t* txt ) :
// size initial string safely large enough, then shrink to known size later.
std::string( wcslen( txt ) * 4, 0 )
{
/*
"this" string was sized to hold the worst case UTF8 encoded byte
sequence, and was initialized with all nul bytes. Overwrite some of
those nuls, then resize, shrinking down to actual size.
Use the wx 2.8 function, not new FromWChar(). It knows about wchar_t
possibly being 16 bits wide on Windows and holding UTF16 input.
*/
int sz = wxConvUTF8.WC2MB( (char*) data(), txt, size() );
resize( sz );
}
WXCONFIG=wx-config
INCLUDE=/usr/include/wx-2.8
#WXCONFIG=/opt/wx2.9/bin/wx-config
g++ -I $INCLUDE $($WXCONFIG --cppflags) UTF8.cpp -o test $($WXCONFIG --libs)
g++ -g $($WXCONFIG --cppflags) UTF8.cpp -o test $($WXCONFIG --libs)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment