// drw_textcodec.cpp
#include "drw_textcodec.h"
#include <sstream>
#include <iomanip>
#include <algorithm>
#include "../drw_base.h"
#include "drw_cptables.h"
#include "drw_cptable932.h"
#include "drw_cptable936.h"
#include "drw_cptable949.h"
#include "drw_cptable950.h"

12 13
/**
 * Create a codec that starts in DRW::AC1021 mode and uses a base
 * DRW_Converter built with a NULL table of length 0.
 */
DRW_TextCodec::DRW_TextCodec()
{
    conv    = new DRW_Converter( NULL, 0 );
    version = DRW::AC1021;
}

18 19 20

/** Destroy the codec and free the converter it allocated. */
DRW_TextCodec::~DRW_TextCodec()
{
    delete conv;
}

24 25 26

void DRW_TextCodec::setVersion( std::string* v )
{
27
    std::string versionStr = *v;
28 29 30

    if( versionStr == "AC1009" || versionStr == "AC1006" )
    {
31 32
        version = DRW::AC1009;
        cp = "ANSI_1252";
33 34 35 36 37
        setCodePage( &cp );
    }
    else if( versionStr == "AC1012" || versionStr == "AC1014"
             || versionStr == "AC1015" || versionStr == "AC1018" )
    {
38
        version = DRW::AC1015;
39 40 41

        if( cp.empty() )    // codepage not set, initialize
        {
42
            cp = "ANSI_1252";
43
            setCodePage( &cp );
44
        }
45 46 47
    }
    else
    {
48 49 50 51 52
        version = DRW::AC1021;
        cp = "ANSI_1252";
    }
}

53 54 55 56

void DRW_TextCodec::setCodePage( std::string* c )
{
    cp = correctCodePage( *c );
57
    delete conv;
58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92

    if( version == DRW::AC1009 || version == DRW::AC1015 )
    {
        if( cp == "ANSI_874" )
            conv = new DRW_ConvTable( DRW_Table874, CPLENGHTCOMMON );
        else if( cp == "ANSI_932" )
            conv = new DRW_Conv932Table( DRW_Table932, DRW_LeadTable932,
                                         DRW_DoubleTable932, CPLENGHT932 );
        else if( cp == "ANSI_936" )
            conv = new DRW_ConvDBCSTable( DRW_Table936, DRW_LeadTable936,
                                          DRW_DoubleTable936, CPLENGHT936 );
        else if( cp == "ANSI_949" )
            conv = new DRW_ConvDBCSTable( DRW_Table949, DRW_LeadTable949,
                                          DRW_DoubleTable949, CPLENGHT949 );
        else if( cp == "ANSI_950" )
            conv = new DRW_ConvDBCSTable( DRW_Table950, DRW_LeadTable950,
                                          DRW_DoubleTable950, CPLENGHT950 );
        else if( cp == "ANSI_1250" )
            conv = new DRW_ConvTable( DRW_Table1250, CPLENGHTCOMMON );
        else if( cp == "ANSI_1251" )
            conv = new DRW_ConvTable( DRW_Table1251, CPLENGHTCOMMON );
        else if( cp == "ANSI_1253" )
            conv = new DRW_ConvTable( DRW_Table1253, CPLENGHTCOMMON );
        else if( cp == "ANSI_1254" )
            conv = new DRW_ConvTable( DRW_Table1254, CPLENGHTCOMMON );
        else if( cp == "ANSI_1255" )
            conv = new DRW_ConvTable( DRW_Table1255, CPLENGHTCOMMON );
        else if( cp == "ANSI_1256" )
            conv = new DRW_ConvTable( DRW_Table1256, CPLENGHTCOMMON );
        else if( cp == "ANSI_1257" )
            conv = new DRW_ConvTable( DRW_Table1257, CPLENGHTCOMMON );
        else if( cp == "ANSI_1258" )
            conv = new DRW_ConvTable( DRW_Table1258, CPLENGHTCOMMON );
        else if( cp == "UTF-8" )    // DXF older than 2007 are write in win codepages
        {
93
            cp = "ANSI_1252";
94 95 96 97 98 99 100 101
            conv = new DRW_Converter( NULL, 0 );
        }
        else
            conv = new DRW_ConvTable( DRW_Table1252, CPLENGHTCOMMON );
    }
    else
    {
        conv = new DRW_Converter( NULL, 0 );
102 103 104
    }
}

105 106 107 108

/**
 * Convert 's' to UTF-8 by forwarding it to the currently installed converter.
 *
 * @param s  text to convert (taken by value).
 * @return whatever the converter's toUtf8() returns.
 */
std::string DRW_TextCodec::toUtf8( std::string s )
{
    std::string* input = &s;

    return conv->toUtf8( input );
}

111 112 113 114

/**
 * Convert 's' from UTF-8 by forwarding it to the currently installed converter.
 *
 * @param s  UTF-8 text to convert (taken by value).
 * @return whatever the converter's fromUtf8() returns.
 */
std::string DRW_TextCodec::fromUtf8( std::string s )
{
    std::string* input = &s;

    return conv->fromUtf8( input );
}

117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136

/**
 * Expand "\U+XXXX" escapes found in 's' and copy everything else verbatim.
 *
 * Bytes >= 0x80 are not translated; they are only used to decide how many
 * bytes to step over (2 for values below 0xE0, 3 below 0xF0, 4 below 0xF8,
 * otherwise 1) so that an escape is only recognised at a character start.
 *
 * @param s  pointer to the input text.
 * @return the text with every recognised escape replaced by encodeText().
 */
std::string DRW_Converter::toUtf8( std::string* s )
{
    const std::string&              text    = *s;
    const std::string::size_type    len     = text.length();
    std::string                     out;
    std::string::size_type          copied  = 0;    // first byte not yet appended to 'out'
    std::string::size_type          pos     = 0;

    while( pos < len )
    {
        const unsigned char     ch      = text[pos];
        std::string::size_type  step    = 1;

        if( ch >= 0xF8 )
        {
            step = 1;
        }
        else if( ch >= 0xF0 )       // 4 bytes
        {
            step = 4;
        }
        else if( ch >= 0xE0 )       // 3 bytes
        {
            step = 3;
        }
        else if( ch >= 0x80 )       // 2 bytes
        {
            step = 2;
        }
        else if( ch == '\\' && pos + 6 < len && text[pos + 1] == 'U' && text[pos + 2] == '+' )
        {
            // flush the pending literal run, then the decoded escape
            out.append( text, copied, pos - copied );
            out += encodeText( text.substr( pos, 7 ) );
            step    = 7;
            copied  = pos + 7;
        }

        pos += step;
    }

    out.append( text, copied, std::string::npos );

    return out;
}


159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177
std::string DRW_ConvTable::fromUtf8( std::string* s )
{
    std::string result;
    bool        notFound;
    int         code;

    int         j = 0;

    for( unsigned int i = 0; i < s->length(); i++ )
    {
        unsigned char c = s->at( i );

        if( c > 0x7F )    // need to decode
        {
            result += s->substr( j, i - j );
            std::string part1 = s->substr( i, 4 );
            int         l;
            code = decodeNum( part1, &l );
            j = i + l;
178 179
            i = j - 1;
            notFound = true;
180 181 182 183 184 185

            for( int k = 0; k<cpLenght; k++ )
            {
                if( table[k] == code )
                {
                    result += CPOFFSET + k; // translate from table
186 187 188 189
                    notFound = false;
                    break;
                }
            }
190 191 192

            if( notFound )
                result += decodeText( code );
193 194
        }
    }
195 196

    result += s->substr( j );
197 198 199 200

    return result;
}

201 202 203 204 205 206 207 208

std::string DRW_ConvTable::toUtf8( std::string* s )
{
    std::string             res;
    std::string::iterator   it;

    for( it = s->begin(); it < s->end(); it++ )
    {
209
        unsigned char c = *it;
210 211 212 213 214 215 216 217 218 219

        if( c < 0x80 )
        {
            // check for \U+ encoded text
            if( c == '\\' )
            {
                if( it + 6 < s->end() && *(it + 1) == 'U' && *(it + 2) == '+' )
                {
                    res += encodeText( std::string( it, it + 7 ) );
                    it  += 6;
220
                }
221 222 223 224 225 226 227
                else
                {
                    res += c;    // no \U+ encoded text write
                }
            }
            else
                res += c;                           // c!='\' ascii char write
228
        }
229 230 231 232 233
        else                                        // end c < 0x80
        {
            res += encodeNum( table[c - 0x80] );    // translate from table
        }
    }                                               // end for
234 235 236 237

    return res;
}

238 239 240

std::string DRW_Converter::encodeText( std::string stmp )
{
241
    int code;
242

243
#if defined(__APPLE__)
244 245 246
    int Succeeded = sscanf( &( stmp.substr( 3, 4 )[0]), "%x", &code );

    if( !Succeeded || Succeeded == EOF )
247
        code = 0;
248

249
#else
250
    std::istringstream sd( stmp.substr( 3, 4 ) );
251 252
    sd >> std::hex >> code;
#endif
253
    return encodeNum( code );
254 255
}

256 257 258

/**
 * Format a code point as a "\U+XXXX" escape.
 *
 * @param c  code point to format.
 * @return "\U+" followed by 'c' in upper-case hexadecimal, zero-padded to at
 *         least four digits.
 */
std::string DRW_Converter::decodeText( int c )
{
    std::string res = "\\U+";

#if defined(__APPLE__)
    // Format into a C buffer and append it as a C string, so only the
    // digits are appended and not the unused remainder of the buffer.
    char buf[16];
    snprintf( buf, sizeof( buf ), "%04X", static_cast<unsigned int>( c ) );
    res += buf;
#else
    std::stringstream ss;
    ss << std::uppercase << std::setfill( '0' ) << std::setw( 4 ) << std::hex << c;
    res += ss.str();
#endif
    return res;
}

275 276 277

/**
 * Encode a code point as a UTF-8 byte sequence.
 *
 * The bytes are assembled in a NUL-terminated buffer and returned as a C
 * string, so c == 0 yields an empty string.
 *
 * @param c  code point to encode.
 * @return 1 to 4 bytes depending on the magnitude of 'c'.
 */
std::string DRW_Converter::encodeNum( int c )
{
    // Zero-filled, so the terminator is already in place after any prefix.
    unsigned char buf[5] = { 0, 0, 0, 0, 0 };

    if( c >= 0x10000 )          // 10000-10FFFF 4 bytes
    {
        buf[0]  = 0xf0 | (c >> 18);
        buf[1]  = 0x80 | ( (c >> 12) & 0x3f );
        buf[2]  = 0x80 | ( (c >> 6) & 0x3f );
        buf[3]  = 0x80 | (c & 0x3f);
    }
    else if( c >= 0x800 )       // 800-FFFF 3 bytes
    {
        buf[0]  = 0xe0 | (c >> 12);
        buf[1]  = 0x80 | ( (c >> 6) & 0x3f );
        buf[2]  = 0x80 | (c & 0x3f);
    }
    else if( c >= 128 )         // 80-07FF 2 bytes
    {
        buf[0]  = 0xC0 | (c >> 6);
        buf[1]  = 0x80 | (c & 0x3f);
    }
    else                        // 0-7F US-ASCII 7 bits
    {
        buf[0] = c;
    }

    return std::string( reinterpret_cast<char*>( buf ) );
}

310

311 312 313
/** 's' is a string with at least 4 bytes lenght
** returned 'b' is byte lenght of encoded char: 2,3 or 4
**/
314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338
int DRW_Converter::decodeNum( std::string s, int* b )
{
    int             code    = 0;
    unsigned char   c       = s.at( 0 );

    if( (c & 0xE0)  == 0xC0 )    // 2 bytes
    {
        code    = ( c & 0x1F) << 6;
        code    = (s.at( 1 ) & 0x3F) | code;
        *b      = 2;
    }
    else if( (c & 0xF0)  == 0xE0 )    // 3 bytes
    {
        code    = ( c & 0x0F) << 12;
        code    = ( (s.at( 1 ) & 0x3F) << 6 ) | code;
        code    = (s.at( 2 ) & 0x3F) | code;
        *b      = 3;
    }
    else if( (c & 0xF8)  == 0xF0 )    // 4 bytes
    {
        code    = ( c & 0x07) << 18;
        code    = ( (s.at( 1 ) & 0x3F) << 12 ) | code;
        code    = ( (s.at( 2 ) & 0x3F) << 6 ) | code;
        code    = (s.at( 3 ) & 0x3F) | code;
        *b      = 4;
339 340 341 342 343 344
    }

    return code;
}


345 346
std::string DRW_ConvDBCSTable::fromUtf8( std::string* s )
{
347
    std::string result;
348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363
    bool        notFound;
    int         code;

    int         j = 0;

    for( unsigned int i = 0; i < s->length(); i++ )
    {
        unsigned char c = s->at( i );

        if( c > 0x7F )    // need to decode
        {
            result += s->substr( j, i - j );
            std::string part1 = s->substr( i, 4 );
            int         l;
            code = decodeNum( part1, &l );
            j = i + l;
364 365
            i = j - 1;
            notFound = true;
366 367 368 369 370 371 372 373 374 375 376 377 378

            for( int k = 0; k<cpLenght; k++ )
            {
                if( doubleTable[k][1] == code )
                {
                    int     data = doubleTable[k][0];
                    char    d[3];
                    d[0] = data >> 8;
                    d[1] = data & 0xFF;
                    d[2] = '\0';
                    result += d;     // translate from table
                    notFound = false;
                    break;
379
                }
380 381 382 383 384
            }

            if( notFound )
                result += decodeText( code );
        }    // direct conversion
385
    }
386 387

    result += s->substr( j );
388 389 390 391

    return result;
}

392 393 394 395 396 397 398 399 400 401 402 403 404

std::string DRW_ConvDBCSTable::toUtf8( std::string* s )
{
    std::string             res;
    std::string::iterator   it;

    for( it = s->begin(); it < s->end(); it++ )
    {
        bool            notFound = true;
        unsigned char   c = *it;

        if( c < 0x80 )
        {
405
            notFound = false;
406 407 408 409 410 411 412 413 414 415 416 417

            // check for \U+ encoded text
            if( c == '\\' )
            {
                if( it + 6 < s->end() && *(it + 1) == 'U' && *(it + 2) == '+' )
                {
                    res += encodeText( std::string( it, it + 7 ) );
                    it  += 6;
                }
                else
                {
                    res += c;    // no \U+ encoded text write
418
                }
419 420 421 422 423 424
            }
            else
                res += c;       // c!='\' ascii char write
        }
        else if( c == 0x80 )    // 1 byte table
        {
425
            notFound = false;
426 427 428 429
            res += encodeNum( 0x20AC ); // euro sign
        }
        else                            // 2 bytes
        {
430
            ++it;
431 432 433 434 435 436 437 438 439
            int code    = (c << 8) | (unsigned char) (*it);
            int sta     = leadTable[c - 0x81];
            int end     = leadTable[c - 0x80];

            for( int k = sta; k<end; k++ )
            {
                if( doubleTable[k][0] == code )
                {
                    res += encodeNum( doubleTable[k][1] );    // translate from table
440 441 442 443 444
                    notFound = false;
                    break;
                }
            }
        }
445 446 447 448 449

        // not found
        if( notFound )
            res += encodeNum( NOTFOUND936 );
    }    // end for
450 451 452 453 454

    return res;
}


455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473
std::string DRW_Conv932Table::fromUtf8( std::string* s )
{
    std::string result;
    bool        notFound;
    int         code;

    int         j = 0;

    for( unsigned int i = 0; i < s->length(); i++ )
    {
        unsigned char c = s->at( i );

        if( c > 0x7F )    // need to decode
        {
            result += s->substr( j, i - j );
            std::string part1 = s->substr( i, 4 );
            int         l;
            code = decodeNum( part1, &l );
            j = i + l;
474 475
            i = j - 1;
            notFound = true;
476

477
            // 1 byte table
478 479 480
            if( code > 0xff60 && code < 0xFFA0 )
            {
                result += code - CPOFFSET932; // translate from table
481 482
                notFound = false;
            }
483 484 485 486 487 488 489 490 491 492

            if( notFound && ( code<0xF8 || (code>0x390 && code<0x542)
                              || (code>0x200F && code<0x9FA1) || code>0xF928 ) )
            {
                for( int k = 0; k<cpLenght; k++ )
                {
                    if( doubleTable[k][1] == code )
                    {
                        int     data = doubleTable[k][0];
                        char    d[3];
493 494
                        d[0] = data >> 8;
                        d[1] = data & 0xFF;
495 496
                        d[2] = '\0';
                        result += d; // translate from table
497 498 499 500 501
                        notFound = false;
                        break;
                    }
                }
            }
502 503 504 505

            if( notFound )
                result += decodeText( code );
        }    // direct conversion
506
    }
507 508

    result += s->substr( j );
509 510 511 512

    return result;
}

513 514 515 516 517 518 519 520 521 522 523 524 525

std::string DRW_Conv932Table::toUtf8( std::string* s )
{
    std::string             res;
    std::string::iterator   it;

    for( it = s->begin(); it < s->end(); it++ )
    {
        bool            notFound = true;
        unsigned char   c = *it;

        if( c < 0x80 )
        {
526
            notFound = false;
527 528 529 530 531 532 533 534

            // check for \U+ encoded text
            if( c == '\\' )
            {
                if( it + 6 < s->end() && *(it + 1) == 'U' && *(it + 2) == '+' )
                {
                    res += encodeText( std::string( it, it + 7 ) );
                    it  += 6;
535
                }
536 537 538 539 540 541 542 543 544 545
                else
                {
                    res += c;    // no \U+ encoded text write
                }
            }
            else
                res += c;               // c!='\' ascii char write
        }
        else if( c > 0xA0 && c < 0xE0 ) // 1 byte table
        {
546
            notFound = false;
547 548 549 550
            res += encodeNum( c + CPOFFSET932 );    // translate from table
        }
        else                                        // 2 bytes
        {
551
            ++it;
552
            int code = (c << 8) | (unsigned char) (*it);
553
            int sta;
554 555 556 557 558 559 560 561 562 563 564
            int end = 0;

            if( c > 0x80 && c < 0xA0 )
            {
                sta = DRW_LeadTable932[c - 0x81];
                end = DRW_LeadTable932[c - 0x80];
            }
            else if( c > 0xDF && c < 0xFD )
            {
                sta = DRW_LeadTable932[c - 0xC1];
                end = DRW_LeadTable932[c - 0xC0];
565
            }
566 567 568 569 570 571 572 573

            if( end > 0 )
            {
                for( int k = sta; k<end; k++ )
                {
                    if( DRW_DoubleTable932[k][0] == code )
                    {
                        res += encodeNum( DRW_DoubleTable932[k][1] );    // translate from table
574 575 576 577 578 579
                        notFound = false;
                        break;
                    }
                }
            }
        }
580 581 582 583 584

        // not found
        if( notFound )
            res += encodeNum( NOTFOUND932 );
    }    // end for
585 586 587 588

    return res;
}

589 590 591 592 593 594 595 596 597 598 599

/**
 * Map a code page name or alias to the canonical name used by setCodePage().
 *
 * The comparison is case-insensitive: the input is upper-cased first, so
 * every alias below must be written in upper case.
 *
 * @param s  code page name as found in the drawing.
 * @return one of the "ANSI_xxx" names or "UTF-8"; "ANSI_1252" when the name
 *         is not recognised.
 */
std::string DRW_TextCodec::correctCodePage( const std::string& s )
{
    // stringstream cause crash in OS/X, bug#3597944
    std::string cp = s;

    // toupper() requires a value representable as unsigned char, so cast
    // each char before converting it.
    for( std::string::size_type i = 0; i < cp.size(); ++i )
        cp[i] = static_cast<char>( toupper( static_cast<unsigned char>( cp[i] ) ) );

    // Latin/Thai
    if( cp=="ANSI_874" || cp=="CP874" || cp=="ISO8859-11" || cp=="TIS-620" )
        return "ANSI_874";

    // Central Europe and Eastern Europe
    if( cp=="ANSI_1250" || cp=="CP1250" || cp=="ISO8859-2" )
        return "ANSI_1250";

    // Cyrillic script
    if( cp=="ANSI_1251" || cp=="CP1251" || cp=="ISO8859-5" || cp=="KOI8-R"
        || cp=="KOI8-U" || cp=="IBM 866" )
        return "ANSI_1251";

    // Western Europe
    if( cp=="ANSI_1252" || cp=="CP1252" || cp=="LATIN1" || cp=="ISO-8859-1"
        || cp=="CP819" || cp=="CSISO" || cp=="IBM819" || cp=="ISO_8859-1" || cp=="APPLE ROMAN"
        || cp=="ISO8859-1" || cp=="ISO8859-15" || cp=="ISO-IR-100" || cp=="L1"
        || cp=="IBM 850" )
        return "ANSI_1252";

    // Greek
    if( cp=="ANSI_1253" || cp=="CP1253" || cp=="ISO8859-7" )
        return "ANSI_1253";

    // Turkish
    if( cp=="ANSI_1254" || cp=="CP1254" || cp=="ISO8859-9" || cp=="ISO8859-3" )
        return "ANSI_1254";

    // Hebrew
    if( cp=="ANSI_1255" || cp=="CP1255" || cp=="ISO8859-8" )
        return "ANSI_1255";

    // Arabic
    if( cp=="ANSI_1256" || cp=="CP1256" || cp=="ISO8859-6" )
        return "ANSI_1256";

    // Baltic
    if( cp=="ANSI_1257" || cp=="CP1257" || cp=="ISO8859-4" || cp=="ISO8859-10"
        || cp=="ISO8859-13" )
        return "ANSI_1257";

    // Vietnamese
    if( cp=="ANSI_1258" || cp=="CP1258" )
        return "ANSI_1258";

    // Japanese
    if( cp=="ANSI_932" || cp=="SHIFT-JIS" || cp=="SHIFT_JIS" || cp=="CSSHIFTJIS"
        || cp=="CSWINDOWS31J" || cp=="MS_KANJI" || cp=="X-MS-CP932" || cp=="X-SJIS"
        || cp=="EUCJP" || cp=="EUC-JP" || cp=="CSEUCPKDFMTJAPANESE" || cp=="X-EUC"
        || cp=="X-EUC-JP" || cp=="JIS7" )
        return "ANSI_932";

    // Chinese PRC GBK (XGB) simplified
    if( cp=="ANSI_936" || cp=="GBK" || cp=="GB2312" || cp=="CHINESE" || cp=="CN-GB"
        || cp=="CSGB2312" || cp=="CSGB231280" || cp=="CSISO58BG231280"
        || cp=="GB_2312-80" || cp=="GB231280" || cp=="GB2312-80"
        || cp=="ISO-IR-58" || cp=="GB18030" )
        return "ANSI_936";

    // Korean
    if( cp=="ANSI_949" || cp=="EUCKR" )
        return "ANSI_949";

    // Chinese Big5 (Taiwan, Hong Kong SAR)
    if( cp=="ANSI_950" || cp=="BIG5" || cp=="CN-BIG5" || cp=="CSBIG5"
        || cp=="X-X-BIG5" || cp=="BIG5-HKSCS" )
        return "ANSI_950";

    // Not handled (celtic ISO8859-14, tamil TSCII, UTF16): these fall through
    // to the default below.

    // NOTE(review): "UTF88-BIT" looks like a typo but is kept as-is for
    // backward compatibility - confirm the intended alias.
    if( cp=="UTF-8" || cp=="UTF8" || cp=="UTF88-BIT" )
        return "UTF-8";

    return "ANSI_1252";
}