|
fltk 1.3.0rc3
About: FLTK (Fast Light Tool Kit) is a cross-platform C++ GUI toolkit for UNIX/Linux (X11), Microsoft Windows, and MacOS X. Release candidate.
SfR Fresh Dox: fltk-1.3.0rc3-source.tar.gz (unofficial and still experimental Doxygen-generated source code documentation)
00001 /* 00002 * "$Id: fl_utf.c 8214 2011-01-07 17:23:02Z AlbrechtS $" 00003 * 00004 * This is the utf.c file from fltk2 adapted for use in my fltk1.1 port 00005 */ 00006 /* Copyright 2006-2010 by Bill Spitzak and others. 00007 * 00008 * This library is free software; you can redistribute it and/or 00009 * modify it under the terms of the GNU Library General Public 00010 * License as published by the Free Software Foundation; either 00011 * version 2 of the License, or (at your option) any later version. 00012 * 00013 * This library is distributed in the hope that it will be useful, 00014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00016 * Library General Public License for more details. 00017 * 00018 * You should have received a copy of the GNU Library General Public 00019 * License along with this library; if not, write to the Free Software 00020 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 00021 * USA. 
00022 * 00023 * Please report all bugs and problems on the following page: 00024 * 00025 * http://www.fltk.org/str.php 00026 */ 00027 00028 /* Modified to obey rfc3629, which limits unicode to 0-0x10ffff */ 00029 00030 #include <FL/fl_utf8.h> 00031 #include <string.h> 00032 #include <stdlib.h> 00033 00039 #if 0 00040 00052 /* FL_EXPORT int fl_unichar_to_utf8(unsigned int uc, char *text); */ 00053 00066 /* FL_EXPORT int fl_utf8_size(unsigned int uc); */ 00067 00069 #endif /* 0 */ 00070 00079 #define ERRORS_TO_ISO8859_1 1 00080 00087 #define ERRORS_TO_CP1252 1 00088 00095 #define STRICT_RFC3629 0 00096 00097 #if ERRORS_TO_CP1252 00098 /* Codes 0x80..0x9f from the Microsoft CP1252 character set, translated 00099 * to Unicode: 00100 */ 00101 static unsigned short cp1252[32] = { 00102 0x20ac, 0x0081, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021, 00103 0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008d, 0x017d, 0x008f, 00104 0x0090, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014, 00105 0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x009d, 0x017e, 0x0178 00106 }; 00107 #endif 00108 00140 unsigned fl_utf8decode(const char* p, const char* end, int* len) 00141 { 00142 unsigned char c = *(unsigned char*)p; 00143 if (c < 0x80) { 00144 if (len) *len = 1; 00145 return c; 00146 #if ERRORS_TO_CP1252 00147 } else if (c < 0xa0) { 00148 if (len) *len = 1; 00149 return cp1252[c-0x80]; 00150 #endif 00151 } else if (c < 0xc2) { 00152 goto FAIL; 00153 } 00154 if ( (end && p+1 >= end) || (p[1]&0xc0) != 0x80) goto FAIL; 00155 if (c < 0xe0) { 00156 if (len) *len = 2; 00157 return 00158 ((p[0] & 0x1f) << 6) + 00159 ((p[1] & 0x3f)); 00160 } else if (c == 0xe0) { 00161 if (((unsigned char*)p)[1] < 0xa0) goto FAIL; 00162 goto UTF8_3; 00163 #if STRICT_RFC3629 00164 } else if (c == 0xed) { 00165 /* RFC 3629 says surrogate chars are illegal. 
*/ 00166 if (((unsigned char*)p)[1] >= 0xa0) goto FAIL; 00167 goto UTF8_3; 00168 } else if (c == 0xef) { 00169 /* 0xfffe and 0xffff are also illegal characters */ 00170 if (((unsigned char*)p)[1]==0xbf && 00171 ((unsigned char*)p)[2]>=0xbe) goto FAIL; 00172 goto UTF8_3; 00173 #endif 00174 } else if (c < 0xf0) { 00175 UTF8_3: 00176 if ( (end && p+2 >= end) || (p[2]&0xc0) != 0x80) goto FAIL; 00177 if (len) *len = 3; 00178 return 00179 ((p[0] & 0x0f) << 12) + 00180 ((p[1] & 0x3f) << 6) + 00181 ((p[2] & 0x3f)); 00182 } else if (c == 0xf0) { 00183 if (((unsigned char*)p)[1] < 0x90) goto FAIL; 00184 goto UTF8_4; 00185 } else if (c < 0xf4) { 00186 UTF8_4: 00187 if ( (end && p+3 >= end) || (p[2]&0xc0) != 0x80 || (p[3]&0xc0) != 0x80) goto FAIL; 00188 if (len) *len = 4; 00189 #if STRICT_RFC3629 00190 /* RFC 3629 says all codes ending in fffe or ffff are illegal: */ 00191 if ((p[1]&0xf)==0xf && 00192 ((unsigned char*)p)[2] == 0xbf && 00193 ((unsigned char*)p)[3] >= 0xbe) goto FAIL; 00194 #endif 00195 return 00196 ((p[0] & 0x07) << 18) + 00197 ((p[1] & 0x3f) << 12) + 00198 ((p[2] & 0x3f) << 6) + 00199 ((p[3] & 0x3f)); 00200 } else if (c == 0xf4) { 00201 if (((unsigned char*)p)[1] > 0x8f) goto FAIL; /* after 0x10ffff */ 00202 goto UTF8_4; 00203 } else { 00204 FAIL: 00205 if (len) *len = 1; 00206 #if ERRORS_TO_ISO8859_1 00207 return c; 00208 #else 00209 return 0xfffd; /* Unicode REPLACEMENT CHARACTER */ 00210 #endif 00211 } 00212 } 00213 00232 const char* fl_utf8fwd(const char* p, const char* start, const char* end) 00233 { 00234 const char* a; 00235 int len; 00236 /* if we are not pointing at a continuation character, we are done: */ 00237 if ((*p&0xc0) != 0x80) return p; 00238 /* search backwards for a 0xc0 starting the character: */ 00239 for (a = p-1; ; --a) { 00240 if (a < start) return p; 00241 if (!(a[0]&0x80)) return p; 00242 if ((a[0]&0x40)) break; 00243 } 00244 fl_utf8decode(a,end,&len); 00245 a += len; 00246 if (a > p) return a; 00247 return p; 00248 } 00249 00263 
const char* fl_utf8back(const char* p, const char* start, const char* end) 00264 { 00265 const char* a; 00266 int len; 00267 /* if we are not pointing at a continuation character, we are done: */ 00268 if ((*p&0xc0) != 0x80) return p; 00269 /* search backwards for a 0xc0 starting the character: */ 00270 for (a = p-1; ; --a) { 00271 if (a < start) return p; 00272 if (!(a[0]&0x80)) return p; 00273 if ((a[0]&0x40)) break; 00274 } 00275 fl_utf8decode(a,end,&len); 00276 if (a+len > p) return a; 00277 return p; 00278 } 00279 00282 int fl_utf8bytes(unsigned ucs) { 00283 if (ucs < 0x000080U) { 00284 return 1; 00285 } else if (ucs < 0x000800U) { 00286 return 2; 00287 } else if (ucs < 0x010000U) { 00288 return 3; 00289 } else if (ucs <= 0x10ffffU) { 00290 return 4; 00291 } else { 00292 return 3; /* length of the illegal character encoding */ 00293 } 00294 } 00295 00312 int fl_utf8encode(unsigned ucs, char* buf) { 00313 if (ucs < 0x000080U) { 00314 buf[0] = ucs; 00315 return 1; 00316 } else if (ucs < 0x000800U) { 00317 buf[0] = 0xc0 | (ucs >> 6); 00318 buf[1] = 0x80 | (ucs & 0x3F); 00319 return 2; 00320 } else if (ucs < 0x010000U) { 00321 buf[0] = 0xe0 | (ucs >> 12); 00322 buf[1] = 0x80 | ((ucs >> 6) & 0x3F); 00323 buf[2] = 0x80 | (ucs & 0x3F); 00324 return 3; 00325 } else if (ucs <= 0x0010ffffU) { 00326 buf[0] = 0xf0 | (ucs >> 18); 00327 buf[1] = 0x80 | ((ucs >> 12) & 0x3F); 00328 buf[2] = 0x80 | ((ucs >> 6) & 0x3F); 00329 buf[3] = 0x80 | (ucs & 0x3F); 00330 return 4; 00331 } else { 00332 /* encode 0xfffd: */ 00333 buf[0] = 0xefU; 00334 buf[1] = 0xbfU; 00335 buf[2] = 0xbdU; 00336 return 3; 00337 } 00338 } 00339 00371 unsigned fl_utf8toUtf16(const char* src, unsigned srclen, 00372 unsigned short* dst, unsigned dstlen) 00373 { 00374 const char* p = src; 00375 const char* e = src+srclen; 00376 unsigned count = 0; 00377 if (dstlen) for (;;) { 00378 if (p >= e) {dst[count] = 0; return count;} 00379 if (!(*p & 0x80)) { /* ascii */ 00380 dst[count] = *p++; 00381 } else { 00382 int 
len; unsigned ucs = fl_utf8decode(p,e,&len); 00383 p += len; 00384 if (ucs < 0x10000) { 00385 dst[count] = ucs; 00386 } else { 00387 /* make a surrogate pair: */ 00388 if (count+2 >= dstlen) {dst[count] = 0; count += 2; break;} 00389 dst[count] = (((ucs-0x10000u)>>10)&0x3ff) | 0xd800; 00390 dst[++count] = (ucs&0x3ff) | 0xdc00; 00391 } 00392 } 00393 if (++count == dstlen) {dst[count-1] = 0; break;} 00394 } 00395 /* we filled dst, measure the rest: */ 00396 while (p < e) { 00397 if (!(*p & 0x80)) p++; 00398 else { 00399 int len; unsigned ucs = fl_utf8decode(p,e,&len); 00400 p += len; 00401 if (ucs >= 0x10000) ++count; 00402 } 00403 ++count; 00404 } 00405 return count; 00406 } 00407 00408 00418 unsigned fl_utf8towc(const char* src, unsigned srclen, 00419 wchar_t* dst, unsigned dstlen) 00420 { 00421 #if defined(WIN32) || defined(__CYGWIN__) 00422 return fl_utf8toUtf16(src, srclen, (unsigned short*)dst, dstlen); 00423 #else 00424 const char* p = src; 00425 const char* e = src+srclen; 00426 unsigned count = 0; 00427 if (dstlen) for (;;) { 00428 if (p >= e) { 00429 dst[count] = 0; 00430 return count; 00431 } 00432 if (!(*p & 0x80)) { /* ascii */ 00433 dst[count] = *p++; 00434 } else { 00435 int len; unsigned ucs = fl_utf8decode(p,e,&len); 00436 p += len; 00437 dst[count] = (wchar_t)ucs; 00438 } 00439 if (++count == dstlen) {dst[count-1] = 0; break;} 00440 } 00441 /* we filled dst, measure the rest: */ 00442 while (p < e) { 00443 if (!(*p & 0x80)) p++; 00444 else { 00445 int len; fl_utf8decode(p,e,&len); 00446 p += len; 00447 } 00448 ++count; 00449 } 00450 return count; 00451 #endif 00452 } 00453 00474 unsigned fl_utf8toa(const char* src, unsigned srclen, 00475 char* dst, unsigned dstlen) 00476 { 00477 const char* p = src; 00478 const char* e = src+srclen; 00479 unsigned count = 0; 00480 if (dstlen) for (;;) { 00481 unsigned char c; 00482 if (p >= e) {dst[count] = 0; return count;} 00483 c = *(unsigned char*)p; 00484 if (c < 0xC2) { /* ascii or bad code */ 00485 dst[count] 
= c; 00486 p++; 00487 } else { 00488 int len; unsigned ucs = fl_utf8decode(p,e,&len); 00489 p += len; 00490 if (ucs < 0x100) dst[count] = ucs; 00491 else dst[count] = '?'; 00492 } 00493 if (++count >= dstlen) {dst[count-1] = 0; break;} 00494 } 00495 /* we filled dst, measure the rest: */ 00496 while (p < e) { 00497 if (!(*p & 0x80)) p++; 00498 else { 00499 int len; 00500 fl_utf8decode(p,e,&len); 00501 p += len; 00502 } 00503 ++count; 00504 } 00505 return count; 00506 } 00507 00535 unsigned fl_utf8fromwc(char* dst, unsigned dstlen, 00536 const wchar_t* src, unsigned srclen) { 00537 unsigned i = 0; 00538 unsigned count = 0; 00539 if (dstlen) for (;;) { 00540 unsigned ucs; 00541 if (i >= srclen) {dst[count] = 0; return count;} 00542 ucs = src[i++]; 00543 if (ucs < 0x80U) { 00544 dst[count++] = ucs; 00545 if (count >= dstlen) {dst[count-1] = 0; break;} 00546 } else if (ucs < 0x800U) { /* 2 bytes */ 00547 if (count+2 >= dstlen) {dst[count] = 0; count += 2; break;} 00548 dst[count++] = 0xc0 | (ucs >> 6); 00549 dst[count++] = 0x80 | (ucs & 0x3F); 00550 #if defined(WIN32) || defined(__CYGWIN__) 00551 } else if (ucs >= 0xd800 && ucs <= 0xdbff && i < srclen && 00552 src[i] >= 0xdc00 && src[i] <= 0xdfff) { 00553 /* surrogate pair */ 00554 unsigned ucs2 = src[i++]; 00555 ucs = 0x10000U + ((ucs&0x3ff)<<10) + (ucs2&0x3ff); 00556 /* all surrogate pairs turn into 4-byte utf8 */ 00557 #else 00558 } else if (ucs >= 0x10000) { 00559 if (ucs > 0x10ffff) { 00560 ucs = 0xfffd; 00561 goto J1; 00562 } 00563 #endif 00564 if (count+4 >= dstlen) {dst[count] = 0; count += 4; break;} 00565 dst[count++] = 0xf0 | (ucs >> 18); 00566 dst[count++] = 0x80 | ((ucs >> 12) & 0x3F); 00567 dst[count++] = 0x80 | ((ucs >> 6) & 0x3F); 00568 dst[count++] = 0x80 | (ucs & 0x3F); 00569 } else { 00570 #if !(defined(WIN32) || defined(__CYGWIN__)) 00571 J1: 00572 #endif 00573 /* all others are 3 bytes: */ 00574 if (count+3 >= dstlen) {dst[count] = 0; count += 3; break;} 00575 dst[count++] = 0xe0 | (ucs >> 12); 
00576 dst[count++] = 0x80 | ((ucs >> 6) & 0x3F); 00577 dst[count++] = 0x80 | (ucs & 0x3F); 00578 } 00579 } 00580 /* we filled dst, measure the rest: */ 00581 while (i < srclen) { 00582 unsigned ucs = src[i++]; 00583 if (ucs < 0x80U) { 00584 count++; 00585 } else if (ucs < 0x800U) { /* 2 bytes */ 00586 count += 2; 00587 #if defined(WIN32) || defined(__CYGWIN__) 00588 } else if (ucs >= 0xd800 && ucs <= 0xdbff && i < srclen-1 && 00589 src[i+1] >= 0xdc00 && src[i+1] <= 0xdfff) { 00590 /* surrogate pair */ 00591 ++i; 00592 #else 00593 } else if (ucs >= 0x10000 && ucs <= 0x10ffff) { 00594 #endif 00595 count += 4; 00596 } else { 00597 count += 3; 00598 } 00599 } 00600 return count; 00601 } 00602 00623 unsigned fl_utf8froma(char* dst, unsigned dstlen, 00624 const char* src, unsigned srclen) { 00625 const char* p = src; 00626 const char* e = src+srclen; 00627 unsigned count = 0; 00628 if (dstlen) for (;;) { 00629 unsigned char ucs; 00630 if (p >= e) {dst[count] = 0; return count;} 00631 ucs = *(unsigned char*)p++; 00632 if (ucs < 0x80U) { 00633 dst[count++] = ucs; 00634 if (count >= dstlen) {dst[count-1] = 0; break;} 00635 } else { /* 2 bytes (note that CP1252 translate could make 3 bytes!) 
*/ 00636 if (count+2 >= dstlen) {dst[count] = 0; count += 2; break;} 00637 dst[count++] = 0xc0 | (ucs >> 6); 00638 dst[count++] = 0x80 | (ucs & 0x3F); 00639 } 00640 } 00641 /* we filled dst, measure the rest: */ 00642 while (p < e) { 00643 unsigned char ucs = *(unsigned char*)p++; 00644 if (ucs < 0x80U) { 00645 count++; 00646 } else { 00647 count += 2; 00648 } 00649 } 00650 return count; 00651 } 00652 00653 #ifdef WIN32 00654 # include <windows.h> 00655 #endif 00656 00669 int fl_utf8locale(void) { 00670 static int ret = 2; 00671 if (ret == 2) { 00672 #ifdef WIN32 00673 ret = GetACP() == CP_UTF8; 00674 #else 00675 char* s; 00676 ret = 1; /* assume UTF-8 if no locale */ 00677 if (((s = getenv("LC_CTYPE")) && *s) || 00678 ((s = getenv("LC_ALL")) && *s) || 00679 ((s = getenv("LANG")) && *s)) { 00680 ret = (strstr(s,"utf") || strstr(s,"UTF")); 00681 } 00682 #endif 00683 } 00684 return ret; 00685 } 00686 00702 unsigned fl_utf8to_mb(const char* src, unsigned srclen, 00703 char* dst, unsigned dstlen) 00704 { 00705 if (!fl_utf8locale()) { 00706 #ifdef WIN32 00707 wchar_t lbuf[1024]; 00708 wchar_t* buf = lbuf; 00709 unsigned length = fl_utf8towc(src, srclen, buf, 1024); 00710 unsigned ret; 00711 if (length >= 1024) { 00712 buf = (wchar_t*)(malloc((length+1)*sizeof(wchar_t))); 00713 fl_utf8towc(src, srclen, buf, length+1); 00714 } 00715 if (dstlen) { 00716 /* apparently this does not null-terminate, even though msdn 00717 * documentation claims it does: 00718 */ 00719 ret = 00720 WideCharToMultiByte(GetACP(), 0, buf, length, dst, dstlen, 0, 0); 00721 dst[ret] = 0; 00722 } 00723 /* if it overflows or measuring length, get the actual length: */ 00724 if (dstlen==0 || ret >= dstlen-1) 00725 ret = 00726 WideCharToMultiByte(GetACP(), 0, buf, length, 0, 0, 0, 0); 00727 if (buf != lbuf) free((void*)buf); 00728 return ret; 00729 #else 00730 wchar_t lbuf[1024]; 00731 wchar_t* buf = lbuf; 00732 unsigned length = fl_utf8towc(src, srclen, buf, 1024); 00733 int ret; 00734 if (length >= 
1024) { 00735 buf = (wchar_t*)(malloc((length+1)*sizeof(wchar_t))); 00736 fl_utf8towc(src, srclen, buf, length+1); 00737 } 00738 if (dstlen) { 00739 ret = wcstombs(dst, buf, dstlen); 00740 if (ret >= dstlen-1) ret = wcstombs(0,buf,0); 00741 } else { 00742 ret = wcstombs(0,buf,0); 00743 } 00744 if (buf != lbuf) free((void*)buf); 00745 if (ret >= 0) return (unsigned)ret; 00746 /* on any errors we return the UTF-8 as raw text...*/ 00747 #endif 00748 } 00749 /* identity transform: */ 00750 if (srclen < dstlen) { 00751 memcpy(dst, src, srclen); 00752 dst[srclen] = 0; 00753 } else { 00754 // Buffer insufficent or buffer query 00755 } 00756 return srclen; 00757 } 00758 00776 unsigned fl_utf8from_mb(char* dst, unsigned dstlen, 00777 const char* src, unsigned srclen) 00778 { 00779 if (!fl_utf8locale()) { 00780 #ifdef WIN32 00781 wchar_t lbuf[1024]; 00782 wchar_t* buf = lbuf; 00783 unsigned length; 00784 unsigned ret; 00785 length = MultiByteToWideChar(GetACP(), 0, src, srclen, buf, 1024); 00786 if ((length == 0)&&(GetLastError()==ERROR_INSUFFICIENT_BUFFER)) { 00787 length = MultiByteToWideChar(GetACP(), 0, src, srclen, 0, 0); 00788 buf = (wchar_t*)(malloc(length*sizeof(wchar_t))); 00789 MultiByteToWideChar(GetACP(), 0, src, srclen, buf, length); 00790 } 00791 ret = fl_utf8fromwc(dst, dstlen, buf, length); 00792 if (buf != lbuf) free((void*)buf); 00793 return ret; 00794 #else 00795 wchar_t lbuf[1024]; 00796 wchar_t* buf = lbuf; 00797 int length; 00798 unsigned ret; 00799 length = mbstowcs(buf, src, 1024); 00800 if (length >= 1024) { 00801 length = mbstowcs(0, src, 0)+1; 00802 buf = (wchar_t*)(malloc(length*sizeof(wchar_t))); 00803 mbstowcs(buf, src, length); 00804 } 00805 if (length >= 0) { 00806 ret = fl_utf8fromwc(dst, dstlen, buf, length); 00807 if (buf != lbuf) free((void*)buf); 00808 return ret; 00809 } 00810 /* errors in conversion return the UTF-8 unchanged */ 00811 #endif 00812 } 00813 /* identity transform: */ 00814 if (srclen < dstlen) { 00815 memcpy(dst, src, 
srclen); 00816 dst[srclen] = 0; 00817 } else { 00818 // Buffer insufficent or buffer query 00819 } 00820 return srclen; 00821 } 00822 00843 int fl_utf8test(const char* src, unsigned srclen) { 00844 int ret = 1; 00845 const char* p = src; 00846 const char* e = src+srclen; 00847 while (p < e) { 00848 if (*p & 0x80) { 00849 int len; fl_utf8decode(p,e,&len); 00850 if (len < 2) return 0; 00851 if (len > ret) ret = len; 00852 p += len; 00853 } else { 00854 p++; 00855 } 00856 } 00857 return ret; 00858 } 00859 00860 /* forward declare mk_wcwidth() as static so the name is not visible. 00861 */ 00862 static int mk_wcwidth(unsigned int ucs); 00863 00864 /* include the c source directly so it's contents are only visible here 00865 */ 00866 #include "xutf8/mk_wcwidth.c" 00867 00881 int fl_wcwidth_(unsigned int ucs) { 00882 return mk_wcwidth(ucs); 00883 } 00884 00898 int fl_wcwidth(const char* src) { 00899 int len = fl_utf8len(*src); 00900 int ret = 0; 00901 unsigned int ucs = fl_utf8decode(src, src+len, &ret); 00902 int width = fl_wcwidth_(ucs); 00903 return width; 00904 } 00905 00908 /* 00909 * End of "$Id: fl_utf.c 8214 2011-01-07 17:23:02Z AlbrechtS $". 00910 */