fltk 1.3.0rc3
About: FLTK (Fast Light Tool Kit) is a cross-platform C++ GUI toolkit for UNIX/Linux (X11), Microsoft Windows, and MacOS X. Release candidate.
  SfR Fresh Dox: fltk-1.3.0rc3-source.tar.gz ("unofficial" and still experimental doxygen-generated source code documentation)  

fl_utf.c

Go to the documentation of this file.
00001 /*
00002  * "$Id: fl_utf.c 8214 2011-01-07 17:23:02Z AlbrechtS $"
00003  *
00004  * This is the utf.c file from fltk2 adapted for use in my fltk1.1 port
00005  */
00006 /* Copyright 2006-2010 by Bill Spitzak and others.
00007  *
00008  * This library is free software; you can redistribute it and/or
00009  * modify it under the terms of the GNU Library General Public
00010  * License as published by the Free Software Foundation; either
00011  * version 2 of the License, or (at your option) any later version.
00012  *
00013  * This library is distributed in the hope that it will be useful,
00014  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00015  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00016  * Library General Public License for more details.
00017  *
00018  * You should have received a copy of the GNU Library General Public
00019  * License along with this library; if not, write to the Free Software
00020  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
00021  * USA.
00022  *
00023  * Please report all bugs and problems on the following page:
00024  *
00025  *     http://www.fltk.org/str.php
00026  */
00027 
00028 /* Modified to obey rfc3629, which limits unicode to 0-0x10ffff */
00029 
00030 #include <FL/fl_utf8.h>
00031 #include <string.h>
00032 #include <stdlib.h>
00033 
00039 #if 0
00040 
00052   /* FL_EXPORT int fl_unichar_to_utf8(unsigned int uc, char *text); */
00053   
00066   /* FL_EXPORT int fl_utf8_size(unsigned int uc); */
00067   
00069 #endif /* 0 */
00070   
/* Error-handling policy switches used by fl_utf8decode().
 *
 * ERRORS_TO_ISO8859_1: when nonzero, a byte that cannot start a valid
 * UTF-8 sequence is returned unchanged (i.e. read as ISO-8859-1);
 * when zero, U+FFFD (REPLACEMENT CHARACTER) is returned instead.
 */
#define ERRORS_TO_ISO8859_1 1

/* ERRORS_TO_CP1252: when nonzero, bytes 0x80..0x9f are translated
 * through the cp1252[] table below instead of being treated as errors.
 */
#define ERRORS_TO_CP1252 1

/* STRICT_RFC3629: when nonzero, the decoder additionally rejects the
 * surrogate range and code points ending in FFFE/FFFF.
 */
#define STRICT_RFC3629 0

#if ERRORS_TO_CP1252
/* Codes 0x80..0x9f from the Microsoft CP1252 character set, translated
 * to Unicode.  Index with (byte - 0x80).  The table is read-only, so it
 * is declared const.
 */
static const unsigned short cp1252[32] = {
  0x20ac, 0x0081, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
  0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008d, 0x017d, 0x008f,
  0x0090, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
  0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x009d, 0x017e, 0x0178
};
#endif
00108 
/* Decode the single UTF-8 encoded character that starts at p.
 *
 * p    points at the first byte to decode.
 * end  if non-null, no byte at or beyond this address is accepted as
 *      part of the sequence; a sequence that would need such a byte is
 *      treated as an error.  If null, no bound is applied.
 * len  if non-null, receives the number of bytes consumed (1..4).
 *
 * Returns the decoded code point.  On any malformed, overlong or
 * out-of-range sequence exactly one byte is consumed and the return
 * value is either that byte itself (ERRORS_TO_ISO8859_1) or 0xfffd.
 * With ERRORS_TO_CP1252 set, bytes 0x80..0x9f are mapped through
 * cp1252[] and consume one byte.
 */
unsigned fl_utf8decode(const char* p, const char* end, int* len)
{
  const unsigned char* s = (const unsigned char*)p;
  const unsigned char lead = s[0];

  /* Plain ASCII. */
  if (lead < 0x80) {
    if (len) *len = 1;
    return lead;
  }
#if ERRORS_TO_CP1252
  /* Stray C1 byte: interpret as CP1252. */
  if (lead < 0xa0) {
    if (len) *len = 1;
    return cp1252[lead - 0x80];
  }
#endif
  /* Below 0xc2: continuation bytes and overlong 2-byte leads.
   * Above 0xf4: would encode values past 0x10ffff. */
  if (lead < 0xc2 || lead > 0xf4) goto FAIL;

  /* Every remaining lead byte needs at least one continuation byte. */
  if (end && p+1 >= end) goto FAIL;
  if ((s[1] & 0xc0) != 0x80) goto FAIL;

  /* Two-byte sequence. */
  if (lead < 0xe0) {
    if (len) *len = 2;
    return
      ((lead & 0x1f) << 6) +
      ((s[1] & 0x3f));
  }

  /* Three-byte sequence. */
  if (lead < 0xf0) {
    /* 0xe0 followed by less than 0xa0 is an overlong encoding. */
    if (lead == 0xe0 && s[1] < 0xa0) goto FAIL;
#if STRICT_RFC3629
    /* RFC 3629 says surrogate chars are illegal. */
    if (lead == 0xed && s[1] >= 0xa0) goto FAIL;
    /* 0xfffe and 0xffff are also illegal characters */
    if (lead == 0xef && s[1] == 0xbf && s[2] >= 0xbe) goto FAIL;
#endif
    if (end && p+2 >= end) goto FAIL;
    if ((s[2] & 0xc0) != 0x80) goto FAIL;
    if (len) *len = 3;
    return
      ((lead & 0x0f) << 12) +
      ((s[1] & 0x3f) << 6) +
      ((s[2] & 0x3f));
  }

  /* Four-byte sequence (lead is 0xf0..0xf4 here). */
  if (lead == 0xf0 && s[1] < 0x90) goto FAIL;  /* overlong */
  if (lead == 0xf4 && s[1] > 0x8f) goto FAIL;  /* after 0x10ffff */
  if (end && p+3 >= end) goto FAIL;
  if ((s[2] & 0xc0) != 0x80) goto FAIL;
  if ((s[3] & 0xc0) != 0x80) goto FAIL;
  if (len) *len = 4;
#if STRICT_RFC3629
  /* RFC 3629 says all codes ending in fffe or ffff are illegal: */
  if ((s[1] & 0xf) == 0xf && s[2] == 0xbf && s[3] >= 0xbe) goto FAIL;
#endif
  return
    ((lead & 0x07) << 18) +
    ((s[1] & 0x3f) << 12) +
    ((s[2] & 0x3f) << 6) +
    ((s[3] & 0x3f));

FAIL:
  /* Consume exactly one byte so the caller can resynchronise. */
  if (len) *len = 1;
#if ERRORS_TO_ISO8859_1
  return lead;
#else
  return 0xfffd; /* Unicode REPLACEMENT CHARACTER */
#endif
}
00213 
/* Move p forward to a character boundary.
 *
 * If p points at a continuation byte (10xxxxxx), look backwards (not
 * before start) for the lead byte of the sequence, decode it, and if
 * that sequence extends past p return the address just after it.
 * In every other case p is returned unchanged.
 *
 * start  lowest address that may be examined.
 * end    passed to fl_utf8decode() as the decoding limit.
 */
const char* fl_utf8fwd(const char* p, const char* start, const char* end)
{
  const char* lead = p;
  int nbytes;

  /* not on a continuation byte: already at a boundary */
  if ((*p & 0xc0) != 0x80) return p;

  /* walk back to the byte that has both top bits set */
  for (;;) {
    if (lead <= start) return p;        /* nothing left to look at */
    --lead;
    if (!(lead[0] & 0x80)) return p;    /* ASCII: p is a stray byte */
    if (lead[0] & 0x40) break;          /* found a lead byte */
  }

  fl_utf8decode(lead, end, &nbytes);
  lead += nbytes;
  return (lead > p) ? lead : p;
}
00249 
/* Move p backward to a character boundary.
 *
 * If p points at a continuation byte (10xxxxxx), look backwards (not
 * before start) for the lead byte of the sequence, decode it, and if
 * that sequence covers p return the address of the lead byte.
 * In every other case p is returned unchanged.
 *
 * start  lowest address that may be examined.
 * end    passed to fl_utf8decode() as the decoding limit.
 */
const char* fl_utf8back(const char* p, const char* start, const char* end)
{
  const char* lead = p;
  int nbytes;

  /* not on a continuation byte: already at a boundary */
  if ((*p & 0xc0) != 0x80) return p;

  /* walk back to the byte that has both top bits set */
  for (;;) {
    if (lead <= start) return p;        /* nothing left to look at */
    --lead;
    if (!(lead[0] & 0x80)) return p;    /* ASCII: p is a stray byte */
    if (lead[0] & 0x40) break;          /* found a lead byte */
  }

  fl_utf8decode(lead, end, &nbytes);
  return (lead + nbytes > p) ? lead : p;
}
00279 
/* Return the number of bytes fl_utf8encode() writes for ucs.
 * Values above 0x10ffff report 3, the length of the replacement
 * encoding that is emitted for them.
 */
int fl_utf8bytes(unsigned ucs) {
  if (ucs > 0x10ffffU)  return 3; /* length of the illegal character encoding */
  if (ucs >= 0x010000U) return 4;
  if (ucs >= 0x000800U) return 3;
  return (ucs >= 0x000080U) ? 2 : 1;
}
00295 
/* Write the UTF-8 encoding of ucs into buf and return the number of
 * bytes written (1..4).  No terminator is written.  Values above
 * 0x10ffff are written as the 3-byte encoding of 0xfffd.
 * buf must have room for at least 4 bytes.
 */
int fl_utf8encode(unsigned ucs, char* buf) {
  /* lead-byte tag, indexed by total sequence length */
  static const unsigned char lead_tag[5] = { 0x00, 0x00, 0xc0, 0xe0, 0xf0 };
  unsigned rest;
  int nbytes;
  int i;

  if (ucs < 0x000080U) {
    buf[0] = (char)ucs;
    return 1;
  }

  /* out of range: encode 0xfffd instead (yields ef bf bd) */
  if (ucs > 0x0010ffffU) ucs = 0xfffdU;

  if (ucs < 0x000800U)      nbytes = 2;
  else if (ucs < 0x010000U) nbytes = 3;
  else                      nbytes = 4;

  /* fill continuation bytes from the end, six bits at a time */
  rest = ucs;
  for (i = nbytes - 1; i > 0; --i) {
    buf[i] = (char)(0x80 | (rest & 0x3F));
    rest >>= 6;
  }
  buf[0] = (char)(lead_tag[nbytes] | rest);
  return nbytes;
}
00339 
/* Convert srclen bytes of UTF-8 at src to UTF-16 in dst.
 *
 * At most dstlen 16-bit units are written, the last of which is always
 * a zero terminator (when dstlen is nonzero).  Code points of 0x10000
 * and above become a surrogate pair; a pair is never split, so
 * conversion stops early if both halves plus the terminator do not fit.
 *
 * Returns the number of 16-bit units the complete conversion needs,
 * not counting the terminator.  A return value >= dstlen therefore
 * means the output was truncated.  With dstlen == 0 nothing is written
 * and only the length is computed.
 */
unsigned fl_utf8toUtf16(const char* src, unsigned srclen,
                        unsigned short* dst, unsigned dstlen)
{
  const char* cur = src;
  const char* const stop = src + srclen;
  unsigned n = 0;       /* units written so far, then units measured */
  unsigned ucs;
  int nbytes;

  if (dstlen) {
    for (;;) {
      if (cur >= stop) {
        /* everything fit */
        dst[n] = 0;
        return n;
      }
      if (*cur & 0x80) {
        ucs = fl_utf8decode(cur, stop, &nbytes);
        cur += nbytes;
        if (ucs >= 0x10000) {
          /* make a surrogate pair, but only if both halves fit: */
          if (n+2 >= dstlen) {
            dst[n] = 0;
            n += 2;
            break;
          }
          dst[n++] = (((ucs-0x10000u)>>10)&0x3ff) | 0xd800;
          dst[n]   = (ucs&0x3ff) | 0xdc00;
        } else {
          dst[n] = ucs;
        }
      } else {
        /* ascii */
        dst[n] = *cur++;
      }
      ++n;
      if (n == dstlen) {
        /* out of room: the last slot becomes the terminator */
        dst[n-1] = 0;
        break;
      }
    }
  }

  /* we filled dst, measure the rest: */
  for (; cur < stop; ++n) {
    if (*cur & 0x80) {
      ucs = fl_utf8decode(cur, stop, &nbytes);
      cur += nbytes;
      if (ucs >= 0x10000) ++n;  /* needs two units */
    } else {
      ++cur;
    }
  }
  return n;
}
00407 
00408 
/* Convert srclen bytes of UTF-8 at src to wide characters in dst.
 *
 * On WIN32/Cygwin this forwards to fl_utf8toUtf16().  Elsewhere each
 * decoded code point is stored in one wchar_t.
 *
 * At most dstlen elements are written, the last of which is always a
 * zero terminator (when dstlen is nonzero).  Returns the number of
 * elements the complete conversion needs, not counting the terminator;
 * a value >= dstlen means the output was truncated.  With dstlen == 0
 * nothing is written and only the length is computed.
 */
unsigned fl_utf8towc(const char* src, unsigned srclen,
                     wchar_t* dst, unsigned dstlen)
{
#if defined(WIN32) || defined(__CYGWIN__)
  return fl_utf8toUtf16(src, srclen, (unsigned short*)dst, dstlen);
#else
  const char* cur = src;
  const char* const stop = src + srclen;
  unsigned n = 0;
  int nbytes;

  if (dstlen) {
    for (;;) {
      if (cur >= stop) {
        /* everything fit */
        dst[n] = 0;
        return n;
      }
      if (*cur & 0x80) {
        dst[n] = (wchar_t)fl_utf8decode(cur, stop, &nbytes);
        cur += nbytes;
      } else {
        /* ascii */
        dst[n] = *cur++;
      }
      if (++n == dstlen) {
        /* out of room: the last slot becomes the terminator */
        dst[n-1] = 0;
        break;
      }
    }
  }

  /* we filled dst, measure the rest: */
  for (; cur < stop; ++n) {
    if (*cur & 0x80) {
      fl_utf8decode(cur, stop, &nbytes);
      cur += nbytes;
    } else {
      ++cur;
    }
  }
  return n;
#endif
}
00453 
/* Convert srclen bytes of UTF-8 at src to single-byte text in dst.
 *
 * Bytes below 0xC2 are copied through unchanged.  Anything else is
 * decoded with fl_utf8decode(); results below 0x100 are stored as one
 * byte and all other code points are replaced by '?'.
 *
 * At most dstlen bytes are written, the last of which is always a zero
 * terminator (when dstlen is nonzero).  Returns the number of bytes the
 * complete conversion needs, not counting the terminator; a value
 * >= dstlen means the output was truncated.  With dstlen == 0 nothing
 * is written and only the length is computed.
 */
unsigned fl_utf8toa(const char* src, unsigned srclen,
                    char* dst, unsigned dstlen)
{
  const char* cur = src;
  const char* const stop = src + srclen;
  unsigned n = 0;
  unsigned ucs;
  unsigned char byte;
  int nbytes;

  if (dstlen) {
    for (;;) {
      if (cur >= stop) {
        /* everything fit */
        dst[n] = 0;
        return n;
      }
      byte = *(const unsigned char*)cur;
      if (byte >= 0xC2) {
        ucs = fl_utf8decode(cur, stop, &nbytes);
        cur += nbytes;
        if (ucs >= 0x100) dst[n] = '?';
        else dst[n] = ucs;
      } else {
        /* ascii or bad code */
        dst[n] = byte;
        ++cur;
      }
      if (++n >= dstlen) {
        /* out of room: the last slot becomes the terminator */
        dst[n-1] = 0;
        break;
      }
    }
  }

  /* we filled dst, measure the rest: */
  for (; cur < stop; ++n) {
    if (*cur & 0x80) {
      fl_utf8decode(cur, stop, &nbytes);
      cur += nbytes;
    } else {
      ++cur;
    }
  }
  return n;
}
00507 
/* Convert srclen wide characters at src to UTF-8 in dst.
 *
 * At most dstlen bytes are written, always ending in a zero terminator
 * (when dstlen is nonzero).  A multi-byte sequence is never split: if
 * the sequence plus the terminator does not fit, conversion stops.
 *
 * On WIN32/Cygwin a high surrogate immediately followed by a low
 * surrogate is combined into one 4-byte sequence; elsewhere values
 * 0x10000..0x10ffff are written as 4 bytes and values above 0x10ffff
 * are replaced by 0xfffd.  Everything else from 0x800 up takes 3 bytes.
 *
 * Returns the number of bytes the complete conversion needs, not
 * counting the terminator; a value >= dstlen means the output was
 * truncated.  With dstlen == 0 nothing is written and only the length
 * is computed.
 */
unsigned fl_utf8fromwc(char* dst, unsigned dstlen,
                       const wchar_t* src, unsigned srclen) {
  unsigned i = 0;
  unsigned count = 0;
  if (dstlen) for (;;) {
    unsigned ucs;
    if (i >= srclen) {dst[count] = 0; return count;}
    ucs = src[i++];
    if (ucs < 0x80U) {
      dst[count++] = ucs;
      if (count >= dstlen) {dst[count-1] = 0; break;}
    } else if (ucs < 0x800U) { /* 2 bytes */
      if (count+2 >= dstlen) {dst[count] = 0; count += 2; break;}
      dst[count++] = 0xc0 | (ucs >> 6);
      dst[count++] = 0x80 | (ucs & 0x3F);
#if defined(WIN32) || defined(__CYGWIN__)
    } else if (ucs >= 0xd800 && ucs <= 0xdbff && i < srclen &&
               src[i] >= 0xdc00 && src[i] <= 0xdfff) {
      /* surrogate pair */
      unsigned ucs2 = src[i++];
      ucs = 0x10000U + ((ucs&0x3ff)<<10) + (ucs2&0x3ff);
      /* all surrogate pairs turn into 4-byte utf8 */
#else
    } else if (ucs >= 0x10000) {
      if (ucs > 0x10ffff) {
        /* not a Unicode code point: emit the replacement character */
        ucs = 0xfffd;
        goto J1;
      }
#endif
      if (count+4 >= dstlen) {dst[count] = 0; count += 4; break;}
      dst[count++] = 0xf0 | (ucs >> 18);
      dst[count++] = 0x80 | ((ucs >> 12) & 0x3F);
      dst[count++] = 0x80 | ((ucs >> 6) & 0x3F);
      dst[count++] = 0x80 | (ucs & 0x3F);
    } else {
#if !(defined(WIN32) || defined(__CYGWIN__))
    J1:
#endif
      /* all others are 3 bytes: */
      if (count+3 >= dstlen) {dst[count] = 0; count += 3; break;}
      dst[count++] = 0xe0 | (ucs >> 12);
      dst[count++] = 0x80 | ((ucs >> 6) & 0x3F);
      dst[count++] = 0x80 | (ucs & 0x3F);
    }
  }
  /* we filled dst, measure the rest: */
  while (i < srclen) {
    unsigned ucs = src[i++];
    if (ucs < 0x80U) {
      count++;
    } else if (ucs < 0x800U) { /* 2 bytes */
      count += 2;
#if defined(WIN32) || defined(__CYGWIN__)
    /* i already indexes the element after ucs, so the low surrogate is
     * src[i] -- the same test the conversion loop above uses. */
    } else if (ucs >= 0xd800 && ucs <= 0xdbff && i < srclen &&
               src[i] >= 0xdc00 && src[i] <= 0xdfff) {
      /* surrogate pair */
      ++i;
#else
    } else if (ucs >= 0x10000 && ucs <= 0x10ffff) {
#endif
      count += 4;
    } else {
      count += 3;
    }
  }
  return count;
}
00602 
/* Convert srclen bytes of single-byte text at src to UTF-8 in dst.
 *
 * Bytes below 0x80 are copied; every other byte value is written as
 * the 2-byte UTF-8 encoding of that same value.
 *
 * At most dstlen bytes are written, always ending in a zero terminator
 * (when dstlen is nonzero); a 2-byte sequence is never split.  Returns
 * the number of bytes the complete conversion needs, not counting the
 * terminator; a value >= dstlen means the output was truncated.  With
 * dstlen == 0 nothing is written and only the length is computed.
 */
unsigned fl_utf8froma(char* dst, unsigned dstlen,
                      const char* src, unsigned srclen) {
  const unsigned char* in = (const unsigned char*)src;
  const unsigned char* const in_end = in + srclen;
  unsigned n = 0;
  unsigned char byte;

  if (dstlen) {
    for (;;) {
      if (in >= in_end) {
        /* everything fit */
        dst[n] = 0;
        return n;
      }
      byte = *in++;
      if (byte >= 0x80U) {
        /* 2 bytes (note that CP1252 translate could make 3 bytes!) */
        if (n+2 >= dstlen) {
          dst[n] = 0;
          n += 2;
          break;
        }
        dst[n]   = 0xc0 | (byte >> 6);
        dst[n+1] = 0x80 | (byte & 0x3F);
        n += 2;
      } else {
        dst[n++] = byte;
        if (n >= dstlen) {
          /* out of room: the last slot becomes the terminator */
          dst[n-1] = 0;
          break;
        }
      }
    }
  }

  /* we filled dst, measure the rest: */
  for (; in < in_end; ++in) {
    n += (*in < 0x80U) ? 1 : 2;
  }
  return n;
}
00652 
00653 #ifdef WIN32
00654 # include <windows.h>
00655 #endif
00656 
/* Report whether the current locale uses UTF-8.
 *
 * Returns 1 if it does, 0 otherwise.  The answer is computed on the
 * first call and cached in a static, so later changes to the
 * environment are not seen.
 *
 * WIN32: true when the ANSI code page is CP_UTF8.
 * Elsewhere: the first non-empty variable among LC_ALL, LC_CTYPE and
 * LANG is examined, in that order (the POSIX precedence for the
 * character-type category); the result is true when its value contains
 * "utf" or "UTF".  If none of them is set, UTF-8 is assumed.
 */
int fl_utf8locale(void) {
  static int ret = 2; /* 2 means "not determined yet" */
  if (ret == 2) {
#ifdef WIN32
    ret = GetACP() == CP_UTF8;
#else
    /* highest priority first */
    static const char* const vars[3] = { "LC_ALL", "LC_CTYPE", "LANG" };
    unsigned i;
    ret = 1; /* assume UTF-8 if no locale */
    for (i = 0; i < 3; i++) {
      const char* s = getenv(vars[i]);
      if (s && *s) {
        ret = (strstr(s,"utf") || strstr(s,"UTF"));
        break;
      }
    }
#endif
  }
  return ret;
}
00686 
/* Convert srclen bytes of UTF-8 at src to the locale's multibyte
 * encoding in dst.
 *
 * If fl_utf8locale() reports a UTF-8 locale, or the conversion fails
 * (including failure to allocate a temporary buffer), the text is
 * copied unchanged: when srclen < dstlen the bytes plus a terminator
 * are written, otherwise dst is left untouched.  srclen is returned.
 *
 * Otherwise the text is converted to wide characters and then through
 * WideCharToMultiByte() (WIN32) or wcstombs() (elsewhere).  When dstlen
 * is nonzero dst is always zero-terminated, even if the converted text
 * had to be cut off; cutting off may split a multibyte character.
 * Returns the number of bytes the complete conversion needs, not
 * counting the terminator.  With dstlen == 0 only the length is
 * computed.
 */
unsigned fl_utf8to_mb(const char* src, unsigned srclen,
                      char* dst, unsigned dstlen)
{
  if (!fl_utf8locale()) {
#ifdef WIN32
    wchar_t lbuf[1024];
    wchar_t* buf = lbuf;
    unsigned length = fl_utf8towc(src, srclen, buf, 1024);
    unsigned ret = 0;
    if (length >= 1024) {
      /* too long for the stack buffer: convert again into heap memory */
      buf = (wchar_t*)(malloc((length+1)*sizeof(wchar_t)));
      if (buf) fl_utf8towc(src, srclen, buf, length+1);
    }
    if (buf) {
      if (dstlen) {
        /* apparently this does not null-terminate, even though msdn
         * documentation claims it does:
         */
        ret =
          WideCharToMultiByte(GetACP(), 0, buf, length, dst, dstlen, 0, 0);
        /* ret may equal dstlen, so clamp the terminator into the buffer */
        dst[ret < dstlen ? ret : dstlen-1] = 0;
      }
      /* if it overflows (reported as 0 or a full buffer) or we are
       * measuring length, get the actual length: */
      if (dstlen==0 || ret==0 || ret >= dstlen-1)
        ret =
          WideCharToMultiByte(GetACP(), 0, buf, length, 0, 0, 0, 0);
      if (buf != lbuf) free((void*)buf);
      return ret;
    }
    /* allocation failed: fall through and return the UTF-8 as raw text */
#else
    wchar_t lbuf[1024];
    wchar_t* buf = lbuf;
    unsigned length = fl_utf8towc(src, srclen, buf, 1024);
    size_t ret = (size_t)-1; /* (size_t)-1 is wcstombs' error value */
    if (length >= 1024) {
      /* too long for the stack buffer: convert again into heap memory */
      buf = (wchar_t*)(malloc((length+1)*sizeof(wchar_t)));
      if (buf) fl_utf8towc(src, srclen, buf, length+1);
    }
    if (buf) {
      if (dstlen) {
        ret = wcstombs(dst, buf, dstlen);
        if (ret != (size_t)-1 && ret >= dstlen-1) {
          /* The buffer is (nearly) full.  wcstombs() writes no
           * terminator when it runs out of room, so add one, then
           * measure what the complete conversion needs. */
          dst[dstlen-1] = 0;
          ret = wcstombs(0,buf,0);
        }
      } else {
        ret = wcstombs(0,buf,0);
      }
      if (buf != lbuf) free((void*)buf);
      if (ret != (size_t)-1) return (unsigned)ret;
    }
    /* on any errors we return the UTF-8 as raw text...*/
#endif
  }
  /* identity transform: */
  if (srclen < dstlen) {
    memcpy(dst, src, srclen);
    dst[srclen] = 0;
  } else {
    /* Buffer insufficient or buffer query: nothing is written */
  }
  return srclen;
}
00758 
/* Convert text in the locale's multibyte encoding at src to UTF-8 in
 * dst.
 *
 * If fl_utf8locale() reports a UTF-8 locale, or the conversion fails
 * (including failure to allocate a temporary buffer), the text is
 * copied unchanged: when srclen < dstlen the bytes plus a terminator
 * are written, otherwise dst is left untouched.  srclen is returned.
 *
 * Otherwise the text is converted to wide characters with
 * MultiByteToWideChar() (WIN32) or mbstowcs() (elsewhere) and the
 * result is handed to fl_utf8fromwc(), whose return value is returned.
 *
 * NOTE: on the non-WIN32 path srclen is not passed to mbstowcs(), so
 * src has to be zero-terminated there.
 */
unsigned fl_utf8from_mb(char* dst, unsigned dstlen,
                        const char* src, unsigned srclen)
{
  if (!fl_utf8locale()) {
#ifdef WIN32
    wchar_t lbuf[1024];
    wchar_t* buf = lbuf;
    unsigned length;
    unsigned ret;
    length = MultiByteToWideChar(GetACP(), 0, src, srclen, buf, 1024);
    if ((length == 0)&&(GetLastError()==ERROR_INSUFFICIENT_BUFFER)) {
      /* too long for the stack buffer: measure, then use heap memory */
      length = MultiByteToWideChar(GetACP(), 0, src, srclen, 0, 0);
      buf = (wchar_t*)(malloc(length*sizeof(wchar_t)));
      if (buf) MultiByteToWideChar(GetACP(), 0, src, srclen, buf, length);
    }
    if (buf) {
      ret = fl_utf8fromwc(dst, dstlen, buf, length);
      if (buf != lbuf) free((void*)buf);
      return ret;
    }
    /* allocation failed: fall through and return the text unchanged */
#else
    wchar_t lbuf[1024];
    wchar_t* buf = lbuf;
    size_t length; /* wide characters, terminator NOT included */
    unsigned ret;
    length = mbstowcs(buf, src, 1024);
    if (length != (size_t)-1 && length >= 1024) {
      /* too long for the stack buffer: measure, then use heap memory */
      length = mbstowcs(0, src, 0);
      if (length != (size_t)-1) {
        /* one extra element so mbstowcs() can store its terminator;
         * it is deliberately not counted in length */
        buf = (wchar_t*)(malloc((length+1)*sizeof(wchar_t)));
        if (buf) mbstowcs(buf, src, length+1);
        else length = (size_t)-1; /* treat as a conversion failure */
      }
    }
    if (length != (size_t)-1) {
      ret = fl_utf8fromwc(dst, dstlen, buf, (unsigned)length);
      if (buf != lbuf) free((void*)buf);
      return ret;
    }
    /* errors in conversion return the UTF-8 unchanged */
#endif
  }
  /* identity transform: */
  if (srclen < dstlen) {
    memcpy(dst, src, srclen);
    dst[srclen] = 0;
  } else {
    /* Buffer insufficient or buffer query: nothing is written */
  }
  return srclen;
}
00822 
/* Examine srclen bytes at src and classify them as UTF-8.
 *
 * Returns 0 as soon as a byte with the high bit set is found for which
 * fl_utf8decode() consumes fewer than 2 bytes.  Otherwise returns 1 if
 * only 7-bit bytes were seen, or the length in bytes (2, 3 or 4) of the
 * longest multi-byte sequence encountered.
 */
int fl_utf8test(const char* src, unsigned srclen) {
  const char* cur = src;
  const char* const stop = src + srclen;
  int longest = 1;
  int nbytes;

  while (cur < stop) {
    if (!(*cur & 0x80)) {
      /* 7-bit byte */
      ++cur;
      continue;
    }
    fl_utf8decode(cur, stop, &nbytes);
    if (nbytes < 2) return 0;
    if (longest < nbytes) longest = nbytes;
    cur += nbytes;
  }
  return longest;
}
00859 
00860 /* forward declare mk_wcwidth() as static so the name is not visible.
00861  */
00862  static int mk_wcwidth(unsigned int ucs);
00863 
00864  /* include the c source directly so its contents are only visible here
00865   */
00866 #include "xutf8/mk_wcwidth.c"
00867 
/* Public wrapper around the file-local mk_wcwidth(): passes ucs through
 * and returns its result unchanged.
 */
int fl_wcwidth_(unsigned int ucs) {
  const int columns = mk_wcwidth(ucs);
  return columns;
}
00884 
/* Return fl_wcwidth_() of the first UTF-8 encoded character at src.
 *
 * fl_utf8len() of the first byte is used to set the decoding limit for
 * fl_utf8decode(); the number of bytes actually consumed is not
 * reported to the caller.
 * NOTE(review): fl_utf8len() is declared elsewhere; confirm what it
 * returns for bytes that cannot start a sequence.
 */
int fl_wcwidth(const char* src) {
  int consumed = 0;
  const int seqlen = fl_utf8len(*src);
  const unsigned int ucs = fl_utf8decode(src, src+seqlen, &consumed);
  return fl_wcwidth_(ucs);
}
00905 
00908 /*
00909  * End of "$Id: fl_utf.c 8214 2011-01-07 17:23:02Z AlbrechtS $".
00910  */