fltk 1.3.0rc3
About: FLTK (Fast Light Tool Kit) is a cross-platform C++ GUI toolkit for UNIX/Linux (X11), Microsoft Windows, and MacOS X. Release candidate.
  SfR Fresh Dox: fltk-1.3.0rc3-source.tar.gz ("unofficial" and yet experimental doxygen-generated source code documentation)  

utf8Utils.c

Go to the documentation of this file.
00001 /* "$Id: $"
00002  *
00003  * Author: Jean-Marc Lienher ( http://oksid.ch )
00004  * Copyright 2000-2003 by O'ksi'D.
00005  *
00006  * This library is free software; you can redistribute it and/or
00007  * modify it under the terms of the GNU Library General Public
00008  * License as published by the Free Software Foundation; either
00009  * version 2 of the License, or (at your option) any later version.
00010  *
00011  * This library is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014  * Library General Public License for more details.
00015  *
00016  * You should have received a copy of the GNU Library General Public
00017  * License along with this library; if not, write to the Free Software
00018  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
00019  * USA.
00020  *
00021  * Please report all bugs and problems on the following page:
00022  *
00023  *     http://www.fltk.org/str.php
00024  */
00025 
00026 /*
00027  * Unicode to UTF-8 conversion functions.
00028  */
00029 
00030 #if !defined(WIN32) && !defined(__APPLE__)
00031 
00032 #include "../../FL/Xutf8.h"
00033 
00034 /*** NOTE : all functions are LIMITED to 24 bits Unicode values !!! ***/
00035 
/*
 * Converts the first character of the UTF-8 string to a Unicode value.
 *
 *   buf : the UTF-8 bytes; it is not accessed when len is less than 1
 *   len : number of bytes available in buf
 *   ucs : receives the decoded value, or '?' when decoding fails
 *
 * Returns the byte length (1 to 5) of the converted UTF-8 character.
 * Returns -1 if the UTF-8 string is not valid: empty input, truncated
 * sequence, bad lead or continuation byte, overlong encoding, or a
 * value that does not fit in 24 bits.
 */
int
XConvertUtf8ToUcs(const unsigned char     *buf,
                  int                     len,
                  unsigned int            *ucs) {

  unsigned int lead;
  unsigned int value;
  unsigned int min_value;
  int          nbytes;
  int          i;

  /* Check the length first so that buf[0] is never read on empty input. */
  if (len < 1) goto bad;

  lead = buf[0];
  if (lead < 0x80) {
    /* 0x00000000 - 0x0000007F */
    *ucs = lead;
    return 1;
  } else if ((lead & 0xE0) == 0xC0) {
    /* 0x00000080 - 0x000007FF */
    nbytes = 2;
    value = lead & 0x1F;
    min_value = 0x00000080;
  } else if ((lead & 0xF0) == 0xE0) {
    /* 0x00000800 - 0x0000FFFF */
    nbytes = 3;
    value = lead & 0x0F;
    min_value = 0x00000800;
  } else if ((lead & 0xF8) == 0xF0) {
    /* 0x00010000 - 0x001FFFFF */
    nbytes = 4;
    value = lead & 0x07;
    min_value = 0x00010000;
  } else if ((lead & 0xFC) == 0xF8) {
    /* 0x00200000 - 0x03FFFFFF (only values below 0x01000000 are accepted) */
    nbytes = 5;
    value = lead & 0x03;
    min_value = 0x00200000;
  } else {
    /* stray continuation byte, 6-byte lead (unsupported) or 0xFE/0xFF */
    goto bad;
  }

  /* The whole sequence must be available before any trailing byte is read. */
  if (len < nbytes) goto bad;

  for (i = 1; i < nbytes; i++) {
    /* every trailing byte must look like 10xxxxxx */
    if ((buf[i] & 0xC0) != 0x80) goto bad;
    value = (value << 6) | (buf[i] & 0x3Fu);
  }

  /* Reject overlong encodings and anything beyond the 24-bit limit. */
  if (value < min_value || value > 0x00FFFFFF) goto bad;

  *ucs = value;
  return nbytes;

bad:
  *ucs = (unsigned int) '?'; /* bad utf-8 string */
  return -1;
}
00106 
/*
 * Encodes one Unicode value as a UTF-8 byte sequence.
 *
 *   ucs : the value to encode (values of 0x01000000 and above are rejected)
 *   buf : destination; no terminating NUL is written
 *
 * Returns the number of bytes stored (1 to 5). For a rejected value,
 * a single '?' is stored in buf[0] and -1 is returned.
 * NOTE : the buffer (buf) must be at least 5 bytes long !!!
 */
int
XConvertUcsToUtf8(unsigned int  ucs,
                  char          *buf) {

  unsigned int lead_mark;
  int          nbytes;
  int          pos;

  /* Pick the sequence length and the marker bits of the lead byte. */
  if (ucs < 0x000080) {
    nbytes = 1;
    lead_mark = 0x00;
  } else if (ucs < 0x000800) {
    nbytes = 2;
    lead_mark = 0xC0;
  } else if (ucs < 0x010000) {
    nbytes = 3;
    lead_mark = 0xE0;
  } else if (ucs < 0x00200000) {
    nbytes = 4;
    lead_mark = 0xF0;
  } else if (ucs < 0x01000000) {
    nbytes = 5;
    lead_mark = 0xF8;
  } else {
    buf[0] = '?';
    return -1;
  }

  /* Emit the trailing bytes last-to-first, six payload bits at a time. */
  for (pos = nbytes - 1; pos > 0; pos--) {
    buf[pos] = (char) (0x80 | (ucs & 0x3F));
    ucs >>= 6;
  }

  /* Whatever is left of the value belongs in the lead byte. */
  buf[0] = (char) (lead_mark | ucs);
  return nbytes;
}
00144 
/*
 * Reports how many bytes the first UTF-8 character of buf occupies.
 *
 * The work is delegated to XConvertUtf8ToUcs(); the decoded value is
 * thrown away and only the byte length it reports is passed back
 * (-1 if the character is not valid).
 */
int
XUtf8CharByteLen(const unsigned char     *buf,
                 int                     len) {
  unsigned int ignored_value;
  int          byte_len;

  byte_len = XConvertUtf8ToUcs(buf, len, &ignored_value);
  return byte_len;
}
00155 
/*
 * Counts the Unicode characters held in the first len bytes of buf.
 *
 * Each step asks XUtf8CharByteLen() for the size of the character at
 * the current offset. When that call reports less than 1 (invalid
 * data), the offending byte is counted as one character and skipped.
 */
int
XCountUtf8Char(const unsigned char      *buf,
               int                      len) {

  int count = 0;
  int offset;
  int step;

  for (offset = 0; offset < len; offset += step) {
    step = XUtf8CharByteLen(buf + offset, len - offset);
    if (step < 1) {
      /* always advance, otherwise invalid input would loop forever */
      step = 1;
    }
    count++;
  }
  return count;
}
00173 
/*
 * Same job as XConvertUtf8ToUcs but no sanity check is done on the
 * decoded data: trailing bytes are not verified to be continuation
 * bytes, and overlong or out-of-range results are returned as they are.
 *
 *   buf : the UTF-8 bytes; it is not accessed when len is less than 1
 *   len : number of bytes available in buf
 *   ucs : receives the decoded value, or '?' on failure
 *
 * Returns the byte length (1 to 5) implied by the lead byte.
 * Returns -1 when the input is empty, when the lead byte is not a
 * usable lead byte (stray continuation byte, 6-byte form, 0xFE/0xFF),
 * or when len is too short for the sequence.
 */
int
XFastConvertUtf8ToUcs(const unsigned char     *buf,
                      int                     len,
                      unsigned int            *ucs) {

  unsigned int lead;

  /* The length is tested first so that buf[0] is never read on empty input. */
  if (len > 0) {
    lead = buf[0];

    if (lead < 0x80) {
      /* 0x00000000 - 0x0000007F */
      *ucs = lead;
      return 1;
    }

    /*
     * Trailing bytes only have their top bit stripped and are summed,
     * not OR-ed, so malformed input decodes to the same value as before.
     */
    if ((lead & 0xE0) == 0xC0) {
      if (len > 1) {
        /* 0x00000080 - 0x000007FF */
        *ucs = ((lead & 0x1F) << 6) +
                (buf[1] & 0x7Fu);
        return 2;
      }
    } else if ((lead & 0xF0) == 0xE0) {
      if (len > 2) {
        /* 0x00000800 - 0x0000FFFF */
        *ucs = ((lead & 0x0F) << 12) +
               ((buf[1] & 0x7Fu) << 6) +
                (buf[2] & 0x7Fu);
        return 3;
      }
    } else if ((lead & 0xF8) == 0xF0) {
      if (len > 3) {
        /* 0x00010000 - 0x001FFFFF */
        *ucs = ((lead & 0x07) << 18) +
               ((buf[1] & 0x7Fu) << 12) +
               ((buf[2] & 0x7Fu) << 6) +
                (buf[3] & 0x7Fu);
        return 4;
      }
    } else if ((lead & 0xFC) == 0xF8) {
      if (len > 4) {
        /* 0x00200000 - 0x03FFFFFF */
        *ucs = ((lead & 0x03) << 24) +
               ((buf[1] & 0x7Fu) << 18) +
               ((buf[2] & 0x7Fu) << 12) +
               ((buf[3] & 0x7Fu) << 6) +
                (buf[4] & 0x7Fu);
        return 5;
      }
    }
  }

  *ucs = (unsigned int) '?'; /* bad utf-8 string */
  return -1;
}
00233 
00234 #endif /* X11 only */
00235 
00236 /*
00237  * End of "$Id: $".
00238  */