fltk 1.3.0rc3
About: FLTK (Fast Light Tool Kit) is a cross-platform C++ GUI toolkit for UNIX/Linux (X11), Microsoft Windows, and MacOS X. Release candidate.
  SfR Fresh Dox: fltk-1.3.0rc3-source.tar.gz ("inofficial" and yet experimental doxygen-generated source code documentation)  

gbk_tab_to_h.c

Go to the documentation of this file.
00001 /* $XFree86: xc/lib/X11/lcUniConv/cjk_tab_to_h.c,v 1.2 2000/12/04 18:49:31 dawes Exp $ */
00002 
00003 /* 2009-02-17 <sparkaround@gmail.com>: Create gbk_tab_to_h.c from 
00004  * cjk_tab_to_h.c to generate GBK(cp936ext) table correctly.
00005  *
00006  *
00007  * Generates a CJK character set table from a .TXT table as found on
00008  * ftp.unicode.org or in the X nls directory.
00009  * Examples:
00010  *
00011  *   ./gbk_tab_to_h CP936EXT cp936ext > cp936ext.h < CP936EXT.TXT
00012  *
00013  */
00014 
00015 #include <stdio.h>
00016 #include <stdlib.h>
00017 #include <stdbool.h>
00018 #include <string.h>
00019 
/* A half-open range [start, end) of linear indices. */
typedef struct {
  int start;
  int end;
} Block;

/*
 * Description of one two-byte character set plus the tables derived from it.
 * You'll understand the terms "row" and "col" when you buy Ken Lunde's book.
 * Once a row is fixed, choosing a "col" is the same as choosing a "cell".
 */
typedef struct {
  /* Geometry. */
  int rows;                     /* number of possible values for the 1st byte */
  int cols;                     /* number of possible values for the 2nd byte */

  /* Index <-> byte value conversions. */
  int (*row_byte)(int row);     /* 1st byte value for a given row */
  int (*col_byte)(int col);     /* 2nd byte value for a given col */
  int (*byte_row)(int byte);    /* row for a 1st byte value, else -1 */
  int (*byte_col)(int byte);    /* col for a 2nd byte value, else -1 */

  /* printf format strings used when emitting the generated C code. */
  const char *check_row_expr;   /* validity test for the 1st byte value */
  const char *check_col_expr;   /* validity test for the 2nd byte value */
  const char *byte_row_expr;    /* 1st byte value to row */
  const char *byte_col_expr;    /* 2nd byte value to col */

  /* Tables. */
  int **charset2uni;            /* charset2uni[0..rows-1][0..cols-1] is valid */
  int *charsetpage;             /* charsetpage[0..rows]: how large is a page for a row */
  int ncharsetblocks;           /* number of entries in charsetblocks */
  Block *charsetblocks;         /* charsetblocks[0..ncharsetblocks-1] */
  int *uni2charset;             /* uni2charset[0x0000..0xffff] */
} Encoding;
00044 
/*
 * Outputs the file title: a blank line, a block comment containing
 * charsetname, and another blank line.
 */
static void output_title (const char *charsetname)
{
  printf("\n/*\n * %s\n */\n\n", charsetname);
}
00056 
00057 /*
00058  * Reads the charset2uni table from standard input.
00059  */
00060 static void read_table (Encoding* enc)
00061 {
00062   int row, col, i, i1, i2, c, j;
00063 
00064   enc->charset2uni = (int**) malloc(enc->rows*sizeof(int*));
00065   for (row = 0; row < enc->rows; row++)
00066     enc->charset2uni[row] = (int*) malloc(enc->cols*sizeof(int));
00067 
00068   for (row = 0; row < enc->rows; row++)
00069     for (col = 0; col < enc->cols; col++)
00070       enc->charset2uni[row][col] = 0xfffd;
00071 
00072   c = getc(stdin);
00073   ungetc(c,stdin);
00074   if (c == '#') {
00075     /* Read a unicode.org style .TXT file. */
00076     for (;;) {
00077       c = getc(stdin);
00078       if (c == EOF)
00079         break;
00080       if (c == '\n' || c == ' ' || c == '\t')
00081         continue;
00082       if (c == '#') {
00083         do { c = getc(stdin); } while (!(c == EOF || c == '\n'));
00084         continue;
00085       }
00086       ungetc(c,stdin);
00087       if (scanf("0x%x", &j) != 1)
00088         exit(1);
00089       i1 = j >> 8;
00090       i2 = j & 0xff;
00091       row = enc->byte_row(i1);
00092       col = enc->byte_col(i2);
00093       if (row < 0 || col < 0) {
00094         fprintf(stderr, "lost entry for %02x %02x\n", i1, i2);
00095         exit(1);
00096       }
00097       if (scanf(" 0x%x", &enc->charset2uni[row][col]) != 1)
00098         exit(1);
00099     }
00100   } else {
00101     /* Read a table of hexadecimal Unicode values. */
00102     for (i1 = 32; i1 < 132; i1++)
00103       for (i2 = 32; i2 < 132; i2++) {
00104         i = scanf("%x", &j);
00105         if (i == EOF)
00106           goto read_done;
00107         if (i != 1)
00108           exit(1);
00109         if (j < 0 || j == 0xffff)
00110           j = 0xfffd;
00111         if (j != 0xfffd) {
00112           if (enc->byte_row(i1) < 0 || enc->byte_col(i2) < 0) {
00113             fprintf(stderr, "lost entry at %02x %02x\n", i1, i2);
00114             exit (1);
00115           }
00116           enc->charset2uni[enc->byte_row(i1)][enc->byte_col(i2)] = j;
00117         }
00118       }
00119    read_done: ;
00120   }
00121 }
00122 
00123 /*
00124  * Computes the charsetpage[0..rows] array.
00125  */
00126 static void find_charset2uni_pages (Encoding* enc)
00127 {
00128   int row, col;
00129 
00130   enc->charsetpage = (int*) malloc((enc->rows+1)*sizeof(int));
00131 
00132   for (row = 0; row <= enc->rows; row++)
00133     enc->charsetpage[row] = 0;
00134 
00135   for (row = 0; row < enc->rows; row++) {
00136     int used = 0;
00137     for (col = 0; col < enc->cols; col++)
00138       if (enc->charset2uni[row][col] != 0xfffd)
00139         used = col+1;
00140     enc->charsetpage[row] = used;
00141   }
00142 }
00143 
00144 /*
00145  * Fills in nblocks and blocks.
00146  */
00147 static void find_charset2uni_blocks (Encoding* enc)
00148 {
00149   int n, row, lastrow;
00150 
00151   enc->charsetblocks = (Block*) malloc(enc->rows*sizeof(Block));
00152 
00153   n = 0;
00154   for (row = 0; row < enc->rows; row++)
00155     if (enc->charsetpage[row] > 0 && (row == 0 || enc->charsetpage[row-1] == 0)) {
00156       for (lastrow = row; enc->charsetpage[lastrow+1] > 0; lastrow++);
00157       enc->charsetblocks[n].start = row * enc->cols;
00158       enc->charsetblocks[n].end = lastrow * enc->cols + enc->charsetpage[lastrow];
00159       n++;
00160     }
00161   enc->ncharsetblocks = n;
00162 }
00163 
00164 /*
00165  * Outputs the charset to unicode table and function.
00166  */
00167 static void output_charset2uni (const char* name, Encoding* enc)
00168 {
00169   int row, col, lastrow, col_max, i, i1_min, i1_max;
00170 
00171   find_charset2uni_pages(enc);
00172 
00173   find_charset2uni_blocks(enc);
00174 
00175   for (row = 0; row < enc->rows; row++)
00176     if (enc->charsetpage[row] > 0) {
00177       if (row == 0 || enc->charsetpage[row-1] == 0) {
00178         /* Start a new block. */
00179         for (lastrow = row; enc->charsetpage[lastrow+1] > 0; lastrow++);
00180         printf("static const unsigned short %s_2uni_page%02x[%d] = {\n",
00181                name, enc->row_byte(row),
00182                (lastrow-row) * enc->cols + enc->charsetpage[lastrow]);
00183       }
00184       printf("  /""* 0x%02x *""/\n ", enc->row_byte(row));
00185       col_max = (enc->charsetpage[row+1] > 0 ? enc->cols : enc->charsetpage[row]);
00186       for (col = 0; col < col_max; col++) {
00187         printf(" 0x%04x,", enc->charset2uni[row][col]);
00188         if ((col % 8) == 7 && (col+1 < col_max)) printf("\n ");
00189       }
00190       printf("\n");
00191       if (enc->charsetpage[row+1] == 0) {
00192         /* End a block. */
00193         printf("};\n");
00194       }
00195     }
00196   printf("\n");
00197 
00198   printf("static int\n");
00199   printf("%s_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)\n", name);
00200   printf("{\n");
00201   printf("  unsigned char c1 = s[0];\n");
00202   printf("  if (");
00203   for (i = 0; i < enc->ncharsetblocks; i++) {
00204     i1_min = enc->row_byte(enc->charsetblocks[i].start / enc->cols);
00205     i1_max = enc->row_byte((enc->charsetblocks[i].end-1) / enc->cols);
00206     if (i > 0)
00207       printf(" || ");
00208     if (i1_min == i1_max)
00209       printf("(c1 == 0x%02x)", i1_min);
00210     else
00211       printf("(c1 >= 0x%02x && c1 <= 0x%02x)", i1_min, i1_max);
00212   }
00213   printf(") {\n");
00214   printf("    if (n >= 2) {\n");
00215   printf("      unsigned char c2 = s[1];\n");
00216   printf("      if (");
00217   printf(enc->check_col_expr, "c2");
00218   printf(") {\n");
00219   printf("        unsigned int i = %d * (", enc->cols);
00220   printf(enc->byte_row_expr, "c1");
00221   printf(") + (");
00222   printf(enc->byte_col_expr, "c2");
00223   printf(");\n");
00224   printf("        unsigned short wc = 0xfffd;\n");
00225   for (i = 0; i < enc->ncharsetblocks; i++) {
00226     printf("        ");
00227     if (i > 0)
00228       printf("} else ");
00229     if (i < enc->ncharsetblocks-1)
00230       printf("if (i < %d) ", enc->charsetblocks[i+1].start);
00231     printf("{\n");
00232     printf("          if (i < %d)\n", enc->charsetblocks[i].end);
00233     printf("            wc = %s_2uni_page%02x[i", name, enc->row_byte(enc->charsetblocks[i].start / enc->cols));
00234     if (enc->charsetblocks[i].start > 0)
00235       printf("-%d", enc->charsetblocks[i].start);
00236     printf("];\n");
00237   }
00238   printf("        }\n");
00239   printf("        if (wc != 0xfffd) {\n");
00240   printf("          *pwc = (ucs4_t) wc;\n");
00241   printf("          return 2;\n");
00242   printf("        }\n");
00243   printf("      }\n");
00244   printf("      return RET_ILSEQ;\n");
00245   printf("    }\n");
00246   printf("    return RET_TOOFEW(0);\n");
00247   printf("  }\n");
00248   printf("  return RET_ILSEQ;\n");
00249   printf("}\n");
00250   printf("\n");
00251 }
00252 
00253 /*
00254  * Computes the uni2charset[0x0000..0xffff] array.
00255  */
00256 static void invert (Encoding* enc)
00257 {
00258   int row, col, j;
00259 
00260   enc->uni2charset = (int*) malloc(0x10000*sizeof(int));
00261 
00262   for (j = 0; j < 0x10000; j++)
00263     enc->uni2charset[j] = 0;
00264 
00265   for (row = 0; row < enc->rows; row++)
00266     for (col = 0; col < enc->cols; col++) {
00267       j = enc->charset2uni[row][col];
00268       if (j != 0xfffd)
00269         enc->uni2charset[j] = 0x100 * enc->row_byte(row) + enc->col_byte(col);
00270     }
00271 }
00272 
00273 /*
00274  * Outputs the unicode to charset table and function, using a linear array.
00275  * (Suitable if the table is dense.)
00276  */
00277 static void output_uni2charset_dense (const char* name, Encoding* enc)
00278 {
00279   /* Like in 8bit_tab_to_h.c */
00280   bool pages[0x100];
00281   int line[0x2000];
00282   int tableno;
00283   struct { int minline; int maxline; int usecount; } tables[0x2000];
00284   bool first;
00285   int row, col, j, p, j1, j2, t;
00286 
00287   for (p = 0; p < 0x100; p++)
00288     pages[p] = false;
00289   for (row = 0; row < enc->rows; row++)
00290     for (col = 0; col < enc->cols; col++) {
00291       j = enc->charset2uni[row][col];
00292       if (j != 0xfffd)
00293         pages[j>>8] = true;
00294     }
00295   for (j1 = 0; j1 < 0x2000; j1++) {
00296     bool all_invalid = true;
00297     for (j2 = 0; j2 < 8; j2++) {
00298       j = 8*j1+j2;
00299       if (enc->uni2charset[j] != 0)
00300         all_invalid = false;
00301     }
00302     if (all_invalid)
00303       line[j1] = -1;
00304     else
00305       line[j1] = 0;
00306   }
00307   tableno = 0;
00308   for (j1 = 0; j1 < 0x2000; j1++) {
00309     if (line[j1] >= 0) {
00310       if (tableno > 0
00311           && ((j1 > 0 && line[j1-1] == tableno-1)
00312               || ((tables[tableno-1].maxline >> 5) == (j1 >> 5)
00313                   && j1 - tables[tableno-1].maxline <= 8))) {
00314         line[j1] = tableno-1;
00315         tables[tableno-1].maxline = j1;
00316       } else {
00317         tableno++;
00318         line[j1] = tableno-1;
00319         tables[tableno-1].minline = tables[tableno-1].maxline = j1;
00320       }
00321     }
00322   }
00323   for (t = 0; t < tableno; t++) {
00324     tables[t].usecount = 0;
00325     j1 = 8*tables[t].minline;
00326     j2 = 8*(tables[t].maxline+1);
00327     for (j = j1; j < j2; j++)
00328       if (enc->uni2charset[j] != 0)
00329         tables[t].usecount++;
00330   }
00331   {
00332     p = -1;
00333     for (t = 0; t < tableno; t++)
00334       if (tables[t].usecount > 1) {
00335         //p = tables[t].minline >> 5;
00336         p = tables[t].minline ;
00337         //printf("static const unsigned short %s_page%02x[%d] = {\n", name, p, 8*(tables[t].maxline-tables[t].minline+1));
00338         printf("static const unsigned short %s_page%04x[%d] = {\n", name, p, 8*(tables[t].maxline-tables[t].minline+1));
00339         for (j1 = tables[t].minline; j1 <= tables[t].maxline; j1++) {
00340           if ((j1 % 0x20) == 0 && j1 > tables[t].minline)
00341             printf("  /* 0x%04x */\n", 8*j1);
00342           printf(" ");
00343           for (j2 = 0; j2 < 8; j2++) {
00344             j = 8*j1+j2;
00345             printf(" 0x%04x,", enc->uni2charset[j]);
00346           }
00347           printf(" /*0x%02x-0x%02x*/\n", 8*(j1 % 0x20), 8*(j1 % 0x20)+7);
00348         }
00349         printf("};\n");
00350       }
00351     if (p >= 0)
00352       printf("\n");
00353   }
00354   printf("static int\n%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)\n", name);
00355   printf("{\n");
00356   printf("  if (n >= 2) {\n");
00357   printf("    unsigned short c = 0;\n");
00358   first = true;
00359   for (j1 = 0; j1 < 0x2000;) {
00360     t = line[j1];
00361     for (j2 = j1; j2 < 0x2000 && line[j2] == t; j2++);
00362     if (t >= 0) {
00363       if (j1 != tables[t].minline) abort();
00364       if (j2 > tables[t].maxline+1) abort();
00365       j2 = tables[t].maxline+1;
00366       if (first)
00367         printf("    ");
00368       else
00369         printf("    else ");
00370       first = false;
00371       if (tables[t].usecount == 0) abort();
00372       if (tables[t].usecount == 1) {
00373         if (j2 != j1+1) abort();
00374         for (j = 8*j1; j < 8*j2; j++)
00375           if (enc->uni2charset[j] != 0) {
00376             printf("if (wc == 0x%04x)\n      c = 0x%02x;\n", j, enc->uni2charset[j]);
00377             break;
00378           }
00379       } else {
00380         if (j1 == 0) {
00381           printf("if (wc < 0x%04x)", 8*j2);
00382         } else {
00383           printf("if (wc >= 0x%04x && wc < 0x%04x)", 8*j1, 8*j2);
00384         }
00385         //printf("\n      c = %s_page%02x[wc", name, j1 >> 5);
00386         printf("\n      c = %s_page%04x[wc", name, j1);
00387         if (tables[t].minline > 0)
00388           printf("-0x%04x", 8*j1);
00389         printf("];\n");
00390       }
00391     }
00392     j1 = j2;
00393   }
00394   printf("    if (c != 0) {\n");
00395   printf("      r[0] = (c >> 8); r[1] = (c & 0xff);\n");
00396   printf("      return 2;\n");
00397   printf("    }\n");
00398   printf("    return RET_ILSEQ;\n");
00399   printf("  }\n");
00400   printf("  return RET_TOOSMALL;\n");
00401   printf("}\n");
00402 }
00403 
00404 /*
00405  * Outputs the unicode to charset table and function, using a packed array.
00406  * (Suitable if the table is sparse.)
00407  */
00408 static void output_uni2charset_sparse (const char* name, Encoding* enc)
00409 {
00410   bool pages[0x100];
00411   Block pageblocks[0x100]; int npageblocks;
00412   int indx2charset[0x10000];
00413   int summary_indx[0x1000];
00414   int summary_used[0x1000];
00415   int i, row, col, j, p, j1, j2, indx;
00416 
00417   /* Fill pages[0x100]. */
00418   for (p = 0; p < 0x100; p++)
00419     pages[p] = false;
00420   for (row = 0; row < enc->rows; row++)
00421     for (col = 0; col < enc->cols; col++) {
00422       j = enc->charset2uni[row][col];
00423       if (j != 0xfffd)
00424         pages[j>>8] = true;
00425     }
00426 
00427 #if 0
00428   for (p = 0; p < 0x100; p++)
00429     if (pages[p]) {
00430       printf("static const unsigned short %s_page%02x[256] = {\n", name, p);
00431       for (j1 = 0; j1 < 32; j1++) {
00432         printf("  ");
00433         for (j2 = 0; j2 < 8; j2++)
00434           printf("0x%04x, ", enc->uni2charset[256*p+8*j1+j2]);
00435         printf("/""*0x%02x-0x%02x*""/\n", 8*j1, 8*j1+7);
00436       }
00437       printf("};\n");
00438     }
00439   printf("\n");
00440 #endif
00441 
00442   /* Fill summary_indx[] and summary_used[]. */
00443   indx = 0;
00444   for (j1 = 0; j1 < 0x1000; j1++) {
00445     summary_indx[j1] = indx;
00446     summary_used[j1] = 0;
00447     for (j2 = 0; j2 < 16; j2++) {
00448       j = 16*j1+j2;
00449       if (enc->uni2charset[j] != 0) {
00450         indx2charset[indx++] = enc->uni2charset[j];
00451         summary_used[j1] |= (1 << j2);
00452       }
00453     }
00454   }
00455 
00456   /* Fill npageblocks and pageblocks[]. */
00457   npageblocks = 0;
00458   for (p = 0; p < 0x100; ) {
00459     if (pages[p] && (p == 0 || !pages[p-1])) {
00460       pageblocks[npageblocks].start = 16*p;
00461       do p++; while (p < 0x100 && pages[p]);
00462       j1 = 16*p;
00463       while (summary_used[j1-1] == 0) j1--;
00464       pageblocks[npageblocks].end = j1;
00465       npageblocks++;
00466     } else
00467       p++;
00468   }
00469 
00470   printf("static const unsigned short %s_2charset[%d] = {\n", name, indx);
00471   for (i = 0; i < indx; ) {
00472     if ((i % 8) == 0) printf(" ");
00473     printf(" 0x%04x,", indx2charset[i]);
00474     i++;
00475     if ((i % 8) == 0 || i == indx) printf("\n");
00476   }
00477   printf("};\n");
00478   printf("\n");
00479   for (i = 0; i < npageblocks; i++) {
00480     printf("static const Summary16 %s_uni2indx_page%02x[%d] = {\n", name,
00481            pageblocks[i].start/16, pageblocks[i].end-pageblocks[i].start);
00482     for (j1 = pageblocks[i].start; j1 < pageblocks[i].end; ) {
00483       if (((16*j1) % 0x100) == 0) printf("  /""* 0x%04x *""/\n", 16*j1);
00484       if ((j1 % 4) == 0) printf(" ");
00485       printf(" { %4d, 0x%04x },", summary_indx[j1], summary_used[j1]);
00486       j1++;
00487       if ((j1 % 4) == 0 || j1 == pageblocks[i].end) printf("\n");
00488     }
00489     printf("};\n");
00490   }
00491   printf("\n");
00492 
00493   printf("static int\n");
00494   printf("%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)\n", name);
00495   printf("{\n");
00496   printf("  if (n >= 2) {\n");
00497   printf("    const Summary16 *summary = NULL;\n");
00498   for (i = 0; i < npageblocks; i++) {
00499     printf("    ");
00500     if (i > 0)
00501       printf("else ");
00502     printf("if (wc >= 0x%04x && wc < 0x%04x)\n",
00503            16*pageblocks[i].start, 16*pageblocks[i].end);
00504     printf("      summary = &%s_uni2indx_page%02x[(wc>>4)", name,
00505            pageblocks[i].start/16);
00506     if (pageblocks[i].start > 0)
00507       printf("-0x%03x", pageblocks[i].start);
00508     printf("];\n");
00509   }
00510   printf("    if (summary) {\n");
00511   printf("      unsigned short used = summary->used;\n");
00512   printf("      unsigned int i = wc & 0x0f;\n");
00513   printf("      if (used & ((unsigned short) 1 << i)) {\n");
00514   printf("        unsigned short c;\n");
00515   printf("        /* Keep in `used' only the bits 0..i-1. */\n");
00516   printf("        used &= ((unsigned short) 1 << i) - 1;\n");
00517   printf("        /* Add `summary->indx' and the number of bits set in `used'. */\n");
00518   printf("        used = (used & 0x5555) + ((used & 0xaaaa) >> 1);\n");
00519   printf("        used = (used & 0x3333) + ((used & 0xcccc) >> 2);\n");
00520   printf("        used = (used & 0x0f0f) + ((used & 0xf0f0) >> 4);\n");
00521   printf("        used = (used & 0x00ff) + (used >> 8);\n");
00522   printf("        c = %s_2charset[summary->indx + used];\n", name);
00523   printf("        r[0] = (c >> 8); r[1] = (c & 0xff);\n");
00524   printf("        return 2;\n");
00525   printf("      }\n");
00526   printf("    }\n");
00527   printf("    return RET_ILSEQ;\n");
00528   printf("  }\n");
00529   printf("  return RET_TOOSMALL;\n");
00530   printf("}\n");
00531 }
00532 
/* ISO-2022/EUC specifics */

/* Rows and columns 0..93 correspond to the byte values 0x21..0x7e. */
static int row_byte_normal (int row) { return 0x21+row; }
static int col_byte_normal (int col) { return 0x21+col; }

/* Returns the row for a 1st byte value in 0x21..0x7e, else -1. */
static int byte_row_normal (int byte)
{
  if (byte >= 0x21 && byte < 0x7f)
    return byte-0x21;
  return -1;
}

/* Returns the col for a 2nd byte value in 0x21..0x7e, else -1. */
static int byte_col_normal (int byte)
{
  if (byte >= 0x21 && byte < 0x7f)
    return byte-0x21;
  return -1;
}
00539 
00540 static void do_normal (const char* name)
00541 {
00542   Encoding enc;
00543 
00544   enc.rows = 94;
00545   enc.cols = 94;
00546   enc.row_byte = row_byte_normal;
00547   enc.col_byte = col_byte_normal;
00548   enc.byte_row = byte_row_normal;
00549   enc.byte_col = byte_col_normal;
00550   enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
00551   enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
00552   enc.byte_row_expr = "%1$s - 0x21";
00553   enc.byte_col_expr = "%1$s - 0x21";
00554 
00555   read_table(&enc);
00556   output_charset2uni(name,&enc);
00557   invert(&enc); output_uni2charset_sparse(name,&enc);
00558 }
00559 
00560 /* Note: On first sight, the jisx0212_2charset[] table seems to be in order,
00561    starting from the charset=0x3021/uni=0x4e02 pair. But it's only mostly in
00562    order. There are 75 out-of-order values, scattered all throughout the table.
00563  */
00564 
00565 static void do_normal_only_charset2uni (const char* name)
00566 {
00567   Encoding enc;
00568 
00569   enc.rows = 94;
00570   enc.cols = 94;
00571   enc.row_byte = row_byte_normal;
00572   enc.col_byte = col_byte_normal;
00573   enc.byte_row = byte_row_normal;
00574   enc.byte_col = byte_col_normal;
00575   enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
00576   enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
00577   enc.byte_row_expr = "%1$s - 0x21";
00578   enc.byte_col_expr = "%1$s - 0x21";
00579 
00580   read_table(&enc);
00581   output_charset2uni(name,&enc);
00582 }
00583 
/* CNS 11643 specifics - trick to put the tables of three planes into one */

/*
 * Rows 0..93, 94..187 and 188..281 yield 0x21..0x7e, 0x121..0x17e and
 * 0x221..0x27e respectively.
 */
static int row_byte_cns11643 (int row) {
  return 0x100 * (row / 94) + (row % 94) + 0x21;
}

/*
 * Accepts 0x121..0x17e, 0x221..0x27e and 0x321..0x37e and maps them to the
 * rows 0..93, 94..187 and 188..281; returns -1 for every other value.
 */
static int byte_row_cns11643 (int byte) {
  int plane, low;

  if (byte < 0x100 || byte >= 0x400)
    return -1;
  plane = byte >> 8;    /* 1, 2 or 3 */
  low = byte & 0xff;
  /* Without this check a low byte outside 0x21..0x7e would give a negative
     row or a row belonging to a different plane. */
  if (low < 0x21 || low >= 0x7f)
    return -1;
  return (plane-1)*94 + (low-0x21);
}
00595 
00596 static void do_cns11643_only_uni2charset (const char* name)
00597 {
00598   Encoding enc;
00599   int j, x;
00600 
00601   enc.rows = 3*94;
00602   enc.cols = 94;
00603   enc.row_byte = row_byte_cns11643;
00604   enc.col_byte = col_byte_normal;
00605   enc.byte_row = byte_row_cns11643;
00606   enc.byte_col = byte_col_normal;
00607   enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
00608   enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
00609   enc.byte_row_expr = "%1$s - 0x21";
00610   enc.byte_col_expr = "%1$s - 0x21";
00611 
00612   read_table(&enc);
00613   invert(&enc);
00614   /* Move the 2 plane bits into the unused bits 15 and 7. */
00615   for (j = 0; j < 0x10000; j++) {
00616     x = enc.uni2charset[j];
00617     if (x != 0) {
00618       if (x & 0x8080) abort();
00619       switch (x >> 16) {
00620         case 0: /* plane 1 */ x = (x & 0xffff) | 0x0000; break;
00621         case 1: /* plane 2 */ x = (x & 0xffff) | 0x0080; break;
00622         case 2: /* plane 3 */ x = (x & 0xffff) | 0x8000; break;
00623         default: abort();
00624       }
00625       enc.uni2charset[j] = x;
00626     }
00627   }
00628   output_uni2charset_sparse(name,&enc);
00629 }
00630 
/* GBK specifics */

/* Row n corresponds to the 1st byte value 0x81+n. */
static int row_byte_gbk1 (int row) {
  return row + 0x81;
}

/* Columns below 0x3f start at 0x40; the others are shifted by one more so
   that the byte value 0x7f is skipped. */
static int col_byte_gbk1 (int col) {
  if (col >= 0x3f)
    return col + 0x41;
  return col + 0x40;
}

/* Accepts the 1st byte values 0x81..0xfe, else returns -1. */
static int byte_row_gbk1 (int byte) {
  return (byte >= 0x81 && byte < 0xff) ? byte - 0x81 : -1;
}

/* Accepts the 2nd byte values 0x40..0x7e and 0x80..0xfe, else returns -1. */
static int byte_col_gbk1 (int byte) {
  if (byte >= 0x40 && byte < 0x7f)
    return byte - 0x40;
  if (byte >= 0x80 && byte < 0xff)
    return byte - 0x41;
  return -1;
}
00653 
00654 static void do_gbk1 (const char* name)
00655 {
00656   Encoding enc;
00657 
00658   enc.rows = 126;
00659   enc.cols = 190;
00660   enc.row_byte = row_byte_gbk1;
00661   enc.col_byte = col_byte_gbk1;
00662   enc.byte_row = byte_row_gbk1;
00663   enc.byte_col = byte_col_gbk1;
00664   enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
00665   enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xff)";
00666   enc.byte_row_expr = "%1$s - 0x81";
00667   enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
00668 
00669   read_table(&enc);
00670   output_charset2uni(name,&enc);
00671   invert(&enc); output_uni2charset_dense(name,&enc);
00672 }
00673 
00674 static void do_gbk1_only_charset2uni (const char* name)
00675 {
00676   Encoding enc;
00677 
00678   enc.rows = 126;
00679   enc.cols = 190;
00680   enc.row_byte = row_byte_gbk1;
00681   enc.col_byte = col_byte_gbk1;
00682   enc.byte_row = byte_row_gbk1;
00683   enc.byte_col = byte_col_gbk1;
00684   enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
00685   enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xff)";
00686   enc.byte_row_expr = "%1$s - 0x81";
00687   enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
00688 
00689   read_table(&enc);
00690   output_charset2uni(name,&enc);
00691 }
00692 
/* Row n corresponds to the 1st byte value 0x81+n. */
static int row_byte_gbk2 (int row) {
  return row + 0x81;
}

/* Columns below 0x3f start at 0x40; the others are shifted by one more so
   that the byte value 0x7f is skipped. */
static int col_byte_gbk2 (int col) {
  if (col >= 0x3f)
    return col + 0x41;
  return col + 0x40;
}

/* Accepts the 1st byte values 0x81..0xfe, else returns -1. */
static int byte_row_gbk2 (int byte) {
  return (byte >= 0x81 && byte < 0xff) ? byte - 0x81 : -1;
}

/* Accepts the 2nd byte values 0x40..0x7e and 0x80..0xa0, else returns -1. */
static int byte_col_gbk2 (int byte) {
  if (byte >= 0x40 && byte < 0x7f)
    return byte - 0x40;
  if (byte >= 0x80 && byte < 0xa1)
    return byte - 0x41;
  return -1;
}
00713 
00714 static void do_gbk2_only_charset2uni (const char* name)
00715 {
00716   Encoding enc;
00717 
00718   enc.rows = 126;
00719   enc.cols = 96;
00720   enc.row_byte = row_byte_gbk2;
00721   enc.col_byte = col_byte_gbk2;
00722   enc.byte_row = byte_row_gbk2;
00723   enc.byte_col = byte_col_gbk2;
00724   enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
00725   enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xa1)";
00726   enc.byte_row_expr = "%1$s - 0x81";
00727   enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
00728 
00729   read_table(&enc);
00730   output_charset2uni(name,&enc);
00731 }
00732 
00733 static void do_gbk1_only_uni2charset (const char* name)
00734 {
00735   Encoding enc;
00736 
00737   enc.rows = 126;
00738   enc.cols = 190;
00739   enc.row_byte = row_byte_gbk1;
00740   enc.col_byte = col_byte_gbk1;
00741   enc.byte_row = byte_row_gbk1;
00742   enc.byte_col = byte_col_gbk1;
00743   enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
00744   enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xff)";
00745   enc.byte_row_expr = "%1$s - 0x81";
00746   enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
00747 
00748   read_table(&enc);
00749   invert(&enc); output_uni2charset_sparse(name,&enc);
00750 }
00751 
00752 /* KSC 5601 specifics */
00753 
00754 /*
00755  * Reads the charset2uni table from standard input.
00756  */
00757 static void read_table_ksc5601 (Encoding* enc)
00758 {
00759   int row, col, i, i1, i2, c, j;
00760 
00761   enc->charset2uni = (int**) malloc(enc->rows*sizeof(int*));
00762   for (row = 0; row < enc->rows; row++)
00763     enc->charset2uni[row] = (int*) malloc(enc->cols*sizeof(int));
00764 
00765   for (row = 0; row < enc->rows; row++)
00766     for (col = 0; col < enc->cols; col++)
00767       enc->charset2uni[row][col] = 0xfffd;
00768 
00769   c = getc(stdin);
00770   ungetc(c,stdin);
00771   if (c == '#') {
00772     /* Read a unicode.org style .TXT file. */
00773     for (;;) {
00774       c = getc(stdin);
00775       if (c == EOF)
00776         break;
00777       if (c == '\n' || c == ' ' || c == '\t')
00778         continue;
00779       if (c == '#') {
00780         do { c = getc(stdin); } while (!(c == EOF || c == '\n'));
00781         continue;
00782       }
00783       ungetc(c,stdin);
00784       if (scanf("0x%x", &j) != 1)
00785         exit(1);
00786       i1 = j >> 8;
00787       i2 = j & 0xff;
00788       if (scanf(" 0x%x", &j) != 1)
00789         exit(1);
00790       /* Take only the range covered by KS C 5601.1987-0 = KS C 5601.1989-0
00791          = KS X 1001.1992, ignore the rest. */
00792       if (!(i1 >= 128+33 && i1 < 128+127 && i2 >= 128+33 && i2 < 128+127))
00793         continue;  /* KSC5601 specific */
00794       i1 &= 0x7f;  /* KSC5601 specific */
00795       i2 &= 0x7f;  /* KSC5601 specific */
00796       row = enc->byte_row(i1);
00797       col = enc->byte_col(i2);
00798       if (row < 0 || col < 0) {
00799         fprintf(stderr, "lost entry for %02x %02x\n", i1, i2);
00800         exit(1);
00801       }
00802       enc->charset2uni[row][col] = j;
00803     }
00804   } else {
00805     /* Read a table of hexadecimal Unicode values. */
00806     for (i1 = 33; i1 < 127; i1++)
00807       for (i2 = 33; i2 < 127; i2++) {
00808         i = scanf("%x", &j);
00809         if (i == EOF)
00810           goto read_done;
00811         if (i != 1)
00812           exit(1);
00813         if (j < 0 || j == 0xffff)
00814           j = 0xfffd;
00815         if (j != 0xfffd) {
00816           if (enc->byte_row(i1) < 0 || enc->byte_col(i2) < 0) {
00817             fprintf(stderr, "lost entry at %02x %02x\n", i1, i2);
00818             exit (1);
00819           }
00820           enc->charset2uni[enc->byte_row(i1)][enc->byte_col(i2)] = j;
00821         }
00822       }
00823    read_done: ;
00824   }
00825 }
00826 
00827 static void do_ksc5601 (const char* name)
00828 {
00829   Encoding enc;
00830 
00831   enc.rows = 94;
00832   enc.cols = 94;
00833   enc.row_byte = row_byte_normal;
00834   enc.col_byte = col_byte_normal;
00835   enc.byte_row = byte_row_normal;
00836   enc.byte_col = byte_col_normal;
00837   enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
00838   enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
00839   enc.byte_row_expr = "%1$s - 0x21";
00840   enc.byte_col_expr = "%1$s - 0x21";
00841 
00842   read_table_ksc5601(&enc);
00843   output_charset2uni(name,&enc);
00844   invert(&enc); output_uni2charset_sparse(name,&enc);
00845 }
00846 
/* Big5 specifics */

/* Row n corresponds to the 1st byte value 0xa1+n. */
static int row_byte_big5 (int row) {
  return row + 0xa1;
}

/* Columns below 0x3f start at 0x40; the others start at 0xa1. */
static int col_byte_big5 (int col) {
  if (col >= 0x3f)
    return col + 0x62;
  return col + 0x40;
}

/* Accepts the 1st byte values 0xa1..0xfe, else returns -1. */
static int byte_row_big5 (int byte) {
  return (byte >= 0xa1 && byte < 0xff) ? byte - 0xa1 : -1;
}

/* Accepts the 2nd byte values 0x40..0x7e and 0xa1..0xfe, else returns -1. */
static int byte_col_big5 (int byte) {
  if (byte >= 0x40 && byte < 0x7f)
    return byte - 0x40;
  if (byte >= 0xa1 && byte < 0xff)
    return byte - 0x62;
  return -1;
}
00869 
00870 static void do_big5 (const char* name)
00871 {
00872   Encoding enc;
00873 
00874   enc.rows = 94;
00875   enc.cols = 157;
00876   enc.row_byte = row_byte_big5;
00877   enc.col_byte = col_byte_big5;
00878   enc.byte_row = byte_row_big5;
00879   enc.byte_col = byte_col_big5;
00880   enc.check_row_expr = "%1$s >= 0xa1 && %1$s < 0xff";
00881   enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0xa1 && %1$s < 0xff)";
00882   enc.byte_row_expr = "%1$s - 0xa1";
00883   enc.byte_col_expr = "%1$s - (%1$s >= 0xa1 ? 0x62 : 0x40)";
00884 
00885   read_table(&enc);
00886   output_charset2uni(name,&enc);
00887   invert(&enc); output_uni2charset_sparse(name,&enc);
00888 }
00889 
00890 /* Johab Hangul specifics */
00891 
/* Returns the Johab Hangul 1st byte value for a row index; row 0 maps to 0x84. */
static int row_byte_johab_hangul (int row) {
  const int first_row_byte = 0x84;
  return first_row_byte + row;
}
/*
 * Returns the Johab Hangul 2nd byte value for a column index.
 * Columns below 0x3e start at byte 0x41; columns from 0x3e upward are
 * offset by 0x43, so column 0x3e lands on byte 0x81.
 */
static int col_byte_johab_hangul (int col) {
  if (col < 0x3e)
    return 0x41 + col;
  return 0x43 + col;
}
/* Converts a Johab Hangul 1st byte (0x84 <= byte < 0xd4) to its row index, else -1. */
static int byte_row_johab_hangul (int byte) {
  if (byte < 0x84 || byte >= 0xd4)
    return -1;
  return byte - 0x84;
}
/*
 * Converts a Johab Hangul 2nd byte to its column index, else -1.
 * Two disjoint byte ranges are accepted:
 *   0x41 <= byte < 0x7f  ->  byte - 0x41
 *   0x81 <= byte < 0xff  ->  byte - 0x43
 */
static int byte_col_johab_hangul (int byte) {
  if (byte >= 0x81 && byte < 0xff)
    return byte - 0x43;
  if (byte >= 0x41 && byte < 0x7f)
    return byte - 0x41;
  return -1;
}
00912 
00913 static void do_johab_hangul (const char* name)
00914 {
00915   Encoding enc;
00916 
00917   enc.rows = 80;
00918   enc.cols = 188;
00919   enc.row_byte = row_byte_johab_hangul;
00920   enc.col_byte = col_byte_johab_hangul;
00921   enc.byte_row = byte_row_johab_hangul;
00922   enc.byte_col = byte_col_johab_hangul;
00923   enc.check_row_expr = "%1$s >= 0x84 && %1$s < 0xd4";
00924   enc.check_col_expr = "(%1$s >= 0x41 && %1$s < 0x7f) || (%1$s >= 0x81 && %1$s < 0xff)";
00925   enc.byte_row_expr = "%1$s - 0x84";
00926   enc.byte_col_expr = "%1$s - (%1$s >= 0x81 ? 0x43 : 0x41)";
00927 
00928   read_table(&enc);
00929   output_charset2uni(name,&enc);
00930   invert(&enc); output_uni2charset_dense(name,&enc);
00931 }
00932 
00933 /* SJIS specifics */
00934 
/*
 * Returns the SJIS 1st byte value for a row index.
 * Rows below 0x1f start at byte 0x81; rows from 0x1f upward are offset
 * by 0xc1, so row 0x1f lands on byte 0xe0.
 */
static int row_byte_sjis (int row) {
  if (row < 0x1f)
    return 0x81 + row;
  return 0xc1 + row;
}
/*
 * Returns the SJIS 2nd byte value for a column index.
 * Columns below 0x3f start at byte 0x40; columns from 0x3f upward are
 * offset by 0x41, so column 0x3f lands on byte 0x80 (0x7f is skipped).
 */
static int col_byte_sjis (int col) {
  if (col < 0x3f)
    return 0x40 + col;
  return 0x41 + col;
}
/*
 * Converts an SJIS 1st byte to its row index, else -1.
 *   0x81 <= byte < 0xa0  ->  byte - 0x81
 *   byte >= 0xe0         ->  byte - 0xc1  (no upper bound is applied)
 */
static int byte_row_sjis (int byte) {
  if (byte >= 0xe0)
    return byte - 0xc1;
  if (byte >= 0x81 && byte < 0xa0)
    return byte - 0x81;
  return -1;
}
/*
 * Converts an SJIS 2nd byte to its column index, else -1.
 * Two disjoint byte ranges are accepted:
 *   0x40 <= byte < 0x7f  ->  byte - 0x40
 *   0x80 <= byte < 0xfd  ->  byte - 0x41
 */
static int byte_col_sjis (int byte) {
  if (byte >= 0x80 && byte < 0xfd)
    return byte - 0x41;
  if (byte >= 0x40 && byte < 0x7f)
    return byte - 0x40;
  return -1;
}
00957 
00958 static void do_sjis (const char* name)
00959 {
00960   Encoding enc;
00961 
00962   enc.rows = 94;
00963   enc.cols = 188;
00964   enc.row_byte = row_byte_sjis;
00965   enc.col_byte = col_byte_sjis;
00966   enc.byte_row = byte_row_sjis;
00967   enc.byte_col = byte_col_sjis;
00968   enc.check_row_expr = "(%1$s >= 0x81 && %1$s < 0xa0) || (%1$s >= 0xe0)";
00969   enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xfd)";
00970   enc.byte_row_expr = "%1$s - (%1$s >= 0xe0 ? 0xc1 : 0x81)";
00971   enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
00972 
00973   read_table(&enc);
00974   output_charset2uni(name,&enc);
00975   invert(&enc); output_uni2charset_sparse(name,&enc);
00976 }
00977 
00978 /* Main program */
00979 
/*
 * Entry point.  Expects exactly two arguments:
 *   argv[1]  charset name, handed to output_title()
 *   argv[2]  table name, selects which generator routine runs
 *
 * Returns 0 on success.  Exits with status 1, after printing a
 * diagnostic on stderr, when the argument count is wrong or the table
 * name is not recognised.  The table name is validated before
 * output_title() is called, so a bad name fails before any generator
 * routine runs.
 */
int main (int argc, char *argv[])
{
  /* One entry per generator routine reachable from the command line. */
  enum generator {
    GEN_NORMAL,
    GEN_NORMAL_ONLY_CHARSET2UNI,
    GEN_CNS11643_ONLY_UNI2CHARSET,
    GEN_GBK1_ONLY_CHARSET2UNI,
    GEN_GBK2_ONLY_CHARSET2UNI,
    GEN_GBK1_ONLY_UNI2CHARSET,
    GEN_GBK1,
    GEN_KSC5601,
    GEN_BIG5,
    GEN_JOHAB_HANGUL,
    GEN_SJIS
  };
  /* Accepted table names and the generator each one selects. */
  static const struct {
    const char* name;
    enum generator gen;
  } known[] = {
    { "gb2312",       GEN_NORMAL },
    { "gb12345ext",   GEN_NORMAL },
    { "jisx0208",     GEN_NORMAL },
    { "jisx0212",     GEN_NORMAL },
    { "cns11643_1",   GEN_NORMAL_ONLY_CHARSET2UNI },
    { "cns11643_2",   GEN_NORMAL_ONLY_CHARSET2UNI },
    { "cns11643_3",   GEN_NORMAL_ONLY_CHARSET2UNI },
    { "cns11643_inv", GEN_CNS11643_ONLY_UNI2CHARSET },
    { "gbkext1",      GEN_GBK1_ONLY_CHARSET2UNI },
    { "gbkext2",      GEN_GBK2_ONLY_CHARSET2UNI },
    { "gbkext_inv",   GEN_GBK1_ONLY_UNI2CHARSET },
    { "cp936ext",     GEN_GBK1 },
    { "ksc5601",      GEN_KSC5601 },
    { "big5",         GEN_BIG5 },
    { "cp950ext",     GEN_BIG5 },
    { "johab_hangul", GEN_JOHAB_HANGUL },
    { "cp932ext",     GEN_SJIS }
  };
  const size_t nknown = sizeof(known) / sizeof(known[0]);
  /* argv[0] may be absent; fall back to a fixed name for diagnostics. */
  const char* progname =
    (argc > 0 && argv[0] != NULL) ? argv[0] : "gbk_tab_to_h";
  const char* charsetname;
  const char* name;
  size_t i;

  if (argc != 3) {
    fprintf(stderr, "usage: %s CHARSETNAME tablename\n", progname);
    exit(1);
  }
  charsetname = argv[1];
  name = argv[2];

  /* Look the table name up before any generator routine is called. */
  for (i = 0; i < nknown; i++) {
    if (!strcmp(name, known[i].name))
      break;
  }
  if (i == nknown) {
    fprintf(stderr, "%s: unknown table name \"%s\"\n", progname, name);
    exit(1);
  }

  output_title(charsetname);

  switch (known[i].gen) {
  case GEN_NORMAL:
    do_normal(name);
    break;
  case GEN_NORMAL_ONLY_CHARSET2UNI:
    do_normal_only_charset2uni(name);
    break;
  case GEN_CNS11643_ONLY_UNI2CHARSET:
    do_cns11643_only_uni2charset(name);
    break;
  case GEN_GBK1_ONLY_CHARSET2UNI:
    do_gbk1_only_charset2uni(name);
    break;
  case GEN_GBK2_ONLY_CHARSET2UNI:
    do_gbk2_only_charset2uni(name);
    break;
  case GEN_GBK1_ONLY_UNI2CHARSET:
    do_gbk1_only_uni2charset(name);
    break;
  case GEN_GBK1:
    do_gbk1(name);
    break;
  case GEN_KSC5601:
    do_ksc5601(name);
    break;
  case GEN_BIG5:
    do_big5(name);
    break;
  case GEN_JOHAB_HANGUL:
    do_johab_hangul(name);
    break;
  case GEN_SJIS:
    do_sjis(name);
    break;
  default:
    /* Unreachable: every table entry uses one of the cases above. */
    exit(1);
  }

  return 0;
}