fltk 1.3.0rc3
About: FLTK (Fast Light Tool Kit) is a cross-platform C++ GUI toolkit for UNIX/Linux (X11), Microsoft Windows, and MacOS X. Release candidate.
  SfR Fresh Dox: fltk-1.3.0rc3-source.tar.gz (unofficial and still experimental doxygen-generated source code documentation)

cjk_tab_to_h.c

Go to the documentation of this file.
00001 /* $XFree86: xc/lib/X11/lcUniConv/cjk_tab_to_h.c,v 1.2 2000/12/04 18:49:31 dawes Exp $ */
00002 
00003 /*
00004  * Generates a CJK character set table from a .TXT table as found on
00005  * ftp.unicode.org or in the X nls directory.
00006  * Examples:
00007  *
00008  *   ./cjk_tab_to_h GB2312.1980-0 gb2312 > gb2312.h < gb2312
00009  *   ./cjk_tab_to_h JISX0208.1983-0 jisx0208 > jisx0208.h < jis0208
00010  *   ./cjk_tab_to_h KSC5601.1987-0 ksc5601 > ksc5601.h < ksc5601
00011  *
00012  *   ./cjk_tab_to_h GB2312.1980-0 gb2312 > gb2312.h < GB2312.TXT
00013  *   ./cjk_tab_to_h JISX0208.1983-0 jisx0208 > jisx0208.h < JIS0208.TXT
00014  *   ./cjk_tab_to_h JISX0212.1990-0 jisx0212 > jisx0212.h < JIS0212.TXT
00015  *   ./cjk_tab_to_h KSC5601.1987-0 ksc5601 > ksc5601.h < KSC5601.TXT
00016  *   ./cjk_tab_to_h KSX1001.1992-0 ksc5601 > ksc5601.h < KSX1001.TXT
00017  *
00018  *   ./cjk_tab_to_h BIG5 big5 > big5.h < BIG5.TXT
00019  *
00020  *   ./cjk_tab_to_h JOHAB johab > johab.h < JOHAB.TXT
00021  */
00022 
00023 #include <stdio.h>
00024 #include <stdlib.h>
00025 #include <stdbool.h>
00026 #include <string.h>
00027 
/* A half-open range [start, end) of consecutive table indices. */
typedef struct {
  int start;   /* first index belonging to the block */
  int end;     /* one past the last index belonging to the block */
} Block;
00032 
00033 typedef struct {
00034   int rows;    /* number of possible values for the 1st byte */
00035   int cols;    /* number of possible values for the 2nd byte */
00036   int (*row_byte) (int row); /* returns the 1st byte value for a given row */
00037   int (*col_byte) (int col); /* returns the 2nd byte value for a given col */
00038   int (*byte_row) (int byte); /* converts a 1st byte value to a row, else -1 */
00039   int (*byte_col) (int byte); /* converts a 2nd byte value to a col, else -1 */
00040   const char* check_row_expr; /* format string for 1st byte value checking */
00041   const char* check_col_expr; /* format string for 2nd byte value checking */
00042   const char* byte_row_expr; /* format string for 1st byte value to row */
00043   const char* byte_col_expr; /* format string for 2nd byte value to col */
00044   int** charset2uni; /* charset2uni[0..rows-1][0..cols-1] is valid */
00045   /* You'll understand the terms "row" and "col" when you buy Ken Lunde's book.
00046      Once a row is fixed, choosing a "col" is the same as choosing a "cell". */
00047   int* charsetpage; /* charsetpage[0..rows]: how large is a page for a row */
00048   int ncharsetblocks;
00049   Block* charsetblocks; /* blocks[0..nblocks-1] */
00050   int* uni2charset; /* uni2charset[0x0000..0xffff] */
00051 } Encoding;
00052 
/*
 * Writes the title banner of the generated file to standard output: an
 * empty line, a C comment containing charsetname, and another empty line.
 */
static void output_title (const char *charsetname)
{
  printf("\n"
         "/*\n"
         " * %s\n"
         " */\n"
         "\n",
         charsetname);
}
00064 
00065 /*
00066  * Reads the charset2uni table from standard input.
00067  */
00068 static void read_table (Encoding* enc)
00069 {
00070   int row, col, i, i1, i2, c, j;
00071 
00072   enc->charset2uni = (int**) malloc(enc->rows*sizeof(int*));
00073   for (row = 0; row < enc->rows; row++)
00074     enc->charset2uni[row] = (int*) malloc(enc->cols*sizeof(int));
00075 
00076   for (row = 0; row < enc->rows; row++)
00077     for (col = 0; col < enc->cols; col++)
00078       enc->charset2uni[row][col] = 0xfffd;
00079 
00080   c = getc(stdin);
00081   ungetc(c,stdin);
00082   if (c == '#') {
00083     /* Read a unicode.org style .TXT file. */
00084     for (;;) {
00085       c = getc(stdin);
00086       if (c == EOF)
00087         break;
00088       if (c == '\n' || c == ' ' || c == '\t')
00089         continue;
00090       if (c == '#') {
00091         do { c = getc(stdin); } while (!(c == EOF || c == '\n'));
00092         continue;
00093       }
00094       ungetc(c,stdin);
00095       if (scanf("0x%x", &j) != 1)
00096         exit(1);
00097       i1 = j >> 8;
00098       i2 = j & 0xff;
00099       row = enc->byte_row(i1);
00100       col = enc->byte_col(i2);
00101       if (row < 0 || col < 0) {
00102         fprintf(stderr, "lost entry for %02x %02x\n", i1, i2);
00103         exit(1);
00104       }
00105       if (scanf(" 0x%x", &enc->charset2uni[row][col]) != 1)
00106         exit(1);
00107     }
00108   } else {
00109     /* Read a table of hexadecimal Unicode values. */
00110     for (i1 = 32; i1 < 132; i1++)
00111       for (i2 = 32; i2 < 132; i2++) {
00112         i = scanf("%x", &j);
00113         if (i == EOF)
00114           goto read_done;
00115         if (i != 1)
00116           exit(1);
00117         if (j < 0 || j == 0xffff)
00118           j = 0xfffd;
00119         if (j != 0xfffd) {
00120           if (enc->byte_row(i1) < 0 || enc->byte_col(i2) < 0) {
00121             fprintf(stderr, "lost entry at %02x %02x\n", i1, i2);
00122             exit (1);
00123           }
00124           enc->charset2uni[enc->byte_row(i1)][enc->byte_col(i2)] = j;
00125         }
00126       }
00127    read_done: ;
00128   }
00129 }
00130 
00131 /*
00132  * Computes the charsetpage[0..rows] array.
00133  */
00134 static void find_charset2uni_pages (Encoding* enc)
00135 {
00136   int row, col;
00137 
00138   enc->charsetpage = (int*) malloc((enc->rows+1)*sizeof(int));
00139 
00140   for (row = 0; row <= enc->rows; row++)
00141     enc->charsetpage[row] = 0;
00142 
00143   for (row = 0; row < enc->rows; row++) {
00144     int used = 0;
00145     for (col = 0; col < enc->cols; col++)
00146       if (enc->charset2uni[row][col] != 0xfffd)
00147         used = col+1;
00148     enc->charsetpage[row] = used;
00149   }
00150 }
00151 
00152 /*
00153  * Fills in nblocks and blocks.
00154  */
00155 static void find_charset2uni_blocks (Encoding* enc)
00156 {
00157   int n, row, lastrow;
00158 
00159   enc->charsetblocks = (Block*) malloc(enc->rows*sizeof(Block));
00160 
00161   n = 0;
00162   for (row = 0; row < enc->rows; row++)
00163     if (enc->charsetpage[row] > 0 && (row == 0 || enc->charsetpage[row-1] == 0)) {
00164       for (lastrow = row; enc->charsetpage[lastrow+1] > 0; lastrow++);
00165       enc->charsetblocks[n].start = row * enc->cols;
00166       enc->charsetblocks[n].end = lastrow * enc->cols + enc->charsetpage[lastrow];
00167       n++;
00168     }
00169   enc->ncharsetblocks = n;
00170 }
00171 
00172 /*
00173  * Outputs the charset to unicode table and function.
00174  */
00175 static void output_charset2uni (const char* name, Encoding* enc)
00176 {
00177   int row, col, lastrow, col_max, i, i1_min, i1_max;
00178 
00179   find_charset2uni_pages(enc);
00180 
00181   find_charset2uni_blocks(enc);
00182 
00183   for (row = 0; row < enc->rows; row++)
00184     if (enc->charsetpage[row] > 0) {
00185       if (row == 0 || enc->charsetpage[row-1] == 0) {
00186         /* Start a new block. */
00187         for (lastrow = row; enc->charsetpage[lastrow+1] > 0; lastrow++);
00188         printf("static const unsigned short %s_2uni_page%02x[%d] = {\n",
00189                name, enc->row_byte(row),
00190                (lastrow-row) * enc->cols + enc->charsetpage[lastrow]);
00191       }
00192       printf("  /""* 0x%02x *""/\n ", enc->row_byte(row));
00193       col_max = (enc->charsetpage[row+1] > 0 ? enc->cols : enc->charsetpage[row]);
00194       for (col = 0; col < col_max; col++) {
00195         printf(" 0x%04x,", enc->charset2uni[row][col]);
00196         if ((col % 8) == 7 && (col+1 < col_max)) printf("\n ");
00197       }
00198       printf("\n");
00199       if (enc->charsetpage[row+1] == 0) {
00200         /* End a block. */
00201         printf("};\n");
00202       }
00203     }
00204   printf("\n");
00205 
00206   printf("static int\n");
00207   printf("%s_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)\n", name);
00208   printf("{\n");
00209   printf("  unsigned char c1 = s[0];\n");
00210   printf("  if (");
00211   for (i = 0; i < enc->ncharsetblocks; i++) {
00212     i1_min = enc->row_byte(enc->charsetblocks[i].start / enc->cols);
00213     i1_max = enc->row_byte((enc->charsetblocks[i].end-1) / enc->cols);
00214     if (i > 0)
00215       printf(" || ");
00216     if (i1_min == i1_max)
00217       printf("(c1 == 0x%02x)", i1_min);
00218     else
00219       printf("(c1 >= 0x%02x && c1 <= 0x%02x)", i1_min, i1_max);
00220   }
00221   printf(") {\n");
00222   printf("    if (n >= 2) {\n");
00223   printf("      unsigned char c2 = s[1];\n");
00224   printf("      if (");
00225   printf(enc->check_col_expr, "c2");
00226   printf(") {\n");
00227   printf("        unsigned int i = %d * (", enc->cols);
00228   printf(enc->byte_row_expr, "c1");
00229   printf(") + (");
00230   printf(enc->byte_col_expr, "c2");
00231   printf(");\n");
00232   printf("        unsigned short wc = 0xfffd;\n");
00233   for (i = 0; i < enc->ncharsetblocks; i++) {
00234     printf("        ");
00235     if (i > 0)
00236       printf("} else ");
00237     if (i < enc->ncharsetblocks-1)
00238       printf("if (i < %d) ", enc->charsetblocks[i+1].start);
00239     printf("{\n");
00240     printf("          if (i < %d)\n", enc->charsetblocks[i].end);
00241     printf("            wc = %s_2uni_page%02x[i", name, enc->row_byte(enc->charsetblocks[i].start / enc->cols));
00242     if (enc->charsetblocks[i].start > 0)
00243       printf("-%d", enc->charsetblocks[i].start);
00244     printf("];\n");
00245   }
00246   printf("        }\n");
00247   printf("        if (wc != 0xfffd) {\n");
00248   printf("          *pwc = (ucs4_t) wc;\n");
00249   printf("          return 2;\n");
00250   printf("        }\n");
00251   printf("      }\n");
00252   printf("      return RET_ILSEQ;\n");
00253   printf("    }\n");
00254   printf("    return RET_TOOFEW(0);\n");
00255   printf("  }\n");
00256   printf("  return RET_ILSEQ;\n");
00257   printf("}\n");
00258   printf("\n");
00259 }
00260 
00261 /*
00262  * Computes the uni2charset[0x0000..0xffff] array.
00263  */
00264 static void invert (Encoding* enc)
00265 {
00266   int row, col, j;
00267 
00268   enc->uni2charset = (int*) malloc(0x10000*sizeof(int));
00269 
00270   for (j = 0; j < 0x10000; j++)
00271     enc->uni2charset[j] = 0;
00272 
00273   for (row = 0; row < enc->rows; row++)
00274     for (col = 0; col < enc->cols; col++) {
00275       j = enc->charset2uni[row][col];
00276       if (j != 0xfffd)
00277         enc->uni2charset[j] = 0x100 * enc->row_byte(row) + enc->col_byte(col);
00278     }
00279 }
00280 
00281 /*
00282  * Outputs the unicode to charset table and function, using a linear array.
00283  * (Suitable if the table is dense.)
00284  */
00285 static void output_uni2charset_dense (const char* name, Encoding* enc)
00286 {
00287   /* Like in 8bit_tab_to_h.c */
00288   bool pages[0x100];
00289   int line[0x2000];
00290   int tableno;
00291   struct { int minline; int maxline; int usecount; } tables[0x2000];
00292   bool first;
00293   int row, col, j, p, j1, j2, t;
00294 
00295   for (p = 0; p < 0x100; p++)
00296     pages[p] = false;
00297   for (row = 0; row < enc->rows; row++)
00298     for (col = 0; col < enc->cols; col++) {
00299       j = enc->charset2uni[row][col];
00300       if (j != 0xfffd)
00301         pages[j>>8] = true;
00302     }
00303   for (j1 = 0; j1 < 0x2000; j1++) {
00304     bool all_invalid = true;
00305     for (j2 = 0; j2 < 8; j2++) {
00306       j = 8*j1+j2;
00307       if (enc->uni2charset[j] != 0)
00308         all_invalid = false;
00309     }
00310     if (all_invalid)
00311       line[j1] = -1;
00312     else
00313       line[j1] = 0;
00314   }
00315   tableno = 0;
00316   for (j1 = 0; j1 < 0x2000; j1++) {
00317     if (line[j1] >= 0) {
00318       if (tableno > 0
00319           && ((j1 > 0 && line[j1-1] == tableno-1)
00320               || ((tables[tableno-1].maxline >> 5) == (j1 >> 5)
00321                   && j1 - tables[tableno-1].maxline <= 8))) {
00322         line[j1] = tableno-1;
00323         tables[tableno-1].maxline = j1;
00324       } else {
00325         tableno++;
00326         line[j1] = tableno-1;
00327         tables[tableno-1].minline = tables[tableno-1].maxline = j1;
00328       }
00329     }
00330   }
00331   for (t = 0; t < tableno; t++) {
00332     tables[t].usecount = 0;
00333     j1 = 8*tables[t].minline;
00334     j2 = 8*(tables[t].maxline+1);
00335     for (j = j1; j < j2; j++)
00336       if (enc->uni2charset[j] != 0)
00337         tables[t].usecount++;
00338   }
00339   {
00340     p = -1;
00341     for (t = 0; t < tableno; t++)
00342       if (tables[t].usecount > 1) {
00343         p = tables[t].minline >> 5;
00344         printf("static const unsigned short %s_page%02x[%d] = {\n", name, p, 8*(tables[t].maxline-tables[t].minline+1));
00345         for (j1 = tables[t].minline; j1 <= tables[t].maxline; j1++) {
00346           if ((j1 % 0x20) == 0 && j1 > tables[t].minline)
00347             printf("  /* 0x%04x */\n", 8*j1);
00348           printf(" ");
00349           for (j2 = 0; j2 < 8; j2++) {
00350             j = 8*j1+j2;
00351             printf(" 0x%04x,", enc->uni2charset[j]);
00352           }
00353           printf(" /*0x%02x-0x%02x*/\n", 8*(j1 % 0x20), 8*(j1 % 0x20)+7);
00354         }
00355         printf("};\n");
00356       }
00357     if (p >= 0)
00358       printf("\n");
00359   }
00360   printf("static int\n%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)\n", name);
00361   printf("{\n");
00362   printf("  if (n >= 2) {\n");
00363   printf("    unsigned short c = 0;\n");
00364   first = true;
00365   for (j1 = 0; j1 < 0x2000;) {
00366     t = line[j1];
00367     for (j2 = j1; j2 < 0x2000 && line[j2] == t; j2++);
00368     if (t >= 0) {
00369       if (j1 != tables[t].minline) abort();
00370       if (j2 > tables[t].maxline+1) abort();
00371       j2 = tables[t].maxline+1;
00372       if (first)
00373         printf("    ");
00374       else
00375         printf("    else ");
00376       first = false;
00377       if (tables[t].usecount == 0) abort();
00378       if (tables[t].usecount == 1) {
00379         if (j2 != j1+1) abort();
00380         for (j = 8*j1; j < 8*j2; j++)
00381           if (enc->uni2charset[j] != 0) {
00382             printf("if (wc == 0x%04x)\n      c = 0x%02x;\n", j, enc->uni2charset[j]);
00383             break;
00384           }
00385       } else {
00386         if (j1 == 0) {
00387           printf("if (wc < 0x%04x)", 8*j2);
00388         } else {
00389           printf("if (wc >= 0x%04x && wc < 0x%04x)", 8*j1, 8*j2);
00390         }
00391         printf("\n      c = %s_page%02x[wc", name, j1 >> 5);
00392         if (tables[t].minline > 0)
00393           printf("-0x%04x", 8*j1);
00394         printf("];\n");
00395       }
00396     }
00397     j1 = j2;
00398   }
00399   printf("    if (c != 0) {\n");
00400   printf("      r[0] = (c >> 8); r[1] = (c & 0xff);\n");
00401   printf("      return 2;\n");
00402   printf("    }\n");
00403   printf("    return RET_ILSEQ;\n");
00404   printf("  }\n");
00405   printf("  return RET_TOOSMALL;\n");
00406   printf("}\n");
00407 }
00408 
00409 /*
00410  * Outputs the unicode to charset table and function, using a packed array.
00411  * (Suitable if the table is sparse.)
00412  */
00413 static void output_uni2charset_sparse (const char* name, Encoding* enc)
00414 {
00415   bool pages[0x100];
00416   Block pageblocks[0x100]; int npageblocks;
00417   int indx2charset[0x10000];
00418   int summary_indx[0x1000];
00419   int summary_used[0x1000];
00420   int i, row, col, j, p, j1, j2, indx;
00421 
00422   /* Fill pages[0x100]. */
00423   for (p = 0; p < 0x100; p++)
00424     pages[p] = false;
00425   for (row = 0; row < enc->rows; row++)
00426     for (col = 0; col < enc->cols; col++) {
00427       j = enc->charset2uni[row][col];
00428       if (j != 0xfffd)
00429         pages[j>>8] = true;
00430     }
00431 
00432 #if 0
00433   for (p = 0; p < 0x100; p++)
00434     if (pages[p]) {
00435       printf("static const unsigned short %s_page%02x[256] = {\n", name, p);
00436       for (j1 = 0; j1 < 32; j1++) {
00437         printf("  ");
00438         for (j2 = 0; j2 < 8; j2++)
00439           printf("0x%04x, ", enc->uni2charset[256*p+8*j1+j2]);
00440         printf("/""*0x%02x-0x%02x*""/\n", 8*j1, 8*j1+7);
00441       }
00442       printf("};\n");
00443     }
00444   printf("\n");
00445 #endif
00446 
00447   /* Fill summary_indx[] and summary_used[]. */
00448   indx = 0;
00449   for (j1 = 0; j1 < 0x1000; j1++) {
00450     summary_indx[j1] = indx;
00451     summary_used[j1] = 0;
00452     for (j2 = 0; j2 < 16; j2++) {
00453       j = 16*j1+j2;
00454       if (enc->uni2charset[j] != 0) {
00455         indx2charset[indx++] = enc->uni2charset[j];
00456         summary_used[j1] |= (1 << j2);
00457       }
00458     }
00459   }
00460 
00461   /* Fill npageblocks and pageblocks[]. */
00462   npageblocks = 0;
00463   for (p = 0; p < 0x100; ) {
00464     if (pages[p] && (p == 0 || !pages[p-1])) {
00465       pageblocks[npageblocks].start = 16*p;
00466       do p++; while (p < 0x100 && pages[p]);
00467       j1 = 16*p;
00468       while (summary_used[j1-1] == 0) j1--;
00469       pageblocks[npageblocks].end = j1;
00470       npageblocks++;
00471     } else
00472       p++;
00473   }
00474 
00475   printf("static const unsigned short %s_2charset[%d] = {\n", name, indx);
00476   for (i = 0; i < indx; ) {
00477     if ((i % 8) == 0) printf(" ");
00478     printf(" 0x%04x,", indx2charset[i]);
00479     i++;
00480     if ((i % 8) == 0 || i == indx) printf("\n");
00481   }
00482   printf("};\n");
00483   printf("\n");
00484   for (i = 0; i < npageblocks; i++) {
00485     printf("static const Summary16 %s_uni2indx_page%02x[%d] = {\n", name,
00486            pageblocks[i].start/16, pageblocks[i].end-pageblocks[i].start);
00487     for (j1 = pageblocks[i].start; j1 < pageblocks[i].end; ) {
00488       if (((16*j1) % 0x100) == 0) printf("  /""* 0x%04x *""/\n", 16*j1);
00489       if ((j1 % 4) == 0) printf(" ");
00490       printf(" { %4d, 0x%04x },", summary_indx[j1], summary_used[j1]);
00491       j1++;
00492       if ((j1 % 4) == 0 || j1 == pageblocks[i].end) printf("\n");
00493     }
00494     printf("};\n");
00495   }
00496   printf("\n");
00497 
00498   printf("static int\n");
00499   printf("%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)\n", name);
00500   printf("{\n");
00501   printf("  if (n >= 2) {\n");
00502   printf("    const Summary16 *summary = NULL;\n");
00503   for (i = 0; i < npageblocks; i++) {
00504     printf("    ");
00505     if (i > 0)
00506       printf("else ");
00507     printf("if (wc >= 0x%04x && wc < 0x%04x)\n",
00508            16*pageblocks[i].start, 16*pageblocks[i].end);
00509     printf("      summary = &%s_uni2indx_page%02x[(wc>>4)", name,
00510            pageblocks[i].start/16);
00511     if (pageblocks[i].start > 0)
00512       printf("-0x%03x", pageblocks[i].start);
00513     printf("];\n");
00514   }
00515   printf("    if (summary) {\n");
00516   printf("      unsigned short used = summary->used;\n");
00517   printf("      unsigned int i = wc & 0x0f;\n");
00518   printf("      if (used & ((unsigned short) 1 << i)) {\n");
00519   printf("        unsigned short c;\n");
00520   printf("        /* Keep in `used' only the bits 0..i-1. */\n");
00521   printf("        used &= ((unsigned short) 1 << i) - 1;\n");
00522   printf("        /* Add `summary->indx' and the number of bits set in `used'. */\n");
00523   printf("        used = (used & 0x5555) + ((used & 0xaaaa) >> 1);\n");
00524   printf("        used = (used & 0x3333) + ((used & 0xcccc) >> 2);\n");
00525   printf("        used = (used & 0x0f0f) + ((used & 0xf0f0) >> 4);\n");
00526   printf("        used = (used & 0x00ff) + (used >> 8);\n");
00527   printf("        c = %s_2charset[summary->indx + used];\n", name);
00528   printf("        r[0] = (c >> 8); r[1] = (c & 0xff);\n");
00529   printf("        return 2;\n");
00530   printf("      }\n");
00531   printf("    }\n");
00532   printf("    return RET_ILSEQ;\n");
00533   printf("  }\n");
00534   printf("  return RET_TOOSMALL;\n");
00535   printf("}\n");
00536 }
00537 
00538 /* ISO-2022/EUC specifics */
00539 
/* Returns the 1st byte value of a row; row 0 is byte 0x21. */
static int row_byte_normal (int row)
{
  return row + 0x21;
}
/* Returns the 2nd byte value of a col; col 0 is byte 0x21. */
static int col_byte_normal (int col)
{
  return col + 0x21;
}
/* Returns the row of a 1st byte value; negative for bytes below 0x21.
   No upper bound is enforced. */
static int byte_row_normal (int byte)
{
  const int first_byte = 0x21;

  return byte - first_byte;
}
/* Returns the col of a 2nd byte value; negative for bytes below 0x21.
   No upper bound is enforced. */
static int byte_col_normal (int byte)
{
  const int first_byte = 0x21;

  return byte - first_byte;
}
00544 
00545 static void do_normal (const char* name)
00546 {
00547   Encoding enc;
00548 
00549   enc.rows = 94;
00550   enc.cols = 94;
00551   enc.row_byte = row_byte_normal;
00552   enc.col_byte = col_byte_normal;
00553   enc.byte_row = byte_row_normal;
00554   enc.byte_col = byte_col_normal;
00555   enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
00556   enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
00557   enc.byte_row_expr = "%1$s - 0x21";
00558   enc.byte_col_expr = "%1$s - 0x21";
00559 
00560   read_table(&enc);
00561   output_charset2uni(name,&enc);
00562   invert(&enc); output_uni2charset_sparse(name,&enc);
00563 }
00564 
00565 /* Note: On first sight, the jisx0212_2charset[] table seems to be in order,
00566    starting from the charset=0x3021/uni=0x4e02 pair. But it's only mostly in
00567    order. There are 75 out-of-order values, scattered all throughout the table.
00568  */
00569 
00570 static void do_normal_only_charset2uni (const char* name)
00571 {
00572   Encoding enc;
00573 
00574   enc.rows = 94;
00575   enc.cols = 94;
00576   enc.row_byte = row_byte_normal;
00577   enc.col_byte = col_byte_normal;
00578   enc.byte_row = byte_row_normal;
00579   enc.byte_col = byte_col_normal;
00580   enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
00581   enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
00582   enc.byte_row_expr = "%1$s - 0x21";
00583   enc.byte_col_expr = "%1$s - 0x21";
00584 
00585   read_table(&enc);
00586   output_charset2uni(name,&enc);
00587 }
00588 
00589 /* CNS 11643 specifics - trick to put two tables into one */
00590 
/* Returns the "1st byte" value of a row of the combined table: the row
   within its group of 94 rows, offset by 0x21, plus 0x100 times the
   0-based group number. */
static int row_byte_cns11643 (int row)
{
  int group = row / 94;
  int row_in_group = row % 94;

  return 0x100 * group + row_in_group + 0x21;
}
/* Converts a "1st byte" value as found in the input (0x1xx, 0x2xx or 0x3xx)
   to a row of the combined table; the three ranges map to row offsets 0, 94
   and 2*94.  Returns -1 for values outside 0x100..0x3ff. */
static int byte_row_cns11643 (int byte)
{
  if (byte >= 0x100 && byte < 0x200)
    return byte - 0x121;
  if (byte >= 0x200 && byte < 0x300)
    return byte - 0x221 + 94;
  if (byte >= 0x300 && byte < 0x400)
    return byte - 0x321 + 2*94;
  return -1;
}
00600 
00601 static void do_cns11643_only_uni2charset (const char* name)
00602 {
00603   Encoding enc;
00604   int j, x;
00605 
00606   enc.rows = 3*94;
00607   enc.cols = 94;
00608   enc.row_byte = row_byte_cns11643;
00609   enc.col_byte = col_byte_normal;
00610   enc.byte_row = byte_row_cns11643;
00611   enc.byte_col = byte_col_normal;
00612   enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
00613   enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
00614   enc.byte_row_expr = "%1$s - 0x21";
00615   enc.byte_col_expr = "%1$s - 0x21";
00616 
00617   read_table(&enc);
00618   invert(&enc);
00619   /* Move the 2 plane bits into the unused bits 15 and 7. */
00620   for (j = 0; j < 0x10000; j++) {
00621     x = enc.uni2charset[j];
00622     if (x != 0) {
00623       if (x & 0x8080) abort();
00624       switch (x >> 16) {
00625         case 0: /* plane 1 */ x = (x & 0xffff) | 0x0000; break;
00626         case 1: /* plane 2 */ x = (x & 0xffff) | 0x0080; break;
00627         case 2: /* plane 3 */ x = (x & 0xffff) | 0x8000; break;
00628         default: abort();
00629       }
00630       enc.uni2charset[j] = x;
00631     }
00632   }
00633   output_uni2charset_sparse(name,&enc);
00634 }
00635 
00636 /* GBK specifics */
00637 
/* Returns the 1st byte value of a row; row 0 is byte 0x81. */
static int row_byte_gbk1 (int row)
{
  const int first_byte = 0x81;

  return first_byte + row;
}
/* Returns the 2nd byte value of a col.  Cols 0..0x3e are bytes 0x40..0x7e;
   from col 0x3f on the bytes continue at 0x80, skipping 0x7f. */
static int col_byte_gbk1 (int col)
{
  if (col < 0x3f)
    return col + 0x40;
  return col + 0x41;
}
/* Converts a 1st byte value in 0x81..0xfe to its row, else returns -1. */
static int byte_row_gbk1 (int byte)
{
  if (byte < 0x81 || byte >= 0xff)
    return -1;
  return byte - 0x81;
}
/* Converts a 2nd byte value to its col: 0x40..0x7e give cols 0..0x3e and
   0x80..0xfe give cols 0x3f..0xbd.  Returns -1 for any other value. */
static int byte_col_gbk1 (int byte)
{
  int col = -1;

  if (byte >= 0x40 && byte < 0x7f)
    col = byte - 0x40;
  else if (byte >= 0x80 && byte < 0xff)
    col = byte - 0x41;
  return col;
}
00658 
00659 static void do_gbk1 (const char* name)
00660 {
00661   Encoding enc;
00662 
00663   enc.rows = 126;
00664   enc.cols = 190;
00665   enc.row_byte = row_byte_gbk1;
00666   enc.col_byte = col_byte_gbk1;
00667   enc.byte_row = byte_row_gbk1;
00668   enc.byte_col = byte_col_gbk1;
00669   enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
00670   enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xff)";
00671   enc.byte_row_expr = "%1$s - 0x81";
00672   enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
00673 
00674   read_table(&enc);
00675   output_charset2uni(name,&enc);
00676   invert(&enc); output_uni2charset_dense(name,&enc);
00677 }
00678 
00679 static void do_gbk1_only_charset2uni (const char* name)
00680 {
00681   Encoding enc;
00682 
00683   enc.rows = 126;
00684   enc.cols = 190;
00685   enc.row_byte = row_byte_gbk1;
00686   enc.col_byte = col_byte_gbk1;
00687   enc.byte_row = byte_row_gbk1;
00688   enc.byte_col = byte_col_gbk1;
00689   enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
00690   enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xff)";
00691   enc.byte_row_expr = "%1$s - 0x81";
00692   enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
00693 
00694   read_table(&enc);
00695   output_charset2uni(name,&enc);
00696 }
00697 
/* Returns the 1st byte value of a row; row 0 is byte 0x81. */
static int row_byte_gbk2 (int row)
{
  const int first_byte = 0x81;

  return first_byte + row;
}
/* Returns the 2nd byte value of a col.  Cols 0..0x3e are bytes 0x40..0x7e;
   from col 0x3f on the bytes continue at 0x80, skipping 0x7f. */
static int col_byte_gbk2 (int col)
{
  if (col < 0x3f)
    return col + 0x40;
  return col + 0x41;
}
/* Converts a 1st byte value in 0x81..0xfe to its row, else returns -1. */
static int byte_row_gbk2 (int byte)
{
  if (byte < 0x81 || byte >= 0xff)
    return -1;
  return byte - 0x81;
}
/* Converts a 2nd byte value to its col: 0x40..0x7e give cols 0..0x3e and
   0x80..0xa0 give cols 0x3f..0x5f.  Returns -1 for any other value. */
static int byte_col_gbk2 (int byte)
{
  int col = -1;

  if (byte >= 0x40 && byte < 0x7f)
    col = byte - 0x40;
  else if (byte >= 0x80 && byte < 0xa1)
    col = byte - 0x41;
  return col;
}
00718 
00719 static void do_gbk2_only_charset2uni (const char* name)
00720 {
00721   Encoding enc;
00722 
00723   enc.rows = 126;
00724   enc.cols = 96;
00725   enc.row_byte = row_byte_gbk2;
00726   enc.col_byte = col_byte_gbk2;
00727   enc.byte_row = byte_row_gbk2;
00728   enc.byte_col = byte_col_gbk2;
00729   enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
00730   enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xa1)";
00731   enc.byte_row_expr = "%1$s - 0x81";
00732   enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
00733 
00734   read_table(&enc);
00735   output_charset2uni(name,&enc);
00736 }
00737 
00738 static void do_gbk1_only_uni2charset (const char* name)
00739 {
00740   Encoding enc;
00741 
00742   enc.rows = 126;
00743   enc.cols = 190;
00744   enc.row_byte = row_byte_gbk1;
00745   enc.col_byte = col_byte_gbk1;
00746   enc.byte_row = byte_row_gbk1;
00747   enc.byte_col = byte_col_gbk1;
00748   enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
00749   enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xff)";
00750   enc.byte_row_expr = "%1$s - 0x81";
00751   enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
00752 
00753   read_table(&enc);
00754   invert(&enc); output_uni2charset_sparse(name,&enc);
00755 }
00756 
00757 /* KSC 5601 specifics */
00758 
00759 /*
00760  * Reads the charset2uni table from standard input.
00761  */
00762 static void read_table_ksc5601 (Encoding* enc)
00763 {
00764   int row, col, i, i1, i2, c, j;
00765 
00766   enc->charset2uni = (int**) malloc(enc->rows*sizeof(int*));
00767   for (row = 0; row < enc->rows; row++)
00768     enc->charset2uni[row] = (int*) malloc(enc->cols*sizeof(int));
00769 
00770   for (row = 0; row < enc->rows; row++)
00771     for (col = 0; col < enc->cols; col++)
00772       enc->charset2uni[row][col] = 0xfffd;
00773 
00774   c = getc(stdin);
00775   ungetc(c,stdin);
00776   if (c == '#') {
00777     /* Read a unicode.org style .TXT file. */
00778     for (;;) {
00779       c = getc(stdin);
00780       if (c == EOF)
00781         break;
00782       if (c == '\n' || c == ' ' || c == '\t')
00783         continue;
00784       if (c == '#') {
00785         do { c = getc(stdin); } while (!(c == EOF || c == '\n'));
00786         continue;
00787       }
00788       ungetc(c,stdin);
00789       if (scanf("0x%x", &j) != 1)
00790         exit(1);
00791       i1 = j >> 8;
00792       i2 = j & 0xff;
00793       if (scanf(" 0x%x", &j) != 1)
00794         exit(1);
00795       /* Take only the range covered by KS C 5601.1987-0 = KS C 5601.1989-0
00796          = KS X 1001.1992, ignore the rest. */
00797       if (!(i1 >= 128+33 && i1 < 128+127 && i2 >= 128+33 && i2 < 128+127))
00798         continue;  /* KSC5601 specific */
00799       i1 &= 0x7f;  /* KSC5601 specific */
00800       i2 &= 0x7f;  /* KSC5601 specific */
00801       row = enc->byte_row(i1);
00802       col = enc->byte_col(i2);
00803       if (row < 0 || col < 0) {
00804         fprintf(stderr, "lost entry for %02x %02x\n", i1, i2);
00805         exit(1);
00806       }
00807       enc->charset2uni[row][col] = j;
00808     }
00809   } else {
00810     /* Read a table of hexadecimal Unicode values. */
00811     for (i1 = 33; i1 < 127; i1++)
00812       for (i2 = 33; i2 < 127; i2++) {
00813         i = scanf("%x", &j);
00814         if (i == EOF)
00815           goto read_done;
00816         if (i != 1)
00817           exit(1);
00818         if (j < 0 || j == 0xffff)
00819           j = 0xfffd;
00820         if (j != 0xfffd) {
00821           if (enc->byte_row(i1) < 0 || enc->byte_col(i2) < 0) {
00822             fprintf(stderr, "lost entry at %02x %02x\n", i1, i2);
00823             exit (1);
00824           }
00825           enc->charset2uni[enc->byte_row(i1)][enc->byte_col(i2)] = j;
00826         }
00827       }
00828    read_done: ;
00829   }
00830 }
00831 
00832 static void do_ksc5601 (const char* name)
00833 {
00834   Encoding enc;
00835 
00836   enc.rows = 94;
00837   enc.cols = 94;
00838   enc.row_byte = row_byte_normal;
00839   enc.col_byte = col_byte_normal;
00840   enc.byte_row = byte_row_normal;
00841   enc.byte_col = byte_col_normal;
00842   enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
00843   enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
00844   enc.byte_row_expr = "%1$s - 0x21";
00845   enc.byte_col_expr = "%1$s - 0x21";
00846 
00847   read_table_ksc5601(&enc);
00848   output_charset2uni(name,&enc);
00849   invert(&enc); output_uni2charset_sparse(name,&enc);
00850 }
00851 
00852 /* Big5 specifics */
00853 
/* Returns the 1st byte value of a row; row 0 is byte 0xa1. */
static int row_byte_big5 (int row)
{
  const int first_byte = 0xa1;

  return first_byte + row;
}
/* Returns the 2nd byte value of a col.  Cols 0..0x3e are bytes 0x40..0x7e;
   from col 0x3f on the bytes continue at 0xa1. */
static int col_byte_big5 (int col)
{
  if (col < 0x3f)
    return col + 0x40;
  return col + 0x62;
}
/* Converts a 1st byte value in 0xa1..0xfe to its row, else returns -1. */
static int byte_row_big5 (int byte)
{
  if (byte < 0xa1 || byte >= 0xff)
    return -1;
  return byte - 0xa1;
}
/* Converts a 2nd byte value to its col: 0x40..0x7e give cols 0..0x3e and
   0xa1..0xfe give cols 0x3f..0x9c.  Returns -1 for any other value. */
static int byte_col_big5 (int byte)
{
  int col = -1;

  if (byte >= 0x40 && byte < 0x7f)
    col = byte - 0x40;
  else if (byte >= 0xa1 && byte < 0xff)
    col = byte - 0x62;
  return col;
}
00874 
00875 static void do_big5 (const char* name)
00876 {
00877   Encoding enc;
00878 
00879   enc.rows = 94;
00880   enc.cols = 157;
00881   enc.row_byte = row_byte_big5;
00882   enc.col_byte = col_byte_big5;
00883   enc.byte_row = byte_row_big5;
00884   enc.byte_col = byte_col_big5;
00885   enc.check_row_expr = "%1$s >= 0xa1 && %1$s < 0xff";
00886   enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0xa1 && %1$s < 0xff)";
00887   enc.byte_row_expr = "%1$s - 0xa1";
00888   enc.byte_col_expr = "%1$s - (%1$s >= 0xa1 ? 0x62 : 0x40)";
00889 
00890   read_table(&enc);
00891   output_charset2uni(name,&enc);
00892   invert(&enc); output_uni2charset_sparse(name,&enc);
00893 }
00894 
00895 /* Johab Hangul specifics */
00896 
/* Returns the 1st byte for a zero-based Johab Hangul row; row 0 maps to 0x84. */
static int row_byte_johab_hangul (int row)
{
  const int first_lead_byte = 0x84;

  return row + first_lead_byte;
}
/*
 * Returns the 2nd byte for a zero-based Johab Hangul column.
 * Columns below 0x3e map onto bytes starting at 0x41; columns from
 * 0x3e upwards map onto bytes starting at 0x81 (0x3e + 0x43).
 */
static int col_byte_johab_hangul (int col)
{
  if (col < 0x3e)
    return col + 0x41;
  return col + 0x43;
}
/*
 * Converts a Johab Hangul 1st byte to its zero-based row.
 * Returns -1 when the byte is outside 0x84..0xd3.
 */
static int byte_row_johab_hangul (int byte)
{
  if (byte < 0x84 || byte >= 0xd4)
    return -1;
  return byte - 0x84;
}
/*
 * Converts a Johab Hangul 2nd byte to its zero-based column.
 * Bytes 0x41..0x7e yield columns 0..0x3d, bytes 0x81..0xfe yield
 * columns 0x3e..0xbb; any other byte yields -1.
 */
static int byte_col_johab_hangul (int byte)
{
  int col = -1;

  if (byte >= 0x41 && byte < 0x7f)
    col = byte - 0x41;
  else if (byte >= 0x81 && byte < 0xff)
    col = byte - 0x43;

  return col;
}
00917 
00918 static void do_johab_hangul (const char* name)
00919 {
00920   Encoding enc;
00921 
00922   enc.rows = 80;
00923   enc.cols = 188;
00924   enc.row_byte = row_byte_johab_hangul;
00925   enc.col_byte = col_byte_johab_hangul;
00926   enc.byte_row = byte_row_johab_hangul;
00927   enc.byte_col = byte_col_johab_hangul;
00928   enc.check_row_expr = "%1$s >= 0x84 && %1$s < 0xd4";
00929   enc.check_col_expr = "(%1$s >= 0x41 && %1$s < 0x7f) || (%1$s >= 0x81 && %1$s < 0xff)";
00930   enc.byte_row_expr = "%1$s - 0x84";
00931   enc.byte_col_expr = "%1$s - (%1$s >= 0x81 ? 0x43 : 0x41)";
00932 
00933   read_table(&enc);
00934   output_charset2uni(name,&enc);
00935   invert(&enc); output_uni2charset_dense(name,&enc);
00936 }
00937 
00938 /* SJIS specifics */
00939 
/*
 * Returns the 1st byte for a zero-based SJIS row.
 * Rows below 0x1f map onto bytes starting at 0x81; rows from 0x1f
 * upwards map onto bytes starting at 0xe0 (0x1f + 0xc1).
 */
static int row_byte_sjis (int row)
{
  if (row < 0x1f)
    return row + 0x81;
  return row + 0xc1;
}
/*
 * Returns the 2nd byte for a zero-based SJIS column.
 * Columns below 0x3f map onto bytes starting at 0x40; columns from
 * 0x3f upwards map onto bytes starting at 0x80 (0x3f + 0x41), which
 * skips byte 0x7f.
 */
static int col_byte_sjis (int col)
{
  if (col < 0x3f)
    return col + 0x40;
  return col + 0x41;
}
/*
 * Converts an SJIS 1st byte to its zero-based row.
 * Bytes 0x81..0x9f yield rows 0..0x1e; bytes from 0xe0 upwards (no
 * upper bound is checked) yield byte - 0xc1; anything else yields -1.
 */
static int byte_row_sjis (int byte)
{
  int row = -1;

  if (byte >= 0x81 && byte < 0xa0)
    row = byte - 0x81;
  else if (byte >= 0xe0)
    row = byte - 0xc1;

  return row;
}
/*
 * Converts an SJIS 2nd byte to its zero-based column.
 * Bytes 0x40..0x7e yield columns 0..0x3e, bytes 0x80..0xfc yield
 * columns 0x3f..0xbb; any other byte yields -1.
 */
static int byte_col_sjis (int byte)
{
  int col = -1;

  if (byte >= 0x40 && byte < 0x7f)
    col = byte - 0x40;
  else if (byte >= 0x80 && byte < 0xfd)
    col = byte - 0x41;

  return col;
}
00962 
00963 static void do_sjis (const char* name)
00964 {
00965   Encoding enc;
00966 
00967   enc.rows = 94;
00968   enc.cols = 188;
00969   enc.row_byte = row_byte_sjis;
00970   enc.col_byte = col_byte_sjis;
00971   enc.byte_row = byte_row_sjis;
00972   enc.byte_col = byte_col_sjis;
00973   enc.check_row_expr = "(%1$s >= 0x81 && %1$s < 0xa0) || (%1$s >= 0xe0)";
00974   enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xfd)";
00975   enc.byte_row_expr = "%1$s - (%1$s >= 0xe0 ? 0xc1 : 0x81)";
00976   enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
00977 
00978   read_table(&enc);
00979   output_charset2uni(name,&enc);
00980   invert(&enc); output_uni2charset_sparse(name,&enc);
00981 }
00982 
00983 /* Main program */
00984 
/*
 * Program entry point.
 *
 * Expects exactly two arguments:
 *   argv[1]  charset name, passed to output_title()
 *   argv[2]  table name, which selects the generator to run and is
 *            passed on to it
 *
 * Returns 0 on success.  On a wrong argument count or an unrecognized
 * table name, a diagnostic is printed to stderr and the process exits
 * with status 1.  As before, output_title() runs before the table name
 * is checked, so the title is already emitted when an unknown name is
 * rejected.
 */
int main (int argc, char *argv[])
{
  const char* progname;
  const char* charsetname;
  const char* name;

  /* argv[0] may be missing when argc is 0; fall back to a fixed name. */
  progname = (argc > 0 && argv[0] != NULL ? argv[0] : "cjk_tab_to_h");

  if (argc != 3) {
    fprintf(stderr, "Usage: %s charsetname name < table.txt > name.h\n",
            progname);
    exit(1);
  }
  charsetname = argv[1];
  name = argv[2];

  output_title(charsetname);

  if (!strcmp(name,"gb2312") || !strcmp(name,"gb12345ext")
      || !strcmp(name,"jisx0208") || !strcmp(name,"jisx0212"))
    do_normal(name);
  else if (!strcmp(name,"cns11643_1") || !strcmp(name,"cns11643_2")
           || !strcmp(name,"cns11643_3"))
    do_normal_only_charset2uni(name);
  else if (!strcmp(name,"cns11643_inv"))
    do_cns11643_only_uni2charset(name);
  else if (!strcmp(name,"gbkext1"))
    do_gbk1_only_charset2uni(name);
  else if (!strcmp(name,"gbkext2"))
    do_gbk2_only_charset2uni(name);
  else if (!strcmp(name,"gbkext_inv"))
    do_gbk1_only_uni2charset(name);
  else if (!strcmp(name,"cp936ext"))
    do_gbk1(name);
  else if (!strcmp(name,"ksc5601"))
    do_ksc5601(name);
  else if (!strcmp(name,"big5") || !strcmp(name,"cp950ext"))
    do_big5(name);
  else if (!strcmp(name,"johab_hangul"))
    do_johab_hangul(name);
  else if (!strcmp(name,"cp932ext"))
    do_sjis(name);
  else {
    /* Previously a silent exit(1); say which name was rejected. */
    fprintf(stderr, "%s: unknown table name \"%s\"\n", progname, name);
    exit(1);
  }

  return 0;
}