|
fltk 1.3.0rc3
About: FLTK (Fast Light Tool Kit) is a cross-platform C++ GUI toolkit for UNIX/Linux (X11), Microsoft Windows, and MacOS X. Release candidate.
SfR Fresh Dox: fltk-1.3.0rc3-source.tar.gz (unofficial and still experimental doxygen-generated source code documentation) |
/* $XFree86: xc/lib/X11/lcUniConv/cjk_tab_to_h.c,v 1.2 2000/12/04 18:49:31 dawes Exp $ */

/*
 * Generates a CJK character set table from a .TXT table as found on
 * ftp.unicode.org or in the X nls directory.
 * Examples:
 *
 *   ./cjk_tab_to_h GB2312.1980-0 gb2312 > gb2312.h < gb2312
 *   ./cjk_tab_to_h JISX0208.1983-0 jisx0208 > jisx0208.h < jis0208
 *   ./cjk_tab_to_h KSC5601.1987-0 ksc5601 > ksc5601.h < ksc5601
 *
 *   ./cjk_tab_to_h GB2312.1980-0 gb2312 > gb2312.h < GB2312.TXT
 *   ./cjk_tab_to_h JISX0208.1983-0 jisx0208 > jisx0208.h < JIS0208.TXT
 *   ./cjk_tab_to_h JISX0212.1990-0 jisx0212 > jisx0212.h < JIS0212.TXT
 *   ./cjk_tab_to_h KSC5601.1987-0 ksc5601 > ksc5601.h < KSC5601.TXT
 *   ./cjk_tab_to_h KSX1001.1992-0 ksc5601 > ksc5601.h < KSX1001.TXT
 *
 *   ./cjk_tab_to_h BIG5 big5 > big5.h < BIG5.TXT
 *
 *   ./cjk_tab_to_h JOHAB johab > johab.h < JOHAB.TXT
 */

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>

/* A half-open range [start, end) of table indices. */
typedef struct {
  int start;
  int end;
} Block;

typedef struct {
  int rows;                    /* number of possible values for the 1st byte */
  int cols;                    /* number of possible values for the 2nd byte */
  int (*row_byte) (int row);   /* returns the 1st byte value for a given row */
  int (*col_byte) (int col);   /* returns the 2nd byte value for a given col */
  int (*byte_row) (int byte);  /* converts a 1st byte value to a row, else -1 */
  int (*byte_col) (int byte);  /* converts a 2nd byte value to a col, else -1 */
  const char *check_row_expr;  /* template for 1st byte value checking */
  const char *check_col_expr;  /* template for 2nd byte value checking */
  const char *byte_row_expr;   /* template for 1st byte value to row */
  const char *byte_col_expr;   /* template for 2nd byte value to col */
  /* In the four templates above every "%1$s" stands for the name of the
     C variable holding the byte; see print_expr(). */
  int **charset2uni;           /* charset2uni[0..rows-1][0..cols-1] is valid */
  /* You'll understand the terms "row" and "col" when you buy Ken Lunde's book.
     Once a row is fixed, choosing a "col" is the same as choosing a "cell". */
  int *charsetpage;            /* charsetpage[0..rows]: how large is a page for a row */
  int ncharsetblocks;
  Block *charsetblocks;        /* blocks[0..nblocks-1] */
  int *uni2charset;            /* uni2charset[0x0000..0xffff] */
} Encoding;

/* Marker used in charset2uni[][] for "no Unicode mapping". */
enum { UNMAPPED = 0xfffd };

/*
 * malloc() wrapper that terminates the program when memory is exhausted,
 * so callers never have to deal with a NULL result.
 */
static void *xmalloc (size_t size)
{
  void *p = malloc(size);

  if (p == NULL) {
    fprintf(stderr, "out of memory\n");
    exit(1);
  }
  return p;
}

/*
 * Writes the expression template TMPL to stdout with every occurrence of
 * "%1$s" replaced by VAR.  The output is the same as what a POSIX
 * printf(tmpl, var) would produce for these templates, but without relying
 * on positional conversions or on a non-literal format string.
 */
static void print_expr (const char *tmpl, const char *var)
{
  static const char marker[] = "%1$s";
  const char *hit;

  while ((hit = strstr(tmpl, marker)) != NULL) {
    fwrite(tmpl, 1, (size_t) (hit - tmpl), stdout);
    fputs(var, stdout);
    tmpl = hit + (sizeof marker - 1);
  }
  fputs(tmpl, stdout);
}

/*
 * Outputs the file title.
 */
static void output_title (const char *charsetname)
{
  printf("\n");
  printf("/*\n");
  printf(" * %s\n", charsetname);
  printf(" */\n");
  printf("\n");
}

/*
 * Allocates enc->charset2uni and marks every cell as unmapped.
 */
static void alloc_charset2uni (Encoding *enc)
{
  int row, col;

  enc->charset2uni = xmalloc((size_t) enc->rows * sizeof *enc->charset2uni);
  for (row = 0; row < enc->rows; row++) {
    enc->charset2uni[row] = xmalloc((size_t) enc->cols * sizeof **enc->charset2uni);
    for (col = 0; col < enc->cols; col++)
      enc->charset2uni[row][col] = UNMAPPED;
  }
}

/*
 * Returns the first character of stdin without consuming it (EOF if empty).
 */
static int peek_stdin (void)
{
  int c = getc(stdin);

  ungetc(c, stdin);   /* ungetc(EOF) is a harmless no-op */
  return c;
}

/*
 * Skips white space and '#' comments in a unicode.org style file.
 * Returns true with stdin positioned on the first character of the next
 * entry, or false when the end of the input has been reached.
 */
static bool seek_next_entry (void)
{
  for (;;) {
    int c = getc(stdin);

    if (c == EOF)
      return false;
    /* '\r' is skipped as well so that CRLF files parse. */
    if (c == '\n' || c == '\r' || c == ' ' || c == '\t')
      continue;
    if (c == '#') {
      do { c = getc(stdin); } while (c != EOF && c != '\n');
      continue;
    }
    ungetc(c, stdin);
    return true;
  }
}

/*
 * Maps the byte pair (i1, i2) to a cell of the table.  Returns false when
 * either byte is rejected by the encoding or falls outside the allocated
 * rows x cols table.
 */
static bool locate_cell (const Encoding *enc, int i1, int i2, int *row, int *col)
{
  *row = enc->byte_row(i1);
  *col = enc->byte_col(i2);
  return *row >= 0 && *row < enc->rows && *col >= 0 && *col < enc->cols;
}

/*
 * Validates a Unicode value read from the input.  Every later stage stores
 * the value in 16 bits and uses it as an index into 0x10000-element arrays,
 * so anything larger is a fatal input error.
 */
static int checked_unicode (unsigned int uni)
{
  if (uni > 0xffff) {
    fprintf(stderr, "Unicode value 0x%x does not fit in 16 bits\n", uni);
    exit(1);
  }
  return (int) uni;
}

/*
 * Reads a plain table of hexadecimal Unicode values, one value for every
 * byte pair (i1, i2) with first <= i1, i2 < limit, in row-major order.
 * Reading stops silently at end of input.  Values that are negative when
 * seen as an int, 0xffff and 0xfffd all mean "unmapped".
 */
static void read_hex_table (Encoding *enc, int first, int limit)
{
  int i1, i2;

  for (i1 = first; i1 < limit; i1++)
    for (i2 = first; i2 < limit; i2++) {
      unsigned int value;
      int row, col;
      int got = scanf("%x", &value);

      if (got == EOF)
        return;
      if (got != 1)
        exit(1);
      if (value > 0x7fffffffu || value == 0xffff || value == UNMAPPED)
        continue;
      if (!locate_cell(enc, i1, i2, &row, &col)) {
        fprintf(stderr, "lost entry at %02x %02x\n", i1, i2);
        exit (1);
      }
      enc->charset2uni[row][col] = checked_unicode(value);
    }
}

/*
 * Reads the charset2uni table from standard input.
 * Input starting with '#' is parsed as a unicode.org style .TXT file
 * ("0xCODE 0xUNICODE" entries); anything else as a plain hexadecimal table.
 */
static void read_table (Encoding *enc)
{
  alloc_charset2uni(enc);

  if (peek_stdin() == '#') {
    /* Read a unicode.org style .TXT file. */
    while (seek_next_entry()) {
      unsigned int code, uni;
      int i1, i2, row, col;

      if (scanf("0x%x", &code) != 1)
        exit(1);
      i1 = (int) (code >> 8);
      i2 = (int) (code & 0xff);
      if (!locate_cell(enc, i1, i2, &row, &col)) {
        fprintf(stderr, "lost entry for %02x %02x\n", i1, i2);
        exit(1);
      }
      if (scanf(" 0x%x", &uni) != 1)
        exit(1);
      enc->charset2uni[row][col] = checked_unicode(uni);
    }
  } else {
    /* Read a table of hexadecimal Unicode values. */
    read_hex_table(enc, 32, 132);
  }
}

/*
 * Computes the charsetpage[0..rows] array: for each row the number of
 * leading columns up to and including the last mapped one.  The extra
 * element charsetpage[rows] stays 0 and acts as a sentinel.
 */
static void find_charset2uni_pages (Encoding *enc)
{
  int row, col;

  enc->charsetpage = xmalloc(((size_t) enc->rows + 1) * sizeof *enc->charsetpage);

  for (row = 0; row <= enc->rows; row++)
    enc->charsetpage[row] = 0;

  for (row = 0; row < enc->rows; row++) {
    int used = 0;
    for (col = 0; col < enc->cols; col++)
      if (enc->charset2uni[row][col] != UNMAPPED)
        used = col+1;
    enc->charsetpage[row] = used;
  }
}

/*
 * Fills in ncharsetblocks and charsetblocks: one block per maximal run of
 * consecutive non-empty rows.
 */
static void find_charset2uni_blocks (Encoding *enc)
{
  int n, row, lastrow;

  enc->charsetblocks = xmalloc((size_t) enc->rows * sizeof *enc->charsetblocks);

  n = 0;
  for (row = 0; row < enc->rows; row++)
    if (enc->charsetpage[row] > 0 && (row == 0 || enc->charsetpage[row-1] == 0)) {
      /* Terminates thanks to the charsetpage[rows] == 0 sentinel. */
      for (lastrow = row; enc->charsetpage[lastrow+1] > 0; lastrow++)
        ;
      enc->charsetblocks[n].start = row * enc->cols;
      enc->charsetblocks[n].end = lastrow * enc->cols + enc->charsetpage[lastrow];
      n++;
    }
  enc->ncharsetblocks = n;
}

/*
 * Outputs the charset to unicode table and function.
 */
static void output_charset2uni (const char *name, Encoding *enc)
{
  int row, col, lastrow, col_max, i, i1_min, i1_max;

  find_charset2uni_pages(enc);

  find_charset2uni_blocks(enc);

  for (row = 0; row < enc->rows; row++)
    if (enc->charsetpage[row] > 0) {
      if (row == 0 || enc->charsetpage[row-1] == 0) {
        /* Start a new block. */
        for (lastrow = row; enc->charsetpage[lastrow+1] > 0; lastrow++)
          ;
        printf("static const unsigned short %s_2uni_page%02x[%d] = {\n",
               name, enc->row_byte(row),
               (lastrow-row) * enc->cols + enc->charsetpage[lastrow]);
      }
      printf(" /""* 0x%02x *""/\n ", enc->row_byte(row));
      /* Rows inside a block are written in full; the last one is trimmed. */
      col_max = (enc->charsetpage[row+1] > 0 ? enc->cols : enc->charsetpage[row]);
      for (col = 0; col < col_max; col++) {
        printf(" 0x%04x,", enc->charset2uni[row][col]);
        if ((col % 8) == 7 && (col+1 < col_max)) printf("\n ");
      }
      printf("\n");
      if (enc->charsetpage[row+1] == 0) {
        /* End a block. */
        printf("};\n");
      }
    }
  printf("\n");

  printf("static int\n");
  printf("%s_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)\n", name);
  printf("{\n");
  printf(" unsigned char c1 = s[0];\n");
  printf(" if (");
  for (i = 0; i < enc->ncharsetblocks; i++) {
    i1_min = enc->row_byte(enc->charsetblocks[i].start / enc->cols);
    i1_max = enc->row_byte((enc->charsetblocks[i].end-1) / enc->cols);
    if (i > 0)
      printf(" || ");
    if (i1_min == i1_max)
      printf("(c1 == 0x%02x)", i1_min);
    else
      printf("(c1 >= 0x%02x && c1 <= 0x%02x)", i1_min, i1_max);
  }
  printf(") {\n");
  printf(" if (n >= 2) {\n");
  printf(" unsigned char c2 = s[1];\n");
  printf(" if (");
  print_expr(enc->check_col_expr, "c2");
  printf(") {\n");
  printf(" unsigned int i = %d * (", enc->cols);
  print_expr(enc->byte_row_expr, "c1");
  printf(") + (");
  print_expr(enc->byte_col_expr, "c2");
  printf(");\n");
  printf(" unsigned short wc = 0xfffd;\n");
  for (i = 0; i < enc->ncharsetblocks; i++) {
    printf(" ");
    if (i > 0)
      printf("} else ");
    if (i < enc->ncharsetblocks-1)
      printf("if (i < %d) ", enc->charsetblocks[i+1].start);
    printf("{\n");
    printf(" if (i < %d)\n", enc->charsetblocks[i].end);
    printf(" wc = %s_2uni_page%02x[i", name,
           enc->row_byte(enc->charsetblocks[i].start / enc->cols));
    if (enc->charsetblocks[i].start > 0)
      printf("-%d", enc->charsetblocks[i].start);
    printf("];\n");
  }
  fputs(" }\n"
        " if (wc != 0xfffd) {\n"
        " *pwc = (ucs4_t) wc;\n"
        " return 2;\n"
        " }\n"
        " }\n"
        " return RET_ILSEQ;\n"
        " }\n"
        " return RET_TOOFEW(0);\n"
        " }\n"
        " return RET_ILSEQ;\n"
        "}\n"
        "\n", stdout);
}

/*
 * Computes the uni2charset[0x0000..0xffff] array.  An entry of 0 means that
 * the Unicode value has no counterpart in the charset.  The values stored
 * in charset2uni were limited to 16 bits when they were read, so they are
 * valid indices here.
 */
static void invert (Encoding *enc)
{
  int row, col, j;

  enc->uni2charset = xmalloc(0x10000 * sizeof *enc->uni2charset);

  for (j = 0; j < 0x10000; j++)
    enc->uni2charset[j] = 0;

  for (row = 0; row < enc->rows; row++)
    for (col = 0; col < enc->cols; col++) {
      j = enc->charset2uni[row][col];
      if (j != UNMAPPED)
        enc->uni2charset[j] = 0x100 * enc->row_byte(row) + enc->col_byte(col);
    }
}

/*
 * Outputs the unicode to charset table and function, using a linear array.
 * (Suitable if the table is dense.)
 */
static void output_uni2charset_dense (const char *name, Encoding *enc)
{
  /* Like in 8bit_tab_to_h.c */
  int line[0x2000];   /* per group of 8 code points: owning table, or -1 */
  int tableno;
  struct { int minline; int maxline; int usecount; } tables[0x2000];
  bool first;
  int j, p, j1, j2, t;

  /* Mark the 8-code-point lines that contain at least one mapping. */
  for (j1 = 0; j1 < 0x2000; j1++) {
    bool all_invalid = true;
    for (j2 = 0; j2 < 8; j2++) {
      j = 8*j1+j2;
      if (enc->uni2charset[j] != 0)
        all_invalid = false;
    }
    line[j1] = (all_invalid ? -1 : 0);
  }

  /* Group the used lines into tables.  A line joins the previous table when
     it is adjacent to it, or lies in the same 256-code-point page at most
     8 lines further on. */
  tableno = 0;
  for (j1 = 0; j1 < 0x2000; j1++) {
    if (line[j1] >= 0) {
      if (tableno > 0
          && ((j1 > 0 && line[j1-1] == tableno-1)
              || ((tables[tableno-1].maxline >> 5) == (j1 >> 5)
                  && j1 - tables[tableno-1].maxline <= 8))) {
        line[j1] = tableno-1;
        tables[tableno-1].maxline = j1;
      } else {
        tableno++;
        line[j1] = tableno-1;
        tables[tableno-1].minline = tables[tableno-1].maxline = j1;
      }
    }
  }

  /* Count the mappings in each table. */
  for (t = 0; t < tableno; t++) {
    tables[t].usecount = 0;
    j1 = 8*tables[t].minline;
    j2 = 8*(tables[t].maxline+1);
    for (j = j1; j < j2; j++)
      if (enc->uni2charset[j] != 0)
        tables[t].usecount++;
  }

  /* Emit an array for every table holding more than one mapping. */
  p = -1;
  for (t = 0; t < tableno; t++)
    if (tables[t].usecount > 1) {
      p = tables[t].minline >> 5;
      printf("static const unsigned short %s_page%02x[%d] = {\n", name, p,
             8*(tables[t].maxline-tables[t].minline+1));
      for (j1 = tables[t].minline; j1 <= tables[t].maxline; j1++) {
        if ((j1 % 0x20) == 0 && j1 > tables[t].minline)
          printf(" /* 0x%04x */\n", 8*j1);
        printf(" ");
        for (j2 = 0; j2 < 8; j2++) {
          j = 8*j1+j2;
          printf(" 0x%04x,", enc->uni2charset[j]);
        }
        printf(" /*0x%02x-0x%02x*/\n", 8*(j1 % 0x20), 8*(j1 % 0x20)+7);
      }
      printf("};\n");
    }
  if (p >= 0)
    printf("\n");

  printf("static int\n%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)\n", name);
  printf("{\n");
  printf(" if (n >= 2) {\n");
  printf(" unsigned short c = 0;\n");
  first = true;
  for (j1 = 0; j1 < 0x2000;) {
    t = line[j1];
    for (j2 = j1; j2 < 0x2000 && line[j2] == t; j2++)
      ;
    if (t >= 0) {
      if (j1 != tables[t].minline) abort();
      if (j2 > tables[t].maxline+1) abort();
      /* A table may contain unused lines; jump to its real end. */
      j2 = tables[t].maxline+1;
      if (first)
        printf(" ");
      else
        printf(" else ");
      first = false;
      if (tables[t].usecount == 0) abort();
      if (tables[t].usecount == 1) {
        /* A single mapping is emitted as a direct comparison. */
        if (j2 != j1+1) abort();
        for (j = 8*j1; j < 8*j2; j++)
          if (enc->uni2charset[j] != 0) {
            printf("if (wc == 0x%04x)\n c = 0x%02x;\n", j, enc->uni2charset[j]);
            break;
          }
      } else {
        if (j1 == 0) {
          printf("if (wc < 0x%04x)", 8*j2);
        } else {
          printf("if (wc >= 0x%04x && wc < 0x%04x)", 8*j1, 8*j2);
        }
        printf("\n c = %s_page%02x[wc", name, j1 >> 5);
        if (tables[t].minline > 0)
          printf("-0x%04x", 8*j1);
        printf("];\n");
      }
    }
    j1 = j2;
  }
  fputs(" if (c != 0) {\n"
        " r[0] = (c >> 8); r[1] = (c & 0xff);\n"
        " return 2;\n"
        " }\n"
        " return RET_ILSEQ;\n"
        " }\n"
        " return RET_TOOSMALL;\n"
        "}\n", stdout);
}

/*
 * Outputs the unicode to charset table and function, using a packed array.
 * (Suitable if the table is sparse.)
 */
static void output_uni2charset_sparse (const char *name, Encoding *enc)
{
  bool pages[0x100];          /* which 256-code-point pages are in use */
  Block pageblocks[0x100]; int npageblocks;
  int indx2charset[0x10000];  /* the mapped values, packed in Unicode order */
  int summary_indx[0x1000];   /* per 16 code points: first index in indx2charset */
  int summary_used[0x1000];   /* per 16 code points: bit mask of mapped ones */
  int i, row, col, j, p, j1, j2, indx;

  /* Fill pages[0x100]. */
  for (p = 0; p < 0x100; p++)
    pages[p] = false;
  for (row = 0; row < enc->rows; row++)
    for (col = 0; col < enc->cols; col++) {
      j = enc->charset2uni[row][col];
      if (j != UNMAPPED)
        pages[j>>8] = true;
    }

  /* Fill summary_indx[] and summary_used[]. */
  indx = 0;
  for (j1 = 0; j1 < 0x1000; j1++) {
    summary_indx[j1] = indx;
    summary_used[j1] = 0;
    for (j2 = 0; j2 < 16; j2++) {
      j = 16*j1+j2;
      if (enc->uni2charset[j] != 0) {
        indx2charset[indx++] = enc->uni2charset[j];
        summary_used[j1] |= (1 << j2);
      }
    }
  }

  /* Fill npageblocks and pageblocks[]. */
  npageblocks = 0;
  for (p = 0; p < 0x100; ) {
    if (pages[p] && (p == 0 || !pages[p-1])) {
      pageblocks[npageblocks].start = 16*p;
      do p++; while (p < 0x100 && pages[p]);
      /* Trim trailing empty summaries; the last page of the run is in use,
         so the scan stops inside the block. */
      j1 = 16*p;
      while (summary_used[j1-1] == 0) j1--;
      pageblocks[npageblocks].end = j1;
      npageblocks++;
    } else
      p++;
  }

  printf("static const unsigned short %s_2charset[%d] = {\n", name, indx);
  for (i = 0; i < indx; ) {
    if ((i % 8) == 0) printf(" ");
    printf(" 0x%04x,", indx2charset[i]);
    i++;
    if ((i % 8) == 0 || i == indx) printf("\n");
  }
  printf("};\n");
  printf("\n");
  for (i = 0; i < npageblocks; i++) {
    printf("static const Summary16 %s_uni2indx_page%02x[%d] = {\n", name,
           pageblocks[i].start/16, pageblocks[i].end-pageblocks[i].start);
    for (j1 = pageblocks[i].start; j1 < pageblocks[i].end; ) {
      if (((16*j1) % 0x100) == 0) printf(" /""* 0x%04x *""/\n", 16*j1);
      if ((j1 % 4) == 0) printf(" ");
      printf(" { %4d, 0x%04x },", summary_indx[j1], summary_used[j1]);
      j1++;
      if ((j1 % 4) == 0 || j1 == pageblocks[i].end) printf("\n");
    }
    printf("};\n");
  }
  printf("\n");

  printf("static int\n");
  printf("%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)\n", name);
  printf("{\n");
  printf(" if (n >= 2) {\n");
  printf(" const Summary16 *summary = NULL;\n");
  for (i = 0; i < npageblocks; i++) {
    printf(" ");
    if (i > 0)
      printf("else ");
    printf("if (wc >= 0x%04x && wc < 0x%04x)\n",
           16*pageblocks[i].start, 16*pageblocks[i].end);
    printf(" summary = &%s_uni2indx_page%02x[(wc>>4)", name,
           pageblocks[i].start/16);
    if (pageblocks[i].start > 0)
      printf("-0x%03x", pageblocks[i].start);
    printf("];\n");
  }
  fputs(" if (summary) {\n"
        " unsigned short used = summary->used;\n"
        " unsigned int i = wc & 0x0f;\n"
        " if (used & ((unsigned short) 1 << i)) {\n"
        " unsigned short c;\n"
        " /* Keep in `used' only the bits 0..i-1. */\n"
        " used &= ((unsigned short) 1 << i) - 1;\n"
        " /* Add `summary->indx' and the number of bits set in `used'. */\n"
        " used = (used & 0x5555) + ((used & 0xaaaa) >> 1);\n"
        " used = (used & 0x3333) + ((used & 0xcccc) >> 2);\n"
        " used = (used & 0x0f0f) + ((used & 0xf0f0) >> 4);\n"
        " used = (used & 0x00ff) + (used >> 8);\n", stdout);
  printf(" c = %s_2charset[summary->indx + used];\n", name);
  fputs(" r[0] = (c >> 8); r[1] = (c & 0xff);\n"
        " return 2;\n"
        " }\n"
        " }\n"
        " return RET_ILSEQ;\n"
        " }\n"
        " return RET_TOOSMALL;\n"
        "}\n", stdout);
}

/* ISO-2022/EUC specifics */

static int row_byte_normal (int row) { return 0x21+row; }
static int col_byte_normal (int col) { return 0x21+col; }
/* Both converters honour the "else -1" contract for bytes outside 0x21..0x7e. */
static int byte_row_normal (int byte) {
  return (byte >= 0x21 && byte < 0x7f ? byte-0x21 : -1);
}
static int byte_col_normal (int byte) {
  return (byte >= 0x21 && byte < 0x7f ? byte-0x21 : -1);
}

/* Returns the 94x94 ISO-2022/EUC layout; all table pointers are NULL. */
static Encoding encoding_normal (void)
{
  Encoding enc = {
    .rows = 94,
    .cols = 94,
    .row_byte = row_byte_normal,
    .col_byte = col_byte_normal,
    .byte_row = byte_row_normal,
    .byte_col = byte_col_normal,
    .check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f",
    .check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f",
    .byte_row_expr = "%1$s - 0x21",
    .byte_col_expr = "%1$s - 0x21"
  };
  return enc;
}

static void do_normal (const char *name)
{
  Encoding enc = encoding_normal();

  read_table(&enc);
  output_charset2uni(name,&enc);
  invert(&enc); output_uni2charset_sparse(name,&enc);
}

/* Note: On first sight, the jisx0212_2charset[] table seems to be in order,
   starting from the charset=0x3021/uni=0x4e02 pair. But it's only mostly in
   order. There are 75 out-of-order values, scattered all throughout the table.
 */

static void do_normal_only_charset2uni (const char *name)
{
  Encoding enc = encoding_normal();

  read_table(&enc);
  output_charset2uni(name,&enc);
}

/* CNS 11643 specifics - trick to put two tables into one */

static int row_byte_cns11643 (int row) {
  return 0x100 * (row / 94) + (row % 94) + 0x21;
}
static int byte_row_cns11643 (int byte) {
  return (byte >= 0x100 && byte < 0x200 ? byte-0x121 :
          byte >= 0x200 && byte < 0x300 ? byte-0x221+94 :
          byte >= 0x300 && byte < 0x400 ? byte-0x321+2*94 :
          -1);
}

/* Returns the three-plane CNS 11643 layout; all table pointers are NULL. */
static Encoding encoding_cns11643 (void)
{
  Encoding enc = {
    .rows = 3*94,
    .cols = 94,
    .row_byte = row_byte_cns11643,
    .col_byte = col_byte_normal,
    .byte_row = byte_row_cns11643,
    .byte_col = byte_col_normal,
    .check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f",
    .check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f",
    .byte_row_expr = "%1$s - 0x21",
    .byte_col_expr = "%1$s - 0x21"
  };
  return enc;
}

static void do_cns11643_only_uni2charset (const char *name)
{
  Encoding enc = encoding_cns11643();
  int j, x;

  read_table(&enc);
  invert(&enc);
  /* Move the 2 plane bits into the unused bits 15 and 7. */
  for (j = 0; j < 0x10000; j++) {
    x = enc.uni2charset[j];
    if (x != 0) {
      if (x & 0x8080) abort();
      switch (x >> 16) {
        case 0: /* plane 1 */ x = (x & 0xffff) | 0x0000; break;
        case 1: /* plane 2 */ x = (x & 0xffff) | 0x0080; break;
        case 2: /* plane 3 */ x = (x & 0xffff) | 0x8000; break;
        default: abort();
      }
      enc.uni2charset[j] = x;
    }
  }
  output_uni2charset_sparse(name,&enc);
}

/* GBK specifics */

static int row_byte_gbk1 (int row) {
  return 0x81+row;
}
static int col_byte_gbk1 (int col) {
  return (col >= 0x3f ? 0x41 : 0x40) + col;
}
static int byte_row_gbk1 (int byte) {
  if (byte >= 0x81 && byte < 0xff)
    return byte-0x81;
  else
    return -1;
}
static int byte_col_gbk1 (int byte) {
  if (byte >= 0x40 && byte < 0x7f)
    return byte-0x40;
  else if (byte >= 0x80 && byte < 0xff)
    return byte-0x41;
  else
    return -1;
}

/* Returns the 126x190 GBK layout; all table pointers are NULL. */
static Encoding encoding_gbk1 (void)
{
  Encoding enc = {
    .rows = 126,
    .cols = 190,
    .row_byte = row_byte_gbk1,
    .col_byte = col_byte_gbk1,
    .byte_row = byte_row_gbk1,
    .byte_col = byte_col_gbk1,
    .check_row_expr = "%1$s >= 0x81 && %1$s < 0xff",
    .check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xff)",
    .byte_row_expr = "%1$s - 0x81",
    .byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)"
  };
  return enc;
}

static void do_gbk1 (const char *name)
{
  Encoding enc = encoding_gbk1();

  read_table(&enc);
  output_charset2uni(name,&enc);
  invert(&enc); output_uni2charset_dense(name,&enc);
}

static void do_gbk1_only_charset2uni (const char *name)
{
  Encoding enc = encoding_gbk1();

  read_table(&enc);
  output_charset2uni(name,&enc);
}

static int row_byte_gbk2 (int row) {
  return 0x81+row;
}
static int col_byte_gbk2 (int col) {
  return (col >= 0x3f ? 0x41 : 0x40) + col;
}
static int byte_row_gbk2 (int byte) {
  if (byte >= 0x81 && byte < 0xff)
    return byte-0x81;
  else
    return -1;
}
static int byte_col_gbk2 (int byte) {
  if (byte >= 0x40 && byte < 0x7f)
    return byte-0x40;
  else if (byte >= 0x80 && byte < 0xa1)
    return byte-0x41;
  else
    return -1;
}

static void do_gbk2_only_charset2uni (const char *name)
{
  Encoding enc = {
    .rows = 126,
    .cols = 96,
    .row_byte = row_byte_gbk2,
    .col_byte = col_byte_gbk2,
    .byte_row = byte_row_gbk2,
    .byte_col = byte_col_gbk2,
    .check_row_expr = "%1$s >= 0x81 && %1$s < 0xff",
    .check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xa1)",
    .byte_row_expr = "%1$s - 0x81",
    .byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)"
  };

  read_table(&enc);
  output_charset2uni(name,&enc);
}

static void do_gbk1_only_uni2charset (const char *name)
{
  Encoding enc = encoding_gbk1();

  read_table(&enc);
  invert(&enc); output_uni2charset_sparse(name,&enc);
}

/* KSC 5601 specifics */

/*
 * Reads the charset2uni table from standard input.
 * Same as read_table(), except that unicode.org style input is given in
 * EUC form (both bytes with the high bit set) and entries outside the
 * 94x94 range are ignored, and that the plain hexadecimal table covers the
 * byte values 33..126 only.
 */
static void read_table_ksc5601 (Encoding *enc)
{
  alloc_charset2uni(enc);

  if (peek_stdin() == '#') {
    /* Read a unicode.org style .TXT file. */
    while (seek_next_entry()) {
      unsigned int code, uni;
      int i1, i2, row, col;

      if (scanf("0x%x", &code) != 1)
        exit(1);
      i1 = (int) (code >> 8);
      i2 = (int) (code & 0xff);
      if (scanf(" 0x%x", &uni) != 1)
        exit(1);
      /* Take only the range covered by KS C 5601.1987-0 = KS C 5601.1989-0
         = KS X 1001.1992, ignore the rest. */
      if (!(i1 >= 128+33 && i1 < 128+127 && i2 >= 128+33 && i2 < 128+127))
        continue;    /* KSC5601 specific */
      i1 &= 0x7f;    /* KSC5601 specific */
      i2 &= 0x7f;    /* KSC5601 specific */
      if (!locate_cell(enc, i1, i2, &row, &col)) {
        fprintf(stderr, "lost entry for %02x %02x\n", i1, i2);
        exit(1);
      }
      enc->charset2uni[row][col] = checked_unicode(uni);
    }
  } else {
    /* Read a table of hexadecimal Unicode values. */
    read_hex_table(enc, 33, 127);
  }
}

static void do_ksc5601 (const char *name)
{
  Encoding enc = encoding_normal();

  read_table_ksc5601(&enc);
  output_charset2uni(name,&enc);
  invert(&enc); output_uni2charset_sparse(name,&enc);
}

/* Big5 specifics */

static int row_byte_big5 (int row) {
  return 0xa1+row;
}
static int col_byte_big5 (int col) {
  return (col >= 0x3f ? 0x62 : 0x40) + col;
}
static int byte_row_big5 (int byte) {
  if (byte >= 0xa1 && byte < 0xff)
    return byte-0xa1;
  else
    return -1;
}
static int byte_col_big5 (int byte) {
  if (byte >= 0x40 && byte < 0x7f)
    return byte-0x40;
  else if (byte >= 0xa1 && byte < 0xff)
    return byte-0x62;
  else
    return -1;
}

static void do_big5 (const char *name)
{
  Encoding enc = {
    .rows = 94,
    .cols = 157,
    .row_byte = row_byte_big5,
    .col_byte = col_byte_big5,
    .byte_row = byte_row_big5,
    .byte_col = byte_col_big5,
    .check_row_expr = "%1$s >= 0xa1 && %1$s < 0xff",
    .check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0xa1 && %1$s < 0xff)",
    .byte_row_expr = "%1$s - 0xa1",
    .byte_col_expr = "%1$s - (%1$s >= 0xa1 ? 0x62 : 0x40)"
  };

  read_table(&enc);
  output_charset2uni(name,&enc);
  invert(&enc); output_uni2charset_sparse(name,&enc);
}

/* Johab Hangul specifics */

static int row_byte_johab_hangul (int row) {
  return 0x84+row;
}
static int col_byte_johab_hangul (int col) {
  return (col >= 0x3e ? 0x43 : 0x41) + col;
}
static int byte_row_johab_hangul (int byte) {
  if (byte >= 0x84 && byte < 0xd4)
    return byte-0x84;
  else
    return -1;
}
static int byte_col_johab_hangul (int byte) {
  if (byte >= 0x41 && byte < 0x7f)
    return byte-0x41;
  else if (byte >= 0x81 && byte < 0xff)
    return byte-0x43;
  else
    return -1;
}

static void do_johab_hangul (const char *name)
{
  Encoding enc = {
    .rows = 80,
    .cols = 188,
    .row_byte = row_byte_johab_hangul,
    .col_byte = col_byte_johab_hangul,
    .byte_row = byte_row_johab_hangul,
    .byte_col = byte_col_johab_hangul,
    .check_row_expr = "%1$s >= 0x84 && %1$s < 0xd4",
    .check_col_expr = "(%1$s >= 0x41 && %1$s < 0x7f) || (%1$s >= 0x81 && %1$s < 0xff)",
    .byte_row_expr = "%1$s - 0x84",
    .byte_col_expr = "%1$s - (%1$s >= 0x81 ? 0x43 : 0x41)"
  };

  read_table(&enc);
  output_charset2uni(name,&enc);
  invert(&enc); output_uni2charset_dense(name,&enc);
}

/* SJIS specifics */

static int row_byte_sjis (int row) {
  return (row >= 0x1f ? 0xc1 : 0x81) + row;
}
static int col_byte_sjis (int col) {
  return (col >= 0x3f ? 0x41 : 0x40) + col;
}
static int byte_row_sjis (int byte) {
  if (byte >= 0x81 && byte < 0xa0)
    return byte-0x81;
  else if (byte >= 0xe0)
    return byte-0xc1;
  else
    return -1;
}
static int byte_col_sjis (int byte) {
  if (byte >= 0x40 && byte < 0x7f)
    return byte-0x40;
  else if (byte >= 0x80 && byte < 0xfd)
    return byte-0x41;
  else
    return -1;
}

static void do_sjis (const char *name)
{
  Encoding enc = {
    .rows = 94,
    .cols = 188,
    .row_byte = row_byte_sjis,
    .col_byte = col_byte_sjis,
    .byte_row = byte_row_sjis,
    .byte_col = byte_col_sjis,
    .check_row_expr = "(%1$s >= 0x81 && %1$s < 0xa0) || (%1$s >= 0xe0)",
    .check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xfd)",
    .byte_row_expr = "%1$s - (%1$s >= 0xe0 ? 0xc1 : 0x81)",
    .byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)"
  };

  read_table(&enc);
  output_charset2uni(name,&enc);
  invert(&enc); output_uni2charset_sparse(name,&enc);
}

/* Main program */

/*
 * Usage: cjk_tab_to_h CHARSETNAME NAME < table > header
 * CHARSETNAME is only copied into the title comment; NAME selects the table
 * layout and is used as prefix of the generated identifiers.
 * Exits with status 1 on a usage error, an unknown NAME, malformed input or
 * a failure to write the output.
 */
int main (int argc, char *argv[])
{
  const char *charsetname;
  const char *name;

  if (argc != 3) {
    fprintf(stderr, "usage: cjk_tab_to_h charsetname name < table > header\n");
    exit(1);
  }
  charsetname = argv[1];
  name = argv[2];

  output_title(charsetname);

  if (!strcmp(name,"gb2312") || !strcmp(name,"gb12345ext")
      || !strcmp(name,"jisx0208") || !strcmp(name,"jisx0212"))
    do_normal(name);
  else if (!strcmp(name,"cns11643_1") || !strcmp(name,"cns11643_2")
           || !strcmp(name,"cns11643_3"))
    do_normal_only_charset2uni(name);
  else if (!strcmp(name,"cns11643_inv"))
    do_cns11643_only_uni2charset(name);
  else if (!strcmp(name,"gbkext1"))
    do_gbk1_only_charset2uni(name);
  else if (!strcmp(name,"gbkext2"))
    do_gbk2_only_charset2uni(name);
  else if (!strcmp(name,"gbkext_inv"))
    do_gbk1_only_uni2charset(name);
  else if (!strcmp(name,"cp936ext"))
    do_gbk1(name);
  else if (!strcmp(name,"ksc5601"))
    do_ksc5601(name);
  else if (!strcmp(name,"big5") || !strcmp(name,"cp950ext"))
    do_big5(name);
  else if (!strcmp(name,"johab_hangul"))
    do_johab_hangul(name);
  else if (!strcmp(name,"cp932ext"))
    do_sjis(name);
  else {
    fprintf(stderr, "unknown table name: %s\n", name);
    exit(1);
  }

  /* The generated header goes to stdout; report a failed write. */
  if (fflush(stdout) != 0 || ferror(stdout)) {
    fprintf(stderr, "error writing output\n");
    return 1;
  }
  return 0;
}