1 : /*
2 : * "streamable kanji code filter and converter"
3 : * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4 : *
5 : * LICENSE NOTICES
6 : *
7 : * This file is part of "streamable kanji code filter and converter",
8 : * which is distributed under the terms of GNU Lesser General Public
9 : * License (version 2) as published by the Free Software Foundation.
10 : *
11 : * This software is distributed in the hope that it will be useful,
12 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 : * GNU Lesser General Public License for more details.
15 : *
16 : * You should have received a copy of the GNU Lesser General Public
17 : * License along with "streamable kanji code filter and converter";
18 : * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19 : * Suite 330, Boston, MA 02111-1307 USA
20 : *
21 : * The author of this file:
22 : *
23 : */
/*
 * The source code included in this file was separated from mbfilter_ja.c
 * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
 *
 */
29 :
30 : #ifdef HAVE_CONFIG_H
31 : #include "config.h"
32 : #endif
33 :
34 : #include "mbfilter.h"
35 : #include "mbfilter_euc_jp_win.h"
36 :
37 : #include "unicode_table_cp932_ext.h"
38 : #include "unicode_table_jis.h"
39 : #include "cp932_table.h"
40 :
/*
 * Forward declaration of the identify callback; it is defined at the end of
 * this file and referenced by vtbl_identify_eucjpwin below.
 */
static int mbfl_filt_ident_eucjp_win(int c, mbfl_identify_filter *filter);
42 :
/*
 * Per-byte length table with 256 entries (16 rows of 16), indexed by the
 * byte value 0x00-0xFF:
 *   0x8E        => 2
 *   0x8F        => 3
 *   0xA1 - 0xFE => 2
 *   all others  => 1
 * Presumably each entry is the total byte length of a character that starts
 * with that byte -- confirm against the consumers of the encoding's length
 * table.
 */
static const unsigned char mblen_table_eucjp[] = { /* 2-byte lead bytes: 0xA1-0xFE */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
};
61 :
62 :
/* NULL-terminated list of alternative names; referenced by mbfl_encoding_eucjp_win. */
static const char *mbfl_encoding_eucjp_win_aliases[] = {"eucJP-open",
	"eucJP-ms", NULL};
65 :
/*
 * Identify-filter table for eucJP-win: the encoding id, the shared
 * constructor/destructor, and this file's per-byte identify callback.
 */
const struct mbfl_identify_vtbl vtbl_identify_eucjpwin = {
	mbfl_no_encoding_eucjp_win,
	mbfl_filt_ident_common_ctor,
	mbfl_filt_ident_common_dtor,
	mbfl_filt_ident_eucjp_win
};
72 :
/*
 * Encoding descriptor for eucJP-win.  The initializer is positional; the
 * struct layout is declared elsewhere, so the per-field notes below are
 * inferred from the values -- confirm against the mbfl_encoding definition.
 */
const mbfl_encoding mbfl_encoding_eucjp_win = {
	mbfl_no_encoding_eucjp_win,	/* encoding id */
	"eucJP-win",			/* presumably the canonical name */
	"EUC-JP",			/* presumably the MIME name */
	(const char *(*)[])&mbfl_encoding_eucjp_win_aliases,	/* alias list defined above */
	mblen_table_eucjp,		/* per-byte length table defined above */
	MBFL_ENCTYPE_MBCS		/* encoding type flag */
};
81 :
/*
 * Conversion table for eucJP-win => wchar: source and destination encoding
 * ids, the shared constructor/destructor, the per-byte decoder defined in
 * this file, and the shared flush routine.
 */
const struct mbfl_convert_vtbl vtbl_eucjpwin_wchar = {
	mbfl_no_encoding_eucjp_win,
	mbfl_no_encoding_wchar,
	mbfl_filt_conv_common_ctor,
	mbfl_filt_conv_common_dtor,
	mbfl_filt_conv_eucjpwin_wchar,
	mbfl_filt_conv_common_flush
};
90 :
/*
 * Conversion table for wchar => eucJP-win: source and destination encoding
 * ids, the shared constructor/destructor, the per-character encoder defined
 * in this file, and the shared flush routine.
 */
const struct mbfl_convert_vtbl vtbl_wchar_eucjpwin = {
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_eucjp_win,
	mbfl_filt_conv_common_ctor,
	mbfl_filt_conv_common_dtor,
	mbfl_filt_conv_wchar_eucjpwin,
	mbfl_filt_conv_common_flush
};
99 :
/*
 * Evaluate `statement` and make the enclosing function return -1 as soon as
 * it yields a negative value.  Only usable inside functions returning int.
 */
#define CK(statement) do { if ((statement) < 0) return (-1); } while (0)
101 :
102 : /*
103 : * eucJP-win => wchar
104 : */
105 : int
106 : mbfl_filt_conv_eucjpwin_wchar(int c, mbfl_convert_filter *filter)
107 0 : {
108 : int c1, s, w, n;
109 :
110 0 : switch (filter->status) {
111 : case 0:
112 0 : if (c >= 0 && c < 0x80) { /* latin */
113 0 : CK((*filter->output_function)(c, filter->data));
114 0 : } else if (c > 0xa0 && c < 0xff) { /* CP932 first char */
115 0 : filter->status = 1;
116 0 : filter->cache = c;
117 0 : } else if (c == 0x8e) { /* kana first char */
118 0 : filter->status = 2;
119 0 : } else if (c == 0x8f) { /* X 0212 first char */
120 0 : filter->status = 3;
121 : } else {
122 0 : w = c & MBFL_WCSGROUP_MASK;
123 0 : w |= MBFL_WCSGROUP_THROUGH;
124 0 : CK((*filter->output_function)(w, filter->data));
125 : }
126 0 : break;
127 :
128 : case 1: /* got first half */
129 0 : filter->status = 0;
130 0 : c1 = filter->cache;
131 0 : if (c > 0xa0 && c < 0xff) {
132 0 : w = 0;
133 0 : s = (c1 - 0xa1)*94 + c - 0xa1;
134 0 : if (s <= 137) {
135 0 : if (s == 31) {
136 0 : w = 0xff3c; /* FULLWIDTH REVERSE SOLIDUS */
137 0 : } else if (s == 32) {
138 0 : w = 0xff5e; /* FULLWIDTH TILDE */
139 0 : } else if (s == 33) {
140 0 : w = 0x2225; /* PARALLEL TO */
141 0 : } else if (s == 60) {
142 0 : w = 0xff0d; /* FULLWIDTH HYPHEN-MINUS */
143 0 : } else if (s == 80) {
144 0 : w = 0xffe0; /* FULLWIDTH CENT SIGN */
145 0 : } else if (s == 81) {
146 0 : w = 0xffe1; /* FULLWIDTH POUND SIGN */
147 0 : } else if (s == 137) {
148 0 : w = 0xffe2; /* FULLWIDTH NOT SIGN */
149 : }
150 : }
151 0 : if (w == 0) {
152 0 : if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) { /* vendor ext1 (13ku) */
153 0 : w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min];
154 0 : } else if (s >= 0 && s < jisx0208_ucs_table_size) { /* X 0208 */
155 0 : w = jisx0208_ucs_table[s];
156 0 : } else if (s >= (84*94)) { /* user (85ku - 94ku) */
157 0 : w = s - (84*94) + 0xe000;
158 : }
159 : }
160 0 : if (w <= 0) {
161 0 : w = ((c1 & 0x7f) << 8) | (c & 0x7f);
162 0 : w &= MBFL_WCSPLANE_MASK;
163 0 : w |= MBFL_WCSPLANE_WINCP932;
164 : }
165 0 : CK((*filter->output_function)(w, filter->data));
166 0 : } else if ((c >= 0 && c < 0x21) || c == 0x7f) { /* CTLs */
167 0 : CK((*filter->output_function)(c, filter->data));
168 : } else {
169 0 : w = (c1 << 8) | c;
170 0 : w &= MBFL_WCSGROUP_MASK;
171 0 : w |= MBFL_WCSGROUP_THROUGH;
172 0 : CK((*filter->output_function)(w, filter->data));
173 : }
174 0 : break;
175 :
176 : case 2: /* got 0x8e, X0201 kana */
177 0 : filter->status = 0;
178 0 : if (c > 0xa0 && c < 0xe0) {
179 0 : w = 0xfec0 + c;
180 0 : CK((*filter->output_function)(w, filter->data));
181 0 : } else if ((c >= 0 && c < 0x21) || c == 0x7f) { /* CTLs */
182 0 : CK((*filter->output_function)(c, filter->data));
183 : } else {
184 0 : w = 0x8e00 | c;
185 0 : w &= MBFL_WCSGROUP_MASK;
186 0 : w |= MBFL_WCSGROUP_THROUGH;
187 0 : CK((*filter->output_function)(w, filter->data));
188 : }
189 0 : break;
190 :
191 : case 3: /* got 0x8f, X 0212 first char */
192 0 : if ((c >= 0 && c < 0x21) || c == 0x7f) { /* CTLs */
193 0 : CK((*filter->output_function)(c, filter->data));
194 0 : filter->status = 0;
195 : } else {
196 0 : filter->status++;
197 0 : filter->cache = c;
198 : }
199 0 : break;
200 : case 4: /* got 0x8f, X 0212 second char */
201 0 : filter->status = 0;
202 0 : c1 = filter->cache;
203 0 : if (c1 > 0xa0 && c1 < 0xff && c > 0xa0 && c < 0xff) {
204 0 : s = (c1 - 0xa1)*94 + c - 0xa1;
205 0 : if (s >= 0 && s < jisx0212_ucs_table_size) {
206 0 : w = jisx0212_ucs_table[s];
207 0 : if (w == 0x007e) {
208 0 : w = 0xff5e; /* FULLWIDTH TILDE */
209 : }
210 0 : } else if (s >= (82*94) && s < (84*94)) { /* vender ext3 (83ku - 84ku) <-> CP932 (115ku -120ku) */
211 0 : s = (c1<< 8) | c;
212 0 : w = 0;
213 0 : n = 0;
214 0 : while (n < cp932ext3_eucjp_table_size) {
215 0 : if (s == cp932ext3_eucjp_table[n]) {
216 0 : if (n < (cp932ext3_ucs_table_max - cp932ext3_ucs_table_min)) {
217 0 : w = cp932ext3_ucs_table[n];
218 : }
219 0 : break;
220 : }
221 0 : n++;
222 : }
223 0 : } else if (s >= (84*94)) { /* user (85ku - 94ku) */
224 0 : w = s - (84*94) + (0xe000 + (94*10));
225 : } else {
226 0 : w = 0;
227 : }
228 0 : if (w == 0x00A6) {
229 0 : w = 0xFFE4; /* FULLWIDTH BROKEN BAR */
230 : }
231 0 : if (w <= 0) {
232 0 : w = ((c1 & 0x7f) << 8) | (c & 0x7f);
233 0 : w &= MBFL_WCSPLANE_MASK;
234 0 : w |= MBFL_WCSPLANE_JIS0212;
235 : }
236 0 : CK((*filter->output_function)(w, filter->data));
237 0 : } else if ((c >= 0 && c < 0x21) || c == 0x7f) { /* CTLs */
238 0 : CK((*filter->output_function)(c, filter->data));
239 : } else {
240 0 : w = (c1 << 8) | c | 0x8f0000;
241 0 : w &= MBFL_WCSGROUP_MASK;
242 0 : w |= MBFL_WCSGROUP_THROUGH;
243 0 : CK((*filter->output_function)(w, filter->data));
244 : }
245 0 : break;
246 :
247 : default:
248 0 : filter->status = 0;
249 : break;
250 : }
251 :
252 0 : return c;
253 : }
254 :
255 : /*
256 : * wchar => eucJP-win
257 : */
258 : int
259 : mbfl_filt_conv_wchar_eucjpwin(int c, mbfl_convert_filter *filter)
260 0 : {
261 : int c1, c2, s1;
262 :
263 0 : s1 = 0;
264 0 : if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
265 0 : s1 = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
266 0 : } else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
267 0 : s1 = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
268 0 : } else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
269 0 : s1 = ucs_i_jis_table[c - ucs_i_jis_table_min];
270 0 : } else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
271 0 : s1 = ucs_r_jis_table[c - ucs_r_jis_table_min];
272 0 : } else if (c >= 0xe000 && c < (0xe000 + 10*94)) { /* user (X0208 85ku - 94ku) */
273 0 : s1 = c - 0xe000;
274 0 : c1 = s1/94 + 0x75;
275 0 : c2 = s1%94 + 0x21;
276 0 : s1 = (c1 << 8) | c2;
277 0 : } else if (c >= (0xe000 + 10*94) && c < (0xe000 + 20*94)) { /* user (X0212 85ku - 94ku) */
278 0 : s1 = c - (0xe000 + 10*94);
279 0 : c1 = s1/94 + 0xf5;
280 0 : c2 = s1%94 + 0xa1;
281 0 : s1 = (c1 << 8) | c2;
282 : }
283 0 : if (s1 == 0xa2f1) {
284 0 : s1 = 0x2d62; /* NUMERO SIGN */
285 : }
286 0 : if (s1 <= 0) {
287 0 : c1 = c & ~MBFL_WCSPLANE_MASK;
288 0 : if (c1 == MBFL_WCSPLANE_WINCP932) {
289 0 : s1 = c & MBFL_WCSPLANE_MASK;
290 0 : if (s1 >= ((85 + 0x20) << 8)) { /* 85ku - 120ku */
291 0 : s1 = -1;
292 : }
293 0 : } else if (c1 == MBFL_WCSPLANE_JIS0208) {
294 0 : s1 = c & MBFL_WCSPLANE_MASK;
295 0 : if (s1 >= ((85 + 0x20) << 8)) { /* 85ku - 94ku */
296 0 : s1 = -1;
297 : }
298 0 : } else if (c1 == MBFL_WCSPLANE_JIS0212) {
299 0 : s1 = c & MBFL_WCSPLANE_MASK;
300 0 : if (s1 >= ((83 + 0x20) << 8)) { /* 83ku - 94ku */
301 0 : s1 = -1;
302 : } else {
303 0 : s1 |= 0x8080;
304 : }
305 0 : } else if (c == 0xa5) { /* YEN SIGN */
306 0 : s1 = 0x216f; /* FULLWIDTH YEN SIGN */
307 0 : } else if (c == 0x203e) { /* OVER LINE */
308 0 : s1 = 0x2131; /* FULLWIDTH MACRON */
309 0 : } else if (c == 0xff3c) { /* FULLWIDTH REVERSE SOLIDUS */
310 0 : s1 = 0x2140;
311 0 : } else if (c == 0xff5e) { /* FULLWIDTH TILDE */
312 0 : s1 = 0x2141;
313 0 : } else if (c == 0x2225) { /* PARALLEL TO */
314 0 : s1 = 0x2142;
315 0 : } else if (c == 0xff0d) { /* FULLWIDTH HYPHEN-MINUS */
316 0 : s1 = 0x215d;
317 0 : } else if (c == 0xffe0) { /* FULLWIDTH CENT SIGN */
318 0 : s1 = 0x2171;
319 0 : } else if (c == 0xffe1) { /* FULLWIDTH POUND SIGN */
320 0 : s1 = 0x2172;
321 0 : } else if (c == 0xffe2) { /* FULLWIDTH NOT SIGN */
322 0 : s1 = 0x224c;
323 0 : } else if (c == 0xff5e) { /* FULLWIDTH TILDE */
324 0 : s1 = 0x2141;
325 : } else {
326 0 : s1 = -1;
327 0 : c1 = 0;
328 0 : c2 = cp932ext1_ucs_table_max - cp932ext1_ucs_table_min;
329 0 : while (c1 < c2) { /* CP932 vendor ext1 (13ku) */
330 0 : if (c == cp932ext1_ucs_table[c1]) {
331 0 : s1 = ((c1/94 + 0x2d) << 8) + (c1%94 + 0x21);
332 0 : break;
333 : }
334 0 : c1++;
335 : }
336 0 : if (s1 < 0) {
337 0 : c1 = 0;
338 0 : c2 = cp932ext3_ucs_table_max - cp932ext3_ucs_table_min;
339 0 : while (c1 < c2) { /* CP932 vendor ext3 (115ku - 119ku) */
340 0 : if (c == cp932ext3_ucs_table[c1]) {
341 0 : if (c1 < cp932ext3_eucjp_table_size) {
342 0 : s1 = cp932ext3_eucjp_table[c1];
343 : }
344 0 : break;
345 : }
346 0 : c1++;
347 : }
348 : }
349 : }
350 0 : if (c == 0) {
351 0 : s1 = 0;
352 0 : } else if (s1 <= 0) {
353 0 : s1 = -1;
354 : }
355 : }
356 :
357 0 : if (s1 >= 0) {
358 0 : if (s1 < 0x80) { /* latin */
359 0 : CK((*filter->output_function)(s1, filter->data));
360 0 : } else if (s1 < 0x100) { /* kana */
361 0 : CK((*filter->output_function)(0x8e, filter->data));
362 0 : CK((*filter->output_function)(s1, filter->data));
363 0 : } else if (s1 < 0x8080) { /* X 0208 */
364 0 : CK((*filter->output_function)(((s1 >> 8) & 0xff) | 0x80, filter->data));
365 0 : CK((*filter->output_function)((s1 & 0xff) | 0x80, filter->data));
366 : } else { /* X 0212 */
367 0 : CK((*filter->output_function)(0x8f, filter->data));
368 0 : CK((*filter->output_function)(((s1 >> 8) & 0xff) | 0x80, filter->data));
369 0 : CK((*filter->output_function)((s1 & 0xff) | 0x80, filter->data));
370 : }
371 : } else {
372 0 : if (filter->illegal_mode != MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
373 0 : CK(mbfl_filt_conv_illegal_output(c, filter));
374 : }
375 : }
376 :
377 0 : return c;
378 : }
379 :
380 : static int mbfl_filt_ident_eucjp_win(int c, mbfl_identify_filter *filter)
381 0 : {
382 0 : switch (filter->status) {
383 : case 0: /* latin */
384 0 : if (c >= 0 && c < 0x80) { /* ok */
385 : ;
386 0 : } else if (c > 0xa0 && c < 0xff) { /* kanji first char */
387 0 : filter->status = 1;
388 0 : } else if (c == 0x8e) { /* kana first char */
389 0 : filter->status = 2;
390 0 : } else if (c == 0x8f) { /* X 0212 first char */
391 0 : filter->status = 3;
392 : } else { /* bad */
393 0 : filter->flag = 1;
394 : }
395 0 : break;
396 :
397 : case 1: /* got first half */
398 0 : if (c < 0xa1 || c > 0xfe) { /* bad */
399 0 : filter->flag = 1;
400 : }
401 0 : filter->status = 0;
402 0 : break;
403 :
404 : case 2: /* got 0x8e */
405 0 : if (c < 0xa1 || c > 0xdf) { /* bad */
406 0 : filter->flag = 1;
407 : }
408 0 : filter->status = 0;
409 0 : break;
410 :
411 : case 3: /* got 0x8f */
412 0 : if (c < 0xa1 || c > 0xfe) { /* bad */
413 0 : filter->flag = 1;
414 : }
415 0 : filter->status++;
416 0 : break;
417 : case 4: /* got 0x8f */
418 0 : if (c < 0xa1 || c > 0xfe) { /* bad */
419 0 : filter->flag = 1;
420 : }
421 0 : filter->status = 0;
422 0 : break;
423 :
424 : default:
425 0 : filter->status = 0;
426 : break;
427 : }
428 :
429 0 : return c;
430 : }
431 :
432 :
|