1 : /**********************************************************************
2 : sjis.c - Oniguruma (regular expression library)
3 : **********************************************************************/
4 : /*-
5 : * Copyright (c) 2002-2005 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6 : * All rights reserved.
7 : *
8 : * Redistribution and use in source and binary forms, with or without
9 : * modification, are permitted provided that the following conditions
10 : * are met:
11 : * 1. Redistributions of source code must retain the above copyright
12 : * notice, this list of conditions and the following disclaimer.
13 : * 2. Redistributions in binary form must reproduce the above copyright
14 : * notice, this list of conditions and the following disclaimer in the
15 : * documentation and/or other materials provided with the distribution.
16 : *
17 : * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 : * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 : * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 : * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 : * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 : * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 : * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 : * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 : * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 : * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 : * SUCH DAMAGE.
28 : */
29 :
30 : #include "regenc.h"
31 :
/*
 * Byte length of a character, indexed by the value of its first byte.
 * Entries are 2 for 0x81-0x9f and 0xe0-0xfc, and 1 for every other value.
 */
static const int EncLen_SJIS[256] = {
  /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x90 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xc0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xd0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xe0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xf0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1
};
50 :
/*
 * Flags, indexed by byte value, telling whether that byte may appear as
 * the second byte of a two-byte character.
 * Entries are 1 for 0x40-0x7e and 0x80-0xfc, and 0 for every other value.
 */
static const char SJIS_CAN_BE_TRAIL_TABLE[256] = {
  /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
  /* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xc0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xd0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xe0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xf0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0
};
69 :
/* Non-zero when `byte` starts a multibyte character (EncLen_SJIS entry > 1). */
#define SJIS_ISMB_FIRST(byte)  (EncLen_SJIS[(byte)] > 1)
/* Non-zero when `byte` is flagged in SJIS_CAN_BE_TRAIL_TABLE. */
#define SJIS_ISMB_TRAIL(byte)  (SJIS_CAN_BE_TRAIL_TABLE[(byte)])
72 :
73 : static int
74 : sjis_mbc_enc_len(const UChar* p)
75 1185 : {
76 1185 : return EncLen_SJIS[*p];
77 : }
78 :
79 : static int
80 : sjis_code_to_mbclen(OnigCodePoint code)
81 27 : {
82 27 : if (code < 256) {
83 19 : if (EncLen_SJIS[(int )code] == 1)
84 19 : return 1;
85 : else
86 0 : return 0;
87 : }
88 8 : else if (code <= 0xffff) {
89 8 : return 2;
90 : }
91 : else
92 0 : return 0;
93 : }
94 :
95 : static OnigCodePoint
96 : sjis_mbc_to_code(const UChar* p, const UChar* end)
97 399 : {
98 : int c, i, len;
99 : OnigCodePoint n;
100 :
101 399 : len = enc_len(ONIG_ENCODING_SJIS, p);
102 399 : c = *p++;
103 399 : n = c;
104 399 : if (len == 1) return n;
105 :
106 184 : for (i = 1; i < len; i++) {
107 92 : if (p >= end) break;
108 92 : c = *p++;
109 92 : n <<= 8; n += c;
110 : }
111 92 : return n;
112 : }
113 :
114 : static int
115 : sjis_code_to_mbc(OnigCodePoint code, UChar *buf)
116 0 : {
117 0 : UChar *p = buf;
118 :
119 0 : if ((code & 0xff00) != 0) *p++ = (UChar )(((code >> 8) & 0xff));
120 0 : *p++ = (UChar )(code & 0xff);
121 :
122 : #if 0
123 : if (enc_len(ONIG_ENCODING_SJIS, buf) != (p - buf))
124 : return REGERR_INVALID_WIDE_CHAR_VALUE;
125 : #endif
126 0 : return p - buf;
127 : }
128 :
129 : static int
130 : sjis_mbc_to_normalize(OnigAmbigType flag,
131 : const UChar** pp, const UChar* end, UChar* lower)
132 0 : {
133 0 : const UChar* p = *pp;
134 :
135 0 : if (ONIGENC_IS_MBC_ASCII(p)) {
136 0 : if ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0) {
137 0 : *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
138 : }
139 : else {
140 0 : *lower = *p;
141 : }
142 :
143 0 : (*pp)++;
144 0 : return 1;
145 : }
146 : else {
147 0 : int len = enc_len(ONIG_ENCODING_SJIS, p);
148 :
149 0 : if (lower != p) {
150 : int i;
151 0 : for (i = 0; i < len; i++) {
152 0 : *lower++ = *p++;
153 : }
154 : }
155 0 : (*pp) += len;
156 0 : return len; /* return byte length of converted char to lower */
157 : }
158 : }
159 :
160 : static int
161 : sjis_is_mbc_ambiguous(OnigAmbigType flag, const UChar** pp, const UChar* end)
162 0 : {
163 0 : return onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_SJIS, flag, pp, end);
164 :
165 : }
166 :
167 : static int
168 : sjis_is_code_ctype(OnigCodePoint code, unsigned int ctype)
169 1024 : {
170 1024 : if (code < 128)
171 512 : return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
172 : else {
173 512 : if ((ctype & (ONIGENC_CTYPE_WORD |
174 : ONIGENC_CTYPE_GRAPH | ONIGENC_CTYPE_PRINT)) != 0) {
175 0 : return (sjis_code_to_mbclen(code) > 1 ? TRUE : FALSE);
176 : }
177 : }
178 :
179 512 : return FALSE;
180 : }
181 :
182 : static UChar*
183 : sjis_left_adjust_char_head(const UChar* start, const UChar* s)
184 48 : {
185 : const UChar *p;
186 : int len;
187 :
188 48 : if (s <= start) return (UChar* )s;
189 44 : p = s;
190 :
191 44 : if (SJIS_ISMB_TRAIL(*p)) {
192 92 : while (p > start) {
193 54 : if (! SJIS_ISMB_FIRST(*--p)) {
194 38 : p++;
195 38 : break;
196 : }
197 : }
198 : }
199 44 : len = enc_len(ONIG_ENCODING_SJIS, p);
200 44 : if (p + len > s) return (UChar* )p;
201 0 : p += len;
202 0 : return (UChar* )(p + ((s - p) & ~1));
203 : }
204 :
205 : static int
206 : sjis_is_allowed_reverse_match(const UChar* s, const UChar* end)
207 9 : {
208 9 : const UChar c = *s;
209 9 : return (SJIS_ISMB_TRAIL(c) ? FALSE : TRUE);
210 : }
211 :
212 : OnigEncodingType OnigEncodingSJIS = {
213 : sjis_mbc_enc_len,
214 : "Shift_JIS", /* name */
215 : 2, /* max byte length */
216 : 1, /* min byte length */
217 : ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE,
218 : {
219 : (OnigCodePoint )'\\' /* esc */
220 : , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */
221 : , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */
222 : , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
223 : , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
224 : , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
225 : },
226 : onigenc_is_mbc_newline_0x0a,
227 : sjis_mbc_to_code,
228 : sjis_code_to_mbclen,
229 : sjis_code_to_mbc,
230 : sjis_mbc_to_normalize,
231 : sjis_is_mbc_ambiguous,
232 : onigenc_ascii_get_all_pair_ambig_codes,
233 : onigenc_nothing_get_all_comp_ambig_codes,
234 : sjis_is_code_ctype,
235 : onigenc_not_support_get_ctype_code_range,
236 : sjis_left_adjust_char_head,
237 : sjis_is_allowed_reverse_match
238 : };
|