1 : /*
2 : * "streamable kanji code filter and converter"
3 : * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4 : *
5 : * LICENSE NOTICES
6 : *
7 : * This file is part of "streamable kanji code filter and converter",
8 : * which is distributed under the terms of GNU Lesser General Public
9 : * License (version 2) as published by the Free Software Foundation.
10 : *
11 : * This software is distributed in the hope that it will be useful,
12 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 : * GNU Lesser General Public License for more details.
15 : *
16 : * You should have received a copy of the GNU Lesser General Public
17 : * License along with "streamable kanji code filter and converter";
18 : * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19 : * Suite 330, Boston, MA 02111-1307 USA
20 : *
21 : * The author of this part: Marcus Boerger <helly@php.net>
22 : *
23 : */
/*
 * The source code included in this file was separated from mbfilter.c
 * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
 *
 */
29 :
30 : #ifdef HAVE_CONFIG_H
31 : #include "config.h"
32 : #endif
33 :
34 : #ifdef HAVE_STRING_H
35 : #include <string.h>
36 : #endif
37 :
38 : #ifdef HAVE_STRINGS_H
39 : #include <strings.h>
40 : #endif
41 :
42 : #include "mbfilter.h"
43 : #include "mbfilter_htmlent.h"
44 : #include "html_entities.h"
45 :
/*
 * Per-character classification used by mbfl_filt_conv_html_enc().
 *
 * The encoder replaces a character with an entity only when its entry
 * is 1; every other value (0 or 2) is emitted unchanged.  The three
 * entries holding 2 sit at 0x26, 0x3c and 0x3e ('&', '<' and '>' in
 * ASCII), so existing markup passes through untouched.
 */
static const int htmlentitifieds[256] = {
	/* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x20 */ 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0,
	/* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xc0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xd0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xe0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xf0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
};
64 :
/* NULL-terminated list of alternative names for the HTML-ENTITIES encoding. */
static const char *mbfl_encoding_html_ent_aliases[] = {
	"HTML",
	"html",
	NULL
};
66 :
67 : const mbfl_encoding mbfl_encoding_html_ent = {
68 : mbfl_no_encoding_html_ent,
69 : "HTML-ENTITIES",
70 : "HTML-ENTITIES",
71 : (const char *(*)[])&mbfl_encoding_html_ent_aliases,
72 : NULL,
73 : MBFL_ENCTYPE_HTML_ENT
74 : };
75 :
76 : const struct mbfl_convert_vtbl vtbl_wchar_html = {
77 : mbfl_no_encoding_wchar,
78 : mbfl_no_encoding_html_ent,
79 : mbfl_filt_conv_common_ctor,
80 : mbfl_filt_conv_common_dtor,
81 : mbfl_filt_conv_html_enc,
82 : mbfl_filt_conv_html_enc_flush
83 : };
84 :
85 : const struct mbfl_convert_vtbl vtbl_html_wchar = {
86 : mbfl_no_encoding_html_ent,
87 : mbfl_no_encoding_wchar,
88 : mbfl_filt_conv_html_dec_ctor,
89 : mbfl_filt_conv_html_dec_dtor,
90 : mbfl_filt_conv_html_dec,
91 : mbfl_filt_conv_html_dec_flush };
92 :
93 :
/*
 * Evaluate `expr` exactly once and make the enclosing function return -1
 * when the result is negative.  Wrapped in do/while(0) so it behaves as
 * a single statement.
 */
#define CK(expr) \
	do { \
		if ((expr) < 0) \
			return (-1); \
	} while (0)
95 :
96 : /*
97 : * any => HTML
98 : */
99 : int mbfl_filt_conv_html_enc(int c, mbfl_convert_filter *filter)
100 126 : {
101 : int tmp[64];
102 : int i;
103 : unsigned int uc;
104 : const mbfl_html_entity_entry *e;
105 :
106 236 : if (c < sizeof(htmlentitifieds) / sizeof(htmlentitifieds[0]) &&
107 : htmlentitifieds[c] != 1) {
108 110 : CK((*filter->output_function)(c, filter->data));
109 : } else {
110 16 : CK((*filter->output_function)('&', filter->data));
111 2634 : for (i = 0; (e = &mbfl_html_entity_list[i])->name != NULL; i++) {
112 2628 : if (c == e->code) {
113 : char *p;
114 :
115 57 : for (p = e->name; *p != '\0'; p++) {
116 47 : CK((*filter->output_function)((int)*p, filter->data));
117 : }
118 10 : goto last;
119 : }
120 : }
121 :
122 : {
123 6 : int *p = tmp + sizeof(tmp) / sizeof(tmp[0]);
124 :
125 6 : CK((*filter->output_function)('#', filter->data));
126 :
127 6 : uc = (unsigned int)c;
128 :
129 6 : *(--p) = '\0';
130 : do {
131 18 : *(--p) = "0123456789"[uc % 10];
132 18 : uc /= 10;
133 18 : } while (uc);
134 :
135 24 : for (; *p != '\0'; p++) {
136 18 : CK((*filter->output_function)(*p, filter->data));
137 : }
138 : }
139 16 : last:
140 16 : CK((*filter->output_function)(';', filter->data));
141 : }
142 126 : return c;
143 : }
144 :
145 : int mbfl_filt_conv_html_enc_flush(mbfl_convert_filter *filter)
146 5 : {
147 5 : filter->status = 0;
148 5 : filter->opaque = NULL;
149 5 : return 0;
150 : }
151 :
152 : /*
153 : * HTML => any
154 : */
/* Size limit applied to the decoder's pending-entity buffer. */
#define html_enc_buffer_size 16

/* Characters accepted between '&' and ';' while collecting an entity. */
static const char html_entity_chars[] =
	"#"
	"0123456789"
	"abcdefghijklmnopqrstuvwxyz"
	"ABCDEFGHIJKLMNOPQRSTUVWXYZ";
157 :
158 : void mbfl_filt_conv_html_dec_ctor(mbfl_convert_filter *filter)
159 83 : {
160 83 : filter->status = 0;
161 83 : filter->opaque = mbfl_malloc(html_enc_buffer_size+1);
162 83 : }
163 :
164 : void mbfl_filt_conv_html_dec_dtor(mbfl_convert_filter *filter)
165 83 : {
166 83 : filter->status = 0;
167 83 : if (filter->opaque)
168 : {
169 83 : mbfl_free((void*)filter->opaque);
170 : }
171 83 : filter->opaque = NULL;
172 83 : }
173 :
174 : int mbfl_filt_conv_html_dec(int c, mbfl_convert_filter *filter)
175 591 : {
176 591 : int pos, ent = 0;
177 : mbfl_html_entity_entry *entity;
178 591 : char *buffer = (char*)filter->opaque;
179 :
180 591 : if (!filter->status) {
181 217 : if (c == '&' ) {
182 91 : filter->status = 1;
183 91 : buffer[0] = '&';
184 : } else {
185 126 : CK((*filter->output_function)(c, filter->data));
186 : }
187 : } else {
188 374 : if (c == ';') {
189 80 : if (buffer[1]=='#') {
190 128 : if (filter->status > 2 && (buffer[2] == 'x' || buffer[2] == 'X')) {
191 54 : if (filter->status > 3) {
192 : /* numeric entity */
193 120 : for (pos=3; pos<filter->status; pos++) {
194 72 : int v = buffer[pos];
195 108 : if (v >= '0' && v <= '9') {
196 36 : v = v - '0';
197 48 : } else if (v >= 'A' && v <= 'F') {
198 12 : v = v - 'A' + 10;
199 44 : } else if (v >= 'a' && v <= 'f') {
200 20 : v = v - 'a' + 10;
201 : } else {
202 4 : ent = -1;
203 4 : break;
204 : }
205 68 : ent = ent * 16 + v;
206 : }
207 : } else {
208 2 : ent = -1;
209 : }
210 : } else {
211 : /* numeric entity */
212 20 : if (filter->status > 2) {
213 120 : for (pos=2; pos<filter->status; pos++) {
214 41 : int v = buffer[pos];
215 41 : if (v >= '0' && v <= '9') {
216 41 : v = v - '0';
217 : } else {
218 0 : ent = -1;
219 0 : break;
220 : }
221 41 : ent = ent*10 + v;
222 : }
223 : } else {
224 1 : ent = -1;
225 : }
226 : }
227 138 : if (ent >= 0 && ent < 0x110000) {
228 64 : CK((*filter->output_function)(ent, filter->data));
229 : } else {
230 61 : for (pos = 0; pos < filter->status; pos++) {
231 51 : CK((*filter->output_function)(buffer[pos], filter->data));
232 : }
233 10 : CK((*filter->output_function)(c, filter->data));
234 : }
235 74 : filter->status = 0;
236 : /*php_error_docref("ref.mbstring" TSRMLS_CC, E_NOTICE, "mbstring decoded '%s'=%d", buffer, ent);*/
237 : } else {
238 : /* named entity */
239 6 : buffer[filter->status] = 0;
240 6 : entity = (mbfl_html_entity_entry *)mbfl_html_entity_list;
241 947 : while (entity->name) {
242 941 : if (!strcmp(buffer+1, entity->name)) {
243 6 : ent = entity->code;
244 6 : break;
245 : }
246 935 : entity++;
247 : }
248 6 : if (ent) {
249 : /* decoded */
250 6 : CK((*filter->output_function)(ent, filter->data));
251 6 : filter->status = 0;
252 : /*php_error_docref("ref.mbstring" TSRMLS_CC, E_NOTICE,"mbstring decoded '%s'=%d", buffer, ent);*/
253 : } else {
254 : /* failure */
255 0 : buffer[filter->status++] = ';';
256 0 : buffer[filter->status] = 0;
257 : /* php_error_docref("ref.mbstring" TSRMLS_CC, E_WARNING, "mbstring cannot decode '%s'", buffer); */
258 0 : mbfl_filt_conv_html_dec_flush(filter);
259 : }
260 : }
261 : } else {
262 : /* add character */
263 294 : buffer[filter->status++] = c;
264 : /* add character and check */
265 294 : if (!strchr(html_entity_chars, c) || filter->status+1==html_enc_buffer_size || (c=='#' && filter->status>2))
266 : {
267 : /* illegal character or end of buffer */
268 11 : if (c=='&')
269 0 : filter->status--;
270 11 : buffer[filter->status] = 0;
271 : /* php_error_docref("ref.mbstring" TSRMLS_CC, E_WARNING, "mbstring cannot decode '%s'", buffer)l */
272 11 : mbfl_filt_conv_html_dec_flush(filter);
273 11 : if (c=='&')
274 : {
275 0 : buffer[filter->status++] = '&';
276 : }
277 : }
278 : }
279 : }
280 589 : return c;
281 : }
282 :
283 : int mbfl_filt_conv_html_dec_flush(mbfl_convert_filter *filter)
284 93 : {
285 93 : int status, pos = 0;
286 : unsigned char *buffer;
287 93 : int err = 0;
288 :
289 93 : buffer = (unsigned char*)filter->opaque;
290 93 : status = filter->status;
291 93 : filter->status = 0;
292 : /* flush fragments */
293 226 : while (status--) {
294 40 : int e = (*filter->output_function)(buffer[pos++], filter->data);
295 40 : if (e != 0)
296 40 : err = e;
297 : }
298 93 : return err;
299 : }
300 :
301 :
|