1 : /*
2 : +----------------------------------------------------------------------+
3 : | PHP Version 6 |
4 : +----------------------------------------------------------------------+
5 : | Copyright (c) 1997-2009 The PHP Group |
6 : +----------------------------------------------------------------------+
7 : | This source file is subject to version 3.01 of the PHP license, |
8 : | that is bundled with this package in the file LICENSE, and is |
9 : | available through the world-wide-web at the following url: |
10 : | http://www.php.net/license/3_01.txt |
11 : | If you did not receive a copy of the PHP license and are unable to |
12 : | obtain it through the world-wide-web, please send a note to |
13 : | license@php.net so we can mail you a copy immediately. |
14 : +----------------------------------------------------------------------+
15 : | Authors: Rasmus Lerdorf <rasmus@php.net> |
16 : | Jaakko Hyvätti <jaakko.hyvatti@iki.fi> |
17 : | Wez Furlong <wez@thebrainroom.com> |
18 : +----------------------------------------------------------------------+
19 : */
20 :
21 : /* $Id: html.c 289605 2009-10-13 05:18:37Z moriyoshi $ */
22 :
23 : /*
24 : * HTML entity resources:
25 : *
26 : * http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset2.asp
27 : * http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset3.asp
28 : * http://www.unicode.org/Public/MAPPINGS/OBSOLETE/UNI2SGML.TXT
29 : *
30 : * http://www.w3.org/TR/2002/REC-xhtml1-20020801/dtds.html#h-A2
31 : *
32 : * UNICODE NOTE:
33 : * The way Unicode support is implemented (namely, IS_UNICODE support) is by
34 : * converting the IS_UNICODE strings to UTF-8 and handing them off to existing
35 : * implementation. This saves on redoing all the code that encodes and decodes
36 : * entities to support UChar*, but it does result in slight performance loss.
37 : * Whoever wants to do this properly, go ahead.
38 : */
39 :
40 : #include "php.h"
41 : #if PHP_WIN32
42 : #include "config.w32.h"
43 : #else
44 : #include <php_config.h>
45 : #endif
46 : #include "html.h"
47 : #include "php_string.h"
48 : #include "SAPI.h"
49 : #if HAVE_LOCALE_H
50 : #include <locale.h>
51 : #endif
52 : #if HAVE_LANGINFO_H
53 : #include <langinfo.h>
54 : #endif
55 :
56 : #if HAVE_MBSTRING
57 : # include "ext/mbstring/mbstring.h"
58 : ZEND_EXTERN_MODULE_GLOBALS(mbstring)
59 : #endif
60 :
/* Character sets known to the entity tables in this file.
 * cs_terminator comes first and is used as the end marker of entity_map. */
enum entity_charset {
	cs_terminator,
	cs_8859_1,
	cs_cp1252,
	cs_8859_15,
	cs_utf_8,
	cs_big5,
	cs_gb2312,
	cs_big5hkscs,
	cs_sjis,
	cs_eucjp,
	cs_koi8r,
	cs_cp1251,
	cs_8859_5,
	cs_cp866,
	cs_macroman
};
/* Element type of every entity table below: a constant pointer to a
 * constant string (an entity name), or NULL. */
66 : typedef const char *const entity_table_t;
67 :
68 : /* codepage 1252 is a Windows extension to iso-8859-1. */
/* Entity names for the cp1252-specific range; entity_map registers this
 * table for codes 0x80-0x9f, so slot 0 is 0x80. NULL slots carry no name. */
69 : static entity_table_t ent_cp_1252[] = {
70 : "euro", NULL, "sbquo", "fnof", "bdquo", "hellip", "dagger",
71 : "Dagger", "circ", "permil", "Scaron", "lsaquo", "OElig",
72 : NULL, NULL, NULL, NULL, "lsquo", "rsquo", "ldquo", "rdquo",
73 : "bull", "ndash", "mdash", "tilde", "trade", "scaron", "rsaquo",
74 : "oelig", NULL, NULL, "Yuml"
75 : };
76 :
/* Entity names for codes 0xa0-0xff (slot 0 is 0xa0, "nbsp"). entity_map
 * registers this table for iso-8859-1 and reuses it for the 0xa0-0xff range
 * of cp1252, utf-8, big5, gb2312, big5hkscs, sjis and eucjp. */
77 : static entity_table_t ent_iso_8859_1[] = {
78 : "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar",
79 : "sect", "uml", "copy", "ordf", "laquo", "not", "shy", "reg",
80 : "macr", "deg", "plusmn", "sup2", "sup3", "acute", "micro",
81 : "para", "middot", "cedil", "sup1", "ordm", "raquo", "frac14",
82 : "frac12", "frac34", "iquest", "Agrave", "Aacute", "Acirc",
83 : "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
84 : "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
85 : "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
86 : "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
87 : "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
88 : "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
89 : "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
90 : "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
91 : "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
92 : "uuml", "yacute", "thorn", "yuml"
93 : };
94 :
/* Entity names for iso-8859-15 codes 0xa0-0xff (range from entity_map).
 * The two NULL slots are the positions annotated Zcaron and zcaron, for
 * which the table lists no entity name. */
95 : static entity_table_t ent_iso_8859_15[] = {
96 : "nbsp", "iexcl", "cent", "pound", "euro", "yen", "Scaron",
97 : "sect", "scaron", "copy", "ordf", "laquo", "not", "shy", "reg",
98 : "macr", "deg", "plusmn", "sup2", "sup3", NULL, /* Zcaron */
99 : "micro", "para", "middot", NULL, /* zcaron */ "sup1", "ordm",
100 : "raquo", "OElig", "oelig", "Yuml", "iquest", "Agrave", "Aacute",
101 : "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
102 : "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
103 : "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
104 : "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
105 : "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
106 : "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
107 : "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
108 : "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
109 : "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
110 : "uuml", "yacute", "thorn", "yuml"
111 : };
112 :
/* Entity names for code points 338-402 (utf-8 row of entity_map).
 * The range is sparse; the inline comments give the code point at which
 * each group of slots starts. */
113 : static entity_table_t ent_uni_338_402[] = {
114 : /* 338 (0x0152) */
115 : "OElig", "oelig", NULL, NULL, NULL, NULL,
116 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
117 : /* 352 (0x0160) */
118 : "Scaron", "scaron", NULL, NULL, NULL, NULL, NULL, NULL,
119 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
120 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
121 : /* 376 (0x0178) */
122 : "Yuml", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
123 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
124 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
125 : /* 400 (0x0190) */
126 : NULL, NULL, "fnof"
127 : };
128 :
/* Entity names for code points 710-732 (utf-8 row of entity_map); only
 * the first ("circ") and last ("tilde") slots are named. */
129 : static entity_table_t ent_uni_spacing[] = {
130 : /* 710 */
131 : "circ",
132 : /* 711 - 730 */
133 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
134 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
135 : /* 731 - 732 */
136 : NULL, "tilde"
137 : };
138 :
/* Entity names for code points 913-982 (utf-8 row of entity_map):
 * upper-case names from 913, lower-case names from 945, then
 * "thetasym" (977), "upsih" (978) and "piv" (982). */
139 : static entity_table_t ent_uni_greek[] = {
140 : /* 913 */
141 : "Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta",
142 : "Iota", "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi", "Rho",
143 : NULL, "Sigma", "Tau", "Upsilon", "Phi", "Chi", "Psi", "Omega",
144 : /* 938 - 944 are not mapped */
145 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
146 : "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta",
147 : "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho",
148 : "sigmaf", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega",
149 : /* 970 - 976 are not mapped */
150 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
151 : "thetasym", "upsih",
152 : NULL, NULL, NULL,
153 : "piv"
154 : };
155 :
/* Entity names for code points 8194-8260 (utf-8 row of entity_map):
 * spaces, joiners, dashes, quotation marks and related punctuation.
 * Slot 0 is 8194 ("ensp"); the last slot is 8260 ("frasl"). */
156 : static entity_table_t ent_uni_punct[] = {
157 : /* 8194 */
158 : "ensp", "emsp", NULL, NULL, NULL, NULL, NULL,
159 : "thinsp", NULL, NULL, "zwnj", "zwj", "lrm", "rlm",
160 : NULL, NULL, NULL, "ndash", "mdash", NULL, NULL, NULL,
161 : /* 8216 */
162 : "lsquo", "rsquo", "sbquo", NULL, "ldquo", "rdquo", "bdquo", NULL,
163 : "dagger", "Dagger", "bull", NULL, NULL, NULL, "hellip",
164 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "permil", NULL,
165 : /* 8242 */
166 : "prime", "Prime", NULL, NULL, NULL, NULL, NULL, "lsaquo", "rsaquo", NULL,
167 : NULL, NULL, "oline", NULL, NULL, NULL, NULL, NULL,
168 : "frasl"
169 : };
170 :
/* Single-slot table: entity_map registers it for code point 8364 only. */
171 : static entity_table_t ent_uni_euro[] = {
172 : "euro"
173 : };
174 :
/* Entity names for code points 8465-8501 (utf-8 row of entity_map).
 * Sparse: only the code points called out in the inline comments carry a
 * name. */
175 : static entity_table_t ent_uni_8465_8501[] = {
176 : /* 8465 */
177 : "image", NULL, NULL, NULL, NULL, NULL, NULL,
178 : /* 8472 */
179 : "weierp", NULL, NULL, NULL,
180 : /* 8476 */
181 : "real", NULL, NULL, NULL, NULL, NULL,
182 : /* 8482 */
183 : "trade", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
184 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
185 : /* 8501 */
186 : "alefsym",
187 : };
188 :
/* Entity names for code points 8592-9002 (utf-8 row of entity_map).
 * Laid out in rows of 16 code points; each inline comment gives the code
 * point of the first slot of the following two lines.
 * NOTE(review): "asymp" is listed twice in the 8768 row (slots 8776 and
 * 8781) - confirm which code point it is meant to name. */
189 : static entity_table_t ent_uni_8592_9002[] = {
190 : /* 8592 (0x2190) */
191 : "larr", "uarr", "rarr", "darr", "harr", NULL, NULL, NULL,
192 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
193 : /* 8608 (0x21a0) */
194 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
195 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
196 : /* 8624 (0x21b0) */
197 : NULL, NULL, NULL, NULL, NULL, "crarr", NULL, NULL,
198 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
199 : /* 8640 (0x21c0) */
200 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
201 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
202 : /* 8656 (0x21d0) */
203 : "lArr", "uArr", "rArr", "dArr", "hArr", "vArr", NULL, NULL,
204 : NULL, NULL, "lAarr", "rAarr", NULL, "rarrw", NULL, NULL,
205 : /* 8672 (0x21e0) */
206 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
207 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
208 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
209 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
210 : /* 8704 (0x2200) */
211 : "forall", "comp", "part", "exist", "nexist", "empty", NULL, "nabla",
212 : "isin", "notin", "epsis", "ni", "notni", "bepsi", NULL, "prod",
213 : /* 8720 (0x2210) */
214 : "coprod", "sum", "minus", "mnplus", "plusdo", NULL, "setmn", "lowast",
215 : "compfn", NULL, "radic", NULL, NULL, "prop", "infin", "ang90",
216 : /* 8736 (0x2220) */
217 : "ang", "angmsd", "angsph", "mid", "nmid", "par", "npar", "and",
218 : "or", "cap", "cup", "int", NULL, NULL, "conint", NULL,
219 : /* 8752 (0x2230) */
220 : NULL, NULL, NULL, NULL, "there4", "becaus", NULL, NULL,
221 : NULL, NULL, NULL, NULL, "sim", "bsim", NULL, NULL,
222 : /* 8768 (0x2240) */
223 : "wreath", "nsim", NULL, "sime", "nsime", "cong", NULL, "ncong",
224 : "asymp", "nap", "ape", NULL, "bcong", "asymp", "bump", "bumpe",
225 : /* 8784 (0x2250) */
226 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
227 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
228 : /* 8800 (0x2260) */
229 : "ne", "equiv", NULL, NULL, "le", "ge", "lE", "gE",
230 : "lnE", "gnE", "Lt", "Gt", "twixt", NULL, "nlt", "ngt",
231 : /* 8816 (0x2270) */
232 : "nles", "nges", "lsim", "gsim", NULL, NULL, "lg", "gl",
233 : NULL, NULL, "pr", "sc", "cupre", "sscue", "prsim", "scsim",
234 : /* 8832 (0x2280) */
235 : "npr", "nsc", "sub", "sup", "nsub", "nsup", "sube", "supe",
236 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
237 : /* 8848 (0x2290) */
238 : NULL, NULL, NULL, NULL, NULL, "oplus", NULL, "otimes",
239 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
240 : /* 8864 (0x22a0) */
241 : NULL, NULL, NULL, NULL, NULL, "perp", NULL, NULL,
242 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
243 : /* 8880 (0x22b0) */
244 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
245 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
246 : /* 8896 (0x22c0) */
247 : NULL, NULL, NULL, NULL, NULL, "sdot", NULL, NULL,
248 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
249 : /* 8912 (0x22d0) */
250 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
251 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
252 : /* 8928 (0x22e0) */
253 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
254 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
255 : /* 8944 (0x22f0) */
256 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
257 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
258 : /* 8960 (0x2300) */
259 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
260 : "lceil", "rceil", "lfloor", "rfloor", NULL, NULL, NULL, NULL,
261 : /* 8976 (0x2310) */
262 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
263 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
264 : /* 8992 (0x2320) */
265 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
266 : NULL, "lang", "rang"
267 : };
268 :
/* Single-slot table: entity_map registers it for code point 9674 only. */
269 : static entity_table_t ent_uni_9674[] = {
270 : /* 9674 */
271 : "loz"
272 : };
273 :
/* Entity names for code points 9824-9830 (utf-8 row of entity_map). */
274 : static entity_table_t ent_uni_9824_9830[] = {
275 : /* 9824 */
276 : "spades", NULL, NULL, "clubs", NULL, "hearts", "diams"
277 : };
278 :
/* Replacement strings for koi8-r bytes 0xa3-0xff (range from entity_map).
 * Entries are numeric references written without the surrounding "&" and
 * ";" (e.g. "#1105"); NULL slots have no mapping. */
279 : static entity_table_t ent_koi8r[] = {
280 : "#1105", /* "jo "*/
281 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
282 : NULL, NULL, NULL, NULL, NULL, "#1025", /* "JO" */
283 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
284 : "#1102", "#1072", "#1073", "#1094", "#1076", "#1077", "#1092",
285 : "#1075", "#1093", "#1080", "#1081", "#1082", "#1083", "#1084",
286 : "#1085", "#1086", "#1087", "#1103", "#1088", "#1089", "#1090",
287 : "#1091", "#1078", "#1074", "#1100", "#1099", "#1079", "#1096",
288 : "#1101", "#1097", "#1095", "#1098", "#1070", "#1040", "#1041",
289 : "#1062", "#1044", "#1045", "#1060", "#1043", "#1061", "#1048",
290 : "#1049", "#1050", "#1051", "#1052", "#1053", "#1054", "#1055",
291 : "#1071", "#1056", "#1057", "#1058", "#1059", "#1046", "#1042",
292 : "#1068", "#1067", "#1047", "#1064", "#1069", "#1065", "#1063",
293 : "#1066"
294 : };
295 :
/* Replacement strings for cp1251 bytes 0x80-0xff (range from entity_map):
 * a mix of entity names and numeric references ("#NNNN"); the one NULL
 * slot has no mapping. */
296 : static entity_table_t ent_cp_1251[] = {
297 : "#1026", "#1027", "#8218", "#1107", "#8222", "hellip", "dagger",
298 : "Dagger", "euro", "permil", "#1033", "#8249", "#1034", "#1036",
299 : "#1035", "#1039", "#1106", "#8216", "#8217", "#8219", "#8220",
300 : "bull", "ndash", "mdash", NULL, "trade", "#1113", "#8250",
301 : "#1114", "#1116", "#1115", "#1119", "nbsp", "#1038", "#1118",
302 : "#1032", "curren", "#1168", "brvbar", "sect", "#1025", "copy",
303 : "#1028", "laquo", "not", "shy", "reg", "#1031", "deg", "plusmn",
304 : "#1030", "#1110", "#1169", "micro", "para", "middot", "#1105",
305 : "#8470", "#1108", "raquo", "#1112", "#1029", "#1109", "#1111",
306 : "#1040", "#1041", "#1042", "#1043", "#1044", "#1045", "#1046",
307 : "#1047", "#1048", "#1049", "#1050", "#1051", "#1052", "#1053",
308 : "#1054", "#1055", "#1056", "#1057", "#1058", "#1059", "#1060",
309 : "#1061", "#1062", "#1063", "#1064", "#1065", "#1066", "#1067",
310 : "#1068", "#1069", "#1070", "#1071", "#1072", "#1073", "#1074",
311 : "#1075", "#1076", "#1077", "#1078", "#1079", "#1080", "#1081",
312 : "#1082", "#1083", "#1084", "#1085", "#1086", "#1087", "#1088",
313 : "#1089", "#1090", "#1091", "#1092", "#1093", "#1094", "#1095",
314 : "#1096", "#1097", "#1098", "#1099", "#1100", "#1101", "#1102",
315 : "#1103"
316 : };
317 :
/* Numeric references for iso-8859-5 bytes 0xc0-0xff (range from
 * entity_map): consecutive values "#1056" through "#1119". */
318 : static entity_table_t ent_iso_8859_5[] = {
319 : "#1056", "#1057", "#1058", "#1059", "#1060", "#1061", "#1062",
320 : "#1063", "#1064", "#1065", "#1066", "#1067", "#1068", "#1069",
321 : "#1070", "#1071", "#1072", "#1073", "#1074", "#1075", "#1076",
322 : "#1077", "#1078", "#1079", "#1080", "#1081", "#1082", "#1083",
323 : "#1084", "#1085", "#1086", "#1087", "#1088", "#1089", "#1090",
324 : "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097",
325 : "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#1104",
326 : "#1105", "#1106", "#1107", "#1108", "#1109", "#1110", "#1111",
327 : "#1112", "#1113", "#1114", "#1115", "#1116", "#1117", "#1118",
328 : "#1119"
329 : };
330 :
/* Numeric references for cp866 bytes 0xc0-0xff (range from entity_map). */
331 : static entity_table_t ent_cp_866[] = {
332 :
333 : "#9492", "#9524", "#9516", "#9500", "#9472", "#9532", "#9566",
334 : "#9567", "#9562", "#9556", "#9577", "#9574", "#9568", "#9552",
335 : "#9580", "#9575", "#9576", "#9572", "#9573", "#9561", "#9560",
336 : "#9554", "#9555", "#9579", "#9578", "#9496", "#9484", "#9608",
337 : "#9604", "#9612", "#9616", "#9600", "#1088", "#1089", "#1090",
338 : "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097",
339 : "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#1025",
340 : "#1105", "#1028", "#1108", "#1031", "#1111", "#1038", "#1118",
341 : "#176", "#8729", "#183", "#8730", "#8470", "#164", "#9632",
342 : "#160"
343 : };
344 :
345 : /* MacRoman has a couple of low-ascii chars that need mapping too */
346 : /* Vertical tab (ASCII 11) is often used to store line breaks inside */
347 : /* DB exports, this mapping changes it to a space */
/* entity_map registers this table for codes 0x0b-0xff, so slot 0 is the
 * vertical tab ("sp"). Within the low range only "quot", "amp", "lt" and
 * "gt" are named; the dense part starts at "Auml". */
348 : static entity_table_t ent_macroman[] = {
349 : "sp", NULL, NULL, NULL,
350 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
351 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
352 : NULL, NULL, NULL, NULL, NULL, "quot", NULL,
353 : NULL, NULL, "amp", NULL, NULL, NULL, NULL,
354 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
355 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
356 : NULL, NULL, NULL, "lt", NULL, "gt", NULL,
357 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
358 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
359 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
360 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
361 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
362 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
363 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
364 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
365 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
366 : NULL, "Auml", "Aring", "Ccedil", "Eacute", "Ntilde", "Ouml",
367 : "Uuml", "aacute", "agrave", "acirc", "auml", "atilde", "aring",
368 : "ccedil", "eacute", "egrave", "ecirc", "euml", "iacute", "igrave",
369 : "icirc", "iuml", "ntilde", "oacute", "ograve", "ocirc", "ouml",
370 : "otilde", "uacute", "ugrave", "ucirc", "uuml", "dagger", "deg",
371 : "cent", "pound", "sect", "bull", "para", "szlig", "reg",
372 : "copy", "trade", "acute", "uml", "ne", "AElig", "Oslash",
373 : "infin", "plusmn", "le", "ge", "yen", "micro", "part",
374 : "sum", "prod", "pi", "int", "ordf", "ordm", "Omega",
375 : "aelig", "oslash", "iquest", "iexcl", "not", "radic", "fnof",
376 : "asymp", "#8710", "laquo", "raquo", "hellip", "nbsp", "Agrave",
377 : "Atilde", "Otilde", "OElig", "oelig", "ndash", "mdash", "ldquo",
378 : "rdquo", "lsquo", "rsquo", "divide", "loz", "yuml", "Yuml",
379 : "frasl", "euro", "lsaquo", "rsaquo", "#xFB01", "#xFB02", "Dagger",
380 : "middot", "sbquo", "bdquo", "permil", "Acirc", "Ecirc", "Aacute",
381 : "Euml", "Egrave", "Iacute", "Icirc", "Iuml", "Igrave", "Oacute",
382 : "Ocirc", "#xF8FF", "Ograve", "Uacute", "Ucirc", "Ugrave", "#305",
383 : "circ", "tilde", "macr", "#728", "#729", "#730", "cedil",
384 : "#733", "#731", "#711"
385 : };
386 :
/* One contiguous range of character codes of a charset together with the
 * table that names them; per the field comments, table[0] belongs to
 * basechar and the last slot to endchar. */
387 : struct html_entity_map {
388 : enum entity_charset charset; /* charset identifier */
389 : unsigned int basechar; /* char code at start of table */
390 : unsigned int endchar; /* last char code in the table */
391 : entity_table_t *table; /* the table of mappings */
392 : };
393 :
/* Registry of all entity tables: one row per (charset, code range).
 * A charset may own several rows (cp1252 has two, utf-8 has ten), and
 * several charsets share ent_iso_8859_1 for 0xa0-0xff. The list ends with
 * a row whose charset is cs_terminator. */
394 : static const struct html_entity_map entity_map[] = {
395 : { cs_cp1252, 0x80, 0x9f, ent_cp_1252 },
396 : { cs_cp1252, 0xa0, 0xff, ent_iso_8859_1 },
397 : { cs_8859_1, 0xa0, 0xff, ent_iso_8859_1 },
398 : { cs_8859_15, 0xa0, 0xff, ent_iso_8859_15 },
399 : { cs_utf_8, 0xa0, 0xff, ent_iso_8859_1 },
400 : { cs_utf_8, 338, 402, ent_uni_338_402 },
401 : { cs_utf_8, 710, 732, ent_uni_spacing },
402 : { cs_utf_8, 913, 982, ent_uni_greek },
403 : { cs_utf_8, 8194, 8260, ent_uni_punct },
404 : { cs_utf_8, 8364, 8364, ent_uni_euro },
405 : { cs_utf_8, 8465, 8501, ent_uni_8465_8501 },
406 : { cs_utf_8, 8592, 9002, ent_uni_8592_9002 },
407 : { cs_utf_8, 9674, 9674, ent_uni_9674 },
408 : { cs_utf_8, 9824, 9830, ent_uni_9824_9830 },
409 : { cs_big5, 0xa0, 0xff, ent_iso_8859_1 },
410 : { cs_gb2312, 0xa0, 0xff, ent_iso_8859_1 },
411 : { cs_big5hkscs, 0xa0, 0xff, ent_iso_8859_1 },
412 : { cs_sjis, 0xa0, 0xff, ent_iso_8859_1 },
413 : { cs_eucjp, 0xa0, 0xff, ent_iso_8859_1 },
414 : { cs_koi8r, 0xa3, 0xff, ent_koi8r },
415 : { cs_cp1251, 0x80, 0xff, ent_cp_1251 },
416 : { cs_8859_5, 0xc0, 0xff, ent_iso_8859_5 },
417 : { cs_cp866, 0xc0, 0xff, ent_cp_866 },
418 : { cs_macroman, 0x0b, 0xff, ent_macroman },
419 : { cs_terminator }
420 : };
421 :
/* Charset names accepted by determine_charset() and the entity_charset
 * each one selects. determine_charset() matches a name only when the
 * lengths are equal and strncasecmp() reports equality, so the spelling's
 * letter case does not matter. The list ends with a NULL codeset. */
422 : static const struct {
423 : const char *codeset;
424 : enum entity_charset charset;
425 : } charset_map[] = {
426 : { "ISO-8859-1", cs_8859_1 },
427 : { "ISO8859-1", cs_8859_1 },
428 : { "ISO-8859-15", cs_8859_15 },
429 : { "ISO8859-15", cs_8859_15 },
430 : { "utf-8", cs_utf_8 },
431 : { "cp1252", cs_cp1252 },
432 : { "Windows-1252", cs_cp1252 },
433 : { "1252", cs_cp1252 },
434 : { "BIG5", cs_big5 },
435 : { "950", cs_big5 },
436 : { "GB2312", cs_gb2312 },
437 : { "936", cs_gb2312 },
438 : { "BIG5-HKSCS", cs_big5hkscs },
439 : { "Shift_JIS", cs_sjis },
440 : { "SJIS", cs_sjis },
441 : { "932", cs_sjis },
442 : { "EUCJP", cs_eucjp },
443 : { "EUC-JP", cs_eucjp },
444 : { "KOI8-R", cs_koi8r },
445 : { "koi8-ru", cs_koi8r },
446 : { "koi8r", cs_koi8r },
447 : { "cp1251", cs_cp1251 },
448 : { "Windows-1251", cs_cp1251 },
449 : { "win-1251", cs_cp1251 },
450 : { "iso8859-5", cs_8859_5 },
451 : { "iso-8859-5", cs_8859_5 },
452 : { "cp866", cs_cp866 },
453 : { "866", cs_cp866 },
454 : { "ibm866", cs_cp866 },
455 : { "MacRoman", cs_macroman },
456 : { NULL }
457 : };
458 :
459 : static const struct {
460 : unsigned short charcode;
461 : char *entity;
462 : int entitylen;
463 : int flags;
464 : } basic_entities[] = {
465 : { '"', """, 6, ENT_HTML_QUOTE_DOUBLE },
466 : { '\'', "'", 6, ENT_HTML_QUOTE_SINGLE },
467 : { '\'', "'", 5, ENT_HTML_QUOTE_SINGLE },
468 : { '<', "<", 4, 0 },
469 : { '>', ">", 4, 0 },
470 : { 0, NULL, 0, 0 }
471 : };
472 :
/* A character code paired with entity text held in an inline 8-byte
 * buffer and that text's length. Its users are not in this part of the
 * file. */
473 : struct basic_entities_dec {
474 : unsigned short charcode;
475 : char entity[8];
476 : int entitylen;
477 : };
478 :
/* Helper macros for get_next_char(). They expand inside that function and
 * use its locals (pos, mbpos, mbspace, this_char) and parameters (str_len,
 * newpos, mbseq, mbseqlen, status) by name. Each one is wrapped in
 * do { } while (0) so it behaves as a single statement, including inside an
 * unbraced if/else. */

/* finish successfully: publish the new position and the NUL-terminated
 * byte sequence collected so far, then return the character code */
#define MB_RETURN do { \
		*newpos = pos; \
		mbseq[mbpos] = '\0'; \
		*mbseqlen = mbpos; \
		return this_char; \
	} while (0)

/* append one byte to mbseq; one slot is kept free for the terminating NUL,
 * so when the space is used up the function returns early via MB_RETURN */
#define MB_WRITE(mbchar) do { \
		mbspace--; \
		if (mbspace == 0) { \
			MB_RETURN; \
		} \
		mbseq[mbpos++] = (mbchar); \
	} while (0)

/* skip one byte and return */
#define MB_FAILURE(pos) do { \
		*newpos = (pos) + 1; \
		*status = FAILURE; \
		return 0; \
	} while (0)

/* fail unless at least chars_need bytes remain at pos; on failure the
 * reported position is pos + 1, or pos itself when chars_need < 1 */
#define CHECK_LEN(pos, chars_need) do { \
		if ((str_len - (pos)) < (chars_need)) { \
			*newpos = ((chars_need) < 1) ? (pos) : (pos) + 1; \
			*status = FAILURE; \
			return 0; \
		} \
	} while (0)
513 :
514 : /* {{{ get_next_char
515 : */
516 : inline static unsigned int get_next_char(enum entity_charset charset,
517 : unsigned char * str,
518 : int str_len,
519 : int * newpos,
520 : unsigned char * mbseq,
521 : int * mbseqlen,
522 : int *status)
523 38410 : {
524 38410 : int pos = *newpos;
525 38410 : int mbpos = 0;
526 38410 : int mbspace = *mbseqlen;
527 38410 : unsigned int this_char = 0;
528 : unsigned char next_char;
529 :
530 38410 : *status = SUCCESS;
531 :
532 38410 : if (mbspace <= 0) {
533 0 : *mbseqlen = 0;
534 0 : CHECK_LEN(pos, 1);
535 0 : *newpos = pos + 1;
536 0 : return str[pos];
537 : }
538 :
539 38410 : switch (charset) {
540 : case cs_utf_8:
541 : {
542 : unsigned char c;
543 34032 : CHECK_LEN(pos, 1);
544 34032 : c = str[pos];
545 34032 : if (c < 0x80) {
546 26040 : MB_WRITE(c);
547 26040 : this_char = c;
548 26040 : pos++;
549 7992 : } else if (c < 0xc0) {
550 24 : MB_FAILURE(pos);
551 7968 : } else if (c < 0xe0) {
552 7871 : CHECK_LEN(pos, 2);
553 7863 : if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
554 4 : MB_FAILURE(pos);
555 : }
556 7859 : this_char = ((c & 0x1f) << 6) | (str[pos + 1] & 0x3f);
557 7859 : if (this_char < 0x80) {
558 0 : MB_FAILURE(pos);
559 : }
560 7859 : MB_WRITE((unsigned char)c);
561 7859 : MB_WRITE((unsigned char)str[pos + 1]);
562 7859 : pos += 2;
563 97 : } else if (c < 0xf0) {
564 41 : CHECK_LEN(pos, 3);
565 17 : if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
566 0 : MB_FAILURE(pos);
567 : }
568 17 : if (str[pos + 2] < 0x80 || str[pos + 2] > 0xbf) {
569 0 : MB_FAILURE(pos);
570 : }
571 17 : this_char = ((c & 0x0f) << 12) | ((str[pos + 1] & 0x3f) << 6) | (str[pos + 2] & 0x3f);
572 17 : if (this_char < 0x800) {
573 4 : MB_FAILURE(pos);
574 : }
575 13 : MB_WRITE((unsigned char)c);
576 13 : MB_WRITE((unsigned char)str[pos + 1]);
577 13 : MB_WRITE((unsigned char)str[pos + 2]);
578 13 : pos += 3;
579 56 : } else if (c < 0xf8) {
580 16 : CHECK_LEN(pos, 4);
581 4 : if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
582 0 : MB_FAILURE(pos);
583 : }
584 4 : if (str[pos + 2] < 0x80 || str[pos + 2] > 0xbf) {
585 0 : MB_FAILURE(pos);
586 : }
587 4 : if (str[pos + 3] < 0x80 || str[pos + 3] > 0xbf) {
588 0 : MB_FAILURE(pos);
589 : }
590 4 : this_char = ((c & 0x07) << 18) | ((str[pos + 1] & 0x3f) << 12) | ((str[pos + 2] & 0x3f) << 6) | (str[pos + 3] & 0x3f);
591 4 : if (this_char < 0x10000) {
592 0 : MB_FAILURE(pos);
593 : }
594 4 : MB_WRITE((unsigned char)c);
595 4 : MB_WRITE((unsigned char)str[pos + 1]);
596 4 : MB_WRITE((unsigned char)str[pos + 2]);
597 4 : MB_WRITE((unsigned char)str[pos + 3]);
598 4 : pos += 4;
599 : } else {
600 40 : MB_FAILURE(pos);
601 : }
602 : }
603 33916 : break;
604 : case cs_big5:
605 : case cs_gb2312:
606 : case cs_big5hkscs:
607 : {
608 0 : CHECK_LEN(pos, 1);
609 0 : this_char = str[pos++];
610 : /* check if this is the first of a 2-byte sequence */
611 0 : if (this_char >= 0x81 && this_char <= 0xfe) {
612 : /* peek at the next char */
613 0 : CHECK_LEN(pos, 1);
614 0 : next_char = str[pos++];
615 0 : if ((next_char >= 0x40 && next_char <= 0x7e) ||
616 : (next_char >= 0xa1 && next_char <= 0xfe)) {
617 : /* yes, this a wide char */
618 0 : MB_WRITE(this_char);
619 0 : MB_WRITE(next_char);
620 0 : this_char = (this_char << 8) | next_char;
621 : } else {
622 0 : MB_FAILURE(pos);
623 : }
624 : } else {
625 0 : MB_WRITE(this_char);
626 : }
627 : }
628 0 : break;
629 : case cs_sjis:
630 : {
631 4 : CHECK_LEN(pos, 1);
632 4 : this_char = str[pos++];
633 : /* check if this is the first of a 2-byte sequence */
634 4 : if ((this_char >= 0x81 && this_char <= 0x9f) ||
635 : (this_char >= 0xe0 && this_char <= 0xfc)) {
636 : /* peek at the next char */
637 0 : CHECK_LEN(pos, 1);
638 0 : next_char = str[pos++];
639 0 : if ((next_char >= 0x40 && next_char <= 0x7e) ||
640 : (next_char >= 0x80 && next_char <= 0xfc))
641 : {
642 : /* yes, this a wide char */
643 0 : MB_WRITE(this_char);
644 0 : MB_WRITE(next_char);
645 0 : this_char = (this_char << 8) | next_char;
646 : } else {
647 0 : MB_FAILURE(pos);
648 : }
649 : } else {
650 4 : MB_WRITE(this_char);
651 : }
652 4 : break;
653 : }
654 : case cs_eucjp:
655 : {
656 0 : CHECK_LEN(pos, 1);
657 0 : this_char = str[pos++];
658 : /* check if this is the first of a multi-byte sequence */
659 0 : if (this_char >= 0xa1 && this_char <= 0xfe) {
660 : /* peek at the next char */
661 0 : CHECK_LEN(pos, 1);
662 0 : next_char = str[pos++];
663 0 : if (next_char >= 0xa1 && next_char <= 0xfe) {
664 : /* yes, this a jis kanji char */
665 0 : MB_WRITE(this_char);
666 0 : MB_WRITE(next_char);
667 0 : this_char = (this_char << 8) | next_char;
668 : } else {
669 0 : MB_FAILURE(pos);
670 : }
671 0 : } else if (this_char == 0x8e) {
672 : /* peek at the next char */
673 0 : CHECK_LEN(pos, 1);
674 0 : next_char = str[pos++];
675 0 : if (next_char >= 0xa1 && next_char <= 0xdf) {
676 : /* JIS X 0201 kana */
677 0 : MB_WRITE(this_char);
678 0 : MB_WRITE(next_char);
679 0 : this_char = (this_char << 8) | next_char;
680 : } else {
681 0 : MB_FAILURE(pos);
682 : }
683 0 : } else if (this_char == 0x8f) {
684 : /* peek at the next two char */
685 : unsigned char next2_char;
686 0 : CHECK_LEN(pos, 2);
687 0 : next_char = str[pos];
688 0 : next2_char = str[pos + 1];
689 0 : pos += 2;
690 0 : if ((next_char >= 0xa1 && next_char <= 0xfe) &&
691 : (next2_char >= 0xa1 && next2_char <= 0xfe)) {
692 : /* JIS X 0212 hojo-kanji */
693 0 : MB_WRITE(this_char);
694 0 : MB_WRITE(next_char);
695 0 : MB_WRITE(next2_char);
696 0 : this_char = (this_char << 16) | (next_char << 8) | next2_char;
697 : } else {
698 0 : MB_FAILURE(pos);
699 : }
700 : } else {
701 0 : MB_WRITE(this_char);
702 : }
703 0 : break;
704 : }
705 : default:
706 : /* single-byte charsets */
707 4374 : CHECK_LEN(pos, 1);
708 4374 : this_char = str[pos++];
709 4374 : MB_WRITE(this_char);
710 : break;
711 : }
712 38294 : MB_RETURN;
713 : }
714 : /* }}} */
715 :
716 : /* {{{ entity_charset determine_charset
717 : * returns the charset identifier based on current locale or a hint.
718 : * defaults to iso-8859-1 */
719 : static enum entity_charset determine_charset(char *charset_hint TSRMLS_DC)
720 6089 : {
721 : int i;
722 6089 : enum entity_charset charset = cs_8859_1;
723 6089 : int len = 0;
724 6089 : zval *uf_result = NULL;
725 :
726 : /* Guarantee default behaviour for backwards compatibility */
727 6089 : if (charset_hint == NULL)
728 533 : return cs_8859_1;
729 :
730 5556 : if ((len = strlen(charset_hint)) != 0) {
731 5555 : goto det_charset;
732 : }
733 : #if HAVE_MBSTRING
734 : #if !defined(COMPILE_DL_MBSTRING)
735 : /* XXX: Ugly things. Why don't we look for a more sophisticated way? */
736 1 : switch (MBSTRG(current_internal_encoding)) {
737 : case mbfl_no_encoding_8859_1:
738 0 : return cs_8859_1;
739 :
740 : case mbfl_no_encoding_utf8:
741 0 : return cs_utf_8;
742 :
743 : case mbfl_no_encoding_euc_jp:
744 : case mbfl_no_encoding_eucjp_win:
745 0 : return cs_eucjp;
746 :
747 : case mbfl_no_encoding_sjis:
748 : case mbfl_no_encoding_sjis_win:
749 : case mbfl_no_encoding_sjis_mac:
750 0 : return cs_sjis;
751 :
752 : case mbfl_no_encoding_cp1252:
753 0 : return cs_cp1252;
754 :
755 : case mbfl_no_encoding_8859_15:
756 0 : return cs_8859_15;
757 :
758 : case mbfl_no_encoding_big5:
759 0 : return cs_big5;
760 :
761 : case mbfl_no_encoding_euc_cn:
762 : case mbfl_no_encoding_hz:
763 : case mbfl_no_encoding_cp936:
764 0 : return cs_gb2312;
765 :
766 : case mbfl_no_encoding_koi8r:
767 0 : return cs_koi8r;
768 :
769 : case mbfl_no_encoding_cp866:
770 0 : return cs_cp866;
771 :
772 : case mbfl_no_encoding_cp1251:
773 1 : return cs_cp1251;
774 :
775 : case mbfl_no_encoding_8859_5:
776 0 : return cs_8859_5;
777 :
778 : default:
779 : ;
780 : }
781 : #else
782 : {
783 : zval nm_mb_internal_encoding;
784 :
785 : ZVAL_STRING(&nm_mb_internal_encoding, "mb_internal_encoding", 0);
786 :
787 : if (call_user_function_ex(CG(function_table), NULL, &nm_mb_internal_encoding, &uf_result, 0, NULL, 1, NULL TSRMLS_CC) != FAILURE) {
788 :
789 : charset_hint = Z_STRVAL_P(uf_result);
790 : len = Z_STRLEN_P(uf_result);
791 :
792 : if (len == 4) { /* sizeof(none|auto|pass)-1 */
793 : if (!memcmp("pass", charset_hint, sizeof("pass") - 1) ||
794 : !memcmp("auto", charset_hint, sizeof("auto") - 1) ||
795 : !memcmp("none", charset_hint, sizeof("none") - 1)) {
796 :
797 : charset_hint = NULL;
798 : len = 0;
799 : }
800 : }
801 : goto det_charset;
802 : }
803 : }
804 : #endif
805 : #endif
806 :
807 0 : charset_hint = SG(default_charset);
808 0 : if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) {
809 0 : goto det_charset;
810 : }
811 :
812 : /* try to detect the charset for the locale */
813 : #if HAVE_NL_LANGINFO && HAVE_LOCALE_H && defined(CODESET)
814 0 : charset_hint = nl_langinfo(CODESET);
815 0 : if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) {
816 0 : goto det_charset;
817 : }
818 : #endif
819 :
820 : #if HAVE_LOCALE_H
821 : /* try to figure out the charset from the locale */
822 : {
823 : char *localename;
824 : char *dot, *at;
825 :
826 : /* lang[_territory][.codeset][@modifier] */
827 0 : localename = setlocale(LC_CTYPE, NULL);
828 :
829 0 : dot = strchr(localename, '.');
830 0 : if (dot) {
831 0 : dot++;
832 : /* locale specifies a codeset */
833 0 : at = strchr(dot, '@');
834 0 : if (at)
835 0 : len = at - dot;
836 : else
837 0 : len = strlen(dot);
838 0 : charset_hint = dot;
839 : } else {
840 : /* no explicit name; see if the name itself
841 : * is the charset */
842 0 : charset_hint = localename;
843 0 : len = strlen(charset_hint);
844 : }
845 : }
846 : #endif
847 :
848 5555 : det_charset:
849 :
850 5555 : if (charset_hint) {
851 5555 : int found = 0;
852 :
853 : /* now walk the charset map and look for the codeset */
854 27979 : for (i = 0; charset_map[i].codeset; i++) {
855 27973 : if (len == strlen(charset_map[i].codeset) && strncasecmp(charset_hint, charset_map[i].codeset, len) == 0) {
856 5549 : charset = charset_map[i].charset;
857 5549 : found = 1;
858 5549 : break;
859 : }
860 : }
861 5555 : if (!found) {
862 6 : php_error_docref(NULL TSRMLS_CC, E_WARNING, "charset `%s' not supported, assuming iso-8859-1",
863 : charset_hint);
864 : }
865 : }
866 5555 : if (uf_result != NULL) {
867 0 : zval_ptr_dtor(&uf_result);
868 : }
869 5555 : return charset;
870 : }
871 : /* }}} */
872 :
/* {{{ php_utf32_utf8
 * Encodes the code point k into buf as a UTF-8 byte sequence and appends a
 * NUL terminator. Returns the number of bytes written, not counting the NUL.
 *
 * buf must have room for 7 bytes: values of 0x200000 and above are still
 * emitted in the legacy 5- and 6-byte forms. A negative k is not a code
 * point; nothing is encoded for it, buf[0] is set to NUL and 0 is returned.
 */
size_t php_utf32_utf8(unsigned char *buf, int k)
{
	unsigned int cp;
	size_t retval;

	if (k < 0) {
		/* previously this fell into the one-byte branch and produced a
		 * stray high byte, i.e. malformed UTF-8 */
		buf[0] = '\0';
		return 0;
	}

	/* work on an unsigned copy so that every shift below is well defined */
	cp = (unsigned int)k;

	if (cp < 0x80) {
		buf[0] = (unsigned char)cp;
		retval = 1;
	} else if (cp < 0x800) {
		buf[0] = (unsigned char)(0xc0 | (cp >> 6));
		buf[1] = (unsigned char)(0x80 | (cp & 0x3f));
		retval = 2;
	} else if (cp < 0x10000) {
		buf[0] = (unsigned char)(0xe0 | (cp >> 12));
		buf[1] = (unsigned char)(0x80 | ((cp >> 6) & 0x3f));
		buf[2] = (unsigned char)(0x80 | (cp & 0x3f));
		retval = 3;
	} else if (cp < 0x200000) {
		buf[0] = (unsigned char)(0xf0 | (cp >> 18));
		buf[1] = (unsigned char)(0x80 | ((cp >> 12) & 0x3f));
		buf[2] = (unsigned char)(0x80 | ((cp >> 6) & 0x3f));
		buf[3] = (unsigned char)(0x80 | (cp & 0x3f));
		retval = 4;
	} else if (cp < 0x4000000) {
		buf[0] = (unsigned char)(0xf8 | (cp >> 24));
		buf[1] = (unsigned char)(0x80 | ((cp >> 18) & 0x3f));
		buf[2] = (unsigned char)(0x80 | ((cp >> 12) & 0x3f));
		buf[3] = (unsigned char)(0x80 | ((cp >> 6) & 0x3f));
		buf[4] = (unsigned char)(0x80 | (cp & 0x3f));
		retval = 5;
	} else {
		buf[0] = (unsigned char)(0xfc | (cp >> 30));
		buf[1] = (unsigned char)(0x80 | ((cp >> 24) & 0x3f));
		buf[2] = (unsigned char)(0x80 | ((cp >> 18) & 0x3f));
		buf[3] = (unsigned char)(0x80 | ((cp >> 12) & 0x3f));
		buf[4] = (unsigned char)(0x80 | ((cp >> 6) & 0x3f));
		buf[5] = (unsigned char)(0x80 | (cp & 0x3f));
		retval = 6;
	}
	buf[retval] = '\0';

	return retval;
}
/* }}} */
917 :
918 : /* {{{ php_unescape_html_entities
919 : */
920 : PHPAPI char *php_unescape_html_entities(char *orig, int oldlen, int *newlen, int all, int quote_style, char *hint_charset TSRMLS_DC)
921 20 : {
922 20 : unsigned char *old = (unsigned char*)orig;
923 : int retlen;
924 : int j, k;
925 : char *replaced, *ret, *p, *q, *lim, *next;
926 20 : enum entity_charset charset = determine_charset(hint_charset TSRMLS_CC);
927 : unsigned char replacement[15];
928 : int replacement_len;
929 :
930 20 : ret = estrndup((char*)old, oldlen);
931 20 : retlen = oldlen;
932 20 : if (!retlen) {
933 2 : goto empty_source;
934 : }
935 :
936 18 : if (all) {
937 : /* look for a match in the maps for this charset */
938 450 : for (j = 0; entity_map[j].charset != cs_terminator; j++) {
939 432 : if (entity_map[j].charset != charset)
940 252 : continue;
941 :
942 14184 : for (k = entity_map[j].basechar; k <= entity_map[j].endchar; k++) {
943 : char entity[32];
944 14004 : int entity_length = 0;
945 :
946 14004 : if (entity_map[j].table[k - entity_map[j].basechar] == NULL)
947 8478 : continue;
948 :
949 5526 : entity[0] = '&';
950 5526 : entity_length = strlen(entity_map[j].table[k - entity_map[j].basechar]);
951 5526 : strncpy(&entity[1], entity_map[j].table[k - entity_map[j].basechar], sizeof(entity) - 2);
952 5526 : entity[entity_length+1] = ';';
953 5526 : entity[entity_length+2] = '\0';
954 5526 : entity_length += 2;
955 :
956 : /* When we have MBCS entities in the tables above, this will need to handle it */
957 5526 : replacement_len = 0;
958 5526 : switch (charset) {
959 : case cs_8859_1:
960 : case cs_cp1252:
961 : case cs_8859_15:
962 : case cs_cp1251:
963 : case cs_8859_5:
964 : case cs_cp866:
965 : case cs_koi8r:
966 0 : replacement[0] = k;
967 0 : replacement[1] = '\0';
968 0 : replacement_len = 1;
969 0 : break;
970 :
971 : case cs_big5:
972 : case cs_gb2312:
973 : case cs_big5hkscs:
974 : case cs_sjis:
975 : case cs_eucjp:
976 : /* we cannot properly handle those multibyte encodings
977 : * with php_str_to_str. skip it. */
978 0 : continue;
979 :
980 : case cs_utf_8:
981 5526 : replacement_len = php_utf32_utf8(replacement, k);
982 5526 : break;
983 :
984 : default:
985 0 : php_error_docref(NULL TSRMLS_CC, E_WARNING, "cannot yet handle MBCS!");
986 0 : efree(ret);
987 0 : return NULL;
988 : }
989 :
990 5526 : if (php_memnstr(ret, entity, entity_length, ret+retlen)) {
991 18 : replaced = php_str_to_str(ret, retlen, entity, entity_length, (char*)replacement, replacement_len, &retlen);
992 18 : efree(ret);
993 18 : ret = replaced;
994 : }
995 : }
996 : }
997 : }
998 :
999 108 : for (j = 0; basic_entities[j].charcode != 0; j++) {
1000 :
1001 90 : if (basic_entities[j].flags && (quote_style & basic_entities[j].flags) == 0)
1002 12 : continue;
1003 :
1004 78 : replacement[0] = (unsigned char)basic_entities[j].charcode;
1005 78 : replacement[1] = '\0';
1006 :
1007 78 : if (php_memnstr(ret, basic_entities[j].entity, basic_entities[j].entitylen, ret+retlen)) {
1008 0 : replaced = php_str_to_str(ret, retlen, basic_entities[j].entity, basic_entities[j].entitylen, (char*)replacement, 1, &retlen);
1009 0 : efree(ret);
1010 0 : ret = replaced;
1011 : }
1012 : }
1013 :
1014 : /* replace numeric entities & "&" */
1015 18 : lim = ret + retlen;
1016 124 : for (p = ret, q = ret; p < lim;) {
1017 : int code;
1018 :
1019 88 : if (p[0] == '&') {
1020 6 : if (p + 2 < lim) {
1021 6 : if (p[1] == '#') {
1022 0 : int invalid_code = 0;
1023 :
1024 0 : if (p[2] == 'x' || p[2] == 'X') {
1025 0 : code = strtol(p + 3, &next, 16);
1026 : } else {
1027 0 : code = strtol(p + 2, &next, 10);
1028 : }
1029 :
1030 0 : if (next != NULL && *next == ';') {
1031 0 : switch (charset) {
1032 : case cs_utf_8:
1033 0 : q += php_utf32_utf8((unsigned char*)q, code);
1034 0 : break;
1035 :
1036 : case cs_8859_1:
1037 : case cs_8859_5:
1038 : case cs_8859_15:
1039 0 : if ((code >= 0x80 && code < 0xa0) || code > 0xff) {
1040 0 : invalid_code = 1;
1041 : } else {
1042 0 : if (code == 39 || !quote_style) {
1043 0 : invalid_code = 1;
1044 : } else {
1045 0 : *(q++) = code;
1046 : }
1047 : }
1048 0 : break;
1049 :
1050 : case cs_cp1252:
1051 0 : if (code > 0xff) {
1052 0 : invalid_code = 1;
1053 : } else {
1054 0 : *(q++) = code;
1055 : }
1056 0 : break;
1057 :
1058 : case cs_cp1251:
1059 : case cs_cp866:
1060 : case cs_big5:
1061 : case cs_big5hkscs:
1062 : case cs_sjis:
1063 : case cs_eucjp:
1064 0 : if (code >= 0x80) {
1065 0 : invalid_code = 1;
1066 : } else {
1067 0 : *(q++) = code;
1068 : }
1069 0 : break;
1070 :
1071 : case cs_gb2312:
1072 0 : if (code >= 0x81) {
1073 0 : invalid_code = 1;
1074 : } else {
1075 0 : *(q++) = code;
1076 : }
1077 0 : break;
1078 :
1079 : default:
1080 : /* for backwards compatilibity */
1081 0 : invalid_code = 1;
1082 : break;
1083 : }
1084 0 : if (invalid_code) {
1085 0 : for (; p <= next; p++) {
1086 0 : *(q++) = *p;
1087 : }
1088 : }
1089 0 : p = next + 1;
1090 : } else {
1091 0 : *(q++) = *(p++);
1092 0 : *(q++) = *(p++);
1093 : }
1094 12 : } else if (p + 4 < lim &&
1095 : p[1] == 'a' && p[2] == 'm' &&p[3] == 'p' &&
1096 : p[4] == ';') {
1097 6 : *(q++) = '&';
1098 6 : p += 5;
1099 : } else {
1100 0 : *(q++) = *(p++);
1101 0 : *(q++) = *(p++);
1102 : }
1103 : } else {
1104 0 : *(q++) = *(p++);
1105 : }
1106 : } else {
1107 82 : *(q++) = *(p++);
1108 : }
1109 : }
1110 18 : *q = '\0';
1111 18 : retlen = (size_t)(q - ret);
1112 20 : empty_source:
1113 20 : *newlen = retlen;
1114 20 : return ret;
1115 : }
1116 : /* }}} */
1117 :
1118 : /* {{{ php_escape_html_entities
1119 : */
1120 : PHPAPI char *php_escape_html_entities_ex(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset, zend_bool double_encode TSRMLS_DC)
1121 6069 : {
1122 : int i, j, maxlen, len;
1123 : char *replaced;
1124 6069 : enum entity_charset charset = determine_charset(hint_charset TSRMLS_CC);
1125 : int matches_map;
1126 :
1127 6069 : maxlen = 2 * oldlen;
1128 6069 : if (maxlen < 128)
1129 6018 : maxlen = 128;
1130 6069 : replaced = emalloc (maxlen);
1131 6069 : len = 0;
1132 6069 : i = 0;
1133 50508 : while (i < oldlen) {
1134 : unsigned char mbsequence[16]; /* allow up to 15 characters in a multibyte sequence */
1135 38410 : int mbseqlen = sizeof(mbsequence);
1136 38410 : int status = SUCCESS;
1137 38410 : unsigned int this_char = get_next_char(charset, old, oldlen, &i, mbsequence, &mbseqlen, &status);
1138 :
1139 38410 : if(status == FAILURE) {
1140 : /* invalid MB sequence */
1141 116 : if (quote_style & ENT_HTML_IGNORE_ERRORS) {
1142 76 : continue;
1143 : }
1144 40 : efree(replaced);
1145 40 : if(!PG(display_errors)) {
1146 0 : php_error_docref(NULL TSRMLS_CC, E_WARNING, "Invalid multibyte sequence in argument");
1147 : }
1148 40 : *newlen = 0;
1149 40 : return STR_EMPTY_ALLOC();
1150 : }
1151 38294 : matches_map = 0;
1152 :
1153 38294 : if (len + 16 > maxlen)
1154 1 : replaced = erealloc (replaced, maxlen += 128);
1155 :
1156 38294 : if (all) {
1157 : /* look for a match in the maps for this charset */
1158 2567 : char *rep = NULL;
1159 :
1160 :
1161 61346 : for (j = 0; entity_map[j].charset != cs_terminator; j++) {
1162 58932 : if (entity_map[j].charset == charset
1163 : && this_char >= entity_map[j].basechar
1164 : && this_char <= entity_map[j].endchar) {
1165 153 : rep = (char*)entity_map[j].table[this_char - entity_map[j].basechar];
1166 153 : if (rep == NULL) {
1167 : /* there is no entity for this position; fall through and
1168 : * just output the character itself */
1169 0 : break;
1170 : }
1171 :
1172 153 : matches_map = 1;
1173 153 : break;
1174 : }
1175 : }
1176 :
1177 2567 : if (matches_map) {
1178 153 : int l = strlen(rep);
1179 : /* increase the buffer size */
1180 153 : if (len + 2 + l >= maxlen) {
1181 0 : replaced = erealloc(replaced, maxlen += 128);
1182 : }
1183 :
1184 153 : replaced[len++] = '&';
1185 153 : strcpy(replaced + len, rep);
1186 153 : len += l;
1187 153 : replaced[len++] = ';';
1188 : }
1189 : }
1190 38294 : if (!matches_map) {
1191 38141 : int is_basic = 0;
1192 :
1193 38141 : if (this_char == '&') {
1194 385 : if (double_encode) {
1195 355 : encode_amp:
1196 355 : memcpy(replaced + len, "&", sizeof("&") - 1);
1197 355 : len += sizeof("&") - 1;
1198 : } else {
1199 50 : char *e = memchr(old + i, ';', oldlen - i);
1200 50 : char *s = (char*)old + i;
1201 :
1202 50 : if (!e || (e - s) > 10) { /* minor optimization to avoid "entities" over 10 chars in length */
1203 : goto encode_amp;
1204 : } else {
1205 40 : if (*s == '#') { /* numeric entities */
1206 12 : s++;
1207 : /* Hex (Z) */
1208 16 : if (*s == 'x' || *s == 'X') {
1209 6 : s++;
1210 20 : while (s < e) {
1211 10 : if (!isxdigit((int)*(unsigned char *)s++)) {
1212 2 : goto encode_amp;
1213 : }
1214 : }
1215 : /* Dec (Z)*/
1216 : } else {
1217 22 : while (s < e) {
1218 12 : if (!isdigit((int)*(unsigned char *)s++)) {
1219 2 : goto encode_amp;
1220 : }
1221 : }
1222 : }
1223 : } else { /* text entities */
1224 124 : while (s < e) {
1225 74 : if (!isalnum((int)*(unsigned char *)s++)) {
1226 6 : goto encode_amp;
1227 : }
1228 : }
1229 : }
1230 30 : replaced[len++] = '&';
1231 : }
1232 : }
1233 385 : is_basic = 1;
1234 : } else {
1235 225739 : for (j = 0; basic_entities[j].charcode != 0; j++) {
1236 188282 : if ((basic_entities[j].charcode != this_char) ||
1237 : (basic_entities[j].flags &&
1238 : (quote_style & basic_entities[j].flags) == 0)) {
1239 : continue;
1240 : }
1241 :
1242 299 : memcpy(replaced + len, basic_entities[j].entity, basic_entities[j].entitylen);
1243 299 : len += basic_entities[j].entitylen;
1244 :
1245 299 : is_basic = 1;
1246 299 : break;
1247 : }
1248 : }
1249 :
1250 38141 : if (!is_basic) {
1251 : /* a wide char without a named entity; pass through the original sequence */
1252 37457 : if (mbseqlen > 1) {
1253 7743 : memcpy(replaced + len, mbsequence, mbseqlen);
1254 7743 : len += mbseqlen;
1255 : } else {
1256 29714 : replaced[len++] = (unsigned char)this_char;
1257 : }
1258 : }
1259 : }
1260 : }
1261 6029 : replaced[len] = '\0';
1262 6029 : *newlen = len;
1263 :
1264 6029 : return replaced;
1265 :
1266 :
1267 : }
1268 : /* }}} */
1269 :
1270 : PHPAPI char *php_escape_html_entities(char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset TSRMLS_DC) /* {{{ */
1271 1341 : {
1272 1341 : return php_escape_html_entities_ex((unsigned char*)old, oldlen, newlen, all, quote_style, hint_charset, 1 TSRMLS_CC);
1273 : }
1274 : /* }}} */
1275 :
1276 : /* {{{ php_html_entities
1277 : */
1278 : static void php_html_entities(INTERNAL_FUNCTION_PARAMETERS, int all)
1279 4730 : {
1280 : zstr str;
1281 4730 : char *hint_charset = NULL;
1282 4730 : int str_len, hint_charset_len = 0;
1283 : char *str_utf8;
1284 : int str_utf8_len;
1285 : int len;
1286 4730 : long quote_style = ENT_COMPAT;
1287 : zend_uchar type;
1288 : char *replaced;
1289 4730 : zend_bool double_encode = 1;
1290 :
1291 4730 : if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "t|ls!b", &str, &str_len, &type, "e_style, &hint_charset, &hint_charset_len, &double_encode) == FAILURE) {
1292 2 : return;
1293 : }
1294 :
1295 4728 : if (type == IS_UNICODE) {
1296 4068 : zend_unicode_to_string(UG(utf8_conv), &str_utf8, &str_utf8_len, str.u, str_len TSRMLS_CC);
1297 4068 : str.s = str_utf8;
1298 4068 : str_len = str_utf8_len;
1299 4068 : hint_charset = "utf-8";
1300 : }
1301 :
1302 4728 : replaced = php_escape_html_entities_ex((unsigned char*)str.s, str_len, &len, all, quote_style, hint_charset, double_encode TSRMLS_CC);
1303 :
1304 4728 : if (type == IS_UNICODE) {
1305 4068 : RETVAL_U_STRINGL(UG(utf8_conv), replaced, len, ZSTR_AUTOFREE);
1306 4068 : efree(str_utf8);
1307 : } else {
1308 660 : RETVAL_STRINGL(replaced, len, 0);
1309 : }
1310 : }
1311 : /* }}} */
1312 :
/* Table selectors: exported to PHP code by register_html_constants() and
 * switched on by get_html_translation_table(). */
1313 : #define HTML_SPECIALCHARS 0
1314 : #define HTML_ENTITIES 1
1315 :
1316 : /* {{{ register_html_constants
1317 : */
1318 : void register_html_constants(INIT_FUNC_ARGS)
1319 17007 : {
1320 17007 : REGISTER_LONG_CONSTANT("HTML_SPECIALCHARS", HTML_SPECIALCHARS, CONST_PERSISTENT|CONST_CS);
1321 17007 : REGISTER_LONG_CONSTANT("HTML_ENTITIES", HTML_ENTITIES, CONST_PERSISTENT|CONST_CS);
1322 17007 : REGISTER_LONG_CONSTANT("ENT_COMPAT", ENT_COMPAT, CONST_PERSISTENT|CONST_CS);
1323 17007 : REGISTER_LONG_CONSTANT("ENT_QUOTES", ENT_QUOTES, CONST_PERSISTENT|CONST_CS);
1324 17007 : REGISTER_LONG_CONSTANT("ENT_NOQUOTES", ENT_NOQUOTES, CONST_PERSISTENT|CONST_CS);
1325 17007 : REGISTER_LONG_CONSTANT("ENT_IGNORE", ENT_IGNORE, CONST_PERSISTENT|CONST_CS);
1326 17007 : }
1327 : /* }}} */
1328 :
1329 : /* {{{ proto string htmlspecialchars(string string [, int quote_style[, string charset[, bool double_encode]]])
1330 : Convert special characters to HTML entities */
1331 : PHP_FUNCTION(htmlspecialchars)
1332 4320 : {
1333 4320 : php_html_entities(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
1334 4320 : }
1335 : /* }}} */
1336 :
1337 : /* {{{ proto string htmlspecialchars_decode(string string [, int quote_style]) U
1338 : Convert special HTML entities back to characters */
1339 : PHP_FUNCTION(htmlspecialchars_decode)
1340 113 : {
1341 : zstr str;
1342 : char *str_utf8;
1343 : int str_utf8_len;
1344 : zend_uchar type;
1345 : char *new_str, *e, *p;
1346 : int len, j, i, new_len;
1347 113 : long quote_style = ENT_COMPAT;
1348 : struct basic_entities_dec basic_entities_dec[8];
1349 :
1350 113 : if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "t|l", &str, &len, &type, "e_style) == FAILURE) {
1351 19 : return;
1352 : }
1353 :
1354 94 : if (type == IS_UNICODE) {
1355 90 : if (!u_memchr(str.u, 0x26 /*'&'*/, len)) {
1356 24 : RETURN_UNICODEL(str.u, len, 1);
1357 : }
1358 :
1359 66 : zend_unicode_to_string(UG(utf8_conv), &str_utf8, &str_utf8_len, str.u, len TSRMLS_CC);
1360 66 : new_str = str_utf8;
1361 66 : new_len = str_utf8_len;
1362 66 : p = memchr(new_str, '&', new_len);
1363 : } else {
1364 4 : new_str = estrndup(str.s, len);
1365 4 : new_len = len;
1366 :
1367 4 : if (!(p = memchr(new_str, '&', new_len))) {
1368 1 : RETURN_STRINGL(new_str, new_len, 0);
1369 : }
1370 : }
1371 :
1372 69 : e = new_str + new_len;
1373 :
1374 414 : for (j = 0, i = 0; basic_entities[i].charcode != 0; i++) {
1375 345 : if (basic_entities[i].flags && !(quote_style & basic_entities[i].flags)) {
1376 133 : continue;
1377 : }
1378 212 : basic_entities_dec[j].charcode = basic_entities[i].charcode;
1379 212 : memcpy(basic_entities_dec[j].entity, basic_entities[i].entity, basic_entities[i].entitylen + 1);
1380 212 : basic_entities_dec[j].entitylen = basic_entities[i].entitylen;
1381 212 : j++;
1382 : }
1383 69 : basic_entities_dec[j].charcode = '&';
1384 69 : basic_entities_dec[j].entitylen = sizeof("&") - 1;
1385 69 : memcpy(basic_entities_dec[j].entity, "&", sizeof("&"));
1386 69 : i = j + 1;
1387 :
1388 : do {
1389 343 : int l = e - p;
1390 :
1391 1188 : for (j = 0; j < i; j++) {
1392 1014 : if (basic_entities_dec[j].entitylen > l) {
1393 0 : continue;
1394 : }
1395 1014 : if (!memcmp(p, basic_entities_dec[j].entity, basic_entities_dec[j].entitylen)) {
1396 169 : int e_len = basic_entities_dec[j].entitylen - 1;
1397 :
1398 169 : *p++ = (char)basic_entities_dec[j].charcode;
1399 169 : memmove(p, p + e_len, (e - p - e_len));
1400 169 : e -= e_len;
1401 169 : goto done;
1402 : }
1403 : }
1404 174 : p++;
1405 :
1406 343 : done:
1407 343 : if (p >= e) {
1408 20 : break;
1409 : }
1410 323 : } while ((p = memchr(p, '&', (e - p))));
1411 :
1412 69 : new_len = e - new_str;
1413 :
1414 69 : new_str[new_len] = '\0';
1415 69 : if (type == IS_UNICODE) {
1416 66 : RETVAL_U_STRINGL(UG(utf8_conv), new_str, new_len, ZSTR_AUTOFREE);
1417 : } else {
1418 3 : RETVAL_STRINGL(new_str, new_len, 0);
1419 : }
1420 : }
1421 : /* }}} */
1422 :
1423 : /* {{{ proto string html_entity_decode(string string [, int quote_style][, string charset]) U
1424 : Convert all HTML entities to their applicable characters */
1425 : PHP_FUNCTION(html_entity_decode)
1426 20 : {
1427 : zstr str;
1428 20 : char *hint_charset = NULL;
1429 20 : int str_len, hint_charset_len = 0, len;
1430 : char *str_utf8;
1431 : int str_utf8_len;
1432 : zend_uchar type;
1433 20 : long quote_style = ENT_COMPAT;
1434 : char *replaced;
1435 :
1436 20 : if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "t|ls", &str, &str_len, &type,
1437 : "e_style, &hint_charset, &hint_charset_len) == FAILURE) {
1438 0 : return;
1439 : }
1440 :
1441 20 : if (type == IS_UNICODE) {
1442 20 : zend_unicode_to_string(UG(utf8_conv), &str_utf8, &str_utf8_len, str.u, str_len TSRMLS_CC);
1443 20 : str.s = str_utf8;
1444 20 : str_len = str_utf8_len;
1445 20 : hint_charset = "utf-8";
1446 : }
1447 :
1448 20 : replaced = php_unescape_html_entities(str.s, str_len, &len, 1, quote_style, hint_charset TSRMLS_CC);
1449 :
1450 20 : if (!replaced) {
1451 0 : RETURN_FALSE;
1452 : }
1453 :
1454 20 : if (type == IS_UNICODE) {
1455 20 : RETVAL_U_STRINGL(UG(utf8_conv), replaced, len, ZSTR_AUTOFREE);
1456 20 : efree(str_utf8);
1457 : } else {
1458 0 : RETVAL_STRINGL(replaced, len, 0);
1459 : }
1460 : }
1461 : /* }}} */
1462 :
1463 : /* {{{ proto string htmlentities(string string [, int quote_style[, string charset[, bool double_encode]]])
1464 : Convert all applicable characters to HTML entities */
1465 : PHP_FUNCTION(htmlentities)
1466 410 : {
1467 410 : php_html_entities(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
1468 410 : }
1469 : /* }}} */
1470 :
1471 : /* {{{ proto array get_html_translation_table([int table [, int quote_style]]) U
1472 : Returns the internal translation table used by htmlspecialchars and htmlentities */
1473 : PHP_FUNCTION(get_html_translation_table)
1474 0 : {
1475 0 : long which = HTML_SPECIALCHARS, quote_style = ENT_COMPAT;
1476 : int i, j;
1477 : char ind[2];
1478 0 : enum entity_charset charset = determine_charset(NULL TSRMLS_CC);
1479 : UChar32 cp;
1480 : UChar key[3];
1481 : int key_len;
1482 :
1483 0 : if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "|ll", &which, "e_style) == FAILURE) {
1484 0 : return;
1485 : }
1486 :
1487 0 : charset = cs_utf_8;
1488 :
1489 0 : array_init(return_value);
1490 :
1491 0 : ind[1] = 0;
1492 :
1493 0 : switch (which) {
1494 : case HTML_ENTITIES:
1495 0 : for (j=0; entity_map[j].charset != cs_terminator; j++) {
1496 0 : if (entity_map[j].charset != charset)
1497 0 : continue;
1498 0 : for (i = 0; i <= entity_map[j].endchar - entity_map[j].basechar; i++) {
1499 : char buffer[16];
1500 :
1501 0 : if (entity_map[j].table[i] == NULL)
1502 0 : continue;
1503 :
1504 0 : cp = (UChar)(i + entity_map[j].basechar);
1505 0 : key_len = zend_codepoint_to_uchar(cp, key);
1506 0 : key[key_len] = 0;
1507 0 : snprintf(buffer, sizeof(buffer), "&%s;", entity_map[j].table[i]);
1508 0 : add_u_assoc_ascii_string_ex(return_value, IS_UNICODE, ZSTR(key), key_len+1, buffer, 1);
1509 : }
1510 : }
1511 : /* break thru */
1512 :
1513 : case HTML_SPECIALCHARS:
1514 0 : for (j = 0; basic_entities[j].charcode != 0; j++) {
1515 :
1516 0 : if (basic_entities[j].flags && (quote_style & basic_entities[j].flags) == 0)
1517 0 : continue;
1518 :
1519 0 : ind[0] = (unsigned char)basic_entities[j].charcode;
1520 0 : add_ascii_assoc_ascii_stringl(return_value, ind, basic_entities[j].entity, basic_entities[j].entitylen, 1);
1521 : }
1522 0 : add_ascii_assoc_ascii_stringl(return_value, "&", "&", sizeof("&") - 1, 1);
1523 :
1524 : break;
1525 : }
1526 : }
1527 : /* }}} */
1528 :
1529 : /*
1530 : * Local variables:
1531 : * tab-width: 4
1532 : * c-basic-offset: 4
1533 : * End:
1534 : * vim600: sw=4 ts=4 fdm=marker
1535 : * vim<600: sw=4 ts=4
1536 : */
|