1 : /*
2 : +----------------------------------------------------------------------+
3 : | PHP Version 5 |
4 : +----------------------------------------------------------------------+
5 : | Copyright (c) 1997-2009 The PHP Group |
6 : +----------------------------------------------------------------------+
7 : | This source file is subject to version 3.01 of the PHP license, |
8 : | that is bundled with this package in the file LICENSE, and is |
9 : | available through the world-wide-web at the following url: |
10 : | http://www.php.net/license/3_01.txt |
11 : | If you did not receive a copy of the PHP license and are unable to |
12 : | obtain it through the world-wide-web, please send a note to |
13 : | license@php.net so we can mail you a copy immediately. |
14 : +----------------------------------------------------------------------+
15 : | Authors: Rasmus Lerdorf <rasmus@php.net> |
16 : | Jaakko Hyvätti <jaakko.hyvatti@iki.fi> |
17 : | Wez Furlong <wez@thebrainroom.com> |
18 : +----------------------------------------------------------------------+
19 : */
20 :
21 : /* $Id: html.c 289605 2009-10-13 05:18:37Z moriyoshi $ */
22 :
23 : /*
24 : * HTML entity resources:
25 : *
26 : * http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset2.asp
27 : * http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset3.asp
28 : * http://www.unicode.org/Public/MAPPINGS/OBSOLETE/UNI2SGML.TXT
29 : *
30 : * http://www.w3.org/TR/2002/REC-xhtml1-20020801/dtds.html#h-A2
31 : *
32 : */
33 :
34 : #include "php.h"
35 : #if PHP_WIN32
36 : #include "config.w32.h"
37 : #else
38 : #include <php_config.h>
39 : #endif
40 : #include "html.h"
41 : #include "php_string.h"
42 : #include "SAPI.h"
43 : #if HAVE_LOCALE_H
44 : #include <locale.h>
45 : #endif
46 : #if HAVE_LANGINFO_H
47 : #include <langinfo.h>
48 : #endif
49 :
50 : #if HAVE_MBSTRING
51 : # include "ext/mbstring/mbstring.h"
52 : ZEND_EXTERN_MODULE_GLOBALS(mbstring)
53 : #endif
54 :
/* Character sets the entity code knows about.  cs_terminator is the
 * sentinel value that marks the last row of entity_map[]. */
enum entity_charset {
	cs_terminator,
	cs_8859_1,
	cs_cp1252,
	cs_8859_15,
	cs_utf_8,
	cs_big5,
	cs_gb2312,
	cs_big5hkscs,
	cs_sjis,
	cs_eucjp,
	cs_koi8r,
	cs_cp1251,
	cs_8859_5,
	cs_cp866,
	cs_macroman
};
/* Element type of the entity tables below: a read-only pointer to a
 * read-only entity name, or NULL where a table has no entry. */
typedef const char * const entity_table_t;
61 :
62 : /* codepage 1252 is a Windows extension to iso-8859-1. */
63 : static entity_table_t ent_cp_1252[] = {
64 : "euro", NULL, "sbquo", "fnof", "bdquo", "hellip", "dagger",
65 : "Dagger", "circ", "permil", "Scaron", "lsaquo", "OElig",
66 : NULL, NULL, NULL, NULL, "lsquo", "rsquo", "ldquo", "rdquo",
67 : "bull", "ndash", "mdash", "tilde", "trade", "scaron", "rsaquo",
68 : "oelig", NULL, NULL, "Yuml"
69 : };
70 :
71 : static entity_table_t ent_iso_8859_1[] = {
72 : "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar",
73 : "sect", "uml", "copy", "ordf", "laquo", "not", "shy", "reg",
74 : "macr", "deg", "plusmn", "sup2", "sup3", "acute", "micro",
75 : "para", "middot", "cedil", "sup1", "ordm", "raquo", "frac14",
76 : "frac12", "frac34", "iquest", "Agrave", "Aacute", "Acirc",
77 : "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
78 : "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
79 : "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
80 : "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
81 : "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
82 : "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
83 : "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
84 : "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
85 : "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
86 : "uuml", "yacute", "thorn", "yuml"
87 : };
88 :
89 : static entity_table_t ent_iso_8859_15[] = {
90 : "nbsp", "iexcl", "cent", "pound", "euro", "yen", "Scaron",
91 : "sect", "scaron", "copy", "ordf", "laquo", "not", "shy", "reg",
92 : "macr", "deg", "plusmn", "sup2", "sup3", NULL, /* Zcaron */
93 : "micro", "para", "middot", NULL, /* zcaron */ "sup1", "ordm",
94 : "raquo", "OElig", "oelig", "Yuml", "iquest", "Agrave", "Aacute",
95 : "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
96 : "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
97 : "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
98 : "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
99 : "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
100 : "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
101 : "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
102 : "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
103 : "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
104 : "uuml", "yacute", "thorn", "yuml"
105 : };
106 :
107 : static entity_table_t ent_uni_338_402[] = {
108 : /* 338 (0x0152) */
109 : "OElig", "oelig", NULL, NULL, NULL, NULL,
110 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
111 : /* 352 (0x0160) */
112 : "Scaron", "scaron", NULL, NULL, NULL, NULL, NULL, NULL,
113 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
114 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
115 : /* 376 (0x0178) */
116 : "Yuml", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
117 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
118 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
119 : /* 400 (0x0190) */
120 : NULL, NULL, "fnof"
121 : };
122 :
123 : static entity_table_t ent_uni_spacing[] = {
124 : /* 710 */
125 : "circ",
126 : /* 711 - 730 */
127 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
128 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
129 : /* 731 - 732 */
130 : NULL, "tilde"
131 : };
132 :
133 : static entity_table_t ent_uni_greek[] = {
134 : /* 913 */
135 : "Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta",
136 : "Iota", "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi", "Rho",
137 : NULL, "Sigma", "Tau", "Upsilon", "Phi", "Chi", "Psi", "Omega",
138 : /* 938 - 944 are not mapped */
139 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
140 : "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta",
141 : "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho",
142 : "sigmaf", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega",
143 : /* 970 - 976 are not mapped */
144 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
145 : "thetasym", "upsih",
146 : NULL, NULL, NULL,
147 : "piv"
148 : };
149 :
150 : static entity_table_t ent_uni_punct[] = {
151 : /* 8194 */
152 : "ensp", "emsp", NULL, NULL, NULL, NULL, NULL,
153 : "thinsp", NULL, NULL, "zwnj", "zwj", "lrm", "rlm",
154 : NULL, NULL, NULL, "ndash", "mdash", NULL, NULL, NULL,
155 : /* 8216 */
156 : "lsquo", "rsquo", "sbquo", NULL, "ldquo", "rdquo", "bdquo", NULL,
157 : "dagger", "Dagger", "bull", NULL, NULL, NULL, "hellip",
158 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "permil", NULL,
159 : /* 8242 */
160 : "prime", "Prime", NULL, NULL, NULL, NULL, NULL, "lsaquo", "rsaquo", NULL,
161 : NULL, NULL, "oline", NULL, NULL, NULL, NULL, NULL,
162 : "frasl"
163 : };
164 :
165 : static entity_table_t ent_uni_euro[] = {
166 : "euro"
167 : };
168 :
169 : static entity_table_t ent_uni_8465_8501[] = {
170 : /* 8465 */
171 : "image", NULL, NULL, NULL, NULL, NULL, NULL,
172 : /* 8472 */
173 : "weierp", NULL, NULL, NULL,
174 : /* 8476 */
175 : "real", NULL, NULL, NULL, NULL, NULL,
176 : /* 8482 */
177 : "trade", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
178 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
179 : /* 8501 */
180 : "alefsym",
181 : };
182 :
183 : static entity_table_t ent_uni_8592_9002[] = {
184 : /* 8592 (0x2190) */
185 : "larr", "uarr", "rarr", "darr", "harr", NULL, NULL, NULL,
186 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
187 : /* 8608 (0x21a0) */
188 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
189 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
190 : /* 8624 (0x21b0) */
191 : NULL, NULL, NULL, NULL, NULL, "crarr", NULL, NULL,
192 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
193 : /* 8640 (0x21c0) */
194 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
195 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
196 : /* 8656 (0x21d0) */
197 : "lArr", "uArr", "rArr", "dArr", "hArr", "vArr", NULL, NULL,
198 : NULL, NULL, "lAarr", "rAarr", NULL, "rarrw", NULL, NULL,
199 : /* 8672 (0x21e0) */
200 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
201 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
202 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
203 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
204 : /* 8704 (0x2200) */
205 : "forall", "comp", "part", "exist", "nexist", "empty", NULL, "nabla",
206 : "isin", "notin", "epsis", "ni", "notni", "bepsi", NULL, "prod",
207 : /* 8720 (0x2210) */
208 : "coprod", "sum", "minus", "mnplus", "plusdo", NULL, "setmn", "lowast",
209 : "compfn", NULL, "radic", NULL, NULL, "prop", "infin", "ang90",
210 : /* 8736 (0x2220) */
211 : "ang", "angmsd", "angsph", "mid", "nmid", "par", "npar", "and",
212 : "or", "cap", "cup", "int", NULL, NULL, "conint", NULL,
213 : /* 8752 (0x2230) */
214 : NULL, NULL, NULL, NULL, "there4", "becaus", NULL, NULL,
215 : NULL, NULL, NULL, NULL, "sim", "bsim", NULL, NULL,
216 : /* 8768 (0x2240) */
217 : "wreath", "nsim", NULL, "sime", "nsime", "cong", NULL, "ncong",
218 : "asymp", "nap", "ape", NULL, "bcong", "asymp", "bump", "bumpe",
219 : /* 8784 (0x2250) */
220 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
221 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
222 : /* 8800 (0x2260) */
223 : "ne", "equiv", NULL, NULL, "le", "ge", "lE", "gE",
224 : "lnE", "gnE", "Lt", "Gt", "twixt", NULL, "nlt", "ngt",
225 : /* 8816 (0x2270) */
226 : "nles", "nges", "lsim", "gsim", NULL, NULL, "lg", "gl",
227 : NULL, NULL, "pr", "sc", "cupre", "sscue", "prsim", "scsim",
228 : /* 8832 (0x2280) */
229 : "npr", "nsc", "sub", "sup", "nsub", "nsup", "sube", "supe",
230 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
231 : /* 8848 (0x2290) */
232 : NULL, NULL, NULL, NULL, NULL, "oplus", NULL, "otimes",
233 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
234 : /* 8864 (0x22a0) */
235 : NULL, NULL, NULL, NULL, NULL, "perp", NULL, NULL,
236 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
237 : /* 8880 (0x22b0) */
238 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
239 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
240 : /* 8896 (0x22c0) */
241 : NULL, NULL, NULL, NULL, NULL, "sdot", NULL, NULL,
242 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
243 : /* 8912 (0x22d0) */
244 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
245 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
246 : /* 8928 (0x22e0) */
247 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
248 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
249 : /* 8944 (0x22f0) */
250 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
251 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
252 : /* 8960 (0x2300) */
253 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
254 : "lceil", "rceil", "lfloor", "rfloor", NULL, NULL, NULL, NULL,
255 : /* 8976 (0x2310) */
256 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
257 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
258 : /* 8992 (0x2320) */
259 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
260 : NULL, "lang", "rang"
261 : };
262 :
263 : static entity_table_t ent_uni_9674[] = {
264 : /* 9674 */
265 : "loz"
266 : };
267 :
268 : static entity_table_t ent_uni_9824_9830[] = {
269 : /* 9824 */
270 : "spades", NULL, NULL, "clubs", NULL, "hearts", "diams"
271 : };
272 :
273 : static entity_table_t ent_koi8r[] = {
274 : "#1105", /* "jo "*/
275 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
276 : NULL, NULL, NULL, NULL, NULL, "#1025", /* "JO" */
277 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
278 : "#1102", "#1072", "#1073", "#1094", "#1076", "#1077", "#1092",
279 : "#1075", "#1093", "#1080", "#1081", "#1082", "#1083", "#1084",
280 : "#1085", "#1086", "#1087", "#1103", "#1088", "#1089", "#1090",
281 : "#1091", "#1078", "#1074", "#1100", "#1099", "#1079", "#1096",
282 : "#1101", "#1097", "#1095", "#1098", "#1070", "#1040", "#1041",
283 : "#1062", "#1044", "#1045", "#1060", "#1043", "#1061", "#1048",
284 : "#1049", "#1050", "#1051", "#1052", "#1053", "#1054", "#1055",
285 : "#1071", "#1056", "#1057", "#1058", "#1059", "#1046", "#1042",
286 : "#1068", "#1067", "#1047", "#1064", "#1069", "#1065", "#1063",
287 : "#1066"
288 : };
289 :
290 : static entity_table_t ent_cp_1251[] = {
291 : "#1026", "#1027", "#8218", "#1107", "#8222", "hellip", "dagger",
292 : "Dagger", "euro", "permil", "#1033", "#8249", "#1034", "#1036",
293 : "#1035", "#1039", "#1106", "#8216", "#8217", "#8219", "#8220",
294 : "bull", "ndash", "mdash", NULL, "trade", "#1113", "#8250",
295 : "#1114", "#1116", "#1115", "#1119", "nbsp", "#1038", "#1118",
296 : "#1032", "curren", "#1168", "brvbar", "sect", "#1025", "copy",
297 : "#1028", "laquo", "not", "shy", "reg", "#1031", "deg", "plusmn",
298 : "#1030", "#1110", "#1169", "micro", "para", "middot", "#1105",
299 : "#8470", "#1108", "raquo", "#1112", "#1029", "#1109", "#1111",
300 : "#1040", "#1041", "#1042", "#1043", "#1044", "#1045", "#1046",
301 : "#1047", "#1048", "#1049", "#1050", "#1051", "#1052", "#1053",
302 : "#1054", "#1055", "#1056", "#1057", "#1058", "#1059", "#1060",
303 : "#1061", "#1062", "#1063", "#1064", "#1065", "#1066", "#1067",
304 : "#1068", "#1069", "#1070", "#1071", "#1072", "#1073", "#1074",
305 : "#1075", "#1076", "#1077", "#1078", "#1079", "#1080", "#1081",
306 : "#1082", "#1083", "#1084", "#1085", "#1086", "#1087", "#1088",
307 : "#1089", "#1090", "#1091", "#1092", "#1093", "#1094", "#1095",
308 : "#1096", "#1097", "#1098", "#1099", "#1100", "#1101", "#1102",
309 : "#1103"
310 : };
311 :
312 : static entity_table_t ent_iso_8859_5[] = {
313 : "#1056", "#1057", "#1058", "#1059", "#1060", "#1061", "#1062",
314 : "#1063", "#1064", "#1065", "#1066", "#1067", "#1068", "#1069",
315 : "#1070", "#1071", "#1072", "#1073", "#1074", "#1075", "#1076",
316 : "#1077", "#1078", "#1079", "#1080", "#1081", "#1082", "#1083",
317 : "#1084", "#1085", "#1086", "#1087", "#1088", "#1089", "#1090",
318 : "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097",
319 : "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#1104",
320 : "#1105", "#1106", "#1107", "#1108", "#1109", "#1110", "#1111",
321 : "#1112", "#1113", "#1114", "#1115", "#1116", "#1117", "#1118",
322 : "#1119"
323 : };
324 :
325 : static entity_table_t ent_cp_866[] = {
326 :
327 : "#9492", "#9524", "#9516", "#9500", "#9472", "#9532", "#9566",
328 : "#9567", "#9562", "#9556", "#9577", "#9574", "#9568", "#9552",
329 : "#9580", "#9575", "#9576", "#9572", "#9573", "#9561", "#9560",
330 : "#9554", "#9555", "#9579", "#9578", "#9496", "#9484", "#9608",
331 : "#9604", "#9612", "#9616", "#9600", "#1088", "#1089", "#1090",
332 : "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097",
333 : "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#1025",
334 : "#1105", "#1028", "#1108", "#1031", "#1111", "#1038", "#1118",
335 : "#176", "#8729", "#183", "#8730", "#8470", "#164", "#9632",
336 : "#160"
337 : };
338 :
339 : /* MacRoman has a couple of low-ascii chars that need mapping too */
340 : /* Vertical tab (ASCII 11) is often used to store line breaks inside */
341 : /* DB exports, this mapping changes it to a space */
342 : static entity_table_t ent_macroman[] = {
343 : "sp", NULL, NULL, NULL,
344 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
345 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
346 : NULL, NULL, NULL, NULL, NULL, "quot", NULL,
347 : NULL, NULL, "amp", NULL, NULL, NULL, NULL,
348 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
349 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
350 : NULL, NULL, NULL, "lt", NULL, "gt", NULL,
351 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
352 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
353 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
354 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
355 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
356 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
357 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
358 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
359 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
360 : NULL, "Auml", "Aring", "Ccedil", "Eacute", "Ntilde", "Ouml",
361 : "Uuml", "aacute", "agrave", "acirc", "auml", "atilde", "aring",
362 : "ccedil", "eacute", "egrave", "ecirc", "euml", "iacute", "igrave",
363 : "icirc", "iuml", "ntilde", "oacute", "ograve", "ocirc", "ouml",
364 : "otilde", "uacute", "ugrave", "ucirc", "uuml", "dagger", "deg",
365 : "cent", "pound", "sect", "bull", "para", "szlig", "reg",
366 : "copy", "trade", "acute", "uml", "ne", "AElig", "Oslash",
367 : "infin", "plusmn", "le", "ge", "yen", "micro", "part",
368 : "sum", "prod", "pi", "int", "ordf", "ordm", "Omega",
369 : "aelig", "oslash", "iquest", "iexcl", "not", "radic", "fnof",
370 : "asymp", "#8710", "laquo", "raquo", "hellip", "nbsp", "Agrave",
371 : "Atilde", "Otilde", "OElig", "oelig", "ndash", "mdash", "ldquo",
372 : "rdquo", "lsquo", "rsquo", "divide", "loz", "yuml", "Yuml",
373 : "frasl", "euro", "lsaquo", "rsaquo", "#xFB01", "#xFB02", "Dagger",
374 : "middot", "sbquo", "bdquo", "permil", "Acirc", "Ecirc", "Aacute",
375 : "Euml", "Egrave", "Iacute", "Icirc", "Iuml", "Igrave", "Oacute",
376 : "Ocirc", "#xF8FF", "Ograve", "Uacute", "Ucirc", "Ugrave", "#305",
377 : "circ", "tilde", "macr", "#728", "#729", "#730", "cedil",
378 : "#733", "#731", "#711"
379 : };
380 :
381 : struct html_entity_map {
382 : enum entity_charset charset; /* charset identifier */
383 : unsigned int basechar; /* char code at start of table */
384 : unsigned int endchar; /* last char code in the table */
385 : entity_table_t *table; /* the table of mappings */
386 : };
387 :
388 : static const struct html_entity_map entity_map[] = {
389 : { cs_cp1252, 0x80, 0x9f, ent_cp_1252 },
390 : { cs_cp1252, 0xa0, 0xff, ent_iso_8859_1 },
391 : { cs_8859_1, 0xa0, 0xff, ent_iso_8859_1 },
392 : { cs_8859_15, 0xa0, 0xff, ent_iso_8859_15 },
393 : { cs_utf_8, 0xa0, 0xff, ent_iso_8859_1 },
394 : { cs_utf_8, 338, 402, ent_uni_338_402 },
395 : { cs_utf_8, 710, 732, ent_uni_spacing },
396 : { cs_utf_8, 913, 982, ent_uni_greek },
397 : { cs_utf_8, 8194, 8260, ent_uni_punct },
398 : { cs_utf_8, 8364, 8364, ent_uni_euro },
399 : { cs_utf_8, 8465, 8501, ent_uni_8465_8501 },
400 : { cs_utf_8, 8592, 9002, ent_uni_8592_9002 },
401 : { cs_utf_8, 9674, 9674, ent_uni_9674 },
402 : { cs_utf_8, 9824, 9830, ent_uni_9824_9830 },
403 : { cs_big5, 0xa0, 0xff, ent_iso_8859_1 },
404 : { cs_gb2312, 0xa0, 0xff, ent_iso_8859_1 },
405 : { cs_big5hkscs, 0xa0, 0xff, ent_iso_8859_1 },
406 : { cs_sjis, 0xa0, 0xff, ent_iso_8859_1 },
407 : { cs_eucjp, 0xa0, 0xff, ent_iso_8859_1 },
408 : { cs_koi8r, 0xa3, 0xff, ent_koi8r },
409 : { cs_cp1251, 0x80, 0xff, ent_cp_1251 },
410 : { cs_8859_5, 0xc0, 0xff, ent_iso_8859_5 },
411 : { cs_cp866, 0xc0, 0xff, ent_cp_866 },
412 : { cs_macroman, 0x0b, 0xff, ent_macroman },
413 : { cs_terminator }
414 : };
415 :
416 : static const struct {
417 : const char *codeset;
418 : enum entity_charset charset;
419 : } charset_map[] = {
420 : { "ISO-8859-1", cs_8859_1 },
421 : { "ISO8859-1", cs_8859_1 },
422 : { "ISO-8859-15", cs_8859_15 },
423 : { "ISO8859-15", cs_8859_15 },
424 : { "utf-8", cs_utf_8 },
425 : { "cp1252", cs_cp1252 },
426 : { "Windows-1252", cs_cp1252 },
427 : { "1252", cs_cp1252 },
428 : { "BIG5", cs_big5 },
429 : { "950", cs_big5 },
430 : { "GB2312", cs_gb2312 },
431 : { "936", cs_gb2312 },
432 : { "BIG5-HKSCS", cs_big5hkscs },
433 : { "Shift_JIS", cs_sjis },
434 : { "SJIS", cs_sjis },
435 : { "932", cs_sjis },
436 : { "EUCJP", cs_eucjp },
437 : { "EUC-JP", cs_eucjp },
438 : { "KOI8-R", cs_koi8r },
439 : { "koi8-ru", cs_koi8r },
440 : { "koi8r", cs_koi8r },
441 : { "cp1251", cs_cp1251 },
442 : { "Windows-1251", cs_cp1251 },
443 : { "win-1251", cs_cp1251 },
444 : { "iso8859-5", cs_8859_5 },
445 : { "iso-8859-5", cs_8859_5 },
446 : { "cp866", cs_cp866 },
447 : { "866", cs_cp866 },
448 : { "ibm866", cs_cp866 },
449 : { "MacRoman", cs_macroman },
450 : { NULL }
451 : };
452 :
453 : static const struct {
454 : unsigned short charcode;
455 : char *entity;
456 : int entitylen;
457 : int flags;
458 : } basic_entities[] = {
459 : { '"', """, 6, ENT_HTML_QUOTE_DOUBLE },
460 : { '\'', "'", 6, ENT_HTML_QUOTE_SINGLE },
461 : { '\'', "'", 5, ENT_HTML_QUOTE_SINGLE },
462 : { '<', "<", 4, 0 },
463 : { '>', ">", 4, 0 },
464 : { 0, NULL, 0, 0 }
465 : };
466 :
/* A character code together with an inline copy of its entity text
 * (at most 8 bytes of storage) and that text's length. */
struct basic_entities_dec {
	unsigned short charcode;
	char           entity[8];
	int            entitylen;
};
472 :
/*
 * Helper macros for get_next_char().  They are not self-contained: they
 * read and write that function's locals and out-parameters (pos, mbpos,
 * mbspace, this_char, str_len, newpos, mbseq, mbseqlen, status) and they
 * return from the enclosing function.
 *
 * Each macro expands to a single statement (do { ... } while (0)) so it is
 * safe inside an unbraced if/else, and every argument is parenthesized.
 */

/* Publish the position and the collected byte sequence (NUL-terminated),
 * then return the decoded character. */
#define MB_RETURN do { \
		*newpos = pos; \
		mbseq[mbpos] = '\0'; \
		*mbseqlen = mbpos; \
		return this_char; \
	} while (0)

/* Append one byte to mbseq.  One slot is consumed per call; when the
 * remaining space drops to zero the function returns early via MB_RETURN
 * without storing the byte, which leaves room for the terminating NUL. */
#define MB_WRITE(mbchar) do { \
		mbspace--; \
		if (mbspace == 0) { \
			MB_RETURN; \
		} \
		mbseq[mbpos++] = (mbchar); \
	} while (0)

/* skip one byte and return */
#define MB_FAILURE(pos) do { \
		*newpos = (pos) + 1; \
		*status = FAILURE; \
		return 0; \
	} while (0)

/* Fail unless at least chars_need bytes remain at pos.  On failure the new
 * position is pos + 1, or pos itself when chars_need is less than 1. */
#define CHECK_LEN(pos, chars_need) do { \
		if ((str_len - (pos)) < (chars_need)) { \
			*newpos = ((chars_need) < 1) ? (pos) : (pos) + 1; \
			*status = FAILURE; \
			return 0; \
		} \
	} while (0)
507 :
508 : /* {{{ get_next_char
509 : */
510 : inline static unsigned int get_next_char(enum entity_charset charset,
511 : unsigned char * str,
512 : int str_len,
513 : int * newpos,
514 : unsigned char * mbseq,
515 : int * mbseqlen,
516 : int *status)
517 35486 : {
518 35486 : int pos = *newpos;
519 35486 : int mbpos = 0;
520 35486 : int mbspace = *mbseqlen;
521 35486 : unsigned int this_char = 0;
522 : unsigned char next_char;
523 :
524 35486 : *status = SUCCESS;
525 :
526 35486 : if (mbspace <= 0) {
527 0 : *mbseqlen = 0;
528 0 : CHECK_LEN(pos, 1);
529 0 : *newpos = pos + 1;
530 0 : return str[pos];
531 : }
532 :
533 35486 : switch (charset) {
534 : case cs_utf_8:
535 : {
536 : unsigned char c;
537 297 : CHECK_LEN(pos, 1);
538 297 : c = str[pos];
539 297 : if (c < 0x80) {
540 100 : MB_WRITE(c);
541 100 : this_char = c;
542 100 : pos++;
543 197 : } else if (c < 0xc0) {
544 30 : MB_FAILURE(pos);
545 167 : } else if (c < 0xe0) {
546 44 : CHECK_LEN(pos, 2);
547 36 : if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
548 6 : MB_FAILURE(pos);
549 : }
550 30 : this_char = ((c & 0x1f) << 6) | (str[pos + 1] & 0x3f);
551 30 : if (this_char < 0x80) {
552 3 : MB_FAILURE(pos);
553 : }
554 27 : MB_WRITE((unsigned char)c);
555 27 : MB_WRITE((unsigned char)str[pos + 1]);
556 27 : pos += 2;
557 123 : } else if (c < 0xf0) {
558 54 : CHECK_LEN(pos, 3);
559 30 : if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
560 3 : MB_FAILURE(pos);
561 : }
562 27 : if (str[pos + 2] < 0x80 || str[pos + 2] > 0xbf) {
563 2 : MB_FAILURE(pos);
564 : }
565 25 : this_char = ((c & 0x0f) << 12) | ((str[pos + 1] & 0x3f) << 6) | (str[pos + 2] & 0x3f);
566 25 : if (this_char < 0x800) {
567 8 : MB_FAILURE(pos);
568 : }
569 17 : MB_WRITE((unsigned char)c);
570 17 : MB_WRITE((unsigned char)str[pos + 1]);
571 17 : MB_WRITE((unsigned char)str[pos + 2]);
572 17 : pos += 3;
573 69 : } else if (c < 0xf8) {
574 28 : CHECK_LEN(pos, 4);
575 16 : if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
576 2 : MB_FAILURE(pos);
577 : }
578 14 : if (str[pos + 2] < 0x80 || str[pos + 2] > 0xbf) {
579 2 : MB_FAILURE(pos);
580 : }
581 12 : if (str[pos + 3] < 0x80 || str[pos + 3] > 0xbf) {
582 2 : MB_FAILURE(pos);
583 : }
584 10 : this_char = ((c & 0x07) << 18) | ((str[pos + 1] & 0x3f) << 12) | ((str[pos + 2] & 0x3f) << 6) | (str[pos + 3] & 0x3f);
585 10 : if (this_char < 0x10000) {
586 3 : MB_FAILURE(pos);
587 : }
588 7 : MB_WRITE((unsigned char)c);
589 7 : MB_WRITE((unsigned char)str[pos + 1]);
590 7 : MB_WRITE((unsigned char)str[pos + 2]);
591 7 : MB_WRITE((unsigned char)str[pos + 3]);
592 7 : pos += 4;
593 : } else {
594 41 : MB_FAILURE(pos);
595 : }
596 : }
597 151 : break;
598 : case cs_big5:
599 : case cs_gb2312:
600 : case cs_big5hkscs:
601 : {
602 1262 : CHECK_LEN(pos, 1);
603 1262 : this_char = str[pos++];
604 : /* check if this is the first of a 2-byte sequence */
605 1766 : if (this_char >= 0x81 && this_char <= 0xfe) {
606 : /* peek at the next char */
607 1260 : CHECK_LEN(pos, 1);
608 1134 : next_char = str[pos++];
609 1134 : if ((next_char >= 0x40 && next_char <= 0x7e) ||
610 : (next_char >= 0xa1 && next_char <= 0xfe)) {
611 : /* yes, this a wide char */
612 504 : MB_WRITE(this_char);
613 504 : MB_WRITE(next_char);
614 504 : this_char = (this_char << 8) | next_char;
615 : } else {
616 630 : MB_FAILURE(pos);
617 : }
618 : } else {
619 2 : MB_WRITE(this_char);
620 : }
621 : }
622 506 : break;
623 : case cs_sjis:
624 : {
625 681 : CHECK_LEN(pos, 1);
626 681 : this_char = str[pos++];
627 : /* check if this is the first of a 2-byte sequence */
628 930 : if ((this_char >= 0x81 && this_char <= 0x9f) ||
629 : (this_char >= 0xe0 && this_char <= 0xfc)) {
630 : /* peek at the next char */
631 609 : CHECK_LEN(pos, 1);
632 549 : next_char = str[pos++];
633 549 : if ((next_char >= 0x40 && next_char <= 0x7e) ||
634 : (next_char >= 0x80 && next_char <= 0xfc))
635 : {
636 : /* yes, this a wide char */
637 249 : MB_WRITE(this_char);
638 249 : MB_WRITE(next_char);
639 249 : this_char = (this_char << 8) | next_char;
640 : } else {
641 300 : MB_FAILURE(pos);
642 : }
643 : } else {
644 72 : MB_WRITE(this_char);
645 : }
646 321 : break;
647 : }
648 : case cs_eucjp:
649 : {
650 2402 : CHECK_LEN(pos, 1);
651 2402 : this_char = str[pos++];
652 : /* check if this is the first of a multi-byte sequence */
653 2602 : if (this_char >= 0xa1 && this_char <= 0xfe) {
654 : /* peek at the next char */
655 797 : CHECK_LEN(pos, 1);
656 576 : next_char = str[pos++];
657 576 : if (next_char >= 0xa1 && next_char <= 0xfe) {
658 : /* yes, this a jis kanji char */
659 200 : MB_WRITE(this_char);
660 200 : MB_WRITE(next_char);
661 200 : this_char = (this_char << 8) | next_char;
662 : } else {
663 376 : MB_FAILURE(pos);
664 : }
665 1605 : } else if (this_char == 0x8e) {
666 : /* peek at the next char */
667 661 : CHECK_LEN(pos, 1);
668 660 : next_char = str[pos++];
669 660 : if (next_char >= 0xa1 && next_char <= 0xdf) {
670 : /* JIS X 0201 kana */
671 443 : MB_WRITE(this_char);
672 443 : MB_WRITE(next_char);
673 443 : this_char = (this_char << 8) | next_char;
674 : } else {
675 217 : MB_FAILURE(pos);
676 : }
677 944 : } else if (this_char == 0x8f) {
678 : /* peek at the next two char */
679 : unsigned char next2_char;
680 661 : CHECK_LEN(pos, 2);
681 565 : next_char = str[pos];
682 565 : next2_char = str[pos + 1];
683 565 : pos += 2;
684 565 : if ((next_char >= 0xa1 && next_char <= 0xfe) &&
685 : (next2_char >= 0xa1 && next2_char <= 0xfe)) {
686 : /* JIS X 0212 hojo-kanji */
687 189 : MB_WRITE(this_char);
688 189 : MB_WRITE(next_char);
689 189 : MB_WRITE(next2_char);
690 189 : this_char = (this_char << 16) | (next_char << 8) | next2_char;
691 : } else {
692 376 : MB_FAILURE(pos);
693 : }
694 : } else {
695 283 : MB_WRITE(this_char);
696 : }
697 1115 : break;
698 : }
699 : default:
700 : /* single-byte charsets */
701 30844 : CHECK_LEN(pos, 1);
702 30844 : this_char = str[pos++];
703 30844 : MB_WRITE(this_char);
704 : break;
705 : }
706 32937 : MB_RETURN;
707 : }
708 : /* }}} */
709 :
710 : /* {{{ entity_charset determine_charset
711 : * returns the charset identifier based on current locale or a hint.
712 : * defaults to iso-8859-1 */
713 : static enum entity_charset determine_charset(char *charset_hint TSRMLS_DC)
714 6480 : {
715 : int i;
716 6480 : enum entity_charset charset = cs_8859_1;
717 6480 : int len = 0;
718 6480 : zval *uf_result = NULL;
719 :
720 : /* Guarantee default behaviour for backwards compatibility */
721 6480 : if (charset_hint == NULL)
722 2305 : return cs_8859_1;
723 :
724 4175 : if ((len = strlen(charset_hint)) != 0) {
725 4152 : goto det_charset;
726 : }
727 : #if HAVE_MBSTRING
728 : #if !defined(COMPILE_DL_MBSTRING)
729 : /* XXX: Ugly things. Why don't we look for a more sophisticated way? */
730 23 : switch (MBSTRG(current_internal_encoding)) {
731 : case mbfl_no_encoding_8859_1:
732 2 : return cs_8859_1;
733 :
734 : case mbfl_no_encoding_utf8:
735 0 : return cs_utf_8;
736 :
737 : case mbfl_no_encoding_euc_jp:
738 : case mbfl_no_encoding_eucjp_win:
739 2 : return cs_eucjp;
740 :
741 : case mbfl_no_encoding_sjis:
742 : case mbfl_no_encoding_sjis_win:
743 : case mbfl_no_encoding_sjis_mac:
744 2 : return cs_sjis;
745 :
746 : case mbfl_no_encoding_cp1252:
747 3 : return cs_cp1252;
748 :
749 : case mbfl_no_encoding_8859_15:
750 2 : return cs_8859_15;
751 :
752 : case mbfl_no_encoding_big5:
753 0 : return cs_big5;
754 :
755 : case mbfl_no_encoding_euc_cn:
756 : case mbfl_no_encoding_hz:
757 : case mbfl_no_encoding_cp936:
758 0 : return cs_gb2312;
759 :
760 : case mbfl_no_encoding_koi8r:
761 0 : return cs_koi8r;
762 :
763 : case mbfl_no_encoding_cp866:
764 0 : return cs_cp866;
765 :
766 : case mbfl_no_encoding_cp1251:
767 2 : return cs_cp1251;
768 :
769 : case mbfl_no_encoding_8859_5:
770 0 : return cs_8859_5;
771 :
772 : default:
773 : ;
774 : }
775 : #else
776 : {
777 : zval nm_mb_internal_encoding;
778 :
779 : ZVAL_STRING(&nm_mb_internal_encoding, "mb_internal_encoding", 0);
780 :
781 : if (call_user_function_ex(CG(function_table), NULL, &nm_mb_internal_encoding, &uf_result, 0, NULL, 1, NULL TSRMLS_CC) != FAILURE) {
782 :
783 : charset_hint = Z_STRVAL_P(uf_result);
784 : len = Z_STRLEN_P(uf_result);
785 :
786 : if (len == 4) { /* sizeof(none|auto|pass)-1 */
787 : if (!memcmp("pass", charset_hint, sizeof("pass") - 1) ||
788 : !memcmp("auto", charset_hint, sizeof("auto") - 1) ||
789 : !memcmp("none", charset_hint, sizeof("none") - 1)) {
790 :
791 : charset_hint = NULL;
792 : len = 0;
793 : }
794 : }
795 : goto det_charset;
796 : }
797 : }
798 : #endif
799 : #endif
800 :
801 10 : charset_hint = SG(default_charset);
802 10 : if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) {
803 6 : goto det_charset;
804 : }
805 :
806 : /* try to detect the charset for the locale */
807 : #if HAVE_NL_LANGINFO && HAVE_LOCALE_H && defined(CODESET)
808 4 : charset_hint = nl_langinfo(CODESET);
809 4 : if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) {
810 4 : goto det_charset;
811 : }
812 : #endif
813 :
814 : #if HAVE_LOCALE_H
815 : /* try to figure out the charset from the locale */
816 : {
817 : char *localename;
818 : char *dot, *at;
819 :
820 : /* lang[_territory][.codeset][@modifier] */
821 0 : localename = setlocale(LC_CTYPE, NULL);
822 :
823 0 : dot = strchr(localename, '.');
824 0 : if (dot) {
825 0 : dot++;
826 : /* locale specifies a codeset */
827 0 : at = strchr(dot, '@');
828 0 : if (at)
829 0 : len = at - dot;
830 : else
831 0 : len = strlen(dot);
832 0 : charset_hint = dot;
833 : } else {
834 : /* no explicit name; see if the name itself
835 : * is the charset */
836 0 : charset_hint = localename;
837 0 : len = strlen(charset_hint);
838 : }
839 : }
840 : #endif
841 :
842 4162 : det_charset:
843 :
844 4162 : if (charset_hint) {
845 4162 : int found = 0;
846 :
847 : /* now walk the charset map and look for the codeset */
848 58237 : for (i = 0; charset_map[i].codeset; i++) {
849 58231 : if (len == strlen(charset_map[i].codeset) && strncasecmp(charset_hint, charset_map[i].codeset, len) == 0) {
850 4156 : charset = charset_map[i].charset;
851 4156 : found = 1;
852 4156 : break;
853 : }
854 : }
855 4162 : if (!found) {
856 6 : php_error_docref(NULL TSRMLS_CC, E_WARNING, "charset `%s' not supported, assuming iso-8859-1",
857 : charset_hint);
858 : }
859 : }
860 4162 : if (uf_result != NULL) {
861 0 : zval_ptr_dtor(&uf_result);
862 : }
863 4162 : return charset;
864 : }
865 : /* }}} */
866 :
/* {{{ php_utf32_utf8
 * Write the UTF-8 style byte sequence for k into buf, followed by a NUL
 * byte, and return the number of bytes written (not counting the NUL).
 * The sequence is 1 to 6 bytes long, so buf needs room for up to 7 bytes.
 * Any k below 0x80 is stored as a single byte.
 */
size_t php_utf32_utf8(unsigned char *buf, int k)
{
	size_t nbytes;
	size_t idx;
	int lead;

	if (k < 0x80) {
		buf[0] = k;
		buf[1] = '\0';
		return 1;
	}

	/* pick the sequence length and the marker bits of the lead byte */
	if (k < 0x800) {
		nbytes = 2;
		lead = 0xc0;
	} else if (k < 0x10000) {
		nbytes = 3;
		lead = 0xe0;
	} else if (k < 0x200000) {
		nbytes = 4;
		lead = 0xf0;
	} else if (k < 0x4000000) {
		nbytes = 5;
		lead = 0xf8;
	} else {
		nbytes = 6;
		lead = 0xfc;
	}

	/* lead byte carries the topmost payload bits */
	buf[0] = lead | (k >> (6 * (int)(nbytes - 1)));

	/* each continuation byte carries the next six bits */
	for (idx = 1; idx < nbytes; idx++) {
		buf[idx] = 0x80 | ((k >> (6 * (int)(nbytes - 1 - idx))) & 0x3f);
	}
	buf[nbytes] = '\0';

	return nbytes;
}
/* }}} */
911 :
912 : /* {{{ php_unescape_html_entities
913 : */
914 : PHPAPI char *php_unescape_html_entities(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset TSRMLS_DC)
915 22 : {
916 : int retlen;
917 : int j, k;
918 : char *replaced, *ret, *p, *q, *lim, *next;
919 22 : enum entity_charset charset = determine_charset(hint_charset TSRMLS_CC);
920 : unsigned char replacement[15];
921 : int replacement_len;
922 :
923 22 : ret = estrndup(old, oldlen);
924 22 : retlen = oldlen;
925 22 : if (!retlen) {
926 2 : goto empty_source;
927 : }
928 :
929 20 : if (all) {
930 : /* look for a match in the maps for this charset */
931 500 : for (j = 0; entity_map[j].charset != cs_terminator; j++) {
932 480 : if (entity_map[j].charset != charset)
933 352 : continue;
934 :
935 10223 : for (k = entity_map[j].basechar; k <= entity_map[j].endchar; k++) {
936 : unsigned char entity[32];
937 10095 : int entity_length = 0;
938 :
939 10095 : if (entity_map[j].table[k - entity_map[j].basechar] == NULL)
940 5733 : continue;
941 :
942 4362 : entity_length = slprintf(entity, sizeof(entity), "&%s;", entity_map[j].table[k - entity_map[j].basechar]);
943 4362 : if (entity_length >= sizeof(entity)) {
944 0 : continue;
945 : }
946 :
947 : /* When we have MBCS entities in the tables above, this will need to handle it */
948 4362 : replacement_len = 0;
949 4362 : switch (charset) {
950 : case cs_8859_1:
951 : case cs_cp1252:
952 : case cs_8859_15:
953 : case cs_cp1251:
954 : case cs_8859_5:
955 : case cs_cp866:
956 : case cs_koi8r:
957 678 : replacement[0] = k;
958 678 : replacement[1] = '\0';
959 678 : replacement_len = 1;
960 678 : break;
961 :
962 : case cs_big5:
963 : case cs_gb2312:
964 : case cs_big5hkscs:
965 : case cs_sjis:
966 : case cs_eucjp:
967 : /* we cannot properly handle those multibyte encodings
968 : * with php_str_to_str. skip it. */
969 0 : continue;
970 :
971 : case cs_utf_8:
972 3684 : replacement_len = php_utf32_utf8(replacement, k);
973 3684 : break;
974 :
975 : default:
976 0 : php_error_docref(NULL TSRMLS_CC, E_WARNING, "cannot yet handle MBCS!");
977 0 : efree(ret);
978 0 : return NULL;
979 : }
980 :
981 4362 : if (php_memnstr(ret, entity, entity_length, ret+retlen)) {
982 20 : replaced = php_str_to_str(ret, retlen, entity, entity_length, replacement, replacement_len, &retlen);
983 20 : efree(ret);
984 20 : ret = replaced;
985 : }
986 : }
987 : }
988 : }
989 :
990 120 : for (j = 0; basic_entities[j].charcode != 0; j++) {
991 :
992 100 : if (basic_entities[j].flags && (quote_style & basic_entities[j].flags) == 0)
993 16 : continue;
994 :
995 84 : replacement[0] = (unsigned char)basic_entities[j].charcode;
996 84 : replacement[1] = '\0';
997 :
998 84 : if (php_memnstr(ret, basic_entities[j].entity, basic_entities[j].entitylen, ret+retlen)) {
999 6 : replaced = php_str_to_str(ret, retlen, basic_entities[j].entity, basic_entities[j].entitylen, replacement, 1, &retlen);
1000 6 : efree(ret);
1001 6 : ret = replaced;
1002 : }
1003 : }
1004 :
1005 : /* replace numeric entities & "&" */
1006 20 : lim = ret + retlen;
1007 142 : for (p = ret, q = ret; p < lim;) {
1008 : int code;
1009 :
1010 102 : if (p[0] == '&') {
1011 8 : if (p + 2 < lim) {
1012 8 : if (p[1] == '#') {
1013 0 : int invalid_code = 0;
1014 :
1015 0 : if (p[2] == 'x' || p[2] == 'X') {
1016 0 : code = strtol(p + 3, &next, 16);
1017 : } else {
1018 0 : code = strtol(p + 2, &next, 10);
1019 : }
1020 :
1021 0 : if (next != NULL && *next == ';') {
1022 0 : switch (charset) {
1023 : case cs_utf_8:
1024 0 : q += php_utf32_utf8(q, code);
1025 0 : break;
1026 :
1027 : case cs_8859_1:
1028 : case cs_8859_5:
1029 : case cs_8859_15:
1030 0 : if ((code >= 0x80 && code < 0xa0) || code > 0xff) {
1031 0 : invalid_code = 1;
1032 : } else {
1033 0 : if (code == 39 || !quote_style) {
1034 0 : invalid_code = 1;
1035 : } else {
1036 0 : *(q++) = code;
1037 : }
1038 : }
1039 0 : break;
1040 :
1041 : case cs_cp1252:
1042 0 : if (code > 0xff) {
1043 0 : invalid_code = 1;
1044 : } else {
1045 0 : *(q++) = code;
1046 : }
1047 0 : break;
1048 :
1049 : case cs_cp1251:
1050 : case cs_cp866:
1051 : case cs_big5:
1052 : case cs_big5hkscs:
1053 : case cs_sjis:
1054 : case cs_eucjp:
1055 0 : if (code >= 0x80) {
1056 0 : invalid_code = 1;
1057 : } else {
1058 0 : *(q++) = code;
1059 : }
1060 0 : break;
1061 :
1062 : case cs_gb2312:
1063 0 : if (code >= 0x81) {
1064 0 : invalid_code = 1;
1065 : } else {
1066 0 : *(q++) = code;
1067 : }
1068 0 : break;
1069 :
1070 : default:
1071 : /* for backwards compatilibity */
1072 0 : invalid_code = 1;
1073 : break;
1074 : }
1075 0 : if (invalid_code) {
1076 0 : for (; p <= next; p++) {
1077 0 : *(q++) = *p;
1078 : }
1079 : }
1080 0 : p = next + 1;
1081 : } else {
1082 0 : *(q++) = *(p++);
1083 0 : *(q++) = *(p++);
1084 : }
1085 16 : } else if (p + 4 < lim &&
1086 : p[1] == 'a' && p[2] == 'm' &&p[3] == 'p' &&
1087 : p[4] == ';') {
1088 8 : *(q++) = '&';
1089 8 : p += 5;
1090 : } else {
1091 0 : *(q++) = *(p++);
1092 0 : *(q++) = *(p++);
1093 : }
1094 : } else {
1095 0 : *(q++) = *(p++);
1096 : }
1097 : } else {
1098 94 : *(q++) = *(p++);
1099 : }
1100 : }
1101 20 : *q = '\0';
1102 20 : retlen = (size_t)(q - ret);
1103 22 : empty_source:
1104 22 : *newlen = retlen;
1105 22 : return ret;
1106 : }
1107 : /* }}} */
1108 :
1109 : PHPAPI char *php_escape_html_entities(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset TSRMLS_DC)
1110 1657 : {
1111 1657 : return php_escape_html_entities_ex(old, oldlen, newlen, all, quote_style, hint_charset, 1 TSRMLS_CC);
1112 : }
1113 :
1114 :
1115 : /* {{{ php_escape_html_entities
1116 : */
1117 : PHPAPI char *php_escape_html_entities_ex(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset, zend_bool double_encode TSRMLS_DC)
1118 6390 : {
1119 : int i, j, maxlen, len;
1120 : char *replaced;
1121 6390 : enum entity_charset charset = determine_charset(hint_charset TSRMLS_CC);
1122 : int matches_map;
1123 :
1124 6390 : maxlen = 2 * oldlen;
1125 6390 : if (maxlen < 128)
1126 6338 : maxlen = 128;
1127 6390 : replaced = emalloc (maxlen);
1128 6390 : len = 0;
1129 6390 : i = 0;
1130 45802 : while (i < oldlen) {
1131 : unsigned char mbsequence[16]; /* allow up to 15 characters in a multibyte sequence */
1132 35486 : int mbseqlen = sizeof(mbsequence);
1133 35486 : int status = SUCCESS;
1134 35486 : unsigned int this_char = get_next_char(charset, old, oldlen, &i, mbsequence, &mbseqlen, &status);
1135 :
1136 35486 : if(status == FAILURE) {
1137 : /* invalid MB sequence */
1138 2549 : if (quote_style & ENT_HTML_IGNORE_ERRORS) {
1139 85 : continue;
1140 : }
1141 2464 : efree(replaced);
1142 2464 : if(!PG(display_errors)) {
1143 0 : php_error_docref(NULL TSRMLS_CC, E_WARNING, "Invalid multibyte sequence in argument");
1144 : }
1145 2464 : *newlen = 0;
1146 2464 : return STR_EMPTY_ALLOC();
1147 : }
1148 32937 : matches_map = 0;
1149 :
1150 32937 : if (len + 16 > maxlen)
1151 2 : replaced = erealloc (replaced, maxlen += 128);
1152 :
1153 32937 : if (all) {
1154 : /* look for a match in the maps for this charset */
1155 2495 : unsigned char *rep = NULL;
1156 :
1157 :
1158 59804 : for (j = 0; entity_map[j].charset != cs_terminator; j++) {
1159 57452 : if (entity_map[j].charset == charset
1160 : && this_char >= entity_map[j].basechar
1161 : && this_char <= entity_map[j].endchar) {
1162 143 : rep = (unsigned char*)entity_map[j].table[this_char - entity_map[j].basechar];
1163 143 : if (rep == NULL) {
1164 : /* there is no entity for this position; fall through and
1165 : * just output the character itself */
1166 1 : break;
1167 : }
1168 :
1169 142 : matches_map = 1;
1170 142 : break;
1171 : }
1172 : }
1173 :
1174 2495 : if (matches_map) {
1175 142 : int l = strlen(rep);
1176 : /* increase the buffer size */
1177 142 : if (len + 2 + l >= maxlen) {
1178 0 : replaced = erealloc(replaced, maxlen += 128);
1179 : }
1180 :
1181 142 : replaced[len++] = '&';
1182 142 : strlcpy(replaced + len, rep, maxlen);
1183 142 : len += l;
1184 142 : replaced[len++] = ';';
1185 : }
1186 : }
1187 32937 : if (!matches_map) {
1188 32795 : int is_basic = 0;
1189 :
1190 32795 : if (this_char == '&') {
1191 168 : if (double_encode) {
1192 138 : encode_amp:
1193 138 : memcpy(replaced + len, "&", sizeof("&") - 1);
1194 138 : len += sizeof("&") - 1;
1195 : } else {
1196 50 : char *e = memchr(old + i, ';', oldlen - i);
1197 50 : char *s = old + i;
1198 :
1199 50 : if (!e || (e - s) > 10) { /* minor optimization to avoid "entities" over 10 chars in length */
1200 : goto encode_amp;
1201 : } else {
1202 40 : if (*s == '#') { /* numeric entities */
1203 12 : s++;
1204 : /* Hex (Z) */
1205 16 : if (*s == 'x' || *s == 'X') {
1206 6 : s++;
1207 20 : while (s < e) {
1208 10 : if (!isxdigit((int)*(unsigned char *)s++)) {
1209 2 : goto encode_amp;
1210 : }
1211 : }
1212 : /* Dec (Z)*/
1213 : } else {
1214 22 : while (s < e) {
1215 12 : if (!isdigit((int)*(unsigned char *)s++)) {
1216 2 : goto encode_amp;
1217 : }
1218 : }
1219 : }
1220 : } else { /* text entities */
1221 124 : while (s < e) {
1222 74 : if (!isalnum((int)*(unsigned char *)s++)) {
1223 6 : goto encode_amp;
1224 : }
1225 : }
1226 : }
1227 30 : replaced[len++] = '&';
1228 : }
1229 : }
1230 168 : is_basic = 1;
1231 : } else {
1232 194949 : for (j = 0; basic_entities[j].charcode != 0; j++) {
1233 162627 : if ((basic_entities[j].charcode != this_char) ||
1234 : (basic_entities[j].flags &&
1235 : (quote_style & basic_entities[j].flags) == 0)) {
1236 : continue;
1237 : }
1238 :
1239 305 : memcpy(replaced + len, basic_entities[j].entity, basic_entities[j].entitylen);
1240 305 : len += basic_entities[j].entitylen;
1241 :
1242 305 : is_basic = 1;
1243 305 : break;
1244 : }
1245 : }
1246 :
1247 32795 : if (!is_basic) {
1248 : /* a wide char without a named entity; pass through the original sequence */
1249 32322 : if (mbseqlen > 1) {
1250 1618 : memcpy(replaced + len, mbsequence, mbseqlen);
1251 1618 : len += mbseqlen;
1252 : } else {
1253 30704 : replaced[len++] = (unsigned char)this_char;
1254 : }
1255 : }
1256 : }
1257 : }
1258 3926 : replaced[len] = '\0';
1259 3926 : *newlen = len;
1260 :
1261 3926 : return replaced;
1262 :
1263 :
1264 : }
1265 : /* }}} */
1266 :
1267 : /* {{{ php_html_entities
1268 : */
1269 : static void php_html_entities(INTERNAL_FUNCTION_PARAMETERS, int all)
1270 4735 : {
1271 4735 : char *str, *hint_charset = NULL;
1272 4735 : int str_len, hint_charset_len = 0;
1273 : int len;
1274 4735 : long quote_style = ENT_COMPAT;
1275 : char *replaced;
1276 4735 : zend_bool double_encode = 1;
1277 :
1278 4735 : if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|ls!b", &str, &str_len, "e_style, &hint_charset, &hint_charset_len, &double_encode) == FAILURE) {
1279 2 : return;
1280 : }
1281 :
1282 4733 : replaced = php_escape_html_entities_ex(str, str_len, &len, all, quote_style, hint_charset, double_encode TSRMLS_CC);
1283 4733 : RETVAL_STRINGL(replaced, len, 0);
1284 : }
1285 : /* }}} */
1286 :
1287 : #define HTML_SPECIALCHARS 0
1288 : #define HTML_ENTITIES 1
1289 :
1290 : /* {{{ register_html_constants
1291 : */
1292 : void register_html_constants(INIT_FUNC_ARGS)
1293 17633 : {
1294 17633 : REGISTER_LONG_CONSTANT("HTML_SPECIALCHARS", HTML_SPECIALCHARS, CONST_PERSISTENT|CONST_CS);
1295 17633 : REGISTER_LONG_CONSTANT("HTML_ENTITIES", HTML_ENTITIES, CONST_PERSISTENT|CONST_CS);
1296 17633 : REGISTER_LONG_CONSTANT("ENT_COMPAT", ENT_COMPAT, CONST_PERSISTENT|CONST_CS);
1297 17633 : REGISTER_LONG_CONSTANT("ENT_QUOTES", ENT_QUOTES, CONST_PERSISTENT|CONST_CS);
1298 17633 : REGISTER_LONG_CONSTANT("ENT_NOQUOTES", ENT_NOQUOTES, CONST_PERSISTENT|CONST_CS);
1299 17633 : REGISTER_LONG_CONSTANT("ENT_IGNORE", ENT_IGNORE, CONST_PERSISTENT|CONST_CS);
1300 17633 : }
1301 : /* }}} */
1302 :
1303 : /* {{{ proto string htmlspecialchars(string string [, int quote_style[, string charset[, bool double_encode]]])
1304 : Convert special characters to HTML entities */
1305 : PHP_FUNCTION(htmlspecialchars)
1306 4319 : {
1307 4319 : php_html_entities(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
1308 4319 : }
1309 : /* }}} */
1310 :
1311 : /* {{{ proto string htmlspecialchars_decode(string string [, int quote_style])
1312 : Convert special HTML entities back to characters */
1313 : PHP_FUNCTION(htmlspecialchars_decode)
1314 113 : {
1315 : char *str, *new_str, *e, *p;
1316 : int len, j, i, new_len;
1317 113 : long quote_style = ENT_COMPAT;
1318 : struct basic_entities_dec basic_entities_dec[8];
1319 :
1320 113 : if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|l", &str, &len, "e_style) == FAILURE) {
1321 19 : return;
1322 : }
1323 :
1324 94 : new_str = estrndup(str, len);
1325 94 : new_len = len;
1326 94 : e = new_str + new_len;
1327 :
1328 94 : if (!(p = memchr(new_str, '&', new_len))) {
1329 25 : RETURN_STRINGL(new_str, new_len, 0);
1330 : }
1331 :
1332 414 : for (j = 0, i = 0; basic_entities[i].charcode != 0; i++) {
1333 345 : if (basic_entities[i].flags && !(quote_style & basic_entities[i].flags)) {
1334 133 : continue;
1335 : }
1336 212 : basic_entities_dec[j].charcode = basic_entities[i].charcode;
1337 212 : memcpy(basic_entities_dec[j].entity, basic_entities[i].entity, basic_entities[i].entitylen + 1);
1338 212 : basic_entities_dec[j].entitylen = basic_entities[i].entitylen;
1339 212 : j++;
1340 : }
1341 69 : basic_entities_dec[j].charcode = '&';
1342 69 : basic_entities_dec[j].entitylen = sizeof("&") - 1;
1343 69 : memcpy(basic_entities_dec[j].entity, "&", sizeof("&"));
1344 69 : i = j + 1;
1345 :
1346 : do {
1347 343 : int l = e - p;
1348 :
1349 1188 : for (j = 0; j < i; j++) {
1350 1014 : if (basic_entities_dec[j].entitylen > l) {
1351 0 : continue;
1352 : }
1353 1014 : if (!memcmp(p, basic_entities_dec[j].entity, basic_entities_dec[j].entitylen)) {
1354 169 : int e_len = basic_entities_dec[j].entitylen - 1;
1355 :
1356 169 : *p++ = basic_entities_dec[j].charcode;
1357 169 : memmove(p, p + e_len, (e - p - e_len));
1358 169 : e -= e_len;
1359 169 : goto done;
1360 : }
1361 : }
1362 174 : p++;
1363 :
1364 343 : done:
1365 343 : if (p >= e) {
1366 20 : break;
1367 : }
1368 323 : } while ((p = memchr(p, '&', (e - p))));
1369 :
1370 69 : new_len = e - new_str;
1371 :
1372 69 : new_str[new_len] = '\0';
1373 69 : RETURN_STRINGL(new_str, new_len, 0);
1374 : }
1375 : /* }}} */
1376 :
1377 : /* {{{ proto string html_entity_decode(string string [, int quote_style][, string charset])
1378 : Convert all HTML entities to their applicable characters */
1379 : PHP_FUNCTION(html_entity_decode)
1380 22 : {
1381 22 : char *str, *hint_charset = NULL;
1382 22 : int str_len, hint_charset_len = 0, len;
1383 22 : long quote_style = ENT_COMPAT;
1384 : char *replaced;
1385 :
1386 22 : if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|ls", &str, &str_len,
1387 : "e_style, &hint_charset, &hint_charset_len) == FAILURE) {
1388 0 : return;
1389 : }
1390 :
1391 22 : replaced = php_unescape_html_entities(str, str_len, &len, 1, quote_style, hint_charset TSRMLS_CC);
1392 22 : if (replaced) {
1393 22 : RETURN_STRINGL(replaced, len, 0);
1394 : }
1395 0 : RETURN_FALSE;
1396 : }
1397 : /* }}} */
1398 :
1399 :
1400 : /* {{{ proto string htmlentities(string string [, int quote_style[, string charset[, bool double_encode]]])
1401 : Convert all applicable characters to HTML entities */
1402 : PHP_FUNCTION(htmlentities)
1403 416 : {
1404 416 : php_html_entities(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
1405 416 : }
1406 : /* }}} */
1407 :
1408 : /* {{{ proto array get_html_translation_table([int table [, int quote_style]])
1409 : Returns the internal translation table used by htmlspecialchars and htmlentities */
1410 : PHP_FUNCTION(get_html_translation_table)
1411 68 : {
1412 68 : long which = HTML_SPECIALCHARS, quote_style = ENT_COMPAT;
1413 : int i, j;
1414 : char ind[2];
1415 68 : enum entity_charset charset = determine_charset(NULL TSRMLS_CC);
1416 :
1417 68 : if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "|ll", &which, "e_style) == FAILURE) {
1418 34 : return;
1419 : }
1420 :
1421 34 : array_init(return_value);
1422 :
1423 34 : ind[1] = 0;
1424 :
1425 34 : switch (which) {
1426 : case HTML_ENTITIES:
1427 200 : for (j=0; entity_map[j].charset != cs_terminator; j++) {
1428 192 : if (entity_map[j].charset != charset)
1429 184 : continue;
1430 776 : for (i = 0; i <= entity_map[j].endchar - entity_map[j].basechar; i++) {
1431 : char buffer[16];
1432 :
1433 768 : if (entity_map[j].table[i] == NULL)
1434 0 : continue;
1435 : /* what about wide chars here ?? */
1436 768 : ind[0] = i + entity_map[j].basechar;
1437 768 : snprintf(buffer, sizeof(buffer), "&%s;", entity_map[j].table[i]);
1438 768 : add_assoc_string(return_value, ind, buffer, 1);
1439 :
1440 : }
1441 : }
1442 : /* break thru */
1443 :
1444 : case HTML_SPECIALCHARS:
1445 204 : for (j = 0; basic_entities[j].charcode != 0; j++) {
1446 :
1447 170 : if (basic_entities[j].flags && (quote_style & basic_entities[j].flags) == 0)
1448 70 : continue;
1449 :
1450 100 : ind[0] = (unsigned char)basic_entities[j].charcode;
1451 100 : add_assoc_stringl(return_value, ind, basic_entities[j].entity, basic_entities[j].entitylen, 1);
1452 : }
1453 34 : add_assoc_stringl(return_value, "&", "&", sizeof("&") - 1, 1);
1454 :
1455 : break;
1456 : }
1457 : }
1458 : /* }}} */
1459 :
1460 : /*
1461 : * Local variables:
1462 : * tab-width: 4
1463 : * c-basic-offset: 4
1464 : * End:
1465 : * vim600: sw=4 ts=4 fdm=marker
1466 : * vim<600: sw=4 ts=4
1467 : */
|