1 : /*
2 : +----------------------------------------------------------------------+
3 : | PHP Version 5 |
4 : +----------------------------------------------------------------------+
5 : | Copyright (c) 1997-2009 The PHP Group |
6 : +----------------------------------------------------------------------+
7 : | This source file is subject to version 3.01 of the PHP license, |
8 : | that is bundled with this package in the file LICENSE, and is |
9 : | available through the world-wide-web at the following url: |
10 : | http://www.php.net/license/3_01.txt |
11 : | If you did not receive a copy of the PHP license and are unable to |
12 : | obtain it through the world-wide-web, please send a note to |
13 : | license@php.net so we can mail you a copy immediately. |
14 : +----------------------------------------------------------------------+
15 : | Authors: Rasmus Lerdorf <rasmus@php.net> |
16 : | Jaakko Hyvätti <jaakko.hyvatti@iki.fi> |
17 : | Wez Furlong <wez@thebrainroom.com> |
18 : +----------------------------------------------------------------------+
19 : */
20 :
21 : /* $Id: html.c 289605 2009-10-13 05:18:37Z moriyoshi $ */
22 :
23 : /*
24 : * HTML entity resources:
25 : *
26 : * http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset2.asp
27 : * http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset3.asp
28 : * http://www.unicode.org/Public/MAPPINGS/OBSOLETE/UNI2SGML.TXT
29 : *
30 : * http://www.w3.org/TR/2002/REC-xhtml1-20020801/dtds.html#h-A2
31 : *
32 : */
33 :
34 : #include "php.h"
35 : #if PHP_WIN32
36 : #include "config.w32.h"
37 : #else
38 : #include <php_config.h>
39 : #endif
40 : #include "reg.h"
41 : #include "html.h"
42 : #include "php_string.h"
43 : #include "SAPI.h"
44 : #if HAVE_LOCALE_H
45 : #include <locale.h>
46 : #endif
47 : #if HAVE_LANGINFO_H
48 : #include <langinfo.h>
49 : #endif
50 :
51 : #if HAVE_MBSTRING
52 : # include "ext/mbstring/mbstring.h"
53 : ZEND_EXTERN_MODULE_GLOBALS(mbstring)
54 : #endif
55 :
/*
 * Identifiers for the character sets this file knows how to handle.
 * cs_terminator (value 0) is not a real charset: it marks the last row
 * of entity_map. The enumerator order fixes the numeric values.
 */
enum entity_charset {
	cs_terminator,
	cs_8859_1,
	cs_cp1252,
	cs_8859_15,
	cs_utf_8,
	cs_big5,
	cs_gb2312,
	cs_big5hkscs,
	cs_sjis,
	cs_eucjp,
	cs_koi8r,
	cs_cp1251,
	cs_8859_5,
	cs_cp866,
	cs_macroman
};
/* Element type of the entity tables below: a constant pointer to a constant
 * string. Tables are declared as `entity_table_t name[]`; a NULL element
 * means the corresponding character has no entry. */
61 : typedef const char *const entity_table_t;
62 :
63 : /* codepage 1252 is a Windows extension to iso-8859-1. */
/* Entity names for bytes 0x80-0x9f of cs_cp1252 (see entity_map);
 * element i belongs to byte 0x80 + i. Names are stored without the
 * surrounding '&' and ';'. NULL = no entry for that byte. */
64 : static entity_table_t ent_cp_1252[] = {
65 : "euro", NULL, "sbquo", "fnof", "bdquo", "hellip", "dagger",
66 : "Dagger", "circ", "permil", "Scaron", "lsaquo", "OElig",
67 : NULL, NULL, NULL, NULL, "lsquo", "rsquo", "ldquo", "rdquo",
68 : "bull", "ndash", "mdash", "tilde", "trade", "scaron", "rsaquo",
69 : "oelig", NULL, NULL, "Yuml"
70 : };
71 :
/* Entity names for code points 0xa0-0xff; element i belongs to 0xa0 + i.
 * entity_map reuses this table for cs_8859_1, cs_cp1252, cs_utf_8, cs_big5,
 * cs_gb2312, cs_big5hkscs, cs_sjis and cs_eucjp. */
72 : static entity_table_t ent_iso_8859_1[] = {
73 : "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar",
74 : "sect", "uml", "copy", "ordf", "laquo", "not", "shy", "reg",
75 : "macr", "deg", "plusmn", "sup2", "sup3", "acute", "micro",
76 : "para", "middot", "cedil", "sup1", "ordm", "raquo", "frac14",
77 : "frac12", "frac34", "iquest", "Agrave", "Aacute", "Acirc",
78 : "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
79 : "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
80 : "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
81 : "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
82 : "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
83 : "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
84 : "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
85 : "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
86 : "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
87 : "uuml", "yacute", "thorn", "yuml"
88 : };
89 :
/* Entity names for bytes 0xa0-0xff of cs_8859_15 (see entity_map);
 * element i belongs to byte 0xa0 + i. The two NULL slots are the bytes
 * annotated Zcaron / zcaron below, which have no entry here. */
90 : static entity_table_t ent_iso_8859_15[] = {
91 : "nbsp", "iexcl", "cent", "pound", "euro", "yen", "Scaron",
92 : "sect", "scaron", "copy", "ordf", "laquo", "not", "shy", "reg",
93 : "macr", "deg", "plusmn", "sup2", "sup3", NULL, /* Zcaron */
94 : "micro", "para", "middot", NULL, /* zcaron */ "sup1", "ordm",
95 : "raquo", "OElig", "oelig", "Yuml", "iquest", "Agrave", "Aacute",
96 : "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
97 : "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
98 : "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
99 : "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
100 : "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
101 : "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
102 : "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
103 : "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
104 : "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
105 : "uuml", "yacute", "thorn", "yuml"
106 : };
107 :
/* Sparse entity table for code points 338-402 under cs_utf_8 (see
 * entity_map); element i belongs to code point 338 + i, NULL = no entry. */
108 : static entity_table_t ent_uni_338_402[] = {
109 : /* 338 (0x0152) */
110 : "OElig", "oelig", NULL, NULL, NULL, NULL,
111 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
112 : /* 352 (0x0160) */
113 : "Scaron", "scaron", NULL, NULL, NULL, NULL, NULL, NULL,
114 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
115 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
116 : /* 376 (0x0178) */
117 : "Yuml", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
118 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
119 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
120 : /* 400 (0x0190) */
121 : NULL, NULL, "fnof"
122 : };
123 :
/* Entity table for code points 710-732 under cs_utf_8 (see entity_map);
 * element i belongs to code point 710 + i. Only 710 and 732 have entries. */
124 : static entity_table_t ent_uni_spacing[] = {
125 : /* 710 */
126 : "circ",
127 : /* 711 - 730 */
128 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
129 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
130 : /* 731 - 732 */
131 : NULL, "tilde"
132 : };
133 :
/* Entity table for code points 913-982 under cs_utf_8 (see entity_map);
 * element i belongs to code point 913 + i, NULL = no entry. */
134 : static entity_table_t ent_uni_greek[] = {
135 : /* 913 */
136 : "Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta",
137 : "Iota", "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi", "Rho",
138 : NULL, "Sigma", "Tau", "Upsilon", "Phi", "Chi", "Psi", "Omega",
139 : /* 938 - 944 are not mapped */
140 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
141 : "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta",
142 : "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho",
143 : "sigmaf", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega",
144 : /* 970 - 976 are not mapped */
145 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
146 : "thetasym", "upsih",
147 : NULL, NULL, NULL,
148 : "piv"
149 : };
150 :
/* Entity table for code points 8194-8260 under cs_utf_8 (see entity_map);
 * element i belongs to code point 8194 + i, NULL = no entry. */
151 : static entity_table_t ent_uni_punct[] = {
152 : /* 8194 */
153 : "ensp", "emsp", NULL, NULL, NULL, NULL, NULL,
154 : "thinsp", NULL, NULL, "zwnj", "zwj", "lrm", "rlm",
155 : NULL, NULL, NULL, "ndash", "mdash", NULL, NULL, NULL,
156 : /* 8216 */
157 : "lsquo", "rsquo", "sbquo", NULL, "ldquo", "rdquo", "bdquo", NULL,
158 : "dagger", "Dagger", "bull", NULL, NULL, NULL, "hellip",
159 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "permil", NULL,
160 : /* 8242 */
161 : "prime", "Prime", NULL, NULL, NULL, NULL, NULL, "lsaquo", "rsaquo", NULL,
162 : NULL, NULL, "oline", NULL, NULL, NULL, NULL, NULL,
163 : "frasl"
164 : };
165 :
/* Single-entry table: code point 8364 under cs_utf_8 (see entity_map). */
166 : static entity_table_t ent_uni_euro[] = {
167 : "euro"
168 : };
169 :
/* Entity table for code points 8465-8501 under cs_utf_8 (see entity_map);
 * element i belongs to code point 8465 + i, NULL = no entry. */
170 : static entity_table_t ent_uni_8465_8501[] = {
171 : /* 8465 */
172 : "image", NULL, NULL, NULL, NULL, NULL, NULL,
173 : /* 8472 */
174 : "weierp", NULL, NULL, NULL,
175 : /* 8476 */
176 : "real", NULL, NULL, NULL, NULL, NULL,
177 : /* 8482 */
178 : "trade", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
179 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
180 : /* 8501 */
181 : "alefsym",
182 : };
183 :
/* Entity table for code points 8592-9002 under cs_utf_8 (see entity_map);
 * element i belongs to code point 8592 + i, NULL = no entry. Rows are
 * grouped in runs of 16 code points, each introduced by its first code point.
 * NOTE(review): "asymp" appears twice in the 8768 group (offsets for 8776
 * and 8781) - confirm the second occurrence is intended. */
184 : static entity_table_t ent_uni_8592_9002[] = {
185 : /* 8592 (0x2190) */
186 : "larr", "uarr", "rarr", "darr", "harr", NULL, NULL, NULL,
187 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
188 : /* 8608 (0x21a0) */
189 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
190 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
191 : /* 8624 (0x21b0) */
192 : NULL, NULL, NULL, NULL, NULL, "crarr", NULL, NULL,
193 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
194 : /* 8640 (0x21c0) */
195 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
196 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
197 : /* 8656 (0x21d0) */
198 : "lArr", "uArr", "rArr", "dArr", "hArr", "vArr", NULL, NULL,
199 : NULL, NULL, "lAarr", "rAarr", NULL, "rarrw", NULL, NULL,
200 : /* 8672 (0x21e0) */
201 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
202 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
203 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
204 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
205 : /* 8704 (0x2200) */
206 : "forall", "comp", "part", "exist", "nexist", "empty", NULL, "nabla",
207 : "isin", "notin", "epsis", "ni", "notni", "bepsi", NULL, "prod",
208 : /* 8720 (0x2210) */
209 : "coprod", "sum", "minus", "mnplus", "plusdo", NULL, "setmn", "lowast",
210 : "compfn", NULL, "radic", NULL, NULL, "prop", "infin", "ang90",
211 : /* 8736 (0x2220) */
212 : "ang", "angmsd", "angsph", "mid", "nmid", "par", "npar", "and",
213 : "or", "cap", "cup", "int", NULL, NULL, "conint", NULL,
214 : /* 8752 (0x2230) */
215 : NULL, NULL, NULL, NULL, "there4", "becaus", NULL, NULL,
216 : NULL, NULL, NULL, NULL, "sim", "bsim", NULL, NULL,
217 : /* 8768 (0x2240) */
218 : "wreath", "nsim", NULL, "sime", "nsime", "cong", NULL, "ncong",
219 : "asymp", "nap", "ape", NULL, "bcong", "asymp", "bump", "bumpe",
220 : /* 8784 (0x2250) */
221 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
222 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
223 : /* 8800 (0x2260) */
224 : "ne", "equiv", NULL, NULL, "le", "ge", "lE", "gE",
225 : "lnE", "gnE", "Lt", "Gt", "twixt", NULL, "nlt", "ngt",
226 : /* 8816 (0x2270) */
227 : "nles", "nges", "lsim", "gsim", NULL, NULL, "lg", "gl",
228 : NULL, NULL, "pr", "sc", "cupre", "sscue", "prsim", "scsim",
229 : /* 8832 (0x2280) */
230 : "npr", "nsc", "sub", "sup", "nsub", "nsup", "sube", "supe",
231 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
232 : /* 8848 (0x2290) */
233 : NULL, NULL, NULL, NULL, NULL, "oplus", NULL, "otimes",
234 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
235 : /* 8864 (0x22a0) */
236 : NULL, NULL, NULL, NULL, NULL, "perp", NULL, NULL,
237 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
238 : /* 8880 (0x22b0) */
239 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
240 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
241 : /* 8896 (0x22c0) */
242 : NULL, NULL, NULL, NULL, NULL, "sdot", NULL, NULL,
243 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
244 : /* 8912 (0x22d0) */
245 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
246 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
247 : /* 8928 (0x22e0) */
248 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
249 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
250 : /* 8944 (0x22f0) */
251 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
252 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
253 : /* 8960 (0x2300) */
254 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
255 : "lceil", "rceil", "lfloor", "rfloor", NULL, NULL, NULL, NULL,
256 : /* 8976 (0x2310) */
257 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
258 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
259 : /* 8992 (0x2320) */
260 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
261 : NULL, "lang", "rang"
262 : };
263 :
/* Single-entry table: code point 9674 under cs_utf_8 (see entity_map). */
264 : static entity_table_t ent_uni_9674[] = {
265 : /* 9674 */
266 : "loz"
267 : };
268 :
/* Entity table for code points 9824-9830 under cs_utf_8 (see entity_map);
 * element i belongs to code point 9824 + i, NULL = no entry. */
269 : static entity_table_t ent_uni_9824_9830[] = {
270 : /* 9824 */
271 : "spades", NULL, NULL, "clubs", NULL, "hearts", "diams"
272 : };
273 :
/* Table for bytes 0xa3-0xff of cs_koi8r (see entity_map); element i belongs
 * to byte 0xa3 + i. Entries are decimal numeric references ("#NNNN", stored
 * without '&' and ';') rather than entity names; NULL = no entry. */
274 : static entity_table_t ent_koi8r[] = {
275 : "#1105", /* "jo "*/
276 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
277 : NULL, NULL, NULL, NULL, NULL, "#1025", /* "JO" */
278 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
279 : "#1102", "#1072", "#1073", "#1094", "#1076", "#1077", "#1092",
280 : "#1075", "#1093", "#1080", "#1081", "#1082", "#1083", "#1084",
281 : "#1085", "#1086", "#1087", "#1103", "#1088", "#1089", "#1090",
282 : "#1091", "#1078", "#1074", "#1100", "#1099", "#1079", "#1096",
283 : "#1101", "#1097", "#1095", "#1098", "#1070", "#1040", "#1041",
284 : "#1062", "#1044", "#1045", "#1060", "#1043", "#1061", "#1048",
285 : "#1049", "#1050", "#1051", "#1052", "#1053", "#1054", "#1055",
286 : "#1071", "#1056", "#1057", "#1058", "#1059", "#1046", "#1042",
287 : "#1068", "#1067", "#1047", "#1064", "#1069", "#1065", "#1063",
288 : "#1066"
289 : };
290 :
291 : static entity_table_t ent_cp_1251[] = {
292 : "#1026", "#1027", "#8218", "#1107", "#8222", "hellip", "dagger",
293 : "Dagger", "euro", "permil", "#1033", "#8249", "#1034", "#1036",
294 : "#1035", "#1039", "#1106", "#8216", "#8217", "#8219", "#8220",
295 : "bull", "ndash", "mdash", NULL, "trade", "#1113", "#8250",
296 : "#1114", "#1116", "#1115", "#1119", "nbsp", "#1038", "#1118",
297 : "#1032", "curren", "#1168", "brvbar", "sect", "#1025", "copy",
298 : "#1028", "laquo", "not", "shy", "reg", "#1031", "deg", "plusmn",
299 : "#1030", "#1110", "#1169", "micro", "para", "middot", "#1105",
300 : "#8470", "#1108", "raquo", "#1112", "#1029", "#1109", "#1111",
301 : "#1040", "#1041", "#1042", "#1043", "#1044", "#1045", "#1046",
302 : "#1047", "#1048", "#1049", "#1050", "#1051", "#1052", "#1053",
303 : "#1054", "#1055", "#1056", "#1057", "#1058", "#1059", "#1060",
304 : "#1061", "#1062", "#1063", "#1064", "#1065", "#1066", "#1067",
305 : "#1068", "#1069", "#1070", "#1071", "#1072", "#1073", "#1074",
306 : "#1075", "#1076", "#1077", "#1078", "#1079", "#1080", "#1081",
307 : "#1082", "#1083", "#1084", "#1085", "#1086", "#1087", "#1088",
308 : "#1089", "#1090", "#1091", "#1092", "#1093", "#1094", "#1095",
309 : "#1096", "#1097", "#1098", "#1099", "#1100", "#1101", "#1102",
310 : "#1103"
311 : };
312 :
313 : static entity_table_t ent_iso_8859_5[] = {
314 : "#1056", "#1057", "#1058", "#1059", "#1060", "#1061", "#1062",
315 : "#1063", "#1064", "#1065", "#1066", "#1067", "#1068", "#1069",
316 : "#1070", "#1071", "#1072", "#1073", "#1074", "#1075", "#1076",
317 : "#1077", "#1078", "#1079", "#1080", "#1081", "#1082", "#1083",
318 : "#1084", "#1085", "#1086", "#1087", "#1088", "#1089", "#1090",
319 : "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097",
320 : "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#1104",
321 : "#1105", "#1106", "#1107", "#1108", "#1109", "#1110", "#1111",
322 : "#1112", "#1113", "#1114", "#1115", "#1116", "#1117", "#1118",
323 : "#1119"
324 : };
325 :
/* Table for bytes 0xc0-0xff of cs_cp866 (see entity_map); element i belongs
 * to byte 0xc0 + i. All entries are decimal numeric references ("#NNNN"),
 * stored without '&' and ';'. */
326 : static entity_table_t ent_cp_866[] = {
327 :
328 : "#9492", "#9524", "#9516", "#9500", "#9472", "#9532", "#9566",
329 : "#9567", "#9562", "#9556", "#9577", "#9574", "#9568", "#9552",
330 : "#9580", "#9575", "#9576", "#9572", "#9573", "#9561", "#9560",
331 : "#9554", "#9555", "#9579", "#9578", "#9496", "#9484", "#9608",
332 : "#9604", "#9612", "#9616", "#9600", "#1088", "#1089", "#1090",
333 : "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097",
334 : "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#1025",
335 : "#1105", "#1028", "#1108", "#1031", "#1111", "#1038", "#1118",
336 : "#176", "#8729", "#183", "#8730", "#8470", "#164", "#9632",
337 : "#160"
338 : };
339 :
340 : /* MacRoman has a couple of low-ascii chars that need mapping too */
341 : /* Vertical tab (ASCII 11) is often used to store line breaks inside */
342 : /* DB exports, this mapping changes it to a space */
/* Table for bytes 0x0b-0xff of cs_macroman (see entity_map); element i
 * belongs to byte 0x0b + i. Unlike the other tables it starts in the ASCII
 * range: element 0 (byte 0x0b) is "sp", and the slots for '"', '&', '<' and
 * '>' carry "quot", "amp", "lt" and "gt". Entries are entity names or
 * numeric references (decimal "#NNN" or hex "#xNNNN"); NULL = no entry. */
343 : static entity_table_t ent_macroman[] = {
344 : "sp", NULL, NULL, NULL,
345 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
346 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
347 : NULL, NULL, NULL, NULL, NULL, "quot", NULL,
348 : NULL, NULL, "amp", NULL, NULL, NULL, NULL,
349 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
350 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
351 : NULL, NULL, NULL, "lt", NULL, "gt", NULL,
352 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
353 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
354 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
355 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
356 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
357 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
358 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
359 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
360 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
361 : NULL, "Auml", "Aring", "Ccedil", "Eacute", "Ntilde", "Ouml",
362 : "Uuml", "aacute", "agrave", "acirc", "auml", "atilde", "aring",
363 : "ccedil", "eacute", "egrave", "ecirc", "euml", "iacute", "igrave",
364 : "icirc", "iuml", "ntilde", "oacute", "ograve", "ocirc", "ouml",
365 : "otilde", "uacute", "ugrave", "ucirc", "uuml", "dagger", "deg",
366 : "cent", "pound", "sect", "bull", "para", "szlig", "reg",
367 : "copy", "trade", "acute", "uml", "ne", "AElig", "Oslash",
368 : "infin", "plusmn", "le", "ge", "yen", "micro", "part",
369 : "sum", "prod", "pi", "int", "ordf", "ordm", "Omega",
370 : "aelig", "oslash", "iquest", "iexcl", "not", "radic", "fnof",
371 : "asymp", "#8710", "laquo", "raquo", "hellip", "nbsp", "Agrave",
372 : "Atilde", "Otilde", "OElig", "oelig", "ndash", "mdash", "ldquo",
373 : "rdquo", "lsquo", "rsquo", "divide", "loz", "yuml", "Yuml",
374 : "frasl", "euro", "lsaquo", "rsaquo", "#xFB01", "#xFB02", "Dagger",
375 : "middot", "sbquo", "bdquo", "permil", "Acirc", "Ecirc", "Aacute",
376 : "Euml", "Egrave", "Iacute", "Icirc", "Iuml", "Igrave", "Oacute",
377 : "Ocirc", "#xF8FF", "Ograve", "Uacute", "Ucirc", "Ugrave", "#305",
378 : "circ", "tilde", "macr", "#728", "#729", "#730", "cedil",
379 : "#733", "#731", "#711"
380 : };
381 :
/* One row of entity_map: binds a contiguous character-code range
 * [basechar, endchar] of one charset to the table holding its entries.
 * table[0] corresponds to basechar. Both bounds are unsigned short, so a
 * row cannot describe codes above 0xffff. */
382 : struct html_entity_map {
383 : enum entity_charset charset; /* charset identifier */
384 : unsigned short basechar; /* char code at start of table */
385 : unsigned short endchar; /* last char code in the table */
386 : entity_table_t *table; /* the table of mappings */
387 : };
388 :
/* Master list of (charset, code range, table) rows. A charset may own
 * several rows (cs_cp1252 has two, cs_utf_8 has ten). Readers scan from the
 * top and stop at the row whose charset is cs_terminator; the remaining
 * fields of that last row are left zero-initialized. */
389 : static const struct html_entity_map entity_map[] = {
390 : { cs_cp1252, 0x80, 0x9f, ent_cp_1252 },
391 : { cs_cp1252, 0xa0, 0xff, ent_iso_8859_1 },
392 : { cs_8859_1, 0xa0, 0xff, ent_iso_8859_1 },
393 : { cs_8859_15, 0xa0, 0xff, ent_iso_8859_15 },
394 : { cs_utf_8, 0xa0, 0xff, ent_iso_8859_1 },
395 : { cs_utf_8, 338, 402, ent_uni_338_402 },
396 : { cs_utf_8, 710, 732, ent_uni_spacing },
397 : { cs_utf_8, 913, 982, ent_uni_greek },
398 : { cs_utf_8, 8194, 8260, ent_uni_punct },
399 : { cs_utf_8, 8364, 8364, ent_uni_euro },
400 : { cs_utf_8, 8465, 8501, ent_uni_8465_8501 },
401 : { cs_utf_8, 8592, 9002, ent_uni_8592_9002 },
402 : { cs_utf_8, 9674, 9674, ent_uni_9674 },
403 : { cs_utf_8, 9824, 9830, ent_uni_9824_9830 },
404 : { cs_big5, 0xa0, 0xff, ent_iso_8859_1 },
405 : { cs_gb2312, 0xa0, 0xff, ent_iso_8859_1 },
406 : { cs_big5hkscs, 0xa0, 0xff, ent_iso_8859_1 },
407 : { cs_sjis, 0xa0, 0xff, ent_iso_8859_1 },
408 : { cs_eucjp, 0xa0, 0xff, ent_iso_8859_1 },
409 : { cs_koi8r, 0xa3, 0xff, ent_koi8r },
410 : { cs_cp1251, 0x80, 0xff, ent_cp_1251 },
411 : { cs_8859_5, 0xc0, 0xff, ent_iso_8859_5 },
412 : { cs_cp866, 0xc0, 0xff, ent_cp_866 },
413 : { cs_macroman, 0x0b, 0xff, ent_macroman },
414 : { cs_terminator }
415 : };
416 :
/* Accepted charset names and the entity_charset each one selects.
 * determine_charset() walks this list from the top and takes the first
 * entry whose name has the same length as the hint and compares equal
 * ignoring case. The { NULL } row ends the list. */
417 : static const struct {
418 : const char *codeset;
419 : enum entity_charset charset;
420 : } charset_map[] = {
421 : { "ISO-8859-1", cs_8859_1 },
422 : { "ISO8859-1", cs_8859_1 },
423 : { "ISO-8859-15", cs_8859_15 },
424 : { "ISO8859-15", cs_8859_15 },
425 : { "utf-8", cs_utf_8 },
426 : { "cp1252", cs_cp1252 },
427 : { "Windows-1252", cs_cp1252 },
428 : { "1252", cs_cp1252 },
429 : { "BIG5", cs_big5 },
430 : { "950", cs_big5 },
431 : { "GB2312", cs_gb2312 },
432 : { "936", cs_gb2312 },
433 : { "BIG5-HKSCS", cs_big5hkscs },
434 : { "Shift_JIS", cs_sjis },
435 : { "SJIS", cs_sjis },
436 : { "932", cs_sjis },
437 : { "EUCJP", cs_eucjp },
438 : { "EUC-JP", cs_eucjp },
439 : { "KOI8-R", cs_koi8r },
440 : { "koi8-ru", cs_koi8r },
441 : { "koi8r", cs_koi8r },
442 : { "cp1251", cs_cp1251 },
443 : { "Windows-1251", cs_cp1251 },
444 : { "win-1251", cs_cp1251 },
445 : { "iso8859-5", cs_8859_5 },
446 : { "iso-8859-5", cs_8859_5 },
447 : { "cp866", cs_cp866 },
448 : { "866", cs_cp866 },
449 : { "ibm866", cs_cp866 },
450 : { "MacRoman", cs_macroman },
451 : { NULL }
452 : };
453 :
454 : static const struct {
455 : unsigned short charcode;
456 : char *entity;
457 : int entitylen;
458 : int flags;
459 : } basic_entities[] = {
460 : { '"', """, 6, ENT_HTML_QUOTE_DOUBLE },
461 : { '\'', "'", 6, ENT_HTML_QUOTE_SINGLE },
462 : { '\'', "'", 5, ENT_HTML_QUOTE_SINGLE },
463 : { '<', "<", 4, 0 },
464 : { '>', ">", 4, 0 },
465 : { 0, NULL, 0, 0 }
466 : };
467 :
/* Character / entity pair with the entity text held inline (at most 7
 * characters plus the terminating NUL) and its length.
 * NOTE(review): no use of this struct is visible in this part of the file;
 * presumably it is the decode-side counterpart of basic_entities - confirm. */
468 : struct basic_entities_dec {
469 : unsigned short charcode;
470 : char entity[8];
471 : int entitylen;
472 : };
473 :
/*
 * Helper macros for get_next_char(). They are not self-contained: each one
 * expands in the body of that function and uses its locals (pos, mbpos,
 * mbspace, this_char, str_len) and out-parameters (newpos, mbseq, mbseqlen,
 * status), and each may return from the enclosing function.
 *
 * All of them are wrapped in do { ... } while (0) so that `MACRO(...);`
 * is exactly one statement, also inside an unbraced if / else.
 */

/* Successful exit: store the new position, NUL-terminate the collected
 * byte sequence, report its length and return the decoded character. */
#define MB_RETURN do { \
		*newpos = pos; \
		mbseq[mbpos] = '\0'; \
		*mbseqlen = mbpos; \
		return this_char; \
	} while (0)

/* Append one byte to mbseq. One slot is always kept free for the NUL that
 * MB_RETURN writes: when the free space drops to zero the byte is NOT
 * stored and the function returns immediately through MB_RETURN. */
#define MB_WRITE(mbchar) do { \
		mbspace--; \
		if (mbspace == 0) { \
			MB_RETURN; \
		} \
		mbseq[mbpos++] = (mbchar); \
	} while (0)

/* skip one byte and return */
#define MB_FAILURE(pos) do { \
		*newpos = (pos) + 1; \
		*status = FAILURE; \
		return 0; \
	} while (0)

/* Fail unless at least chars_need bytes remain at pos. On failure the
 * resume position is pos + 1, or pos itself when chars_need < 1.
 * Both arguments are evaluated more than once; pass side-effect-free
 * expressions only. */
#define CHECK_LEN(pos, chars_need) do { \
		if ((str_len - (pos)) < (chars_need)) { \
			*newpos = ((chars_need) < 1) ? (pos) : (pos) + 1; \
			*status = FAILURE; \
			return 0; \
		} \
	} while (0)
508 :
509 : /* {{{ get_next_char
510 : */
511 : inline static unsigned int get_next_char(enum entity_charset charset,
512 : unsigned char * str,
513 : int str_len,
514 : int * newpos,
515 : unsigned char * mbseq,
516 : int * mbseqlen,
517 : int *status)
518 29886 : {
519 29886 : int pos = *newpos;
520 29886 : int mbpos = 0;
521 29886 : int mbspace = *mbseqlen;
522 29886 : unsigned int this_char = 0;
523 : unsigned char next_char;
524 :
525 29886 : *status = SUCCESS;
526 :
527 29886 : if (mbspace <= 0) {
528 0 : *mbseqlen = 0;
529 0 : CHECK_LEN(pos, 1);
530 0 : *newpos = pos + 1;
531 0 : return str[pos];
532 : }
533 :
534 29886 : switch (charset) {
535 : case cs_utf_8:
536 : {
537 : unsigned char c;
538 90 : CHECK_LEN(pos, 1);
539 90 : c = str[pos];
540 90 : if (c < 0x80) {
541 22 : MB_WRITE(c);
542 22 : this_char = c;
543 22 : pos++;
544 68 : } else if (c < 0xc0) {
545 0 : MB_FAILURE(pos);
546 68 : } else if (c < 0xe0) {
547 23 : CHECK_LEN(pos, 2);
548 19 : if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
549 2 : MB_FAILURE(pos);
550 : }
551 17 : this_char = ((c & 0x1f) << 6) | (str[pos + 1] & 0x3f);
552 17 : if (this_char < 0x80) {
553 3 : MB_FAILURE(pos);
554 : }
555 14 : MB_WRITE((unsigned char)c);
556 14 : MB_WRITE((unsigned char)str[pos + 1]);
557 14 : pos += 2;
558 45 : } else if (c < 0xf0) {
559 33 : CHECK_LEN(pos, 3);
560 27 : if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
561 3 : MB_FAILURE(pos);
562 : }
563 24 : if (str[pos + 2] < 0x80 || str[pos + 2] > 0xbf) {
564 2 : MB_FAILURE(pos);
565 : }
566 22 : this_char = ((c & 0x0f) << 12) | ((str[pos + 1] & 0x3f) << 6) | (str[pos + 2] & 0x3f);
567 22 : if (this_char < 0x800) {
568 6 : MB_FAILURE(pos);
569 : }
570 16 : MB_WRITE((unsigned char)c);
571 16 : MB_WRITE((unsigned char)str[pos + 1]);
572 16 : MB_WRITE((unsigned char)str[pos + 2]);
573 16 : pos += 3;
574 12 : } else if (c < 0xf8) {
575 11 : CHECK_LEN(pos, 4);
576 11 : if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
577 2 : MB_FAILURE(pos);
578 : }
579 9 : if (str[pos + 2] < 0x80 || str[pos + 2] > 0xbf) {
580 2 : MB_FAILURE(pos);
581 : }
582 7 : if (str[pos + 3] < 0x80 || str[pos + 3] > 0xbf) {
583 2 : MB_FAILURE(pos);
584 : }
585 5 : this_char = ((c & 0x07) << 18) | ((str[pos + 1] & 0x3f) << 12) | ((str[pos + 2] & 0x3f) << 6) | (str[pos + 3] & 0x3f);
586 5 : if (this_char < 0x10000) {
587 3 : MB_FAILURE(pos);
588 : }
589 2 : MB_WRITE((unsigned char)c);
590 2 : MB_WRITE((unsigned char)str[pos + 1]);
591 2 : MB_WRITE((unsigned char)str[pos + 2]);
592 2 : MB_WRITE((unsigned char)str[pos + 3]);
593 2 : pos += 4;
594 : } else {
595 1 : MB_FAILURE(pos);
596 : }
597 : }
598 54 : break;
599 : case cs_big5:
600 : case cs_gb2312:
601 : case cs_big5hkscs:
602 : {
603 1262 : CHECK_LEN(pos, 1);
604 1262 : this_char = str[pos++];
605 : /* check if this is the first of a 2-byte sequence */
606 1766 : if (this_char >= 0x81 && this_char <= 0xfe) {
607 : /* peek at the next char */
608 1260 : CHECK_LEN(pos, 1);
609 1134 : next_char = str[pos++];
610 1134 : if ((next_char >= 0x40 && next_char <= 0x7e) ||
611 : (next_char >= 0xa1 && next_char <= 0xfe)) {
612 : /* yes, this a wide char */
613 504 : MB_WRITE(this_char);
614 504 : MB_WRITE(next_char);
615 504 : this_char = (this_char << 8) | next_char;
616 : } else {
617 630 : MB_FAILURE(pos);
618 : }
619 : } else {
620 2 : MB_WRITE(this_char);
621 : }
622 : }
623 506 : break;
624 : case cs_sjis:
625 : {
626 681 : CHECK_LEN(pos, 1);
627 681 : this_char = str[pos++];
628 : /* check if this is the first of a 2-byte sequence */
629 930 : if ((this_char >= 0x81 && this_char <= 0x9f) ||
630 : (this_char >= 0xe0 && this_char <= 0xfc)) {
631 : /* peek at the next char */
632 609 : CHECK_LEN(pos, 1);
633 549 : next_char = str[pos++];
634 549 : if ((next_char >= 0x40 && next_char <= 0x7e) ||
635 : (next_char >= 0x80 && next_char <= 0xfc))
636 : {
637 : /* yes, this a wide char */
638 249 : MB_WRITE(this_char);
639 249 : MB_WRITE(next_char);
640 249 : this_char = (this_char << 8) | next_char;
641 : } else {
642 300 : MB_FAILURE(pos);
643 : }
644 : } else {
645 72 : MB_WRITE(this_char);
646 : }
647 321 : break;
648 : }
649 : case cs_eucjp:
650 : {
651 2402 : CHECK_LEN(pos, 1);
652 2402 : this_char = str[pos++];
653 : /* check if this is the first of a multi-byte sequence */
654 2602 : if (this_char >= 0xa1 && this_char <= 0xfe) {
655 : /* peek at the next char */
656 797 : CHECK_LEN(pos, 1);
657 576 : next_char = str[pos++];
658 576 : if (next_char >= 0xa1 && next_char <= 0xfe) {
659 : /* yes, this a jis kanji char */
660 200 : MB_WRITE(this_char);
661 200 : MB_WRITE(next_char);
662 200 : this_char = (this_char << 8) | next_char;
663 : } else {
664 376 : MB_FAILURE(pos);
665 : }
666 1605 : } else if (this_char == 0x8e) {
667 : /* peek at the next char */
668 661 : CHECK_LEN(pos, 1);
669 660 : next_char = str[pos++];
670 660 : if (next_char >= 0xa1 && next_char <= 0xdf) {
671 : /* JIS X 0201 kana */
672 443 : MB_WRITE(this_char);
673 443 : MB_WRITE(next_char);
674 443 : this_char = (this_char << 8) | next_char;
675 : } else {
676 217 : MB_FAILURE(pos);
677 : }
678 944 : } else if (this_char == 0x8f) {
679 : /* peek at the next two char */
680 : unsigned char next2_char;
681 661 : CHECK_LEN(pos, 2);
682 565 : next_char = str[pos];
683 565 : next2_char = str[pos + 1];
684 565 : pos += 2;
685 565 : if ((next_char >= 0xa1 && next_char <= 0xfe) &&
686 : (next2_char >= 0xa1 && next2_char <= 0xfe)) {
687 : /* JIS X 0212 hojo-kanji */
688 189 : MB_WRITE(this_char);
689 189 : MB_WRITE(next_char);
690 189 : MB_WRITE(next2_char);
691 189 : this_char = (this_char << 16) | (next_char << 8) | next2_char;
692 : } else {
693 376 : MB_FAILURE(pos);
694 : }
695 : } else {
696 283 : MB_WRITE(this_char);
697 : }
698 1115 : break;
699 : }
700 : default:
701 : /* single-byte charsets */
702 25451 : CHECK_LEN(pos, 1);
703 25451 : this_char = str[pos++];
704 25451 : MB_WRITE(this_char);
705 : break;
706 : }
707 27447 : MB_RETURN;
708 : }
709 : /* }}} */
710 :
711 : /* {{{ entity_charset determine_charset
712 : * returns the charset identifier based on current locale or a hint.
713 : * defaults to iso-8859-1 */
714 : static enum entity_charset determine_charset(char *charset_hint TSRMLS_DC)
715 5956 : {
716 : int i;
717 5956 : enum entity_charset charset = cs_8859_1;
718 5956 : int len = 0;
719 5956 : zval *uf_result = NULL;
720 :
721 : /* Guarantee default behaviour for backwards compatibility */
722 5956 : if (charset_hint == NULL)
723 1863 : return cs_8859_1;
724 :
725 4093 : if ((len = strlen(charset_hint)) != 0) {
726 4070 : goto det_charset;
727 : }
728 : #if HAVE_MBSTRING
729 : #if !defined(COMPILE_DL_MBSTRING)
730 : /* XXX: Ugly things. Why don't we look for a more sophisticated way? */
731 23 : switch (MBSTRG(current_internal_encoding)) {
732 : case mbfl_no_encoding_8859_1:
733 2 : return cs_8859_1;
734 :
735 : case mbfl_no_encoding_utf8:
736 0 : return cs_utf_8;
737 :
738 : case mbfl_no_encoding_euc_jp:
739 : case mbfl_no_encoding_eucjp_win:
740 2 : return cs_eucjp;
741 :
742 : case mbfl_no_encoding_sjis:
743 : case mbfl_no_encoding_sjis_win:
744 : case mbfl_no_encoding_sjis_mac:
745 2 : return cs_sjis;
746 :
747 : case mbfl_no_encoding_cp1252:
748 3 : return cs_cp1252;
749 :
750 : case mbfl_no_encoding_8859_15:
751 2 : return cs_8859_15;
752 :
753 : case mbfl_no_encoding_big5:
754 0 : return cs_big5;
755 :
756 : case mbfl_no_encoding_euc_cn:
757 : case mbfl_no_encoding_hz:
758 : case mbfl_no_encoding_cp936:
759 0 : return cs_gb2312;
760 :
761 : case mbfl_no_encoding_koi8r:
762 0 : return cs_koi8r;
763 :
764 : case mbfl_no_encoding_cp866:
765 0 : return cs_cp866;
766 :
767 : case mbfl_no_encoding_cp1251:
768 2 : return cs_cp1251;
769 :
770 : case mbfl_no_encoding_8859_5:
771 0 : return cs_8859_5;
772 :
773 : default:
774 : ;
775 : }
776 : #else
777 : {
778 : zval nm_mb_internal_encoding;
779 :
780 : ZVAL_STRING(&nm_mb_internal_encoding, "mb_internal_encoding", 0);
781 :
782 : if (call_user_function_ex(CG(function_table), NULL, &nm_mb_internal_encoding, &uf_result, 0, NULL, 1, NULL TSRMLS_CC) != FAILURE) {
783 :
784 : charset_hint = Z_STRVAL_P(uf_result);
785 : len = Z_STRLEN_P(uf_result);
786 :
787 : if (len == 4) { /* sizeof(none|auto|pass)-1 */
788 : if (!memcmp("pass", charset_hint, sizeof("pass") - 1) ||
789 : !memcmp("auto", charset_hint, sizeof("auto") - 1) ||
790 : !memcmp("none", charset_hint, sizeof("none") - 1)) {
791 :
792 : charset_hint = NULL;
793 : len = 0;
794 : }
795 : }
796 : goto det_charset;
797 : }
798 : }
799 : #endif
800 : #endif
801 :
802 10 : charset_hint = SG(default_charset);
803 10 : if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) {
804 6 : goto det_charset;
805 : }
806 :
807 : /* try to detect the charset for the locale */
808 : #if HAVE_NL_LANGINFO && HAVE_LOCALE_H && defined(CODESET)
809 4 : charset_hint = nl_langinfo(CODESET);
810 4 : if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) {
811 4 : goto det_charset;
812 : }
813 : #endif
814 :
815 : #if HAVE_LOCALE_H
816 : /* try to figure out the charset from the locale */
817 : {
818 : char *localename;
819 : char *dot, *at;
820 :
821 : /* lang[_territory][.codeset][@modifier] */
822 0 : localename = setlocale(LC_CTYPE, NULL);
823 :
824 0 : dot = strchr(localename, '.');
825 0 : if (dot) {
826 0 : dot++;
827 : /* locale specifies a codeset */
828 0 : at = strchr(dot, '@');
829 0 : if (at)
830 0 : len = at - dot;
831 : else
832 0 : len = strlen(dot);
833 0 : charset_hint = dot;
834 : } else {
835 : /* no explicit name; see if the name itself
836 : * is the charset */
837 0 : charset_hint = localename;
838 0 : len = strlen(charset_hint);
839 : }
840 : }
841 : #endif
842 :
843 4080 : det_charset:
844 :
845 4080 : if (charset_hint) {
846 4080 : int found = 0;
847 :
848 : /* now walk the charset map and look for the codeset */
849 57827 : for (i = 0; charset_map[i].codeset; i++) {
850 57821 : if (len == strlen(charset_map[i].codeset) && strncasecmp(charset_hint, charset_map[i].codeset, len) == 0) {
851 4074 : charset = charset_map[i].charset;
852 4074 : found = 1;
853 4074 : break;
854 : }
855 : }
856 4080 : if (!found) {
857 6 : php_error_docref(NULL TSRMLS_CC, E_WARNING, "charset `%s' not supported, assuming iso-8859-1",
858 : charset_hint);
859 : }
860 : }
861 4080 : if (uf_result != NULL) {
862 0 : zval_ptr_dtor(&uf_result);
863 : }
864 4080 : return charset;
865 : }
866 : /* }}} */
867 :
/* {{{ php_utf32_utf8
 * Encode the code point k as a NUL-terminated byte sequence in buf and
 * return the number of bytes written, not counting the NUL.
 *
 * buf must have room for 7 bytes (up to 6 sequence bytes plus the NUL).
 * A negative k is not a code point: an empty string is stored and 0 is
 * returned.
 *
 * Values up to 0x1fffff use the 1- to 4-byte forms. Larger values are still
 * written with the historical 5- and 6-byte forms, which RFC 3629 no longer
 * allows; this is kept so that existing callers see unchanged output.
 */
size_t php_utf32_utf8(unsigned char *buf, int k)
{
	size_t retval = 0;

	if (k < 0) {
		/* without this check the low 8 bits of k were emitted as a
		 * single bogus byte */
		buf[0] = '\0';
		return 0;
	}

	if (k < 0x80) {
		buf[0] = (unsigned char)k;
		retval = 1;
	} else if (k < 0x800) {
		buf[0] = (unsigned char)(0xc0 | (k >> 6));
		buf[1] = (unsigned char)(0x80 | (k & 0x3f));
		retval = 2;
	} else if (k < 0x10000) {
		buf[0] = (unsigned char)(0xe0 | (k >> 12));
		buf[1] = (unsigned char)(0x80 | ((k >> 6) & 0x3f));
		buf[2] = (unsigned char)(0x80 | (k & 0x3f));
		retval = 3;
	} else if (k < 0x200000) {
		buf[0] = (unsigned char)(0xf0 | (k >> 18));
		buf[1] = (unsigned char)(0x80 | ((k >> 12) & 0x3f));
		buf[2] = (unsigned char)(0x80 | ((k >> 6) & 0x3f));
		buf[3] = (unsigned char)(0x80 | (k & 0x3f));
		retval = 4;
	} else if (k < 0x4000000) {
		buf[0] = (unsigned char)(0xf8 | (k >> 24));
		buf[1] = (unsigned char)(0x80 | ((k >> 18) & 0x3f));
		buf[2] = (unsigned char)(0x80 | ((k >> 12) & 0x3f));
		buf[3] = (unsigned char)(0x80 | ((k >> 6) & 0x3f));
		buf[4] = (unsigned char)(0x80 | (k & 0x3f));
		retval = 5;
	} else {
		buf[0] = (unsigned char)(0xfc | (k >> 30));
		buf[1] = (unsigned char)(0x80 | ((k >> 24) & 0x3f));
		buf[2] = (unsigned char)(0x80 | ((k >> 18) & 0x3f));
		buf[3] = (unsigned char)(0x80 | ((k >> 12) & 0x3f));
		buf[4] = (unsigned char)(0x80 | ((k >> 6) & 0x3f));
		buf[5] = (unsigned char)(0x80 | (k & 0x3f));
		retval = 6;
	}
	buf[retval] = '\0';

	return retval;
}
/* }}} */
912 :
913 : /* {{{ php_unescape_html_entities
914 : */
915 : PHPAPI char *php_unescape_html_entities(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset TSRMLS_DC)
916 22 : {
917 : int retlen;
918 : int j, k;
919 : char *replaced, *ret, *p, *q, *lim, *next;
920 22 : enum entity_charset charset = determine_charset(hint_charset TSRMLS_CC);
921 : unsigned char replacement[15];
922 : int replacement_len;
923 :
924 22 : ret = estrndup(old, oldlen);
925 22 : retlen = oldlen;
926 22 : if (!retlen) {
927 2 : goto empty_source;
928 : }
929 :
930 20 : if (all) {
931 : /* look for a match in the maps for this charset */
932 500 : for (j = 0; entity_map[j].charset != cs_terminator; j++) {
933 480 : if (entity_map[j].charset != charset)
934 352 : continue;
935 :
936 10223 : for (k = entity_map[j].basechar; k <= entity_map[j].endchar; k++) {
937 : unsigned char entity[32];
938 10095 : int entity_length = 0;
939 :
940 10095 : if (entity_map[j].table[k - entity_map[j].basechar] == NULL)
941 5733 : continue;
942 :
943 4362 : entity_length = slprintf(entity, sizeof(entity), "&%s;", entity_map[j].table[k - entity_map[j].basechar]);
944 4362 : if (entity_length >= sizeof(entity)) {
945 0 : continue;
946 : }
947 :
948 : /* When we have MBCS entities in the tables above, this will need to handle it */
949 4362 : replacement_len = 0;
950 4362 : switch (charset) {
951 : case cs_8859_1:
952 : case cs_cp1252:
953 : case cs_8859_15:
954 : case cs_cp1251:
955 : case cs_8859_5:
956 : case cs_cp866:
957 : case cs_koi8r:
958 678 : replacement[0] = k;
959 678 : replacement[1] = '\0';
960 678 : replacement_len = 1;
961 678 : break;
962 :
963 : case cs_big5:
964 : case cs_gb2312:
965 : case cs_big5hkscs:
966 : case cs_sjis:
967 : case cs_eucjp:
968 : /* we cannot properly handle those multibyte encodings
969 : * with php_str_to_str. skip it. */
970 0 : continue;
971 :
972 : case cs_utf_8:
973 3684 : replacement_len = php_utf32_utf8(replacement, k);
974 3684 : break;
975 :
976 : default:
977 0 : php_error_docref(NULL TSRMLS_CC, E_WARNING, "cannot yet handle MBCS!");
978 0 : efree(ret);
979 0 : return NULL;
980 : }
981 :
982 4362 : if (php_memnstr(ret, entity, entity_length, ret+retlen)) {
983 20 : replaced = php_str_to_str(ret, retlen, entity, entity_length, replacement, replacement_len, &retlen);
984 20 : efree(ret);
985 20 : ret = replaced;
986 : }
987 : }
988 : }
989 : }
990 :
991 120 : for (j = 0; basic_entities[j].charcode != 0; j++) {
992 :
993 100 : if (basic_entities[j].flags && (quote_style & basic_entities[j].flags) == 0)
994 16 : continue;
995 :
996 84 : replacement[0] = (unsigned char)basic_entities[j].charcode;
997 84 : replacement[1] = '\0';
998 :
999 84 : if (php_memnstr(ret, basic_entities[j].entity, basic_entities[j].entitylen, ret+retlen)) {
1000 6 : replaced = php_str_to_str(ret, retlen, basic_entities[j].entity, basic_entities[j].entitylen, replacement, 1, &retlen);
1001 6 : efree(ret);
1002 6 : ret = replaced;
1003 : }
1004 : }
1005 :
1006 : /* replace numeric entities & "&" */
1007 20 : lim = ret + retlen;
1008 142 : for (p = ret, q = ret; p < lim;) {
1009 : int code;
1010 :
1011 102 : if (p[0] == '&') {
1012 8 : if (p + 2 < lim) {
1013 8 : if (p[1] == '#') {
1014 0 : int invalid_code = 0;
1015 :
1016 0 : if (p[2] == 'x' || p[2] == 'X') {
1017 0 : code = strtol(p + 3, &next, 16);
1018 : } else {
1019 0 : code = strtol(p + 2, &next, 10);
1020 : }
1021 :
1022 0 : if (next != NULL && *next == ';') {
1023 0 : switch (charset) {
1024 : case cs_utf_8:
1025 0 : q += php_utf32_utf8(q, code);
1026 0 : break;
1027 :
1028 : case cs_8859_1:
1029 : case cs_8859_5:
1030 : case cs_8859_15:
1031 0 : if ((code >= 0x80 && code < 0xa0) || code > 0xff) {
1032 0 : invalid_code = 1;
1033 : } else {
1034 0 : if (code == 39 || !quote_style) {
1035 0 : invalid_code = 1;
1036 : } else {
1037 0 : *(q++) = code;
1038 : }
1039 : }
1040 0 : break;
1041 :
1042 : case cs_cp1252:
1043 : case cs_cp1251:
1044 : case cs_cp866:
1045 0 : if (code > 0xff) {
1046 0 : invalid_code = 1;
1047 : } else {
1048 0 : *(q++) = code;
1049 : }
1050 0 : break;
1051 :
1052 : case cs_big5:
1053 : case cs_big5hkscs:
1054 : case cs_sjis:
1055 : case cs_eucjp:
1056 0 : if (code >= 0x80) {
1057 0 : invalid_code = 1;
1058 : } else {
1059 0 : *(q++) = code;
1060 : }
1061 0 : break;
1062 :
1063 : case cs_gb2312:
1064 0 : if (code >= 0x81) {
1065 0 : invalid_code = 1;
1066 : } else {
1067 0 : *(q++) = code;
1068 : }
1069 0 : break;
1070 :
1071 : default:
1072 : /* for backwards compatilibity */
1073 0 : invalid_code = 1;
1074 : break;
1075 : }
1076 0 : if (invalid_code) {
1077 0 : for (; p <= next; p++) {
1078 0 : *(q++) = *p;
1079 : }
1080 : }
1081 0 : p = next + 1;
1082 : } else {
1083 0 : *(q++) = *(p++);
1084 0 : *(q++) = *(p++);
1085 : }
1086 16 : } else if (p + 4 < lim &&
1087 : p[1] == 'a' && p[2] == 'm' &&p[3] == 'p' &&
1088 : p[4] == ';') {
1089 8 : *(q++) = '&';
1090 8 : p += 5;
1091 : } else {
1092 0 : *(q++) = *(p++);
1093 0 : *(q++) = *(p++);
1094 : }
1095 : } else {
1096 0 : *(q++) = *(p++);
1097 : }
1098 : } else {
1099 94 : *(q++) = *(p++);
1100 : }
1101 : }
1102 20 : *q = '\0';
1103 20 : retlen = (size_t)(q - ret);
1104 22 : empty_source:
1105 22 : *newlen = retlen;
1106 22 : return ret;
1107 : }
1108 : /* }}} */
1109 :
1110 : PHPAPI char *php_escape_html_entities(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset TSRMLS_DC)
1111 1215 : {
1112 1215 : return php_escape_html_entities_ex(old, oldlen, newlen, all, quote_style, hint_charset, 1 TSRMLS_CC);
1113 : }
1114 :
1115 :
1116 : /* {{{ php_escape_html_entities
1117 : */
1118 : PHPAPI char *php_escape_html_entities_ex(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset, zend_bool double_encode TSRMLS_DC)
1119 5866 : {
1120 : int i, j, maxlen, len;
1121 : char *replaced;
1122 5866 : enum entity_charset charset = determine_charset(hint_charset TSRMLS_CC);
1123 : int matches_map;
1124 :
1125 5866 : maxlen = 2 * oldlen;
1126 5866 : if (maxlen < 128)
1127 5818 : maxlen = 128;
1128 5866 : replaced = emalloc (maxlen);
1129 5866 : len = 0;
1130 5866 : i = 0;
1131 39179 : while (i < oldlen) {
1132 : unsigned char mbsequence[16]; /* allow up to 15 characters in a multibyte sequence */
1133 29886 : int mbseqlen = sizeof(mbsequence);
1134 29886 : int status = SUCCESS;
1135 29886 : unsigned int this_char = get_next_char(charset, old, oldlen, &i, mbsequence, &mbseqlen, &status);
1136 :
1137 29886 : if(status == FAILURE) {
1138 : /* invalid MB sequence */
1139 2439 : efree(replaced);
1140 2439 : if(!PG(display_errors)) {
1141 0 : php_error_docref(NULL TSRMLS_CC, E_WARNING, "Invalid multibyte sequence in argument");
1142 : }
1143 2439 : *newlen = 0;
1144 2439 : return STR_EMPTY_ALLOC();
1145 : }
1146 27447 : matches_map = 0;
1147 :
1148 27447 : if (len + 16 > maxlen)
1149 2 : replaced = erealloc (replaced, maxlen += 128);
1150 :
1151 27447 : if (all) {
1152 : /* look for a match in the maps for this charset */
1153 2445 : unsigned char *rep = NULL;
1154 :
1155 :
1156 58594 : for (j = 0; entity_map[j].charset != cs_terminator; j++) {
1157 56290 : if (entity_map[j].charset == charset
1158 : && this_char >= entity_map[j].basechar
1159 : && this_char <= entity_map[j].endchar) {
1160 141 : rep = (unsigned char*)entity_map[j].table[this_char - entity_map[j].basechar];
1161 141 : if (rep == NULL) {
1162 : /* there is no entity for this position; fall through and
1163 : * just output the character itself */
1164 1 : break;
1165 : }
1166 :
1167 140 : matches_map = 1;
1168 140 : break;
1169 : }
1170 : }
1171 :
1172 2445 : if (matches_map) {
1173 140 : int l = strlen(rep);
1174 : /* increase the buffer size */
1175 140 : if (len + 2 + l >= maxlen) {
1176 0 : replaced = erealloc(replaced, maxlen += 128);
1177 : }
1178 :
1179 140 : replaced[len++] = '&';
1180 140 : strlcpy(replaced + len, rep, maxlen);
1181 140 : len += l;
1182 140 : replaced[len++] = ';';
1183 : }
1184 : }
1185 27447 : if (!matches_map) {
1186 27307 : int is_basic = 0;
1187 :
1188 27307 : if (this_char == '&') {
1189 168 : if (double_encode) {
1190 138 : encode_amp:
1191 138 : memcpy(replaced + len, "&", sizeof("&") - 1);
1192 138 : len += sizeof("&") - 1;
1193 : } else {
1194 50 : char *e = memchr(old + i, ';', oldlen - i);
1195 50 : char *s = old + i;
1196 :
1197 50 : if (!e || (e - s) > 10) { /* minor optimization to avoid "entities" over 10 chars in length */
1198 : goto encode_amp;
1199 : } else {
1200 40 : if (*s == '#') { /* numeric entities */
1201 12 : s++;
1202 : /* Hex (Z) */
1203 16 : if (*s == 'x' || *s == 'X') {
1204 6 : s++;
1205 20 : while (s < e) {
1206 10 : if (!isxdigit((int)*(unsigned char *)s++)) {
1207 2 : goto encode_amp;
1208 : }
1209 : }
1210 : /* Dec (Z)*/
1211 : } else {
1212 22 : while (s < e) {
1213 12 : if (!isdigit((int)*(unsigned char *)s++)) {
1214 2 : goto encode_amp;
1215 : }
1216 : }
1217 : }
1218 : } else { /* text entities */
1219 124 : while (s < e) {
1220 74 : if (!isalnum((int)*(unsigned char *)s++)) {
1221 6 : goto encode_amp;
1222 : }
1223 : }
1224 : }
1225 30 : replaced[len++] = '&';
1226 : }
1227 : }
1228 168 : is_basic = 1;
1229 : } else {
1230 162057 : for (j = 0; basic_entities[j].charcode != 0; j++) {
1231 135213 : if ((basic_entities[j].charcode != this_char) ||
1232 : (basic_entities[j].flags &&
1233 : (quote_style & basic_entities[j].flags) == 0)) {
1234 : continue;
1235 : }
1236 :
1237 295 : memcpy(replaced + len, basic_entities[j].entity, basic_entities[j].entitylen);
1238 295 : len += basic_entities[j].entitylen;
1239 :
1240 295 : is_basic = 1;
1241 295 : break;
1242 : }
1243 : }
1244 :
1245 27307 : if (!is_basic) {
1246 : /* a wide char without a named entity; pass through the original sequence */
1247 26844 : if (mbseqlen > 1) {
1248 1601 : memcpy(replaced + len, mbsequence, mbseqlen);
1249 1601 : len += mbseqlen;
1250 : } else {
1251 25243 : replaced[len++] = (unsigned char)this_char;
1252 : }
1253 : }
1254 : }
1255 : }
1256 3427 : replaced[len] = '\0';
1257 3427 : *newlen = len;
1258 :
1259 3427 : return replaced;
1260 :
1261 :
1262 : }
1263 : /* }}} */
1264 :
1265 : /* {{{ php_html_entities
1266 : */
1267 : static void php_html_entities(INTERNAL_FUNCTION_PARAMETERS, int all)
1268 4653 : {
1269 4653 : char *str, *hint_charset = NULL;
1270 4653 : int str_len, hint_charset_len = 0;
1271 : int len;
1272 4653 : long quote_style = ENT_COMPAT;
1273 : char *replaced;
1274 4653 : zend_bool double_encode = 1;
1275 :
1276 4653 : if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|ls!b", &str, &str_len, "e_style, &hint_charset, &hint_charset_len, &double_encode) == FAILURE) {
1277 2 : return;
1278 : }
1279 :
1280 4651 : replaced = php_escape_html_entities_ex(str, str_len, &len, all, quote_style, hint_charset, double_encode TSRMLS_CC);
1281 4651 : RETVAL_STRINGL(replaced, len, 0);
1282 : }
1283 : /* }}} */
1284 :
1285 : #define HTML_SPECIALCHARS 0
1286 : #define HTML_ENTITIES 1
1287 :
1288 : /* {{{ register_html_constants
1289 : */
1290 : void register_html_constants(INIT_FUNC_ARGS)
1291 13565 : {
1292 13565 : REGISTER_LONG_CONSTANT("HTML_SPECIALCHARS", HTML_SPECIALCHARS, CONST_PERSISTENT|CONST_CS);
1293 13565 : REGISTER_LONG_CONSTANT("HTML_ENTITIES", HTML_ENTITIES, CONST_PERSISTENT|CONST_CS);
1294 13565 : REGISTER_LONG_CONSTANT("ENT_COMPAT", ENT_COMPAT, CONST_PERSISTENT|CONST_CS);
1295 13565 : REGISTER_LONG_CONSTANT("ENT_QUOTES", ENT_QUOTES, CONST_PERSISTENT|CONST_CS);
1296 13565 : REGISTER_LONG_CONSTANT("ENT_NOQUOTES", ENT_NOQUOTES, CONST_PERSISTENT|CONST_CS);
1297 13565 : }
1298 : /* }}} */
1299 :
1300 : /* {{{ proto string htmlspecialchars(string string [, int quote_style[, string charset[, bool double_encode]]])
1301 : Convert special characters to HTML entities */
1302 : PHP_FUNCTION(htmlspecialchars)
1303 4278 : {
1304 4278 : php_html_entities(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
1305 4278 : }
1306 : /* }}} */
1307 :
1308 : /* {{{ proto string htmlspecialchars_decode(string string [, int quote_style])
1309 : Convert special HTML entities back to characters */
1310 : PHP_FUNCTION(htmlspecialchars_decode)
1311 113 : {
1312 : char *str, *new_str, *e, *p;
1313 : int len, j, i, new_len;
1314 113 : long quote_style = ENT_COMPAT;
1315 : struct basic_entities_dec basic_entities_dec[8];
1316 :
1317 113 : if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|l", &str, &len, "e_style) == FAILURE) {
1318 19 : return;
1319 : }
1320 :
1321 94 : new_str = estrndup(str, len);
1322 94 : new_len = len;
1323 94 : e = new_str + new_len;
1324 :
1325 94 : if (!(p = memchr(new_str, '&', new_len))) {
1326 25 : RETURN_STRINGL(new_str, new_len, 0);
1327 : }
1328 :
1329 414 : for (j = 0, i = 0; basic_entities[i].charcode != 0; i++) {
1330 345 : if (basic_entities[i].flags && !(quote_style & basic_entities[i].flags)) {
1331 133 : continue;
1332 : }
1333 212 : basic_entities_dec[j].charcode = basic_entities[i].charcode;
1334 212 : memcpy(basic_entities_dec[j].entity, basic_entities[i].entity, basic_entities[i].entitylen + 1);
1335 212 : basic_entities_dec[j].entitylen = basic_entities[i].entitylen;
1336 212 : j++;
1337 : }
1338 69 : basic_entities_dec[j].charcode = '&';
1339 69 : basic_entities_dec[j].entitylen = sizeof("&") - 1;
1340 69 : memcpy(basic_entities_dec[j].entity, "&", sizeof("&"));
1341 69 : i = j + 1;
1342 :
1343 : do {
1344 343 : int l = e - p;
1345 :
1346 1188 : for (j = 0; j < i; j++) {
1347 1014 : if (basic_entities_dec[j].entitylen > l) {
1348 0 : continue;
1349 : }
1350 1014 : if (!memcmp(p, basic_entities_dec[j].entity, basic_entities_dec[j].entitylen)) {
1351 169 : int e_len = basic_entities_dec[j].entitylen - 1;
1352 :
1353 169 : *p++ = basic_entities_dec[j].charcode;
1354 169 : memmove(p, p + e_len, (e - p - e_len));
1355 169 : e -= e_len;
1356 169 : goto done;
1357 : }
1358 : }
1359 174 : p++;
1360 :
1361 343 : done:
1362 343 : if (p >= e) {
1363 20 : break;
1364 : }
1365 323 : } while ((p = memchr(p, '&', (e - p))));
1366 :
1367 69 : new_len = e - new_str;
1368 :
1369 69 : new_str[new_len] = '\0';
1370 69 : RETURN_STRINGL(new_str, new_len, 0);
1371 : }
1372 : /* }}} */
1373 :
1374 : /* {{{ proto string html_entity_decode(string string [, int quote_style][, string charset])
1375 : Convert all HTML entities to their applicable characters */
1376 : PHP_FUNCTION(html_entity_decode)
1377 22 : {
1378 22 : char *str, *hint_charset = NULL;
1379 : int str_len, hint_charset_len, len;
1380 22 : long quote_style = ENT_COMPAT;
1381 : char *replaced;
1382 :
1383 22 : if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|ls", &str, &str_len,
1384 : "e_style, &hint_charset, &hint_charset_len) == FAILURE) {
1385 0 : return;
1386 : }
1387 :
1388 22 : replaced = php_unescape_html_entities(str, str_len, &len, 1, quote_style, hint_charset TSRMLS_CC);
1389 22 : if (replaced) {
1390 22 : RETURN_STRINGL(replaced, len, 0);
1391 : }
1392 0 : RETURN_FALSE;
1393 : }
1394 : /* }}} */
1395 :
1396 :
1397 : /* {{{ proto string htmlentities(string string [, int quote_style[, string charset[, bool double_encode]]])
1398 : Convert all applicable characters to HTML entities */
1399 : PHP_FUNCTION(htmlentities)
1400 375 : {
1401 375 : php_html_entities(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
1402 375 : }
1403 : /* }}} */
1404 :
1405 : /* {{{ proto array get_html_translation_table([int table [, int quote_style]])
1406 : Returns the internal translation table used by htmlspecialchars and htmlentities */
1407 : PHP_FUNCTION(get_html_translation_table)
1408 68 : {
1409 68 : long which = HTML_SPECIALCHARS, quote_style = ENT_COMPAT;
1410 : int i, j;
1411 : char ind[2];
1412 68 : enum entity_charset charset = determine_charset(NULL TSRMLS_CC);
1413 :
1414 68 : if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "|ll", &which, "e_style) == FAILURE) {
1415 34 : return;
1416 : }
1417 :
1418 34 : array_init(return_value);
1419 :
1420 34 : ind[1] = 0;
1421 :
1422 34 : switch (which) {
1423 : case HTML_ENTITIES:
1424 200 : for (j=0; entity_map[j].charset != cs_terminator; j++) {
1425 192 : if (entity_map[j].charset != charset)
1426 184 : continue;
1427 776 : for (i = 0; i <= entity_map[j].endchar - entity_map[j].basechar; i++) {
1428 : char buffer[16];
1429 :
1430 768 : if (entity_map[j].table[i] == NULL)
1431 0 : continue;
1432 : /* what about wide chars here ?? */
1433 768 : ind[0] = i + entity_map[j].basechar;
1434 768 : snprintf(buffer, sizeof(buffer), "&%s;", entity_map[j].table[i]);
1435 768 : add_assoc_string(return_value, ind, buffer, 1);
1436 :
1437 : }
1438 : }
1439 : /* break thru */
1440 :
1441 : case HTML_SPECIALCHARS:
1442 204 : for (j = 0; basic_entities[j].charcode != 0; j++) {
1443 :
1444 170 : if (basic_entities[j].flags && (quote_style & basic_entities[j].flags) == 0)
1445 70 : continue;
1446 :
1447 100 : ind[0] = (unsigned char)basic_entities[j].charcode;
1448 100 : add_assoc_stringl(return_value, ind, basic_entities[j].entity, basic_entities[j].entitylen, 1);
1449 : }
1450 34 : add_assoc_stringl(return_value, "&", "&", sizeof("&") - 1, 1);
1451 :
1452 : break;
1453 : }
1454 : }
1455 : /* }}} */
1456 :
1457 : /*
1458 : * Local variables:
1459 : * tab-width: 4
1460 : * c-basic-offset: 4
1461 : * End:
1462 : * vim600: sw=4 ts=4 fdm=marker
1463 : * vim<600: sw=4 ts=4
1464 : */
|