PHP  
 PHP: Test and Code Coverage Analysis
downloads | QA | documentation | faq | getting help | mailing lists | reporting bugs | php.net sites | links | my php.net 
 

LTP GCOV extension - code coverage report
Current view: directory - standard - html.c
Test: PHP Code Coverage
Date: 2009-11-19 Instrumented lines: 470
Code covered: 80.4 % Executed lines: 378
Legend: not executed executed

       1                 : /*
       2                 :    +----------------------------------------------------------------------+
       3                 :    | PHP Version 5                                                        |
       4                 :    +----------------------------------------------------------------------+
       5                 :    | Copyright (c) 1997-2009 The PHP Group                                |
       6                 :    +----------------------------------------------------------------------+
       7                 :    | This source file is subject to version 3.01 of the PHP license,      |
       8                 :    | that is bundled with this package in the file LICENSE, and is        |
       9                 :    | available through the world-wide-web at the following url:           |
      10                 :    | http://www.php.net/license/3_01.txt                                  |
      11                 :    | If you did not receive a copy of the PHP license and are unable to   |
      12                 :    | obtain it through the world-wide-web, please send a note to          |
      13                 :    | license@php.net so we can mail you a copy immediately.               |
      14                 :    +----------------------------------------------------------------------+
      15                 :    | Authors: Rasmus Lerdorf <rasmus@php.net>                             |
      16                 :    |          Jaakko Hyvätti <jaakko.hyvatti@iki.fi>                      |
      17                 :    |          Wez Furlong <wez@thebrainroom.com>                          |
      18                 :    +----------------------------------------------------------------------+
      19                 : */
      20                 : 
      21                 : /* $Id: html.c 289605 2009-10-13 05:18:37Z moriyoshi $ */
      22                 : 
      23                 : /*
      24                 :  * HTML entity resources:
      25                 :  *
      26                 :  * http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset2.asp
      27                 :  * http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset3.asp
      28                 :  * http://www.unicode.org/Public/MAPPINGS/OBSOLETE/UNI2SGML.TXT
      29                 :  *
      30                 :  * http://www.w3.org/TR/2002/REC-xhtml1-20020801/dtds.html#h-A2
      31                 :  * 
      32                 :  */
      33                 : 
      34                 : #include "php.h"
      35                 : #if PHP_WIN32
      36                 : #include "config.w32.h"
      37                 : #else
      38                 : #include <php_config.h>
      39                 : #endif
      40                 : #include "reg.h"
      41                 : #include "html.h"
      42                 : #include "php_string.h"
      43                 : #include "SAPI.h"
      44                 : #if HAVE_LOCALE_H
      45                 : #include <locale.h>
      46                 : #endif
      47                 : #if HAVE_LANGINFO_H
      48                 : #include <langinfo.h>
      49                 : #endif
      50                 : 
      51                 : #if HAVE_MBSTRING
      52                 : # include "ext/mbstring/mbstring.h"
      53                 : ZEND_EXTERN_MODULE_GLOBALS(mbstring)
      54                 : #endif
      55                 : 
      56                 : enum entity_charset { cs_terminator, cs_8859_1, cs_cp1252, /* charsets referenced by entity_map[] and charset_map[] */
      57                 :                                           cs_8859_15, cs_utf_8, cs_big5, cs_gb2312, 
      58                 :                                           cs_big5hkscs, cs_sjis, cs_eucjp, cs_koi8r,
      59                 :                                           cs_cp1251, cs_8859_5, cs_cp866, cs_macroman
      60                 :                                         }; /* cs_terminator comes first (value 0) and is the charset of the last entity_map[] row */
      61                 : typedef const char *const entity_table_t; /* one table slot: a const pointer to a const entity-name string */
      62                 : 
      63                 : /* codepage 1252 is a Windows extension to iso-8859-1. */
      64                 : static entity_table_t ent_cp_1252[] = { /* bytes 0x80-0x9f (see entity_map); NULL = no entity listed for that byte */
      65                 :         "euro", NULL, "sbquo", "fnof", "bdquo", "hellip", "dagger",
      66                 :         "Dagger", "circ", "permil", "Scaron", "lsaquo", "OElig",
      67                 :         NULL, NULL, NULL, NULL, "lsquo", "rsquo", "ldquo", "rdquo",
      68                 :         "bull", "ndash", "mdash", "tilde", "trade", "scaron", "rsaquo",
      69                 :         "oelig", NULL, NULL, "Yuml" /* 0x9c - 0x9f */
      70                 : };
      71                 : 
      72                 : static entity_table_t ent_iso_8859_1[] = { /* one entity name per code from 0xa0 to 0xff; entity_map reuses this table for several charsets */
      73                 :         "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar",
      74                 :         "sect", "uml", "copy", "ordf", "laquo", "not", "shy", "reg",
      75                 :         "macr", "deg", "plusmn", "sup2", "sup3", "acute", "micro",
      76                 :         "para", "middot", "cedil", "sup1", "ordm", "raquo", "frac14",
      77                 :         "frac12", "frac34", "iquest", "Agrave", "Aacute", "Acirc",
      78                 :         "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
      79                 :         "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
      80                 :         "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
      81                 :         "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
      82                 :         "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
      83                 :         "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
      84                 :         "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
      85                 :         "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
      86                 :         "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
      87                 :         "uuml", "yacute", "thorn", "yuml"
      88                 : };
      89                 : 
      90                 : static entity_table_t ent_iso_8859_15[] = { /* bytes 0xa0-0xff (see entity_map); differs from the 8859-1 table at 0xa4, 0xa6, 0xa8, 0xb4, 0xb8, 0xbc-0xbe */
      91                 :         "nbsp", "iexcl", "cent", "pound", "euro", "yen", "Scaron",
      92                 :         "sect", "scaron", "copy", "ordf", "laquo", "not", "shy", "reg",
      93                 :         "macr", "deg", "plusmn", "sup2", "sup3", NULL, /* Zcaron (0xb4): no entity name listed */
      94                 :         "micro", "para", "middot", NULL, /* zcaron (0xb8): no entity name listed */ "sup1", "ordm",
      95                 :         "raquo", "OElig", "oelig", "Yuml", "iquest", "Agrave", "Aacute",
      96                 :         "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
      97                 :         "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
      98                 :         "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
      99                 :         "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
     100                 :         "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
     101                 :         "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
     102                 :         "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
     103                 :         "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
     104                 :         "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
     105                 :         "uuml", "yacute", "thorn", "yuml"
     106                 : };
     107                 : 
     108                 : static entity_table_t ent_uni_338_402[] = { /* code points 338-402 (see entity_map); NULL = no entity listed */
     109                 :         /* 338 (0x0152) */
     110                 :         "OElig", "oelig", NULL, NULL, NULL, NULL,
     111                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     112                 :         /* 352 (0x0160) */
     113                 :         "Scaron", "scaron", NULL, NULL, NULL, NULL, NULL, NULL,
     114                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     115                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     116                 :         /* 376 (0x0178) */
     117                 :         "Yuml", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     118                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     119                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     120                 :         /* 400 (0x0190) */
     121                 :         NULL, NULL, "fnof" /* fnof is 402, the last code point of this table */
     122                 : };
     123                 : 
     124                 : static entity_table_t ent_uni_spacing[] = { /* code points 710-732 (see entity_map) */
     125                 :         /* 710 */
     126                 :         "circ",
     127                 :         /* 711 - 730 */
     128                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     129                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     130                 :         /* 731 - 732 */
     131                 :         NULL, "tilde"
     132                 : };
     133                 : 
     134                 : static entity_table_t ent_uni_greek[] = { /* code points 913-982 (see entity_map) */
     135                 :         /* 913 */
     136                 :         "Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta",
     137                 :         "Iota", "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi", "Rho",
     138                 :         NULL, "Sigma", "Tau", "Upsilon", "Phi", "Chi", "Psi", "Omega", /* leading NULL: 930 is not mapped */
     139                 :         /* 938 - 944 are not mapped */
     140                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     141                 :         "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta",
     142                 :         "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho",
     143                 :         "sigmaf", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega",
     144                 :         /* 970 - 976 are not mapped */
     145                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     146                 :         "thetasym", "upsih", /* 977, 978 */
     147                 :         NULL, NULL, NULL, /* 979 - 981 are not mapped */
     148                 :         "piv" /* 982 */
     149                 : };
     150                 : 
     151                 : static entity_table_t ent_uni_punct[] = { /* code points 8194-8260 (see entity_map) */
     152                 :         /* 8194 */
     153                 :         "ensp", "emsp", NULL, NULL, NULL, NULL, NULL,
     154                 :         "thinsp", NULL, NULL, "zwnj", "zwj", "lrm", "rlm",
     155                 :         NULL, NULL, NULL, "ndash", "mdash", NULL, NULL, NULL,
     156                 :         /* 8216 */
     157                 :         "lsquo", "rsquo", "sbquo", NULL, "ldquo", "rdquo", "bdquo", NULL,
     158                 :         "dagger", "Dagger", "bull", NULL, NULL, NULL, "hellip",
     159                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "permil", NULL,
     160                 :         /* 8242 */
     161                 :         "prime", "Prime", NULL, NULL, NULL, NULL, NULL, "lsaquo", "rsaquo", NULL,
     162                 :         NULL, NULL, "oline", NULL, NULL, NULL, NULL, NULL,
     163                 :         "frasl" /* 8260 */
     164                 : };
     165                 : 
     166                 : static entity_table_t ent_uni_euro[] = { /* single-entry table; entity_map uses it for 8364 only */
     167                 :         "euro" /* 8364 */
     168                 : };
     169                 : 
     170                 : static entity_table_t ent_uni_8465_8501[] = { /* code points 8465-8501 (see entity_map) */
     171                 :         /* 8465 */
     172                 :         "image", NULL, NULL, NULL, NULL, NULL, NULL,
     173                 :         /* 8472 */
     174                 :         "weierp", NULL, NULL, NULL,
     175                 :         /* 8476 */
     176                 :         "real", NULL, NULL, NULL, NULL, NULL,
     177                 :         /* 8482 */
     178                 :         "trade", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     179                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     180                 :         /* 8501 */
     181                 :         "alefsym",
     182                 : };
     183                 : 
     184                 : static entity_table_t ent_uni_8592_9002[] = { /* code points 8592-9002 (see entity_map), mostly 16 entries per marker comment */
     185                 :         /* 8592 (0x2190) */
     186                 :         "larr", "uarr", "rarr", "darr", "harr", NULL, NULL, NULL,
     187                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     188                 :         /* 8608 (0x21a0) */
     189                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     190                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     191                 :         /* 8624 (0x21b0) */
     192                 :         NULL, NULL, NULL, NULL, NULL, "crarr", NULL, NULL,
     193                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     194                 :         /* 8640 (0x21c0) */
     195                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     196                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     197                 :         /* 8656 (0x21d0) */
     198                 :         "lArr", "uArr", "rArr", "dArr", "hArr", "vArr", NULL, NULL,
     199                 :         NULL, NULL, "lAarr", "rAarr", NULL, "rarrw", NULL, NULL,
     200                 :         /* 8672 (0x21e0) - this group spans 32 entries, up to 8703 */
     201                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     202                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     203                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     204                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     205                 :         /* 8704 (0x2200) */
     206                 :         "forall", "comp", "part", "exist", "nexist", "empty", NULL, "nabla",
     207                 :         "isin", "notin", "epsis", "ni", "notni", "bepsi", NULL, "prod",
     208                 :         /* 8720 (0x2210) */
     209                 :         "coprod", "sum", "minus", "mnplus", "plusdo", NULL, "setmn", "lowast",
     210                 :         "compfn", NULL, "radic", NULL, NULL, "prop", "infin", "ang90",
     211                 :         /* 8736 (0x2220) */
     212                 :         "ang", "angmsd", "angsph", "mid", "nmid", "par", "npar", "and",
     213                 :         "or", "cap", "cup", "int", NULL, NULL, "conint", NULL,
     214                 :         /* 8752 (0x2230) */
     215                 :         NULL, NULL, NULL, NULL, "there4", "becaus", NULL, NULL,
     216                 :         NULL, NULL, NULL, NULL, "sim", "bsim", NULL, NULL,
     217                 :         /* 8768 (0x2240) */
     218                 :         "wreath", "nsim", NULL, "sime", "nsime", "cong", NULL, "ncong",
     219                 :         "asymp", "nap", "ape", NULL, "bcong", "asymp", "bump", "bumpe", /* NOTE(review): "asymp" is listed for both 8776 and 8781 - confirm the second one is intended */
     220                 :         /* 8784 (0x2250) */
     221                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     222                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     223                 :         /* 8800 (0x2260) */
     224                 :         "ne", "equiv", NULL, NULL, "le", "ge", "lE", "gE",
     225                 :         "lnE", "gnE", "Lt", "Gt", "twixt", NULL, "nlt", "ngt",
     226                 :         /* 8816 (0x2270) */
     227                 :         "nles", "nges", "lsim", "gsim", NULL, NULL, "lg", "gl",
     228                 :         NULL, NULL, "pr", "sc", "cupre", "sscue", "prsim", "scsim",
     229                 :         /* 8832 (0x2280) */
     230                 :         "npr", "nsc", "sub", "sup", "nsub", "nsup", "sube", "supe",
     231                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     232                 :         /* 8848 (0x2290) */
     233                 :         NULL, NULL, NULL, NULL, NULL, "oplus", NULL, "otimes",
     234                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     235                 :         /* 8864 (0x22a0) */
     236                 :         NULL, NULL, NULL, NULL, NULL, "perp", NULL, NULL,
     237                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     238                 :         /* 8880 (0x22b0) */
     239                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     240                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     241                 :         /* 8896 (0x22c0) */
     242                 :         NULL, NULL, NULL, NULL, NULL, "sdot", NULL, NULL,
     243                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     244                 :         /* 8912 (0x22d0) */
     245                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     246                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     247                 :         /* 8928 (0x22e0) */
     248                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     249                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     250                 :         /* 8944 (0x22f0) */
     251                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     252                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     253                 :         /* 8960 (0x2300) */
     254                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     255                 :         "lceil", "rceil", "lfloor", "rfloor", NULL, NULL, NULL, NULL,
     256                 :         /* 8976 (0x2310) */
     257                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     258                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     259                 :         /* 8992 (0x2320) */
     260                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     261                 :         NULL, "lang", "rang" /* 9000 - 9002 */
     262                 : };
     263                 : 
     264                 : static entity_table_t ent_uni_9674[] = { /* single-entry table; entity_map uses it for 9674 only */
     265                 :         /* 9674 */
     266                 :         "loz"
     267                 : };
     268                 : 
     269                 : static entity_table_t ent_uni_9824_9830[] = { /* code points 9824-9830 (see entity_map) */
     270                 :         /* 9824 */
     271                 :         "spades", NULL, NULL, "clubs", NULL, "hearts", "diams" /* names at 9824, 9827, 9829, 9830 */
     272                 : };
     273                 : 
     274                 : static entity_table_t ent_koi8r[] = { /* bytes 0xa3-0xff (see entity_map); entries are numeric references ("#" + decimal code point) */
     275                 :         "#1105", /* "jo" (0xa3) */
     276                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
     277                 :         NULL, NULL, NULL, NULL, NULL, "#1025", /* "JO" (0xb3) */
     278                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
     279                 :         "#1102", "#1072", "#1073", "#1094", "#1076", "#1077", "#1092", /* 0xc0 onwards */
     280                 :         "#1075", "#1093", "#1080", "#1081", "#1082", "#1083", "#1084", 
     281                 :         "#1085", "#1086", "#1087", "#1103", "#1088", "#1089", "#1090", 
     282                 :         "#1091", "#1078", "#1074", "#1100", "#1099", "#1079", "#1096", 
     283                 :         "#1101", "#1097", "#1095", "#1098", "#1070", "#1040", "#1041", 
     284                 :         "#1062", "#1044", "#1045", "#1060", "#1043", "#1061", "#1048", 
     285                 :         "#1049", "#1050", "#1051", "#1052", "#1053", "#1054", "#1055", 
     286                 :         "#1071", "#1056", "#1057", "#1058", "#1059", "#1046", "#1042",
     287                 :         "#1068", "#1067", "#1047", "#1064", "#1069", "#1065", "#1063", 
     288                 :         "#1066"
     289                 : };
     290                 : 
     291                 : static entity_table_t ent_cp_1251[] = { /* Windows-1251 bytes 0x80-0xff (see entity_map); "#NNNN" = numeric reference, NULL = no entity listed */
     292                 :         "#1026", "#1027", "#8218", "#1107", "#8222", "hellip", "dagger",
     293                 :         "Dagger", "euro", "permil", "#1033", "#8249", "#1034", "#1036",
     294                 :         "#1035", "#1039", "#1106", "#8216", "#8217", "#8220", "#8221", /* 0x8e - 0x94; 0x93/0x94 are the double quotes U+201C/U+201D */
     295                 :         "bull", "ndash", "mdash", NULL, "trade", "#1113", "#8250", /* 0x95 - 0x9b; 0x98 has no entry */
     296                 :         "#1114", "#1116", "#1115", "#1119", "nbsp", "#1038", "#1118",
     297                 :         "#1032", "curren", "#1168", "brvbar", "sect", "#1025", "copy",
     298                 :         "#1028", "laquo", "not", "shy", "reg", "#1031", "deg", "plusmn",
     299                 :         "#1030", "#1110", "#1169", "micro", "para", "middot", "#1105",
     300                 :         "#8470", "#1108", "raquo", "#1112", "#1029", "#1109", "#1111",
     301                 :         "#1040", "#1041", "#1042", "#1043", "#1044", "#1045", "#1046", /* 0xc0 onwards: 1040 - 1103 in order */
     302                 :         "#1047", "#1048", "#1049", "#1050", "#1051", "#1052", "#1053",
     303                 :         "#1054", "#1055", "#1056", "#1057", "#1058", "#1059", "#1060",
     304                 :         "#1061", "#1062", "#1063", "#1064", "#1065", "#1066", "#1067",
     305                 :         "#1068", "#1069", "#1070", "#1071", "#1072", "#1073", "#1074",
     306                 :         "#1075", "#1076", "#1077", "#1078", "#1079", "#1080", "#1081",
     307                 :         "#1082", "#1083", "#1084", "#1085", "#1086", "#1087", "#1088",
     308                 :         "#1089", "#1090", "#1091", "#1092", "#1093", "#1094", "#1095",
     309                 :         "#1096", "#1097", "#1098", "#1099", "#1100", "#1101", "#1102",
     310                 :         "#1103"
     311                 : };
     312                 : 
     313                 : static entity_table_t ent_iso_8859_5[] = { /* ISO-8859-5 bytes 0xc0-0xff (see entity_map, which starts this table at 0xc0) */
     314                 :         "#1056", "#1057", "#1058", "#1059", "#1060", "#1061", "#1062",
     315                 :         "#1063", "#1064", "#1065", "#1066", "#1067", "#1068", "#1069",
     316                 :         "#1070", "#1071", "#1072", "#1073", "#1074", "#1075", "#1076",
     317                 :         "#1077", "#1078", "#1079", "#1080", "#1081", "#1082", "#1083",
     318                 :         "#1084", "#1085", "#1086", "#1087", "#1088", "#1089", "#1090",
     319                 :         "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097",
     320                 :         "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#8470", /* last entry is 0xf0 = U+2116 NUMERO SIGN, not U+0450 */
     321                 :         "#1105", "#1106", "#1107", "#1108", "#1109", "#1110", "#1111",
     322                 :         "#1112", "#1113", "#1114", "#1115", "#1116", "sect", "#1118", /* 0xfd = U+00A7 SECTION SIGN, not U+045D */
     323                 :         "#1119"
     324                 : };
     325                 : 
     326                 : static entity_table_t ent_cp_866[] = { /* bytes 0xc0-0xff (see entity_map), all as numeric references */
     327                 : /* first 32 entries (0xc0 - 0xdf) are code points in the 9472-9616 range */
     328                 :         "#9492", "#9524", "#9516", "#9500", "#9472", "#9532", "#9566", 
     329                 :         "#9567", "#9562", "#9556", "#9577", "#9574", "#9568", "#9552", 
     330                 :         "#9580", "#9575", "#9576", "#9572", "#9573", "#9561", "#9560", 
     331                 :         "#9554", "#9555", "#9579", "#9578", "#9496", "#9484", "#9608", 
     332                 :         "#9604", "#9612", "#9616", "#9600", "#1088", "#1089", "#1090", /* 0xe0 starts at "#1088" */
     333                 :         "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097", 
     334                 :         "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#1025", 
     335                 :         "#1105", "#1028", "#1108", "#1031", "#1111", "#1038", "#1118", 
     336                 :         "#176", "#8729", "#183", "#8730", "#8470", "#164",  "#9632", 
     337                 :         "#160"
     338                 : };
     339                 : 
     340                 : /* MacRoman has a couple of low-ascii chars that need mapping too */
     341                 : /* Vertical tab (ASCII 11) is often used to store line breaks inside */
     342                 : /* DB exports, this mapping changes it to a space */
     343                 : static entity_table_t ent_macroman[] = { /* bytes 0x0b-0xff (see entity_map); NULL = no entity listed */
     344                 :         "sp", NULL, NULL, NULL, /* 0x0b (vertical tab) - 0x0e */
     345                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     346                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     347                 :         NULL, NULL, NULL, NULL, NULL, "quot", NULL, /* quot is 0x22 */
     348                 :         NULL, NULL, "amp", NULL, NULL, NULL, NULL, /* amp is 0x26 */
     349                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     350                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     351                 :         NULL, NULL, NULL, "lt", NULL, "gt", NULL, /* lt is 0x3c, gt is 0x3e */
     352                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     353                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     354                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     355                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     356                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     357                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     358                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     359                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     360                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     361                 :         NULL, "Auml", "Aring", "Ccedil", "Eacute", "Ntilde", "Ouml", /* NULL is 0x7f; "Auml" starts the 0x80 - 0xff half */
     362                 :         "Uuml", "aacute", "agrave", "acirc", "auml", "atilde", "aring",
     363                 :         "ccedil", "eacute", "egrave", "ecirc", "euml", "iacute", "igrave",
     364                 :         "icirc", "iuml", "ntilde", "oacute", "ograve", "ocirc", "ouml",
     365                 :         "otilde", "uacute", "ugrave", "ucirc", "uuml", "dagger", "deg",
     366                 :         "cent", "pound", "sect", "bull", "para", "szlig", "reg",
     367                 :         "copy", "trade", "acute", "uml", "ne", "AElig", "Oslash",
     368                 :         "infin", "plusmn", "le", "ge", "yen", "micro", "part",
     369                 :         "sum", "prod", "pi", "int", "ordf", "ordm", "Omega",
     370                 :         "aelig", "oslash", "iquest", "iexcl", "not", "radic", "fnof",
     371                 :         "asymp", "#8710", "laquo", "raquo", "hellip", "nbsp", "Agrave",
     372                 :         "Atilde", "Otilde", "OElig", "oelig", "ndash", "mdash", "ldquo",
     373                 :         "rdquo", "lsquo", "rsquo", "divide", "loz", "yuml", "Yuml",
     374                 :         "frasl", "euro", "lsaquo", "rsaquo", "#xFB01", "#xFB02", "Dagger", /* "#x..." entries are hexadecimal numeric references */
     375                 :         "middot", "sbquo", "bdquo", "permil", "Acirc", "Ecirc", "Aacute",
     376                 :         "Euml", "Egrave", "Iacute", "Icirc", "Iuml", "Igrave", "Oacute",
     377                 :         "Ocirc", "#xF8FF", "Ograve", "Uacute", "Ucirc", "Ugrave", "#305",
     378                 :         "circ", "tilde", "macr", "#728", "#729", "#730", "cedil",
     379                 :         "#733", "#731", "#711" /* 0xfd - 0xff */
     380                 : };
     381                 : 
     382                 : struct html_entity_map { /* one contiguous run of char codes in one charset, plus its entity-name table */
     383                 :         enum entity_charset charset;    /* charset this run belongs to */
     384                 :         unsigned short basechar;                        /* char code that table[0] stands for */
     385                 :         unsigned short endchar;                 /* last char code in the table (inclusive) */
     386                 :         entity_table_t *table;                  /* entity names for basechar..endchar, in order */
     387                 : };
     388                 : 
     389                 : static const struct html_entity_map entity_map[] = { /* rows are grouped by charset; a charset may own several ranges */
     390                 :         { cs_cp1252,            0x80, 0x9f, ent_cp_1252 },
     391                 :         { cs_cp1252,            0xa0, 0xff, ent_iso_8859_1 }, /* upper half of cp1252 reuses the 8859-1 table */
     392                 :         { cs_8859_1,            0xa0, 0xff, ent_iso_8859_1 },
     393                 :         { cs_8859_15,           0xa0, 0xff, ent_iso_8859_15 },
     394                 :         { cs_utf_8,             0xa0, 0xff, ent_iso_8859_1 },
     395                 :         { cs_utf_8,             338,  402,  ent_uni_338_402 },
     396                 :         { cs_utf_8,             710,  732,  ent_uni_spacing },
     397                 :         { cs_utf_8,             913,  982,  ent_uni_greek },
     398                 :         { cs_utf_8,             8194, 8260, ent_uni_punct },
     399                 :         { cs_utf_8,             8364, 8364, ent_uni_euro }, 
     400                 :         { cs_utf_8,             8465, 8501, ent_uni_8465_8501 },
     401                 :         { cs_utf_8,             8592, 9002, ent_uni_8592_9002 },
     402                 :         { cs_utf_8,             9674, 9674, ent_uni_9674 },
     403                 :         { cs_utf_8,             9824, 9830, ent_uni_9824_9830 },
     404                 :         { cs_big5,                      0xa0, 0xff, ent_iso_8859_1 }, /* NOTE(review): this and the next four rows pair CJK charsets with the 8859-1 table - confirm against the lookup code */
     405                 :         { cs_gb2312,            0xa0, 0xff, ent_iso_8859_1 },
     406                 :         { cs_big5hkscs,         0xa0, 0xff, ent_iso_8859_1 },
     407                 :         { cs_sjis,                      0xa0, 0xff, ent_iso_8859_1 },
     408                 :         { cs_eucjp,                     0xa0, 0xff, ent_iso_8859_1 },
     409                 :         { cs_koi8r,                 0xa3, 0xff, ent_koi8r },
     410                 :         { cs_cp1251,            0x80, 0xff, ent_cp_1251 },
     411                 :         { cs_8859_5,            0xc0, 0xff, ent_iso_8859_5 },
     412                 :         { cs_cp866,                 0xc0, 0xff, ent_cp_866 },
     413                 :         { cs_macroman,          0x0b, 0xff, ent_macroman },
     414                 :         { cs_terminator } /* last row: charset cs_terminator, remaining fields zero-initialized */
     415                 : };
     416                 : 
     417                 : static const struct { /* pairs a charset name spelling with its entity_charset value */
     418                 :         const char *codeset; /* name string; several spellings may map to the same charset */
     419                 :         enum entity_charset charset;
     420                 : } charset_map[] = {
     421                 :         { "ISO-8859-1",       cs_8859_1 },
     422                 :         { "ISO8859-1",                cs_8859_1 },
     423                 :         { "ISO-8859-15",      cs_8859_15 },
     424                 :         { "ISO8859-15",       cs_8859_15 },
     425                 :         { "utf-8",                    cs_utf_8 },
     426                 :         { "cp1252",           cs_cp1252 },
     427                 :         { "Windows-1252",     cs_cp1252 },
     428                 :         { "1252",           cs_cp1252 }, 
     429                 :         { "BIG5",                     cs_big5 },
     430                 :         { "950",            cs_big5 },
     431                 :         { "GB2312",                   cs_gb2312 },
     432                 :         { "936",            cs_gb2312 },
     433                 :         { "BIG5-HKSCS",               cs_big5hkscs },
     434                 :         { "Shift_JIS",                cs_sjis },
     435                 :         { "SJIS",             cs_sjis },
     436                 :         { "932",            cs_sjis },
     437                 :         { "EUCJP",            cs_eucjp },
     438                 :         { "EUC-JP",                   cs_eucjp },
     439                 :         { "KOI8-R",         cs_koi8r },
     440                 :         { "koi8-ru",        cs_koi8r },
     441                 :         { "koi8r",          cs_koi8r },
     442                 :         { "cp1251",         cs_cp1251 },
     443                 :         { "Windows-1251",   cs_cp1251 },
     444                 :         { "win-1251",       cs_cp1251 },
     445                 :         { "iso8859-5",      cs_8859_5 },
     446                 :         { "iso-8859-5",     cs_8859_5 },
     447                 :         { "cp866",          cs_cp866 },
     448                 :         { "866",            cs_cp866 },    
     449                 :         { "ibm866",         cs_cp866 },
     450                 :         { "MacRoman",       cs_macroman },
     451                 :         { NULL } /* last row: NULL codeset, charset zero-initialized (cs_terminator) */
     452                 : };
     453                 : /* basic_entities: characters that have a fixed entity spelling.
                         :  *   charcode  - the character
                         :  *   entity    - entity text for that character
                         :  *   entitylen - length of entity in bytes (matches the literals below)
                         :  *   flags     - ENT_HTML_QUOTE_* value attached to the quote entries,
                         :  *               0 for the less-than and greater-than entries
                         :  * The single quote is listed twice, once per numeric spelling.  The
                         :  * table ends with the { 0, NULL, 0, 0 } entry.
                         :  * NOTE(review): how flags and the duplicate single-quote entry are
                         :  * used is decided by code outside this part of the listing - confirm
                         :  * at the use sites. */
     454                 : static const struct {
     455                 :         unsigned short charcode;
     456                 :         char *entity;
     457                 :         int entitylen;
     458                 :         int flags;
     459                 : } basic_entities[] = {
     460                 :         { '"',     "&quot;", 6,      ENT_HTML_QUOTE_DOUBLE },
     461                 :         { '\'', "&#039;", 6,      ENT_HTML_QUOTE_SINGLE },
     462                 :         { '\'', "&#39;",  5,      ENT_HTML_QUOTE_SINGLE },
     463                 :         { '<',       "&lt;",           4,      0 },
     464                 :         { '>',       "&gt;",           4,      0 },
     465                 :         { 0, NULL, 0, 0 }
     466                 : };
     467                 :         /* basic_entities_dec: record holding a character code, an inline
                         :          * 8-byte buffer for the entity text, and the entity length.
                         :          * NOTE(review): the users of this struct are not in this part of
                         :          * the listing; presumably it backs a decode table - confirm. */
     468                 : struct basic_entities_dec {
     469                 :         unsigned short charcode;
     470                 :         char entity[8];
     471                 :         int entitylen;  
     472                 : };
     473                 :         /* Helpers for a function that has these names in scope:
                         :          * pos, mbpos, mbspace, this_char, mbseq, mbseqlen, newpos.
                         :          *
                         :          * MB_RETURN: publish the current position through *newpos,
                         :          * NUL-terminate mbseq at mbpos, store mbpos in *mbseqlen and return
                         :          * this_char.
                         :          *
                         :          * MB_WRITE(mbchar): decrement mbspace first; if that leaves 0, do
                         :          * MB_RETURN without storing mbchar (so the last slot of mbseq stays
                         :          * free for the NUL terminator), otherwise append mbchar to mbseq.
                         :          *
                         :          * Both expand to a bare { ... } block (not do/while(0)) and both can
                         :          * return from the enclosing function. */
     474                 : #define MB_RETURN { \
     475                 :                         *newpos = pos;       \
     476                 :                         mbseq[mbpos] = '\0'; \
     477                 :                         *mbseqlen = mbpos;   \
     478                 :                         return this_char; }
     479                 :                                         
     480                 : #define MB_WRITE(mbchar) { \
     481                 :                         mbspace--;  \
     482                 :                         if (mbspace == 0) {      \
     483                 :                                 MB_RETURN;           \
     484                 :                         }                        \
     485                 :                         mbseq[mbpos++] = (mbchar); }
     486                 : 
     487                 : /* skip one byte and return
                         :  *
                         :  * Sets *newpos to the given pos plus one, sets *status to FAILURE and
                         :  * returns 0 from the enclosing function (newpos and status must be in
                         :  * scope).  The byte skipped is relative to the pos value passed in, so
                         :  * a caller that has already advanced pos moves *newpos past everything
                         :  * it consumed plus one more byte.
                         :  * The argument is expanded unparenthesized (pos + 1). */
     488                 : #define MB_FAILURE(pos) do { \
     489                 :         *newpos = pos + 1; \
     490                 :         *status = FAILURE; \
     491                 :         return 0; \
     492                 : } while (0)
     493                 : /* CHECK_LEN(pos, chars_need): leave the enclosing function when fewer
                         :  * than chars_need bytes remain, i.e. (str_len - pos) < chars_need.
                         :  * On that path *status is set to FAILURE and 0 is returned; *newpos is
                         :  * set to pos when chars_need < 1 and to pos + 1 otherwise.  Both
                         :  * branches test the same length condition and differ only in *newpos.
                         :  * Expects str_len, newpos and status to be in scope.
                         :  * NOTE(review): expands to a bare if/else rather than do/while(0), so
                         :  * it is not safe as the body of an unbraced if.  Every use in
                         :  * get_next_char passes a literal between 1 and 4, so only the else
                         :  * branch applies there. */
     494                 : #define CHECK_LEN(pos, chars_need)                      \
     495                 :         if (chars_need < 1) {                                                \
     496                 :                 if((str_len - (pos)) < chars_need) { \
     497                 :                         *newpos = pos;                                          \
     498                 :                         *status = FAILURE;                                      \
     499                 :                         return 0;                                                       \
     500                 :                 }                                                                               \
     501                 :         } else {                                                                        \
     502                 :                 if((str_len - (pos)) < chars_need) { \
     503                 :                         *newpos = pos + 1;                                      \
     504                 :                         *status = FAILURE;                                      \
     505                 :                         return 0;                                                       \
     506                 :                 }                                                                               \
     507                 :         }
     508                 : 
     509                 : /* {{{ get_next_char
                         :  * Reads one character of the given charset from str, starting at offset
                         :  * *newpos, and returns its numeric value.
                         :  *
                         :  *  charset   selects the decoder: UTF-8, the Big5/GB2312/Big5-HKSCS
                         :  *            group, Shift_JIS or EUC-JP; any other value is handled as
                         :  *            a single-byte charset
                         :  *  str       input bytes
                         :  *  str_len   number of bytes in str
                         :  *  newpos    in: offset to read from; out: offset to continue from
                         :  *  mbseq     receives a NUL-terminated copy of the bytes that made up
                         :  *            the character
                         :  *  mbseqlen  in: capacity of mbseq; out: number of bytes copied.  When
                         :  *            the capacity is <= 0 nothing is copied, *mbseqlen is set
                         :  *            to 0 and the next single byte is returned whatever the
                         :  *            charset is
                         :  *  status    set to SUCCESS on entry and to FAILURE when the input is
                         :  *            truncated or malformed
                         :  *
                         :  * Return value: for UTF-8 the decoded code point; for the double-byte
                         :  * forms (lead << 8) | trail; for the three-byte EUC-JP form
                         :  * (0x8f << 16) | (second << 8) | third; otherwise the byte itself.
                         :  * On FAILURE the return value is 0.
                         :  *
                         :  * NOTE(review): if mbseq is too small for the whole sequence, MB_WRITE
                         :  * returns early with status still SUCCESS and with pos/this_char as
                         :  * they stand at that moment - confirm callers always pass a buffer
                         :  * large enough for the longest sequence plus the terminator.
                         :  * NOTE(review): in the non-UTF-8 branches pos has already been advanced
                         :  * past the bytes read when CHECK_LEN or MB_FAILURE fire, so *newpos
                         :  * ends up one past those bytes (and possibly past str_len) - confirm
                         :  * that callers bound their loops by str_len.
                         :  * NOTE(review): the UTF-8 branch rejects overlong forms, but no check
                         :  * is visible here for surrogate values (0xd800-0xdfff) or for values
                         :  * above 0x10ffff (lead bytes up to 0xf7 are accepted).
     510                 :  */
     511                 : inline static unsigned int get_next_char(enum entity_charset charset,
     512                 :                 unsigned char * str,
     513                 :                 int str_len,
     514                 :                 int * newpos,
     515                 :                 unsigned char * mbseq,
     516                 :                 int * mbseqlen, 
     517                 :                 int *status)
     518           29886 : {
     519           29886 :         int pos = *newpos;
     520           29886 :         int mbpos = 0;
     521           29886 :         int mbspace = *mbseqlen;
     522           29886 :         unsigned int this_char = 0;
     523                 :         unsigned char next_char;
     524                 : 
     525           29886 :         *status = SUCCESS;
     526                 : 
     527           29886 :         if (mbspace <= 0) { /* no room in mbseq: hand back one raw byte */
     528               0 :                 *mbseqlen = 0;
     529               0 :                 CHECK_LEN(pos, 1);
     530               0 :                 *newpos = pos + 1;
     531               0 :                 return str[pos];
     532                 :         }
     533                 : 
     534           29886 :         switch (charset) {
     535                 :                 case cs_utf_8:
     536                 :                         {
     537                 :                                 unsigned char c;
     538              90 :                                 CHECK_LEN(pos, 1);
     539              90 :                                 c = str[pos];
     540              90 :                                 if (c < 0x80) { /* single byte */
     541              22 :                                         MB_WRITE(c);
     542              22 :                                         this_char = c;
     543              22 :                                         pos++;
     544              68 :                                 } else if (c < 0xc0) { /* 0x80..0xbf: a continuation byte cannot start a sequence */
     545               0 :                                         MB_FAILURE(pos);
     546              68 :                                 } else if (c < 0xe0) { /* two-byte sequence */
     547              23 :                                         CHECK_LEN(pos, 2);
     548              19 :                                         if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
     549               2 :                                                 MB_FAILURE(pos);
     550                 :                                         }
     551              17 :                                         this_char = ((c & 0x1f) << 6) | (str[pos + 1] & 0x3f);
     552              17 :                                         if (this_char < 0x80) { /* overlong: fits in one byte */
     553               3 :                                                 MB_FAILURE(pos);
     554                 :                                         }
     555              14 :                                         MB_WRITE((unsigned char)c);
     556              14 :                                         MB_WRITE((unsigned char)str[pos + 1]);
     557              14 :                                         pos += 2;
     558              45 :                                 } else if (c < 0xf0) { /* three-byte sequence */
     559              33 :                                         CHECK_LEN(pos, 3);
     560              27 :                                         if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
     561               3 :                                                 MB_FAILURE(pos);
     562                 :                                         }
     563              24 :                                         if (str[pos + 2] < 0x80 || str[pos + 2] > 0xbf) {
     564               2 :                                                 MB_FAILURE(pos);
     565                 :                                         }
     566              22 :                                         this_char = ((c & 0x0f) << 12) | ((str[pos + 1] & 0x3f) << 6) | (str[pos + 2] & 0x3f);
     567              22 :                                         if (this_char < 0x800) { /* overlong: fits in two bytes */
     568               6 :                                                 MB_FAILURE(pos);
     569                 :                                         }
     570              16 :                                         MB_WRITE((unsigned char)c);
     571              16 :                                         MB_WRITE((unsigned char)str[pos + 1]);
     572              16 :                                         MB_WRITE((unsigned char)str[pos + 2]);
     573              16 :                                         pos += 3;
     574              12 :                                 } else if (c < 0xf8) { /* four-byte sequence */
     575              11 :                                         CHECK_LEN(pos, 4);
     576              11 :                                         if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
     577               2 :                                                 MB_FAILURE(pos);
     578                 :                                         }
     579               9 :                                         if (str[pos + 2] < 0x80 || str[pos + 2] > 0xbf) {
     580               2 :                                                 MB_FAILURE(pos);
     581                 :                                         }
     582               7 :                                         if (str[pos + 3] < 0x80 || str[pos + 3] > 0xbf) {
     583               2 :                                                 MB_FAILURE(pos);
     584                 :                                         }
     585               5 :                                         this_char = ((c & 0x07) << 18) | ((str[pos + 1] & 0x3f) << 12) | ((str[pos + 2] & 0x3f) << 6) | (str[pos + 3] & 0x3f);
     586               5 :                                         if (this_char < 0x10000) { /* overlong: fits in three bytes */
     587               3 :                                                 MB_FAILURE(pos);
     588                 :                                         }
     589               2 :                                         MB_WRITE((unsigned char)c);
     590               2 :                                         MB_WRITE((unsigned char)str[pos + 1]);
     591               2 :                                         MB_WRITE((unsigned char)str[pos + 2]);
     592               2 :                                         MB_WRITE((unsigned char)str[pos + 3]);
     593               2 :                                         pos += 4;
     594                 :                                 } else { /* lead byte 0xf8 or above is rejected */
     595               1 :                                         MB_FAILURE(pos);
     596                 :                                 }
     597                 :                         }
     598              54 :                         break;
     599                 :                 case cs_big5:
     600                 :                 case cs_gb2312:
     601                 :                 case cs_big5hkscs:
     602                 :                         {
     603            1262 :                                 CHECK_LEN(pos, 1);
     604            1262 :                                 this_char = str[pos++];
     605                 :                                 /* check if this is the first of a 2-byte sequence */
     606            1766 :                                 if (this_char >= 0x81 && this_char <= 0xfe) {
     607                 :                                         /* peek at the next char */
     608            1260 :                                         CHECK_LEN(pos, 1);
     609            1134 :                                         next_char = str[pos++];
     610            1134 :                                         if ((next_char >= 0x40 && next_char <= 0x7e) ||
     611                 :                                                         (next_char >= 0xa1 && next_char <= 0xfe)) {
     612                 :                                                 /* yes, this a wide char */
     613             504 :                                                 MB_WRITE(this_char);
     614             504 :                                                 MB_WRITE(next_char);
     615             504 :                                                 this_char = (this_char << 8) | next_char;
     616                 :                                         } else {
     617             630 :                                                 MB_FAILURE(pos); /* pos is already past both bytes here */
     618                 :                                         }
     619                 :                                 } else {
     620               2 :                                         MB_WRITE(this_char);
     621                 :                                 }
     622                 :                         }
     623             506 :                         break;
     624                 :                 case cs_sjis:
     625                 :                         {
     626             681 :                                 CHECK_LEN(pos, 1);
     627             681 :                                 this_char = str[pos++];
     628                 :                                 /* check if this is the first of a 2-byte sequence */
     629             930 :                                 if ((this_char >= 0x81 && this_char <= 0x9f) ||
     630                 :                                         (this_char >= 0xe0 && this_char <= 0xfc)) {
     631                 :                                         /* peek at the next char */
     632             609 :                                         CHECK_LEN(pos, 1);
     633             549 :                                         next_char = str[pos++];
     634             549 :                                         if ((next_char >= 0x40 && next_char <= 0x7e) ||
     635                 :                                                 (next_char >= 0x80 && next_char <= 0xfc))
     636                 :                                         {
     637                 :                                                 /* yes, this a wide char */
     638             249 :                                                 MB_WRITE(this_char);
     639             249 :                                                 MB_WRITE(next_char);
     640             249 :                                                 this_char = (this_char << 8) | next_char;
     641                 :                                         } else {
     642             300 :                                                 MB_FAILURE(pos); /* pos is already past both bytes here */
     643                 :                                         }
     644                 :                                 } else {
     645              72 :                                         MB_WRITE(this_char);
     646                 :                                 }
     647             321 :                                 break;
     648                 :                         }
     649                 :                 case cs_eucjp:
     650                 :                         {
     651            2402 :                                 CHECK_LEN(pos, 1);
     652            2402 :                                 this_char = str[pos++];
     653                 :                                 /* check if this is the first of a multi-byte sequence */
     654            2602 :                                 if (this_char >= 0xa1 && this_char <= 0xfe) {
     655                 :                                         /* peek at the next char */
     656             797 :                                         CHECK_LEN(pos, 1);
     657             576 :                                         next_char = str[pos++];
     658             576 :                                         if (next_char >= 0xa1 && next_char <= 0xfe) {
     659                 :                                                 /* yes, this a jis kanji char */
     660             200 :                                                 MB_WRITE(this_char);
     661             200 :                                                 MB_WRITE(next_char);
     662             200 :                                                 this_char = (this_char << 8) | next_char;
     663                 :                                         } else {
     664             376 :                                                 MB_FAILURE(pos);
     665                 :                                         }
     666            1605 :                                 } else if (this_char == 0x8e) {
     667                 :                                         /* peek at the next char */
     668             661 :                                         CHECK_LEN(pos, 1);
     669             660 :                                         next_char = str[pos++];
     670             660 :                                         if (next_char >= 0xa1 && next_char <= 0xdf) {
     671                 :                                                 /* JIS X 0201 kana */
     672             443 :                                                 MB_WRITE(this_char);
     673             443 :                                                 MB_WRITE(next_char);
     674             443 :                                                 this_char = (this_char << 8) | next_char;
     675                 :                                         } else {
     676             217 :                                                 MB_FAILURE(pos);
     677                 :                                         }
     678             944 :                                 } else if (this_char == 0x8f) {
     679                 :                                         /* peek at the next two char */
     680                 :                                         unsigned char next2_char;
     681             661 :                                         CHECK_LEN(pos, 2);
     682             565 :                                         next_char = str[pos];
     683             565 :                                         next2_char = str[pos + 1];
     684             565 :                                         pos += 2;
     685             565 :                                         if ((next_char >= 0xa1 && next_char <= 0xfe) &&
     686                 :                                                 (next2_char >= 0xa1 && next2_char <= 0xfe)) {
     687                 :                                                 /* JIS X 0212 hojo-kanji */
     688             189 :                                                 MB_WRITE(this_char);
     689             189 :                                                 MB_WRITE(next_char);
     690             189 :                                                 MB_WRITE(next2_char);
     691             189 :                                                 this_char = (this_char << 16) | (next_char << 8) | next2_char;
     692                 :                                         } else {
     693             376 :                                                 MB_FAILURE(pos); /* pos is already past all three bytes here */
     694                 :                                         }
     695                 :                                 } else {
     696             283 :                                         MB_WRITE(this_char);
     697                 :                                 }
     698            1115 :                                 break;
     699                 :                         }
     700                 :                 default:
     701                 :                         /* single-byte charsets */
     702           25451 :                         CHECK_LEN(pos, 1);
     703           25451 :                         this_char = str[pos++];
     704           25451 :                         MB_WRITE(this_char);
     705                 :                         break;
     706                 :         }
     707           27447 :         MB_RETURN;
     708                 : }
     709                 : /* }}} */
     710                 : 
     711                 : /* {{{ entity_charset determine_charset
     712                 :  * returns the charset identifier based on current locale or a hint.
     713                 :  * defaults to iso-8859-1
                         :  *
                         :  * Sources are tried in this order:
                         :  *  1. charset_hint == NULL: cs_8859_1 is returned at once.
                         :  *  2. non-empty charset_hint: looked up in charset_map.
                         :  *  3. empty charset_hint with mbstring available (HAVE_MBSTRING):
                         :  *     - without COMPILE_DL_MBSTRING, the mbstring internal encoding is
                         :  *       mapped straight to a charset and returned; encodings that
                         :  *       have no case here fall through to step 4
                         :  *     - with COMPILE_DL_MBSTRING, mb_internal_encoding is called as a
                         :  *       user function and its result becomes the hint; the results
                         :  *       pass, auto and none clear the hint, which yields the default;
                         :  *       a failed call falls through to step 4
                         :  *  4. SG(default_charset), if set and non-empty.
                         :  *  5. nl_langinfo(CODESET), when available and non-empty.
                         :  *  6. the codeset part of setlocale(LC_CTYPE, NULL), or the whole
                         :  *     locale name when it has no dot (only with HAVE_LOCALE_H).
                         :  *
                         :  * The lookup is a case-insensitive, exact-length match; an unknown
                         :  * name raises an E_WARNING and the iso-8859-1 default is returned.
                         :  *
                         :  * NOTE(review): the setlocale() result is handed to strchr() without
                         :  * a NULL check.
                         :  * NOTE(review): when len is taken as (at - dot) the hint is not
                         :  * NUL-terminated at len, so the warning, which prints it with %s,
                         :  * would include the modifier part as well.
                         :  * NOTE(review): len is an int compared against the size_t result of
                         :  * strlen().
                         :  * NOTE(review): in the COMPILE_DL_MBSTRING path uf_result is read
                         :  * with Z_STRVAL_P/Z_STRLEN_P without a type or NULL check visible
                         :  * here. */
     714                 : static enum entity_charset determine_charset(char *charset_hint TSRMLS_DC)
     715            5956 : {
     716                 :         int i;
     717            5956 :         enum entity_charset charset = cs_8859_1;
     718            5956 :         int len = 0;
     719            5956 :         zval *uf_result = NULL;
     720                 : 
     721                 :         /* Guarantee default behaviour for backwards compatibility */
     722            5956 :         if (charset_hint == NULL)
     723            1863 :                 return cs_8859_1;
     724                 : 
     725            4093 :         if ((len = strlen(charset_hint)) != 0) { /* explicit non-empty hint: look it up directly */
     726            4070 :                 goto det_charset;
     727                 :         }
     728                 : #if HAVE_MBSTRING
     729                 : #if !defined(COMPILE_DL_MBSTRING)
     730                 :         /* XXX: Ugly things. Why don't we look for a more sophisticated way? */
     731              23 :         switch (MBSTRG(current_internal_encoding)) {
     732                 :                 case mbfl_no_encoding_8859_1:
     733               2 :                         return cs_8859_1;
     734                 : 
     735                 :                 case mbfl_no_encoding_utf8:
     736               0 :                         return cs_utf_8;
     737                 : 
     738                 :                 case mbfl_no_encoding_euc_jp:
     739                 :                 case mbfl_no_encoding_eucjp_win:
     740               2 :                         return cs_eucjp;
     741                 : 
     742                 :                 case mbfl_no_encoding_sjis:
     743                 :                 case mbfl_no_encoding_sjis_win:
     744                 :                 case mbfl_no_encoding_sjis_mac:
     745               2 :                         return cs_sjis;
     746                 : 
     747                 :                 case mbfl_no_encoding_cp1252:
     748               3 :                         return cs_cp1252;
     749                 : 
     750                 :                 case mbfl_no_encoding_8859_15:
     751               2 :                         return cs_8859_15;
     752                 : 
     753                 :                 case mbfl_no_encoding_big5:
     754               0 :                         return cs_big5;
     755                 : 
     756                 :                 case mbfl_no_encoding_euc_cn:
     757                 :                 case mbfl_no_encoding_hz:
     758                 :                 case mbfl_no_encoding_cp936:
     759               0 :                         return cs_gb2312;
     760                 : 
     761                 :                 case mbfl_no_encoding_koi8r:
     762               0 :                         return cs_koi8r;
     763                 : 
     764                 :                 case mbfl_no_encoding_cp866:
     765               0 :                         return cs_cp866;
     766                 : 
     767                 :                 case mbfl_no_encoding_cp1251:
     768               2 :                         return cs_cp1251;
     769                 : 
     770                 :                 case mbfl_no_encoding_8859_5:
     771               0 :                         return cs_8859_5;
     772                 : 
     773                 :                 default:
     774                 :                         ;
     775                 :         }
     776                 : #else
     777                 :         {
     778                 :                 zval nm_mb_internal_encoding;
     779                 : 
     780                 :                 ZVAL_STRING(&nm_mb_internal_encoding, "mb_internal_encoding", 0);
     781                 : 
     782                 :                 if (call_user_function_ex(CG(function_table), NULL, &nm_mb_internal_encoding, &uf_result, 0, NULL, 1, NULL TSRMLS_CC) != FAILURE) {
     783                 : 
     784                 :                         charset_hint = Z_STRVAL_P(uf_result);
     785                 :                         len = Z_STRLEN_P(uf_result);
     786                 :                         
     787                 :                         if (len == 4) { /* sizeof(none|auto|pass)-1 */
     788                 :                                 if (!memcmp("pass", charset_hint, sizeof("pass") - 1) || 
     789                 :                                     !memcmp("auto", charset_hint, sizeof("auto") - 1) || 
     790                 :                                     !memcmp("none", charset_hint, sizeof("none") - 1)) {
     791                 :                                         
     792                 :                                         charset_hint = NULL; /* the lookup below is skipped, leaving the default */
     793                 :                                         len = 0;
     794                 :                                 }
     795                 :                         }
     796                 :                         goto det_charset;
     797                 :                 }
     798                 :         }
     799                 : #endif
     800                 : #endif
     801                 : 
     802              10 :         charset_hint = SG(default_charset);
     803              10 :         if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) {
     804               6 :                 goto det_charset;
     805                 :         }
     806                 : 
     807                 :         /* try to detect the charset for the locale */
     808                 : #if HAVE_NL_LANGINFO && HAVE_LOCALE_H && defined(CODESET)
     809               4 :         charset_hint = nl_langinfo(CODESET);
     810               4 :         if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) {
     811               4 :                 goto det_charset;
     812                 :         }
     813                 : #endif
     814                 : 
     815                 : #if HAVE_LOCALE_H
     816                 :         /* try to figure out the charset from the locale */
     817                 :         {
     818                 :                 char *localename;
     819                 :                 char *dot, *at;
     820                 : 
     821                 :                 /* lang[_territory][.codeset][@modifier] */
     822               0 :                 localename = setlocale(LC_CTYPE, NULL); /* NOTE(review): used below without a NULL check */
     823                 : 
     824               0 :                 dot = strchr(localename, '.');
     825               0 :                 if (dot) {
     826               0 :                         dot++;
     827                 :                         /* locale specifies a codeset */
     828               0 :                         at = strchr(dot, '@');
     829               0 :                         if (at)
     830               0 :                                 len = at - dot; /* codeset stops at the modifier; the string itself continues */
     831                 :                         else
     832               0 :                                 len = strlen(dot);
     833               0 :                         charset_hint = dot;
     834                 :                 } else {
     835                 :                         /* no explicit name; see if the name itself
     836                 :                          * is the charset */
     837               0 :                         charset_hint = localename;
     838               0 :                         len = strlen(charset_hint);
     839                 :                 }
     840                 :         }
     841                 : #endif
     842                 : 
     843            4080 : det_charset:
     844                 : 
     845            4080 :         if (charset_hint) {
     846            4080 :                 int found = 0;
     847                 :                 
     848                 :                 /* now walk the charset map and look for the codeset
                         :                  * (same length and case-insensitive comparison) */
     849           57827 :                 for (i = 0; charset_map[i].codeset; i++) {
     850           57821 :                         if (len == strlen(charset_map[i].codeset) && strncasecmp(charset_hint, charset_map[i].codeset, len) == 0) {
     851            4074 :                                 charset = charset_map[i].charset;
     852            4074 :                                 found = 1;
     853            4074 :                                 break;
     854                 :                         }
     855                 :                 }
     856            4080 :                 if (!found) { /* charset keeps its cs_8859_1 initial value */
     857               6 :                         php_error_docref(NULL TSRMLS_CC, E_WARNING, "charset `%s' not supported, assuming iso-8859-1",
     858                 :                                         charset_hint);
     859                 :                 }
     860                 :         }
     861            4080 :         if (uf_result != NULL) { /* only set by the mb_internal_encoding call above */
     862               0 :                 zval_ptr_dtor(&uf_result);
     863                 :         }
     864            4080 :         return charset;
     865                 : }
     866                 : /* }}} */
     867                 : 
     /* {{{ php_utf32_utf8
      *
      * Encode the code point `k` as a UTF-8 byte sequence in `buf` and
      * NUL-terminate it.
      *
      * buf     destination; must have room for the encoded bytes plus the
      *         terminating NUL (at most 7 bytes are written).
      * k       code point to encode. Values above 0x1FFFFF are still emitted
      *         using the legacy 5- and 6-byte forms, as before.
      *
      * Returns the number of bytes written, not counting the NUL.
      *
      * A negative `k` is not a code point. It used to fall into the
      * `k < 0x80` branch and be truncated to a single raw byte (e.g. -1 became
      * 0xFF), which is never valid UTF-8. It now produces an empty string and
      * a return value of 0.
      */
     size_t php_utf32_utf8(unsigned char *buf, int k)
     {
         size_t retval = 0;

         if (k < 0) {
             /* nothing sensible to encode */
             buf[0] = '\0';
             return 0;
         }

         if (k < 0x80) {
             buf[0] = (unsigned char)k;
             retval = 1;
         } else if (k < 0x800) {
             buf[0] = (unsigned char)(0xc0 | (k >> 6));
             buf[1] = (unsigned char)(0x80 | (k & 0x3f));
             retval = 2;
         } else if (k < 0x10000) {
             buf[0] = (unsigned char)(0xe0 | (k >> 12));
             buf[1] = (unsigned char)(0x80 | ((k >> 6) & 0x3f));
             buf[2] = (unsigned char)(0x80 | (k & 0x3f));
             retval = 3;
         } else if (k < 0x200000) {
             buf[0] = (unsigned char)(0xf0 | (k >> 18));
             buf[1] = (unsigned char)(0x80 | ((k >> 12) & 0x3f));
             buf[2] = (unsigned char)(0x80 | ((k >> 6) & 0x3f));
             buf[3] = (unsigned char)(0x80 | (k & 0x3f));
             retval = 4;
         } else if (k < 0x4000000) {
             buf[0] = (unsigned char)(0xf8 | (k >> 24));
             buf[1] = (unsigned char)(0x80 | ((k >> 18) & 0x3f));
             buf[2] = (unsigned char)(0x80 | ((k >> 12) & 0x3f));
             buf[3] = (unsigned char)(0x80 | ((k >> 6) & 0x3f));
             buf[4] = (unsigned char)(0x80 | (k & 0x3f));
             retval = 5;
         } else {
             buf[0] = (unsigned char)(0xfc | (k >> 30));
             buf[1] = (unsigned char)(0x80 | ((k >> 24) & 0x3f));
             buf[2] = (unsigned char)(0x80 | ((k >> 18) & 0x3f));
             buf[3] = (unsigned char)(0x80 | ((k >> 12) & 0x3f));
             buf[4] = (unsigned char)(0x80 | ((k >> 6) & 0x3f));
             buf[5] = (unsigned char)(0x80 | (k & 0x3f));
             retval = 6;
         }
         buf[retval] = '\0';

         return retval;
     }
     911                 : /* }}} */
     912                 : 
     913                 : /* {{{ php_unescape_html_entities
     914                 :  */
     915                 : PHPAPI char *php_unescape_html_entities(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset TSRMLS_DC)
     916              22 : {
     917                 :         int retlen;
     918                 :         int j, k;
     919                 :         char *replaced, *ret, *p, *q, *lim, *next;
     920              22 :         enum entity_charset charset = determine_charset(hint_charset TSRMLS_CC);
     921                 :         unsigned char replacement[15];
     922                 :         int replacement_len;
     923                 : 
     924              22 :         ret = estrndup(old, oldlen);
     925              22 :         retlen = oldlen;
     926              22 :         if (!retlen) {
     927               2 :                 goto empty_source;
     928                 :         }
     929                 :         
     930              20 :         if (all) {
     931                 :                 /* look for a match in the maps for this charset */
     932             500 :                 for (j = 0; entity_map[j].charset != cs_terminator; j++) {
     933             480 :                         if (entity_map[j].charset != charset)
     934             352 :                                 continue;
     935                 : 
     936           10223 :                         for (k = entity_map[j].basechar; k <= entity_map[j].endchar; k++) {
     937                 :                                 unsigned char entity[32];
     938           10095 :                                 int entity_length = 0;
     939                 : 
     940           10095 :                                 if (entity_map[j].table[k - entity_map[j].basechar] == NULL)
     941            5733 :                                         continue;
     942                 : 
     943            4362 :                                 entity_length = slprintf(entity, sizeof(entity), "&%s;", entity_map[j].table[k - entity_map[j].basechar]);
     944            4362 :                                 if (entity_length >= sizeof(entity)) {
     945               0 :                                         continue;
     946                 :                                 }
     947                 : 
     948                 :                                 /* When we have MBCS entities in the tables above, this will need to handle it */
     949            4362 :                                 replacement_len = 0;
     950            4362 :                                 switch (charset) {
     951                 :                                         case cs_8859_1:
     952                 :                                         case cs_cp1252:
     953                 :                                         case cs_8859_15:
     954                 :                                         case cs_cp1251:
     955                 :                                         case cs_8859_5:
     956                 :                                         case cs_cp866:
     957                 :                                         case cs_koi8r:
     958             678 :                                                 replacement[0] = k;
     959             678 :                                                 replacement[1] = '\0';
     960             678 :                                                 replacement_len = 1;
     961             678 :                                                 break;
     962                 : 
     963                 :                                         case cs_big5:
     964                 :                                         case cs_gb2312:
     965                 :                                         case cs_big5hkscs:
     966                 :                                         case cs_sjis:
     967                 :                                         case cs_eucjp:
     968                 :                                                 /* we cannot properly handle those multibyte encodings
     969                 :                                                  * with php_str_to_str. skip it. */ 
     970               0 :                                                 continue;
     971                 : 
     972                 :                                         case cs_utf_8:
     973            3684 :                                                 replacement_len = php_utf32_utf8(replacement, k);
     974            3684 :                                                 break;
     975                 : 
     976                 :                                         default:
     977               0 :                                                 php_error_docref(NULL TSRMLS_CC, E_WARNING, "cannot yet handle MBCS!");
     978               0 :                                                 efree(ret);
     979               0 :                                                 return NULL;
     980                 :                                 }
     981                 : 
     982            4362 :                                 if (php_memnstr(ret, entity, entity_length, ret+retlen)) {
     983              20 :                                         replaced = php_str_to_str(ret, retlen, entity, entity_length, replacement, replacement_len, &retlen);
     984              20 :                                         efree(ret);
     985              20 :                                         ret = replaced;
     986                 :                                 }
     987                 :                         }
     988                 :                 }
     989                 :         }
     990                 : 
     991             120 :         for (j = 0; basic_entities[j].charcode != 0; j++) {
     992                 : 
     993             100 :                 if (basic_entities[j].flags && (quote_style & basic_entities[j].flags) == 0)
     994              16 :                         continue;
     995                 :                 
     996              84 :                 replacement[0] = (unsigned char)basic_entities[j].charcode;
     997              84 :                 replacement[1] = '\0';
     998                 : 
     999              84 :                 if (php_memnstr(ret, basic_entities[j].entity, basic_entities[j].entitylen, ret+retlen)) {              
    1000               6 :                         replaced = php_str_to_str(ret, retlen, basic_entities[j].entity, basic_entities[j].entitylen, replacement, 1, &retlen);
    1001               6 :                         efree(ret);
    1002               6 :                         ret = replaced;
    1003                 :                 }
    1004                 :         }
    1005                 : 
    1006                 :         /* replace numeric entities & "&amp;" */
    1007              20 :         lim = ret + retlen;
    1008             142 :         for (p = ret, q = ret; p < lim;) {
    1009                 :                 int code;
    1010                 : 
    1011             102 :                 if (p[0] == '&') {
    1012               8 :                         if (p + 2 < lim) {
    1013               8 :                                 if (p[1] == '#') {
    1014               0 :                                         int invalid_code = 0;
    1015                 : 
    1016               0 :                                         if (p[2] == 'x' || p[2] == 'X') {
    1017               0 :                                                 code = strtol(p + 3, &next, 16);
    1018                 :                                         } else {
    1019               0 :                                                 code = strtol(p + 2, &next, 10);
    1020                 :                                         }
    1021                 : 
    1022               0 :                                         if (next != NULL && *next == ';') {
    1023               0 :                                                 switch (charset) {
    1024                 :                                                         case cs_utf_8:
    1025               0 :                                                                 q += php_utf32_utf8(q, code);
    1026               0 :                                                                 break;
    1027                 : 
    1028                 :                                                         case cs_8859_1:
    1029                 :                                                         case cs_8859_5:
    1030                 :                                                         case cs_8859_15:
    1031               0 :                                                                 if ((code >= 0x80 && code < 0xa0) || code > 0xff) {
    1032               0 :                                                                         invalid_code = 1;
    1033                 :                                                                 } else {
    1034               0 :                                                                         if (code == 39 || !quote_style) {
    1035               0 :                                                                                 invalid_code = 1;
    1036                 :                                                                         } else {
    1037               0 :                                                                                 *(q++) = code;
    1038                 :                                                                         }
    1039                 :                                                                 }
    1040               0 :                                                                 break;
    1041                 : 
    1042                 :                                                         case cs_cp1252:
    1043                 :                                                         case cs_cp1251:
    1044                 :                                                         case cs_cp866:
    1045               0 :                                                                 if (code > 0xff) {
    1046               0 :                                                                         invalid_code = 1;
    1047                 :                                                                 } else {
    1048               0 :                                                                         *(q++) = code;
    1049                 :                                                                 }
    1050               0 :                                                                 break;
    1051                 : 
    1052                 :                                                         case cs_big5:
    1053                 :                                                         case cs_big5hkscs:
    1054                 :                                                         case cs_sjis:
    1055                 :                                                         case cs_eucjp:
    1056               0 :                                                                 if (code >= 0x80) {
    1057               0 :                                                                         invalid_code = 1;
    1058                 :                                                                 } else {
    1059               0 :                                                                         *(q++) = code;
    1060                 :                                                                 }
    1061               0 :                                                                 break;
    1062                 : 
    1063                 :                                                         case cs_gb2312:
    1064               0 :                                                                 if (code >= 0x81) {
    1065               0 :                                                                         invalid_code = 1;
    1066                 :                                                                 } else {
    1067               0 :                                                                         *(q++) = code;
    1068                 :                                                                 }
    1069               0 :                                                                 break;
    1070                 : 
    1071                 :                                                         default:
    1072                 :                                                                 /* for backwards compatilibity */
    1073               0 :                                                                 invalid_code = 1;
    1074                 :                                                                 break;
    1075                 :                                                 }
    1076               0 :                                                 if (invalid_code) {
    1077               0 :                                                         for (; p <= next; p++) {
    1078               0 :                                                                 *(q++) = *p;
    1079                 :                                                         }
    1080                 :                                                 }
    1081               0 :                                                 p = next + 1;
    1082                 :                                         } else {
    1083               0 :                                                 *(q++) = *(p++);        
    1084               0 :                                                 *(q++) = *(p++);        
    1085                 :                                         }
    1086              16 :                                 } else if (p + 4 < lim &&
    1087                 :                                                         p[1] == 'a' && p[2] == 'm' &&p[3] == 'p' &&
    1088                 :                                                         p[4] == ';') {
    1089               8 :                                         *(q++) = '&';
    1090               8 :                                         p += 5;
    1091                 :                                 } else {
    1092               0 :                                         *(q++) = *(p++);
    1093               0 :                                         *(q++) = *(p++);
    1094                 :                                 }
    1095                 :                         } else {
    1096               0 :                                 *(q++) = *(p++);        
    1097                 :                         }
    1098                 :                 } else {
    1099              94 :                         *(q++) = *(p++);        
    1100                 :                 }
    1101                 :         }
    1102              20 :         *q = '\0';
    1103              20 :         retlen = (size_t)(q - ret);
    1104              22 : empty_source:   
    1105              22 :         *newlen = retlen;
    1106              22 :         return ret;
    1107                 : }
    1108                 : /* }}} */
    1109                 : 
    1110                 : PHPAPI char *php_escape_html_entities(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset TSRMLS_DC)
    1111            1215 : {
    1112            1215 :         return php_escape_html_entities_ex(old, oldlen, newlen, all, quote_style, hint_charset, 1 TSRMLS_CC);
    1113                 : }
    1114                 : 
    1115                 : 
    1116                 : /* {{{ php_escape_html_entities
    1117                 :  */
    1118                 : PHPAPI char *php_escape_html_entities_ex(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset, zend_bool double_encode TSRMLS_DC)
    1119            5866 : {
    1120                 :         int i, j, maxlen, len;
    1121                 :         char *replaced;
    1122            5866 :         enum entity_charset charset = determine_charset(hint_charset TSRMLS_CC);
    1123                 :         int matches_map;
    1124                 : 
    1125            5866 :         maxlen = 2 * oldlen;
    1126            5866 :         if (maxlen < 128)
    1127            5818 :                 maxlen = 128;
    1128            5866 :         replaced = emalloc (maxlen);
    1129            5866 :         len = 0;
    1130            5866 :         i = 0;
    1131           39179 :         while (i < oldlen) {
    1132                 :                 unsigned char mbsequence[16];   /* allow up to 15 characters in a multibyte sequence */
    1133           29886 :                 int mbseqlen = sizeof(mbsequence);
    1134           29886 :                 int status = SUCCESS;
    1135           29886 :                 unsigned int this_char = get_next_char(charset, old, oldlen, &i, mbsequence, &mbseqlen, &status);
    1136                 : 
    1137           29886 :                 if(status == FAILURE) {
    1138                 :                         /* invalid MB sequence */
    1139            2439 :                         efree(replaced);
    1140            2439 :                         if(!PG(display_errors)) {
    1141               0 :                                 php_error_docref(NULL TSRMLS_CC, E_WARNING, "Invalid multibyte sequence in argument");
    1142                 :                         }
    1143            2439 :                         *newlen = 0;
    1144            2439 :                         return STR_EMPTY_ALLOC();
    1145                 :                 }
    1146           27447 :                 matches_map = 0;
    1147                 : 
    1148           27447 :                 if (len + 16 > maxlen)
    1149               2 :                         replaced = erealloc (replaced, maxlen += 128);
    1150                 : 
    1151           27447 :                 if (all) {
    1152                 :                         /* look for a match in the maps for this charset */
    1153            2445 :                         unsigned char *rep = NULL;
    1154                 : 
    1155                 : 
    1156           58594 :                         for (j = 0; entity_map[j].charset != cs_terminator; j++) {
    1157           56290 :                                 if (entity_map[j].charset == charset
    1158                 :                                                 && this_char >= entity_map[j].basechar
    1159                 :                                                 && this_char <= entity_map[j].endchar) {
    1160             141 :                                         rep = (unsigned char*)entity_map[j].table[this_char - entity_map[j].basechar];
    1161             141 :                                         if (rep == NULL) {
    1162                 :                                                 /* there is no entity for this position; fall through and
    1163                 :                                                  * just output the character itself */
    1164               1 :                                                 break;
    1165                 :                                         }
    1166                 : 
    1167             140 :                                         matches_map = 1;
    1168             140 :                                         break;
    1169                 :                                 }
    1170                 :                         }
    1171                 : 
    1172            2445 :                         if (matches_map) {
    1173             140 :                                 int l = strlen(rep);
    1174                 :                                 /* increase the buffer size */
    1175             140 :                                 if (len + 2 + l >= maxlen) {
    1176               0 :                                         replaced = erealloc(replaced, maxlen += 128);
    1177                 :                                 }
    1178                 : 
    1179             140 :                                 replaced[len++] = '&';
    1180             140 :                                 strlcpy(replaced + len, rep, maxlen);
    1181             140 :                                 len += l;
    1182             140 :                                 replaced[len++] = ';';
    1183                 :                         }
    1184                 :                 }
    1185           27447 :                 if (!matches_map) {     
    1186           27307 :                         int is_basic = 0;
    1187                 : 
    1188           27307 :                         if (this_char == '&') {
    1189             168 :                                 if (double_encode) {
    1190             138 : encode_amp:
    1191             138 :                                         memcpy(replaced + len, "&amp;", sizeof("&amp;") - 1);
    1192             138 :                                         len += sizeof("&amp;") - 1;
    1193                 :                                 } else {
    1194              50 :                                         char *e = memchr(old + i, ';', oldlen - i);
    1195              50 :                                         char *s = old + i;
    1196                 : 
    1197              50 :                                         if (!e || (e - s) > 10) { /* minor optimization to avoid "entities" over 10 chars in length */
    1198                 :                                                 goto encode_amp;
    1199                 :                                         } else {
    1200              40 :                                                 if (*s == '#') { /* numeric entities */
    1201              12 :                                                         s++;
    1202                 :                                                         /* Hex (&#x5A;) */
    1203              16 :                                                         if (*s == 'x' || *s == 'X') {
    1204               6 :                                                                 s++;
    1205              20 :                                                                 while (s < e) {
    1206              10 :                                                                         if (!isxdigit((int)*(unsigned char *)s++)) {
    1207               2 :                                                                                 goto encode_amp;
    1208                 :                                                                         }
    1209                 :                                                                 }
    1210                 :                                                         /* Dec (&#90;)*/
    1211                 :                                                         } else {
    1212              22 :                                                                 while (s < e) {
    1213              12 :                                                                         if (!isdigit((int)*(unsigned char *)s++)) {
    1214               2 :                                                                                 goto encode_amp;
    1215                 :                                                                         }
    1216                 :                                                                 }
    1217                 :                                                         }
    1218                 :                                                 } else { /* text entities */
    1219             124 :                                                         while (s < e) {
    1220              74 :                                                                 if (!isalnum((int)*(unsigned char *)s++)) {
    1221               6 :                                                                         goto encode_amp;
    1222                 :                                                                 }
    1223                 :                                                         }
    1224                 :                                                 }
    1225              30 :                                                 replaced[len++] = '&';
    1226                 :                                         }
    1227                 :                                 }
    1228             168 :                                 is_basic = 1;
    1229                 :                         } else {
    1230          162057 :                                 for (j = 0; basic_entities[j].charcode != 0; j++) {
    1231          135213 :                                         if ((basic_entities[j].charcode != this_char) ||
    1232                 :                                                         (basic_entities[j].flags &&
    1233                 :                                                         (quote_style & basic_entities[j].flags) == 0)) {
    1234                 :                                                 continue;
    1235                 :                                         }
    1236                 : 
    1237             295 :                                         memcpy(replaced + len, basic_entities[j].entity, basic_entities[j].entitylen);
    1238             295 :                                         len += basic_entities[j].entitylen;
    1239                 :                 
    1240             295 :                                         is_basic = 1;
    1241             295 :                                         break;
    1242                 :                                 }
    1243                 :                         }
    1244                 : 
    1245           27307 :                         if (!is_basic) {
    1246                 :                                 /* a wide char without a named entity; pass through the original sequence */
    1247           26844 :                                 if (mbseqlen > 1) {
    1248            1601 :                                         memcpy(replaced + len, mbsequence, mbseqlen);
    1249            1601 :                                         len += mbseqlen;
    1250                 :                                 } else {
    1251           25243 :                                         replaced[len++] = (unsigned char)this_char;
    1252                 :                                 }
    1253                 :                         }
    1254                 :                 }
    1255                 :         }
    1256            3427 :         replaced[len] = '\0';
    1257            3427 :         *newlen = len;
    1258                 : 
    1259            3427 :         return replaced;
    1260                 : 
    1261                 : 
    1262                 : }
    1263                 : /* }}} */
    1264                 : 
    1265                 : /* {{{ php_html_entities
    1266                 :  */
    1267                 : static void php_html_entities(INTERNAL_FUNCTION_PARAMETERS, int all)
    1268            4653 : {
    1269            4653 :         char *str, *hint_charset = NULL;
    1270            4653 :         int str_len, hint_charset_len = 0;
    1271                 :         int len;
    1272            4653 :         long quote_style = ENT_COMPAT;
    1273                 :         char *replaced;
    1274            4653 :         zend_bool double_encode = 1;
    1275                 : 
    1276            4653 :         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|ls!b", &str, &str_len, &quote_style, &hint_charset, &hint_charset_len, &double_encode) == FAILURE) {
    1277               2 :                 return;
    1278                 :         }
    1279                 : 
    1280            4651 :         replaced = php_escape_html_entities_ex(str, str_len, &len, all, quote_style, hint_charset, double_encode TSRMLS_CC);
    1281            4651 :         RETVAL_STRINGL(replaced, len, 0);
    1282                 : }
    1283                 : /* }}} */
    1284                 : 
    1285                 : #define HTML_SPECIALCHARS       0
    1286                 : #define HTML_ENTITIES           1
    1287                 : 
    1288                 : /* {{{ register_html_constants
    1289                 :  */
    1290                 : void register_html_constants(INIT_FUNC_ARGS)
    1291           13565 : {
    1292           13565 :         REGISTER_LONG_CONSTANT("HTML_SPECIALCHARS", HTML_SPECIALCHARS, CONST_PERSISTENT|CONST_CS);
    1293           13565 :         REGISTER_LONG_CONSTANT("HTML_ENTITIES", HTML_ENTITIES, CONST_PERSISTENT|CONST_CS);
    1294           13565 :         REGISTER_LONG_CONSTANT("ENT_COMPAT", ENT_COMPAT, CONST_PERSISTENT|CONST_CS);
    1295           13565 :         REGISTER_LONG_CONSTANT("ENT_QUOTES", ENT_QUOTES, CONST_PERSISTENT|CONST_CS);
    1296           13565 :         REGISTER_LONG_CONSTANT("ENT_NOQUOTES", ENT_NOQUOTES, CONST_PERSISTENT|CONST_CS);
    1297           13565 : }
    1298                 : /* }}} */
    1299                 : 
    1300                 : /* {{{ proto string htmlspecialchars(string string [, int quote_style[, string charset[, bool double_encode]]])
    1301                 :    Convert special characters to HTML entities */
    1302                 : PHP_FUNCTION(htmlspecialchars)
    1303            4278 : {
    1304            4278 :         php_html_entities(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
    1305            4278 : }
    1306                 : /* }}} */
    1307                 : 
    1308                 : /* {{{ proto string htmlspecialchars_decode(string string [, int quote_style])
    1309                 :    Convert special HTML entities back to characters */
    1310                 : PHP_FUNCTION(htmlspecialchars_decode)
    1311             113 : {
    1312                 :         char *str, *new_str, *e, *p;
    1313                 :         int len, j, i, new_len;
    1314             113 :         long quote_style = ENT_COMPAT;
    1315                 :         struct basic_entities_dec basic_entities_dec[8];
    1316                 : 
    1317             113 :         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|l", &str, &len, &quote_style) == FAILURE) {
    1318              19 :                 return;
    1319                 :         }
    1320                 : 
    1321              94 :         new_str = estrndup(str, len);
    1322              94 :         new_len = len;
    1323              94 :         e = new_str + new_len;
    1324                 : 
    1325              94 :         if (!(p = memchr(new_str, '&', new_len))) {
    1326              25 :                 RETURN_STRINGL(new_str, new_len, 0);
    1327                 :         }
    1328                 : 
    1329             414 :         for (j = 0, i = 0; basic_entities[i].charcode != 0; i++) {
    1330             345 :                 if (basic_entities[i].flags && !(quote_style & basic_entities[i].flags)) {
    1331             133 :                         continue;
    1332                 :                 }
    1333             212 :                 basic_entities_dec[j].charcode = basic_entities[i].charcode;
    1334             212 :                 memcpy(basic_entities_dec[j].entity, basic_entities[i].entity, basic_entities[i].entitylen + 1);
    1335             212 :                 basic_entities_dec[j].entitylen = basic_entities[i].entitylen;
    1336             212 :                 j++;
    1337                 :         }
    1338              69 :         basic_entities_dec[j].charcode = '&';
    1339              69 :         basic_entities_dec[j].entitylen = sizeof("&amp;") - 1;
    1340              69 :         memcpy(basic_entities_dec[j].entity, "&amp;", sizeof("&amp;"));
    1341              69 :         i = j + 1;
    1342                 :         
    1343                 :         do {
    1344             343 :                 int l = e - p;
    1345                 :         
    1346            1188 :                 for (j = 0; j < i; j++) {
    1347            1014 :                         if (basic_entities_dec[j].entitylen > l) {
    1348               0 :                                 continue;
    1349                 :                         }
    1350            1014 :                         if (!memcmp(p, basic_entities_dec[j].entity, basic_entities_dec[j].entitylen)) {
    1351             169 :                                 int e_len = basic_entities_dec[j].entitylen - 1;
    1352                 :                 
    1353             169 :                                 *p++ = basic_entities_dec[j].charcode;
    1354             169 :                                 memmove(p, p + e_len, (e - p - e_len));
    1355             169 :                                 e -= e_len;
    1356             169 :                                 goto done;
    1357                 :                         }
    1358                 :                 }
    1359             174 :                 p++;
    1360                 : 
    1361             343 : done:
    1362             343 :                 if (p >= e) {
    1363              20 :                         break;
    1364                 :                 }
    1365             323 :         } while ((p = memchr(p, '&', (e - p))));
    1366                 : 
    1367              69 :         new_len = e - new_str;
    1368                 : 
    1369              69 :         new_str[new_len] = '\0';
    1370              69 :         RETURN_STRINGL(new_str, new_len, 0);
    1371                 : }
    1372                 : /* }}} */
    1373                 : 
    1374                 : /* {{{ proto string html_entity_decode(string string [, int quote_style][, string charset])
    1375                 :    Convert all HTML entities to their applicable characters */
    1376                 : PHP_FUNCTION(html_entity_decode)
    1377              22 : {
    1378              22 :         char *str, *hint_charset = NULL;
    1379                 :         int str_len, hint_charset_len, len;
    1380              22 :         long quote_style = ENT_COMPAT;
    1381                 :         char *replaced;
    1382                 : 
    1383              22 :         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|ls", &str, &str_len,
    1384                 :                                                           &quote_style, &hint_charset, &hint_charset_len) == FAILURE) {
    1385               0 :                 return;
    1386                 :         }
    1387                 : 
    1388              22 :         replaced = php_unescape_html_entities(str, str_len, &len, 1, quote_style, hint_charset TSRMLS_CC);
    1389              22 :         if (replaced) {
    1390              22 :                 RETURN_STRINGL(replaced, len, 0);
    1391                 :         }
    1392               0 :         RETURN_FALSE;
    1393                 : }
    1394                 : /* }}} */
    1395                 : 
    1396                 : 
    1397                 : /* {{{ proto string htmlentities(string string [, int quote_style[, string charset[, bool double_encode]]])
    1398                 :    Convert all applicable characters to HTML entities */
    1399                 : PHP_FUNCTION(htmlentities)
    1400             375 : {
    1401             375 :         php_html_entities(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
    1402             375 : }
    1403                 : /* }}} */
    1404                 : 
    1405                 : /* {{{ proto array get_html_translation_table([int table [, int quote_style]])
    1406                 :    Returns the internal translation table used by htmlspecialchars and htmlentities */
    1407                 : PHP_FUNCTION(get_html_translation_table)
    1408              68 : {
    1409              68 :         long which = HTML_SPECIALCHARS, quote_style = ENT_COMPAT;
    1410                 :         int i, j;
    1411                 :         char ind[2];
    1412              68 :         enum entity_charset charset = determine_charset(NULL TSRMLS_CC);
    1413                 : 
    1414              68 :         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "|ll", &which, &quote_style) == FAILURE) {
    1415              34 :                 return;
    1416                 :         }
    1417                 : 
    1418              34 :         array_init(return_value);
    1419                 : 
    1420              34 :         ind[1] = 0;
    1421                 : 
    1422              34 :         switch (which) {
    1423                 :                 case HTML_ENTITIES:
    1424             200 :                         for (j=0; entity_map[j].charset != cs_terminator; j++) {
    1425             192 :                                 if (entity_map[j].charset != charset)
    1426             184 :                                         continue;
    1427             776 :                                 for (i = 0; i <= entity_map[j].endchar - entity_map[j].basechar; i++) {
    1428                 :                                         char buffer[16];
    1429                 : 
    1430             768 :                                         if (entity_map[j].table[i] == NULL)
    1431               0 :                                                 continue;
    1432                 :                                         /* what about wide chars here ?? */
    1433             768 :                                         ind[0] = i + entity_map[j].basechar;
    1434             768 :                                         snprintf(buffer, sizeof(buffer), "&%s;", entity_map[j].table[i]);
    1435             768 :                                         add_assoc_string(return_value, ind, buffer, 1);
    1436                 : 
    1437                 :                                 }
    1438                 :                         }
    1439                 :                         /* break thru */
    1440                 : 
    1441                 :                 case HTML_SPECIALCHARS:
    1442             204 :                         for (j = 0; basic_entities[j].charcode != 0; j++) {
    1443                 : 
    1444             170 :                                 if (basic_entities[j].flags && (quote_style & basic_entities[j].flags) == 0)
    1445              70 :                                         continue;
    1446                 :                                 
    1447             100 :                                 ind[0] = (unsigned char)basic_entities[j].charcode;
    1448             100 :                                 add_assoc_stringl(return_value, ind, basic_entities[j].entity, basic_entities[j].entitylen, 1);
    1449                 :                         }
    1450              34 :                         add_assoc_stringl(return_value, "&", "&amp;", sizeof("&amp;") - 1, 1);
    1451                 : 
    1452                 :                         break;
    1453                 :         }
    1454                 : }
    1455                 : /* }}} */
    1456                 : 
    1457                 : /*
    1458                 :  * Local variables:
    1459                 :  * tab-width: 4
    1460                 :  * c-basic-offset: 4
    1461                 :  * End:
    1462                 :  * vim600: sw=4 ts=4 fdm=marker
    1463                 :  * vim<600: sw=4 ts=4
    1464                 :  */

Generated by: LTP GCOV extension version 1.5

Generated at Thu, 19 Nov 2009 08:20:24 +0000 (5 days ago)

Copyright © 2005-2009 The PHP Group
All rights reserved.