PHP  
 PHP: Test and Code Coverage Analysis
downloads | QA | documentation | faq | getting help | mailing lists | reporting bugs | php.net sites | links | my php.net 
 

LTP GCOV extension - code coverage report
Current view: directory - standard - html.c
Test: PHP Code Coverage
Date: 2009-11-21 Instrumented lines: 474
Code covered: 80.8 % Executed lines: 383
Legend: not executed executed

       1                 : /*
       2                 :    +----------------------------------------------------------------------+
       3                 :    | PHP Version 5                                                        |
       4                 :    +----------------------------------------------------------------------+
       5                 :    | Copyright (c) 1997-2009 The PHP Group                                |
       6                 :    +----------------------------------------------------------------------+
       7                 :    | This source file is subject to version 3.01 of the PHP license,      |
       8                 :    | that is bundled with this package in the file LICENSE, and is        |
       9                 :    | available through the world-wide-web at the following url:           |
      10                 :    | http://www.php.net/license/3_01.txt                                  |
      11                 :    | If you did not receive a copy of the PHP license and are unable to   |
      12                 :    | obtain it through the world-wide-web, please send a note to          |
      13                 :    | license@php.net so we can mail you a copy immediately.               |
      14                 :    +----------------------------------------------------------------------+
      15                 :    | Authors: Rasmus Lerdorf <rasmus@php.net>                             |
      16                 :    |          Jaakko Hyvätti <jaakko.hyvatti@iki.fi>                      |
      17                 :    |          Wez Furlong <wez@thebrainroom.com>                          |
      18                 :    +----------------------------------------------------------------------+
      19                 : */
      20                 : 
      21                 : /* $Id: html.c 289605 2009-10-13 05:18:37Z moriyoshi $ */
      22                 : 
      23                 : /*
      24                 :  * HTML entity resources:
      25                 :  *
      26                 :  * http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset2.asp
      27                 :  * http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset3.asp
      28                 :  * http://www.unicode.org/Public/MAPPINGS/OBSOLETE/UNI2SGML.TXT
      29                 :  *
      30                 :  * http://www.w3.org/TR/2002/REC-xhtml1-20020801/dtds.html#h-A2
      31                 :  * 
      32                 :  */
      33                 : 
      34                 : #include "php.h"
      35                 : #if PHP_WIN32
      36                 : #include "config.w32.h"
      37                 : #else
      38                 : #include <php_config.h>
      39                 : #endif
      40                 : #include "html.h"
      41                 : #include "php_string.h"
      42                 : #include "SAPI.h"
      43                 : #if HAVE_LOCALE_H
      44                 : #include <locale.h>
      45                 : #endif
      46                 : #if HAVE_LANGINFO_H
      47                 : #include <langinfo.h>
      48                 : #endif
      49                 : 
      50                 : #if HAVE_MBSTRING
      51                 : # include "ext/mbstring/mbstring.h"
      52                 : ZEND_EXTERN_MODULE_GLOBALS(mbstring)
      53                 : #endif
      54                 : 
      55                 : enum entity_charset { cs_terminator, cs_8859_1, cs_cp1252, /* cs_terminator is listed first, so it has value 0; it closes entity_map[] */
      56                 :                                           cs_8859_15, cs_utf_8, cs_big5, cs_gb2312, 
      57                 :                                           cs_big5hkscs, cs_sjis, cs_eucjp, cs_koi8r,
      58                 :                                           cs_cp1251, cs_8859_5, cs_cp866, cs_macroman
      59                 :                                         };
      60                 : typedef const char *const entity_table_t; /* one table slot: a const pointer to a const entity-name string */
      61                 : 
      62                 : /* codepage 1252 is a Windows extension to iso-8859-1. */
      63                 : static entity_table_t ent_cp_1252[] = { /* 32 slots for bytes 0x80..0x9f (range given in entity_map) */
      64                 :         "euro", NULL, "sbquo", "fnof", "bdquo", "hellip", "dagger",
      65                 :         "Dagger", "circ", "permil", "Scaron", "lsaquo", "OElig",
      66                 :         NULL, NULL, NULL, NULL, "lsquo", "rsquo", "ldquo", "rdquo", /* NULL slots presumably mean "no entity name"; the lookup code is not shown here */
      67                 :         "bull", "ndash", "mdash", "tilde", "trade", "scaron", "rsaquo",
      68                 :         "oelig", NULL, NULL, "Yuml" 
      69                 : };
      70                 : 
      71                 : static entity_table_t ent_iso_8859_1[] = { /* 96 names for codes 0xa0..0xff; entity_map reuses this table for several charsets */
      72                 :         "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar",
      73                 :         "sect", "uml", "copy", "ordf", "laquo", "not", "shy", "reg",
      74                 :         "macr", "deg", "plusmn", "sup2", "sup3", "acute", "micro",
      75                 :         "para", "middot", "cedil", "sup1", "ordm", "raquo", "frac14",
      76                 :         "frac12", "frac34", "iquest", "Agrave", "Aacute", "Acirc",
      77                 :         "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
      78                 :         "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
      79                 :         "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
      80                 :         "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
      81                 :         "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
      82                 :         "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
      83                 :         "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
      84                 :         "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
      85                 :         "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
      86                 :         "uuml", "yacute", "thorn", "yuml"
      87                 : };
      88                 : 
      89                 : static entity_table_t ent_iso_8859_15[] = { /* codes 0xa0..0xff (range given in entity_map); differs from the 8859-1 table in eight slots */
      90                 :         "nbsp", "iexcl", "cent", "pound", "euro", "yen", "Scaron",
      91                 :         "sect", "scaron", "copy", "ordf", "laquo", "not", "shy", "reg",
      92                 :         "macr", "deg", "plusmn", "sup2", "sup3", NULL, /* Zcaron */
      93                 :         "micro", "para", "middot", NULL, /* zcaron */ "sup1", "ordm",
      94                 :         "raquo", "OElig", "oelig", "Yuml", "iquest", "Agrave", "Aacute",
      95                 :         "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
      96                 :         "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
      97                 :         "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
      98                 :         "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
      99                 :         "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
     100                 :         "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
     101                 :         "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
     102                 :         "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
     103                 :         "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
     104                 :         "uuml", "yacute", "thorn", "yuml"
     105                 : };
     106                 : 
     107                 : static entity_table_t ent_uni_338_402[] = { /* one slot per code point 338..402 (65 slots); mostly NULL gaps */
     108                 :         /* 338 (0x0152) */
     109                 :         "OElig", "oelig", NULL, NULL, NULL, NULL,
     110                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     111                 :         /* 352 (0x0160) */
     112                 :         "Scaron", "scaron", NULL, NULL, NULL, NULL, NULL, NULL,
     113                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     114                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     115                 :         /* 376 (0x0178) */
     116                 :         "Yuml", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     117                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     118                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     119                 :         /* 400 (0x0190) */
     120                 :         NULL, NULL, "fnof" /* slot for code point 402 */
     121                 : };
     122                 : 
     123                 : static entity_table_t ent_uni_spacing[] = { /* one slot per code point 710..732 (23 slots) */
     124                 :         /* 710 */
     125                 :         "circ",
     126                 :         /* 711 - 730 */
     127                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     128                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     129                 :         /* 731 - 732 */
     130                 :         NULL, "tilde"
     131                 : };
     132                 : 
     133                 : static entity_table_t ent_uni_greek[] = { /* one slot per code point 913..982 (70 slots) */
     134                 :         /* 913 */
     135                 :         "Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta",
     136                 :         "Iota", "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi", "Rho",
     137                 :         NULL, "Sigma", "Tau", "Upsilon", "Phi", "Chi", "Psi", "Omega", /* leading NULL is code point 930, which has no name here */
     138                 :         /* 938 - 944 are not mapped */
     139                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     140                 :         "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta", /* lowercase run starts at 945 */
     141                 :         "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho",
     142                 :         "sigmaf", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega",
     143                 :         /* 970 - 976 are not mapped */
     144                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     145                 :         "thetasym", "upsih", /* 977, 978 */
     146                 :         NULL, NULL, NULL,
     147                 :         "piv" 
     148                 : };
     149                 : 
     150                 : static entity_table_t ent_uni_punct[] = { /* one slot per code point 8194..8260 (67 slots) */
     151                 :         /* 8194 */
     152                 :         "ensp", "emsp", NULL, NULL, NULL, NULL, NULL,
     153                 :         "thinsp", NULL, NULL, "zwnj", "zwj", "lrm", "rlm",
     154                 :         NULL, NULL, NULL, "ndash", "mdash", NULL, NULL, NULL,
     155                 :         /* 8216 */
     156                 :         "lsquo", "rsquo", "sbquo", NULL, "ldquo", "rdquo", "bdquo", NULL,
     157                 :         "dagger", "Dagger", "bull", NULL, NULL, NULL, "hellip",
     158                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "permil", NULL,
     159                 :         /* 8242 */
     160                 :         "prime", "Prime", NULL, NULL, NULL, NULL, NULL, "lsaquo", "rsaquo", NULL,
     161                 :         NULL, NULL, "oline", NULL, NULL, NULL, NULL, NULL,
     162                 :         "frasl" /* slot for code point 8260 */
     163                 : };
     164                 : 
     165                 : static entity_table_t ent_uni_euro[] = { /* single slot: code point 8364 (basechar == endchar in entity_map) */
     166                 :         "euro"
     167                 : };
     168                 : 
     169                 : static entity_table_t ent_uni_8465_8501[] = { /* one slot per code point 8465..8501 (37 slots) */
     170                 :         /* 8465 */
     171                 :         "image", NULL, NULL, NULL, NULL, NULL, NULL,
     172                 :         /* 8472 */
     173                 :         "weierp", NULL, NULL, NULL,
     174                 :         /* 8476 */
     175                 :         "real", NULL, NULL, NULL, NULL, NULL,
     176                 :         /* 8482 */
     177                 :         "trade", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     178                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     179                 :         /* 8501 */
     180                 :         "alefsym",
     181                 : };
     182                 : 
     183                 : static entity_table_t ent_uni_8592_9002[] = { /* one slot per code point 8592..9002 (411 slots), laid out 8 per line, 16 per labelled group */
     184                 :         /* 8592 (0x2190) */
     185                 :         "larr", "uarr", "rarr", "darr", "harr", NULL, NULL, NULL,
     186                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     187                 :         /* 8608 (0x21a0) */
     188                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     189                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     190                 :         /* 8624 (0x21b0) */
     191                 :         NULL, NULL, NULL, NULL, NULL, "crarr", NULL, NULL,
     192                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     193                 :         /* 8640 (0x21c0) */
     194                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     195                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     196                 :         /* 8656 (0x21d0) */
     197                 :         "lArr", "uArr", "rArr", "dArr", "hArr", "vArr", NULL, NULL,
     198                 :         NULL, NULL, "lAarr", "rAarr", NULL, "rarrw", NULL, NULL,
     199                 :         /* 8672 (0x21e0) */
     200                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* this group spans 32 slots (8672..8703), not 16 */
     201                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     202                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     203                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     204                 :         /* 8704 (0x2200) */
     205                 :         "forall", "comp", "part", "exist", "nexist", "empty", NULL, "nabla",
     206                 :         "isin", "notin", "epsis", "ni", "notni", "bepsi", NULL, "prod",
     207                 :         /* 8720 (0x2210) */
     208                 :         "coprod", "sum", "minus", "mnplus", "plusdo", NULL, "setmn", "lowast",
     209                 :         "compfn", NULL, "radic", NULL, NULL, "prop", "infin", "ang90",
     210                 :         /* 8736 (0x2220) */
     211                 :         "ang", "angmsd", "angsph", "mid", "nmid", "par", "npar", "and",
     212                 :         "or", "cap", "cup", "int", NULL, NULL, "conint", NULL,
     213                 :         /* 8752 (0x2230) */
     214                 :         NULL, NULL, NULL, NULL, "there4", "becaus", NULL, NULL,
     215                 :         NULL, NULL, NULL, NULL, "sim", "bsim", NULL, NULL,
     216                 :         /* 8768 (0x2240) */
     217                 :         "wreath", "nsim", NULL, "sime", "nsime", "cong", NULL, "ncong",
     218                 :         "asymp", "nap", "ape", NULL, "bcong", "asymp", "bump", "bumpe", /* NOTE(review): "asymp" is listed twice on this line (slots 8776 and 8781) — confirm which is intended */
     219                 :         /* 8784 (0x2250) */
     220                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     221                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     222                 :         /* 8800 (0x2260) */
     223                 :         "ne", "equiv", NULL, NULL, "le", "ge", "lE", "gE",
     224                 :         "lnE", "gnE", "Lt", "Gt", "twixt", NULL, "nlt", "ngt",
     225                 :         /* 8816 (0x2270) */
     226                 :         "nles", "nges", "lsim", "gsim", NULL, NULL, "lg", "gl",
     227                 :         NULL, NULL, "pr", "sc", "cupre", "sscue", "prsim", "scsim",
     228                 :         /* 8832 (0x2280) */
     229                 :         "npr", "nsc", "sub", "sup", "nsub", "nsup", "sube", "supe",
     230                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     231                 :         /* 8848 (0x2290) */
     232                 :         NULL, NULL, NULL, NULL, NULL, "oplus", NULL, "otimes",
     233                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     234                 :         /* 8864 (0x22a0) */
     235                 :         NULL, NULL, NULL, NULL, NULL, "perp", NULL, NULL,
     236                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     237                 :         /* 8880 (0x22b0) */
     238                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     239                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     240                 :         /* 8896 (0x22c0) */
     241                 :         NULL, NULL, NULL, NULL, NULL, "sdot", NULL, NULL,
     242                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     243                 :         /* 8912 (0x22d0) */
     244                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     245                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     246                 :         /* 8928 (0x22e0) */
     247                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     248                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     249                 :         /* 8944 (0x22f0) */
     250                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     251                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     252                 :         /* 8960 (0x2300) */
     253                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     254                 :         "lceil", "rceil", "lfloor", "rfloor", NULL, NULL, NULL, NULL,
     255                 :         /* 8976 (0x2310) */
     256                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     257                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     258                 :         /* 8992 (0x2320) */
     259                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     260                 :         NULL, "lang", "rang" /* slots 9000..9002; the table stops at entity_map's endchar */
     261                 : };
     262                 : 
     263                 : static entity_table_t ent_uni_9674[] = { /* single slot (basechar == endchar in entity_map) */
     264                 :         /* 9674 */
     265                 :         "loz"
     266                 : };
     267                 : 
     268                 : static entity_table_t ent_uni_9824_9830[] = { /* one slot per code point 9824..9830 (7 slots) */
     269                 :         /* 9824 */
     270                 :         "spades", NULL, NULL, "clubs", NULL, "hearts", "diams" /* 9824, 9827, 9829, 9830 carry names */
     271                 : };
     272                 : 
     273                 : static entity_table_t ent_koi8r[] = { /* bytes 0xa3..0xff (93 slots, range given in entity_map); values are "#" + decimal code */
     274                 :         "#1105", /* "jo" (byte 0xa3) */
     275                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
     276                 :         NULL, NULL, NULL, NULL, NULL, "#1025", /* "JO" */
     277                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
     278                 :         "#1102", "#1072", "#1073", "#1094", "#1076", "#1077", "#1092", 
     279                 :         "#1075", "#1093", "#1080", "#1081", "#1082", "#1083", "#1084", 
     280                 :         "#1085", "#1086", "#1087", "#1103", "#1088", "#1089", "#1090", 
     281                 :         "#1091", "#1078", "#1074", "#1100", "#1099", "#1079", "#1096", 
     282                 :         "#1101", "#1097", "#1095", "#1098", "#1070", "#1040", "#1041", 
     283                 :         "#1062", "#1044", "#1045", "#1060", "#1043", "#1061", "#1048", 
     284                 :         "#1049", "#1050", "#1051", "#1052", "#1053", "#1054", "#1055", 
     285                 :         "#1071", "#1056", "#1057", "#1058", "#1059", "#1046", "#1042",
     286                 :         "#1068", "#1067", "#1047", "#1064", "#1069", "#1065", "#1063", 
     287                 :         "#1066"
     288                 : };
     289                 : 
     290                 : static entity_table_t ent_cp_1251[] = { /* Windows-1251 bytes 0x80..0xff (range given in entity_map), one slot per byte, 7 per line */
     291                 :         "#1026", "#1027", "#8218", "#1107", "#8222", "hellip", "dagger",
     292                 :         "Dagger", "euro", "permil", "#1033", "#8249", "#1034", "#1036",
     293                 :         "#1035", "#1039", "#1106", "#8216", "#8217", "#8220", "#8221", /* 0x8e..0x94; 0x93/0x94 are the curly double quotes U+201C/U+201D */
     294                 :         "bull", "ndash", "mdash", NULL, "trade", "#1113", "#8250", /* NULL is byte 0x98, which has no mapping here */
     295                 :         "#1114", "#1116", "#1115", "#1119", "nbsp", "#1038", "#1118",
     296                 :         "#1032", "curren", "#1168", "brvbar", "sect", "#1025", "copy",
     297                 :         "#1028", "laquo", "not", "shy", "reg", "#1031", "deg", "plusmn",
     298                 :         "#1030", "#1110", "#1169", "micro", "para", "middot", "#1105",
     299                 :         "#8470", "#1108", "raquo", "#1112", "#1029", "#1109", "#1111",
     300                 :         "#1040", "#1041", "#1042", "#1043", "#1044", "#1045", "#1046", /* 0xc0 onward: contiguous run #1040..#1103 */
     301                 :         "#1047", "#1048", "#1049", "#1050", "#1051", "#1052", "#1053",
     302                 :         "#1054", "#1055", "#1056", "#1057", "#1058", "#1059", "#1060",
     303                 :         "#1061", "#1062", "#1063", "#1064", "#1065", "#1066", "#1067",
     304                 :         "#1068", "#1069", "#1070", "#1071", "#1072", "#1073", "#1074",
     305                 :         "#1075", "#1076", "#1077", "#1078", "#1079", "#1080", "#1081",
     306                 :         "#1082", "#1083", "#1084", "#1085", "#1086", "#1087", "#1088",
     307                 :         "#1089", "#1090", "#1091", "#1092", "#1093", "#1094", "#1095",
     308                 :         "#1096", "#1097", "#1098", "#1099", "#1100", "#1101", "#1102",
     309                 :         "#1103"
     310                 : };
     311                 : 
     312                 : static entity_table_t ent_iso_8859_5[] = { /* ISO-8859-5 bytes 0xc0..0xff (range given in entity_map), one slot per byte, 7 per line */
     313                 :         "#1056", "#1057", "#1058", "#1059", "#1060", "#1061", "#1062",
     314                 :         "#1063", "#1064", "#1065", "#1066", "#1067", "#1068", "#1069",
     315                 :         "#1070", "#1071", "#1072", "#1073", "#1074", "#1075", "#1076",
     316                 :         "#1077", "#1078", "#1079", "#1080", "#1081", "#1082", "#1083",
     317                 :         "#1084", "#1085", "#1086", "#1087", "#1088", "#1089", "#1090",
     318                 :         "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097",
     319                 :         "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#8470", /* last slot is 0xf0 = NUMERO SIGN U+2116; it breaks the +1 run */
     320                 :         "#1105", "#1106", "#1107", "#1108", "#1109", "#1110", "#1111",
     321                 :         "#1112", "#1113", "#1114", "#1115", "#1116", "sect", "#1118", /* 0xfd = SECTION SIGN U+00A7, not a Cyrillic letter */
     322                 :         "#1119"
     323                 : };
     324                 : 
     325                 : static entity_table_t ent_cp_866[] = { /* bytes 0xc0..0xff (64 slots, range given in entity_map); NOTE(review): bytes below 0xc0 have no entry for this charset */
     326                 : 
     327                 :         "#9492", "#9524", "#9516", "#9500", "#9472", "#9532", "#9566", 
     328                 :         "#9567", "#9562", "#9556", "#9577", "#9574", "#9568", "#9552", 
     329                 :         "#9580", "#9575", "#9576", "#9572", "#9573", "#9561", "#9560", 
     330                 :         "#9554", "#9555", "#9579", "#9578", "#9496", "#9484", "#9608", 
     331                 :         "#9604", "#9612", "#9616", "#9600", "#1088", "#1089", "#1090", 
     332                 :         "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097", 
     333                 :         "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#1025", 
     334                 :         "#1105", "#1028", "#1108", "#1031", "#1111", "#1038", "#1118", 
     335                 :         "#176", "#8729", "#183", "#8730", "#8470", "#164",  "#9632", 
     336                 :         "#160"
     337                 : };
     338                 : 
     339                 : /* MacRoman has a couple of low-ascii chars that need mapping too */
     340                 : /* Vertical tab (ASCII 11) is often used to store line breaks inside */
     341                 : /* DB exports, this mapping changes it to a space */
     342                 : static entity_table_t ent_macroman[] = { /* bytes 0x0b..0xff (245 slots, range given in entity_map), 7 per line after the first */
     343                 :         "sp", NULL, NULL, NULL, /* slot 0 is byte 0x0b; NOTE(review): "sp" does not look like a standard HTML entity name — confirm */
     344                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     345                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     346                 :         NULL, NULL, NULL, NULL, NULL, "quot", NULL, /* "quot" sits at byte 0x22 */
     347                 :         NULL, NULL, "amp", NULL, NULL, NULL, NULL, /* "amp" sits at byte 0x26 */
     348                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     349                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     350                 :         NULL, NULL, NULL, "lt", NULL, "gt", NULL, /* "lt" at 0x3c, "gt" at 0x3e */
     351                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     352                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     353                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     354                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     355                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     356                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     357                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     358                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     359                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     360                 :         NULL, "Auml", "Aring", "Ccedil", "Eacute", "Ntilde", "Ouml", /* leading NULL is 0x7f; high half starts at 0x80 */
     361                 :         "Uuml", "aacute", "agrave", "acirc", "auml", "atilde", "aring",
     362                 :         "ccedil", "eacute", "egrave", "ecirc", "euml", "iacute", "igrave",
     363                 :         "icirc", "iuml", "ntilde", "oacute", "ograve", "ocirc", "ouml",
     364                 :         "otilde", "uacute", "ugrave", "ucirc", "uuml", "dagger", "deg",
     365                 :         "cent", "pound", "sect", "bull", "para", "szlig", "reg",
     366                 :         "copy", "trade", "acute", "uml", "ne", "AElig", "Oslash",
     367                 :         "infin", "plusmn", "le", "ge", "yen", "micro", "part",
     368                 :         "sum", "prod", "pi", "int", "ordf", "ordm", "Omega",
     369                 :         "aelig", "oslash", "iquest", "iexcl", "not", "radic", "fnof",
     370                 :         "asymp", "#8710", "laquo", "raquo", "hellip", "nbsp", "Agrave",
     371                 :         "Atilde", "Otilde", "OElig", "oelig", "ndash", "mdash", "ldquo",
     372                 :         "rdquo", "lsquo", "rsquo", "divide", "loz", "yuml", "Yuml",
     373                 :         "frasl", "euro", "lsaquo", "rsaquo", "#xFB01", "#xFB02", "Dagger", /* this table mixes names, decimal (#NNN) and hex (#xNNNN) references */
     374                 :         "middot", "sbquo", "bdquo", "permil", "Acirc", "Ecirc", "Aacute",
     375                 :         "Euml", "Egrave", "Iacute", "Icirc", "Iuml", "Igrave", "Oacute",
     376                 :         "Ocirc", "#xF8FF", "Ograve", "Uacute", "Ucirc", "Ugrave", "#305",
     377                 :         "circ", "tilde", "macr", "#728", "#729", "#730", "cedil",
     378                 :         "#733", "#731", "#711"
     379                 : };
     380                 : 
     381                 : struct html_entity_map { /* ties one charset to a contiguous code range and the name table covering it */
     382                 :         enum entity_charset charset;    /* charset identifier */
     383                 :         unsigned int basechar;                  /* char code at start of table */
     384                 :         unsigned int endchar;                   /* last char code in the table */
     385                 :         entity_table_t *table;                  /* the table of mappings */
     386                 : };
     387                 : 
     388                 : static const struct html_entity_map entity_map[] = { /* each row: charset, first code, last code, table; a charset may own several rows */
     389                 :         { cs_cp1252,            0x80, 0x9f, ent_cp_1252 },
     390                 :         { cs_cp1252,            0xa0, 0xff, ent_iso_8859_1 }, /* cp1252 reuses the 8859-1 names for its upper range */
     391                 :         { cs_8859_1,            0xa0, 0xff, ent_iso_8859_1 },
     392                 :         { cs_8859_15,           0xa0, 0xff, ent_iso_8859_15 },
     393                 :         { cs_utf_8,             0xa0, 0xff, ent_iso_8859_1 },
     394                 :         { cs_utf_8,             338,  402,  ent_uni_338_402 },
     395                 :         { cs_utf_8,             710,  732,  ent_uni_spacing },
     396                 :         { cs_utf_8,             913,  982,  ent_uni_greek },
     397                 :         { cs_utf_8,             8194, 8260, ent_uni_punct },
     398                 :         { cs_utf_8,             8364, 8364, ent_uni_euro }, 
     399                 :         { cs_utf_8,             8465, 8501, ent_uni_8465_8501 },
     400                 :         { cs_utf_8,             8592, 9002, ent_uni_8592_9002 },
     401                 :         { cs_utf_8,             9674, 9674, ent_uni_9674 },
     402                 :         { cs_utf_8,             9824, 9830, ent_uni_9824_9830 },
     403                 :         { cs_big5,                      0xa0, 0xff, ent_iso_8859_1 },
     404                 :         { cs_gb2312,            0xa0, 0xff, ent_iso_8859_1 },
     405                 :         { cs_big5hkscs,         0xa0, 0xff, ent_iso_8859_1 },
     406                 :         { cs_sjis,                      0xa0, 0xff, ent_iso_8859_1 },
     407                 :         { cs_eucjp,                     0xa0, 0xff, ent_iso_8859_1 },
     408                 :         { cs_koi8r,                 0xa3, 0xff, ent_koi8r },
     409                 :         { cs_cp1251,            0x80, 0xff, ent_cp_1251 },
     410                 :         { cs_8859_5,            0xc0, 0xff, ent_iso_8859_5 },
     411                 :         { cs_cp866,                 0xc0, 0xff, ent_cp_866 },
     412                 :         { cs_macroman,          0x0b, 0xff, ent_macroman },
     413                 :         { cs_terminator } /* closing row: charset is cs_terminator (0), remaining fields are zero-initialized */
     414                 : };
     415                 : 
     416                 : static const struct { /* pairs a codeset name string with its entity_charset value */
     417                 :         const char *codeset;
     418                 :         enum entity_charset charset;
     419                 : } charset_map[] = { /* several spellings may map to the same charset */
     420                 :         { "ISO-8859-1",       cs_8859_1 },
     421                 :         { "ISO8859-1",                cs_8859_1 },
     422                 :         { "ISO-8859-15",      cs_8859_15 },
     423                 :         { "ISO8859-15",       cs_8859_15 },
     424                 :         { "utf-8",                    cs_utf_8 },
     425                 :         { "cp1252",           cs_cp1252 },
     426                 :         { "Windows-1252",     cs_cp1252 },
     427                 :         { "1252",           cs_cp1252 }, 
     428                 :         { "BIG5",                     cs_big5 },
     429                 :         { "950",            cs_big5 },
     430                 :         { "GB2312",                   cs_gb2312 },
     431                 :         { "936",            cs_gb2312 },
     432                 :         { "BIG5-HKSCS",               cs_big5hkscs },
     433                 :         { "Shift_JIS",                cs_sjis },
     434                 :         { "SJIS",             cs_sjis },
     435                 :         { "932",            cs_sjis },
     436                 :         { "EUCJP",            cs_eucjp },
     437                 :         { "EUC-JP",                   cs_eucjp },
     438                 :         { "KOI8-R",         cs_koi8r },
     439                 :         { "koi8-ru",        cs_koi8r },
     440                 :         { "koi8r",          cs_koi8r },
     441                 :         { "cp1251",         cs_cp1251 },
     442                 :         { "Windows-1251",   cs_cp1251 },
     443                 :         { "win-1251",       cs_cp1251 },
     444                 :         { "iso8859-5",      cs_8859_5 },
     445                 :         { "iso-8859-5",     cs_8859_5 },
     446                 :         { "cp866",          cs_cp866 },
     447                 :         { "866",            cs_cp866 },    
     448                 :         { "ibm866",         cs_cp866 },
     449                 :         { "MacRoman",       cs_macroman },
     450                 :         { NULL } /* closing row: codeset is NULL; presumably the end-of-list marker for whatever scans this array */
     451                 : };
     452                 : 
     453                 : static const struct { /* Entity spellings for the characters ", ', < and >; the table
                         :  * ends at the row whose entity is NULL. The code that consumes it is
                         :  * outside this part of the file. */
     454                 :         unsigned short charcode; /* the literal character */
     455                 :         char *entity; /* its entity text */
     456                 :         int entitylen; /* length of entity, not counting the terminating NUL */
     457                 :         int flags; /* an ENT_HTML_QUOTE_* value or 0; presumably the quote-style bit a caller must have set for the row to apply, 0 meaning always - TODO confirm against the users of this table */
     458                 : } basic_entities[] = {
     459                 :         { '"',     "&quot;", 6,      ENT_HTML_QUOTE_DOUBLE },
     460                 :         { '\'', "&#039;", 6,      ENT_HTML_QUOTE_SINGLE }, /* the single quote has two spellings: this one ... */
     461                 :         { '\'', "&#39;",  5,      ENT_HTML_QUOTE_SINGLE }, /* ... and the form without the leading zero */
     462                 :         { '<',       "&lt;",           4,      0 },
     463                 :         { '>',       "&gt;",           4,      0 },
     464                 :         { 0, NULL, 0, 0 } /* end-of-table marker */
     465                 : };
     466                 :         
     467                 : struct basic_entities_dec { /* One character/entity pair with the entity text stored inline;
                         :  * presumably the working copy used when decoding entities - the users
                         :  * of this struct are outside this part of the file, TODO confirm. */
     468                 :         unsigned short charcode; /* the literal character */
     469                 :         char entity[8]; /* entity text; 8 bytes fit the longest basic_entities spelling (6 characters) plus a NUL */
     470                 :         int entitylen;  /* length of the text held in entity */
     471                 : };
     472                 :         
     473                 : #define MB_RETURN { /* common exit of get_next_char(); relies on that function's locals and out-parameters */ \
     474                 :                         *newpos = pos;       /* report the current read offset */ \
     475                 :                         mbseq[mbpos] = '\0'; /* NUL-terminate the bytes copied so far */ \
     476                 :                         *mbseqlen = mbpos;   /* number of bytes copied, not counting the NUL */ \
     477                 :                         return this_char; } /* NOTE(review): expands to a bare brace block rather than do { } while (0), so "MB_RETURN;" leaves an empty statement behind */
     478                 :                                         
     479                 : #define MB_WRITE(mbchar) { /* append one byte to mbseq while room remains; used inside get_next_char() */ \
     480                 :                         mbspace--;  /* counted down before the store, so the last slot is never filled with data */ \
     481                 :                         if (mbspace == 0) {      /* out of room: return now, without storing mbchar */ \
     482                 :                                 MB_RETURN;           \
     483                 :                         }                        \
     484                 :                         mbseq[mbpos++] = (mbchar); }
     485                 : 
     486                 : /* skip one byte and return
                         :  *
                         :  * Stores pos + 1 in *newpos, sets *status to FAILURE and returns 0 from
                         :  * the enclosing function. The "one byte" is counted from the pos value
                         :  * passed in, which may already have been advanced by the caller.
                         :  * NOTE(review): the pos argument is expanded without parentheses. */
     487                 : #define MB_FAILURE(pos) do { \
     488                 :         *newpos = pos + 1; \
     489                 :         *status = FAILURE; \
     490                 :         return 0; \
     491                 : } while (0)
     492                 : 
     493                 : #define CHECK_LEN(pos, chars_need)                      /* fail (status FAILURE, return 0) unless chars_need bytes remain in str from pos */ \
     494                 :         if (chars_need < 1) {                                                /* NOTE(review): every call in get_next_char() passes 1..4, so this arm is not taken there */ \
     495                 :                 if((str_len - (pos)) < chars_need) { \
     496                 :                         *newpos = pos;                                          /* this arm leaves the offset unchanged */ \
     497                 :                         *status = FAILURE;                                      \
     498                 :                         return 0;                                                       \
     499                 :                 }                                                                               \
     500                 :         } else {                                                                        \
     501                 :                 if((str_len - (pos)) < chars_need) { \
     502                 :                         *newpos = pos + 1;                                      /* this arm steps one byte past pos */ \
     503                 :                         *status = FAILURE;                                      \
     504                 :                         return 0;                                                       \
     505                 :                 }                                                                               \
     506                 :         } /* NOTE(review): a bare if/else, not wrapped in do { } while (0); the ';' written after each use is an extra empty statement */
     507                 : 
     508                 : /* {{{ get_next_char
                         :  * Decodes one character of str, starting at offset *newpos, according to
                         :  * charset.
                         :  *
                         :  * str, str_len  input bytes and their count.
                         :  * newpos        in: offset to read from; out: offset reported back by
                         :  *               MB_RETURN on success, or by MB_FAILURE / CHECK_LEN on
                         :  *               failure.
                         :  * mbseq         receives a NUL-terminated copy of the raw bytes of the
                         :  *               character.
                         :  * mbseqlen      in: capacity of mbseq; out: number of bytes copied. When
                         :  *               the capacity is <= 0 nothing is copied, *mbseqlen is set
                         :  *               to 0 and one raw byte is returned without looking at
                         :  *               charset.
                         :  * status        set to SUCCESS on entry and to FAILURE when the input is
                         :  *               truncated or is not a valid sequence; the return value is
                         :  *               then 0.
                         :  *
                         :  * Returns the decoded code point for cs_utf_8; for a multi-byte sequence
                         :  * in the Big5/GB2312/Shift_JIS/EUC-JP cases the bytes packed most
                         :  * significant first (lead << 8 | trail, three bytes for the EUC-JP 0x8f
                         :  * form); otherwise the value of the single byte read.
     509                 :  */
     510                 : inline static unsigned int get_next_char(enum entity_charset charset,
     511                 :                 unsigned char * str,
     512                 :                 int str_len,
     513                 :                 int * newpos,
     514                 :                 unsigned char * mbseq,
     515                 :                 int * mbseqlen, 
     516                 :                 int *status)
     517           35486 : {
     518           35486 :         int pos = *newpos;
     519           35486 :         int mbpos = 0; /* next free index in mbseq */
     520           35486 :         int mbspace = *mbseqlen; /* capacity of mbseq, counted down by MB_WRITE */
     521           35486 :         unsigned int this_char = 0;
     522                 :         unsigned char next_char;
     523                 : 
     524           35486 :         *status = SUCCESS;
     525                 : 
     526           35486 :         if (mbspace <= 0) { /* no room offered for the raw bytes: hand back one undecoded byte */
     527               0 :                 *mbseqlen = 0;
     528               0 :                 CHECK_LEN(pos, 1);
     529               0 :                 *newpos = pos + 1;
     530               0 :                 return str[pos];
     531                 :         }
     532                 : 
     533           35486 :         switch (charset) {
     534                 :                 case cs_utf_8:
     535                 :                         {
     536                 :                                 unsigned char c;
     537             297 :                                 CHECK_LEN(pos, 1);
     538             297 :                                 c = str[pos];
     539             297 :                                 if (c < 0x80) { /* one-byte sequence */
     540             100 :                                         MB_WRITE(c); /* NOTE(review): in this branch pos is advanced only after the MB_WRITE calls, so an early MB_WRITE return reports the unadvanced offset with status SUCCESS - confirm callers always pass enough room */
     541             100 :                                         this_char = c;
     542             100 :                                         pos++;
     543             197 :                                 } else if (c < 0xc0) { /* 0x80..0xbf cannot start a sequence */
     544              30 :                                         MB_FAILURE(pos);
     545             167 :                                 } else if (c < 0xe0) { /* two-byte sequence */
     546              44 :                                         CHECK_LEN(pos, 2);
     547              36 :                                         if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) { /* trail byte must be 0x80..0xbf */
     548               6 :                                                 MB_FAILURE(pos);
     549                 :                                         }
     550              30 :                                         this_char = ((c & 0x1f) << 6) | (str[pos + 1] & 0x3f);
     551              30 :                                         if (this_char < 0x80) { /* value would have fit in one byte: reject the overlong form */
     552               3 :                                                 MB_FAILURE(pos);
     553                 :                                         }
     554              27 :                                         MB_WRITE((unsigned char)c);
     555              27 :                                         MB_WRITE((unsigned char)str[pos + 1]);
     556              27 :                                         pos += 2;
     557             123 :                                 } else if (c < 0xf0) { /* three-byte sequence */
     558              54 :                                         CHECK_LEN(pos, 3);
     559              30 :                                         if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
     560               3 :                                                 MB_FAILURE(pos);
     561                 :                                         }
     562              27 :                                         if (str[pos + 2] < 0x80 || str[pos + 2] > 0xbf) {
     563               2 :                                                 MB_FAILURE(pos);
     564                 :                                         }
     565              25 :                                         this_char = ((c & 0x0f) << 12) | ((str[pos + 1] & 0x3f) << 6) | (str[pos + 2] & 0x3f);
     566              25 :                                         if (this_char < 0x800) { /* overlong form; NOTE(review): values 0xD800..0xDFFF are not rejected here - confirm intended */
     567               8 :                                                 MB_FAILURE(pos);
     568                 :                                         }
     569              17 :                                         MB_WRITE((unsigned char)c);
     570              17 :                                         MB_WRITE((unsigned char)str[pos + 1]);
     571              17 :                                         MB_WRITE((unsigned char)str[pos + 2]);
     572              17 :                                         pos += 3;
     573              69 :                                 } else if (c < 0xf8) { /* four-byte sequence; NOTE(review): lead bytes up to 0xf7 are accepted, so values above 0x10FFFF can be returned - confirm intended */
     574              28 :                                         CHECK_LEN(pos, 4);
     575              16 :                                         if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
     576               2 :                                                 MB_FAILURE(pos);
     577                 :                                         }
     578              14 :                                         if (str[pos + 2] < 0x80 || str[pos + 2] > 0xbf) {
     579               2 :                                                 MB_FAILURE(pos);
     580                 :                                         }
     581              12 :                                         if (str[pos + 3] < 0x80 || str[pos + 3] > 0xbf) {
     582               2 :                                                 MB_FAILURE(pos);
     583                 :                                         }
     584              10 :                                         this_char = ((c & 0x07) << 18) | ((str[pos + 1] & 0x3f) << 12) | ((str[pos + 2] & 0x3f) << 6) | (str[pos + 3] & 0x3f);
     585              10 :                                         if (this_char < 0x10000) { /* overlong form */
     586               3 :                                                 MB_FAILURE(pos);
     587                 :                                         }
     588               7 :                                         MB_WRITE((unsigned char)c);
     589               7 :                                         MB_WRITE((unsigned char)str[pos + 1]);
     590               7 :                                         MB_WRITE((unsigned char)str[pos + 2]);
     591               7 :                                         MB_WRITE((unsigned char)str[pos + 3]);
     592               7 :                                         pos += 4;
     593                 :                                 } else { /* 0xf8 and above: not accepted as a lead byte */
     594              41 :                                         MB_FAILURE(pos);
     595                 :                                 }
     596                 :                         }
     597             151 :                         break;
     598                 :                 case cs_big5:
     599                 :                 case cs_gb2312:
     600                 :                 case cs_big5hkscs:
     601                 :                         {
     602            1262 :                                 CHECK_LEN(pos, 1);
     603            1262 :                                 this_char = str[pos++];
     604                 :                                 /* check if this is the first of a 2-byte sequence */
     605            1766 :                                 if (this_char >= 0x81 && this_char <= 0xfe) {
     606                 :                                         /* peek at the next char */
     607            1260 :                                         CHECK_LEN(pos, 1);
     608            1134 :                                         next_char = str[pos++];
     609            1134 :                                         if ((next_char >= 0x40 && next_char <= 0x7e) ||
     610                 :                                                         (next_char >= 0xa1 && next_char <= 0xfe)) {
     611                 :                                                 /* yes, this a wide char */
     612             504 :                                                 MB_WRITE(this_char);
     613             504 :                                                 MB_WRITE(next_char);
     614             504 :                                                 this_char = (this_char << 8) | next_char;
     615                 :                                         } else {
     616             630 :                                                 MB_FAILURE(pos); /* NOTE(review): pos is already past both bytes read, so *newpos becomes start + 3 - confirm this skip is intended */
     617                 :                                         }
     618                 :                                 } else {
     619               2 :                                         MB_WRITE(this_char); /* single-byte character */
     620                 :                                 }
     621                 :                         }
     622             506 :                         break;
     623                 :                 case cs_sjis:
     624                 :                         {
     625             681 :                                 CHECK_LEN(pos, 1);
     626             681 :                                 this_char = str[pos++];
     627                 :                                 /* check if this is the first of a 2-byte sequence */
     628             930 :                                 if ((this_char >= 0x81 && this_char <= 0x9f) ||
     629                 :                                         (this_char >= 0xe0 && this_char <= 0xfc)) {
     630                 :                                         /* peek at the next char */
     631             609 :                                         CHECK_LEN(pos, 1);
     632             549 :                                         next_char = str[pos++];
     633             549 :                                         if ((next_char >= 0x40 && next_char <= 0x7e) ||
     634                 :                                                 (next_char >= 0x80 && next_char <= 0xfc))
     635                 :                                         {
     636                 :                                                 /* yes, this a wide char */
     637             249 :                                                 MB_WRITE(this_char);
     638             249 :                                                 MB_WRITE(next_char);
     639             249 :                                                 this_char = (this_char << 8) | next_char;
     640                 :                                         } else {
     641             300 :                                                 MB_FAILURE(pos); /* same offset arithmetic as the Big5 case: *newpos becomes start + 3 */
     642                 :                                         }
     643                 :                                 } else {
     644              72 :                                         MB_WRITE(this_char); /* single-byte character */
     645                 :                                 }
     646             321 :                                 break;
     647                 :                         }
     648                 :                 case cs_eucjp:
     649                 :                         {
     650            2402 :                                 CHECK_LEN(pos, 1);
     651            2402 :                                 this_char = str[pos++];
     652                 :                                 /* check if this is the first of a multi-byte sequence */
     653            2602 :                                 if (this_char >= 0xa1 && this_char <= 0xfe) {
     654                 :                                         /* peek at the next char */
     655             797 :                                         CHECK_LEN(pos, 1);
     656             576 :                                         next_char = str[pos++];
     657             576 :                                         if (next_char >= 0xa1 && next_char <= 0xfe) {
     658                 :                                                 /* yes, this a jis kanji char */
     659             200 :                                                 MB_WRITE(this_char);
     660             200 :                                                 MB_WRITE(next_char);
     661             200 :                                                 this_char = (this_char << 8) | next_char;
     662                 :                                         } else {
     663             376 :                                                 MB_FAILURE(pos); /* *newpos becomes start + 3 here as well */
     664                 :                                         }
     665            1605 :                                 } else if (this_char == 0x8e) {
     666                 :                                         /* peek at the next char */
     667             661 :                                         CHECK_LEN(pos, 1);
     668             660 :                                         next_char = str[pos++];
     669             660 :                                         if (next_char >= 0xa1 && next_char <= 0xdf) {
     670                 :                                                 /* JIS X 0201 kana */
     671             443 :                                                 MB_WRITE(this_char);
     672             443 :                                                 MB_WRITE(next_char);
     673             443 :                                                 this_char = (this_char << 8) | next_char;
     674                 :                                         } else {
     675             217 :                                                 MB_FAILURE(pos); /* *newpos becomes start + 3 here as well */
     676                 :                                         }
     677             944 :                                 } else if (this_char == 0x8f) {
     678                 :                                         /* peek at the next two char */
     679                 :                                         unsigned char next2_char;
     680             661 :                                         CHECK_LEN(pos, 2);
     681             565 :                                         next_char = str[pos];
     682             565 :                                         next2_char = str[pos + 1];
     683             565 :                                         pos += 2;
     684             565 :                                         if ((next_char >= 0xa1 && next_char <= 0xfe) &&
     685                 :                                                 (next2_char >= 0xa1 && next2_char <= 0xfe)) {
     686                 :                                                 /* JIS X 0212 hojo-kanji */
     687             189 :                                                 MB_WRITE(this_char);
     688             189 :                                                 MB_WRITE(next_char);
     689             189 :                                                 MB_WRITE(next2_char);
     690             189 :                                                 this_char = (this_char << 16) | (next_char << 8) | next2_char;
     691                 :                                         } else {
     692             376 :                                                 MB_FAILURE(pos); /* NOTE(review): three bytes were consumed above, so *newpos becomes start + 4 - confirm intended */
     693                 :                                         }
     694                 :                                 } else {
     695             283 :                                         MB_WRITE(this_char); /* single-byte character */
     696                 :                                 }
     697            1115 :                                 break;
     698                 :                         }
     699                 :                 default:
     700                 :                         /* single-byte charsets */
     701           30844 :                         CHECK_LEN(pos, 1);
     702           30844 :                         this_char = str[pos++];
     703           30844 :                         MB_WRITE(this_char);
     704                 :                         break;
     705                 :         }
     706           32937 :         MB_RETURN; /* success: publish pos, the copied bytes and their count, and return this_char */
     707                 : }
     708                 : /* }}} */
     709                 : 
     710                 : /* {{{ entity_charset determine_charset
     711                 :  * returns the charset identifier based on current locale or a hint.
     712                 :  * defaults to iso-8859-1
                         :  *
                         :  * Sources are tried in this order, and the first non-empty name is looked
                         :  * up in charset_map:
                         :  *   1. charset_hint itself (a NULL hint returns cs_8859_1 immediately);
                         :  *   2. when HAVE_MBSTRING is set, mbstring's internal encoding - read from
                         :  *      MBSTRG(current_internal_encoding) unless COMPILE_DL_MBSTRING is
                         :  *      defined, in which case the function mb_internal_encoding is called;
                         :  *   3. SG(default_charset);
                         :  *   4. nl_langinfo(CODESET), where available;
                         :  *   5. the codeset part of the name returned by setlocale(LC_CTYPE, NULL),
                         :  *      or the whole name when it contains no '.'.
                         :  * A name that is not in charset_map raises an E_WARNING and the result
                         :  * stays cs_8859_1.
                         :  */
     713                 : static enum entity_charset determine_charset(char *charset_hint TSRMLS_DC)
     714            6480 : {
     715                 :         int i;
     716            6480 :         enum entity_charset charset = cs_8859_1; /* result when nothing below matches */
     717            6480 :         int len = 0; /* length of the name charset_hint currently points at */
     718            6480 :         zval *uf_result = NULL; /* return value of the mb_internal_encoding call, if that path ran */
     719                 : 
     720                 :         /* Guarantee default behaviour for backwards compatibility */
     721            6480 :         if (charset_hint == NULL)
     722            2305 :                 return cs_8859_1;
     723                 : 
     724            4175 :         if ((len = strlen(charset_hint)) != 0) { /* explicit, non-empty hint: look it up directly */
     725            4152 :                 goto det_charset;
     726                 :         }
     727                 : #if HAVE_MBSTRING
     728                 : #if !defined(COMPILE_DL_MBSTRING)
     729                 :         /* XXX: Ugly things. Why don't we look for a more sophisticated way? */
     730              23 :         switch (MBSTRG(current_internal_encoding)) { /* empty hint: map mbstring's internal encoding; encodings not listed fall through to the next source */
     731                 :                 case mbfl_no_encoding_8859_1:
     732               2 :                         return cs_8859_1;
     733                 : 
     734                 :                 case mbfl_no_encoding_utf8:
     735               0 :                         return cs_utf_8;
     736                 : 
     737                 :                 case mbfl_no_encoding_euc_jp:
     738                 :                 case mbfl_no_encoding_eucjp_win:
     739               2 :                         return cs_eucjp;
     740                 : 
     741                 :                 case mbfl_no_encoding_sjis:
     742                 :                 case mbfl_no_encoding_sjis_win:
     743                 :                 case mbfl_no_encoding_sjis_mac:
     744               2 :                         return cs_sjis;
     745                 : 
     746                 :                 case mbfl_no_encoding_cp1252:
     747               3 :                         return cs_cp1252;
     748                 : 
     749                 :                 case mbfl_no_encoding_8859_15:
     750               2 :                         return cs_8859_15;
     751                 : 
     752                 :                 case mbfl_no_encoding_big5:
     753               0 :                         return cs_big5;
     754                 : 
     755                 :                 case mbfl_no_encoding_euc_cn:
     756                 :                 case mbfl_no_encoding_hz:
     757                 :                 case mbfl_no_encoding_cp936:
     758               0 :                         return cs_gb2312;
     759                 : 
     760                 :                 case mbfl_no_encoding_koi8r:
     761               0 :                         return cs_koi8r;
     762                 : 
     763                 :                 case mbfl_no_encoding_cp866:
     764               0 :                         return cs_cp866;
     765                 : 
     766                 :                 case mbfl_no_encoding_cp1251:
     767               2 :                         return cs_cp1251;
     768                 : 
     769                 :                 case mbfl_no_encoding_8859_5:
     770               0 :                         return cs_8859_5;
     771                 : 
     772                 :                 default:
     773                 :                         ;
     774                 :         }
     775                 : #else
     776                 :         {
     777                 :                 zval nm_mb_internal_encoding;
     778                 : 
     779                 :                 ZVAL_STRING(&nm_mb_internal_encoding, "mb_internal_encoding", 0);
     780                 : 
     781                 :                 if (call_user_function_ex(CG(function_table), NULL, &nm_mb_internal_encoding, &uf_result, 0, NULL, 1, NULL TSRMLS_CC) != FAILURE) {
     782                 : 
     783                 :                         charset_hint = Z_STRVAL_P(uf_result); /* points into uf_result, which is released just before returning */
     784                 :                         len = Z_STRLEN_P(uf_result);
     785                 :                         
     786                 :                         if (len == 4) { /* sizeof(none|auto|pass)-1 */
     787                 :                                 if (!memcmp("pass", charset_hint, sizeof("pass") - 1) || 
     788                 :                                     !memcmp("auto", charset_hint, sizeof("auto") - 1) || 
     789                 :                                     !memcmp("none", charset_hint, sizeof("none") - 1)) {
     790                 :                                         
     791                 :                                         charset_hint = NULL; /* with a NULL hint the lookup below is skipped and cs_8859_1 is returned */
     792                 :                                         len = 0;
     793                 :                                 }
     794                 :                         }
     795                 :                         goto det_charset;
     796                 :                 }
     797                 :         }
     798                 : #endif
     799                 : #endif
     800                 : 
     801              10 :         charset_hint = SG(default_charset);
     802              10 :         if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) {
     803               6 :                 goto det_charset;
     804                 :         }
     805                 : 
     806                 :         /* try to detect the charset for the locale */
     807                 : #if HAVE_NL_LANGINFO && HAVE_LOCALE_H && defined(CODESET)
     808               4 :         charset_hint = nl_langinfo(CODESET);
     809               4 :         if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) {
     810               4 :                 goto det_charset;
     811                 :         }
     812                 : #endif
     813                 : 
     814                 : #if HAVE_LOCALE_H
     815                 :         /* try to figure out the charset from the locale */
     816                 :         {
     817                 :                 char *localename;
     818                 :                 char *dot, *at;
     819                 : 
     820                 :                 /* lang[_territory][.codeset][@modifier] */
     821               0 :                 localename = setlocale(LC_CTYPE, NULL); /* NOTE(review): the result goes to strchr() unchecked; setlocale() is specified as able to return NULL - confirm whether a guard is needed */
     822                 : 
     823               0 :                 dot = strchr(localename, '.');
     824               0 :                 if (dot) {
     825               0 :                         dot++;
     826                 :                         /* locale specifies a codeset */
     827               0 :                         at = strchr(dot, '@');
     828               0 :                         if (at)
     829               0 :                                 len = at - dot; /* codeset ends where the @modifier starts */
     830                 :                         else
     831               0 :                                 len = strlen(dot);
     832               0 :                         charset_hint = dot;
     833                 :                 } else {
     834                 :                         /* no explicit name; see if the name itself
     835                 :                          * is the charset */
     836               0 :                         charset_hint = localename;
     837               0 :                         len = strlen(charset_hint);
     838                 :                 }
     839                 :         }
     840                 : #endif
     841                 : 
     842            4162 : det_charset:
     843                 : 
     844            4162 :         if (charset_hint) {
     845            4162 :                 int found = 0;
     846                 :                 
     847                 :                 /* now walk the charset map and look for the codeset */
     848           58237 :                 for (i = 0; charset_map[i].codeset; i++) {
     849           58231 :                         if (len == strlen(charset_map[i].codeset) && strncasecmp(charset_hint, charset_map[i].codeset, len) == 0) { /* equal length plus case-insensitive match of the first len bytes; NOTE(review): len is int while strlen() yields size_t, a signed/unsigned comparison */
     850            4156 :                                 charset = charset_map[i].charset;
     851            4156 :                                 found = 1;
     852            4156 :                                 break;
     853                 :                         }
     854                 :                 }
     855            4162 :                 if (!found) {
     856               6 :                         php_error_docref(NULL TSRMLS_CC, E_WARNING, "charset `%s' not supported, assuming iso-8859-1", /* NOTE(review): %s prints charset_hint up to its NUL, which can include an "@modifier" beyond len when the name came from the locale */
     857                 :                                         charset_hint);
     858                 :                 }
     859                 :         }
     860            4162 :         if (uf_result != NULL) { /* non-NULL only after the call_user_function_ex() path above */
     861               0 :                 zval_ptr_dtor(&uf_result);
     862                 :         }
     863            4162 :         return charset;
     864                 : }
     865                 : /* }}} */
     866                 : 
     867                 : /* {{{ php_utf32_utf8 */
     868                 : size_t php_utf32_utf8(unsigned char *buf, int k)
     869            3684 : {
     870            3684 :         size_t retval = 0;
     871                 : 
     872            3684 :         if (k < 0x80) {
     873               0 :                 buf[0] = k;
     874               0 :                 retval = 1;
     875            3684 :         } else if (k < 0x800) {
     876            1872 :                 buf[0] = 0xc0 | (k >> 6);
     877            1872 :                 buf[1] = 0x80 | (k & 0x3f);
     878            1872 :                 retval = 2;
     879            1812 :         } else if (k < 0x10000) {
     880            1812 :                 buf[0] = 0xe0 | (k >> 12);
     881            1812 :                 buf[1] = 0x80 | ((k >> 6) & 0x3f);
     882            1812 :                 buf[2] = 0x80 | (k & 0x3f);
     883            1812 :                 retval = 3;
     884               0 :         } else if (k < 0x200000) {
     885               0 :                 buf[0] = 0xf0 | (k >> 18);
     886               0 :                 buf[1] = 0x80 | ((k >> 12) & 0x3f);
     887               0 :                 buf[2] = 0x80 | ((k >> 6) & 0x3f);
     888               0 :                 buf[3] = 0x80 | (k & 0x3f);
     889               0 :                 retval = 4;
     890               0 :         } else if (k < 0x4000000) {
     891               0 :                 buf[0] = 0xf8 | (k >> 24);
     892               0 :                 buf[1] = 0x80 | ((k >> 18) & 0x3f);
     893               0 :                 buf[2] = 0x80 | ((k >> 12) & 0x3f);
     894               0 :                 buf[3] = 0x80 | ((k >> 6) & 0x3f);
     895               0 :                 buf[4] = 0x80 | (k & 0x3f);
     896               0 :                 retval = 5;
     897                 :         } else {
     898               0 :                 buf[0] = 0xfc | (k >> 30);
     899               0 :                 buf[1] = 0x80 | ((k >> 24) & 0x3f);
     900               0 :                 buf[2] = 0x80 | ((k >> 18) & 0x3f);
     901               0 :                 buf[3] = 0x80 | ((k >> 12) & 0x3f);
     902               0 :                 buf[4] = 0x80 | ((k >> 6) & 0x3f);
     903               0 :                 buf[5] = 0x80 | (k & 0x3f);
     904               0 :                 retval = 6;
     905                 :         }
     906            3684 :         buf[retval] = '\0';
     907                 : 
     908            3684 :         return retval;
     909                 : }
     910                 : /* }}} */
     911                 : 
     912                 : /* {{{ php_unescape_html_entities
     913                 :  */
     914                 : PHPAPI char *php_unescape_html_entities(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset TSRMLS_DC)
     915              22 : {
     916                 :         int retlen;
     917                 :         int j, k;
     918                 :         char *replaced, *ret, *p, *q, *lim, *next;
     919              22 :         enum entity_charset charset = determine_charset(hint_charset TSRMLS_CC);
     920                 :         unsigned char replacement[15];
     921                 :         int replacement_len;
     922                 : 
     923              22 :         ret = estrndup(old, oldlen);
     924              22 :         retlen = oldlen;
     925              22 :         if (!retlen) {
     926               2 :                 goto empty_source;
     927                 :         }
     928                 :         
     929              20 :         if (all) {
     930                 :                 /* look for a match in the maps for this charset */
     931             500 :                 for (j = 0; entity_map[j].charset != cs_terminator; j++) {
     932             480 :                         if (entity_map[j].charset != charset)
     933             352 :                                 continue;
     934                 : 
     935           10223 :                         for (k = entity_map[j].basechar; k <= entity_map[j].endchar; k++) {
     936                 :                                 unsigned char entity[32];
     937           10095 :                                 int entity_length = 0;
     938                 : 
     939           10095 :                                 if (entity_map[j].table[k - entity_map[j].basechar] == NULL)
     940            5733 :                                         continue;
     941                 : 
     942            4362 :                                 entity_length = slprintf(entity, sizeof(entity), "&%s;", entity_map[j].table[k - entity_map[j].basechar]);
     943            4362 :                                 if (entity_length >= sizeof(entity)) {
     944               0 :                                         continue;
     945                 :                                 }
     946                 : 
     947                 :                                 /* When we have MBCS entities in the tables above, this will need to handle it */
     948            4362 :                                 replacement_len = 0;
     949            4362 :                                 switch (charset) {
     950                 :                                         case cs_8859_1:
     951                 :                                         case cs_cp1252:
     952                 :                                         case cs_8859_15:
     953                 :                                         case cs_cp1251:
     954                 :                                         case cs_8859_5:
     955                 :                                         case cs_cp866:
     956                 :                                         case cs_koi8r:
     957             678 :                                                 replacement[0] = k;
     958             678 :                                                 replacement[1] = '\0';
     959             678 :                                                 replacement_len = 1;
     960             678 :                                                 break;
     961                 : 
     962                 :                                         case cs_big5:
     963                 :                                         case cs_gb2312:
     964                 :                                         case cs_big5hkscs:
     965                 :                                         case cs_sjis:
     966                 :                                         case cs_eucjp:
     967                 :                                                 /* we cannot properly handle those multibyte encodings
     968                 :                                                  * with php_str_to_str. skip it. */ 
     969               0 :                                                 continue;
     970                 : 
     971                 :                                         case cs_utf_8:
     972            3684 :                                                 replacement_len = php_utf32_utf8(replacement, k);
     973            3684 :                                                 break;
     974                 : 
     975                 :                                         default:
     976               0 :                                                 php_error_docref(NULL TSRMLS_CC, E_WARNING, "cannot yet handle MBCS!");
     977               0 :                                                 efree(ret);
     978               0 :                                                 return NULL;
     979                 :                                 }
     980                 : 
     981            4362 :                                 if (php_memnstr(ret, entity, entity_length, ret+retlen)) {
     982              20 :                                         replaced = php_str_to_str(ret, retlen, entity, entity_length, replacement, replacement_len, &retlen);
     983              20 :                                         efree(ret);
     984              20 :                                         ret = replaced;
     985                 :                                 }
     986                 :                         }
     987                 :                 }
     988                 :         }
     989                 : 
     990             120 :         for (j = 0; basic_entities[j].charcode != 0; j++) {
     991                 : 
     992             100 :                 if (basic_entities[j].flags && (quote_style & basic_entities[j].flags) == 0)
     993              16 :                         continue;
     994                 :                 
     995              84 :                 replacement[0] = (unsigned char)basic_entities[j].charcode;
     996              84 :                 replacement[1] = '\0';
     997                 : 
     998              84 :                 if (php_memnstr(ret, basic_entities[j].entity, basic_entities[j].entitylen, ret+retlen)) {              
     999               6 :                         replaced = php_str_to_str(ret, retlen, basic_entities[j].entity, basic_entities[j].entitylen, replacement, 1, &retlen);
    1000               6 :                         efree(ret);
    1001               6 :                         ret = replaced;
    1002                 :                 }
    1003                 :         }
    1004                 : 
    1005                 :         /* replace numeric entities & "&amp;" */
    1006              20 :         lim = ret + retlen;
    1007             142 :         for (p = ret, q = ret; p < lim;) {
    1008                 :                 int code;
    1009                 : 
    1010             102 :                 if (p[0] == '&') {
    1011               8 :                         if (p + 2 < lim) {
    1012               8 :                                 if (p[1] == '#') {
    1013               0 :                                         int invalid_code = 0;
    1014                 : 
    1015               0 :                                         if (p[2] == 'x' || p[2] == 'X') {
    1016               0 :                                                 code = strtol(p + 3, &next, 16);
    1017                 :                                         } else {
    1018               0 :                                                 code = strtol(p + 2, &next, 10);
    1019                 :                                         }
    1020                 : 
    1021               0 :                                         if (next != NULL && *next == ';') {
    1022               0 :                                                 switch (charset) {
    1023                 :                                                         case cs_utf_8:
    1024               0 :                                                                 q += php_utf32_utf8(q, code);
    1025               0 :                                                                 break;
    1026                 : 
    1027                 :                                                         case cs_8859_1:
    1028                 :                                                         case cs_8859_5:
    1029                 :                                                         case cs_8859_15:
    1030               0 :                                                                 if ((code >= 0x80 && code < 0xa0) || code > 0xff) {
    1031               0 :                                                                         invalid_code = 1;
    1032                 :                                                                 } else {
    1033               0 :                                                                         if (code == 39 || !quote_style) {
    1034               0 :                                                                                 invalid_code = 1;
    1035                 :                                                                         } else {
    1036               0 :                                                                                 *(q++) = code;
    1037                 :                                                                         }
    1038                 :                                                                 }
    1039               0 :                                                                 break;
    1040                 : 
    1041                 :                                                         case cs_cp1252:
    1042               0 :                                                                 if (code > 0xff) {
    1043               0 :                                                                         invalid_code = 1;
    1044                 :                                                                 } else {
    1045               0 :                                                                         *(q++) = code;
    1046                 :                                                                 }
    1047               0 :                                                                 break;
    1048                 : 
    1049                 :                                                         case cs_cp1251:
    1050                 :                                                         case cs_cp866:
    1051                 :                                                         case cs_big5:
    1052                 :                                                         case cs_big5hkscs:
    1053                 :                                                         case cs_sjis:
    1054                 :                                                         case cs_eucjp:
    1055               0 :                                                                 if (code >= 0x80) {
    1056               0 :                                                                         invalid_code = 1;
    1057                 :                                                                 } else {
    1058               0 :                                                                         *(q++) = code;
    1059                 :                                                                 }
    1060               0 :                                                                 break;
    1061                 : 
    1062                 :                                                         case cs_gb2312:
    1063               0 :                                                                 if (code >= 0x81) {
    1064               0 :                                                                         invalid_code = 1;
    1065                 :                                                                 } else {
    1066               0 :                                                                         *(q++) = code;
    1067                 :                                                                 }
    1068               0 :                                                                 break;
    1069                 : 
    1070                 :                                                         default:
    1071                 :                                                                 /* for backwards compatilibity */
    1072               0 :                                                                 invalid_code = 1;
    1073                 :                                                                 break;
    1074                 :                                                 }
    1075               0 :                                                 if (invalid_code) {
    1076               0 :                                                         for (; p <= next; p++) {
    1077               0 :                                                                 *(q++) = *p;
    1078                 :                                                         }
    1079                 :                                                 }
    1080               0 :                                                 p = next + 1;
    1081                 :                                         } else {
    1082               0 :                                                 *(q++) = *(p++);        
    1083               0 :                                                 *(q++) = *(p++);        
    1084                 :                                         }
    1085              16 :                                 } else if (p + 4 < lim &&
    1086                 :                                                         p[1] == 'a' && p[2] == 'm' &&p[3] == 'p' &&
    1087                 :                                                         p[4] == ';') {
    1088               8 :                                         *(q++) = '&';
    1089               8 :                                         p += 5;
    1090                 :                                 } else {
    1091               0 :                                         *(q++) = *(p++);
    1092               0 :                                         *(q++) = *(p++);
    1093                 :                                 }
    1094                 :                         } else {
    1095               0 :                                 *(q++) = *(p++);        
    1096                 :                         }
    1097                 :                 } else {
    1098              94 :                         *(q++) = *(p++);        
    1099                 :                 }
    1100                 :         }
    1101              20 :         *q = '\0';
    1102              20 :         retlen = (size_t)(q - ret);
    1103              22 : empty_source:   
    1104              22 :         *newlen = retlen;
    1105              22 :         return ret;
    1106                 : }
    1107                 : /* }}} */
    1108                 : 
    1109                 : PHPAPI char *php_escape_html_entities(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset TSRMLS_DC)
    1110            1657 : {
    1111            1657 :         return php_escape_html_entities_ex(old, oldlen, newlen, all, quote_style, hint_charset, 1 TSRMLS_CC);
    1112                 : }
    1113                 : 
    1114                 : 
    1115                 : /* {{{ php_escape_html_entities
    1116                 :  */
    1117                 : PHPAPI char *php_escape_html_entities_ex(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset, zend_bool double_encode TSRMLS_DC)
    1118            6390 : {
    1119                 :         int i, j, maxlen, len;
    1120                 :         char *replaced;
    1121            6390 :         enum entity_charset charset = determine_charset(hint_charset TSRMLS_CC);
    1122                 :         int matches_map;
    1123                 : 
    1124            6390 :         maxlen = 2 * oldlen;
    1125            6390 :         if (maxlen < 128)
    1126            6338 :                 maxlen = 128;
    1127            6390 :         replaced = emalloc (maxlen);
    1128            6390 :         len = 0;
    1129            6390 :         i = 0;
    1130           45802 :         while (i < oldlen) {
    1131                 :                 unsigned char mbsequence[16];   /* allow up to 15 characters in a multibyte sequence */
    1132           35486 :                 int mbseqlen = sizeof(mbsequence);
    1133           35486 :                 int status = SUCCESS;
    1134           35486 :                 unsigned int this_char = get_next_char(charset, old, oldlen, &i, mbsequence, &mbseqlen, &status);
    1135                 : 
    1136           35486 :                 if(status == FAILURE) {
    1137                 :                         /* invalid MB sequence */
    1138            2549 :                         if (quote_style & ENT_HTML_IGNORE_ERRORS) {
    1139              85 :                                 continue;
    1140                 :                         }
    1141            2464 :                         efree(replaced);
    1142            2464 :                         if(!PG(display_errors)) {
    1143               0 :                                 php_error_docref(NULL TSRMLS_CC, E_WARNING, "Invalid multibyte sequence in argument");
    1144                 :                         }
    1145            2464 :                         *newlen = 0;
    1146            2464 :                         return STR_EMPTY_ALLOC();
    1147                 :                 }
    1148           32937 :                 matches_map = 0;
    1149                 : 
    1150           32937 :                 if (len + 16 > maxlen)
    1151               2 :                         replaced = erealloc (replaced, maxlen += 128);
    1152                 : 
    1153           32937 :                 if (all) {
    1154                 :                         /* look for a match in the maps for this charset */
    1155            2495 :                         unsigned char *rep = NULL;
    1156                 : 
    1157                 : 
    1158           59804 :                         for (j = 0; entity_map[j].charset != cs_terminator; j++) {
    1159           57452 :                                 if (entity_map[j].charset == charset
    1160                 :                                                 && this_char >= entity_map[j].basechar
    1161                 :                                                 && this_char <= entity_map[j].endchar) {
    1162             143 :                                         rep = (unsigned char*)entity_map[j].table[this_char - entity_map[j].basechar];
    1163             143 :                                         if (rep == NULL) {
    1164                 :                                                 /* there is no entity for this position; fall through and
    1165                 :                                                  * just output the character itself */
    1166               1 :                                                 break;
    1167                 :                                         }
    1168                 : 
    1169             142 :                                         matches_map = 1;
    1170             142 :                                         break;
    1171                 :                                 }
    1172                 :                         }
    1173                 : 
    1174            2495 :                         if (matches_map) {
    1175             142 :                                 int l = strlen(rep);
    1176                 :                                 /* increase the buffer size */
    1177             142 :                                 if (len + 2 + l >= maxlen) {
    1178               0 :                                         replaced = erealloc(replaced, maxlen += 128);
    1179                 :                                 }
    1180                 : 
    1181             142 :                                 replaced[len++] = '&';
    1182             142 :                                 strlcpy(replaced + len, rep, maxlen);
    1183             142 :                                 len += l;
    1184             142 :                                 replaced[len++] = ';';
    1185                 :                         }
    1186                 :                 }
    1187           32937 :                 if (!matches_map) {     
    1188           32795 :                         int is_basic = 0;
    1189                 : 
    1190           32795 :                         if (this_char == '&') {
    1191             168 :                                 if (double_encode) {
    1192             138 : encode_amp:
    1193             138 :                                         memcpy(replaced + len, "&amp;", sizeof("&amp;") - 1);
    1194             138 :                                         len += sizeof("&amp;") - 1;
    1195                 :                                 } else {
    1196              50 :                                         char *e = memchr(old + i, ';', oldlen - i);
    1197              50 :                                         char *s = old + i;
    1198                 : 
    1199              50 :                                         if (!e || (e - s) > 10) { /* minor optimization to avoid "entities" over 10 chars in length */
    1200                 :                                                 goto encode_amp;
    1201                 :                                         } else {
    1202              40 :                                                 if (*s == '#') { /* numeric entities */
    1203              12 :                                                         s++;
    1204                 :                                                         /* Hex (&#x5A;) */
    1205              16 :                                                         if (*s == 'x' || *s == 'X') {
    1206               6 :                                                                 s++;
    1207              20 :                                                                 while (s < e) {
    1208              10 :                                                                         if (!isxdigit((int)*(unsigned char *)s++)) {
    1209               2 :                                                                                 goto encode_amp;
    1210                 :                                                                         }
    1211                 :                                                                 }
    1212                 :                                                         /* Dec (&#90;)*/
    1213                 :                                                         } else {
    1214              22 :                                                                 while (s < e) {
    1215              12 :                                                                         if (!isdigit((int)*(unsigned char *)s++)) {
    1216               2 :                                                                                 goto encode_amp;
    1217                 :                                                                         }
    1218                 :                                                                 }
    1219                 :                                                         }
    1220                 :                                                 } else { /* text entities */
    1221             124 :                                                         while (s < e) {
    1222              74 :                                                                 if (!isalnum((int)*(unsigned char *)s++)) {
    1223               6 :                                                                         goto encode_amp;
    1224                 :                                                                 }
    1225                 :                                                         }
    1226                 :                                                 }
    1227              30 :                                                 replaced[len++] = '&';
    1228                 :                                         }
    1229                 :                                 }
    1230             168 :                                 is_basic = 1;
    1231                 :                         } else {
    1232          194949 :                                 for (j = 0; basic_entities[j].charcode != 0; j++) {
    1233          162627 :                                         if ((basic_entities[j].charcode != this_char) ||
    1234                 :                                                         (basic_entities[j].flags &&
    1235                 :                                                         (quote_style & basic_entities[j].flags) == 0)) {
    1236                 :                                                 continue;
    1237                 :                                         }
    1238                 : 
    1239             305 :                                         memcpy(replaced + len, basic_entities[j].entity, basic_entities[j].entitylen);
    1240             305 :                                         len += basic_entities[j].entitylen;
    1241                 :                 
    1242             305 :                                         is_basic = 1;
    1243             305 :                                         break;
    1244                 :                                 }
    1245                 :                         }
    1246                 : 
    1247           32795 :                         if (!is_basic) {
    1248                 :                                 /* a wide char without a named entity; pass through the original sequence */
    1249           32322 :                                 if (mbseqlen > 1) {
    1250            1618 :                                         memcpy(replaced + len, mbsequence, mbseqlen);
    1251            1618 :                                         len += mbseqlen;
    1252                 :                                 } else {
    1253           30704 :                                         replaced[len++] = (unsigned char)this_char;
    1254                 :                                 }
    1255                 :                         }
    1256                 :                 }
    1257                 :         }
    1258            3926 :         replaced[len] = '\0';
    1259            3926 :         *newlen = len;
    1260                 : 
    1261            3926 :         return replaced;
    1262                 : 
    1263                 : 
    1264                 : }
    1265                 : /* }}} */
    1266                 : 
    1267                 : /* {{{ php_html_entities
    1268                 :  */
    1269                 : static void php_html_entities(INTERNAL_FUNCTION_PARAMETERS, int all)
    1270            4735 : {
    1271            4735 :         char *str, *hint_charset = NULL;
    1272            4735 :         int str_len, hint_charset_len = 0;
    1273                 :         int len;
    1274            4735 :         long quote_style = ENT_COMPAT;
    1275                 :         char *replaced;
    1276            4735 :         zend_bool double_encode = 1;
    1277                 : 
    1278            4735 :         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|ls!b", &str, &str_len, &quote_style, &hint_charset, &hint_charset_len, &double_encode) == FAILURE) {
    1279               2 :                 return;
    1280                 :         }
    1281                 : 
    1282            4733 :         replaced = php_escape_html_entities_ex(str, str_len, &len, all, quote_style, hint_charset, double_encode TSRMLS_CC);
    1283            4733 :         RETVAL_STRINGL(replaced, len, 0);
    1284                 : }
    1285                 : /* }}} */
    1286                 : 
    1287                 : #define HTML_SPECIALCHARS       0
    1288                 : #define HTML_ENTITIES           1
    1289                 : 
    1290                 : /* {{{ register_html_constants
    1291                 :    Registers the HTML_* table selectors and the ENT_* quote-style values as long constants, each with the CONST_PERSISTENT|CONST_CS flags. */
    1292                 : void register_html_constants(INIT_FUNC_ARGS)
    1293           17633 : {
    1294           17633 :         REGISTER_LONG_CONSTANT("HTML_SPECIALCHARS", HTML_SPECIALCHARS, CONST_PERSISTENT|CONST_CS);
    1295           17633 :         REGISTER_LONG_CONSTANT("HTML_ENTITIES", HTML_ENTITIES, CONST_PERSISTENT|CONST_CS);
    1296           17633 :         REGISTER_LONG_CONSTANT("ENT_COMPAT", ENT_COMPAT, CONST_PERSISTENT|CONST_CS);
    1297           17633 :         REGISTER_LONG_CONSTANT("ENT_QUOTES", ENT_QUOTES, CONST_PERSISTENT|CONST_CS);
    1298           17633 :         REGISTER_LONG_CONSTANT("ENT_NOQUOTES", ENT_NOQUOTES, CONST_PERSISTENT|CONST_CS);
    1299           17633 :         REGISTER_LONG_CONSTANT("ENT_IGNORE", ENT_IGNORE, CONST_PERSISTENT|CONST_CS);
    1300           17633 : }
    1301                 : /* }}} */
    1302                 : 
    1303                 : /* {{{ proto string htmlspecialchars(string string [, int quote_style[, string charset[, bool double_encode]]])
    1304                 :    Convert special characters to HTML entities. Thin wrapper: all argument handling happens in php_html_entities(). */
    1305                 : PHP_FUNCTION(htmlspecialchars)
    1306            4319 : {
    1307            4319 :         php_html_entities(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0); /* all = 0 */
    1308            4319 : }
    1309                 : /* }}} */
    1310                 : 
    1311                 : /* {{{ proto string htmlspecialchars_decode(string string [, int quote_style])
    1312                 :    Convert special HTML entities back to characters. Decodes in place on a copy of the input; scanning resumes after each decoded character, so decoded output is never rescanned. */
    1313                 : PHP_FUNCTION(htmlspecialchars_decode)
    1314             113 : {
    1315                 :         char *str, *new_str, *e, *p; /* e: one past the last live byte of new_str; p: scan cursor */
    1316                 :         int len, j, i, new_len;
    1317             113 :         long quote_style = ENT_COMPAT;
    1318                 :         struct basic_entities_dec basic_entities_dec[8]; /* NOTE(review): the fill loop below never checks j against this size of 8; assumes basic_entities[] has at most 7 non-terminator entries -- confirm */
    1319                 : 
    1320             113 :         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|l", &str, &len, &quote_style) == FAILURE) {
    1321              19 :                 return;
    1322                 :         }
    1323                 : 
    1324              94 :         new_str = estrndup(str, len); /* private copy that is edited in place and then returned */
    1325              94 :         new_len = len;
    1326              94 :         e = new_str + new_len;
    1327                 : 
    1328              94 :         if (!(p = memchr(new_str, '&', new_len))) {
    1329              25 :                 RETURN_STRINGL(new_str, new_len, 0); /* no ampersand at all: nothing to decode */
    1330                 :         }
    1331                 :         /* Build the local decode table from basic_entities[], skipping flagged entries whose flag is not set in quote_style. */
    1332             414 :         for (j = 0, i = 0; basic_entities[i].charcode != 0; i++) {
    1333             345 :                 if (basic_entities[i].flags && !(quote_style & basic_entities[i].flags)) {
    1334             133 :                         continue;
    1335                 :                 }
    1336             212 :                 basic_entities_dec[j].charcode = basic_entities[i].charcode;
    1337             212 :                 memcpy(basic_entities_dec[j].entity, basic_entities[i].entity, basic_entities[i].entitylen + 1); /* + 1 copies one byte past entitylen, presumably the NUL terminator */
    1338             212 :                 basic_entities_dec[j].entitylen = basic_entities[i].entitylen;
    1339             212 :                 j++;
    1340                 :         }
    1341              69 :         basic_entities_dec[j].charcode = '&'; /* the ampersand entity is always decoded, whatever quote_style is */
    1342              69 :         basic_entities_dec[j].entitylen = sizeof("&amp;") - 1;
    1343              69 :         memcpy(basic_entities_dec[j].entity, "&amp;", sizeof("&amp;"));
    1344              69 :         i = j + 1; /* i now holds the number of entries in the decode table */
    1345                 :         /* At each ampersand, try every table entry; on a match collapse the entity to one character. */
    1346                 :         do {
    1347             343 :                 int l = e - p; /* bytes left from this ampersand to the end of the live data */
    1348                 :         
    1349            1188 :                 for (j = 0; j < i; j++) {
    1350            1014 :                         if (basic_entities_dec[j].entitylen > l) {
    1351               0 :                                 continue; /* entity is longer than what is left, so it cannot match */
    1352                 :                         }
    1353            1014 :                         if (!memcmp(p, basic_entities_dec[j].entity, basic_entities_dec[j].entitylen)) {
    1354             169 :                                 int e_len = basic_entities_dec[j].entitylen - 1; /* bytes to drop once the ampersand itself has been overwritten */
    1355                 :                 
    1356             169 :                                 *p++ = basic_entities_dec[j].charcode; /* overwrite the ampersand with the decoded character */
    1357             169 :                                 memmove(p, p + e_len, (e - p - e_len)); /* shift the tail left over the rest of the entity */
    1358             169 :                                 e -= e_len;
    1359             169 :                                 goto done;
    1360                 :                         }
    1361                 :                 }
    1362             174 :                 p++; /* not a known entity: keep the ampersand and continue after it */
    1363                 : 
    1364             343 : done:
    1365             343 :                 if (p >= e) {
    1366              20 :                         break; /* reached the end; avoids calling memchr with a zero length */
    1367                 :                 }
    1368             323 :         } while ((p = memchr(p, '&', (e - p))));
    1369                 : 
    1370              69 :         new_len = e - new_str;
    1371                 : 
    1372              69 :         new_str[new_len] = '\0'; /* the tail was moved without a terminator, so re-terminate at the new end */
    1373              69 :         RETURN_STRINGL(new_str, new_len, 0);
    1374                 : }
    1375                 : /* }}} */
    1376                 : 
    1377                 : /* {{{ proto string html_entity_decode(string string [, int quote_style[, string charset]])
    1378                 :    Convert all HTML entities to their applicable characters. Returns FALSE when php_unescape_html_entities() yields NULL. */
    1379                 : PHP_FUNCTION(html_entity_decode)
    1380              22 : {
    1381              22 :         char *str, *hint_charset = NULL;
    1382              22 :         int str_len, hint_charset_len = 0, len;
    1383              22 :         long quote_style = ENT_COMPAT;
    1384                 :         char *replaced;
    1385                 : 
    1386              22 :         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|ls", &str, &str_len, /* NOTE(review): charset uses "s" here but "s!" in php_html_entities() -- confirm the difference is intended */
    1387                 :                                                           &quote_style, &hint_charset, &hint_charset_len) == FAILURE) {
    1388               0 :                 return;
    1389                 :         }
    1390                 : 
    1391              22 :         replaced = php_unescape_html_entities(str, str_len, &len, 1, quote_style, hint_charset TSRMLS_CC); /* fourth argument 1: presumably the decode-all-entities switch -- confirm against the callee */
    1392              22 :         if (replaced) {
    1393              22 :                 RETURN_STRINGL(replaced, len, 0);
    1394                 :         }
    1395               0 :         RETURN_FALSE;
    1396                 : }
    1397                 : /* }}} */
    1398                 : 
    1399                 : 
    1400                 : /* {{{ proto string htmlentities(string string [, int quote_style[, string charset[, bool double_encode]]])
    1401                 :    Convert all applicable characters to HTML entities. Thin wrapper: all argument handling happens in php_html_entities(). */
    1402                 : PHP_FUNCTION(htmlentities)
    1403             416 : {
    1404             416 :         php_html_entities(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1); /* all = 1 */
    1405             416 : }
    1406                 : /* }}} */
    1407                 : 
    1408                 : /* {{{ proto array get_html_translation_table([int table [, int quote_style]])
    1409                 :    Returns the internal translation table used by htmlspecialchars and htmlentities, as an array mapping a one-byte character key to its entity string. */
    1410                 : PHP_FUNCTION(get_html_translation_table)
    1411              68 : {
    1412              68 :         long which = HTML_SPECIALCHARS, quote_style = ENT_COMPAT;
    1413                 :         int i, j;
    1414                 :         char ind[2]; /* array key buffer: one character plus a NUL terminator */
    1415              68 :         enum entity_charset charset = determine_charset(NULL TSRMLS_CC); /* NOTE(review): evaluated before argument parsing, so it also runs when parsing fails */
    1416                 : 
    1417              68 :         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "|ll", &which, &quote_style) == FAILURE) {
    1418              34 :                 return;
    1419                 :         }
    1420                 : 
    1421              34 :         array_init(return_value);
    1422                 : 
    1423              34 :         ind[1] = 0;
    1424                 : 
    1425              34 :         switch (which) {
    1426                 :                 case HTML_ENTITIES:
    1427             200 :                         for (j=0; entity_map[j].charset != cs_terminator; j++) {
    1428             192 :                                 if (entity_map[j].charset != charset)
    1429             184 :                                         continue; /* only map rows for the detected charset contribute */
    1430             776 :                                 for (i = 0; i <= entity_map[j].endchar - entity_map[j].basechar; i++) {
    1431                 :                                         char buffer[16]; /* NOTE(review): snprintf below silently truncates an entity name longer than 13 characters */
    1432                 : 
    1433             768 :                                         if (entity_map[j].table[i] == NULL)
    1434               0 :                                                 continue; /* no entity defined for this code */
    1435                 :                                         /* NOTE(review): the key is a single char, so a value of i + basechar that does not fit in one byte is truncated; wide characters are not handled here */
    1436             768 :                                         ind[0] = i + entity_map[j].basechar;
    1437             768 :                                         snprintf(buffer, sizeof(buffer), "&%s;", entity_map[j].table[i]);
    1438             768 :                                         add_assoc_string(return_value, ind, buffer, 1);
    1439                 : 
    1440                 :                                 }
    1441                 :                         }
    1442                 :                         /* fall through: the HTML_ENTITIES table also contains the basic entities added below */
    1443                 : 
    1444                 :                 case HTML_SPECIALCHARS:
    1445             204 :                         for (j = 0; basic_entities[j].charcode != 0; j++) {
    1446                 : 
    1447             170 :                                 if (basic_entities[j].flags && (quote_style & basic_entities[j].flags) == 0)
    1448              70 :                                         continue; /* flagged entry whose flag is not set in quote_style */
    1449                 :                                 
    1450             100 :                                 ind[0] = (unsigned char)basic_entities[j].charcode;
    1451             100 :                                 add_assoc_stringl(return_value, ind, basic_entities[j].entity, basic_entities[j].entitylen, 1);
    1452                 :                         }
    1453              34 :                         add_assoc_stringl(return_value, "&", "&amp;", sizeof("&amp;") - 1, 1); /* the ampersand mapping is always included */
    1454                 : 
    1455                 :                         break;
    1456                 :         } /* no default case: any other value of which leaves the returned array empty */
    1457                 : }
    1458                 : /* }}} */
    1459                 : 
    1460                 : /*
    1461                 :  * Local variables:
    1462                 :  * tab-width: 4
    1463                 :  * c-basic-offset: 4
    1464                 :  * End:
    1465                 :  * vim600: sw=4 ts=4 fdm=marker
    1466                 :  * vim<600: sw=4 ts=4
    1467                 :  */

Generated by: LTP GCOV extension version 1.5

Generated at Sat, 21 Nov 2009 12:27:12 +0000 (3 days ago)

Copyright © 2005-2009 The PHP Group
All rights reserved.