PHP  
 PHP: Test and Code Coverage Analysis
downloads | QA | documentation | faq | getting help | mailing lists | reporting bugs | php.net sites | links | my php.net 
 

LTP GCOV extension - code coverage report
Current view: directory - standard - html.c
Test: PHP Code Coverage
Date: 2009-11-23 Instrumented lines: 506
Code covered: 62.1 % Executed lines: 314
Legend: not executed executed

       1                 : /*
       2                 :    +----------------------------------------------------------------------+
       3                 :    | PHP Version 6                                                        |
       4                 :    +----------------------------------------------------------------------+
       5                 :    | Copyright (c) 1997-2009 The PHP Group                                |
       6                 :    +----------------------------------------------------------------------+
       7                 :    | This source file is subject to version 3.01 of the PHP license,      |
       8                 :    | that is bundled with this package in the file LICENSE, and is        |
       9                 :    | available through the world-wide-web at the following url:           |
      10                 :    | http://www.php.net/license/3_01.txt                                  |
      11                 :    | If you did not receive a copy of the PHP license and are unable to   |
      12                 :    | obtain it through the world-wide-web, please send a note to          |
      13                 :    | license@php.net so we can mail you a copy immediately.               |
      14                 :    +----------------------------------------------------------------------+
      15                 :    | Authors: Rasmus Lerdorf <rasmus@php.net>                             |
      16                 :    |          Jaakko Hyvätti <jaakko.hyvatti@iki.fi>                      |
      17                 :    |          Wez Furlong <wez@thebrainroom.com>                          |
      18                 :    +----------------------------------------------------------------------+
      19                 : */
      20                 : 
      21                 : /* $Id: html.c 289605 2009-10-13 05:18:37Z moriyoshi $ */
      22                 : 
      23                 : /*
      24                 :  * HTML entity resources:
      25                 :  *
      26                 :  * http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset2.asp
      27                 :  * http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset3.asp
      28                 :  * http://www.unicode.org/Public/MAPPINGS/OBSOLETE/UNI2SGML.TXT
      29                 :  *
      30                 :  * http://www.w3.org/TR/2002/REC-xhtml1-20020801/dtds.html#h-A2
      31                 :  * 
      32                 :  * UNICODE NOTE:
      33                 :  *  Unicode support (namely, IS_UNICODE support) is implemented by converting
      34                 :  *  IS_UNICODE strings to UTF-8 and handing them off to the existing
      35                 :  *  implementation. This avoids rewriting all the code that encodes and decodes
      36                 :  *  entities to support UChar*, but it does result in a slight performance loss.
      37                 :  *  Whoever wants to do this properly, go ahead.
      38                 :  */
      39                 : 
      40                 : #include "php.h"
      41                 : #if PHP_WIN32
      42                 : #include "config.w32.h"
      43                 : #else
      44                 : #include <php_config.h>
      45                 : #endif
      46                 : #include "html.h"
      47                 : #include "php_string.h"
      48                 : #include "SAPI.h"
      49                 : #if HAVE_LOCALE_H
      50                 : #include <locale.h>
      51                 : #endif
      52                 : #if HAVE_LANGINFO_H
      53                 : #include <langinfo.h>
      54                 : #endif
      55                 : 
      56                 : #if HAVE_MBSTRING
      57                 : # include "ext/mbstring/mbstring.h"
      58                 : ZEND_EXTERN_MODULE_GLOBALS(mbstring)
      59                 : #endif
      60                 : 
      61                 : enum entity_charset { cs_terminator, cs_8859_1, cs_cp1252, /* charset ids; cs_terminator is first (value 0) and marks the last row of entity_map[] */
      62                 :                                           cs_8859_15, cs_utf_8, cs_big5, cs_gb2312, 
      63                 :                                           cs_big5hkscs, cs_sjis, cs_eucjp, cs_koi8r,
      64                 :                                           cs_cp1251, cs_8859_5, cs_cp866, cs_macroman
      65                 :                                         };
      66                 : typedef const char *const entity_table_t; /* one table slot: constant pointer to a constant entity-name string */
      67                 : 
      68                 : /* Codepage 1252 is a Windows extension to ISO-8859-1. This table names bytes 0x80..0x9f (see entity_map); NULL slots have no name listed. */
      69                 : static entity_table_t ent_cp_1252[] = {
      70                 :         "euro", NULL, "sbquo", "fnof", "bdquo", "hellip", "dagger", /* 0x80 - 0x86 */
      71                 :         "Dagger", "circ", "permil", "Scaron", "lsaquo", "OElig", /* 0x87 - 0x8c */
      72                 :         NULL, NULL, NULL, NULL, "lsquo", "rsquo", "ldquo", "rdquo", /* 0x8d - 0x94 */
      73                 :         "bull", "ndash", "mdash", "tilde", "trade", "scaron", "rsaquo", /* 0x95 - 0x9b */
      74                 :         "oelig", NULL, NULL, "Yuml" /* 0x9c - 0x9f */
      75                 : };
      76                 : 
      77                 : static entity_table_t ent_iso_8859_1[] = { /* entity names for codes 0xa0..0xff; entity_map reuses this table for several charsets */
      78                 :         "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar", /* starts at 0xa0 */
      79                 :         "sect", "uml", "copy", "ordf", "laquo", "not", "shy", "reg",
      80                 :         "macr", "deg", "plusmn", "sup2", "sup3", "acute", "micro",
      81                 :         "para", "middot", "cedil", "sup1", "ordm", "raquo", "frac14",
      82                 :         "frac12", "frac34", "iquest", "Agrave", "Aacute", "Acirc",
      83                 :         "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
      84                 :         "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
      85                 :         "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
      86                 :         "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
      87                 :         "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
      88                 :         "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
      89                 :         "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
      90                 :         "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
      91                 :         "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
      92                 :         "uuml", "yacute", "thorn", "yuml" /* ends at 0xff */
      93                 : };
      94                 : 
      95                 : static entity_table_t ent_iso_8859_15[] = { /* codes 0xa0..0xff for ISO-8859-15; same layout as ent_iso_8859_1 except the slots noted below */
      96                 :         "nbsp", "iexcl", "cent", "pound", "euro", "yen", "Scaron", /* 0xa4 is euro, 0xa6 is Scaron */
      97                 :         "sect", "scaron", "copy", "ordf", "laquo", "not", "shy", "reg", /* 0xa8 is scaron */
      98                 :         "macr", "deg", "plusmn", "sup2", "sup3", NULL, /* Zcaron */
      99                 :         "micro", "para", "middot", NULL, /* zcaron */ "sup1", "ordm",
     100                 :         "raquo", "OElig", "oelig", "Yuml", "iquest", "Agrave", "Aacute", /* 0xbc - 0xbe are OElig, oelig, Yuml */
     101                 :         "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
     102                 :         "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
     103                 :         "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
     104                 :         "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
     105                 :         "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
     106                 :         "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
     107                 :         "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
     108                 :         "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
     109                 :         "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
     110                 :         "uuml", "yacute", "thorn", "yuml"
     111                 : };
     112                 : 
     113                 : static entity_table_t ent_uni_338_402[] = { /* code points 338..402; NULL slots are not mapped */
     114                 :         /* 338 (0x0152) */
     115                 :         "OElig", "oelig", NULL, NULL, NULL, NULL,
     116                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     117                 :         /* 352 (0x0160) */
     118                 :         "Scaron", "scaron", NULL, NULL, NULL, NULL, NULL, NULL,
     119                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     120                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     121                 :         /* 376 (0x0178) */
     122                 :         "Yuml", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     123                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     124                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     125                 :         /* 400 (0x0190) */
     126                 :         NULL, NULL, "fnof" /* fnof sits at 402 */
     127                 : };
     128                 : 
     129                 : static entity_table_t ent_uni_spacing[] = { /* code points 710..732; only the two ends carry a name */
     130                 :         /* 710 */
     131                 :         "circ",
     132                 :         /* 711 - 730 */
     133                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     134                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     135                 :         /* 731 - 732 */
     136                 :         NULL, "tilde"
     137                 : };
     138                 : 
     139                 : static entity_table_t ent_uni_greek[] = { /* code points 913..982 */
     140                 :         /* 913 */
     141                 :         "Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta",
     142                 :         "Iota", "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi", "Rho",
     143                 :         NULL, "Sigma", "Tau", "Upsilon", "Phi", "Chi", "Psi", "Omega", /* leading NULL is 930, which is not mapped */
     144                 :         /* 938 - 944 are not mapped */
     145                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     146                 :         "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta", /* 945 onwards */
     147                 :         "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho",
     148                 :         "sigmaf", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega",
     149                 :         /* 970 - 976 are not mapped */
     150                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     151                 :         "thetasym", "upsih", /* 977 - 978 */
     152                 :         NULL, NULL, NULL, /* 979 - 981 are not mapped */
     153                 :         "piv" /* 982 */
     154                 : };
     155                 : 
     156                 : static entity_table_t ent_uni_punct[] = { /* code points 8194..8260; NULL slots are not mapped */
     157                 :         /* 8194 */
     158                 :         "ensp", "emsp", NULL, NULL, NULL, NULL, NULL,
     159                 :         "thinsp", NULL, NULL, "zwnj", "zwj", "lrm", "rlm", /* 8201 - 8207 */
     160                 :         NULL, NULL, NULL, "ndash", "mdash", NULL, NULL, NULL, /* 8208 - 8215 */
     161                 :         /* 8216 */
     162                 :         "lsquo", "rsquo", "sbquo", NULL, "ldquo", "rdquo", "bdquo", NULL,
     163                 :         "dagger", "Dagger", "bull", NULL, NULL, NULL, "hellip", /* 8224 - 8230 */
     164                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "permil", NULL, /* 8231 - 8241 */
     165                 :         /* 8242 */
     166                 :         "prime", "Prime", NULL, NULL, NULL, NULL, NULL, "lsaquo", "rsaquo", NULL,
     167                 :         NULL, NULL, "oline", NULL, NULL, NULL, NULL, NULL, /* 8252 - 8259 */
     168                 :         "frasl" /* 8260 */
     169                 : };
     170                 : 
     171                 : static entity_table_t ent_uni_euro[] = { /* single-entry table; entity_map pairs it with code point 8364 */
     172                 :         "euro"
     173                 : };
     174                 : 
     175                 : static entity_table_t ent_uni_8465_8501[] = { /* code points 8465..8501; NULL slots are not mapped */
     176                 :         /* 8465 */
     177                 :         "image", NULL, NULL, NULL, NULL, NULL, NULL,
     178                 :         /* 8472 */
     179                 :         "weierp", NULL, NULL, NULL,
     180                 :         /* 8476 */
     181                 :         "real", NULL, NULL, NULL, NULL, NULL,
     182                 :         /* 8482 */
     183                 :         "trade", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     184                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     185                 :         /* 8501 */
     186                 :         "alefsym",
     187                 : };
     188                 : 
     189                 : static entity_table_t ent_uni_8592_9002[] = { /* code points 8592..9002 (arrows, math operators, misc technical); NULL slots are not mapped */
     190                 :         /* 8592 (0x2190) */
     191                 :         "larr", "uarr", "rarr", "darr", "harr", NULL, NULL, NULL,
     192                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     193                 :         /* 8608 (0x21a0) */
     194                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     195                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     196                 :         /* 8624 (0x21b0) */
     197                 :         NULL, NULL, NULL, NULL, NULL, "crarr", NULL, NULL,
     198                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     199                 :         /* 8640 (0x21c0) */
     200                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     201                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     202                 :         /* 8656 (0x21d0) */
     203                 :         "lArr", "uArr", "rArr", "dArr", "hArr", "vArr", NULL, NULL,
     204                 :         NULL, NULL, "lAarr", "rAarr", NULL, "rarrw", NULL, NULL,
     205                 :         /* 8672 (0x21e0) - this section spans 32 slots, up to 8703 */
     206                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     207                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     208                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     209                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     210                 :         /* 8704 (0x2200) */
     211                 :         "forall", "comp", "part", "exist", "nexist", "empty", NULL, "nabla",
     212                 :         "isin", "notin", "epsis", "ni", "notni", "bepsi", NULL, "prod",
     213                 :         /* 8720 (0x2210) */
     214                 :         "coprod", "sum", "minus", "mnplus", "plusdo", NULL, "setmn", "lowast",
     215                 :         "compfn", NULL, "radic", NULL, NULL, "prop", "infin", "ang90",
     216                 :         /* 8736 (0x2220) */
     217                 :         "ang", "angmsd", "angsph", "mid", "nmid", "par", "npar", "and",
     218                 :         "or", "cap", "cup", "int", NULL, NULL, "conint", NULL,
     219                 :         /* 8752 (0x2230) */
     220                 :         NULL, NULL, NULL, NULL, "there4", "becaus", NULL, NULL,
     221                 :         NULL, NULL, NULL, NULL, "sim", "bsim", NULL, NULL,
     222                 :         /* 8768 (0x2240) */
     223                 :         "wreath", "nsim", NULL, "sime", "nsime", "cong", NULL, "ncong",
     224                 :         "asymp", "nap", "ape", NULL, "bcong", NULL, "bump", "bumpe", /* 8781 (U+224D) is left unmapped: HTML 4 assigns "asymp" to 8776 (U+2248) only, and a second "asymp" here made two code points share one name */
     225                 :         /* 8784 (0x2250) */
     226                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     227                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     228                 :         /* 8800 (0x2260) */
     229                 :         "ne", "equiv", NULL, NULL, "le", "ge", "lE", "gE",
     230                 :         "lnE", "gnE", "Lt", "Gt", "twixt", NULL, "nlt", "ngt",
     231                 :         /* 8816 (0x2270) */
     232                 :         "nles", "nges", "lsim", "gsim", NULL, NULL, "lg", "gl",
     233                 :         NULL, NULL, "pr", "sc", "cupre", "sscue", "prsim", "scsim",
     234                 :         /* 8832 (0x2280) */
     235                 :         "npr", "nsc", "sub", "sup", "nsub", "nsup", "sube", "supe",
     236                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     237                 :         /* 8848 (0x2290) */
     238                 :         NULL, NULL, NULL, NULL, NULL, "oplus", NULL, "otimes",
     239                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     240                 :         /* 8864 (0x22a0) */
     241                 :         NULL, NULL, NULL, NULL, NULL, "perp", NULL, NULL,
     242                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     243                 :         /* 8880 (0x22b0) */
     244                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     245                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     246                 :         /* 8896 (0x22c0) */
     247                 :         NULL, NULL, NULL, NULL, NULL, "sdot", NULL, NULL,
     248                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     249                 :         /* 8912 (0x22d0) */
     250                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     251                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     252                 :         /* 8928 (0x22e0) */
     253                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     254                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     255                 :         /* 8944 (0x22f0) */
     256                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     257                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     258                 :         /* 8960 (0x2300) */
     259                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     260                 :         "lceil", "rceil", "lfloor", "rfloor", NULL, NULL, NULL, NULL,
     261                 :         /* 8976 (0x2310) */
     262                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     263                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     264                 :         /* 8992 (0x2320) */
     265                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     266                 :         NULL, "lang", "rang" /* 9000 - 9002 */
     267                 : };
     268                 : 
     269                 : static entity_table_t ent_uni_9674[] = { /* single-entry table for one isolated code point */
     270                 :         /* 9674 */
     271                 :         "loz"
     272                 : };
     273                 : 
     274                 : static entity_table_t ent_uni_9824_9830[] = { /* code points 9824..9830 (card suits); NULL slots are not mapped */
     275                 :         /* 9824 */
     276                 :         "spades", NULL, NULL, "clubs", NULL, "hearts", "diams"
     277                 : };
     278                 : 
     279                 : static entity_table_t ent_koi8r[] = { /* KOI8-R bytes 0xa3..0xff, given as numeric character references; NULL slots are not mapped */
     280                 :         "#1105", /* 0xa3: "jo" */
     281                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
     282                 :         NULL, NULL, NULL, NULL, NULL, "#1025", /* 0xb3: "JO" */
     283                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
     284                 :         "#1102", "#1072", "#1073", "#1094", "#1076", "#1077", "#1092", /* letters start at 0xc0; values follow KOI8-R byte order, so they are not ascending */
     285                 :         "#1075", "#1093", "#1080", "#1081", "#1082", "#1083", "#1084", 
     286                 :         "#1085", "#1086", "#1087", "#1103", "#1088", "#1089", "#1090", 
     287                 :         "#1091", "#1078", "#1074", "#1100", "#1099", "#1079", "#1096", 
     288                 :         "#1101", "#1097", "#1095", "#1098", "#1070", "#1040", "#1041", 
     289                 :         "#1062", "#1044", "#1045", "#1060", "#1043", "#1061", "#1048", 
     290                 :         "#1049", "#1050", "#1051", "#1052", "#1053", "#1054", "#1055", 
     291                 :         "#1071", "#1056", "#1057", "#1058", "#1059", "#1046", "#1042",
     292                 :         "#1068", "#1067", "#1047", "#1064", "#1069", "#1065", "#1063", 
     293                 :         "#1066"
     294                 : };
     295                 : 
     296                 : static entity_table_t ent_cp_1251[] = { /* Windows-1251 bytes 0x80..0xff; named entity where one exists, otherwise "#<decimal code point>"; NULL = not mapped */
     297                 :         "#1026", "#1027", "#8218", "#1107", "#8222", "hellip", "dagger", /* 0x80 - 0x86 */
     298                 :         "Dagger", "euro", "permil", "#1033", "#8249", "#1034", "#1036", /* 0x87 - 0x8d */
     299                 :         "#1035", "#1039", "#1106", "#8216", "#8217", "#8220", "#8221", /* 0x8e - 0x94; 0x93/0x94 are the double quotes U+201C/U+201D (8220/8221), not 8219/8220 */
     300                 :         "bull", "ndash", "mdash", NULL, "trade", "#1113", "#8250", /* 0x95 - 0x9b; 0x98 is unassigned */
     301                 :         "#1114", "#1116", "#1115", "#1119", "nbsp", "#1038", "#1118", /* 0x9c - 0xa2 */
     302                 :         "#1032", "curren", "#1168", "brvbar", "sect", "#1025", "copy", /* 0xa3 - 0xa9 */
     303                 :         "#1028", "laquo", "not", "shy", "reg", "#1031", "deg", "plusmn", /* 0xaa - 0xb1 (eight entries on this row) */
     304                 :         "#1030", "#1110", "#1169", "micro", "para", "middot", "#1105", /* 0xb2 - 0xb8 */
     305                 :         "#8470", "#1108", "raquo", "#1112", "#1029", "#1109", "#1111", /* 0xb9 - 0xbf */
     306                 :         "#1040", "#1041", "#1042", "#1043", "#1044", "#1045", "#1046", /* 0xc0 onwards: consecutive code points 1040..1103 */
     307                 :         "#1047", "#1048", "#1049", "#1050", "#1051", "#1052", "#1053",
     308                 :         "#1054", "#1055", "#1056", "#1057", "#1058", "#1059", "#1060",
     309                 :         "#1061", "#1062", "#1063", "#1064", "#1065", "#1066", "#1067",
     310                 :         "#1068", "#1069", "#1070", "#1071", "#1072", "#1073", "#1074",
     311                 :         "#1075", "#1076", "#1077", "#1078", "#1079", "#1080", "#1081",
     312                 :         "#1082", "#1083", "#1084", "#1085", "#1086", "#1087", "#1088",
     313                 :         "#1089", "#1090", "#1091", "#1092", "#1093", "#1094", "#1095",
     314                 :         "#1096", "#1097", "#1098", "#1099", "#1100", "#1101", "#1102",
     315                 :         "#1103"
     316                 : };
     317                 : 
     318                 : static entity_table_t ent_iso_8859_5[] = { /* ISO-8859-5 bytes 0xc0..0xff (the range entity_map pairs with this table) */
     319                 :         "#1056", "#1057", "#1058", "#1059", "#1060", "#1061", "#1062", /* 0xc0 - 0xc6 */
     320                 :         "#1063", "#1064", "#1065", "#1066", "#1067", "#1068", "#1069",
     321                 :         "#1070", "#1071", "#1072", "#1073", "#1074", "#1075", "#1076",
     322                 :         "#1077", "#1078", "#1079", "#1080", "#1081", "#1082", "#1083",
     323                 :         "#1084", "#1085", "#1086", "#1087", "#1088", "#1089", "#1090",
     324                 :         "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097",
     325                 :         "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#8470", /* 0xea - 0xf0; 0xf0 is the numero sign U+2116 (8470), so the consecutive Cyrillic run breaks here */
     326                 :         "#1105", "#1106", "#1107", "#1108", "#1109", "#1110", "#1111", /* 0xf1 - 0xf7 */
     327                 :         "#1112", "#1113", "#1114", "#1115", "#1116", "sect", "#1118", /* 0xf8 - 0xfe; 0xfd is the section sign U+00A7, not code point 1117 */
     328                 :         "#1119" /* 0xff */
     329                 : };
     330                 : 
     331                 : static entity_table_t ent_cp_866[] = { /* CP866 bytes 0xc0..0xff, all given as numeric character references */
     332                 : /* 0xc0 - 0xdf: code points in the 9472..9616 range */
     333                 :         "#9492", "#9524", "#9516", "#9500", "#9472", "#9532", "#9566", 
     334                 :         "#9567", "#9562", "#9556", "#9577", "#9574", "#9568", "#9552", 
     335                 :         "#9580", "#9575", "#9576", "#9572", "#9573", "#9561", "#9560", 
     336                 :         "#9554", "#9555", "#9579", "#9578", "#9496", "#9484", "#9608", 
     337                 :         "#9604", "#9612", "#9616", "#9600", "#1088", "#1089", "#1090", /* 1088.. starts at 0xe0 */
     338                 :         "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097", 
     339                 :         "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#1025", /* 1025 is at 0xf0 */
     340                 :         "#1105", "#1028", "#1108", "#1031", "#1111", "#1038", "#1118", 
     341                 :         "#176", "#8729", "#183", "#8730", "#8470", "#164",  "#9632", /* 0xf8 - 0xfe */
     342                 :         "#160" /* 0xff */
     343                 : };
     344                 : 
     345                 : /* MacRoman has a couple of low-ASCII chars that need mapping too, so this table starts at 0x0b rather than 0x80. */
     346                 : /* Vertical tab (ASCII 11) is often used to store line breaks inside DB exports; */
     347                 : /* this mapping gives it the name "sp" (a space). NOTE(review): "sp" is not an HTML 4 entity name - confirm it is intended. */
     348                 : static entity_table_t ent_macroman[] = {
     349                 :         "sp", NULL, NULL, NULL, /* 0x0b - 0x0e */
     350                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     351                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     352                 :         NULL, NULL, NULL, NULL, NULL, "quot", NULL, /* "quot" is at 0x22 */
     353                 :         NULL, NULL, "amp", NULL, NULL, NULL, NULL, /* "amp" is at 0x26 */
     354                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     355                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     356                 :         NULL, NULL, NULL, "lt", NULL, "gt", NULL, /* "lt" is at 0x3c, "gt" at 0x3e */
     357                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     358                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     359                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     360                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     361                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     362                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     363                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     364                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     365                 :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     366                 :         NULL, "Auml", "Aring", "Ccedil", "Eacute", "Ntilde", "Ouml", /* leading NULL is 0x7f; "Auml" is at 0x80 */
     367                 :         "Uuml", "aacute", "agrave", "acirc", "auml", "atilde", "aring",
     368                 :         "ccedil", "eacute", "egrave", "ecirc", "euml", "iacute", "igrave",
     369                 :         "icirc", "iuml", "ntilde", "oacute", "ograve", "ocirc", "ouml",
     370                 :         "otilde", "uacute", "ugrave", "ucirc", "uuml", "dagger", "deg", /* "dagger" is at 0xa0 */
     371                 :         "cent", "pound", "sect", "bull", "para", "szlig", "reg",
     372                 :         "copy", "trade", "acute", "uml", "ne", "AElig", "Oslash",
     373                 :         "infin", "plusmn", "le", "ge", "yen", "micro", "part",
     374                 :         "sum", "prod", "pi", "int", "ordf", "ordm", "Omega",
     375                 :         "aelig", "oslash", "iquest", "iexcl", "not", "radic", "fnof", /* "iquest" is at 0xc0 */
     376                 :         "asymp", "#8710", "laquo", "raquo", "hellip", "nbsp", "Agrave",
     377                 :         "Atilde", "Otilde", "OElig", "oelig", "ndash", "mdash", "ldquo",
     378                 :         "rdquo", "lsquo", "rsquo", "divide", "loz", "yuml", "Yuml",
     379                 :         "frasl", "euro", "lsaquo", "rsaquo", "#xFB01", "#xFB02", "Dagger", /* "Dagger" is at 0xe0; hex references are used where no name is listed */
     380                 :         "middot", "sbquo", "bdquo", "permil", "Acirc", "Ecirc", "Aacute",
     381                 :         "Euml", "Egrave", "Iacute", "Icirc", "Iuml", "Igrave", "Oacute",
     382                 :         "Ocirc", "#xF8FF", "Ograve", "Uacute", "Ucirc", "Ugrave", "#305",
     383                 :         "circ", "tilde", "macr", "#728", "#729", "#730", "cedil",
     384                 :         "#733", "#731", "#711" /* 0xfd - 0xff */
     385                 : };
     386                 : 
     387                 : struct html_entity_map { /* ties one contiguous run of character codes in one charset to its table of entity names */
     388                 :         enum entity_charset charset;    /* charset identifier */
     389                 :         unsigned int basechar;                  /* char code at start of table */
     390                 :         unsigned int endchar;                   /* last char code in the table */
     391                 :         entity_table_t *table;                  /* the table of mappings */
     392                 : };
     393                 : 
     394                 : static const struct html_entity_map entity_map[] = { /* rows are { charset, basechar, endchar, table }; a charset may have several rows */
     395                 :         { cs_cp1252,            0x80, 0x9f, ent_cp_1252 },
     396                 :         { cs_cp1252,            0xa0, 0xff, ent_iso_8859_1 }, /* upper half of cp1252 reuses the ISO-8859-1 table */
     397                 :         { cs_8859_1,            0xa0, 0xff, ent_iso_8859_1 },
     398                 :         { cs_8859_15,           0xa0, 0xff, ent_iso_8859_15 },
     399                 :         { cs_utf_8,             0xa0, 0xff, ent_iso_8859_1 }, /* for cs_utf_8 the ranges are given as code point values */
     400                 :         { cs_utf_8,             338,  402,  ent_uni_338_402 },
     401                 :         { cs_utf_8,             710,  732,  ent_uni_spacing },
     402                 :         { cs_utf_8,             913,  982,  ent_uni_greek },
     403                 :         { cs_utf_8,             8194, 8260, ent_uni_punct },
     404                 :         { cs_utf_8,             8364, 8364, ent_uni_euro }, 
     405                 :         { cs_utf_8,             8465, 8501, ent_uni_8465_8501 },
     406                 :         { cs_utf_8,             8592, 9002, ent_uni_8592_9002 },
     407                 :         { cs_utf_8,             9674, 9674, ent_uni_9674 },
     408                 :         { cs_utf_8,             9824, 9830, ent_uni_9824_9830 },
     409                 :         { cs_big5,                      0xa0, 0xff, ent_iso_8859_1 }, /* NOTE(review): the multibyte charsets below reuse the ISO-8859-1 table - how lead/trail bytes avoid it is not visible here */
     410                 :         { cs_gb2312,            0xa0, 0xff, ent_iso_8859_1 },
     411                 :         { cs_big5hkscs,         0xa0, 0xff, ent_iso_8859_1 },
     412                 :         { cs_sjis,                      0xa0, 0xff, ent_iso_8859_1 },
     413                 :         { cs_eucjp,                     0xa0, 0xff, ent_iso_8859_1 },
     414                 :         { cs_koi8r,                 0xa3, 0xff, ent_koi8r },
     415                 :         { cs_cp1251,            0x80, 0xff, ent_cp_1251 },
     416                 :         { cs_8859_5,            0xc0, 0xff, ent_iso_8859_5 }, /* NOTE(review): range starts at 0xc0, so lower codes of this charset have no row - confirm intended */
     417                 :         { cs_cp866,                 0xc0, 0xff, ent_cp_866 },
     418                 :         { cs_macroman,          0x0b, 0xff, ent_macroman },
     419                 :         { cs_terminator } /* sentinel row: charset is cs_terminator, remaining members are zero-initialised */
     420                 : };
     421                 : 
     422                 : static const struct { /* name-to-id table searched by determine_charset(); several spellings map to the same id */
     423                 :         const char *codeset; /* charset name; determine_charset() matches it case-insensitively and requires equal length */
     424                 :         enum entity_charset charset; /* id used for that name */
     425                 : } charset_map[] = {
     426                 :         { "ISO-8859-1",       cs_8859_1 },
     427                 :         { "ISO8859-1",                cs_8859_1 },
     428                 :         { "ISO-8859-15",      cs_8859_15 },
     429                 :         { "ISO8859-15",       cs_8859_15 },
     430                 :         { "utf-8",                    cs_utf_8 },
     431                 :         { "cp1252",           cs_cp1252 },
     432                 :         { "Windows-1252",     cs_cp1252 },
     433                 :         { "1252",           cs_cp1252 }, /* bare numbers are presumably Windows code page ids - confirm */
     434                 :         { "BIG5",                     cs_big5 },
     435                 :         { "950",            cs_big5 },
     436                 :         { "GB2312",                   cs_gb2312 },
     437                 :         { "936",            cs_gb2312 },
     438                 :         { "BIG5-HKSCS",               cs_big5hkscs },
     439                 :         { "Shift_JIS",                cs_sjis },
     440                 :         { "SJIS",             cs_sjis },
     441                 :         { "932",            cs_sjis },
     442                 :         { "EUCJP",            cs_eucjp },
     443                 :         { "EUC-JP",                   cs_eucjp },
     444                 :         { "KOI8-R",         cs_koi8r },
     445                 :         { "koi8-ru",        cs_koi8r }, /* NOTE(review): KOI8-RU is handled as KOI8-R here */
     446                 :         { "koi8r",          cs_koi8r },
     447                 :         { "cp1251",         cs_cp1251 },
     448                 :         { "Windows-1251",   cs_cp1251 },
     449                 :         { "win-1251",       cs_cp1251 },
     450                 :         { "iso8859-5",      cs_8859_5 },
     451                 :         { "iso-8859-5",     cs_8859_5 },
     452                 :         { "cp866",          cs_cp866 },
     453                 :         { "866",            cs_cp866 },    
     454                 :         { "ibm866",         cs_cp866 },
     455                 :         { "MacRoman",       cs_macroman },
     456                 :         { NULL } /* sentinel: the lookup loop stops at a NULL codeset */
     457                 : };
     458                 : 
     459                 : static const struct { /* entity strings for the double quote, single quote, < and > characters; there is no row for & in this table */
     460                 :         unsigned short charcode; /* character the entity stands for */
     461                 :         char *entity; /* entity text including the leading & and trailing ; - NOTE(review): initialised from string literals, could be const char * */
     462                 :         int entitylen; /* length of entity in bytes, without the terminating NUL */
     463                 :         int flags; /* ENT_HTML_QUOTE_* bit attached to the row, 0 for none; presumably the row is used only when that bit is requested - confirm against the users of this table */
     464                 : } basic_entities[] = {
     465                 :         { '"',     "&quot;", 6,      ENT_HTML_QUOTE_DOUBLE },
     466                 :         { '\'', "&#039;", 6,      ENT_HTML_QUOTE_SINGLE },
     467                 :         { '\'', "&#39;",  5,      ENT_HTML_QUOTE_SINGLE }, /* second spelling of the same character as the row above */
     468                 :         { '<',       "&lt;",           4,      0 },
     469                 :         { '>',       "&gt;",           4,      0 },
     470                 :         { 0, NULL, 0, 0 } /* sentinel row: NULL entity */
     471                 : };
     472                 :         
     473                 : struct basic_entities_dec { /* charcode/entity/length triple with inline storage; presumably the decode-side counterpart of basic_entities - its users are outside this excerpt */
     474                 :         unsigned short charcode; /* character value */
     475                 :         char entity[8]; /* entity text held in the struct itself; 8 bytes is enough for the 6-character entities of basic_entities plus a NUL */
     476                 :         int entitylen; /* presumably the number of bytes used in entity - confirm */
     477                 : };
     478                 : /* MB_RETURN: normal exit of get_next_char(); relies on that function's locals (pos, mbpos, mbseq, this_char) and out-parameters. */
     479                 : #define MB_RETURN { \
     480                 :                         *newpos = pos;       \
     481                 :                         mbseq[mbpos] = '\0'; /* mbseq is always NUL-terminated on this path; MB_WRITE keeps one slot free for it */ \
     482                 :                         *mbseqlen = mbpos;   \
     483                 :                         return this_char; }
     484                 : /* MB_WRITE: append one byte to mbseq. When the capacity counter reaches zero the byte is not stored and get_next_char() returns immediately through MB_RETURN. */
     485                 : #define MB_WRITE(mbchar) { \
     486                 :                         mbspace--;  \
     487                 :                         if (mbspace == 0) { /* NOTE(review): this early return uses pos and this_char as they are at the call site; the UTF-8 branch calls MB_WRITE before advancing pos */ \
     488                 :                                 MB_RETURN;           \
     489                 :                         }                        \
     490                 :                         mbseq[mbpos++] = (mbchar); }
     491                 : 
     492                 : /* skip one byte and return: sets *status to FAILURE, stores pos + 1 in *newpos and returns 0. NOTE(review): the double-byte branches of get_next_char() pass a pos that was already advanced past the bytes they read, so more than one byte is skipped there. */
     493                 : #define MB_FAILURE(pos) do { \
     494                 :         *newpos = pos + 1; \
     495                 :         *status = FAILURE; \
     496                 :         return 0; \
     497                 : } while (0)
     498                 : /* CHECK_LEN: make get_next_char() fail (FAILURE status, return value 0) when fewer than chars_need bytes remain in str from pos. NOTE(review): expands to a bare if/else rather than do { } while (0), and chars_need is not parenthesised. */
     499                 : #define CHECK_LEN(pos, chars_need)                      \
     500                 :         if (chars_need < 1) { /* NOTE(review): every call visible in get_next_char() passes 1..4, so this arm looks unreachable */ \
     501                 :                 if((str_len - (pos)) < chars_need) { \
     502                 :                         *newpos = pos;                                          \
     503                 :                         *status = FAILURE;                                      \
     504                 :                         return 0;                                                       \
     505                 :                 }                                                                               \
     506                 :         } else {                                                                        \
     507                 :                 if((str_len - (pos)) < chars_need) { \
     508                 :                         *newpos = pos + 1; /* resume one byte after pos */ \
     509                 :                         *status = FAILURE;                                      \
     510                 :                         return 0;                                                       \
     511                 :                 }                                                                               \
     512                 :         }
     513                 : 
     514                 : /* {{{ get_next_char
     515                 :  * Decodes one character of str, interpreted in charset, starting at offset *newpos. Returns the decoded value: the code point for UTF-8, the bytes packed big-endian for the double-byte charsets, the byte itself otherwise. Returns 0 with *status == FAILURE on malformed or truncated input. */
     516                 : inline static unsigned int get_next_char(enum entity_charset charset,
     517                 :                 unsigned char * str, /* input bytes; only read here */
     518                 :                 int str_len, /* number of readable bytes in str */
     519                 :                 int * newpos, /* in: offset to decode at; out: offset at which the caller should continue */
     520                 :                 unsigned char * mbseq, /* out: raw bytes of the decoded character, NUL-terminated */
     521                 :                 int * mbseqlen, /* in: capacity of mbseq in bytes; out: number of bytes stored, without the NUL (the failure macros do not update it) */
     522                 :                 int *status) /* out: SUCCESS, or FAILURE when the input could not be decoded */
     523           38410 : {
     524           38410 :         int pos = *newpos;
     525           38410 :         int mbpos = 0; /* next free index in mbseq */
     526           38410 :         int mbspace = *mbseqlen; /* remaining capacity of mbseq; MB_WRITE counts it down */
     527           38410 :         unsigned int this_char = 0;
     528                 :         unsigned char next_char;
     529                 : 
     530           38410 :         *status = SUCCESS;
     531                 : 
     532           38410 :         if (mbspace <= 0) { /* no room for a byte sequence: return the single raw byte at pos without decoding, whatever the charset */
     533               0 :                 *mbseqlen = 0;
     534               0 :                 CHECK_LEN(pos, 1);
     535               0 :                 *newpos = pos + 1;
     536               0 :                 return str[pos];
     537                 :         }
     538                 : 
     539           38410 :         switch (charset) {
     540                 :                 case cs_utf_8:
     541                 :                         {
     542                 :                                 unsigned char c;
     543           34032 :                                 CHECK_LEN(pos, 1);
     544           34032 :                                 c = str[pos];
     545           34032 :                                 if (c < 0x80) { /* ASCII: one byte */
     546           26040 :                                         MB_WRITE(c);
     547           26040 :                                         this_char = c;
     548           26040 :                                         pos++;
     549            7992 :                                 } else if (c < 0xc0) { /* 0x80..0xbf: continuation byte without a lead byte */
     550              24 :                                         MB_FAILURE(pos);
     551            7968 :                                 } else if (c < 0xe0) { /* 0xc0..0xdf: lead byte of a two-byte sequence */
     552            7871 :                                         CHECK_LEN(pos, 2);
     553            7863 :                                         if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
     554               4 :                                                 MB_FAILURE(pos);
     555                 :                                         }
     556            7859 :                                         this_char = ((c & 0x1f) << 6) | (str[pos + 1] & 0x3f);
     557            7859 :                                         if (this_char < 0x80) { /* overlong: the value would fit in one byte */
     558               0 :                                                 MB_FAILURE(pos);
     559                 :                                         }
     560            7859 :                                         MB_WRITE((unsigned char)c);
     561            7859 :                                         MB_WRITE((unsigned char)str[pos + 1]);
     562            7859 :                                         pos += 2;
     563              97 :                                 } else if (c < 0xf0) { /* 0xe0..0xef: lead byte of a three-byte sequence */
     564              41 :                                         CHECK_LEN(pos, 3);
     565              17 :                                         if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
     566               0 :                                                 MB_FAILURE(pos);
     567                 :                                         }
     568              17 :                                         if (str[pos + 2] < 0x80 || str[pos + 2] > 0xbf) {
     569               0 :                                                 MB_FAILURE(pos);
     570                 :                                         }
     571              17 :                                         this_char = ((c & 0x0f) << 12) | ((str[pos + 1] & 0x3f) << 6) | (str[pos + 2] & 0x3f);
     572              17 :                                         if (this_char < 0x800) { /* overlong; NOTE(review): the surrogate range 0xd800..0xdfff is not rejected here */
     573               4 :                                                 MB_FAILURE(pos);
     574                 :                                         }
     575              13 :                                         MB_WRITE((unsigned char)c);
     576              13 :                                         MB_WRITE((unsigned char)str[pos + 1]);
     577              13 :                                         MB_WRITE((unsigned char)str[pos + 2]);
     578              13 :                                         pos += 3;
     579              56 :                                 } else if (c < 0xf8) { /* 0xf0..0xf7: lead byte of a four-byte sequence */
     580              16 :                                         CHECK_LEN(pos, 4);
     581               4 :                                         if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
     582               0 :                                                 MB_FAILURE(pos);
     583                 :                                         }
     584               4 :                                         if (str[pos + 2] < 0x80 || str[pos + 2] > 0xbf) {
     585               0 :                                                 MB_FAILURE(pos);
     586                 :                                         }
     587               4 :                                         if (str[pos + 3] < 0x80 || str[pos + 3] > 0xbf) {
     588               0 :                                                 MB_FAILURE(pos);
     589                 :                                         }
     590               4 :                                         this_char = ((c & 0x07) << 18) | ((str[pos + 1] & 0x3f) << 12) | ((str[pos + 2] & 0x3f) << 6) | (str[pos + 3] & 0x3f);
     591               4 :                                         if (this_char < 0x10000) { /* overlong; NOTE(review): there is no upper bound check, so values above 0x10ffff are accepted */
     592               0 :                                                 MB_FAILURE(pos);
     593                 :                                         }
     594               4 :                                         MB_WRITE((unsigned char)c);
     595               4 :                                         MB_WRITE((unsigned char)str[pos + 1]);
     596               4 :                                         MB_WRITE((unsigned char)str[pos + 2]);
     597               4 :                                         MB_WRITE((unsigned char)str[pos + 3]);
     598               4 :                                         pos += 4;
     599                 :                                 } else { /* 0xf8..0xff: rejected as a lead byte */
     600              40 :                                         MB_FAILURE(pos);
     601                 :                                 }
     602                 :                         }
     603           33916 :                         break;
     604                 :                 case cs_big5:
     605                 :                 case cs_gb2312:
     606                 :                 case cs_big5hkscs:
     607                 :                         {
     608               0 :                                 CHECK_LEN(pos, 1);
     609               0 :                                 this_char = str[pos++];
     610                 :                                 /* check if this is the first of a 2-byte sequence */
     611               0 :                                 if (this_char >= 0x81 && this_char <= 0xfe) {
     612                 :                                         /* read the next char (pos is advanced, so this is more than a peek) */
     613               0 :                                         CHECK_LEN(pos, 1); /* NOTE(review): pos is already past the lead byte, so on truncation *newpos becomes pos + 1, one past the end of str */
     614               0 :                                         next_char = str[pos++];
     615               0 :                                         if ((next_char >= 0x40 && next_char <= 0x7e) ||
     616                 :                                                         (next_char >= 0xa1 && next_char <= 0xfe)) {
     617                 :                                                 /* yes, this is a wide char */
     618               0 :                                                 MB_WRITE(this_char);
     619               0 :                                                 MB_WRITE(next_char);
     620               0 :                                                 this_char = (this_char << 8) | next_char; /* lead byte in the high 8 bits, trail byte in the low 8 bits */
     621                 :                                         } else {
     622               0 :                                                 MB_FAILURE(pos); /* NOTE(review): pos is already past both bytes, so *newpos ends up three bytes after the lead byte */
     623                 :                                         }
     624                 :                                 } else {
     625               0 :                                         MB_WRITE(this_char);
     626                 :                                 }
     627                 :                         }
     628               0 :                         break;
     629                 :                 case cs_sjis:
     630                 :                         {
     631               4 :                                 CHECK_LEN(pos, 1);
     632               4 :                                 this_char = str[pos++];
     633                 :                                 /* check if this is the first of a 2-byte sequence */
     634               4 :                                 if ((this_char >= 0x81 && this_char <= 0x9f) ||
     635                 :                                         (this_char >= 0xe0 && this_char <= 0xfc)) {
     636                 :                                         /* read the next char (pos is advanced, so this is more than a peek) */
     637               0 :                                         CHECK_LEN(pos, 1);
     638               0 :                                         next_char = str[pos++];
     639               0 :                                         if ((next_char >= 0x40 && next_char <= 0x7e) ||
     640                 :                                                 (next_char >= 0x80 && next_char <= 0xfc))
     641                 :                                         {
     642                 :                                                 /* yes, this is a wide char */
     643               0 :                                                 MB_WRITE(this_char);
     644               0 :                                                 MB_WRITE(next_char);
     645               0 :                                                 this_char = (this_char << 8) | next_char;
     646                 :                                         } else {
     647               0 :                                                 MB_FAILURE(pos); /* NOTE(review): same over-skip as in the big5 branch, pos was already advanced twice */
     648                 :                                         }
     649                 :                                 } else {
     650               4 :                                         MB_WRITE(this_char);
     651                 :                                 }
     652               4 :                                 break;
     653                 :                         }
     654                 :                 case cs_eucjp:
     655                 :                         {
     656               0 :                                 CHECK_LEN(pos, 1);
     657               0 :                                 this_char = str[pos++];
     658                 :                                 /* check if this is the first of a multi-byte sequence */
     659               0 :                                 if (this_char >= 0xa1 && this_char <= 0xfe) {
     660                 :                                         /* read the next char (pos is advanced, so this is more than a peek) */
     661               0 :                                         CHECK_LEN(pos, 1);
     662               0 :                                         next_char = str[pos++];
     663               0 :                                         if (next_char >= 0xa1 && next_char <= 0xfe) {
     664                 :                                                 /* yes, this is a JIS kanji char */
     665               0 :                                                 MB_WRITE(this_char);
     666               0 :                                                 MB_WRITE(next_char);
     667               0 :                                                 this_char = (this_char << 8) | next_char;
     668                 :                                         } else {
     669               0 :                                                 MB_FAILURE(pos); /* NOTE(review): pos was already advanced twice here as well */
     670                 :                                         }
     671               0 :                                 } else if (this_char == 0x8e) { /* 0x8e introduces a two-byte sequence */
     672                 :                                         /* read the next char (pos is advanced, so this is more than a peek) */
     673               0 :                                         CHECK_LEN(pos, 1);
     674               0 :                                         next_char = str[pos++];
     675               0 :                                         if (next_char >= 0xa1 && next_char <= 0xdf) {
     676                 :                                                 /* JIS X 0201 kana */
     677               0 :                                                 MB_WRITE(this_char);
     678               0 :                                                 MB_WRITE(next_char);
     679               0 :                                                 this_char = (this_char << 8) | next_char;
     680                 :                                         } else {
     681               0 :                                                 MB_FAILURE(pos);
     682                 :                                         }
     683               0 :                                 } else if (this_char == 0x8f) { /* 0x8f introduces a three-byte sequence */
     684                 :                                         /* read the next two chars */
     685                 :                                         unsigned char next2_char;
     686               0 :                                         CHECK_LEN(pos, 2);
     687               0 :                                         next_char = str[pos];
     688               0 :                                         next2_char = str[pos + 1];
     689               0 :                                         pos += 2;
     690               0 :                                         if ((next_char >= 0xa1 && next_char <= 0xfe) &&
     691                 :                                                 (next2_char >= 0xa1 && next2_char <= 0xfe)) {
     692                 :                                                 /* JIS X 0212 hojo-kanji */
     693               0 :                                                 MB_WRITE(this_char);
     694               0 :                                                 MB_WRITE(next_char);
     695               0 :                                                 MB_WRITE(next2_char);
     696               0 :                                                 this_char = (this_char << 16) | (next_char << 8) | next2_char; /* three bytes packed, lead byte highest */
     697                 :                                         } else {
     698               0 :                                                 MB_FAILURE(pos); /* NOTE(review): pos is past all three bytes at this point */
     699                 :                                         }
     700                 :                                 } else {
     701               0 :                                         MB_WRITE(this_char);
     702                 :                                 }
     703               0 :                                 break;
     704                 :                         }
     705                 :                 default:
     706                 :                         /* single-byte charsets */
     707            4374 :                         CHECK_LEN(pos, 1);
     708            4374 :                         this_char = str[pos++];
     709            4374 :                         MB_WRITE(this_char);
     710                 :                         break;
     711                 :         }
     712           38294 :         MB_RETURN; /* success: stores pos and the sequence length, NUL-terminates mbseq and returns this_char */
     713                 : }
     714                 : /* }}} */
     715                 : 
     716                 : /* {{{ entity_charset determine_charset
     717                 :  * returns the charset identifier based on a hint or, when the hint is an empty string, on (in this order, as far as compiled in) the mbstring internal encoding, SG(default_charset), nl_langinfo(CODESET) and the LC_CTYPE locale name.
     718                 :  * defaults to iso-8859-1: a NULL hint returns cs_8859_1 at once, and a name that is not in charset_map raises an E_WARNING and also yields cs_8859_1 */
     719                 : static enum entity_charset determine_charset(char *charset_hint TSRMLS_DC)
     720            6089 : {
     721                 :         int i;
     722            6089 :         enum entity_charset charset = cs_8859_1; /* result when nothing below matches */
     723            6089 :         int len = 0; /* number of bytes of charset_hint that form the name; NOTE(review): strlen() results are narrowed to int */
     724            6089 :         zval *uf_result = NULL; /* only used by the call_user_function_ex() path below */
     725                 : 
     726                 :         /* Guarantee default behaviour for backwards compatibility */
     727            6089 :         if (charset_hint == NULL)
     728             533 :                 return cs_8859_1;
     729                 : 
     730            5556 :         if ((len = strlen(charset_hint)) != 0) { /* a non-empty hint is looked up directly; only an empty string reaches the detection below */
     731            5555 :                 goto det_charset;
     732                 :         }
     733                 : #if HAVE_MBSTRING
     734                 : #if !defined(COMPILE_DL_MBSTRING)
     735                 :         /* XXX: Ugly things. Why don't we look for a more sophisticated way? */
     736               1 :         switch (MBSTRG(current_internal_encoding)) { /* map the mbstring internal encoding straight to a charset id */
     737                 :                 case mbfl_no_encoding_8859_1:
     738               0 :                         return cs_8859_1;
     739                 : 
     740                 :                 case mbfl_no_encoding_utf8:
     741               0 :                         return cs_utf_8;
     742                 : 
     743                 :                 case mbfl_no_encoding_euc_jp:
     744                 :                 case mbfl_no_encoding_eucjp_win:
     745               0 :                         return cs_eucjp;
     746                 : 
     747                 :                 case mbfl_no_encoding_sjis:
     748                 :                 case mbfl_no_encoding_sjis_win:
     749                 :                 case mbfl_no_encoding_sjis_mac:
     750               0 :                         return cs_sjis;
     751                 : 
     752                 :                 case mbfl_no_encoding_cp1252:
     753               0 :                         return cs_cp1252;
     754                 : 
     755                 :                 case mbfl_no_encoding_8859_15:
     756               0 :                         return cs_8859_15;
     757                 : 
     758                 :                 case mbfl_no_encoding_big5:
     759               0 :                         return cs_big5;
     760                 : 
     761                 :                 case mbfl_no_encoding_euc_cn:
     762                 :                 case mbfl_no_encoding_hz:
     763                 :                 case mbfl_no_encoding_cp936:
     764               0 :                         return cs_gb2312;
     765                 : 
     766                 :                 case mbfl_no_encoding_koi8r:
     767               0 :                         return cs_koi8r;
     768                 : 
     769                 :                 case mbfl_no_encoding_cp866:
     770               0 :                         return cs_cp866;
     771                 : 
     772                 :                 case mbfl_no_encoding_cp1251:
     773               1 :                         return cs_cp1251;
     774                 : 
     775                 :                 case mbfl_no_encoding_8859_5:
     776               0 :                         return cs_8859_5;
     777                 : 
     778                 :                 default: /* any other encoding: continue with the fallbacks after the #endif lines */
     779                 :                         ;
     780                 :         }
     781                 : #else /* COMPILE_DL_MBSTRING is defined: call mb_internal_encoding() by name instead of reading MBSTRG() */
     782                 :         {
     783                 :                 zval nm_mb_internal_encoding;
     784                 : 
     785                 :                 ZVAL_STRING(&nm_mb_internal_encoding, "mb_internal_encoding", 0);
     786                 : 
     787                 :                 if (call_user_function_ex(CG(function_table), NULL, &nm_mb_internal_encoding, &uf_result, 0, NULL, 1, NULL TSRMLS_CC) != FAILURE) {
     788                 : 
     789                 :                         charset_hint = Z_STRVAL_P(uf_result); /* assumes uf_result is non-NULL and holds a string on success - TODO confirm */
     790                 :                         len = Z_STRLEN_P(uf_result);
     791                 :                         
     792                 :                         if (len == 4) { /* sizeof(none|auto|pass)-1 */
     793                 :                                 if (!memcmp("pass", charset_hint, sizeof("pass") - 1) || 
     794                 :                                     !memcmp("auto", charset_hint, sizeof("auto") - 1) || 
     795                 :                                     !memcmp("none", charset_hint, sizeof("none") - 1)) {
     796                 :                                         
     797                 :                                         charset_hint = NULL; /* with a NULL hint the lookup at det_charset is skipped and cs_8859_1 is returned */
     798                 :                                         len = 0;
     799                 :                                 }
     800                 :                         }
     801                 :                         goto det_charset;
     802                 :                 }
     803                 :         }
     804                 : #endif /* !defined(COMPILE_DL_MBSTRING) */
     805                 : #endif /* HAVE_MBSTRING */
     806                 : 
     807               0 :         charset_hint = SG(default_charset);
     808               0 :         if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) {
     809               0 :                 goto det_charset;
     810                 :         }
     811                 : 
     812                 :         /* try to detect the charset for the locale */
     813                 : #if HAVE_NL_LANGINFO && HAVE_LOCALE_H && defined(CODESET)
     814               0 :         charset_hint = nl_langinfo(CODESET);
     815               0 :         if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) {
     816               0 :                 goto det_charset;
     817                 :         }
     818                 : #endif
     819                 : 
     820                 : #if HAVE_LOCALE_H
     821                 :         /* try to figure out the charset from the locale */
     822                 :         {
     823                 :                 char *localename;
     824                 :                 char *dot, *at;
     825                 : 
     826                 :                 /* lang[_territory][.codeset][@modifier] */
     827               0 :                 localename = setlocale(LC_CTYPE, NULL); /* query only: a NULL locale argument leaves the locale unchanged */
     828                 : 
     829               0 :                 dot = strchr(localename, '.');
     830               0 :                 if (dot) {
     831               0 :                         dot++;
     832                 :                         /* locale specifies a codeset */
     833               0 :                         at = strchr(dot, '@');
     834               0 :                         if (at)
     835               0 :                                 len = at - dot; /* the codeset name ends at the @; the string itself is not cut there */
     836                 :                         else
     837               0 :                                 len = strlen(dot);
     838               0 :                         charset_hint = dot;
     839                 :                 } else {
     840                 :                         /* no explicit name; see if the name itself
     841                 :                          * is the charset */
     842               0 :                         charset_hint = localename;
     843               0 :                         len = strlen(charset_hint);
     844                 :                 }
     845                 :         }
     846                 : #endif
     847                 : 
     848            5555 : det_charset:
     849                 : 
     850            5555 :         if (charset_hint) {
     851            5555 :                 int found = 0;
     852                 :                 
     853                 :                 /* now walk the charset map and look for the codeset */
     854           27979 :                 for (i = 0; charset_map[i].codeset; i++) {
     855           27973 :                         if (len == strlen(charset_map[i].codeset) && strncasecmp(charset_hint, charset_map[i].codeset, len) == 0) { /* equal length plus case-insensitive match of the first len bytes; NOTE(review): int is compared with size_t */
     856            5549 :                                 charset = charset_map[i].charset;
     857            5549 :                                 found = 1;
     858            5549 :                                 break;
     859                 :                         }
     860                 :                 }
     861            5555 :                 if (!found) {
     862               6 :                         php_error_docref(NULL TSRMLS_CC, E_WARNING, "charset `%s' not supported, assuming iso-8859-1",
     863                 :                                         charset_hint); /* NOTE(review): %s prints up to the NUL, so a locale-derived hint is shown with its @modifier even though only len bytes were compared */
     864                 :                 }
     865                 :         }
     866            5555 :         if (uf_result != NULL) { /* released only now because charset_hint may point into its string buffer */
     867               0 :                 zval_ptr_dtor(&uf_result);
     868                 :         }
     869            5555 :         return charset;
     870                 : }
     871                 : /* }}} */
     872                 : 
     873                 : /* {{{ php_utf32_utf8 */
     874                 : size_t php_utf32_utf8(unsigned char *buf, int k)
     875            5526 : {
     876            5526 :         size_t retval = 0;
     877                 : 
     878            5526 :         if (k < 0x80) {
     879               0 :                 buf[0] = k;
     880               0 :                 retval = 1;
     881            5526 :         } else if (k < 0x800) {
     882            2808 :                 buf[0] = 0xc0 | (k >> 6);
     883            2808 :                 buf[1] = 0x80 | (k & 0x3f);
     884            2808 :                 retval = 2;
     885            2718 :         } else if (k < 0x10000) {
     886            2718 :                 buf[0] = 0xe0 | (k >> 12);
     887            2718 :                 buf[1] = 0x80 | ((k >> 6) & 0x3f);
     888            2718 :                 buf[2] = 0x80 | (k & 0x3f);
     889            2718 :                 retval = 3;
     890               0 :         } else if (k < 0x200000) {
     891               0 :                 buf[0] = 0xf0 | (k >> 18);
     892               0 :                 buf[1] = 0x80 | ((k >> 12) & 0x3f);
     893               0 :                 buf[2] = 0x80 | ((k >> 6) & 0x3f);
     894               0 :                 buf[3] = 0x80 | (k & 0x3f);
     895               0 :                 retval = 4;
     896               0 :         } else if (k < 0x4000000) {
     897               0 :                 buf[0] = 0xf8 | (k >> 24);
     898               0 :                 buf[1] = 0x80 | ((k >> 18) & 0x3f);
     899               0 :                 buf[2] = 0x80 | ((k >> 12) & 0x3f);
     900               0 :                 buf[3] = 0x80 | ((k >> 6) & 0x3f);
     901               0 :                 buf[4] = 0x80 | (k & 0x3f);
     902               0 :                 retval = 5;
     903                 :         } else {
     904               0 :                 buf[0] = 0xfc | (k >> 30);
     905               0 :                 buf[1] = 0x80 | ((k >> 24) & 0x3f);
     906               0 :                 buf[2] = 0x80 | ((k >> 18) & 0x3f);
     907               0 :                 buf[3] = 0x80 | ((k >> 12) & 0x3f);
     908               0 :                 buf[4] = 0x80 | ((k >> 6) & 0x3f);
     909               0 :                 buf[5] = 0x80 | (k & 0x3f);
     910               0 :                 retval = 6;
     911                 :         }
     912            5526 :         buf[retval] = '\0';
     913                 : 
     914            5526 :         return retval;
     915                 : }
     916                 : /* }}} */
     917                 : 
     918                 : /* {{{ php_unescape_html_entities
     919                 :  */
     920                 : PHPAPI char *php_unescape_html_entities(char *orig, int oldlen, int *newlen, int all, int quote_style, char *hint_charset TSRMLS_DC)
     921              20 : {
     922              20 :         unsigned char *old = (unsigned char*)orig;
     923                 :         int retlen;
     924                 :         int j, k;
     925                 :         char *replaced, *ret, *p, *q, *lim, *next;
     926              20 :         enum entity_charset charset = determine_charset(hint_charset TSRMLS_CC);
     927                 :         unsigned char replacement[15];
     928                 :         int replacement_len;
     929                 : 
     930              20 :         ret = estrndup((char*)old, oldlen);
     931              20 :         retlen = oldlen;
     932              20 :         if (!retlen) {
     933               2 :                 goto empty_source;
     934                 :         }
     935                 :         
     936              18 :         if (all) {
     937                 :                 /* look for a match in the maps for this charset */
     938             450 :                 for (j = 0; entity_map[j].charset != cs_terminator; j++) {
     939             432 :                         if (entity_map[j].charset != charset)
     940             252 :                                 continue;
     941                 : 
     942           14184 :                         for (k = entity_map[j].basechar; k <= entity_map[j].endchar; k++) {
     943                 :                                 char entity[32];
     944           14004 :                                 int entity_length = 0;
     945                 : 
     946           14004 :                                 if (entity_map[j].table[k - entity_map[j].basechar] == NULL)
     947            8478 :                                         continue;
     948                 : 
     949            5526 :                                 entity[0] = '&';
     950            5526 :                                 entity_length = strlen(entity_map[j].table[k - entity_map[j].basechar]);
     951            5526 :                                 strncpy(&entity[1], entity_map[j].table[k - entity_map[j].basechar], sizeof(entity) - 2);
     952            5526 :                                 entity[entity_length+1] = ';';
     953            5526 :                                 entity[entity_length+2] = '\0';
     954            5526 :                                 entity_length += 2;
     955                 : 
     956                 :                                 /* When we have MBCS entities in the tables above, this will need to handle it */
     957            5526 :                                 replacement_len = 0;
     958            5526 :                                 switch (charset) {
     959                 :                                         case cs_8859_1:
     960                 :                                         case cs_cp1252:
     961                 :                                         case cs_8859_15:
     962                 :                                         case cs_cp1251:
     963                 :                                         case cs_8859_5:
     964                 :                                         case cs_cp866:
     965                 :                                         case cs_koi8r:
     966               0 :                                                 replacement[0] = k;
     967               0 :                                                 replacement[1] = '\0';
     968               0 :                                                 replacement_len = 1;
     969               0 :                                                 break;
     970                 : 
     971                 :                                         case cs_big5:
     972                 :                                         case cs_gb2312:
     973                 :                                         case cs_big5hkscs:
     974                 :                                         case cs_sjis:
     975                 :                                         case cs_eucjp:
     976                 :                                                 /* we cannot properly handle those multibyte encodings
     977                 :                                                  * with php_str_to_str. skip it. */ 
     978               0 :                                                 continue;
     979                 : 
     980                 :                                         case cs_utf_8:
     981            5526 :                                                 replacement_len = php_utf32_utf8(replacement, k);
     982            5526 :                                                 break;
     983                 : 
     984                 :                                         default:
     985               0 :                                                 php_error_docref(NULL TSRMLS_CC, E_WARNING, "cannot yet handle MBCS!");
     986               0 :                                                 efree(ret);
     987               0 :                                                 return NULL;
     988                 :                                 }
     989                 : 
     990            5526 :                                 if (php_memnstr(ret, entity, entity_length, ret+retlen)) {
     991              18 :                                         replaced = php_str_to_str(ret, retlen, entity, entity_length, (char*)replacement, replacement_len, &retlen);
     992              18 :                                         efree(ret);
     993              18 :                                         ret = replaced;
     994                 :                                 }
     995                 :                         }
     996                 :                 }
     997                 :         }
     998                 : 
     999             108 :         for (j = 0; basic_entities[j].charcode != 0; j++) {
    1000                 : 
    1001              90 :                 if (basic_entities[j].flags && (quote_style & basic_entities[j].flags) == 0)
    1002              12 :                         continue;
    1003                 :                 
    1004              78 :                 replacement[0] = (unsigned char)basic_entities[j].charcode;
    1005              78 :                 replacement[1] = '\0';
    1006                 : 
    1007              78 :                 if (php_memnstr(ret, basic_entities[j].entity, basic_entities[j].entitylen, ret+retlen)) {              
    1008               0 :                         replaced = php_str_to_str(ret, retlen, basic_entities[j].entity, basic_entities[j].entitylen, (char*)replacement, 1, &retlen);
    1009               0 :                         efree(ret);
    1010               0 :                         ret = replaced;
    1011                 :                 }
    1012                 :         }
    1013                 : 
    1014                 :         /* replace numeric entities & "&amp;" */
    1015              18 :         lim = ret + retlen;
    1016             124 :         for (p = ret, q = ret; p < lim;) {
    1017                 :                 int code;
    1018                 : 
    1019              88 :                 if (p[0] == '&') {
    1020               6 :                         if (p + 2 < lim) {
    1021               6 :                                 if (p[1] == '#') {
    1022               0 :                                         int invalid_code = 0;
    1023                 : 
    1024               0 :                                         if (p[2] == 'x' || p[2] == 'X') {
    1025               0 :                                                 code = strtol(p + 3, &next, 16);
    1026                 :                                         } else {
    1027               0 :                                                 code = strtol(p + 2, &next, 10);
    1028                 :                                         }
    1029                 : 
    1030               0 :                                         if (next != NULL && *next == ';') {
    1031               0 :                                                 switch (charset) {
    1032                 :                                                         case cs_utf_8:
    1033               0 :                                                                 q += php_utf32_utf8((unsigned char*)q, code);
    1034               0 :                                                                 break;
    1035                 : 
    1036                 :                                                         case cs_8859_1:
    1037                 :                                                         case cs_8859_5:
    1038                 :                                                         case cs_8859_15:
    1039               0 :                                                                 if ((code >= 0x80 && code < 0xa0) || code > 0xff) {
    1040               0 :                                                                         invalid_code = 1;
    1041                 :                                                                 } else {
    1042               0 :                                                                         if (code == 39 || !quote_style) {
    1043               0 :                                                                                 invalid_code = 1;
    1044                 :                                                                         } else {
    1045               0 :                                                                                 *(q++) = code;
    1046                 :                                                                         }
    1047                 :                                                                 }
    1048               0 :                                                                 break;
    1049                 : 
    1050                 :                                                         case cs_cp1252:
    1051               0 :                                                                 if (code > 0xff) {
    1052               0 :                                                                         invalid_code = 1;
    1053                 :                                                                 } else {
    1054               0 :                                                                         *(q++) = code;
    1055                 :                                                                 }
    1056               0 :                                                                 break;
    1057                 : 
    1058                 :                                                         case cs_cp1251:
    1059                 :                                                         case cs_cp866:
    1060                 :                                                         case cs_big5:
    1061                 :                                                         case cs_big5hkscs:
    1062                 :                                                         case cs_sjis:
    1063                 :                                                         case cs_eucjp:
    1064               0 :                                                                 if (code >= 0x80) {
    1065               0 :                                                                         invalid_code = 1;
    1066                 :                                                                 } else {
    1067               0 :                                                                         *(q++) = code;
    1068                 :                                                                 }
    1069               0 :                                                                 break;
    1070                 : 
    1071                 :                                                         case cs_gb2312:
    1072               0 :                                                                 if (code >= 0x81) {
    1073               0 :                                                                         invalid_code = 1;
    1074                 :                                                                 } else {
    1075               0 :                                                                         *(q++) = code;
    1076                 :                                                                 }
    1077               0 :                                                                 break;
    1078                 : 
    1079                 :                                                         default:
    1080                 :                                                                 /* for backwards compatilibity */
    1081               0 :                                                                 invalid_code = 1;
    1082                 :                                                                 break;
    1083                 :                                                 }
    1084               0 :                                                 if (invalid_code) {
    1085               0 :                                                         for (; p <= next; p++) {
    1086               0 :                                                                 *(q++) = *p;
    1087                 :                                                         }
    1088                 :                                                 }
    1089               0 :                                                 p = next + 1;
    1090                 :                                         } else {
    1091               0 :                                                 *(q++) = *(p++);        
    1092               0 :                                                 *(q++) = *(p++);        
    1093                 :                                         }
    1094              12 :                                 } else if (p + 4 < lim &&
    1095                 :                                                         p[1] == 'a' && p[2] == 'm' &&p[3] == 'p' &&
    1096                 :                                                         p[4] == ';') {
    1097               6 :                                         *(q++) = '&';
    1098               6 :                                         p += 5;
    1099                 :                                 } else {
    1100               0 :                                         *(q++) = *(p++);
    1101               0 :                                         *(q++) = *(p++);
    1102                 :                                 }
    1103                 :                         } else {
    1104               0 :                                 *(q++) = *(p++);        
    1105                 :                         }
    1106                 :                 } else {
    1107              82 :                         *(q++) = *(p++);        
    1108                 :                 }
    1109                 :         }
    1110              18 :         *q = '\0';
    1111              18 :         retlen = (size_t)(q - ret);
    1112              20 : empty_source:   
    1113              20 :         *newlen = retlen;
    1114              20 :         return ret;
    1115                 : }
    1116                 : /* }}} */
    1117                 : 
    1118                 : /* {{{ php_escape_html_entities
    1119                 :  */
    1120                 : PHPAPI char *php_escape_html_entities_ex(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset, zend_bool double_encode TSRMLS_DC)
    1121            6069 : {
    1122                 :         int i, j, maxlen, len;
    1123                 :         char *replaced;
    1124            6069 :         enum entity_charset charset = determine_charset(hint_charset TSRMLS_CC);
    1125                 :         int matches_map;
    1126                 : 
    1127            6069 :         maxlen = 2 * oldlen;
    1128            6069 :         if (maxlen < 128)
    1129            6018 :                 maxlen = 128;
    1130            6069 :         replaced = emalloc (maxlen);
    1131            6069 :         len = 0;
    1132            6069 :         i = 0;
    1133           50508 :         while (i < oldlen) {
    1134                 :                 unsigned char mbsequence[16];   /* allow up to 15 characters in a multibyte sequence */
    1135           38410 :                 int mbseqlen = sizeof(mbsequence);
    1136           38410 :                 int status = SUCCESS;
    1137           38410 :                 unsigned int this_char = get_next_char(charset, old, oldlen, &i, mbsequence, &mbseqlen, &status);
    1138                 : 
    1139           38410 :                 if(status == FAILURE) {
    1140                 :                         /* invalid MB sequence */
    1141             116 :                         if (quote_style & ENT_HTML_IGNORE_ERRORS) {
    1142              76 :                                 continue;
    1143                 :                         }
    1144              40 :                         efree(replaced);
    1145              40 :                         if(!PG(display_errors)) {
    1146               0 :                                 php_error_docref(NULL TSRMLS_CC, E_WARNING, "Invalid multibyte sequence in argument");
    1147                 :                         }
    1148              40 :                         *newlen = 0;
    1149              40 :                         return STR_EMPTY_ALLOC();
    1150                 :                 }
    1151           38294 :                 matches_map = 0;
    1152                 : 
    1153           38294 :                 if (len + 16 > maxlen)
    1154               1 :                         replaced = erealloc (replaced, maxlen += 128);
    1155                 : 
    1156           38294 :                 if (all) {
    1157                 :                         /* look for a match in the maps for this charset */
    1158            2567 :                         char *rep = NULL;
    1159                 : 
    1160                 : 
    1161           61346 :                         for (j = 0; entity_map[j].charset != cs_terminator; j++) {
    1162           58932 :                                 if (entity_map[j].charset == charset
    1163                 :                                                 && this_char >= entity_map[j].basechar
    1164                 :                                                 && this_char <= entity_map[j].endchar) {
    1165             153 :                                         rep = (char*)entity_map[j].table[this_char - entity_map[j].basechar];
    1166             153 :                                         if (rep == NULL) {
    1167                 :                                                 /* there is no entity for this position; fall through and
    1168                 :                                                  * just output the character itself */
    1169               0 :                                                 break;
    1170                 :                                         }
    1171                 : 
    1172             153 :                                         matches_map = 1;
    1173             153 :                                         break;
    1174                 :                                 }
    1175                 :                         }
    1176                 : 
    1177            2567 :                         if (matches_map) {
    1178             153 :                                 int l = strlen(rep);
    1179                 :                                 /* increase the buffer size */
    1180             153 :                                 if (len + 2 + l >= maxlen) {
    1181               0 :                                         replaced = erealloc(replaced, maxlen += 128);
    1182                 :                                 }
    1183                 : 
    1184             153 :                                 replaced[len++] = '&';
    1185             153 :                                 strcpy(replaced + len, rep);
    1186             153 :                                 len += l;
    1187             153 :                                 replaced[len++] = ';';
    1188                 :                         }
    1189                 :                 }
    1190           38294 :                 if (!matches_map) {     
    1191           38141 :                         int is_basic = 0;
    1192                 : 
    1193           38141 :                         if (this_char == '&') {
    1194             385 :                                 if (double_encode) {
    1195             355 : encode_amp:
    1196             355 :                                         memcpy(replaced + len, "&amp;", sizeof("&amp;") - 1);
    1197             355 :                                         len += sizeof("&amp;") - 1;
    1198                 :                                 } else {
    1199              50 :                                         char *e = memchr(old + i, ';', oldlen - i);
    1200              50 :                                         char *s = (char*)old + i;
    1201                 : 
    1202              50 :                                         if (!e || (e - s) > 10) { /* minor optimization to avoid "entities" over 10 chars in length */
    1203                 :                                                 goto encode_amp;
    1204                 :                                         } else {
    1205              40 :                                                 if (*s == '#') { /* numeric entities */
    1206              12 :                                                         s++;
    1207                 :                                                         /* Hex (&#x5A;) */
    1208              16 :                                                         if (*s == 'x' || *s == 'X') {
    1209               6 :                                                                 s++;
    1210              20 :                                                                 while (s < e) {
    1211              10 :                                                                         if (!isxdigit((int)*(unsigned char *)s++)) {
    1212               2 :                                                                                 goto encode_amp;
    1213                 :                                                                         }
    1214                 :                                                                 }
    1215                 :                                                         /* Dec (&#90;)*/
    1216                 :                                                         } else {
    1217              22 :                                                                 while (s < e) {
    1218              12 :                                                                         if (!isdigit((int)*(unsigned char *)s++)) {
    1219               2 :                                                                                 goto encode_amp;
    1220                 :                                                                         }
    1221                 :                                                                 }
    1222                 :                                                         }
    1223                 :                                                 } else { /* text entities */
    1224             124 :                                                         while (s < e) {
    1225              74 :                                                                 if (!isalnum((int)*(unsigned char *)s++)) {
    1226               6 :                                                                         goto encode_amp;
    1227                 :                                                                 }
    1228                 :                                                         }
    1229                 :                                                 }
    1230              30 :                                                 replaced[len++] = '&';
    1231                 :                                         }
    1232                 :                                 }
    1233             385 :                                 is_basic = 1;
    1234                 :                         } else {
    1235          225739 :                                 for (j = 0; basic_entities[j].charcode != 0; j++) {
    1236          188282 :                                         if ((basic_entities[j].charcode != this_char) ||
    1237                 :                                                         (basic_entities[j].flags &&
    1238                 :                                                         (quote_style & basic_entities[j].flags) == 0)) {
    1239                 :                                                 continue;
    1240                 :                                         }
    1241                 : 
    1242             299 :                                         memcpy(replaced + len, basic_entities[j].entity, basic_entities[j].entitylen);
    1243             299 :                                         len += basic_entities[j].entitylen;
    1244                 :                 
    1245             299 :                                         is_basic = 1;
    1246             299 :                                         break;
    1247                 :                                 }
    1248                 :                         }
    1249                 : 
    1250           38141 :                         if (!is_basic) {
    1251                 :                                 /* a wide char without a named entity; pass through the original sequence */
    1252           37457 :                                 if (mbseqlen > 1) {
    1253            7743 :                                         memcpy(replaced + len, mbsequence, mbseqlen);
    1254            7743 :                                         len += mbseqlen;
    1255                 :                                 } else {
    1256           29714 :                                         replaced[len++] = (unsigned char)this_char;
    1257                 :                                 }
    1258                 :                         }
    1259                 :                 }
    1260                 :         }
    1261            6029 :         replaced[len] = '\0';
    1262            6029 :         *newlen = len;
    1263                 : 
    1264            6029 :         return replaced;
    1265                 : 
    1266                 : 
    1267                 : }
    1268                 : /* }}} */
    1269                 : 
    1270                 : PHPAPI char *php_escape_html_entities(char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset TSRMLS_DC) /* {{{ */
    1271            1341 : {
    1272            1341 :         return php_escape_html_entities_ex((unsigned char*)old, oldlen, newlen, all, quote_style, hint_charset, 1 TSRMLS_CC);
    1273                 : }
    1274                 : /* }}} */
    1275                 : 
    1276                 : /* {{{ php_html_entities
    1277                 :  */
    1278                 : static void php_html_entities(INTERNAL_FUNCTION_PARAMETERS, int all)
    1279            4730 : {
    1280                 :         zstr str;
    1281            4730 :         char *hint_charset = NULL;
    1282            4730 :         int str_len, hint_charset_len = 0;
    1283                 :         char *str_utf8;
    1284                 :         int str_utf8_len;
    1285                 :         int len;
    1286            4730 :         long quote_style = ENT_COMPAT;
    1287                 :         zend_uchar type;
    1288                 :         char *replaced;
    1289            4730 :         zend_bool double_encode = 1;
    1290                 : 
    1291            4730 :         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "t|ls!b", &str, &str_len, &type, &quote_style, &hint_charset, &hint_charset_len, &double_encode) == FAILURE) {
    1292               2 :                 return;
    1293                 :         }
    1294                 : 
    1295            4728 :         if (type == IS_UNICODE) {
    1296            4068 :                 zend_unicode_to_string(UG(utf8_conv), &str_utf8, &str_utf8_len, str.u, str_len TSRMLS_CC);
    1297            4068 :                 str.s = str_utf8;
    1298            4068 :                 str_len = str_utf8_len;
    1299            4068 :                 hint_charset = "utf-8";
    1300                 :         }
    1301                 : 
    1302            4728 :         replaced = php_escape_html_entities_ex((unsigned char*)str.s, str_len, &len, all, quote_style, hint_charset, double_encode TSRMLS_CC);
    1303                 : 
    1304            4728 :         if (type == IS_UNICODE) {
    1305            4068 :                 RETVAL_U_STRINGL(UG(utf8_conv), replaced, len, ZSTR_AUTOFREE);
    1306            4068 :                 efree(str_utf8);
    1307                 :         } else {
    1308             660 :                 RETVAL_STRINGL(replaced, len, 0);
    1309                 :         }
    1310                 : }
    1311                 : /* }}} */
    1312                 : 
    1313                 : #define HTML_SPECIALCHARS       0
    1314                 : #define HTML_ENTITIES           1
    1315                 : 
    1316                 : /* {{{ register_html_constants
    1317                 :  */
    1318                 : void register_html_constants(INIT_FUNC_ARGS)
    1319           17007 : {
    1320           17007 :         REGISTER_LONG_CONSTANT("HTML_SPECIALCHARS", HTML_SPECIALCHARS, CONST_PERSISTENT|CONST_CS);
    1321           17007 :         REGISTER_LONG_CONSTANT("HTML_ENTITIES", HTML_ENTITIES, CONST_PERSISTENT|CONST_CS);
    1322           17007 :         REGISTER_LONG_CONSTANT("ENT_COMPAT", ENT_COMPAT, CONST_PERSISTENT|CONST_CS);
    1323           17007 :         REGISTER_LONG_CONSTANT("ENT_QUOTES", ENT_QUOTES, CONST_PERSISTENT|CONST_CS);
    1324           17007 :         REGISTER_LONG_CONSTANT("ENT_NOQUOTES", ENT_NOQUOTES, CONST_PERSISTENT|CONST_CS);
    1325           17007 :         REGISTER_LONG_CONSTANT("ENT_IGNORE", ENT_IGNORE, CONST_PERSISTENT|CONST_CS);
    1326           17007 : }
    1327                 : /* }}} */
    1328                 : 
    1329                 : /* {{{ proto string htmlspecialchars(string string [, int quote_style[, string charset[, bool double_encode]]])
    1330                 :    Convert special characters to HTML entities */
    1331                 : PHP_FUNCTION(htmlspecialchars)
    1332            4320 : {
    1333            4320 :         php_html_entities(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
    1334            4320 : }
    1335                 : /* }}} */
    1336                 : 
    1337                 : /* {{{ proto string htmlspecialchars_decode(string string [, int quote_style]) U
    1338                 :    Convert special HTML entities back to characters */
    1339                 : PHP_FUNCTION(htmlspecialchars_decode)
    1340             113 : {
    1341                 :         zstr str;
    1342                 :         char *str_utf8;
    1343                 :         int str_utf8_len;
    1344                 :         zend_uchar type;
    1345                 :         char *new_str, *e, *p;
    1346                 :         int len, j, i, new_len;
    1347             113 :         long quote_style = ENT_COMPAT;
    1348                 :         struct basic_entities_dec basic_entities_dec[8];
    1349                 : 
    1350             113 :         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "t|l", &str, &len, &type, &quote_style) == FAILURE) {
    1351              19 :                 return;
    1352                 :         }
    1353                 : 
    1354              94 :         if (type == IS_UNICODE) {
    1355              90 :                 if (!u_memchr(str.u, 0x26 /*'&'*/, len)) {
    1356              24 :                         RETURN_UNICODEL(str.u, len, 1);
    1357                 :                 }
    1358                 : 
    1359              66 :                 zend_unicode_to_string(UG(utf8_conv), &str_utf8, &str_utf8_len, str.u, len TSRMLS_CC);
    1360              66 :                 new_str = str_utf8;
    1361              66 :                 new_len = str_utf8_len;
    1362              66 :                 p = memchr(new_str, '&', new_len);
    1363                 :         } else {
    1364               4 :                 new_str = estrndup(str.s, len);
    1365               4 :                 new_len = len;
    1366                 : 
    1367               4 :                 if (!(p = memchr(new_str, '&', new_len))) {
    1368               1 :                         RETURN_STRINGL(new_str, new_len, 0);
    1369                 :                 }
    1370                 :         }
    1371                 : 
    1372              69 :         e = new_str + new_len;
    1373                 : 
    1374             414 :         for (j = 0, i = 0; basic_entities[i].charcode != 0; i++) {
    1375             345 :                 if (basic_entities[i].flags && !(quote_style & basic_entities[i].flags)) {
    1376             133 :                         continue;
    1377                 :                 }
    1378             212 :                 basic_entities_dec[j].charcode = basic_entities[i].charcode;
    1379             212 :                 memcpy(basic_entities_dec[j].entity, basic_entities[i].entity, basic_entities[i].entitylen + 1);
    1380             212 :                 basic_entities_dec[j].entitylen = basic_entities[i].entitylen;
    1381             212 :                 j++;
    1382                 :         }
    1383              69 :         basic_entities_dec[j].charcode = '&';
    1384              69 :         basic_entities_dec[j].entitylen = sizeof("&amp;") - 1;
    1385              69 :         memcpy(basic_entities_dec[j].entity, "&amp;", sizeof("&amp;"));
    1386              69 :         i = j + 1;
    1387                 :         
    1388                 :         do {
    1389             343 :                 int l = e - p;
    1390                 :         
    1391            1188 :                 for (j = 0; j < i; j++) {
    1392            1014 :                         if (basic_entities_dec[j].entitylen > l) {
    1393               0 :                                 continue;
    1394                 :                         }
    1395            1014 :                         if (!memcmp(p, basic_entities_dec[j].entity, basic_entities_dec[j].entitylen)) {
    1396             169 :                                 int e_len = basic_entities_dec[j].entitylen - 1;
    1397                 :                 
    1398             169 :                                 *p++ = (char)basic_entities_dec[j].charcode;
    1399             169 :                                 memmove(p, p + e_len, (e - p - e_len));
    1400             169 :                                 e -= e_len;
    1401             169 :                                 goto done;
    1402                 :                         }
    1403                 :                 }
    1404             174 :                 p++;
    1405                 : 
    1406             343 : done:
    1407             343 :                 if (p >= e) {
    1408              20 :                         break;
    1409                 :                 }
    1410             323 :         } while ((p = memchr(p, '&', (e - p))));
    1411                 : 
    1412              69 :         new_len = e - new_str;
    1413                 : 
    1414              69 :         new_str[new_len] = '\0';
    1415              69 :         if (type == IS_UNICODE) {
    1416              66 :                 RETVAL_U_STRINGL(UG(utf8_conv), new_str, new_len, ZSTR_AUTOFREE);
    1417                 :         } else {
    1418               3 :                 RETVAL_STRINGL(new_str, new_len, 0);
    1419                 :         }
    1420                 : }
    1421                 : /* }}} */
    1422                 : 
    1423                 : /* {{{ proto string html_entity_decode(string string [, int quote_style][, string charset]) U
    1424                 :    Convert all HTML entities to their applicable characters */
    1425                 : PHP_FUNCTION(html_entity_decode)
    1426              20 : {
    1427                 :         zstr str;
    1428              20 :         char *hint_charset = NULL;
    1429              20 :         int str_len, hint_charset_len = 0, len;
    1430                 :         char *str_utf8;
    1431                 :         int str_utf8_len;
    1432                 :         zend_uchar type;
    1433              20 :         long quote_style = ENT_COMPAT;
    1434                 :         char *replaced;
    1435                 : 
    1436              20 :         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "t|ls", &str, &str_len, &type,
    1437                 :                                                           &quote_style, &hint_charset, &hint_charset_len) == FAILURE) {
    1438               0 :                 return;
    1439                 :         }
    1440                 : 
    1441              20 :         if (type == IS_UNICODE) {
    1442              20 :                 zend_unicode_to_string(UG(utf8_conv), &str_utf8, &str_utf8_len, str.u, str_len TSRMLS_CC);
    1443              20 :                 str.s = str_utf8;
    1444              20 :                 str_len = str_utf8_len;
    1445              20 :                 hint_charset = "utf-8";
    1446                 :         }
    1447                 : 
    1448              20 :         replaced = php_unescape_html_entities(str.s, str_len, &len, 1, quote_style, hint_charset TSRMLS_CC);
    1449                 : 
    1450              20 :         if (!replaced) {
    1451               0 :                 RETURN_FALSE;
    1452                 :         }
    1453                 : 
    1454              20 :         if (type == IS_UNICODE) {
    1455              20 :                 RETVAL_U_STRINGL(UG(utf8_conv), replaced, len, ZSTR_AUTOFREE);
    1456              20 :                 efree(str_utf8);
    1457                 :         } else {
    1458               0 :                 RETVAL_STRINGL(replaced, len, 0);
    1459                 :         }
    1460                 : }
    1461                 : /* }}} */
    1462                 : 
    1463                 : /* {{{ proto string htmlentities(string string [, int quote_style[, string charset[, bool double_encode]]])
    1464                 :    Convert all applicable characters to HTML entities */
    1465                 : PHP_FUNCTION(htmlentities)
    1466             410 : {
    1467             410 :         php_html_entities(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
    1468             410 : }
    1469                 : /* }}} */
    1470                 : 
    1471                 : /* {{{ proto array get_html_translation_table([int table [, int quote_style]]) U
    1472                 :    Returns the internal translation table used by htmlspecialchars and htmlentities */
    1473                 : PHP_FUNCTION(get_html_translation_table)
    1474               0 : {
    1475               0 :         long which = HTML_SPECIALCHARS, quote_style = ENT_COMPAT;
    1476                 :         int i, j;
    1477                 :         char ind[2];
    1478               0 :         enum entity_charset charset = determine_charset(NULL TSRMLS_CC);
    1479                 :         UChar32 cp;
    1480                 :         UChar key[3];
    1481                 :         int key_len;
    1482                 : 
    1483               0 :         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "|ll", &which, &quote_style) == FAILURE) {
    1484               0 :                 return;
    1485                 :         }
    1486                 : 
    1487               0 :         charset = cs_utf_8;
    1488                 : 
    1489               0 :         array_init(return_value);
    1490                 : 
    1491               0 :         ind[1] = 0;
    1492                 : 
    1493               0 :         switch (which) {
    1494                 :                 case HTML_ENTITIES:
    1495               0 :                         for (j=0; entity_map[j].charset != cs_terminator; j++) {
    1496               0 :                                 if (entity_map[j].charset != charset)
    1497               0 :                                         continue;
    1498               0 :                                 for (i = 0; i <= entity_map[j].endchar - entity_map[j].basechar; i++) {
    1499                 :                                         char buffer[16];
    1500                 : 
    1501               0 :                                         if (entity_map[j].table[i] == NULL)
    1502               0 :                                                 continue;
    1503                 : 
    1504               0 :                                         cp = (UChar)(i + entity_map[j].basechar);
    1505               0 :                                         key_len = zend_codepoint_to_uchar(cp, key);
    1506               0 :                                         key[key_len] = 0;
    1507               0 :                                         snprintf(buffer, sizeof(buffer), "&%s;", entity_map[j].table[i]);
    1508               0 :                                         add_u_assoc_ascii_string_ex(return_value, IS_UNICODE, ZSTR(key), key_len+1, buffer, 1);
    1509                 :                                 }
    1510                 :                         }
    1511                 :                         /* break thru */
    1512                 : 
    1513                 :                 case HTML_SPECIALCHARS:
    1514               0 :                         for (j = 0; basic_entities[j].charcode != 0; j++) {
    1515                 : 
    1516               0 :                                 if (basic_entities[j].flags && (quote_style & basic_entities[j].flags) == 0)
    1517               0 :                                         continue;
    1518                 :                                 
    1519               0 :                                 ind[0] = (unsigned char)basic_entities[j].charcode;
    1520               0 :                                 add_ascii_assoc_ascii_stringl(return_value, ind, basic_entities[j].entity, basic_entities[j].entitylen, 1);
    1521                 :                         }
    1522               0 :                         add_ascii_assoc_ascii_stringl(return_value, "&", "&amp;", sizeof("&amp;") - 1, 1);
    1523                 : 
    1524                 :                         break;
    1525                 :         }
    1526                 : }
    1527                 : /* }}} */
    1528                 : 
    1529                 : /*
    1530                 :  * Local variables:
    1531                 :  * tab-width: 4
    1532                 :  * c-basic-offset: 4
    1533                 :  * End:
    1534                 :  * vim600: sw=4 ts=4 fdm=marker
    1535                 :  * vim<600: sw=4 ts=4
    1536                 :  */

Generated by: LTP GCOV extension version 1.5

Generated at Mon, 23 Nov 2009 17:39:42 +0000 (35 hours ago)

Copyright © 2005-2009 The PHP Group
All rights reserved.