PHP  
 PHP: Test and Code Coverage Analysis
downloads | QA | documentation | faq | getting help | mailing lists | reporting bugs | php.net sites | links | my php.net 
 

LCOV - code coverage report
Current view: top level - ext/standard - html.c (source / functions) Hit Total Coverage
Test: PHP Code Coverage Lines: 417 491 84.9 %
Date: 2014-04-18 Functions: 13 13 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :    +----------------------------------------------------------------------+
       3             :    | PHP Version 5                                                        |
       4             :    +----------------------------------------------------------------------+
       5             :    | Copyright (c) 1997-2013 The PHP Group                                |
       6             :    +----------------------------------------------------------------------+
       7             :    | This source file is subject to version 3.01 of the PHP license,      |
       8             :    | that is bundled with this package in the file LICENSE, and is        |
       9             :    | available through the world-wide-web at the following url:           |
      10             :    | http://www.php.net/license/3_01.txt                                  |
      11             :    | If you did not receive a copy of the PHP license and are unable to   |
      12             :    | obtain it through the world-wide-web, please send a note to          |
      13             :    | license@php.net so we can mail you a copy immediately.               |
      14             :    +----------------------------------------------------------------------+
      15             :    | Authors: Rasmus Lerdorf <rasmus@php.net>                             |
      16             :    |          Jaakko Hyvätti <jaakko.hyvatti@iki.fi>                      |
      17             :    |          Wez Furlong <wez@thebrainroom.com>                          |
      18             :    +----------------------------------------------------------------------+
      19             : */
      20             : 
      21             : /* $Id$ */
      22             : 
      23             : /*
      24             :  * HTML entity resources:
      25             :  *
      26             :  * http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset2.asp
      27             :  * http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset3.asp
      28             :  * http://www.unicode.org/Public/MAPPINGS/OBSOLETE/UNI2SGML.TXT
      29             :  *
      30             :  * http://www.w3.org/TR/2002/REC-xhtml1-20020801/dtds.html#h-A2
      31             :  * 
      32             :  */
      33             : 
      34             : #include "php.h"
      35             : #if PHP_WIN32
      36             : #include "config.w32.h"
      37             : #else
      38             : #include <php_config.h>
      39             : #endif
      40             : #include "html.h"
      41             : #include "php_string.h"
      42             : #include "SAPI.h"
      43             : #if HAVE_LOCALE_H
      44             : #include <locale.h>
      45             : #endif
      46             : #if HAVE_LANGINFO_H
      47             : #include <langinfo.h>
      48             : #endif
      49             : 
      50             : #if HAVE_MBSTRING
      51             : # include "ext/mbstring/mbstring.h"
      52             : ZEND_EXTERN_MODULE_GLOBALS(mbstring)
      53             : #endif
      54             : 
      55             : enum entity_charset { cs_terminator, cs_8859_1, cs_cp1252, /* charset ids; cs_terminator (value 0) is the end marker used by entity_map[] */
      56             :                                           cs_8859_15, cs_utf_8, cs_big5, cs_gb2312, 
      57             :                                           cs_big5hkscs, cs_sjis, cs_eucjp, cs_koi8r,
      58             :                                           cs_cp1251, cs_8859_5, cs_cp866, cs_macroman
      59             :                                         };
      60             : typedef const char *const entity_table_t; /* one table slot: entity text without the surrounding '&' and ';' (e.g. "euro", "#1105"), or NULL */
      61             : 
      62             : /* codepage 1252 is a Windows extension to iso-8859-1; entity_map uses this table for bytes 0x80-0x9f, NULL slots have no entity. */
      63             : static entity_table_t ent_cp_1252[] = {
      64             :         "euro", NULL, "sbquo", "fnof", "bdquo", "hellip", "dagger",
      65             :         "Dagger", "circ", "permil", "Scaron", "lsaquo", "OElig",
      66             :         NULL, NULL, NULL, NULL, "lsquo", "rsquo", "ldquo", "rdquo",
      67             :         "bull", "ndash", "mdash", "tilde", "trade", "scaron", "rsaquo",
      68             :         "oelig", NULL, NULL, "Yuml" 
      69             : };
      70             : 
      71             : static entity_table_t ent_iso_8859_1[] = { /* entity_map indexes this from 0xa0 ("nbsp") through 0xff ("yuml"); shared by several charsets there */
      72             :         "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar",
      73             :         "sect", "uml", "copy", "ordf", "laquo", "not", "shy", "reg",
      74             :         "macr", "deg", "plusmn", "sup2", "sup3", "acute", "micro",
      75             :         "para", "middot", "cedil", "sup1", "ordm", "raquo", "frac14",
      76             :         "frac12", "frac34", "iquest", "Agrave", "Aacute", "Acirc",
      77             :         "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
      78             :         "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
      79             :         "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
      80             :         "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
      81             :         "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
      82             :         "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
      83             :         "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
      84             :         "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
      85             :         "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
      86             :         "uuml", "yacute", "thorn", "yuml"
      87             : };
      88             : 
      89             : static entity_table_t ent_iso_8859_15[] = { /* entity_map indexes this from 0xa0 through 0xff; the two NULL slots are the positions annotated Zcaron / zcaron */
      90             :         "nbsp", "iexcl", "cent", "pound", "euro", "yen", "Scaron",
      91             :         "sect", "scaron", "copy", "ordf", "laquo", "not", "shy", "reg",
      92             :         "macr", "deg", "plusmn", "sup2", "sup3", NULL, /* Zcaron */
      93             :         "micro", "para", "middot", NULL, /* zcaron */ "sup1", "ordm",
      94             :         "raquo", "OElig", "oelig", "Yuml", "iquest", "Agrave", "Aacute",
      95             :         "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
      96             :         "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
      97             :         "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
      98             :         "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
      99             :         "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
     100             :         "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
     101             :         "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
     102             :         "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
     103             :         "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
     104             :         "uuml", "yacute", "thorn", "yuml"
     105             : };
     106             : 
     107             : static entity_table_t ent_uni_338_402[] = { /* code points 338-402 (cs_utf_8 row in entity_map); one slot per code point, NULL where no entity is listed */
     108             :         /* 338 (0x0152) */
     109             :         "OElig", "oelig", NULL, NULL, NULL, NULL,
     110             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     111             :         /* 352 (0x0160) */
     112             :         "Scaron", "scaron", NULL, NULL, NULL, NULL, NULL, NULL,
     113             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     114             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     115             :         /* 376 (0x0178) */
     116             :         "Yuml", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     117             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     118             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     119             :         /* 400 (0x0190) */
     120             :         NULL, NULL, "fnof"
     121             : };
     122             : 
     123             : static entity_table_t ent_uni_spacing[] = { /* code points 710-732 (cs_utf_8 row in entity_map); only 710 and 732 carry an entity */
     124             :         /* 710 */
     125             :         "circ",
     126             :         /* 711 - 730 */
     127             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     128             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     129             :         /* 731 - 732 */
     130             :         NULL, "tilde"
     131             : };
     132             : 
     133             : static entity_table_t ent_uni_greek[] = { /* code points 913-982 (cs_utf_8 row in entity_map); NULL marks code points without an entity */
     134             :         /* 913 */
     135             :         "Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta",
     136             :         "Iota", "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi", "Rho",
     137             :         NULL, "Sigma", "Tau", "Upsilon", "Phi", "Chi", "Psi", "Omega",
     138             :         /* 938 - 944 are not mapped */
     139             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     140             :         "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta",
     141             :         "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho",
     142             :         "sigmaf", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega",
     143             :         /* 970 - 976 are not mapped */
     144             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     145             :         "thetasym", "upsih",
     146             :         NULL, NULL, NULL,
     147             :         "piv"
     148             : };
     149             : 
     150             : static entity_table_t ent_uni_punct[] = { /* code points 8194-8260 (cs_utf_8 row in entity_map); one slot per code point, NULL where no entity is listed */
     151             :         /* 8194 */
     152             :         "ensp", "emsp", NULL, NULL, NULL, NULL, NULL,
     153             :         "thinsp", NULL, NULL, "zwnj", "zwj", "lrm", "rlm",
     154             :         NULL, NULL, NULL, "ndash", "mdash", NULL, NULL, NULL,
     155             :         /* 8216 */
     156             :         "lsquo", "rsquo", "sbquo", NULL, "ldquo", "rdquo", "bdquo", NULL,
     157             :         "dagger", "Dagger", "bull", NULL, NULL, NULL, "hellip",
     158             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "permil", NULL,
     159             :         /* 8242 */
     160             :         "prime", "Prime", NULL, NULL, NULL, NULL, NULL, "lsaquo", "rsaquo", NULL,
     161             :         NULL, NULL, "oline", NULL, NULL, NULL, NULL, NULL,
     162             :         "frasl"
     163             : };
     164             : 
     165             : static entity_table_t ent_uni_euro[] = { /* single slot: code point 8364 (cs_utf_8 row in entity_map) */
     166             :         "euro"
     167             : };
     168             : 
     169             : static entity_table_t ent_uni_8465_8501[] = { /* code points 8465-8501 (cs_utf_8 row in entity_map); the marker comments give the code point of the next slot */
     170             :         /* 8465 */
     171             :         "image", NULL, NULL, NULL, NULL, NULL, NULL,
     172             :         /* 8472 */
     173             :         "weierp", NULL, NULL, NULL,
     174             :         /* 8476 */
     175             :         "real", NULL, NULL, NULL, NULL, NULL,
     176             :         /* 8482 */
     177             :         "trade", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     178             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     179             :         /* 8501 */
     180             :         "alefsym",
     181             : };
     182             : 
     183             : static entity_table_t ent_uni_8592_9002[] = { /* code points 8592-9002 (cs_utf_8 row in entity_map); marker comments give the code point of the slot that follows, NULL where no entity is listed */
     184             :         /* 8592 (0x2190) */
     185             :         "larr", "uarr", "rarr", "darr", "harr", NULL, NULL, NULL,
     186             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     187             :         /* 8608 (0x21a0) */
     188             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     189             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     190             :         /* 8624 (0x21b0) */
     191             :         NULL, NULL, NULL, NULL, NULL, "crarr", NULL, NULL,
     192             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     193             :         /* 8640 (0x21c0) */
     194             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     195             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     196             :         /* 8656 (0x21d0) */
     197             :         "lArr", "uArr", "rArr", "dArr", "hArr", NULL, NULL, NULL,
     198             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     199             :         /* 8672 (0x21e0) */
     200             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     201             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     202             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     203             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     204             :         /* 8704 (0x2200) */
     205             :         "forall", NULL, "part", "exist", NULL, "empty", NULL, "nabla",
     206             :         "isin", "notin", NULL, "ni", NULL, NULL, NULL, "prod",
     207             :         /* 8720 (0x2210) */
     208             :         NULL, "sum", "minus", NULL, NULL, NULL, NULL, "lowast",
     209             :         NULL, NULL, "radic", NULL, NULL, "prop", "infin", NULL,
     210             :         /* 8736 (0x2220) */
     211             :         "ang", NULL, NULL, NULL, NULL, NULL, NULL, "and",
     212             :         "or", "cap", "cup", "int", NULL, NULL, NULL, NULL,
     213             :         /* 8752 (0x2230) */
     214             :         NULL, NULL, NULL, NULL, "there4", NULL, NULL, NULL,
     215             :         NULL, NULL, NULL, NULL, "sim", NULL, NULL, NULL,
     216             :         /* 8768 (0x2240) */
     217             :         NULL, NULL, NULL, NULL, NULL, "cong", NULL, NULL,
     218             :         "asymp", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     219             :         /* 8784 (0x2250) */
     220             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     221             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     222             :         /* 8800 (0x2260) */
     223             :         "ne", "equiv", NULL, NULL, "le", "ge", NULL, NULL,
     224             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     225             :         /* 8816 (0x2270) */
     226             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     227             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     228             :         /* 8832 (0x2280) */
     229             :         NULL, NULL, "sub", "sup", "nsub", NULL, "sube", "supe",
     230             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     231             :         /* 8848 (0x2290) */
     232             :         NULL, NULL, NULL, NULL, NULL, "oplus", NULL, "otimes",
     233             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     234             :         /* 8864 (0x22a0) */
     235             :         NULL, NULL, NULL, NULL, NULL, "perp", NULL, NULL,
     236             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     237             :         /* 8880 (0x22b0) */
     238             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     239             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     240             :         /* 8896 (0x22c0) */
     241             :         NULL, NULL, NULL, NULL, NULL, "sdot", NULL, NULL,
     242             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     243             :         /* 8912 (0x22d0) */
     244             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     245             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     246             :         /* 8928 (0x22e0) */
     247             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     248             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     249             :         /* 8944 (0x22f0) */
     250             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     251             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     252             :         /* 8960 (0x2300) */
     253             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     254             :         "lceil", "rceil", "lfloor", "rfloor", NULL, NULL, NULL, NULL,
     255             :         /* 8976 (0x2310) */
     256             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     257             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     258             :         /* 8992 (0x2320) */
     259             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     260             :         NULL, "lang", "rang"
     261             : };
     262             : 
     263             : static entity_table_t ent_uni_9674[] = {
     264             :         /* 9674: the only code point in this table (cs_utf_8 row 9674-9674 in entity_map) */
     265             :         "loz"
     266             : };
     267             : 
     268             : static entity_table_t ent_uni_9824_9830[] = {
     269             :         /* 9824 is the first slot; the table runs through 9830 (cs_utf_8 row in entity_map) */
     270             :         "spades", NULL, NULL, "clubs", NULL, "hearts", "diams"
     271             : };
     272             : 
     273             : static entity_table_t ent_koi8r[] = { /* entity_map indexes this from 0xa3 through 0xff (cs_koi8r); entries are numeric references, NULL where none is listed */
     274             :         "#1105", /* "jo" */
     275             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
     276             :         NULL, NULL, NULL, NULL, NULL, "#1025", /* "JO" */
     277             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
     278             :         "#1102", "#1072", "#1073", "#1094", "#1076", "#1077", "#1092", 
     279             :         "#1075", "#1093", "#1080", "#1081", "#1082", "#1083", "#1084", 
     280             :         "#1085", "#1086", "#1087", "#1103", "#1088", "#1089", "#1090", 
     281             :         "#1091", "#1078", "#1074", "#1100", "#1099", "#1079", "#1096", 
     282             :         "#1101", "#1097", "#1095", "#1098", "#1070", "#1040", "#1041", 
     283             :         "#1062", "#1044", "#1045", "#1060", "#1043", "#1061", "#1048", 
     284             :         "#1049", "#1050", "#1051", "#1052", "#1053", "#1054", "#1055", 
     285             :         "#1071", "#1056", "#1057", "#1058", "#1059", "#1046", "#1042",
     286             :         "#1068", "#1067", "#1047", "#1064", "#1069", "#1065", "#1063", 
     287             :         "#1066"
     288             : };
     289             : 
     290             : static entity_table_t ent_cp_1251[] = { /* entity_map indexes this from 0x80 through 0xff (cs_cp1251); named entity where one is listed, numeric reference otherwise, NULL = no entity */
     291             :         "#1026", "#1027", "#8218", "#1107", "#8222", "hellip", "dagger",
     292             :         "Dagger", "euro", "permil", "#1033", "#8249", "#1034", "#1036",
     293             :         "#1035", "#1039", "#1106", "#8216", "#8217", "#8220", "#8221", /* 0x93/0x94 are U+201C/U+201D (double quotes), not U+201B/U+201C */
     294             :         "bull", "ndash", "mdash", NULL, "trade", "#1113", "#8250",
     295             :         "#1114", "#1116", "#1115", "#1119", "nbsp", "#1038", "#1118",
     296             :         "#1032", "curren", "#1168", "brvbar", "sect", "#1025", "copy",
     297             :         "#1028", "laquo", "not", "shy", "reg", "#1031", "deg", "plusmn",
     298             :         "#1030", "#1110", "#1169", "micro", "para", "middot", "#1105",
     299             :         "#8470", "#1108", "raquo", "#1112", "#1029", "#1109", "#1111",
     300             :         "#1040", "#1041", "#1042", "#1043", "#1044", "#1045", "#1046",
     301             :         "#1047", "#1048", "#1049", "#1050", "#1051", "#1052", "#1053",
     302             :         "#1054", "#1055", "#1056", "#1057", "#1058", "#1059", "#1060",
     303             :         "#1061", "#1062", "#1063", "#1064", "#1065", "#1066", "#1067",
     304             :         "#1068", "#1069", "#1070", "#1071", "#1072", "#1073", "#1074",
     305             :         "#1075", "#1076", "#1077", "#1078", "#1079", "#1080", "#1081",
     306             :         "#1082", "#1083", "#1084", "#1085", "#1086", "#1087", "#1088",
     307             :         "#1089", "#1090", "#1091", "#1092", "#1093", "#1094", "#1095",
     308             :         "#1096", "#1097", "#1098", "#1099", "#1100", "#1101", "#1102",
     309             :         "#1103"
     310             : };
     311             : 
     312             : static entity_table_t ent_iso_8859_5[] = { /* entity_map indexes this from 0xc0 through 0xff (cs_8859_5); 0xf0 and 0xfd are not Cyrillic letters and break the otherwise sequential run */
     313             :         "#1056", "#1057", "#1058", "#1059", "#1060", "#1061", "#1062",
     314             :         "#1063", "#1064", "#1065", "#1066", "#1067", "#1068", "#1069",
     315             :         "#1070", "#1071", "#1072", "#1073", "#1074", "#1075", "#1076",
     316             :         "#1077", "#1078", "#1079", "#1080", "#1081", "#1082", "#1083",
     317             :         "#1084", "#1085", "#1086", "#1087", "#1088", "#1089", "#1090",
     318             :         "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097",
     319             :         "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#8470", /* 0xf0 is U+2116 NUMERO SIGN */
     320             :         "#1105", "#1106", "#1107", "#1108", "#1109", "#1110", "#1111",
     321             :         "#1112", "#1113", "#1114", "#1115", "#1116", "sect", "#1118", /* 0xfd is U+00A7 SECTION SIGN */
     322             :         "#1119"
     323             : };
     324             : 
     325             : static entity_table_t ent_cp_866[] = { /* entity_map indexes this from 0xc0 through 0xff (cs_cp866); every slot is a numeric character reference */
     326             : 
     327             :         "#9492", "#9524", "#9516", "#9500", "#9472", "#9532", "#9566", 
     328             :         "#9567", "#9562", "#9556", "#9577", "#9574", "#9568", "#9552", 
     329             :         "#9580", "#9575", "#9576", "#9572", "#9573", "#9561", "#9560", 
     330             :         "#9554", "#9555", "#9579", "#9578", "#9496", "#9484", "#9608", 
     331             :         "#9604", "#9612", "#9616", "#9600", "#1088", "#1089", "#1090", 
     332             :         "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097", 
     333             :         "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#1025", 
     334             :         "#1105", "#1028", "#1108", "#1031", "#1111", "#1038", "#1118", 
     335             :         "#176", "#8729", "#183", "#8730", "#8470", "#164",  "#9632", 
     336             :         "#160"
     337             : };
     338             : 
     339             : /* MacRoman has a couple of low-ASCII chars that need mapping too, so entity_map starts this table at 0x0b. */
     340             : /* Vertical tab (ASCII 11) is often used to store line breaks inside */
     341             : /* DB exports; the first slot ("sp") changes it to a space. */
     342             : static entity_table_t ent_macroman[] = {
     343             :         "sp", NULL, NULL, NULL,
     344             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     345             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     346             :         NULL, NULL, NULL, NULL, NULL, "quot", NULL,
     347             :         NULL, NULL, "amp", NULL, NULL, NULL, NULL,
     348             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     349             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     350             :         NULL, NULL, NULL, "lt", NULL, "gt", NULL,
     351             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     352             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     353             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     354             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     355             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     356             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     357             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     358             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     359             :         NULL, NULL, NULL, NULL, NULL, NULL, NULL,
     360             :         NULL, "Auml", "Aring", "Ccedil", "Eacute", "Ntilde", "Ouml",
     361             :         "Uuml", "aacute", "agrave", "acirc", "auml", "atilde", "aring",
     362             :         "ccedil", "eacute", "egrave", "ecirc", "euml", "iacute", "igrave",
     363             :         "icirc", "iuml", "ntilde", "oacute", "ograve", "ocirc", "ouml",
     364             :         "otilde", "uacute", "ugrave", "ucirc", "uuml", "dagger", "deg",
     365             :         "cent", "pound", "sect", "bull", "para", "szlig", "reg",
     366             :         "copy", "trade", "acute", "uml", "ne", "AElig", "Oslash",
     367             :         "infin", "plusmn", "le", "ge", "yen", "micro", "part",
     368             :         "sum", "prod", "pi", "int", "ordf", "ordm", "Omega",
     369             :         "aelig", "oslash", "iquest", "iexcl", "not", "radic", "fnof",
     370             :         "asymp", "#8710", "laquo", "raquo", "hellip", "nbsp", "Agrave",
     371             :         "Atilde", "Otilde", "OElig", "oelig", "ndash", "mdash", "ldquo",
     372             :         "rdquo", "lsquo", "rsquo", "divide", "loz", "yuml", "Yuml",
     373             :         "frasl", "euro", "lsaquo", "rsaquo", "#xFB01", "#xFB02", "Dagger",
     374             :         "middot", "sbquo", "bdquo", "permil", "Acirc", "Ecirc", "Aacute",
     375             :         "Euml", "Egrave", "Iacute", "Icirc", "Iuml", "Igrave", "Oacute",
     376             :         "Ocirc", "#xF8FF", "Ograve", "Uacute", "Ucirc", "Ugrave", "#305",
     377             :         "circ", "tilde", "macr", "#728", "#729", "#730", "cedil",
     378             :         "#733", "#731", "#711"
     379             : };
     380             : 
     381             : struct html_entity_map { /* one contiguous run of char codes for a charset, plus the table naming them (table slot 0 belongs to basechar) */
     382             :         enum entity_charset charset;    /* charset identifier */
     383             :         unsigned int basechar;                  /* char code at start of table */
     384             :         unsigned int endchar;                   /* last char code in the table */
     385             :         entity_table_t *table;                  /* the table of mappings */
     386             : };
     387             : 
     388             : static const struct html_entity_map entity_map[] = { /* { charset, basechar, endchar, table } rows; a charset may own several rows (cs_cp1252, cs_utf_8); list ends with a cs_terminator row */
     389             :         { cs_cp1252,            0x80, 0x9f, ent_cp_1252 },
     390             :         { cs_cp1252,            0xa0, 0xff, ent_iso_8859_1 },
     391             :         { cs_8859_1,            0xa0, 0xff, ent_iso_8859_1 },
     392             :         { cs_8859_15,           0xa0, 0xff, ent_iso_8859_15 },
     393             :         { cs_utf_8,             0xa0, 0xff, ent_iso_8859_1 },
     394             :         { cs_utf_8,             338,  402,  ent_uni_338_402 },
     395             :         { cs_utf_8,             710,  732,  ent_uni_spacing },
     396             :         { cs_utf_8,             913,  982,  ent_uni_greek },
     397             :         { cs_utf_8,             8194, 8260, ent_uni_punct },
     398             :         { cs_utf_8,             8364, 8364, ent_uni_euro }, 
     399             :         { cs_utf_8,             8465, 8501, ent_uni_8465_8501 },
     400             :         { cs_utf_8,             8592, 9002, ent_uni_8592_9002 },
     401             :         { cs_utf_8,             9674, 9674, ent_uni_9674 },
     402             :         { cs_utf_8,             9824, 9830, ent_uni_9824_9830 },
     403             :         { cs_big5,                      0xa0, 0xff, ent_iso_8859_1 },
     404             :         { cs_gb2312,            0xa0, 0xff, ent_iso_8859_1 },
     405             :         { cs_big5hkscs,         0xa0, 0xff, ent_iso_8859_1 },
     406             :         { cs_sjis,                      0xa0, 0xff, ent_iso_8859_1 },
     407             :         { cs_eucjp,                     0xa0, 0xff, ent_iso_8859_1 },
     408             :         { cs_koi8r,                 0xa3, 0xff, ent_koi8r },
     409             :         { cs_cp1251,            0x80, 0xff, ent_cp_1251 },
     410             :         { cs_8859_5,            0xc0, 0xff, ent_iso_8859_5 },
     411             :         { cs_cp866,                 0xc0, 0xff, ent_cp_866 },
     412             :         { cs_macroman,          0x0b, 0xff, ent_macroman },
     413             :         { cs_terminator }
     414             : };
     415             : 
     416             : static const struct { /* codeset name -> entity_charset id; several spellings may share one id; list ends with a NULL codeset */
     417             :         const char *codeset;
     418             :         enum entity_charset charset;
     419             : } charset_map[] = {
     420             :         { "ISO-8859-1",       cs_8859_1 },
     421             :         { "ISO8859-1",                cs_8859_1 },
     422             :         { "ISO-8859-15",      cs_8859_15 },
     423             :         { "ISO8859-15",       cs_8859_15 },
     424             :         { "utf-8",                    cs_utf_8 },
     425             :         { "cp1252",           cs_cp1252 },
     426             :         { "Windows-1252",     cs_cp1252 },
     427             :         { "1252",           cs_cp1252 }, 
     428             :         { "BIG5",                     cs_big5 },
     429             :         { "950",            cs_big5 },
     430             :         { "GB2312",                   cs_gb2312 },
     431             :         { "936",            cs_gb2312 },
     432             :         { "BIG5-HKSCS",               cs_big5hkscs },
     433             :         { "Shift_JIS",                cs_sjis },
     434             :         { "SJIS",             cs_sjis },
     435             :         { "932",            cs_sjis },
     436             :         { "EUCJP",            cs_eucjp },
     437             :         { "EUC-JP",                   cs_eucjp },
     438             :         { "KOI8-R",         cs_koi8r },
     439             :         { "koi8-ru",        cs_koi8r },
     440             :         { "koi8r",          cs_koi8r },
     441             :         { "cp1251",         cs_cp1251 },
     442             :         { "Windows-1251",   cs_cp1251 },
     443             :         { "win-1251",       cs_cp1251 },
     444             :         { "iso8859-5",      cs_8859_5 },
     445             :         { "iso-8859-5",     cs_8859_5 },
     446             :         { "cp866",          cs_cp866 },
     447             :         { "866",            cs_cp866 },    
     448             :         { "ibm866",         cs_cp866 },
     449             :         { "MacRoman",       cs_macroman },
     450             :         { NULL }
     451             : };
     452             : 
     453             : static const struct { /* full "&...;" entity text for the quote and angle-bracket chars; '&' itself is not listed here; list ends with a NULL entity */
     454             :         unsigned short charcode;
     455             :         char *entity;
     456             :         int entitylen; /* length of entity, matching the literal in each row */
     457             :         int flags; /* an ENT_HTML_QUOTE_* value for the quote rows, 0 for the others -- presumably checked against the caller's quote style; confirm at the use site */
     458             : } basic_entities[] = {
     459             :         { '"',     "&quot;", 6,      ENT_HTML_QUOTE_DOUBLE },
     460             :         { '\'', "&#039;", 6,      ENT_HTML_QUOTE_SINGLE },
     461             :         { '\'', "&#39;",  5,      ENT_HTML_QUOTE_SINGLE },
     462             :         { '<',       "&lt;",           4,      0 },
     463             :         { '>',       "&gt;",           4,      0 },
     464             :         { 0, NULL, 0, 0 }
     465             : };
     466             :         /* basic_entities_dec: one character code together with an entity text held
                     :  * in an inline 8-byte buffer and an entitylen field (presumably the length
                     :  * of that text - no user of this struct is visible in this part of the
                     :  * file, so confirm at the use site). */
     467             : struct basic_entities_dec {
     468             :         unsigned short charcode;
     469             :         char entity[8];
     470             :         int entitylen;  
     471             : };
     472             :         /* MB_RETURN: exit path used inside get_next_char. Stores pos into *newpos,
                     :  * NUL-terminates mbseq at mbpos, stores mbpos into *mbseqlen and returns
                     :  * this_char. It refers to the enclosing function's locals and parameters
                     :  * by name and expands to a bare { } block rather than do { } while (0). */
     473             : #define MB_RETURN { \
     474             :                         *newpos = pos;       \
     475             :                         mbseq[mbpos] = '\0'; \
     476             :                         *mbseqlen = mbpos;   \
     477             :                         return this_char; }
     478             :                                         /* MB_WRITE(mbchar): append one byte to mbseq.
                     :  * mbspace is decremented first; when it reaches 0 the byte is NOT stored and
                     :  * the function returns through MB_RETURN, which leaves the last slot of the
                     :  * buffer for the terminating NUL. On that early return pos and this_char
                     :  * have whatever values the caller had assigned before invoking the macro. */
     479             : #define MB_WRITE(mbchar) { \
     480             :                         mbspace--;  \
     481             :                         if (mbspace == 0) {      \
     482             :                                 MB_RETURN;           \
     483             :                         }                        \
     484             :                         mbseq[mbpos++] = (mbchar); }
     485             : 
     486             : /* skip one byte and return
                     :  *
                     :  * MB_FAILURE(pos): report an invalid sequence. Sets *newpos to pos + 1,
                     :  * sets *status to FAILURE and returns 0 from the enclosing function.
                     :  * "One byte" is relative to the pos argument passed in, so the resume
                     :  * position depends on whether the caller has already advanced pos.
                     :  * The pos argument is expanded without parentheses. */
     487             : #define MB_FAILURE(pos) do { \
     488             :         *newpos = pos + 1; \
     489             :         *status = FAILURE; \
     490             :         return 0; \
     491             : } while (0)
     492             : /* CHECK_LEN(pos, chars_need): bail out of the enclosing function when fewer
                     :  * than chars_need bytes remain in str from offset pos, i.e. when
                     :  * (str_len - pos) < chars_need. On failure *status is set to FAILURE and 0
                     :  * is returned; *newpos becomes pos when chars_need < 1 and pos + 1
                     :  * otherwise. The macro expands to a plain if/else statement (no
                     :  * do { } while (0) wrapper), so the ';' written after an invocation is an
                     :  * empty statement. */
     493             : #define CHECK_LEN(pos, chars_need)                      \
     494             :         if (chars_need < 1) {                                                \
     495             :                 if((str_len - (pos)) < chars_need) { \
     496             :                         *newpos = pos;                                          \
     497             :                         *status = FAILURE;                                      \
     498             :                         return 0;                                                       \
     499             :                 }                                                                               \
     500             :         } else {                                                                        \
     501             :                 if((str_len - (pos)) < chars_need) { \
     502             :                         *newpos = pos + 1;                                      \
     503             :                         *status = FAILURE;                                      \
     504             :                         return 0;                                                       \
     505             :                 }                                                                               \
     506             :         }
     507             : 
     508             : /* {{{ get_next_char
                     :  * Reads one character from str starting at offset *newpos, interpreting the
                     :  * bytes according to charset.
                     :  *
                     :  * charset   encoding used to decide how many bytes form one character
                     :  * str       input bytes
                     :  * str_len   number of bytes available in str
                     :  * newpos    in: offset to start at; out: offset to continue from
                     :  * mbseq     out: the raw bytes of the character, NUL-terminated
                     :  * mbseqlen  in: capacity of mbseq; out: number of bytes stored in mbseq
                     :  * status    out: SUCCESS, or FAILURE for a truncated/invalid sequence
                     :  *
                     :  * Returns the character value: the decoded code point for UTF-8, the bytes
                     :  * packed big-endian ((b0 << 8) | b1, or three bytes for the EUC-JP 0x8f
                     :  * form) for the multi-byte charsets, and the byte itself otherwise.
                     :  * Returns 0 when *status is FAILURE.
                     :  *
                     :  * If *mbseqlen is <= 0 on entry no decoding is done: one raw byte is
                     :  * returned and *mbseqlen is set to 0.
                     :  *
                     :  * NOTE(review): in the UTF-8 branch pos still points at the lead byte when
                     :  * MB_FAILURE(pos) runs, so scanning resumes one byte later. In the BIG5 /
                     :  * GB2312 / SJIS / EUC-JP branches pos has already been incremented past
                     :  * the bytes that were read, so the same macro resumes further ahead.
                     :  * Confirm whether that difference is intended.
     509             :  */
     510       44454 : inline static unsigned int get_next_char(enum entity_charset charset,
     511             :                 unsigned char * str,
     512             :                 int str_len,
     513             :                 int * newpos,
     514             :                 unsigned char * mbseq,
     515             :                 int * mbseqlen, 
     516             :                 int *status)
     517             : {
     518       44454 :         int pos = *newpos;
     519       44454 :         int mbpos = 0;
     520       44454 :         int mbspace = *mbseqlen;
     521       44454 :         unsigned int this_char = 0;
     522             :         unsigned char next_char;
     523             : 
     524       44454 :         *status = SUCCESS;
     525             : 
     526       44454 :         if (mbspace <= 0) { /* no room to record bytes: hand back one raw byte */
     527           0 :                 *mbseqlen = 0;
     528           0 :                 CHECK_LEN(pos, 1);
     529           0 :                 *newpos = pos + 1;
     530           0 :                 return str[pos];
     531             :         }
     532             : 
     533       44454 :         switch (charset) {
     534             :                 case cs_utf_8:
     535             :                         {
     536             :                                 unsigned char c;
     537        8048 :                                 CHECK_LEN(pos, 1);
     538        8048 :                                 c = str[pos];
     539        8048 :                                 if (c < 0x80) { /* single byte */
     540         154 :                                         MB_WRITE(c);
     541         154 :                                         this_char = c;
     542         154 :                                         pos++;
     543        7894 :                                 } else if (c < 0xc2) { /* 0x80..0xc1 is rejected as a lead byte */
     544         165 :                                         MB_FAILURE(pos);
     545        7729 :                                 } else if (c < 0xe0) { /* two-byte sequence */
     546        1931 :                                         CHECK_LEN(pos, 2);
     547        1923 :                                         if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
     548           6 :                                                 MB_FAILURE(pos);
     549             :                                         }
     550        1917 :                                         this_char = ((c & 0x1f) << 6) | (str[pos + 1] & 0x3f);
     551        1917 :                                         if (this_char < 0x80) { /* cannot be true here: c >= 0xc2 gives this_char >= 0x80 */
     552           0 :                                                 MB_FAILURE(pos);
     553             :                                         }
     554        1917 :                                         MB_WRITE((unsigned char)c);
     555        1917 :                                         MB_WRITE((unsigned char)str[pos + 1]);
     556        1917 :                                         pos += 2;
     557        5798 :                                 } else if (c < 0xf0) { /* three-byte sequence */
     558        1949 :                                         CHECK_LEN(pos, 3);
     559        1925 :                                         if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
     560           3 :                                                 MB_FAILURE(pos);
     561             :                                         }
     562        1922 :                                         if (str[pos + 2] < 0x80 || str[pos + 2] > 0xbf) {
     563           2 :                                                 MB_FAILURE(pos);
     564             :                                         }
     565        1920 :                                         this_char = ((c & 0x0f) << 12) | ((str[pos + 1] & 0x3f) << 6) | (str[pos + 2] & 0x3f);
     566        1920 :                                         if (this_char < 0x800) { /* value would fit a shorter form */
     567          72 :                                                 MB_FAILURE(pos);
     568        1848 :                                         } else if (this_char >= 0xd800 && this_char <= 0xdfff) { /* 0xd800..0xdfff is rejected */
     569          64 :                                                 MB_FAILURE(pos);
     570             :                                         }
     571        1784 :                                         MB_WRITE((unsigned char)c);
     572        1784 :                                         MB_WRITE((unsigned char)str[pos + 1]);
     573        1784 :                                         MB_WRITE((unsigned char)str[pos + 2]);
     574        1784 :                                         pos += 3;
     575        3849 :                                 } else if (c < 0xf5) { /* four-byte sequence */
     576        1265 :                                         CHECK_LEN(pos, 4);
     577        1265 :                                         if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
     578           0 :                                                 MB_FAILURE(pos);
     579             :                                         }
     580        1265 :                                         if (str[pos + 2] < 0x80 || str[pos + 2] > 0xbf) {
     581           0 :                                                 MB_FAILURE(pos);
     582             :                                         }
     583        1265 :                                         if (str[pos + 3] < 0x80 || str[pos + 3] > 0xbf) {
     584           0 :                                                 MB_FAILURE(pos);
     585             :                                         }
     586        1265 :                                         this_char = ((c & 0x07) << 18) | ((str[pos + 1] & 0x3f) << 12) | ((str[pos + 2] & 0x3f) << 6) | (str[pos + 3] & 0x3f);
     587        1265 :                                         if (this_char < 0x10000 || this_char > 0x10FFFF) { /* fits a shorter form, or above 0x10FFFF */
     588         255 :                                                 MB_FAILURE(pos);
     589             :                                         }
     590        1010 :                                         MB_WRITE((unsigned char)c);
     591        1010 :                                         MB_WRITE((unsigned char)str[pos + 1]);
     592        1010 :                                         MB_WRITE((unsigned char)str[pos + 2]);
     593        1010 :                                         MB_WRITE((unsigned char)str[pos + 3]);
     594        1010 :                                         pos += 4;
     595             :                                 } else { /* lead bytes 0xf5 and above are rejected */
     596        2584 :                                         MB_FAILURE(pos);
     597             :                                 }
     598             :                         }
     599        4865 :                         break;
     600             :                 case cs_big5:
     601             :                 case cs_gb2312:
     602             :                 case cs_big5hkscs:
     603             :                         {
     604        1262 :                                 CHECK_LEN(pos, 1);
     605        1262 :                                 this_char = str[pos++];
     606             :                                 /* check if this is the first of a 2-byte sequence */
     607        1766 :                                 if (this_char >= 0x81 && this_char <= 0xfe) {
     608             :                                         /* peek at the next char */
     609        1260 :                                         CHECK_LEN(pos, 1);
     610        1134 :                                         next_char = str[pos++];
     611        1638 :                                         if ((next_char >= 0x40 && next_char <= 0x7e) ||
     612             :                                                         (next_char >= 0xa1 && next_char <= 0xfe)) {
     613             :                                                 /* yes, this is a wide char */
     614         504 :                                                 MB_WRITE(this_char);
     615         504 :                                                 MB_WRITE(next_char);
     616         504 :                                                 this_char = (this_char << 8) | next_char;
     617             :                                         } else {
     618         630 :                                                 MB_FAILURE(pos); /* NOTE(review): pos is already start + 2 here, so *newpos becomes start + 3 */
     619             :                                         }
     620             :                                 } else {
     621           2 :                                         MB_WRITE(this_char);
     622             :                                 }
     623             :                         }
     624         506 :                         break;
     625             :                 case cs_sjis:
     626             :                         {
     627         681 :                                 CHECK_LEN(pos, 1);
     628         681 :                                 this_char = str[pos++];
     629             :                                 /* check if this is the first of a 2-byte sequence */
     630         930 :                                 if ((this_char >= 0x81 && this_char <= 0x9f) ||
     631             :                                         (this_char >= 0xe0 && this_char <= 0xfc)) {
     632             :                                         /* peek at the next char */
     633         609 :                                         CHECK_LEN(pos, 1);
     634         549 :                                         next_char = str[pos++];
     635        1218 :                                         if ((next_char >= 0x40 && next_char <= 0x7e) ||
     636         420 :                                                 (next_char >= 0x80 && next_char <= 0xfc))
     637             :                                         {
     638             :                                                 /* yes, this is a wide char */
     639         249 :                                                 MB_WRITE(this_char);
     640         249 :                                                 MB_WRITE(next_char);
     641         249 :                                                 this_char = (this_char << 8) | next_char;
     642             :                                         } else {
     643         300 :                                                 MB_FAILURE(pos);
     644             :                                         }
     645             :                                 } else {
     646          72 :                                         MB_WRITE(this_char);
     647             :                                 }
     648         321 :                                 break;
     649             :                         }
     650             :                 case cs_eucjp:
     651             :                         {
     652        2402 :                                 CHECK_LEN(pos, 1);
     653        2402 :                                 this_char = str[pos++];
     654             :                                 /* check if this is the first of a multi-byte sequence */
     655        2602 :                                 if (this_char >= 0xa1 && this_char <= 0xfe) {
     656             :                                         /* peek at the next char */
     657         797 :                                         CHECK_LEN(pos, 1);
     658         576 :                                         next_char = str[pos++];
     659         776 :                                         if (next_char >= 0xa1 && next_char <= 0xfe) {
     660             :                                                 /* yes, this is a jis kanji char */
     661         200 :                                                 MB_WRITE(this_char);
     662         200 :                                                 MB_WRITE(next_char);
     663         200 :                                                 this_char = (this_char << 8) | next_char;
     664             :                                         } else {
     665         376 :                                                 MB_FAILURE(pos);
     666             :                                         }
     667        1605 :                                 } else if (this_char == 0x8e) {
     668             :                                         /* peek at the next char */
     669         661 :                                         CHECK_LEN(pos, 1);
     670         660 :                                         next_char = str[pos++];
     671        1103 :                                         if (next_char >= 0xa1 && next_char <= 0xdf) {
     672             :                                                 /* JIS X 0201 kana */
     673         443 :                                                 MB_WRITE(this_char);
     674         443 :                                                 MB_WRITE(next_char);
     675         443 :                                                 this_char = (this_char << 8) | next_char;
     676             :                                         } else {
     677         217 :                                                 MB_FAILURE(pos);
     678             :                                         }
     679         944 :                                 } else if (this_char == 0x8f) {
     680             :                                         /* peek at the next two chars */
     681             :                                         unsigned char next2_char;
     682         661 :                                         CHECK_LEN(pos, 2);
     683         565 :                                         next_char = str[pos];
     684         565 :                                         next2_char = str[pos + 1];
     685         565 :                                         pos += 2;
     686         754 :                                         if ((next_char >= 0xa1 && next_char <= 0xfe) &&
     687             :                                                 (next2_char >= 0xa1 && next2_char <= 0xfe)) {
     688             :                                                 /* JIS X 0212 hojo-kanji */
     689         189 :                                                 MB_WRITE(this_char);
     690         189 :                                                 MB_WRITE(next_char);
     691         189 :                                                 MB_WRITE(next2_char);
     692         189 :                                                 this_char = (this_char << 16) | (next_char << 8) | next2_char;
     693             :                                         } else {
     694         376 :                                                 MB_FAILURE(pos);
     695             :                                         }
     696             :                                 } else {
     697         283 :                                         MB_WRITE(this_char);
     698             :                                 }
     699        1115 :                                 break;
     700             :                         }
     701             :                 default:
     702             :                         /* single-byte charsets */
     703       32061 :                         CHECK_LEN(pos, 1);
     704       32061 :                         this_char = str[pos++];
     705       32061 :                         MB_WRITE(this_char);
     706             :                         break;
     707             :         }
     708       38868 :         MB_RETURN;
     709             : }
     710             : /* }}} */
     711             : 
     712             : /* {{{ entity_charset determine_charset
     713             :  * returns the charset identifier based on current locale or a hint.
     714             :  * defaults to iso-8859-1
                     :  *
                     :  * charset_hint  charset name requested by the caller; may be NULL or ""
                     :  *
                     :  * Behaviour, in order:
                     :  *  - NULL hint: cs_8859_1 is returned immediately.
                     :  *  - non-empty hint: looked up in charset_map (case-insensitive, the whole
                     :  *    name must match).
                     :  *  - empty hint: the name is taken from the first source that yields one:
                     :  *    the mbstring internal encoding (when built with HAVE_MBSTRING),
                     :  *    SG(default_charset), nl_langinfo(CODESET), then the codeset part of
                     :  *    the LC_CTYPE locale name.
                     :  * A name that is not in charset_map raises an E_WARNING and the function
                     :  * returns cs_8859_1. */
     715       14445 : static enum entity_charset determine_charset(char *charset_hint TSRMLS_DC)
     716             : {
     717             :         int i;
     718       14445 :         enum entity_charset charset = cs_8859_1;
     719       14445 :         int len = 0;
     720       14445 :         zval *uf_result = NULL;
     721             : 
     722             :         /* Guarantee default behaviour for backwards compatibility */
     723       14445 :         if (charset_hint == NULL)
     724        2284 :                 return cs_8859_1;
     725             : 
     726       12161 :         if ((len = strlen(charset_hint)) != 0) { /* explicit non-empty hint: look it up directly */
     727       12138 :                 goto det_charset;
     728             :         }
     729             : #if HAVE_MBSTRING
     730             : #if !defined(COMPILE_DL_MBSTRING)
     731             :         /* XXX: Ugly things. Why don't we look for a more sophisticated way? */
     732          23 :         switch (MBSTRG(current_internal_encoding)) {
     733             :                 case mbfl_no_encoding_8859_1:
     734           2 :                         return cs_8859_1;
     735             : 
     736             :                 case mbfl_no_encoding_utf8:
     737           0 :                         return cs_utf_8;
     738             : 
     739             :                 case mbfl_no_encoding_euc_jp:
     740             :                 case mbfl_no_encoding_eucjp_win:
     741           2 :                         return cs_eucjp;
     742             : 
     743             :                 case mbfl_no_encoding_sjis:
     744             :                 case mbfl_no_encoding_sjis_open:
     745             :                 case mbfl_no_encoding_cp932:
     746           2 :                         return cs_sjis;
     747             : 
     748             :                 case mbfl_no_encoding_cp1252:
     749           3 :                         return cs_cp1252;
     750             : 
     751             :                 case mbfl_no_encoding_8859_15:
     752           2 :                         return cs_8859_15;
     753             : 
     754             :                 case mbfl_no_encoding_big5:
     755           0 :                         return cs_big5;
     756             : 
     757             :                 case mbfl_no_encoding_euc_cn:
     758             :                 case mbfl_no_encoding_hz:
     759             :                 case mbfl_no_encoding_cp936:
     760           0 :                         return cs_gb2312;
     761             : 
     762             :                 case mbfl_no_encoding_koi8r:
     763           0 :                         return cs_koi8r;
     764             : 
     765             :                 case mbfl_no_encoding_cp866:
     766           0 :                         return cs_cp866;
     767             : 
     768             :                 case mbfl_no_encoding_cp1251:
     769           2 :                         return cs_cp1251;
     770             : 
     771             :                 case mbfl_no_encoding_8859_5:
     772           0 :                         return cs_8859_5;
     773             : 
     774             :                 default: /* any other internal encoding: fall through to the next source */
     775             :                         ;
     776             :         }
     777             : #else
     778             :         {
     779             :                 zval nm_mb_internal_encoding;
     780             : 
     781             :                 ZVAL_STRING(&nm_mb_internal_encoding, "mb_internal_encoding", 0);
     782             : 
     783             :                 if (call_user_function_ex(CG(function_table), NULL, &nm_mb_internal_encoding, &uf_result, 0, NULL, 1, NULL TSRMLS_CC) != FAILURE) {
     784             : 
     785             :                         charset_hint = Z_STRVAL_P(uf_result);
     786             :                         len = Z_STRLEN_P(uf_result);
     787             :                         
     788             :                         if (charset_hint != NULL && len != 0) {
     789             :                                 if (len == 4) { /* sizeof(none|auto|pass)-1 */ /* NOTE(review): a 4-character name that is not pass/auto/none never reaches the goto below and falls through to default_charset */
     790             :                                         if (!memcmp("pass", charset_hint, sizeof("pass") - 1) ||
     791             :                                                 !memcmp("auto", charset_hint, sizeof("auto") - 1) ||
     792             :                                                 !memcmp("none", charset_hint, sizeof("none") - 1)) {
     793             : 
     794             :                                                 charset_hint = NULL;
     795             :                                                 len = 0;
     796             :                                         }
     797             :                                 } else {
     798             :                                         /* Jump to det_charset only if mbstring isn't one of above eq pass, auto, none.
     799             :                                            Otherwise try default_charset next */
     800             :                                         goto det_charset;
     801             :                                 }
     802             :                         }
     803             :                 }
     804             :         }
     805             : #endif
     806             : #endif
     807             : 
     808          10 :         charset_hint = SG(default_charset);
     809          10 :         if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) {
     810           6 :                 goto det_charset;
     811             :         }
     812             : 
     813             :         /* try to detect the charset for the locale */
     814             : #if HAVE_NL_LANGINFO && HAVE_LOCALE_H && defined(CODESET)
     815           4 :         charset_hint = nl_langinfo(CODESET);
     816           4 :         if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) {
     817           4 :                 goto det_charset;
     818             :         }
     819             : #endif
     820             : 
     821             : #if HAVE_LOCALE_H
     822             :         /* try to figure out the charset from the locale */
     823             :         {
     824             :                 char *localename;
     825             :                 char *dot, *at;
     826             : 
     827             :                 /* lang[_territory][.codeset][@modifier] */
     828           0 :                 localename = setlocale(LC_CTYPE, NULL);
     829             : 
     830           0 :                 dot = strchr(localename, '.'); /* NOTE(review): localename is used without a NULL check */
     831           0 :                 if (dot) {
     832           0 :                         dot++;
     833             :                         /* locale specifies a codeset */
     834           0 :                         at = strchr(dot, '@');
     835           0 :                         if (at)
     836           0 :                                 len = at - dot; /* codeset ends where the @modifier starts */
     837             :                         else
     838           0 :                                 len = strlen(dot);
     839           0 :                         charset_hint = dot;
     840             :                 } else {
     841             :                         /* no explicit name; see if the name itself
     842             :                          * is the charset */
     843           0 :                         charset_hint = localename;
     844           0 :                         len = strlen(charset_hint);
     845             :                 }
     846             :         }
     847             : #endif
     848             : 
     849             : det_charset:
     850             : 
     851       12148 :         if (charset_hint) {
     852       12148 :                 int found = 0;
     853             :                 
     854             :                 /* now walk the charset map and look for the codeset */
     855       98167 :                 for (i = 0; charset_map[i].codeset; i++) {
     856       98161 :                         if (len == strlen(charset_map[i].codeset) && strncasecmp(charset_hint, charset_map[i].codeset, len) == 0) {
     857       12142 :                                 charset = charset_map[i].charset;
     858       12142 :                                 found = 1;
     859       12142 :                                 break;
     860             :                         }
     861             :                 }
     862       12148 :                 if (!found) {
     863           6 :                         php_error_docref(NULL TSRMLS_CC, E_WARNING, "charset `%s' not supported, assuming iso-8859-1",
     864             :                                         charset_hint); /* NOTE(review): %s prints up to the NUL, not len bytes, so a locale-derived hint would include its @modifier */
     865             :                 }
     866             :         }
     867       12148 :         if (uf_result != NULL) { /* only set by the call_user_function_ex path above */
     868           0 :                 zval_ptr_dtor(&uf_result);
     869             :         }
     870       12148 :         return charset;
     871             : }
     872             : /* }}} */
     873             : 
     874             : /* {{{ php_utf32_utf8
                     :  * Writes the value k to buf as a UTF-8 style byte sequence of 1 to 6 bytes,
                     :  * followed by a terminating NUL.
                     :  *
                     :  * buf  destination; must have room for the sequence plus the NUL
                     :  *      (at most 7 bytes are written)
                     :  * k    value to encode
                     :  *
                     :  * Returns the number of bytes written, not counting the NUL.
                     :  *
                     :  * k is not validated: values of 0x200000 and above are emitted in the
                     :  * 5- and 6-byte forms, and values in 0xd800..0xdfff are encoded like any
                     :  * other 3-byte value. */
     875       69690 : size_t php_utf32_utf8(unsigned char *buf, unsigned k)
     876             : {
     877       69690 :         size_t retval = 0;
     878             : 
     879       69690 :         if (k < 0x80) { /* 1 byte */
     880           2 :                 buf[0] = k;
     881           2 :                 retval = 1;
     882       69688 :         } else if (k < 0x800) { /* 2 bytes */
     883       43836 :                 buf[0] = 0xc0 | (k >> 6);
     884       43836 :                 buf[1] = 0x80 | (k & 0x3f);
     885       43836 :                 retval = 2;
     886       25852 :         } else if (k < 0x10000) { /* 3 bytes */
     887       25852 :                 buf[0] = 0xe0 | (k >> 12);
     888       25852 :                 buf[1] = 0x80 | ((k >> 6) & 0x3f);
     889       25852 :                 buf[2] = 0x80 | (k & 0x3f);
     890       25852 :                 retval = 3;
     891           0 :         } else if (k < 0x200000) { /* 4 bytes */
     892           0 :                 buf[0] = 0xf0 | (k >> 18);
     893           0 :                 buf[1] = 0x80 | ((k >> 12) & 0x3f);
     894           0 :                 buf[2] = 0x80 | ((k >> 6) & 0x3f);
     895           0 :                 buf[3] = 0x80 | (k & 0x3f);
     896           0 :                 retval = 4;
     897           0 :         } else if (k < 0x4000000) { /* 5 bytes */
     898           0 :                 buf[0] = 0xf8 | (k >> 24);
     899           0 :                 buf[1] = 0x80 | ((k >> 18) & 0x3f);
     900           0 :                 buf[2] = 0x80 | ((k >> 12) & 0x3f);
     901           0 :                 buf[3] = 0x80 | ((k >> 6) & 0x3f);
     902           0 :                 buf[4] = 0x80 | (k & 0x3f);
     903           0 :                 retval = 5;
     904             :         } else { /* 6 bytes */
     905           0 :                 buf[0] = 0xfc | (k >> 30);
     906           0 :                 buf[1] = 0x80 | ((k >> 24) & 0x3f);
     907           0 :                 buf[2] = 0x80 | ((k >> 18) & 0x3f);
     908           0 :                 buf[3] = 0x80 | ((k >> 12) & 0x3f);
     909           0 :                 buf[4] = 0x80 | ((k >> 6) & 0x3f);
     910           0 :                 buf[5] = 0x80 | (k & 0x3f);
     911           0 :                 retval = 6;
     912             :         }
     913       69690 :         buf[retval] = '\0'; /* terminator is written after the sequence and not counted in retval */
     914             : 
     915       69690 :         return retval;
     916             : }
     917             : /* }}} */
     918             : 
     919             : /* {{{ php_unescape_html_entities
     920             :  */
     921         283 : PHPAPI char *php_unescape_html_entities(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset TSRMLS_DC)
     922             : {
     923             :         int retlen;
     924             :         int j, k;
     925             :         char *replaced, *ret, *p, *q, *lim, *next;
     926         283 :         enum entity_charset charset = determine_charset(hint_charset TSRMLS_CC);
     927             :         unsigned char replacement[15];
     928             :         int replacement_len;
     929             : 
     930         283 :         ret = estrndup(old, oldlen);
     931         283 :         retlen = oldlen;
     932         283 :         if (!retlen) {
     933           2 :                 goto empty_source;
     934             :         }
     935             :         
     936         281 :         if (all) {
     937             :                 /* look for a match in the maps for this charset */
     938        7025 :                 for (j = 0; entity_map[j].charset != cs_terminator; j++) {
     939        6744 :                         if (entity_map[j].charset != charset)
     940        4015 :                                 continue;
     941             : 
     942      215200 :                         for (k = entity_map[j].basechar; k <= entity_map[j].endchar; k++) {
     943             :                                 unsigned char entity[32];
     944      212471 :                                 int entity_length = 0;
     945             : 
     946      212471 :                                 if (entity_map[j].table[k - entity_map[j].basechar] == NULL)
     947      144241 :                                         continue;
     948             : 
     949       68230 :                                 entity_length = slprintf(entity, sizeof(entity), "&%s;", entity_map[j].table[k - entity_map[j].basechar]);
     950       68230 :                                 if (entity_length >= sizeof(entity)) {
     951           0 :                                         continue;
     952             :                                 }
     953             : 
     954             :                                 /* When we have MBCS entities in the tables above, this will need to handle it */
     955       68230 :                                 replacement_len = 0;
     956       68230 :                                 switch (charset) {
     957             :                                         case cs_8859_1:
     958             :                                         case cs_cp1252:
     959             :                                         case cs_8859_15:
     960             :                                         case cs_cp1251:
     961             :                                         case cs_8859_5:
     962             :                                         case cs_cp866:
     963             :                                         case cs_koi8r:
     964         774 :                                                 replacement[0] = k;
     965         774 :                                                 replacement[1] = '\0';
     966         774 :                                                 replacement_len = 1;
     967         774 :                                                 break;
     968             : 
     969             :                                         case cs_big5:
     970             :                                         case cs_gb2312:
     971             :                                         case cs_big5hkscs:
     972             :                                         case cs_sjis:
     973             :                                         case cs_eucjp:
     974             :                                                 /* we cannot properly handle those multibyte encodings
     975             :                                                  * with php_str_to_str. skip it. */ 
     976           0 :                                                 continue;
     977             : 
     978             :                                         case cs_utf_8:
     979       67456 :                                                 replacement_len = php_utf32_utf8(replacement, k);
     980       67456 :                                                 break;
     981             : 
     982             :                                         default:
     983           0 :                                                 php_error_docref(NULL TSRMLS_CC, E_WARNING, "cannot yet handle MBCS!");
     984           0 :                                                 efree(ret);
     985           0 :                                                 return NULL;
     986             :                                 }
     987             : 
     988       68230 :                                 if (php_memnstr(ret, entity, entity_length, ret+retlen)) {
     989         267 :                                         replaced = php_str_to_str(ret, retlen, entity, entity_length, replacement, replacement_len, &retlen);
     990         267 :                                         efree(ret);
     991         267 :                                         ret = replaced;
     992             :                                 }
     993             :                         }
     994             :                 }
     995             :         }
     996             : 
     997        1686 :         for (j = 0; basic_entities[j].charcode != 0; j++) {
     998             : 
     999        1405 :                 if (basic_entities[j].flags && (quote_style & basic_entities[j].flags) == 0)
    1000          31 :                         continue;
    1001             :                 
    1002        1374 :                 replacement[0] = (unsigned char)basic_entities[j].charcode;
    1003        1374 :                 replacement[1] = '\0';
    1004             : 
    1005        1374 :                 if (php_memnstr(ret, basic_entities[j].entity, basic_entities[j].entitylen, ret+retlen)) {              
    1006          12 :                         replaced = php_str_to_str(ret, retlen, basic_entities[j].entity, basic_entities[j].entitylen, replacement, 1, &retlen);
    1007          12 :                         efree(ret);
    1008          12 :                         ret = replaced;
    1009             :                 }
    1010             :         }
    1011             : 
    1012             :         /* replace numeric entities & "&amp;" */
    1013         281 :         lim = ret + retlen;
    1014        1276 :         for (p = ret, q = ret; p < lim;) {
    1015             :                 int code;
    1016             : 
    1017         714 :                 if (p[0] == '&') {
    1018          16 :                         if (p + 2 < lim) {
    1019          16 :                                 if (p[1] == '#') {
    1020           6 :                                         int invalid_code = 0;
    1021             : 
    1022           6 :                                         if (p[2] == 'x' || p[2] == 'X') {
    1023           0 :                                                 code = strtol(p + 3, &next, 16);
    1024             :                                         } else {
    1025           6 :                                                 code = strtol(p + 2, &next, 10);
    1026             :                                         }
    1027             : 
    1028           9 :                                         if ((code == '\'' && !(quote_style & ENT_HTML_QUOTE_SINGLE)) ||
    1029           3 :                                                 (code == '"' && !(quote_style & ENT_HTML_QUOTE_DOUBLE))) {
    1030           3 :                                                 invalid_code = 1;
    1031             :                                         }
    1032             : 
    1033           9 :                                         if (next != NULL && *next == ';' && !invalid_code) {
    1034           3 :                                                 switch (charset) {
    1035             :                                                         case cs_utf_8:
    1036           2 :                                                                 q += php_utf32_utf8(q, code);
    1037           2 :                                                                 break;
    1038             : 
    1039             :                                                         case cs_8859_1:
    1040             :                                                         case cs_8859_5:
    1041             :                                                         case cs_8859_15:
    1042           1 :                                                                 if ((code >= 0x80 && code < 0xa0) || code > 0xff) {
    1043           0 :                                                                         invalid_code = 1;
    1044             :                                                                 } else {
    1045           1 :                                                                         *(q++) = code;
    1046             :                                                                 }
    1047           1 :                                                                 break;
    1048             : 
    1049             :                                                         case cs_cp1252:
    1050           0 :                                                                 if (code > 0xff) {
    1051           0 :                                                                         invalid_code = 1;
    1052             :                                                                 } else {
    1053           0 :                                                                         *(q++) = code;
    1054             :                                                                 }
    1055           0 :                                                                 break;
    1056             : 
    1057             :                                                         case cs_cp1251:
    1058             :                                                         case cs_cp866:
    1059             :                                                         case cs_big5:
    1060             :                                                         case cs_big5hkscs:
    1061             :                                                         case cs_sjis:
    1062             :                                                         case cs_eucjp:
    1063           0 :                                                                 if (code >= 0x80) {
    1064           0 :                                                                         invalid_code = 1;
    1065             :                                                                 } else {
    1066           0 :                                                                         *(q++) = code;
    1067             :                                                                 }
    1068           0 :                                                                 break;
    1069             : 
    1070             :                                                         case cs_gb2312:
    1071           0 :                                                                 if (code >= 0x81) {
    1072           0 :                                                                         invalid_code = 1;
    1073             :                                                                 } else {
    1074           0 :                                                                         *(q++) = code;
    1075             :                                                                 }
    1076           0 :                                                                 break;
    1077             : 
    1078             :                                                         default:
    1079             :                                                                 /* for backwards compatilibity */
    1080           0 :                                                                 invalid_code = 1;
    1081             :                                                                 break;
    1082             :                                                 }
    1083           3 :                                                 if (invalid_code) {
    1084           0 :                                                         for (; p <= next; p++) {
    1085           0 :                                                                 *(q++) = *p;
    1086             :                                                         }
    1087             :                                                 }
    1088           3 :                                                 p = next + 1;
    1089             :                                         } else {
    1090           3 :                                                 *(q++) = *(p++);        
    1091           3 :                                                 *(q++) = *(p++);        
    1092             :                                         }
    1093          56 :                                 } else if (p + 4 < lim &&
    1094          28 :                                                         p[1] == 'a' && p[2] == 'm' &&p[3] == 'p' &&
    1095           9 :                                                         p[4] == ';') {
    1096           9 :                                         *(q++) = '&';
    1097           9 :                                         p += 5;
    1098             :                                 } else {
    1099           1 :                                         *(q++) = *(p++);
    1100           1 :                                         *(q++) = *(p++);
    1101             :                                 }
    1102             :                         } else {
    1103           0 :                                 *(q++) = *(p++);        
    1104             :                         }
    1105             :                 } else {
    1106         698 :                         *(q++) = *(p++);        
    1107             :                 }
    1108             :         }
    1109         281 :         *q = '\0';
    1110         281 :         retlen = (size_t)(q - ret);
    1111             : empty_source:   
    1112         283 :         *newlen = retlen;
    1113         283 :         return ret;
    1114             : }
    1115             : /* }}} */
    1116             : 
    1117        1695 : PHPAPI char *php_escape_html_entities(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset TSRMLS_DC)
    1118             : {
    1119        1695 :         return php_escape_html_entities_ex(old, oldlen, newlen, all, quote_style, hint_charset, 1 TSRMLS_CC);
    1120             : }
    1121             : 
    1122             : 
    1123             : /* {{{ php_escape_html_entities
    1124             :  */
    1125       14127 : PHPAPI char *php_escape_html_entities_ex(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset, zend_bool double_encode TSRMLS_DC)
    1126             : {
    1127             :         int i, j, maxlen, len;
    1128             :         char *replaced;
    1129       14127 :         enum entity_charset charset = determine_charset(hint_charset TSRMLS_CC);
    1130             :         int matches_map;
    1131             : 
    1132       14127 :         maxlen = 2 * oldlen;
    1133       14127 :         if (maxlen < 128)
    1134       14071 :                 maxlen = 128;
    1135       14127 :         replaced = emalloc (maxlen);
    1136       14127 :         len = 0;
    1137       14127 :         i = 0;
    1138       67215 :         while (i < oldlen) {
    1139             :                 unsigned char mbsequence[16];   /* allow up to 15 characters in a multibyte sequence */
    1140       44454 :                 int mbseqlen = sizeof(mbsequence);
    1141       44454 :                 int status = SUCCESS;
    1142       44454 :                 unsigned int this_char = get_next_char(charset, old, oldlen, &i, mbsequence, &mbseqlen, &status);
    1143             : 
    1144       44454 :                 if(status == FAILURE) {
    1145             :                         /* invalid MB sequence */
    1146        5586 :                         if (quote_style & ENT_HTML_IGNORE_ERRORS) {
    1147          93 :                                 continue;
    1148             :                         }
    1149        5493 :                         efree(replaced);
    1150        5493 :                         if(!PG(display_errors)) {
    1151           0 :                                 php_error_docref(NULL TSRMLS_CC, E_WARNING, "Invalid multibyte sequence in argument");
    1152             :                         }
    1153        5493 :                         *newlen = 0;
    1154        5493 :                         return STR_EMPTY_ALLOC();
    1155             :                 }
    1156       38868 :                 matches_map = 0;
    1157             : 
    1158       38868 :                 if (len + 16 > maxlen)
    1159           2 :                         replaced = erealloc (replaced, maxlen += 128);
    1160             : 
    1161       38868 :                 if (all) {
    1162             :                         /* look for a match in the maps for this charset */
    1163        2547 :                         unsigned char *rep = NULL;
    1164             : 
    1165             : 
    1166       61117 :                         for (j = 0; entity_map[j].charset != cs_terminator; j++) {
    1167       62909 :                                 if (entity_map[j].charset == charset
    1168       58712 :                                                 && this_char >= entity_map[j].basechar
    1169        4197 :                                                 && this_char <= entity_map[j].endchar) {
    1170         142 :                                         rep = (unsigned char*)entity_map[j].table[this_char - entity_map[j].basechar];
    1171         142 :                                         if (rep == NULL) {
    1172             :                                                 /* there is no entity for this position; fall through and
    1173             :                                                  * just output the character itself */
    1174           1 :                                                 break;
    1175             :                                         }
    1176             : 
    1177         141 :                                         matches_map = 1;
    1178         141 :                                         break;
    1179             :                                 }
    1180             :                         }
    1181             : 
    1182        2547 :                         if (matches_map) {
    1183         141 :                                 int l = strlen(rep);
    1184             :                                 /* increase the buffer size */
    1185         141 :                                 if (len + 2 + l >= maxlen) {
    1186           0 :                                         replaced = erealloc(replaced, maxlen += 128);
    1187             :                                 }
    1188             : 
    1189         141 :                                 replaced[len++] = '&';
    1190         141 :                                 strlcpy(replaced + len, rep, maxlen);
    1191         141 :                                 len += l;
    1192         141 :                                 replaced[len++] = ';';
    1193             :                         }
    1194             :                 }
    1195       38868 :                 if (!matches_map) {     
    1196       38727 :                         int is_basic = 0;
    1197             : 
    1198       38727 :                         if (this_char == '&') {
    1199         168 :                                 if (double_encode) {
    1200             : encode_amp:
    1201         138 :                                         memcpy(replaced + len, "&amp;", sizeof("&amp;") - 1);
    1202         138 :                                         len += sizeof("&amp;") - 1;
    1203             :                                 } else {
    1204          50 :                                         char *e = memchr(old + i, ';', oldlen - i);
    1205          50 :                                         char *s = old + i;
    1206             : 
    1207          50 :                                         if (!e || (e - s) > 10) { /* minor optimization to avoid "entities" over 10 chars in length */
    1208             :                                                 goto encode_amp;
    1209             :                                         } else {
    1210          40 :                                                 if (*s == '#') { /* numeric entities */
    1211          12 :                                                         s++;
    1212             :                                                         /* Hex (&#x5A;) */
    1213          16 :                                                         if (*s == 'x' || *s == 'X') {
    1214           6 :                                                                 s++;
    1215          20 :                                                                 while (s < e) {
    1216          10 :                                                                         if (!isxdigit((int)*(unsigned char *)s++)) {
    1217           2 :                                                                                 goto encode_amp;
    1218             :                                                                         }
    1219             :                                                                 }
    1220             :                                                         /* Dec (&#90;)*/
    1221             :                                                         } else {
    1222          22 :                                                                 while (s < e) {
    1223          12 :                                                                         if (!isdigit((int)*(unsigned char *)s++)) {
    1224           2 :                                                                                 goto encode_amp;
    1225             :                                                                         }
    1226             :                                                                 }
    1227             :                                                         }
    1228             :                                                 } else { /* text entities */
    1229         124 :                                                         while (s < e) {
    1230          74 :                                                                 if (!isalnum((int)*(unsigned char *)s++)) {
    1231           6 :                                                                         goto encode_amp;
    1232             :                                                                 }
    1233             :                                                         }
    1234             :                                                 }
    1235          30 :                                                 replaced[len++] = '&';
    1236             :                                         }
    1237             :                                 }
    1238         168 :                                 is_basic = 1;
    1239             :                         } else {
    1240      460972 :                                 for (j = 0; basic_entities[j].charcode != 0; j++) {
    1241      192856 :                                         if ((basic_entities[j].charcode != this_char) ||
    1242         391 :                                                         (basic_entities[j].flags &&
    1243         218 :                                                         (quote_style & basic_entities[j].flags) == 0)) {
    1244      191927 :                                                 continue;
    1245             :                                         }
    1246             : 
    1247         320 :                                         memcpy(replaced + len, basic_entities[j].entity, basic_entities[j].entitylen);
    1248         320 :                                         len += basic_entities[j].entitylen;
    1249             :                 
    1250         320 :                                         is_basic = 1;
    1251         320 :                                         break;
    1252             :                                 }
    1253             :                         }
    1254             : 
    1255       38727 :                         if (!is_basic) {
    1256             :                                 /* a wide char without a named entity; pass through the original sequence */
    1257       38239 :                                 if (mbseqlen > 1) {
    1258        6279 :                                         memcpy(replaced + len, mbsequence, mbseqlen);
    1259        6279 :                                         len += mbseqlen;
    1260             :                                 } else {
    1261       31960 :                                         replaced[len++] = (unsigned char)this_char;
    1262             :                                 }
    1263             :                         }
    1264             :                 }
    1265             :         }
    1266        8634 :         replaced[len] = '\0';
    1267        8634 :         *newlen = len;
    1268             : 
    1269        8634 :         return replaced;
    1270             : 
    1271             : 
    1272             : }
    1273             : /* }}} */
    1274             : 
    1275             : /* {{{ php_html_entities
    1276             :  */
    1277       12425 : static void php_html_entities(INTERNAL_FUNCTION_PARAMETERS, int all)
    1278             : {
    1279       12425 :         char *str, *hint_charset = NULL;
    1280       12425 :         int str_len, hint_charset_len = 0;
    1281             :         int len;
    1282       12425 :         long quote_style = ENT_COMPAT;
    1283             :         char *replaced;
    1284       12425 :         zend_bool double_encode = 1;
    1285             : 
    1286       12425 :         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|ls!b", &str, &str_len, &quote_style, &hint_charset, &hint_charset_len, &double_encode) == FAILURE) {
    1287           2 :                 return;
    1288             :         }
    1289             : 
    1290       12423 :         replaced = php_escape_html_entities_ex(str, str_len, &len, all, quote_style, hint_charset, double_encode TSRMLS_CC);
    1291       12423 :         RETVAL_STRINGL(replaced, len, 0);
    1292             : }
    1293             : /* }}} */
    1294             : 
    1295             : #define HTML_SPECIALCHARS       0
    1296             : #define HTML_ENTITIES           1
    1297             : 
    1298             : /* {{{ register_html_constants
    1299             :  */
    1300       19341 : void register_html_constants(INIT_FUNC_ARGS)
    1301             : {
    1302       19341 :         REGISTER_LONG_CONSTANT("HTML_SPECIALCHARS", HTML_SPECIALCHARS, CONST_PERSISTENT|CONST_CS);
    1303       19341 :         REGISTER_LONG_CONSTANT("HTML_ENTITIES", HTML_ENTITIES, CONST_PERSISTENT|CONST_CS);
    1304       19341 :         REGISTER_LONG_CONSTANT("ENT_COMPAT", ENT_COMPAT, CONST_PERSISTENT|CONST_CS);
    1305       19341 :         REGISTER_LONG_CONSTANT("ENT_QUOTES", ENT_QUOTES, CONST_PERSISTENT|CONST_CS);
    1306       19341 :         REGISTER_LONG_CONSTANT("ENT_NOQUOTES", ENT_NOQUOTES, CONST_PERSISTENT|CONST_CS);
    1307       19341 :         REGISTER_LONG_CONSTANT("ENT_IGNORE", ENT_IGNORE, CONST_PERSISTENT|CONST_CS);
    1308       19341 : }
    1309             : /* }}} */
    1310             : 
    1311             : /* {{{ proto string htmlspecialchars(string string [, int quote_style[, string charset[, bool double_encode]]])
    1312             :    Convert special characters to HTML entities */
    1313       12010 : PHP_FUNCTION(htmlspecialchars)
    1314             : {
    1315       12010 :         php_html_entities(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
    1316       12010 : }
    1317             : /* }}} */
    1318             : 
    1319             : /* {{{ proto string htmlspecialchars_decode(string string [, int quote_style])
    1320             :    Convert special HTML entities back to characters */
    1321         113 : PHP_FUNCTION(htmlspecialchars_decode)
    1322             : {
    1323             :         char *str, *new_str, *e, *p;
    1324             :         int len, j, i, new_len;
    1325         113 :         long quote_style = ENT_COMPAT;
    1326             :         struct basic_entities_dec basic_entities_dec[8];
    1327             : 
    1328         113 :         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|l", &str, &len, &quote_style) == FAILURE) {
    1329          19 :                 return;
    1330             :         }
    1331             : 
    1332          94 :         new_str = estrndup(str, len);
    1333          94 :         new_len = len;
    1334          94 :         e = new_str + new_len;
    1335             : 
    1336          94 :         if (!(p = memchr(new_str, '&', new_len))) {
    1337          25 :                 RETURN_STRINGL(new_str, new_len, 0);
    1338             :         }
    1339             : 
    1340         414 :         for (j = 0, i = 0; basic_entities[i].charcode != 0; i++) {
    1341         345 :                 if (basic_entities[i].flags && !(quote_style & basic_entities[i].flags)) {
    1342         133 :                         continue;
    1343             :                 }
    1344         212 :                 basic_entities_dec[j].charcode = basic_entities[i].charcode;
    1345         212 :                 memcpy(basic_entities_dec[j].entity, basic_entities[i].entity, basic_entities[i].entitylen + 1);
    1346         212 :                 basic_entities_dec[j].entitylen = basic_entities[i].entitylen;
    1347         212 :                 j++;
    1348             :         }
    1349          69 :         basic_entities_dec[j].charcode = '&';
    1350          69 :         basic_entities_dec[j].entitylen = sizeof("&amp;") - 1;
    1351          69 :         memcpy(basic_entities_dec[j].entity, "&amp;", sizeof("&amp;"));
    1352          69 :         i = j + 1;
    1353             :         
    1354             :         do {
    1355         343 :                 int l = e - p;
    1356             :         
    1357        1188 :                 for (j = 0; j < i; j++) {
    1358        1014 :                         if (basic_entities_dec[j].entitylen > l) {
    1359           0 :                                 continue;
    1360             :                         }
    1361        1014 :                         if (!memcmp(p, basic_entities_dec[j].entity, basic_entities_dec[j].entitylen)) {
    1362         169 :                                 int e_len = basic_entities_dec[j].entitylen - 1;
    1363             :                 
    1364         169 :                                 *p++ = basic_entities_dec[j].charcode;
    1365         169 :                                 memmove(p, p + e_len, (e - p - e_len));
    1366         169 :                                 e -= e_len;
    1367         169 :                                 goto done;
    1368             :                         }
    1369             :                 }
    1370         174 :                 p++;
    1371             : 
    1372             : done:
    1373         343 :                 if (p >= e) {
    1374          20 :                         break;
    1375             :                 }
    1376         323 :         } while ((p = memchr(p, '&', (e - p))));
    1377             : 
    1378          69 :         new_len = e - new_str;
    1379             : 
    1380          69 :         new_str[new_len] = '\0';
    1381          69 :         RETURN_STRINGL(new_str, new_len, 0);
    1382             : }
    1383             : /* }}} */
    1384             : 
    1385             : /* {{{ proto string html_entity_decode(string string [, int quote_style][, string charset])
    1386             :    Convert all HTML entities to their applicable characters.
                     :
                     :    Takes one required string, then an optional integer quote_style (left at
                     :    ENT_COMPAT when omitted) and an optional charset hint string. Returns the
                     :    string produced by php_unescape_html_entities(), or FALSE when that helper
                     :    yields NULL. */
    1387         283 : PHP_FUNCTION(html_entity_decode)
    1388             : {
    1389         283 :         char *str, *hint_charset = NULL;
    1390         283 :         int str_len, hint_charset_len = 0, len; /* hint_charset_len is filled by the parser but not read again in this function */
    1391         283 :         long quote_style = ENT_COMPAT;
    1392             :         char *replaced;
    1393             : 
    1394         283 :         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|ls", &str, &str_len,
    1395             :                                                           &quote_style, &hint_charset, &hint_charset_len) == FAILURE) {
    1396           0 :                 return; /* argument parsing failed: leave without setting a return value here */
    1397             :         }
    1398             : 
    1399         283 :         replaced = php_unescape_html_entities(str, str_len, &len, 1, quote_style, hint_charset TSRMLS_CC); /* len is passed by address and used as the result length below; the literal 1 presumably selects "all entities" -- TODO confirm against the helper */
    1400         283 :         if (replaced) {
    1401         283 :                 RETURN_STRINGL(replaced, len, 0); /* final 0: presumably "do not duplicate", i.e. the buffer is handed to the return value -- confirm in the Zend API */
    1402             :         }
    1403           0 :         RETURN_FALSE; /* reached only when the helper returned NULL */
    1404             : }
    1405             : /* }}} */
    1406             : 
    1407             : 
    1408             : /* {{{ proto string htmlentities(string string [, int quote_style[, string charset[, bool double_encode]]])
    1409             :    Convert all applicable characters to HTML entities.
                     :
                     :    Thin wrapper: forwards the untouched call parameters to php_html_entities()
                     :    with a second argument of 1 (presumably the "all entities" switch that
                     :    separates this from htmlspecialchars -- TODO confirm at the helper). */
    1410         415 : PHP_FUNCTION(htmlentities)
    1411             : {
    1412         415 :         php_html_entities(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
    1413         415 : }
    1414             : /* }}} */
    1415             : 
    1416             : /* {{{ proto array get_html_translation_table([int table [, int quote_style [, string charset_hint]]])
    1417             :    Returns the internal translation table used by htmlspecialchars and htmlentities.
                     :
                     :    All three arguments are optional; table defaults to HTML_SPECIALCHARS and
                     :    quote_style to ENT_COMPAT. The result is an array keyed by the raw character
                     :    (as a byte string) whose values are the entity strings. For HTML_ENTITIES
                     :    the entity_map rows matching the resolved charset are emitted first and the
                     :    code then falls through to add "&" and the basic entities as well. Any
                     :    other table value leaves the freshly initialised array empty. */
    1418          69 : PHP_FUNCTION(get_html_translation_table)
    1419             : {
    1420          69 :         long which = HTML_SPECIALCHARS, quote_style = ENT_COMPAT;
    1421             :         unsigned int i;
    1422             :         int j;
    1423             :         unsigned char ind[5]; /* max # of 8-bit code units (4; for UTF-8) + 1 for \0 */
    1424             :         void *dummy;
    1425          69 :         char *charset_hint = NULL;
    1426             :         int charset_hint_len; /* only handed to the parameter parser; never read afterwards in this function */
    1427             :         enum entity_charset charset;
    1428             : 
    1429          69 :         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "|lls",
    1430             :                         &which, &quote_style, &charset_hint, &charset_hint_len) == FAILURE) {
    1431          34 :                 return; /* argument parsing failed: leave before the result array is created */
    1432             :         }
    1433             : 
    1434          35 :         charset = determine_charset(charset_hint TSRMLS_CC);
    1435             : 
    1436          35 :         array_init(return_value);
    1437             : 
    1438          35 :         switch (which) {
    1439             :         case HTML_ENTITIES:
    1440         250 :                 for (j = 0; entity_map[j].charset != cs_terminator; j++) {
    1441         240 :                         if (entity_map[j].charset != charset) /* only rows for the resolved charset contribute */
    1442         148 :                                 continue;
    1443        7222 :                         for (i = 0; i <= entity_map[j].endchar - entity_map[j].basechar; i++) { /* walks the row's whole range, endchar included */
    1444             :                                 char buffer[16];
    1445             :                                 unsigned k;
    1446             :                                 size_t written;
    1447             : 
    1448        7130 :                                 if (entity_map[j].table[i] == NULL) /* empty slot: nothing to emit for this code */
    1449        4777 :                                         continue;
    1450             :                                         
    1451        2353 :                                 k = i + entity_map[j].basechar; /* character code this table slot stands for */
    1452             : 
    1453        2353 :                                 switch (charset) {
    1454             :                                 case cs_utf_8:
    1455        2232 :                                         written = php_utf32_utf8(ind, k); /* return value is used as the number of bytes placed in ind */
    1456        2232 :                                         ind[written] = '\0';
    1457        2232 :                                         break;
    1458             :                                 case cs_big5:
    1459             :                                 case cs_gb2312:
    1460             :                                 case cs_big5hkscs:
    1461             :                                 case cs_sjis:
    1462             :                                         /* we have no mappings for these, but if we had... */
    1463             :                                         /* fall through */
    1464             :                                 default: /* one byte */
    1465         121 :                                         written = 1;
    1466         121 :                                         ind[0] = (unsigned char)k;
    1467         121 :                                         ind[1] = '\0';
    1468             :                                         break;
    1469             :                                 }
    1470             : 
    1471        2353 :                                 snprintf(buffer, sizeof(buffer), "&%s;", entity_map[j].table[i]);
    1472        2353 :                                 if (zend_hash_find(Z_ARRVAL_P(return_value), (const char*)ind, written+1, &dummy) == FAILURE) { /* key length written+1 counts the trailing '\0' stored in ind */
    1473             :                                         /* in case of the single quote, which is repeated, the first one wins,
    1474             :                                                 * so don't replace the existing mapping */
    1475        2353 :                                         add_assoc_string(return_value, (const char*)ind, buffer, 1);
    1476             :                                 }
    1477             :                         }
    1478             :                 }
    1479             :                 /* fall through: HTML_ENTITIES also receives "&" and the basic entities below */
    1480             : 
    1481             :         case HTML_SPECIALCHARS:
    1482          35 :                 add_assoc_stringl(return_value, "&", "&amp;", sizeof("&amp;") - 1, 1); /* the ampersand mapping is added unconditionally */
    1483         210 :                 for (j = 0; basic_entities[j].charcode != 0; j++) { /* basic_entities ends at the entry whose charcode is 0 */
    1484         175 :                         if (basic_entities[j].flags && (quote_style & basic_entities[j].flags) == 0) /* entries with flags need at least one matching quote_style bit */
    1485          70 :                                 continue;
    1486             :                                 
    1487         105 :                         ind[0] = (unsigned char)basic_entities[j].charcode;
    1488         105 :                         ind[1] = '\0';
    1489         105 :                         if (zend_hash_find(Z_ARRVAL_P(return_value), (const char*)ind, 2, &dummy) == FAILURE) { /* length 2 = one byte plus '\0'; keys already present are left alone */
    1490         100 :                                 add_assoc_stringl(return_value, ind, basic_entities[j].entity,
    1491             :                                         basic_entities[j].entitylen, 1);
    1492             :                         }
    1493             :                 }
    1494             : 
    1495             :                 break;
    1496             :         }
    1497             : }
    1498             : /* }}} */
    1499             : 
    1500             : /*
    1501             :  * Local variables:
    1502             :  * tab-width: 4
    1503             :  * c-basic-offset: 4
    1504             :  * End:
    1505             :  * vim600: sw=4 ts=4 fdm=marker
    1506             :  * vim<600: sw=4 ts=4
    1507             :  */

Generated by: LCOV version 1.10

Generated at Fri, 18 Apr 2014 07:01:37 +0000 (6 days ago)

Copyright © 2005-2014 The PHP Group
All rights reserved.