PHP  
 PHP: Test and Code Coverage Analysis
downloads | QA | documentation | faq | getting help | mailing lists | reporting bugs | php.net sites | links | my php.net 
 

LCOV - code coverage report
Current view: top level - ext/standard - html.c (source / functions) Hit Total Coverage
Test: PHP Code Coverage Lines: 638 718 88.9 %
Date: 2014-10-24 Functions: 29 29 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :    +----------------------------------------------------------------------+
       3             :    | PHP Version 5                                                        |
       4             :    +----------------------------------------------------------------------+
       5             :    | Copyright (c) 1997-2014 The PHP Group                                |
       6             :    +----------------------------------------------------------------------+
       7             :    | This source file is subject to version 3.01 of the PHP license,      |
       8             :    | that is bundled with this package in the file LICENSE, and is        |
       9             :    | available through the world-wide-web at the following url:           |
      10             :    | http://www.php.net/license/3_01.txt                                  |
      11             :    | If you did not receive a copy of the PHP license and are unable to   |
      12             :    | obtain it through the world-wide-web, please send a note to          |
      13             :    | license@php.net so we can mail you a copy immediately.               |
      14             :    +----------------------------------------------------------------------+
      15             :    | Authors: Rasmus Lerdorf <rasmus@php.net>                             |
      16             :    |          Jaakko Hyvätti <jaakko.hyvatti@iki.fi>                      |
      17             :    |          Wez Furlong    <wez@thebrainroom.com>                       |
      18             :    |          Gustavo Lopes  <cataphract@php.net>                         |
      19             :    +----------------------------------------------------------------------+
      20             : */
      21             : 
      22             : /* $Id$ */
      23             : 
      24             : /*
      25             :  * HTML entity resources:
      26             :  *
      27             :  * http://www.unicode.org/Public/MAPPINGS/OBSOLETE/UNI2SGML.TXT
      28             :  *
      29             :  * XHTML 1.0 DTD
      30             :  * http://www.w3.org/TR/2002/REC-xhtml1-20020801/dtds.html#h-A2
      31             :  *
      32             :  * From HTML 4.01 strict DTD:
      33             :  * http://www.w3.org/TR/html4/HTMLlat1.ent
      34             :  * http://www.w3.org/TR/html4/HTMLsymbol.ent
      35             :  * http://www.w3.org/TR/html4/HTMLspecial.ent
      36             :  *
      37             :  * HTML 5:
      38             :  * http://dev.w3.org/html5/spec/Overview.html#named-character-references
      39             :  */
      40             : 
      41             : #include "php.h"
      42             : #if PHP_WIN32
      43             : #include "config.w32.h"
      44             : #else
      45             : #include <php_config.h>
      46             : #endif
      47             : #include "php_standard.h"
      48             : #include "php_string.h"
      49             : #include "SAPI.h"
      50             : #if HAVE_LOCALE_H
      51             : #include <locale.h>
      52             : #endif
      53             : #if HAVE_LANGINFO_H
      54             : #include <langinfo.h>
      55             : #endif
      56             : 
      57             : #include <zend_hash.h>
      58             : #include "html_tables.h"
      59             : 
      60             : /* Macro for disabling flag of translation of non-basic entities where this isn't supported.
      61             :  * Not appropriate for html_entity_decode/htmlspecialchars_decode */
      62             : #define LIMIT_ALL(all, doctype, charset) do { \
      63             :         (all) = (all) && !CHARSET_PARTIAL_SUPPORT((charset)) && ((doctype) != ENT_HTML_DOC_XML1); \
      64             : } while (0)
      65             : 
      66             : #define MB_FAILURE(pos, advance) do { \
      67             :         *cursor = pos + (advance); \
      68             :         *status = FAILURE; \
      69             :         return 0; \
      70             : } while (0)
      71             : 
      72             : #define CHECK_LEN(pos, chars_need) ((str_len - (pos)) >= (chars_need))
      73             : 
      74             : /* valid as single byte character or leading byte */
      75             : #define utf8_lead(c)  ((c) < 0x80 || ((c) >= 0xC2 && (c) <= 0xF4))
      76             : /* whether it's actually valid depends on other stuff;
      77             :  * this macro cannot check for non-shortest forms, surrogates or
      78             :  * code points above 0x10FFFF */
      79             : #define utf8_trail(c) ((c) >= 0x80 && (c) <= 0xBF)
      80             : 
      81             : #define gb2312_lead(c) ((c) != 0x8E && (c) != 0x8F && (c) != 0xA0 && (c) != 0xFF)
      82             : #define gb2312_trail(c) ((c) >= 0xA1 && (c) <= 0xFE)
      83             : 
      84             : #define sjis_lead(c) ((c) != 0x80 && (c) != 0xA0 && (c) < 0xFD)
      85             : #define sjis_trail(c) ((c) >= 0x40  && (c) != 0x7F && (c) < 0xFD)
      86             : 
/* {{{ get_next_char
 * Decodes the character that starts at str[*cursor], interpreting the bytes
 * according to charset.
 *
 * On success *status is set to SUCCESS, *cursor is moved just past the
 * consumed bytes and the decoded value is returned: the code point for
 * UTF-8, the raw bytes packed big-endian into an integer for the multibyte
 * charsets handled below, and the byte itself for every other charset.
 *
 * On an invalid or truncated sequence the MB_FAILURE macro sets *status to
 * FAILURE, sets *cursor to the start position plus the number of bytes to
 * skip, and makes this function return 0.
 */
static inline unsigned int get_next_char(
		enum entity_charset charset,
		const unsigned char *str,
		size_t str_len,
		size_t *cursor,
		int *status)
{
	size_t pos = *cursor;
	unsigned int this_char = 0;

	*status = SUCCESS;
	assert(pos <= str_len);

	/* nothing left to read */
	if (!CHECK_LEN(pos, 1))
		MB_FAILURE(pos, 1);

	switch (charset) {
	case cs_utf_8:
		{
			/* We'll follow strategy 2. from section 3.6.1 of UTR #36:
			 * "In a reported illegal byte sequence, do not include any
			 *  non-initial byte that encodes a valid character or is a leading
			 *  byte for a valid sequence." */
			unsigned char c;
			c = str[pos];
			if (c < 0x80) {
				/* single byte */
				this_char = c;
				pos++;
			} else if (c < 0xc2) {
				/* 0x80..0xC1 cannot start a sequence */
				MB_FAILURE(pos, 1);
			} else if (c < 0xe0) {
				/* two-byte sequence */
				if (!CHECK_LEN(pos, 2))
					MB_FAILURE(pos, 1);

				if (!utf8_trail(str[pos + 1])) {
					/* skip the bad second byte only if it cannot start a character */
					MB_FAILURE(pos, utf8_lead(str[pos + 1]) ? 1 : 2);
				}
				this_char = ((c & 0x1f) << 6) | (str[pos + 1] & 0x3f);
				if (this_char < 0x80) { /* non-shortest form */
					MB_FAILURE(pos, 2);
				}
				pos += 2;
			} else if (c < 0xf0) {
				/* three-byte sequence */
				size_t avail = str_len - pos;

				/* the avail tests come first so that no byte past the end is read */
				if (avail < 3 ||
						!utf8_trail(str[pos + 1]) || !utf8_trail(str[pos + 2])) {
					/* stop skipping at the first byte that could start a character */
					if (avail < 2 || utf8_lead(str[pos + 1]))
						MB_FAILURE(pos, 1);
					else if (avail < 3 || utf8_lead(str[pos + 2]))
						MB_FAILURE(pos, 2);
					else
						MB_FAILURE(pos, 3);
				}

				this_char = ((c & 0x0f) << 12) | ((str[pos + 1] & 0x3f) << 6) | (str[pos + 2] & 0x3f);
				if (this_char < 0x800) { /* non-shortest form */
					MB_FAILURE(pos, 3);
				} else if (this_char >= 0xd800 && this_char <= 0xdfff) { /* surrogate */
					MB_FAILURE(pos, 3);
				}
				pos += 3;
			} else if (c < 0xf5) {
				/* four-byte sequence */
				size_t avail = str_len - pos;

				/* the avail tests come first so that no byte past the end is read */
				if (avail < 4 ||
						!utf8_trail(str[pos + 1]) || !utf8_trail(str[pos + 2]) ||
						!utf8_trail(str[pos + 3])) {
					/* stop skipping at the first byte that could start a character */
					if (avail < 2 || utf8_lead(str[pos + 1]))
						MB_FAILURE(pos, 1);
					else if (avail < 3 || utf8_lead(str[pos + 2]))
						MB_FAILURE(pos, 2);
					else if (avail < 4 || utf8_lead(str[pos + 3]))
						MB_FAILURE(pos, 3);
					else
						MB_FAILURE(pos, 4);
				}

				this_char = ((c & 0x07) << 18) | ((str[pos + 1] & 0x3f) << 12) | ((str[pos + 2] & 0x3f) << 6) | (str[pos + 3] & 0x3f);
				if (this_char < 0x10000 || this_char > 0x10FFFF) { /* non-shortest form or outside range */
					MB_FAILURE(pos, 4);
				}
				pos += 4;
			} else {
				/* 0xF5..0xFF never appear in UTF-8 */
				MB_FAILURE(pos, 1);
			}
		}
		break;

	case cs_big5:
		/* reference http://demo.icu-project.org/icu-bin/convexp?conv=big5 */
		{
			unsigned char c = str[pos];
			if (c >= 0x81 && c <= 0xFE) {
				/* lead byte of a two-byte character */
				unsigned char next;
				if (!CHECK_LEN(pos, 2))
					MB_FAILURE(pos, 1);

				next = str[pos + 1];

				if ((next >= 0x40 && next <= 0x7E) ||
						(next >= 0xA1 && next <= 0xFE)) {
					this_char = (c << 8) | next;
				} else {
					/* bad trail byte: skip only the lead byte */
					MB_FAILURE(pos, 1);
				}
				pos += 2;
			} else {
				/* single byte */
				this_char = c;
				pos += 1;
			}
		}
		break;

	case cs_big5hkscs:
		{
			unsigned char c = str[pos];
			if (c >= 0x81 && c <= 0xFE) {
				/* lead byte of a two-byte character */
				unsigned char next;
				if (!CHECK_LEN(pos, 2))
					MB_FAILURE(pos, 1);

				next = str[pos + 1];

				if ((next >= 0x40 && next <= 0x7E) ||
						(next >= 0xA1 && next <= 0xFE)) {
					this_char = (c << 8) | next;
				} else if (next != 0x80 && next != 0xFF) {
					/* bad trail byte that is neither 0x80 nor 0xFF: skip only the lead byte */
					MB_FAILURE(pos, 1);
				} else {
					/* trail byte is 0x80 or 0xFF: skip both bytes */
					MB_FAILURE(pos, 2);
				}
				pos += 2;
			} else {
				/* single byte */
				this_char = c;
				pos += 1;
			}
		}
		break;

	case cs_gb2312: /* EUC-CN */
		{
			unsigned char c = str[pos];
			if (c >= 0xA1 && c <= 0xFE) {
				/* lead byte of a two-byte character */
				unsigned char next;
				if (!CHECK_LEN(pos, 2))
					MB_FAILURE(pos, 1);

				next = str[pos + 1];

				if (gb2312_trail(next)) {
					this_char = (c << 8) | next;
				} else if (gb2312_lead(next)) {
					/* next byte may start a character: skip only the lead byte */
					MB_FAILURE(pos, 1);
				} else {
					MB_FAILURE(pos, 2);
				}
				pos += 2;
			} else if (gb2312_lead(c)) {
				/* single byte */
				this_char = c;
				pos += 1;
			} else {
				/* 0x8E, 0x8F, 0xA0 or 0xFF */
				MB_FAILURE(pos, 1);
			}
		}
		break;

	case cs_sjis:
		{
			unsigned char c = str[pos];
			if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xFC)) {
				/* lead byte of a two-byte character */
				unsigned char next;
				if (!CHECK_LEN(pos, 2))
					MB_FAILURE(pos, 1);

				next = str[pos + 1];

				if (sjis_trail(next)) {
					this_char = (c << 8) | next;
				} else if (sjis_lead(next)) {
					/* next byte may start a character: skip only the lead byte */
					MB_FAILURE(pos, 1);
				} else {
					MB_FAILURE(pos, 2);
				}
				pos += 2;
			} else if (c < 0x80 || (c >= 0xA1 && c <= 0xDF)) {
				/* single byte */
				this_char = c;
				pos += 1;
			} else {
				MB_FAILURE(pos, 1);
			}
		}
		break;

	case cs_eucjp:
		{
			unsigned char c = str[pos];

			if (c >= 0xA1 && c <= 0xFE) {
				unsigned next;
				if (!CHECK_LEN(pos, 2))
					MB_FAILURE(pos, 1);
				next = str[pos + 1];

				if (next >= 0xA1 && next <= 0xFE) {
					/* this a jis kanji char */
					this_char = (c << 8) | next;
				} else {
					/* skip the second byte too only when it is 0xA0 or 0xFF */
					MB_FAILURE(pos, (next != 0xA0 && next != 0xFF) ? 1 : 2);
				}
				pos += 2;
			} else if (c == 0x8E) {
				unsigned next;
				if (!CHECK_LEN(pos, 2))
					MB_FAILURE(pos, 1);

				next = str[pos + 1];
				if (next >= 0xA1 && next <= 0xDF) {
					/* JIS X 0201 kana */
					this_char = (c << 8) | next;
				} else {
					/* skip the second byte too only when it is 0xA0 or 0xFF */
					MB_FAILURE(pos, (next != 0xA0 && next != 0xFF) ? 1 : 2);
				}
				pos += 2;
			} else if (c == 0x8F) {
				/* three-byte sequence */
				size_t avail = str_len - pos;

				/* the avail tests come first so that no byte past the end is read */
				if (avail < 3 || !(str[pos + 1] >= 0xA1 && str[pos + 1] <= 0xFE) ||
						!(str[pos + 2] >= 0xA1 && str[pos + 2] <= 0xFE)) {
					/* only 0xA0 and 0xFF bytes are skipped along with the lead byte */
					if (avail < 2 || (str[pos + 1] != 0xA0 && str[pos + 1] != 0xFF))
						MB_FAILURE(pos, 1);
					else if (avail < 3 || (str[pos + 2] != 0xA0 && str[pos + 2] != 0xFF))
						MB_FAILURE(pos, 2);
					else
						MB_FAILURE(pos, 3);
				} else {
					/* JIS X 0212 hojo-kanji */
					this_char = (c << 16) | (str[pos + 1] << 8) | str[pos + 2];
				}
				pos += 3;
			} else if (c != 0xA0 && c != 0xFF) {
				/* character encoded in 1 code unit */
				this_char = c;
				pos += 1;
			} else {
				MB_FAILURE(pos, 1);
			}
		}
		break;
	default:
		/* single-byte charsets */
		this_char = str[pos++];
		break;
	}

	/* success: report where the next character starts */
	*cursor = pos;
	return this_char;
}
/* }}} */
     348             : 
     349             : /* {{{ php_next_utf8_char
     350             :  * Public interface for get_next_char used with UTF-8 */
     351       20253 :  PHPAPI unsigned int php_next_utf8_char(
     352             :                 const unsigned char *str,
     353             :                 size_t str_len,
     354             :                 size_t *cursor,
     355             :                 int *status)
     356             : {
     357       20253 :         return get_next_char(cs_utf_8, str, str_len, cursor, status);
     358             : }
     359             : /* }}} */
     360             : 
     361             : /* {{{ entity_charset determine_charset
     362             :  * returns the charset identifier based on current locale or a hint.
     363             :  * defaults to UTF-8 */
     364      139985 : static enum entity_charset determine_charset(char *charset_hint TSRMLS_DC)
     365             : {
     366             :         int i;
     367      139985 :         enum entity_charset charset = cs_utf_8;
     368      139985 :         int len = 0;
     369             :         const zend_encoding *zenc;
     370             : 
     371             :         /* Default is now UTF-8 */
     372      139985 :         if (charset_hint == NULL)
     373         916 :                 return cs_utf_8;
     374             : 
     375      139069 :         if ((len = strlen(charset_hint)) != 0) {
     376      139051 :                 goto det_charset;
     377             :         }
     378             : 
     379          18 :         zenc = zend_multibyte_get_internal_encoding(TSRMLS_C);
     380          18 :         if (zenc != NULL) {
     381          18 :                 charset_hint = (char *)zend_multibyte_get_encoding_name(zenc);
     382          18 :                 if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) {
     383          31 :                         if ((len == 4) /* sizeof (none|auto|pass) */ &&
     384          11 :                                         (!memcmp("pass", charset_hint, 4) ||
     385           1 :                                          !memcmp("auto", charset_hint, 4) ||
     386           1 :                                          !memcmp("auto", charset_hint, 4))) {
     387          10 :                                 charset_hint = NULL;
     388          10 :                                 len = 0;
     389             :                         } else {
     390             :                                 goto det_charset;
     391             :                         }
     392             :                 }
     393             :         }
     394             : 
     395          10 :         charset_hint = SG(default_charset);
     396          10 :         if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) {
     397           6 :                 goto det_charset;
     398             :         }
     399             : 
     400             :         /* try to detect the charset for the locale */
     401             : #if HAVE_NL_LANGINFO && HAVE_LOCALE_H && defined(CODESET)
     402           4 :         charset_hint = nl_langinfo(CODESET);
     403           4 :         if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) {
     404           4 :                 goto det_charset;
     405             :         }
     406             : #endif
     407             : 
     408             : #if HAVE_LOCALE_H
     409             :         /* try to figure out the charset from the locale */
     410             :         {
     411             :                 char *localename;
     412             :                 char *dot, *at;
     413             : 
     414             :                 /* lang[_territory][.codeset][@modifier] */
     415           0 :                 localename = setlocale(LC_CTYPE, NULL);
     416             : 
     417           0 :                 dot = strchr(localename, '.');
     418           0 :                 if (dot) {
     419           0 :                         dot++;
     420             :                         /* locale specifies a codeset */
     421           0 :                         at = strchr(dot, '@');
     422           0 :                         if (at)
     423           0 :                                 len = at - dot;
     424             :                         else
     425           0 :                                 len = strlen(dot);
     426           0 :                         charset_hint = dot;
     427             :                 } else {
     428             :                         /* no explicit name; see if the name itself
     429             :                          * is the charset */
     430           0 :                         charset_hint = localename;
     431           0 :                         len = strlen(charset_hint);
     432             :                 }
     433             :         }
     434             : #endif
     435             : 
     436             : det_charset:
     437             : 
     438      139069 :         if (charset_hint) {
     439      139069 :                 int found = 0;
     440             :                 
     441             :                 /* now walk the charset map and look for the codeset */
     442      770239 :                 for (i = 0; charset_map[i].codeset; i++) {
     443      770233 :                         if (len == strlen(charset_map[i].codeset) && strncasecmp(charset_hint, charset_map[i].codeset, len) == 0) {
     444      139063 :                                 charset = charset_map[i].charset;
     445      139063 :                                 found = 1;
     446      139063 :                                 break;
     447             :                         }
     448             :                 }
     449      139069 :                 if (!found) {
     450           6 :                         php_error_docref(NULL TSRMLS_CC, E_WARNING, "charset `%s' not supported, assuming utf-8",
     451             :                                         charset_hint);
     452             :                 }
     453             :         }
     454      139069 :         return charset;
     455             : }
     456             : /* }}} */
     457             : 
     458             : /* {{{ php_utf32_utf8 */
     459       10382 : static inline size_t php_utf32_utf8(unsigned char *buf, unsigned k)
     460             : {
     461       10382 :         size_t retval = 0;
     462             : 
     463             :         /* assert(0x0 <= k <= 0x10FFFF); */
     464             : 
     465       10382 :         if (k < 0x80) {
     466         253 :                 buf[0] = k;
     467         253 :                 retval = 1;
     468       10129 :         } else if (k < 0x800) {
     469        3902 :                 buf[0] = 0xc0 | (k >> 6);
     470        3902 :                 buf[1] = 0x80 | (k & 0x3f);
     471        3902 :                 retval = 2;
     472        6227 :         } else if (k < 0x10000) {
     473        5689 :                 buf[0] = 0xe0 | (k >> 12);
     474        5689 :                 buf[1] = 0x80 | ((k >> 6) & 0x3f);
     475        5689 :                 buf[2] = 0x80 | (k & 0x3f);
     476        5689 :                 retval = 3;
     477             :         } else {
     478         538 :                 buf[0] = 0xf0 | (k >> 18);
     479         538 :                 buf[1] = 0x80 | ((k >> 12) & 0x3f);
     480         538 :                 buf[2] = 0x80 | ((k >> 6) & 0x3f);
     481         538 :                 buf[3] = 0x80 | (k & 0x3f);
     482         538 :                 retval = 4;
     483             :         }
     484             :         /* UTF-8 has been restricted to max 4 bytes since RFC 3629 */
     485             : 
     486       10382 :         return retval;
     487             : }
     488             : /* }}} */
     489             : 
     /* {{{ php_mb2_int_to_char
      * Writes k, the big endian int representation of a sequence of one or two
      * 8-bit code units, back into buf (most significant unit first).
      * Returns the number of bytes written. */
     static inline size_t php_mb2_int_to_char(unsigned char *buf, unsigned k)
     {
             size_t written = 0;

             assert(k <= 0xFFFFU);

             /* the leading unit is only present for two-unit values */
             if (k > 0xFFU) {
                     buf[written++] = (unsigned char) (k >> 8);
             }
             buf[written++] = (unsigned char) (k & 0xFFU);

             return written;
     }
     /* }}} */
     506             : 
     /* {{{ php_mb3_int_to_char
      * Writes k, the big endian int representation of a sequence of one to three
      * 8-bit code units, back into buf (most significant unit first).
      * Returns the number of bytes written. For EUC-JP. */
     static inline size_t php_mb3_int_to_char(unsigned char *buf, unsigned k)
     {
             size_t written = 0;

             assert(k <= 0xFFFFFFU);

             /* emit only the units that are actually present, high to low */
             if (k > 0xFFFFU) {
                     buf[written++] = (unsigned char) (k >> 16);
             }
             if (k > 0xFFU) {
                     buf[written++] = (unsigned char) ((k >> 8) & 0xFFU);
             }
             buf[written++] = (unsigned char) (k & 0xFFU);

             return written;
     }
     /* }}} */
     529             : 
     530             : 
     531             : /* {{{ unimap_bsearc_cmp
     532             :  * Binary search of unicode code points in unicode <--> charset mapping.
     533             :  * Returns the code point in the target charset (whose mapping table was given) or 0 if
     534             :  * the unicode code point is not in the table.
     535             :  */
     536         985 : static inline unsigned char unimap_bsearch(const uni_to_enc *table, unsigned code_key_a, size_t num)
     537             : {
     538         985 :         const uni_to_enc *l = table,
     539         985 :                                          *h = &table[num-1],
     540             :                                          *m;
     541             :         unsigned short code_key;
     542             : 
     543             :         /* we have no mappings outside the BMP */
     544         985 :         if (code_key_a > 0xFFFFU)
     545           0 :                 return 0;
     546             : 
     547         985 :         code_key = (unsigned short) code_key_a;
     548             :         
     549        7369 :         while (l <= h) {
     550        6088 :                 m = l + (h - l) / 2;
     551        6088 :                 if (code_key < m->un_code_point)
     552        3113 :                         h = m - 1;
     553        2975 :                 else if (code_key > m->un_code_point)
     554        2286 :                         l = m + 1;
     555             :                 else
     556         689 :                         return m->cs_code;
     557             :         }
     558         296 :         return 0;
     559             : }
     560             : /* }}} */
     561             : 
     562             : /* {{{ map_from_unicode */
     563        1585 : static inline int map_from_unicode(unsigned code, enum entity_charset charset, unsigned *res)
     564             : {
     565             :         unsigned char found;
     566             :         const uni_to_enc *table;
     567             :         size_t table_size;
     568             : 
     569        1585 :         switch (charset) {
     570             :         case cs_8859_1:
     571             :                 /* identity mapping of code points to unicode */
     572         264 :                 if (code > 0xFF) {
     573           1 :                         return FAILURE;
     574             :                 } 
     575         263 :                 *res = code;
     576         263 :                 break;
     577             : 
     578             :         case cs_8859_5:
     579         196 :                 if (code <= 0xA0 || code == 0xAD /* soft hyphen */) {
     580           4 :                         *res = code;
     581         188 :                 } else if (code == 0x2116) {
     582           1 :                         *res = 0xF0; /* numero sign */
     583         187 :                 } else if (code == 0xA7) {
     584           2 :                         *res = 0xFD; /* section sign */
     585         263 :                 } else if (code >= 0x0401 && code <= 0x044F) {
     586          78 :                         if (code == 0x040D || code == 0x0450 || code == 0x045D)
     587           0 :                                 return FAILURE;
     588          78 :                         *res = code - 0x360;
     589             :                 } else {
     590         107 :                         return FAILURE;
     591             :                 }
     592          85 :                 break;
     593             :                 
     594             :         case cs_8859_15:
     595         330 :                 if (code < 0xA4 || (code > 0xBE && code <= 0xFF)) {
     596         138 :                         *res = code;
     597             :                 } else { /* between A4 and 0xBE */
     598          54 :                         found = unimap_bsearch(unimap_iso885915,
     599             :                                 code, sizeof(unimap_iso885915) / sizeof(*unimap_iso885915));
     600          54 :                         if (found)
     601          46 :                                 *res = found;
     602             :                         else
     603           8 :                                 return FAILURE;
     604             :                 }
     605         184 :                 break;
     606             : 
     607             :         case cs_cp1252:
     608          27 :                 if (code <= 0x7F || (code >= 0xA0 && code <= 0xFF)) {
     609           0 :                         *res = code;
     610             :                 } else {
     611          27 :                         found = unimap_bsearch(unimap_win1252,
     612             :                                 code, sizeof(unimap_win1252) / sizeof(*unimap_win1252));
     613          27 :                         if (found)
     614          27 :                                 *res = found;
     615             :                         else
     616           0 :                                 return FAILURE;
     617             :                 }
     618          27 :                 break;
     619             : 
     620             :         case cs_macroman:
     621         224 :                 if (code == 0x7F)
     622           0 :                         return FAILURE;
     623         224 :                 table = unimap_macroman;
     624         224 :                 table_size = sizeof(unimap_macroman) / sizeof(*unimap_macroman);
     625         224 :                 goto table_over_7F;
     626             :         case cs_cp1251:
     627         235 :                 table = unimap_win1251;
     628         235 :                 table_size = sizeof(unimap_win1251) / sizeof(*unimap_win1251);
     629         235 :                 goto table_over_7F;
     630             :         case cs_koi8r:
     631         227 :                 table = unimap_koi8r;
     632         227 :                 table_size = sizeof(unimap_koi8r) / sizeof(*unimap_koi8r);
     633         227 :                 goto table_over_7F;
     634             :         case cs_cp866:
     635         224 :                 table = unimap_cp866;
     636         224 :                 table_size = sizeof(unimap_cp866) / sizeof(*unimap_cp866);
     637             :                 
     638             : table_over_7F:
     639         910 :                 if (code <= 0x7F) {
     640           6 :                         *res = code;
     641             :                 } else {
     642         904 :                         found = unimap_bsearch(table, code, table_size);
     643         904 :                         if (found)
     644         616 :                                 *res = found;
     645             :                         else
     646         288 :                                 return FAILURE;
     647             :                 }
     648         622 :                 break;
     649             : 
     650             :         /* from here on, only map the possible characters in the ASCII range.
     651             :          * to improve support here, it's a matter of building the unicode mappings.
     652             :          * See <http://www.unicode.org/Public/6.0.0/ucd/Unihan.zip> */
     653             :         case cs_sjis:
     654             :         case cs_eucjp:
     655             :                 /* we interpret 0x5C as the Yen symbol. This is not universal.
     656             :                  * See <http://www.w3.org/Submission/japanese-xml/#ambiguity_of_yen> */
     657           0 :                 if (code >= 0x20 && code <= 0x7D) {
     658           0 :                         if (code == 0x5C)
     659           0 :                                 return FAILURE;
     660           0 :                         *res = code;
     661             :                 } else {
     662           0 :                         return FAILURE;
     663             :                 }
     664           0 :                 break;
     665             : 
     666             :         case cs_big5:
     667             :         case cs_big5hkscs:
     668             :         case cs_gb2312:
     669           0 :                 if (code >= 0x20 && code <= 0x7D) {
     670           0 :                         *res = code;
     671             :                 } else {
     672           0 :                         return FAILURE;
     673             :                 }
     674           0 :                 break;
     675             : 
     676             :         default:
     677           0 :                 return FAILURE;
     678             :         }
     679             : 
     680        1181 :         return SUCCESS;
     681             : }
     682             : /* }}} */
     683             : 
     684             : /* {{{ */
     685        1260 : static inline void map_to_unicode(unsigned code, const enc_to_uni *table, unsigned *res)
     686             : {
     687             :         /* only single byte encodings are currently supported; assumed code <= 0xFF */
     688        1260 :         *res = table->inner[ENT_ENC_TO_UNI_STAGE1(code)]->uni_cp[ENT_ENC_TO_UNI_STAGE2(code)];
     689        1260 : }
     690             : /* }}} */
     691             : 
     692             : /* {{{ unicode_cp_is_allowed */
     693        2973 : static inline int unicode_cp_is_allowed(unsigned uni_cp, int document_type)
     694             : {
     695             :         /* XML 1.0                              HTML 4.01                       HTML 5
     696             :          * 0x09..0x0A                   0x09..0x0A                      0x09..0x0A
     697             :          * 0x0D                                 0x0D                            0x0C..0x0D
     698             :          * 0x0020..0xD7FF               0x20..0x7E                      0x20..0x7E
     699             :          *                                              0x00A0..0xD7FF          0x00A0..0xD7FF
     700             :          * 0xE000..0xFFFD               0xE000..0x10FFFF        0xE000..0xFDCF
     701             :          * 0x010000..0x10FFFF                                           0xFDF0..0x10FFFF (*)
     702             :          *
     703             :          * (*) exclude code points where ((code & 0xFFFF) >= 0xFFFE)
     704             :          *
     705             :          * References:
     706             :          * XML 1.0:   <http://www.w3.org/TR/REC-xml/#charsets>
     707             :          * HTML 4.01: <http://www.w3.org/TR/1999/PR-html40-19990824/sgml/sgmldecl.html>
     708             :          * HTML 5:    <http://dev.w3.org/html5/spec/Overview.html#preprocessing-the-input-stream>
     709             :          *
     710             :          * Not sure this is the relevant part for HTML 5, though. I opted to
     711             :          * disallow the characters that would result in a parse error when
     712             :          * preprocessing of the input stream. See also section 8.1.3.
     713             :          * 
     714             :          * It's unclear if XHTML 1.0 allows C1 characters. I'll opt to apply to
     715             :          * XHTML 1.0 the same rules as for XML 1.0.
     716             :          * See <http://cmsmcq.com/2007/C1.xml>.
     717             :          */
     718             : 
     719        2973 :         switch (document_type) {
     720             :         case ENT_HTML_DOC_HTML401:
     721        1730 :                 return (uni_cp >= 0x20 && uni_cp <= 0x7E) ||
     722             :                         (uni_cp == 0x0A || uni_cp == 0x09 || uni_cp == 0x0D) ||
     723             :                         (uni_cp >= 0xA0 && uni_cp <= 0xD7FF) ||
     724             :                         (uni_cp >= 0xE000 && uni_cp <= 0x10FFFF);
     725             :         case ENT_HTML_DOC_HTML5:
     726         666 :                 return (uni_cp >= 0x20 && uni_cp <= 0x7E) ||
     727             :                         (uni_cp >= 0x09 && uni_cp <= 0x0D && uni_cp != 0x0B) || /* form feed U+0C allowed */
     728             :                         (uni_cp >= 0xA0 && uni_cp <= 0xD7FF) ||
     729             :                         (uni_cp >= 0xE000 && uni_cp <= 0x10FFFF &&
     730          27 :                                 ((uni_cp & 0xFFFF) < 0xFFFE) && /* last two of each plane (nonchars) disallowed */
     731             :                                 (uni_cp < 0xFDD0 || uni_cp > 0xFDEF)); /* U+FDD0-U+FDEF (nonchars) disallowed */
     732             :         case ENT_HTML_DOC_XHTML:
     733             :         case ENT_HTML_DOC_XML1:
     734         604 :                 return (uni_cp >= 0x20 && uni_cp <= 0xD7FF) ||
     735             :                         (uni_cp == 0x0A || uni_cp == 0x09 || uni_cp == 0x0D) ||
     736             :                         (uni_cp >= 0xE000 && uni_cp <= 0x10FFFF && uni_cp != 0xFFFE && uni_cp != 0xFFFF);
     737             :         default:
     738           0 :                 return 1;
     739             :         }
     740             : }
     741             : /* }}} */
     742             : 
     743             : /* {{{ unicode_cp_is_allowed */
     744         312 : static inline int numeric_entity_is_allowed(unsigned uni_cp, int document_type)
     745             : {
     746             :         /* less restrictive than unicode_cp_is_allowed */
     747         312 :         switch (document_type) {
     748             :         case ENT_HTML_DOC_HTML401:
     749             :                 /* all non-SGML characters (those marked with UNUSED in DESCSET) should be
     750             :                  * representable with numeric entities */
     751          52 :                 return uni_cp <= 0x10FFFF;
     752             :         case ENT_HTML_DOC_HTML5:
     753             :                 /* 8.1.4. The numeric character reference forms described above are allowed to
     754             :                  * reference any Unicode code point other than U+0000, U+000D, permanently
     755             :                  * undefined Unicode characters (noncharacters), and control characters other
     756             :                  * than space characters (U+0009, U+000A, U+000C and U+000D) */
     757             :                 /* seems to allow surrogate characters, then */
     758         234 :                 return (uni_cp >= 0x20 && uni_cp <= 0x7E) ||
     759             :                         (uni_cp >= 0x09 && uni_cp <= 0x0C && uni_cp != 0x0B) || /* form feed U+0C allowed, but not U+0D */
     760             :                         (uni_cp >= 0xA0 && uni_cp <= 0x10FFFF &&
     761          78 :                                 ((uni_cp & 0xFFFF) < 0xFFFE) && /* last two of each plane (nonchars) disallowed */
     762             :                                 (uni_cp < 0xFDD0 || uni_cp > 0xFDEF)); /* U+FDD0-U+FDEF (nonchars) disallowed */
     763             :         case ENT_HTML_DOC_XHTML:
     764             :         case ENT_HTML_DOC_XML1:
     765             :                 /* OTOH, XML 1.0 requires "character references to match the production for Char
     766             :                  * See <http://www.w3.org/TR/REC-xml/#NT-CharRef> */
     767         104 :                 return unicode_cp_is_allowed(uni_cp, document_type);
     768             :         default:
     769           0 :                 return 1;
     770             :         }
     771             : }
     772             : /* }}} */
     773             : 
     774             : /* {{{ process_numeric_entity
     775             :  * Auxiliary function to traverse_for_entities.
     776             :  * On input, *buf should point to the first character after # and on output, it's the last
     777             :  * byte read, no matter if there was success or insuccess. 
     778             :  */
     779        2166 : static inline int process_numeric_entity(const char **buf, unsigned *code_point)
     780             : {
     781             :         long code_l;
     782        2166 :         int hexadecimal = (**buf == 'x' || **buf == 'X'); /* TODO: XML apparently disallows "X" */
     783             :         char *endptr;
     784             : 
     785        2166 :         if (hexadecimal && (**buf != '\0'))
     786        1967 :                 (*buf)++;
     787             :                         
     788             :         /* strtol allows whitespace and other stuff in the beginning
     789             :                 * we're not interested */
     790        2365 :         if ((hexadecimal && !isxdigit(**buf)) ||
     791         199 :                         (!hexadecimal && !isdigit(**buf))) {
     792           8 :                 return FAILURE;
     793             :         }
     794             : 
     795        2158 :         code_l = strtol(*buf, &endptr, hexadecimal ? 16 : 10);
     796             :         /* we're guaranteed there were valid digits, so *endptr > buf */
     797        2158 :         *buf = endptr;
     798             : 
     799        2158 :         if (**buf != ';')
     800          18 :                 return FAILURE;
     801             : 
     802             :         /* many more are invalid, but that depends on whether it's HTML
     803             :          * (and which version) or XML. */
     804        2140 :         if (code_l > 0x10FFFFL)
     805          14 :                 return FAILURE;
     806             : 
     807        2126 :         if (code_point != NULL)
     808        2126 :                 *code_point = (unsigned)code_l;
     809             : 
     810        2126 :         return SUCCESS;
     811             : }
     812             : /* }}} */
     813             : 
     814             : /* {{{ process_named_entity */
     815        2743 : static inline int process_named_entity_html(const char **buf, const char **start, size_t *length)
     816             : {
     817        2743 :         *start = *buf;
     818             : 
     819             :         /* "&" is represented by a 0x26 in all supported encodings. That means
     820             :          * the byte after represents a character or is the leading byte of an
     821             :          * sequence of 8-bit code units. If in the ranges below, it represents
     822             :          * necessarily a alpha character because none of the supported encodings
     823             :          * has an overlap with ASCII in the leading byte (only on the second one) */
     824       32708 :         while ((**buf >= 'a' && **buf <= 'z') ||
     825        5423 :                         (**buf >= 'A' && **buf <= 'Z') ||
     826        5563 :                         (**buf >= '0' && **buf <= '9')) {
     827       16236 :                 (*buf)++;
     828             :         }
     829             : 
     830        2743 :         if (**buf != ';')
     831          51 :                 return FAILURE;
     832             : 
     833             :         /* cast to size_t OK as the quantity is always non-negative */
     834        2692 :         *length = *buf - *start;
     835             : 
     836        2692 :         if (*length == 0)
     837           4 :                 return FAILURE;
     838             : 
     839        2688 :         return SUCCESS;
     840             : }
     841             : /* }}} */
     842             : 
     843             : /* {{{ resolve_named_entity_html */
     844        2688 : static inline int resolve_named_entity_html(const char *start, size_t length, const entity_ht *ht, unsigned *uni_cp1, unsigned *uni_cp2)
     845             : {
     846             :         const entity_cp_map *s;
     847        2688 :         ulong hash = zend_inline_hash_func(start, length);
     848             : 
     849        2688 :         s = ht->buckets[hash % ht->num_elems];
     850        6038 :         while (s->entity) {
     851        3323 :                 if (s->entity_len == length) {
     852        2750 :                         if (memcmp(start, s->entity, length) == 0) {
     853        2661 :                                 *uni_cp1 = s->codepoint1;
     854        2661 :                                 *uni_cp2 = s->codepoint2;
     855        2661 :                                 return SUCCESS;
     856             :                         }
     857             :                 }
     858         662 :                 s++;
     859             :         }
     860          27 :         return FAILURE;
     861             : }
     862             : /* }}} */
     863             : 
     864       12437 : static inline size_t write_octet_sequence(unsigned char *buf, enum entity_charset charset, unsigned code) {
     865             :         /* code is not necessarily a unicode code point */
     866       12437 :         switch (charset) {
     867             :         case cs_utf_8:
     868       10382 :                 return php_utf32_utf8(buf, code);
     869             : 
     870             :         case cs_8859_1:
     871             :         case cs_cp1252:
     872             :         case cs_8859_15:
     873             :         case cs_koi8r:
     874             :         case cs_cp1251:
     875             :         case cs_8859_5:
     876             :         case cs_cp866:
     877             :         case cs_macroman:
     878             :                 /* single byte stuff */
     879        2055 :                 *buf = code;
     880        2055 :                 return 1;
     881             : 
     882             :         case cs_big5:
     883             :         case cs_big5hkscs:
     884             :         case cs_sjis:
     885             :         case cs_gb2312:
     886             :                 /* we don't have complete unicode mappings for these yet in entity_decode,
     887             :                  * and we opt to pass through the octet sequences for these in htmlentities
     888             :                  * instead of converting to an int and then converting back. */
     889             : #if 0
     890             :                 return php_mb2_int_to_char(buf, code);
     891             : #else
     892             : #ifdef ZEND_DEBUG
     893             :                 assert(code <= 0xFFU);
     894             : #endif
     895           0 :                 *buf = code;
     896           0 :                 return 1;
     897             : #endif
     898             : 
     899             :         case cs_eucjp:
     900             : #if 0 /* idem */
     901             :                 return php_mb2_int_to_char(buf, code);
     902             : #else
     903             : #ifdef ZEND_DEBUG
     904             :                 assert(code <= 0xFFU);
     905             : #endif
     906           0 :                 *buf = code;
     907           0 :                 return 1;
     908             : #endif
     909             : 
     910             :         default:
     911             :                 assert(0);
     912           0 :                 return 0;
     913             :         }
     914             : }
     915             : 
     916             : /* {{{ traverse_for_entities
     917             :  * Auxiliary function to php_unescape_html_entities().
     918             :  * - The argument "all" determines if all numeric entities are decode or only those
     919             :  *   that correspond to quotes (depending on quote_style).
     920             :  */
     921             : /* maximum expansion (factor 1.2) for HTML 5 with &nGt; and &nLt; */
     922             : /* +2 is 1 because of rest (probably unnecessary), 1 because of terminating 0 */
     923             : #define TRAVERSE_FOR_ENTITIES_EXPAND_SIZE(oldlen) ((oldlen) + (oldlen) / 5 + 2)
     924        4188 : static void traverse_for_entities(
     925             :         const char *old,
     926             :         size_t oldlen,
     927             :         char *ret, /* should have allocated TRAVERSE_FOR_ENTITIES_EXPAND_SIZE(olden) */
     928             :         size_t *retlen,
     929             :         int all,
     930             :         int flags,
     931             :         const entity_ht *inv_map,
     932             :         enum entity_charset charset)
     933             : {
     934             :         const char *p,
     935             :                            *lim;
     936             :         char       *q;
     937        4188 :         int doctype = flags & ENT_HTML_DOC_TYPE_MASK;
     938             : 
     939        4188 :         lim = old + oldlen; /* terminator address */
     940             :         assert(*lim == '\0');
     941             : 
     942       17961 :         for (p = old, q = ret; p < lim;) {
     943        9585 :                 unsigned code, code2 = 0;
     944        9585 :                 const char *next = NULL; /* when set, next > p, otherwise possible inf loop */
     945             : 
     946             :                 /* Shift JIS, Big5 and HKSCS use multi-byte encodings where an
     947             :                  * ASCII range byte can be part of a multi-byte sequence.
     948             :                  * However, they start at 0x40, therefore if we find a 0x26 byte,
     949             :                  * we're sure it represents the '&' character. */
     950             : 
     951             :                 /* assumes there are no single-char entities */
     952        9585 :                 if (p[0] != '&' || (p + 3 >= lim)) {
     953        5105 :                         *(q++) = *(p++);
     954        5105 :                         continue;
     955             :                 }
     956             : 
     957             :                 /* now p[3] is surely valid and is no terminator */
     958             : 
     959             :                 /* numerical entity */
     960        4480 :                 if (p[1] == '#') {
     961        1773 :                         next = &p[2];
     962        1773 :                         if (process_numeric_entity(&next, &code) == FAILURE)
     963          20 :                                 goto invalid_code;
     964             : 
     965             :                         /* If we're in htmlspecialchars_decode, we're only decoding entities
     966             :                          * that represent &, <, >, " and '. Is this one of them? */
     967        1917 :                         if (!all && (code > 63U ||
     968         164 :                                         stage3_table_be_apos_00000[code].data.ent.entity == NULL))
     969             :                                 goto invalid_code;
     970             : 
     971             :                         /* are we allowed to decode this entity in this document type?
     972             :                          * HTML 5 is the only that has a character that cannot be used in 
     973             :                          * a numeric entity but is allowed literally (U+000D). The
     974             :                          * unoptimized version would be ... || !numeric_entity_is_allowed(code) */
     975        1786 :                         if (!unicode_cp_is_allowed(code, doctype) ||
     976          39 :                                         (doctype == ENT_HTML_DOC_HTML5 && code == 0x0D))
     977             :                                 goto invalid_code;
     978             :                 } else {
     979             :                         const char *start;
     980             :                         size_t ent_len;
     981             : 
     982        2707 :                         next = &p[1];
     983        2707 :                         start = next;
     984             : 
     985        2707 :                         if (process_named_entity_html(&next, &start, &ent_len) == FAILURE)
     986          41 :                                 goto invalid_code;
     987             : 
     988        2666 :                         if (resolve_named_entity_html(start, ent_len, inv_map, &code, &code2) == FAILURE) {
     989          27 :                                 if (doctype == ENT_HTML_DOC_XHTML && ent_len == 4 && start[0] == 'a'
     990           4 :                                                         && start[1] == 'p' && start[2] == 'o' && start[3] == 's') {
     991             :                                         /* uses html4 inv_map, which doesn't include apos;. This is a
     992             :                                          * hack to support it */
     993           1 :                                         code = (unsigned) '\'';
     994             :                                 } else {
     995             :                                         goto invalid_code;
     996             :                                 }
     997             :                         }
     998             :                 }
     999             :                 
    1000             :                 assert(*next == ';');
    1001             :                 
    1002        8384 :                 if (((code == '\'' && !(flags & ENT_HTML_QUOTE_SINGLE)) ||
    1003        4202 :                                 (code == '"' && !(flags & ENT_HTML_QUOTE_DOUBLE)))
    1004             :                                 /* && code2 == '\0' always true for current maps */)
    1005             :                         goto invalid_code;
    1006             : 
    1007             :                 /* UTF-8 doesn't need mapping (ISO-8859-1 doesn't either, but
    1008             :                  * the call is needed to ensure the codepoint <= U+00FF)  */
    1009        4053 :                 if (charset != cs_utf_8) {
    1010             :                         /* replace unicode code point */
    1011        1573 :                         if (map_from_unicode(code, charset, &code) == FAILURE || code2 != 0)
    1012             :                                 goto invalid_code; /* not representable in target charset */
    1013             :                 }
    1014             : 
    1015        3658 :                 q += write_octet_sequence(q, charset, code);
    1016        3658 :                 if (code2) {
    1017          93 :                         q += write_octet_sequence(q, charset, code2);
    1018             :                 }
    1019             : 
    1020             :                 /* jump over the valid entity; may go beyond size of buffer; np */
    1021        3658 :                 p = next + 1;
    1022        3658 :                 continue;
    1023             : 
    1024             : invalid_code:
    1025        4793 :                 for (; p < next; p++) {
    1026        3971 :                         *(q++) = *p;
    1027             :                 }
    1028             :         }
    1029             :         
    1030        4188 :         *q = '\0';
    1031        4188 :         *retlen = (size_t)(q - ret);
    1032        4188 : }
    1033             : /* }}} */
    1034             : 
    1035             : /* {{{ unescape_inverse_map */
    1036        4599 : static const entity_ht *unescape_inverse_map(int all, int flags)
    1037             : {
    1038        4599 :         int document_type = flags & ENT_HTML_DOC_TYPE_MASK;
    1039             : 
    1040        4599 :         if (all) {
    1041        4417 :                 switch (document_type) {
    1042             :                 case ENT_HTML_DOC_HTML401:
    1043             :                 case ENT_HTML_DOC_XHTML: /* but watch out for &apos;...*/
    1044        1968 :                         return &ent_ht_html4;
    1045             :                 case ENT_HTML_DOC_HTML5:
    1046        2368 :                         return &ent_ht_html5;
    1047             :                 default:
    1048          81 :                         return &ent_ht_be_apos;
    1049             :                 }
    1050             :         } else {
    1051         182 :                 switch (document_type) {
    1052             :                 case ENT_HTML_DOC_HTML401:
    1053         101 :                         return &ent_ht_be_noapos;
    1054             :                 default:
    1055          81 :                         return &ent_ht_be_apos;
    1056             :                 }
    1057             :         }
    1058             : }
    1059             : /* }}} */
    1060             : 
    1061             : /* {{{ determine_entity_table
    1062             :  * Entity table to use. Note that entity tables are defined in terms of
    1063             :  * unicode code points */
    1064      135977 : static entity_table_opt determine_entity_table(int all, int doctype)
    1065             : {
    1066      135977 :         entity_table_opt retval = {NULL};
    1067             : 
    1068             :         assert(!(doctype == ENT_HTML_DOC_XML1 && all));
    1069             :         
    1070      135977 :         if (all) {
    1071      121588 :                 retval.ms_table = (doctype == ENT_HTML_DOC_HTML5) ?
    1072             :                         entity_ms_table_html5 : entity_ms_table_html4;
    1073             :         } else {
    1074       14389 :                 retval.table = (doctype == ENT_HTML_DOC_HTML401) ?
    1075             :                         stage3_table_be_noapos_00000 : stage3_table_be_apos_00000;
    1076             :         }
    1077      135977 :         return retval;
    1078             : }
    1079             : /* }}} */
    1080             : 
    1081             : /* {{{ php_unescape_html_entities
    1082             :  * The parameter "all" should be true to decode all possible entities, false to decode
    1083             :  * only the basic ones, i.e., those in basic_entities_ex + the numeric entities
    1084             :  * that correspond to quotes.
    1085             :  */
    1086        4200 : PHPAPI char *php_unescape_html_entities(unsigned char *old, size_t oldlen, size_t *newlen, int all, int flags, char *hint_charset TSRMLS_DC)
    1087             : {
    1088             :         size_t retlen;
    1089             :         char *ret;
    1090             :         enum entity_charset charset;
    1091        4200 :         const entity_ht *inverse_map = NULL;
    1092        4200 :         size_t new_size = TRAVERSE_FOR_ENTITIES_EXPAND_SIZE(oldlen);
    1093             : 
    1094        4200 :         if (all) {
    1095        4008 :                 charset = determine_charset(hint_charset TSRMLS_CC);
    1096             :         } else {
    1097         192 :                 charset = cs_8859_1; /* charset shouldn't matter, use ISO-8859-1 for performance */
    1098             :         }
    1099             : 
    1100             :         /* don't use LIMIT_ALL! */
    1101             : 
    1102        4200 :         if (oldlen > new_size) {
    1103             :                 /* overflow, refuse to do anything */
    1104           0 :                 ret = estrndup((char*)old, oldlen);
    1105           0 :                 retlen = oldlen;
    1106           0 :                 goto empty_source;
    1107             :         }
    1108        4200 :         ret = emalloc(new_size);
    1109        4200 :         *ret = '\0';
    1110        4200 :         retlen = oldlen;
    1111        4200 :         if (retlen == 0) {
    1112          12 :                 goto empty_source;
    1113             :         }
    1114             :         
    1115        4188 :         inverse_map = unescape_inverse_map(all, flags);
    1116             :         
    1117             :         /* replace numeric entities */
    1118        4188 :         traverse_for_entities(old, oldlen, ret, &retlen, all, flags, inverse_map, charset);
    1119             : 
    1120             : empty_source:   
    1121        4200 :         *newlen = retlen;
    1122        4200 :         return ret;
    1123             : }
    1124             : /* }}} */
    1125             : 
    1126        1742 : PHPAPI char *php_escape_html_entities(unsigned char *old, size_t oldlen, size_t *newlen, int all, int flags, char *hint_charset TSRMLS_DC)
    1127             : {
    1128        1742 :         return php_escape_html_entities_ex(old, oldlen, newlen, all, flags, hint_charset, 1 TSRMLS_CC);
    1129             : }
    1130             : 
    1131             : /* {{{ find_entity_for_char */
    1132      123714 : static inline void find_entity_for_char(
    1133             :         unsigned int k,
    1134             :         enum entity_charset charset,
    1135             :         const entity_stage1_row *table,
    1136             :         const unsigned char **entity,
    1137             :         size_t *entity_len,
    1138             :         unsigned char *old,
    1139             :         size_t oldlen,
    1140             :         size_t *cursor)
    1141             : {
    1142      123714 :         unsigned stage1_idx = ENT_STAGE1_INDEX(k);
    1143             :         const entity_stage3_row *c;
    1144             :         
    1145      123714 :         if (stage1_idx > 0x1D) {
    1146           6 :                 *entity     = NULL;
    1147           6 :                 *entity_len = 0;
    1148           6 :                 return;
    1149             :         }
    1150             : 
    1151      123708 :         c = &table[stage1_idx][ENT_STAGE2_INDEX(k)][ENT_STAGE3_INDEX(k)];
    1152             : 
    1153      123708 :         if (!c->ambiguous) {
    1154      123580 :                 *entity     = (const unsigned char *)c->data.ent.entity;
    1155      123580 :                 *entity_len = c->data.ent.entity_len;
    1156             :         } else {
    1157             :                 /* peek at next char */
    1158         128 :                 size_t   cursor_before  = *cursor;
    1159         128 :                 int              status                 = SUCCESS;
    1160             :                 unsigned next_char;
    1161             : 
    1162         128 :                 if (!(*cursor < oldlen))
    1163          63 :                         goto no_suitable_2nd;
    1164             : 
    1165          65 :                 next_char = get_next_char(charset, old, oldlen, cursor, &status); 
    1166             : 
    1167          65 :                 if (status == FAILURE)
    1168           0 :                         goto no_suitable_2nd;
    1169             : 
    1170             :                 {
    1171             :                         const entity_multicodepoint_row *s, *e;
    1172             : 
    1173          65 :                         s = &c->data.multicodepoint_table[1];
    1174          65 :                         e = s - 1 + c->data.multicodepoint_table[0].leading_entry.size;
    1175             :                         /* we could do a binary search but it's not worth it since we have
    1176             :                          * at most two entries... */
    1177          68 :                         for ( ; s <= e; s++) {
    1178          67 :                                 if (s->normal_entry.second_cp == next_char) {
    1179          64 :                                         *entity     = s->normal_entry.entity;
    1180          64 :                                         *entity_len = s->normal_entry.entity_len;
    1181          64 :                                         return;
    1182             :                                 }
    1183             :                         }
    1184             :                 }
    1185             : no_suitable_2nd:
    1186          64 :                 *cursor = cursor_before;
    1187          64 :                 *entity = (const unsigned char *)
    1188          64 :                         c->data.multicodepoint_table[0].leading_entry.default_entity;
    1189          64 :                 *entity_len = c->data.multicodepoint_table[0].leading_entry.default_entity_len;
    1190             :         }       
    1191             : }
    1192             : /* }}} */
    1193             : 
    1194             : /* {{{ find_entity_for_char_basic */
    1195       37096 : static inline void find_entity_for_char_basic(
    1196             :         unsigned int k,
    1197             :         const entity_stage3_row *table,
    1198             :         const unsigned char **entity,
    1199             :         size_t *entity_len)
    1200             : {
    1201       37096 :         if (k >= 64U) {
    1202       30429 :                 *entity     = NULL;
    1203       30429 :                 *entity_len = 0;
    1204       30429 :                 return;
    1205             :         }
    1206             : 
    1207        6667 :         *entity     = table[k].data.ent.entity;
    1208        6667 :         *entity_len = table[k].data.ent.entity_len;
    1209             : }
    1210             : /* }}} */
    1211             : 
    1212             : /* {{{ php_escape_html_entities
    1213             :  */
    1214      135910 : PHPAPI char *php_escape_html_entities_ex(unsigned char *old, size_t oldlen, size_t *newlen, int all, int flags, char *hint_charset, zend_bool double_encode TSRMLS_DC)
    1215             : {
    1216             :         size_t cursor, maxlen, len;
    1217             :         char *replaced;
    1218      135910 :         enum entity_charset charset = determine_charset(hint_charset TSRMLS_CC);
    1219      135910 :         int doctype = flags & ENT_HTML_DOC_TYPE_MASK;
    1220             :         entity_table_opt entity_table;
    1221      135910 :         const enc_to_uni *to_uni_table = NULL;
    1222      135910 :         const entity_ht *inv_map = NULL; /* used for !double_encode */
    1223             :         /* only used if flags includes ENT_HTML_IGNORE_ERRORS or ENT_HTML_SUBSTITUTE_DISALLOWED_CHARS */
    1224      135910 :         const unsigned char *replacement = NULL;
    1225      135910 :         size_t replacement_len = 0;
    1226             : 
    1227      135910 :         if (all) { /* replace with all named entities */
    1228      121735 :                 if (CHARSET_PARTIAL_SUPPORT(charset)) {
    1229         101 :                         php_error_docref0(NULL TSRMLS_CC, E_STRICT, "Only basic entities "
    1230             :                                 "substitution is supported for multi-byte encodings other than UTF-8; "
    1231             :                                 "functionality is equivalent to htmlspecialchars");
    1232             :                 }
    1233      121735 :                 LIMIT_ALL(all, doctype, charset);
    1234             :         }
    1235      135910 :         entity_table = determine_entity_table(all, doctype);
    1236      135910 :         if (all && !CHARSET_UNICODE_COMPAT(charset)) {
    1237          84 :                 to_uni_table = enc_to_uni_index[charset];
    1238             :         }
    1239             : 
    1240      135910 :         if (!double_encode) {
    1241             :                 /* first arg is 1 because we want to identify valid named entities
    1242             :                  * even if we are only encoding the basic ones */
    1243         411 :                 inv_map = unescape_inverse_map(1, flags);
    1244             :         }
    1245             : 
    1246      135910 :         if (flags & (ENT_HTML_SUBSTITUTE_ERRORS | ENT_HTML_SUBSTITUTE_DISALLOWED_CHARS)) {
    1247         790 :                 if (charset == cs_utf_8) {
    1248         432 :                         replacement = (const unsigned char*)"\xEF\xBF\xBD";
    1249         432 :                         replacement_len = sizeof("\xEF\xBF\xBD") - 1;
    1250             :                 } else {
    1251         358 :                         replacement = (const unsigned char*)"&#xFFFD;";
    1252         358 :                         replacement_len = sizeof("&#xFFFD;") - 1;
    1253             :                 }
    1254             :         }
    1255             : 
    1256             :         /* initial estimate */
    1257      135910 :         if (oldlen < 64) {
    1258      135851 :                 maxlen = 128;   
    1259             :         } else {
    1260          59 :                 maxlen = 2 * oldlen;
    1261          59 :                 if (maxlen < oldlen) {
    1262           0 :                         zend_error_noreturn(E_ERROR, "Input string is too long");
    1263             :                         return NULL;
    1264             :                 }
    1265             :         }
    1266             : 
    1267      135910 :         replaced = emalloc(maxlen + 1); /* adding 1 is safe: maxlen is even */
    1268      135910 :         len = 0;
    1269      135910 :         cursor = 0;
    1270      433348 :         while (cursor < oldlen) {
    1271      167196 :                 const unsigned char *mbsequence = NULL;
    1272      167196 :                 size_t mbseqlen                                 = 0,
    1273      167196 :                        cursor_before                    = cursor;
    1274      167196 :                 int status                                              = SUCCESS;
    1275      167196 :                 unsigned int this_char                  = get_next_char(charset, old, oldlen, &cursor, &status);
    1276             : 
    1277             :                 /* guarantee we have at least 40 bytes to write.
    1278             :                  * In HTML5, entities may take up to 33 bytes */
    1279      167196 :                 if (len > maxlen - 40) { /* maxlen can never be smaller than 128 */
    1280           2 :                         replaced = safe_erealloc(replaced, maxlen , 1, 128 + 1);
    1281           2 :                         maxlen += 128;
    1282             :                 }
    1283             : 
    1284      167196 :                 if (status == FAILURE) {
    1285             :                         /* invalid MB sequence */
    1286        5794 :                         if (flags & ENT_HTML_IGNORE_ERRORS) {
    1287          82 :                                 continue;
    1288        5712 :                         } else if (flags & ENT_HTML_SUBSTITUTE_ERRORS) {
    1289          44 :                                 memcpy(&replaced[len], replacement, replacement_len);
    1290          44 :                                 len += replacement_len;
    1291          44 :                                 continue;
    1292             :                         } else {
    1293        5668 :                                 efree(replaced);
    1294        5668 :                                 *newlen = 0;
    1295        5668 :                                 return STR_EMPTY_ALLOC();
    1296             :                         }
    1297             :                 } else { /* SUCCESS */
    1298      161402 :                         mbsequence = &old[cursor_before];
    1299      161402 :                         mbseqlen = cursor - cursor_before;
    1300             :                 }
    1301             : 
    1302      161402 :                 if (this_char != '&') { /* no entity on this position */
    1303      160854 :                         const unsigned char *rep        = NULL;
    1304      160854 :                         size_t                          rep_len = 0;
    1305             : 
    1306      321745 :                         if (((this_char == '\'' && !(flags & ENT_HTML_QUOTE_SINGLE)) ||
    1307      160891 :                                         (this_char == '"' && !(flags & ENT_HTML_QUOTE_DOUBLE))))
    1308             :                                 goto pass_char_through;
    1309             : 
    1310      160811 :                         if (all) { /* false that CHARSET_PARTIAL_SUPPORT(charset) */
    1311      123715 :                                 if (to_uni_table != NULL) {
    1312             :                                         /* !CHARSET_UNICODE_COMPAT therefore not UTF-8; since UTF-8
    1313             :                                          * is the only multibyte encoding with !CHARSET_PARTIAL_SUPPORT,
    1314             :                                          * we're using a single byte encoding */
    1315         240 :                                         map_to_unicode(this_char, to_uni_table, &this_char);
    1316         240 :                                         if (this_char == 0xFFFF) /* no mapping; pass through */
    1317           1 :                                                 goto pass_char_through;
    1318             :                                 }
    1319             :                                 /* the cursor may advance */
    1320      123714 :                                 find_entity_for_char(this_char, charset, entity_table.ms_table, &rep,
    1321             :                                         &rep_len, old, oldlen, &cursor);
    1322             :                         } else {
    1323       37096 :                                 find_entity_for_char_basic(this_char, entity_table.table, &rep, &rep_len);
    1324             :                         }
    1325             : 
    1326      160810 :                         if (rep != NULL) {
    1327        2072 :                                 replaced[len++] = '&';
    1328        2072 :                                 memcpy(&replaced[len], rep, rep_len);
    1329        2072 :                                 len += rep_len;
    1330        2072 :                                 replaced[len++] = ';';
    1331             :                         } else {
    1332             :                                 /* we did not find an entity for this char.
    1333             :                                  * check for its validity, if its valid pass it unchanged */
    1334      158738 :                                 if (flags & ENT_HTML_SUBSTITUTE_DISALLOWED_CHARS) {
    1335        1150 :                                         if (CHARSET_UNICODE_COMPAT(charset)) {
    1336         613 :                                                 if (!unicode_cp_is_allowed(this_char, doctype)) {
    1337          78 :                                                         mbsequence = replacement;
    1338          78 :                                                         mbseqlen = replacement_len;
    1339             :                                                 }
    1340         537 :                                         } else if (to_uni_table) {
    1341          97 :                                                 if (!all) /* otherwise we already did this */
    1342           0 :                                                         map_to_unicode(this_char, to_uni_table, &this_char);
    1343          97 :                                                 if (!unicode_cp_is_allowed(this_char, doctype)) {
    1344          19 :                                                         mbsequence = replacement;
    1345          19 :                                                         mbseqlen = replacement_len;
    1346             :                                                 }
    1347             :                                         } else {
    1348             :                                                 /* not a unicode code point, unless, coincidentally, it's in
    1349             :                                                  * the 0x20..0x7D range (except 0x5C in sjis). We know nothing
    1350             :                                                  * about other code points, because we have no tables. Since
    1351             :                                                  * Unicode code points in that range are not disallowed in any
    1352             :                                                  * document type, we could do nothing. However, conversion
    1353             :                                                  * tables frequently map 0x00-0x1F to the respective C0 code
    1354             :                                                  * points. Let's play it safe and admit that's the case */
    1355         852 :                                                 if (this_char <= 0x7D &&
    1356         412 :                                                                 !unicode_cp_is_allowed(this_char, doctype)) {
    1357          75 :                                                         mbsequence = replacement;
    1358          75 :                                                         mbseqlen = replacement_len;
    1359             :                                                 }
    1360             :                                         }
    1361             :                                 }
    1362             : pass_char_through:
    1363      158782 :                                 if (mbseqlen > 1) {
    1364      125945 :                                         memcpy(replaced + len, mbsequence, mbseqlen);
    1365      125945 :                                         len += mbseqlen;
    1366             :                                 } else {
    1367       32837 :                                         replaced[len++] = mbsequence[0];
    1368             :                                 }
    1369             :                         }
    1370             :                 } else { /* this_char == '&' */
    1371         548 :                         if (double_encode) {
    1372             : encode_amp:
    1373         287 :                                 memcpy(&replaced[len], "&amp;", sizeof("&amp;") - 1);
    1374         287 :                                 len += sizeof("&amp;") - 1;
    1375             :                         } else { /* no double encode */
    1376             :                                 /* check if entity is valid */
    1377             :                                 size_t ent_len; /* not counting & or ; */
    1378             :                                 /* peek at next char */
    1379         429 :                                 if (old[cursor] == '#') { /* numeric entity */
    1380             :                                         unsigned code_point;
    1381             :                                         int valid;
    1382         393 :                                         char *pos = (char*)&old[cursor+1];
    1383         393 :                                         valid = process_numeric_entity((const char **)&pos, &code_point);
    1384         393 :                                         if (valid == FAILURE)
    1385          20 :                                                 goto encode_amp;
    1386         373 :                                         if (flags & ENT_HTML_SUBSTITUTE_DISALLOWED_CHARS) {
    1387         312 :                                                 if (!numeric_entity_is_allowed(code_point, doctype))
    1388         130 :                                                         goto encode_amp;
    1389             :                                         }
    1390         243 :                                         ent_len = pos - (char*)&old[cursor];
    1391             :                                 } else { /* named entity */
    1392             :                                         /* check for vality of named entity */
    1393          36 :                                         const char *start = &old[cursor],
    1394          36 :                                                            *next = start;
    1395             :                                         unsigned   dummy1, dummy2;
    1396             : 
    1397          36 :                                         if (process_named_entity_html(&next, &start, &ent_len) == FAILURE)
    1398          14 :                                                 goto encode_amp;
    1399          22 :                                         if (resolve_named_entity_html(start, ent_len, inv_map, &dummy1, &dummy2) == FAILURE) {
    1400           4 :                                                 if (!(doctype == ENT_HTML_DOC_XHTML && ent_len == 4 && start[0] == 'a'
    1401           0 :                                                                         && start[1] == 'p' && start[2] == 'o' && start[3] == 's')) {
    1402             :                                                         /* uses html4 inv_map, which doesn't include apos;. This is a
    1403             :                                                          * hack to support it */
    1404             :                                                         goto encode_amp;
    1405             :                                                 }
    1406             :                                         }
    1407             :                                 }
    1408             :                                 /* checks passed; copy entity to result */
    1409             :                                 /* entity size is unbounded, we may need more memory */
    1410             :                                 /* at this point maxlen - len >= 40 */
    1411         261 :                                 if (maxlen - len < ent_len + 2 /* & and ; */) {
    1412             :                                         /* ent_len < oldlen, which is certainly <= SIZE_MAX/2 */
    1413           1 :                                         replaced = safe_erealloc(replaced, maxlen, 1, ent_len + 128 + 1);
    1414           1 :                                         maxlen += ent_len + 128;
    1415             :                                 }
    1416         261 :                                 replaced[len++] = '&';
    1417         261 :                                 memcpy(&replaced[len], &old[cursor], ent_len);
    1418         261 :                                 len += ent_len;
    1419         261 :                                 replaced[len++] = ';';
    1420         261 :                                 cursor += ent_len + 1;
    1421             :                         }
    1422             :                 }
    1423             :         }
    1424      130242 :         replaced[len] = '\0';
    1425      130242 :         *newlen = len;
    1426             : 
    1427      130242 :         return replaced;
    1428             : }
    1429             : /* }}} */
    1430             : 
    1431             : /* {{{ php_html_entities
    1432             :  */
    1433      134161 : static void php_html_entities(INTERNAL_FUNCTION_PARAMETERS, int all)
    1434             : {
    1435      134161 :         char *str, *hint_charset = NULL;
    1436      134161 :         int str_len, hint_charset_len = 0;
    1437             :         size_t new_len;
    1438      134161 :         long flags = ENT_COMPAT;
    1439             :         char *replaced;
    1440      134161 :         zend_bool double_encode = 1;
    1441             : 
    1442      134161 :         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|ls!b", &str, &str_len, &flags, &hint_charset, &hint_charset_len, &double_encode) == FAILURE) {
    1443           2 :                 return;
    1444             :         }
    1445             : 
    1446      134159 :         replaced = php_escape_html_entities_ex(str, str_len, &new_len, all, (int) flags, hint_charset, double_encode TSRMLS_CC);
    1447      134159 :         RETVAL_STRINGL(replaced, (int)new_len, 0);
    1448             : }
    1449             : /* }}} */
    1450             : 
    1451             : #define HTML_SPECIALCHARS       0
    1452             : #define HTML_ENTITIES           1
    1453             : 
    1454             : /* {{{ register_html_constants
                     :  * Registers the HTML_SPECIALCHARS / HTML_ENTITIES table selectors and the
                     :  * ENT_* constants as long constants. Every constant is registered under
                     :  * its own C name and with the flags CONST_PERSISTENT|CONST_CS.
                     :  * NOTE(review): INIT_FUNC_ARGS and REGISTER_LONG_CONSTANT are defined
                     :  * outside this file section; presumably this runs once at module startup.
    1455             :  */
    1456       20225 : void register_html_constants(INIT_FUNC_ARGS)
    1457             : {
    1458       20225 :         REGISTER_LONG_CONSTANT("HTML_SPECIALCHARS", HTML_SPECIALCHARS, CONST_PERSISTENT|CONST_CS);
    1459       20225 :         REGISTER_LONG_CONSTANT("HTML_ENTITIES", HTML_ENTITIES, CONST_PERSISTENT|CONST_CS);
    1460       20225 :         REGISTER_LONG_CONSTANT("ENT_COMPAT", ENT_COMPAT, CONST_PERSISTENT|CONST_CS);
    1461       20225 :         REGISTER_LONG_CONSTANT("ENT_QUOTES", ENT_QUOTES, CONST_PERSISTENT|CONST_CS);
    1462       20225 :         REGISTER_LONG_CONSTANT("ENT_NOQUOTES", ENT_NOQUOTES, CONST_PERSISTENT|CONST_CS);
    1463       20225 :         REGISTER_LONG_CONSTANT("ENT_IGNORE", ENT_IGNORE, CONST_PERSISTENT|CONST_CS);
    1464       20225 :         REGISTER_LONG_CONSTANT("ENT_SUBSTITUTE", ENT_SUBSTITUTE, CONST_PERSISTENT|CONST_CS);
    1465       20225 :         REGISTER_LONG_CONSTANT("ENT_DISALLOWED", ENT_DISALLOWED, CONST_PERSISTENT|CONST_CS);
    1466       20225 :         REGISTER_LONG_CONSTANT("ENT_HTML401", ENT_HTML401, CONST_PERSISTENT|CONST_CS);
    1467       20225 :         REGISTER_LONG_CONSTANT("ENT_XML1", ENT_XML1, CONST_PERSISTENT|CONST_CS);
    1468       20225 :         REGISTER_LONG_CONSTANT("ENT_XHTML", ENT_XHTML, CONST_PERSISTENT|CONST_CS);
    1469       20225 :         REGISTER_LONG_CONSTANT("ENT_HTML5", ENT_HTML5, CONST_PERSISTENT|CONST_CS);
    1470       20225 : }
    1471             : /* }}} */
    1472             : 
    1473             : /* {{{ proto string htmlspecialchars(string string [, int quote_style[, string charset[, bool double_encode]]])
    1474             :    Convert special characters to HTML entities.
                     :    Thin wrapper: passes the call through unchanged to php_html_entities()
                     :    with 0 as the extra argument (htmlentities() passes 1 instead). */
    1475       12434 : PHP_FUNCTION(htmlspecialchars)
    1476             : {
    1477       12434 :         php_html_entities(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0); /* NOTE(review): 0 is presumably the "all" switch, i.e. special characters only - confirm */
    1478       12434 : }
    1479             : /* }}} */
    1480             : 
    1481             : /* {{{ proto string htmlspecialchars_decode(string string [, int quote_style])
    1482             :    Convert special HTML entities back to characters.
                     :    quote_style defaults to ENT_COMPAT when omitted. Returns the decoded
                     :    string, or FALSE when php_unescape_html_entities() yields NULL. */
    1483         211 : PHP_FUNCTION(htmlspecialchars_decode)
    1484             : {
    1485             :         char *str;
    1486             :         int str_len;
    1487         211 :         size_t new_len = 0;
    1488         211 :         long quote_style = ENT_COMPAT;
    1489             :         char *replaced;
    1490             : 
    1491         211 :         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|l", &str, &str_len, &quote_style) == FAILURE) {
    1492          19 :                 return; /* argument parsing failed: leave without setting a result here */
    1493             :         }
    1494             : 
    1495         192 :         replaced = php_unescape_html_entities(str, str_len, &new_len, 0 /*!all*/, quote_style, NULL TSRMLS_CC); /* NULL: no charset hint is passed */
    1496         192 :         if (replaced) {
    1497         192 :                 RETURN_STRINGL(replaced, (int)new_len, 0); /* new_len is narrowed from size_t to int here */
    1498             :         }
    1499           0 :         RETURN_FALSE;
    1500             : }
    1501             : /* }}} */
    1502             : 
    1503             : /* {{{ proto string html_entity_decode(string string [, int quote_style][, string charset])
    1504             :    Convert all HTML entities to their applicable characters.
                     :    quote_style defaults to ENT_COMPAT and the charset hint to NULL when
                     :    omitted. Returns the decoded string, or FALSE when
                     :    php_unescape_html_entities() yields NULL. */
    1505        4008 : PHP_FUNCTION(html_entity_decode)
    1506             : {
    1507        4008 :         char *str, *hint_charset = NULL;
    1508        4008 :         int str_len, hint_charset_len = 0; /* hint_charset_len is filled by the parser but not read again in this function */
    1509        4008 :         size_t new_len = 0;
    1510        4008 :         long quote_style = ENT_COMPAT;
    1511             :         char *replaced;
    1512             : 
    1513        4008 :         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|ls", &str, &str_len,
    1514             :                                                           &quote_style, &hint_charset, &hint_charset_len) == FAILURE) {
    1515           0 :                 return; /* argument parsing failed: leave without setting a result here */
    1516             :         }
    1517             : 
    1518        4008 :         replaced = php_unescape_html_entities(str, str_len, &new_len, 1 /*all*/, quote_style, hint_charset TSRMLS_CC);
    1519        4008 :         if (replaced) {
    1520        4008 :                 RETURN_STRINGL(replaced, (int)new_len, 0); /* new_len is narrowed from size_t to int here */
    1521             :         }
    1522           0 :         RETURN_FALSE;
    1523             : }
    1524             : /* }}} */
    1525             : 
    1526             : 
    1527             : /* {{{ proto string htmlentities(string string [, int quote_style[, string charset[, bool double_encode]]])
    1528             :    Convert all applicable characters to HTML entities.
                     :    Thin wrapper: passes the call through unchanged to php_html_entities()
                     :    with 1 as the extra argument (htmlspecialchars() passes 0 instead). */
    1529      121727 : PHP_FUNCTION(htmlentities)
    1530             : {
    1531      121727 :         php_html_entities(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1); /* NOTE(review): 1 is presumably the "all" switch, i.e. every applicable character - confirm */
    1532      121727 : }
    1533             : /* }}} */
    1535             : /* {{{ write_s3row_data
                     :  * Adds to the array `arr` the translation-table entries described by the
                     :  * stage-3 row `r`.
                     :  *
                     :  * r        stage-3 row. When r->ambiguous is zero it holds a single entity;
                     :  *          otherwise it points to a multi-code-point table whose entry 0
                     :  *          is the leading entry (optional default entity + entry count)
                     :  *          and whose entries 1..size each name a second code point.
                     :  * orig_cp  code point written (via write_octet_sequence) at the start of
                     :  *          every key produced for this row.
                     :  * charset  charset used to encode the key bytes.
                     :  * arr      destination array; every value added is "&" + name + ";".
                     :  *
                     :  * Second code points that map_from_unicode() cannot represent in a
                     :  * non-Unicode-compatible charset are skipped.
                     :  */
    1536        8488 : static inline void write_s3row_data(
    1537             :         const entity_stage3_row *r,
    1538             :         unsigned orig_cp,
    1539             :         enum entity_charset charset,
    1540             :         zval *arr)
    1541             : {
    1542        8488 :         char key[9] = ""; /* two unicode code points in UTF-8 */
    1543        8488 :         char entity[LONGEST_ENTITY_LENGTH + 2] = {'&'}; /* '&' + name + ';', no room for a NUL: always passed with an explicit length */
    1544             :         size_t written_k1;
    1545             : 
    1546        8488 :         written_k1 = write_octet_sequence(key, charset, orig_cp);
    1547             : 
    1548        8488 :         if (!r->ambiguous) {
    1549        8287 :                 size_t l = r->data.ent.entity_len;
    1550        8287 :                 memcpy(&entity[1], r->data.ent.entity, l);
    1551        8287 :                 entity[l + 1] = ';';
    1552        8287 :                 add_assoc_stringl_ex(arr, key, written_k1 + 1, entity, l + 2, 1); /* key length + 1: presumably counts the key's terminating NUL */
    1553             :         } else {
    1554             :                 unsigned i,
    1555             :                              num_entries;
    1556         201 :                 const entity_multicodepoint_row *mcpr = r->data.multicodepoint_table;
    1557             : 
    1558         201 :                 if (mcpr[0].leading_entry.default_entity != NULL) { /* entity for the first code point on its own */
    1559         195 :                         size_t l = mcpr[0].leading_entry.default_entity_len;
    1560         195 :                         memcpy(&entity[1], mcpr[0].leading_entry.default_entity, l);
    1561         195 :                         entity[l + 1] = ';';
    1562         195 :                         add_assoc_stringl_ex(arr, key, written_k1 + 1, entity, l + 2, 1);
    1563             :                 }
    1564         201 :                 num_entries = mcpr[0].leading_entry.size;
    1565         408 :                 for (i = 1; i <= num_entries; i++) {
    1566             :                         size_t   l,
    1567             :                                      written_k2;
    1568             :                         unsigned uni_cp,
    1569             :                                          spe_cp;
    1570             : 
    1571         207 :                         uni_cp = mcpr[i].normal_entry.second_cp;
    1572         207 :                         l = mcpr[i].normal_entry.entity_len;
    1573             : 
    1574         207 :                         if (!CHARSET_UNICODE_COMPAT(charset)) {
    1575          12 :                                 if (map_from_unicode(uni_cp, charset, &spe_cp) == FAILURE)
    1576           9 :                                         continue; /* non representable in this charset */
    1577             :                         } else {
    1578         195 :                                 spe_cp = uni_cp;
    1579             :                         }
    1580             :                         
    1581         198 :                         written_k2 = write_octet_sequence(&key[written_k1], charset, spe_cp);
    1582         198 :                         memcpy(&entity[1], mcpr[i].normal_entry.entity, l);
    1583         198 :                         entity[l + 1] = ';';
    1584         198 :                         key[written_k1 + written_k2] = '\0'; /* a shorter sequence than the previous iteration must not keep a stale byte here; write_octet_sequence is not assumed to terminate */
    1585         198 :                         add_assoc_stringl_ex(arr, key, written_k1 + written_k2 + 1, entity, l + 2, 1); /* l + 2 keeps the ';', same form as the single-code-point entries */
    1586             :                 }
    1587             :         }
    1588        8488 : }
    1589             : /* }}} */
    1590             : 
    1591             : /* {{{ proto array get_html_translation_table([int table [, int flags [, string charset_hint]]])
    1592             :    Returns the internal translation table used by htmlspecialchars and htmlentities.
                     :    table defaults to HTML_SPECIALCHARS and flags to ENT_COMPAT. The result
                     :    is a new array filled through write_s3row_data(). Entries for the
                     :    single and the double quote are left out unless flags contain
                     :    ENT_HTML_QUOTE_SINGLE or ENT_HTML_QUOTE_DOUBLE respectively.
                     :    Three traversals exist: the three-stage table for Unicode-compatible
                     :    charsets, a byte-by-byte walk (0..0xFF) mapped to Unicode for the other
                     :    charsets, and a flat stage-3 table when table is 0. */
    1593         101 : PHP_FUNCTION(get_html_translation_table)
    1594             : {
    1595         101 :         long all = HTML_SPECIALCHARS,
    1596         101 :                  flags = ENT_COMPAT;
    1597             :         int doctype;
    1598             :         entity_table_opt entity_table;
    1599         101 :         const enc_to_uni *to_uni_table = NULL;
    1600         101 :         char *charset_hint = NULL;
    1601             :         int charset_hint_len; /* only written by the argument parser; not read in this function */
    1602             :         enum entity_charset charset;
    1603             : 
    1604             :         /* in this function we have to jump through some hoops because we're
    1605             :          * getting the translated table from data structures that are optimized for
    1606             :          * random access, not traversal */
    1607             : 
    1608         101 :         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "|lls",
    1609             :                         &all, &flags, &charset_hint, &charset_hint_len) == FAILURE) {
    1610          34 :                 return;
    1611             :         }
    1612             : 
    1613          67 :         charset = determine_charset(charset_hint TSRMLS_CC);
    1614          67 :         doctype = flags & ENT_HTML_DOC_TYPE_MASK; /* keep only the document-type bits of flags */
    1615          67 :         LIMIT_ALL(all, doctype, charset); /* NOTE(review): macro defined elsewhere; presumably may reset `all` for some doctype/charset pairs - confirm */
    1616             : 
    1617          67 :         array_init(return_value);
    1618             :         
    1619          67 :         entity_table = determine_entity_table(all, doctype);
    1620          67 :         if (all && !CHARSET_UNICODE_COMPAT(charset)) {
    1621           4 :                 to_uni_table = enc_to_uni_index[charset]; /* only needed by the byte-by-byte branch below */
    1622             :         }
    1623             : 
    1624          67 :         if (all) { /* HTML_ENTITIES (actually, any non-zero value for 1st param) */
    1625          21 :                 const entity_stage1_row *ms_table = entity_table.ms_table;
    1626             : 
    1627          21 :                 if (CHARSET_UNICODE_COMPAT(charset)) {
    1628             :                         unsigned i, j, k,
    1629             :                                          max_i, max_j, max_k;
    1630             :                         /* no mapping to unicode required */
    1631          18 :                         if (CHARSET_SINGLE_BYTE(charset)) { /* ISO-8859-1 */
    1632           1 :                                 max_i = 1; max_j = 4; max_k = 64; /* 1 * 4 * 64 = 256 stage-3 slots */
    1633             :                         } else {
    1634          16 :                                 max_i = 0x1E; max_j = 64; max_k = 64; /* 0x1E * 64 * 64 stage-3 slots */
    1635             :                         }
    1636             : 
    1637         498 :                         for (i = 0; i < max_i; i++) {
    1638         481 :                                 if (ms_table[i] == empty_stage2_table)
    1639         442 :                                         continue; /* whole stage-2 block is the shared empty table */
    1640        2475 :                                 for (j = 0; j < max_j; j++) {
    1641        2436 :                                         if (ms_table[i][j] == empty_stage3_table)
    1642        1995 :                                                 continue; /* whole stage-3 block is the shared empty table */
    1643       28665 :                                         for (k = 0; k < max_k; k++) {
    1644       28224 :                                                 const entity_stage3_row *r = &ms_table[i][j][k];
    1645             :                                                 unsigned code;
    1646             : 
    1647       28224 :                                                 if (r->data.ent.entity == NULL)
    1648       20493 :                                                         continue;
    1649             : 
    1650        7731 :                                                 code = ENT_CODE_POINT_FROM_STAGES(i, j, k);
    1651        7748 :                                                 if (((code == '\'' && !(flags & ENT_HTML_QUOTE_SINGLE)) ||
    1652          17 :                                                                 (code == '"' && !(flags & ENT_HTML_QUOTE_DOUBLE))))
    1653          17 :                                                         continue; /* this quote character is not requested by flags */
    1654        7714 :                                                 write_s3row_data(r, code, charset, return_value);
    1655             :                                         }
    1656             :                                 }
    1657             :                         }
    1658             :                 } else {
    1659             :                         /* we have to iterate through the set of code points for this
    1660             :                          * encoding and map them to unicode code points */
    1661             :                         unsigned i;
    1662        1028 :                         for (i = 0; i <= 0xFF; i++) {
    1663             :                                 const entity_stage3_row *r;
    1664             :                                 unsigned uni_cp;
    1665             : 
    1666             :                                 /* can be done before mapping, they're invariant */
    1667        1028 :                                 if (((i == '\'' && !(flags & ENT_HTML_QUOTE_SINGLE)) ||
    1668           4 :                                                 (i == '"' && !(flags & ENT_HTML_QUOTE_DOUBLE))))
    1669           4 :                                         continue;
    1670             : 
    1671        1020 :                                 map_to_unicode(i, to_uni_table, &uni_cp);
    1672        1020 :                                 r = &ms_table[ENT_STAGE1_INDEX(uni_cp)][ENT_STAGE2_INDEX(uni_cp)][ENT_STAGE3_INDEX(uni_cp)];
    1673        1020 :                                 if (r->data.ent.entity == NULL)
    1674         424 :                                         continue;
    1675             : 
    1676         596 :                                 write_s3row_data(r, i, charset, return_value); /* key is the original byte i, not uni_cp */
    1677             :                         }
    1678             :                 }
    1679             :         } else {
    1680             :                 /* we could use sizeof(stage3_table_be_apos_00000) as well */
    1681             :                 unsigned          j,
    1682             :                                           numelems = sizeof(stage3_table_be_noapos_00000) /
    1683          46 :                                                         sizeof(*stage3_table_be_noapos_00000);
    1684             : 
    1685        2990 :                 for (j = 0; j < numelems; j++) {
    1686        2944 :                         const entity_stage3_row *r = &entity_table.table[j];
    1687        2944 :                         if (r->data.ent.entity == NULL)
    1688        2714 :                                 continue;
    1689             : 
    1690         276 :                         if (((j == '\'' && !(flags & ENT_HTML_QUOTE_SINGLE)) ||
    1691          46 :                                         (j == '"' && !(flags & ENT_HTML_QUOTE_DOUBLE))))
    1692          52 :                                 continue;
    1693             : 
    1694             :                         /* charset is indifferent, used cs_8859_1 for efficiency */
    1695         178 :                         write_s3row_data(r, j, cs_8859_1, return_value);
    1696             :                 }
    1697             :         }
    1698             : }
    1699             : /* }}} */
    1700             : 
    1701             : /*
    1702             :  * Local variables:
    1703             :  * tab-width: 4
    1704             :  * c-basic-offset: 4
    1705             :  * End:
    1706             :  * vim600: sw=4 ts=4 fdm=marker
    1707             :  * vim<600: sw=4 ts=4
    1708             :  */

Generated by: LCOV version 1.10

Generated at Fri, 24 Oct 2014 05:21:57 +0000 (12 hours ago)

Copyright © 2005-2014 The PHP Group
All rights reserved.