1 : /*
2 : ** 2002 April 25
3 : **
4 : ** The author disclaims copyright to this source code. In place of
5 : ** a legal notice, here is a blessing:
6 : **
7 : ** May you do good and not evil.
8 : ** May you find forgiveness for yourself and forgive others.
9 : ** May you share freely, never taking more than you give.
10 : **
11 : *************************************************************************
12 : ** This file contains helper routines used to translate binary data into
13 : ** a null-terminated string (suitable for use in SQLite) and back again.
14 : ** These are convenience routines for use by people who want to store binary
15 : ** data in an SQLite database. The code in this file is not used by any other
16 : ** part of the SQLite library.
17 : **
18 : ** $Id: encode.c 225725 2006-12-24 20:50:02Z iliaa $
19 : */
20 : #include <string.h>
21 : #include <assert.h>
22 :
23 : /*
24 : ** How This Encoder Works
25 : **
26 : ** The output is allowed to contain any character except 0x27 (') and
27 : ** 0x00. This is accomplished by using an escape character to encode
28 : ** 0x27 and 0x00 as a two-byte sequence. The escape character is always
29 : ** 0x01. An 0x00 is encoded as the two byte sequence 0x01 0x01. The
30 : ** 0x27 character is encoded as the two byte sequence 0x01 0x28. Finally,
31 : ** the escape character itself is encoded as the two-character sequence
32 : ** 0x01 0x02.
33 : **
34 : ** To summarize, the encoder works by using escape sequences as follows:
35 : **
36 : ** 0x00 -> 0x01 0x01
37 : ** 0x01 -> 0x01 0x02
38 : ** 0x27 -> 0x01 0x28
39 : **
40 : ** If that were all the encoder did, it would work, but in certain cases
41 : ** it could double the size of the encoded string. For example, to
42 : ** encode a string of 100 0x27 characters would require 100 instances of
43 : ** the 0x01 0x28 escape sequence resulting in a 200-character output.
44 : ** We would prefer to keep the size of the encoded string smaller than
45 : ** this.
46 : **
47 : ** To minimize the encoding size, we first subtract a fixed offset value from
48 : ** each byte in the sequence. The subtraction is modulo 256. (That is to say,
49 : ** if the original character value is smaller than the offset, the result
50 : ** wraps around into the range 0x00..0xff.) The offset is chosen to minimize
51 : ** the number of characters in the string that need to be escaped. For
52 : ** example, in the case above where the string was composed of 100 0x27
53 : ** characters, the offset might be 0x01. Each of the 0x27 characters would
54 : ** then be converted into an 0x26 character which would not need to be
55 : ** escaped at all and so the 100 character input string would be converted
56 : ** into just 100 characters of output. Actually 101 characters of output -
57 : ** we have to record the offset used as the first byte in the sequence so
58 : ** that the string can be decoded. Since the offset value is stored as
59 : ** part of the output string and the output string is not allowed to contain
60 : ** characters 0x00 or 0x27, the offset cannot be 0x00 or 0x27.
61 : **
62 : ** Here, then, are the encoding steps:
63 : **
64 : ** (1) Choose an offset value and make it the first character of
65 : ** output.
66 : **
67 : ** (2) Copy each input character into the output buffer, one by
68 : ** one, subtracting the offset value (modulo 256) as you copy.
69 : **
70 : ** (3) If the value of an input character minus offset is 0x00, replace
71 : ** that one character by the two-character sequence 0x01 0x01.
72 : ** If the difference is 0x01, replace it with 0x01 0x02. If the
73 : ** difference is 0x27, replace it with 0x01 0x28.
74 : **
75 : ** (4) Put a 0x00 terminator at the end of the output.
76 : **
77 : ** Decoding is obvious:
78 : **
79 : ** (5) Copy encoded characters except the first into the decode
80 : ** buffer. Set the first encoded character aside for use as
81 : ** the offset in step 7 below.
82 : **
83 : ** (6) Convert each 0x01 0x01 sequence into a single character 0x00.
84 : ** Convert 0x01 0x02 into 0x01. Convert 0x01 0x28 into 0x27.
85 : **
86 : ** (7) Add the offset value that was the first character of
87 : ** the encoded buffer to all characters in the output buffer.
88 : **
89 : ** The only tricky part is step (1) - how to compute an offset value to
90 : ** minimize the size of the output buffer. This is accomplished by testing
91 : ** all offset values and picking the one that results in the fewest number
92 : ** of escapes. To do that, we first scan the entire input and count the
93 : ** number of occurrences of each character value in the input. Suppose
94 : ** the number of 0x00 characters is N(0), the number of occurrences of 0x01
95 : ** is N(1), and so forth up to the number of occurrences of 0xff is N(255).
96 : ** An offset of 0 is not allowed so we don't have to test it. The number
97 : ** of escapes required for an offset of 1 is N(1)+N(2)+N(40). The number
98 : ** of escapes required for an offset of 2 is N(2)+N(3)+N(41). And so forth.
99 : ** In this way we find the offset that gives the minimum number of escapes,
100 : ** and thus minimizes the length of the output string.
101 : */
102 :
/*
** Encode a binary buffer "in" of size n bytes so that the result contains
** no instances of the characters '\'' or '\000'.  The output is
** null-terminated and can be used as a string value in an INSERT
** or UPDATE statement.  Use sqlite_decode_binary() to convert the
** string back into its original binary.
**
** The first output byte is the offset that was chosen; every input byte
** has that offset subtracted from it (modulo 256) and any resulting
** 0x00, 0x01 or 0x27 is written as the escape byte 0x01 followed by the
** value plus one.
**
** The result is written into a preallocated output buffer "out".
** "out" must be able to hold at least 2 +(257*n)/254 bytes.
** In other words, the output will be expanded by as much as 3
** bytes for every 254 bytes of input plus 2 bytes of fixed overhead.
** (This is approximately 2 + 1.0118*n or about a 1.2% size increase.)
**
** The return value is the number of characters in the encoded
** string, excluding the "\000" terminator.
**
** If n<=0 the input is not read at all; the single-character string "x"
** is produced and 1 is returned.
**
** If out==NULL then no output is generated but the routine still returns
** the number of characters that would have been generated if out had
** not been NULL.
*/
int sqlite_encode_binary(const unsigned char *in, int n, unsigned char *out){
  int i, j;
  int e;             /* Offset subtracted from every input byte */
  int m;             /* Number of bytes that need escaping when using e */
  unsigned char x;
  int cnt[256];      /* cnt[v] is the number of input bytes equal to v */

  if( n<=0 ){
    if( out ){
      out[0] = 'x';
      out[1] = 0;
    }
    return 1;
  }

  /* Histogram of the input bytes. */
  memset(cnt, 0, sizeof(cnt));
  for(i=0; i<n; i++){
    cnt[in[i]]++;
  }

  /* A byte v must be escaped under offset e when v-e is 0x00, 0x01 or
  ** 0x27, i.e. when v is e, e+1 or e+0x27 (modulo 256).  Seed the search
  ** with offset 1 so that e and m are always defined, then keep the first
  ** offset that gives a strictly smaller escape count.  Offsets 0x00 and
  ** 0x27 are never used because the offset itself is stored in the output.
  */
  e = 1;
  m = cnt[1] + cnt[2] + cnt[1+'\''];
  for(i=2; i<256 && m>0; i++){
    int sum;
    if( i=='\'' ) continue;
    sum = cnt[i] + cnt[(i+1)&0xff] + cnt[(i+'\'')&0xff];
    if( sum<m ){
      m = sum;
      e = i;
    }
  }

  /* Size query only: offset byte + n data bytes + m escape bytes. */
  if( out==NULL ){
    return n+m+1;
  }

  out[0] = (unsigned char)e;
  j = 1;
  for(i=0; i<n; i++){
    x = (unsigned char)(in[i] - e);
    if( x==0 || x==1 || x=='\'' ){
      /* Emit the escape byte, then the value plus one (0x01, 0x02, 0x28). */
      out[j++] = 1;
      x++;
    }
    out[j++] = x;
  }
  out[j] = 0;
  assert( j==n+m+1 );
  return j;
}
164 :
/*
** Decode the string "in" into binary data and write it into "out".
** This routine reverses the encoding created by sqlite_encode_binary().
** The output is always at least one byte shorter than the input string
** because the leading offset byte is consumed.  The number of bytes of
** output is returned.
**
** An empty input string (first byte 0x00) decodes to zero bytes and 0 is
** returned.
**
** If the input is not a well-formed encoding, -1 is returned.  That is
** the case when the escape byte 0x01 is followed by anything other than
** 0x01, 0x02 or 0x28 - in particular when the string ends immediately
** after an escape byte.  On error the contents of "out" are unspecified.
**
** No terminator is appended to "out".
**
** The "in" and "out" parameters may point to the same buffer in order
** to decode a string in place.
*/
int sqlite_decode_binary(const unsigned char *in, unsigned char *out){
  int i, e;
  unsigned char c;

  /* The first byte is the offset the encoder subtracted from the data. */
  e = *(in++);
  if( e==0 ){
    return 0;
  }
  i = 0;
  while( (c = *(in++))!=0 ){
    if( c==1 ){
      /* Escape sequence: the next byte holds the real value plus one.
      ** Reject anything the encoder cannot have produced.  This also
      ** stops the scan from running past the terminator when the string
      ** ends with a lone escape byte.
      */
      c = *(in++);
      if( c!=1 && c!=2 && c!=('\''+1) ){
        return -1;
      }
      c--;
    }
    /* The write index never overtakes the read pointer, so in-place
    ** decoding is safe. */
    out[i++] = (unsigned char)(c + e);
  }
  return i;
}
191 :
#ifdef ENCODER_TEST
#include <stdio.h>
#include <stdlib.h>
/*
** The subroutines above are not tested by the usual test suite.  To test
** these routines, compile just this one file with a -DENCODER_TEST=1 option
** and run the result.
**
** Each iteration builds an input of pseudo-random length (every 100th
** iteration uses the repeating byte pattern 0,1,2,...,255 instead of random
** bytes), encodes it, and checks that:
**
**    *  the returned length equals strlen() of the output,
**    *  the size predicted with out==NULL equals the actual size,
**    *  the output stays within the expected size bound,
**    *  the output contains no single-quote character, and
**    *  decoding in place reproduces the original input.
**
** The program exits with status 1 on the first failure.
*/
int main(int argc, char **argv){
  int i, j, n, m, nOut, nLen, nByteIn, nByteOut;
  unsigned char in[30000];
  unsigned char out[33000];

  (void)argc;
  (void)argv;
  nByteIn = nByteOut = 0;
  for(i=0; i<(int)sizeof(in); i++){
    printf("Test %d: ", i+1);
    n = rand() % (i+1);
    if( i%100==0 ){
      int k;
      for(j=k=0; j<n; j++){
        in[j] = (unsigned char)k;
        k = (k+1)&0xff;
      }
    }else{
      for(j=0; j<n; j++) in[j] = (unsigned char)(rand() & 0xff);
    }
    nByteIn += n;
    nOut = sqlite_encode_binary(in, n, out);
    nByteOut += nOut;
    /* Measure once; the output is at most 33000 bytes so it fits an int. */
    nLen = (int)strlen((const char*)out);
    if( nOut!=nLen ){
      printf(" ERROR return value is %d instead of %d\n", nOut, nLen);
      exit(1);
    }
    if( nOut!=sqlite_encode_binary(in, n, 0) ){
      printf(" ERROR actual output size disagrees with predicted size\n");
      exit(1);
    }
    m = (256*n + 1262)/253;
    printf("size %d->%d (max %d)", n, nLen+1, m);
    if( nLen+1>m ){
      printf(" ERROR output too big\n");
      exit(1);
    }
    for(j=0; out[j]; j++){
      if( out[j]=='\'' ){
        printf(" ERROR contains (')\n");
        exit(1);
      }
    }
    j = sqlite_decode_binary(out, out);
    if( j!=n ){
      printf(" ERROR decode size %d\n", j);
      exit(1);
    }
    if( memcmp(in, out, n)!=0 ){
      printf(" ERROR decode mismatch\n");
      exit(1);
    }
    printf(" OK\n");
  }
  fprintf(stderr,"Finished. Total encoding: %d->%d bytes\n",
          nByteIn, nByteOut);
  fprintf(stderr,"Avg size increase: %.3f%%\n",
          (nByteOut-nByteIn)*100.0/(double)nByteIn);
  return 0;
}
#endif /* ENCODER_TEST */
|