1 : /*
2 : ** 2004 April 6
3 : **
4 : ** The author disclaims copyright to this source code. In place of
5 : ** a legal notice, here is a blessing:
6 : **
7 : ** May you do good and not evil.
8 : ** May you find forgiveness for yourself and forgive others.
9 : ** May you share freely, never taking more than you give.
10 : **
11 : *************************************************************************
12 : ** $Id$
13 : **
14 : ** This file implements a external (disk-based) database using BTrees.
15 : ** For a detailed discussion of BTrees, refer to
16 : **
17 : ** Donald E. Knuth, THE ART OF COMPUTER PROGRAMMING, Volume 3:
18 : ** "Sorting And Searching", pages 473-480. Addison-Wesley
19 : ** Publishing Company, Reading, Massachusetts.
20 : **
21 : ** The basic idea is that each page of the file contains N database
22 : ** entries and N+1 pointers to subpages.
23 : **
24 : ** ----------------------------------------------------------------
25 : ** | Ptr(0) | Key(0) | Ptr(1) | Key(1) | ... | Key(N-1) | Ptr(N) |
26 : ** ----------------------------------------------------------------
27 : **
28 : ** All of the keys on the page that Ptr(0) points to have values less
29 : ** than Key(0). All of the keys on page Ptr(1) and its subpages have
30 : ** values greater than Key(0) and less than Key(1). All of the keys
31 : ** on Ptr(N) and its subpages have values greater than Key(N-1). And
32 : ** so forth.
33 : **
34 : ** Finding a particular key requires reading O(log(M)) pages from the
35 : ** disk where M is the number of entries in the tree.
36 : **
37 : ** In this implementation, a single file can hold one or more separate
38 : ** BTrees. Each BTree is identified by the index of its root page. The
39 : ** key and data for any entry are combined to form the "payload". A
40 : ** fixed amount of payload can be carried directly on the database
41 : ** page. If the payload is larger than the preset amount then surplus
42 : ** bytes are stored on overflow pages. The payload for an entry
43 : ** and the preceding pointer are combined to form a "Cell". Each
44 : ** page has a small header which contains the Ptr(N) pointer and other
45 : ** information such as the size of key and data.
46 : **
47 : ** FORMAT DETAILS
48 : **
49 : ** The file is divided into pages. The first page is called page 1,
50 : ** the second is page 2, and so forth. A page number of zero indicates
51 : ** "no such page". The page size can be anything between 512 and 65536.
52 : ** Each page can be either a btree page, a freelist page or an overflow
53 : ** page.
54 : **
55 : ** The first page is always a btree page. The first 100 bytes of the first
56 : ** page contain a special header (the "file header") that describes the file.
57 : ** The format of the file header is as follows:
58 : **
59 : ** OFFSET SIZE DESCRIPTION
60 : ** 0 16 Header string: "SQLite format 3\000"
61 : ** 16 2 Page size in bytes.
62 : ** 18 1 File format write version
63 : ** 19 1 File format read version
64 : ** 20 1 Bytes of unused space at the end of each page
65 : ** 21 1 Max embedded payload fraction
66 : ** 22 1 Min embedded payload fraction
67 : ** 23 1 Min leaf payload fraction
68 : ** 24 4 File change counter
69 : ** 28 4 Reserved for future use
70 : ** 32 4 First freelist page
71 : ** 36 4 Number of freelist pages in the file
72 : ** 40 60 15 4-byte meta values passed to higher layers
73 : **
74 : ** All of the integer values are big-endian (most significant byte first).
75 : **
76 : ** The file change counter is incremented when the database is changed more
77 : ** than once within the same second. This counter, together with the
78 : ** modification time of the file, allows other processes to know
79 : ** when the file has changed and thus when they need to flush their
80 : ** cache.
81 : **
82 : ** The max embedded payload fraction is the amount of the total usable
83 : ** space in a page that can be consumed by a single cell for standard
84 : ** B-tree (non-LEAFDATA) tables. A value of 255 means 100%. The default
85 : ** is to limit the maximum cell size so that at least 4 cells will fit
86 : ** on one page. Thus the default max embedded payload fraction is 64.
87 : **
88 : ** If the payload for a cell is larger than the max payload, then extra
89 : ** payload is spilled to overflow pages. Once an overflow page is allocated,
90 : ** as many bytes as possible are moved into the overflow pages without letting
91 : ** the cell size drop below the min embedded payload fraction.
92 : **
93 : ** The min leaf payload fraction is like the min embedded payload fraction
94 : ** except that it applies to leaf nodes in a LEAFDATA tree. The maximum
95 : ** payload fraction for a LEAFDATA tree is always 100% (or 255) and it
96 : ** not specified in the header.
97 : **
98 : ** Each btree pages is divided into three sections: The header, the
99 : ** cell pointer array, and the cell area area. Page 1 also has a 100-byte
100 : ** file header that occurs before the page header.
101 : **
102 : ** |----------------|
103 : ** | file header | 100 bytes. Page 1 only.
104 : ** |----------------|
105 : ** | page header | 8 bytes for leaves. 12 bytes for interior nodes
106 : ** |----------------|
107 : ** | cell pointer | | 2 bytes per cell. Sorted order.
108 : ** | array | | Grows downward
109 : ** | | v
110 : ** |----------------|
111 : ** | unallocated |
112 : ** | space |
113 : ** |----------------| ^ Grows upwards
114 : ** | cell content | | Arbitrary order interspersed with freeblocks.
115 : ** | area | | and free space fragments.
116 : ** |----------------|
117 : **
118 : ** The page headers looks like this:
119 : **
120 : ** OFFSET SIZE DESCRIPTION
121 : ** 0 1 Flags. 1: intkey, 2: zerodata, 4: leafdata, 8: leaf
122 : ** 1 2 byte offset to the first freeblock
123 : ** 3 2 number of cells on this page
124 : ** 5 2 first byte of the cell content area
125 : ** 7 1 number of fragmented free bytes
126 : ** 8 4 Right child (the Ptr(N) value). Omitted on leaves.
127 : **
128 : ** The flags define the format of this btree page. The leaf flag means that
129 : ** this page has no children. The zerodata flag means that this page carries
130 : ** only keys and no data. The intkey flag means that the key is a integer
131 : ** which is stored in the key size entry of the cell header rather than in
132 : ** the payload area.
133 : **
134 : ** The cell pointer array begins on the first byte after the page header.
135 : ** The cell pointer array contains zero or more 2-byte numbers which are
136 : ** offsets from the beginning of the page to the cell content in the cell
137 : ** content area. The cell pointers occur in sorted order. The system strives
138 : ** to keep free space after the last cell pointer so that new cells can
139 : ** be easily added without having to defragment the page.
140 : **
141 : ** Cell content is stored at the very end of the page and grows toward the
142 : ** beginning of the page.
143 : **
144 : ** Unused space within the cell content area is collected into a linked list of
145 : ** freeblocks. Each freeblock is at least 4 bytes in size. The byte offset
146 : ** to the first freeblock is given in the header. Freeblocks occur in
147 : ** increasing order. Because a freeblock must be at least 4 bytes in size,
148 : ** any group of 3 or fewer unused bytes in the cell content area cannot
149 : ** exist on the freeblock chain. A group of 3 or fewer free bytes is called
150 : ** a fragment. The total number of bytes in all fragments is recorded.
151 : ** in the page header at offset 7.
152 : **
153 : ** SIZE DESCRIPTION
154 : ** 2 Byte offset of the next freeblock
155 : ** 2 Bytes in this freeblock
156 : **
157 : ** Cells are of variable length. Cells are stored in the cell content area at
158 : ** the end of the page. Pointers to the cells are in the cell pointer array
159 : ** that immediately follows the page header. Cells is not necessarily
160 : ** contiguous or in order, but cell pointers are contiguous and in order.
161 : **
162 : ** Cell content makes use of variable length integers. A variable
163 : ** length integer is 1 to 9 bytes where the lower 7 bits of each
164 : ** byte are used. The integer consists of all bytes that have bit 8 set and
165 : ** the first byte with bit 8 clear. The most significant byte of the integer
166 : ** appears first. A variable-length integer may not be more than 9 bytes long.
167 : ** As a special case, all 8 bytes of the 9th byte are used as data. This
168 : ** allows a 64-bit integer to be encoded in 9 bytes.
169 : **
170 : ** 0x00 becomes 0x00000000
171 : ** 0x7f becomes 0x0000007f
172 : ** 0x81 0x00 becomes 0x00000080
173 : ** 0x82 0x00 becomes 0x00000100
174 : ** 0x80 0x7f becomes 0x0000007f
175 : ** 0x8a 0x91 0xd1 0xac 0x78 becomes 0x12345678
176 : ** 0x81 0x81 0x81 0x81 0x01 becomes 0x10204081
177 : **
178 : ** Variable length integers are used for rowids and to hold the number of
179 : ** bytes of key and data in a btree cell.
180 : **
181 : ** The content of a cell looks like this:
182 : **
183 : ** SIZE DESCRIPTION
184 : ** 4 Page number of the left child. Omitted if leaf flag is set.
185 : ** var Number of bytes of data. Omitted if the zerodata flag is set.
186 : ** var Number of bytes of key. Or the key itself if intkey flag is set.
187 : ** * Payload
188 : ** 4 First page of the overflow chain. Omitted if no overflow
189 : **
190 : ** Overflow pages form a linked list. Each page except the last is completely
191 : ** filled with data (pagesize - 4 bytes). The last page can have as little
192 : ** as 1 byte of data.
193 : **
194 : ** SIZE DESCRIPTION
195 : ** 4 Page number of next overflow page
196 : ** * Data
197 : **
198 : ** Freelist pages come in two subtypes: trunk pages and leaf pages. The
199 : ** file header points to first in a linked list of trunk page. Each trunk
200 : ** page points to multiple leaf pages. The content of a leaf page is
201 : ** unspecified. A trunk page looks like this:
202 : **
203 : ** SIZE DESCRIPTION
204 : ** 4 Page number of next trunk page
205 : ** 4 Number of leaf pointers on this page
206 : ** * zero or more pages numbers of leaves
207 : */
208 : #include "sqliteInt.h"
209 : #include "pager.h"
210 : #include "btree.h"
211 : #include "os.h"
212 : #include <assert.h>
213 :
214 : /* Round up a number to the next larger multiple of 8. This is used
215 : ** to force 8-byte alignment on 64-bit architectures.
216 : */
217 : #define ROUND8(x) ((x+7)&~7)
218 :
219 :
220 : /* The following value is the maximum cell size assuming a maximum page
221 : ** size give above.
222 : */
223 : #define MX_CELL_SIZE(pBt) (pBt->pageSize-8)
224 :
225 : /* The maximum number of cells on a single page of the database. This
226 : ** assumes a minimum cell size of 3 bytes. Such small cells will be
227 : ** exceedingly rare, but they are possible.
228 : */
229 : #define MX_CELL(pBt) ((pBt->pageSize-8)/3)
230 :
231 : /* Forward declarations */
232 : typedef struct MemPage MemPage;
233 : typedef struct BtLock BtLock;
234 :
235 : /*
236 : ** This is a magic string that appears at the beginning of every
237 : ** SQLite database in order to identify the file as a real database.
238 : **
239 : ** You can change this value at compile-time by specifying a
240 : ** -DSQLITE_FILE_HEADER="..." on the compiler command-line. The
241 : ** header must be exactly 16 bytes including the zero-terminator so
242 : ** the string itself should be 15 characters long. If you change
243 : ** the header, then your custom library will not be able to read
244 : ** databases generated by the standard tools and the standard tools
245 : ** will not be able to read databases created by your custom library.
246 : */
247 : #ifndef SQLITE_FILE_HEADER /* 123456789 123456 */
248 : # define SQLITE_FILE_HEADER "SQLite format 3"
249 : #endif
250 : static const char zMagicHeader[] = SQLITE_FILE_HEADER;
251 :
252 : /*
253 : ** Page type flags. An ORed combination of these flags appear as the
254 : ** first byte of every BTree page.
255 : */
256 : #define PTF_INTKEY 0x01
257 : #define PTF_ZERODATA 0x02
258 : #define PTF_LEAFDATA 0x04
259 : #define PTF_LEAF 0x08
260 :
261 : /*
262 : ** As each page of the file is loaded into memory, an instance of the following
263 : ** structure is appended and initialized to zero. This structure stores
264 : ** information about the page that is decoded from the raw file page.
265 : **
266 : ** The pParent field points back to the parent page. This allows us to
267 : ** walk up the BTree from any leaf to the root. Care must be taken to
268 : ** unref() the parent page pointer when this page is no longer referenced.
269 : ** The pageDestructor() routine handles that chore.
270 : */
271 : struct MemPage {
272 : u8 isInit; /* True if previously initialized. MUST BE FIRST! */
273 : u8 idxShift; /* True if Cell indices have changed */
274 : u8 nOverflow; /* Number of overflow cell bodies in aCell[] */
275 : u8 intKey; /* True if intkey flag is set */
276 : u8 leaf; /* True if leaf flag is set */
277 : u8 zeroData; /* True if table stores keys only */
278 : u8 leafData; /* True if tables stores data on leaves only */
279 : u8 hasData; /* True if this page stores data */
280 : u8 hdrOffset; /* 100 for page 1. 0 otherwise */
281 : u8 childPtrSize; /* 0 if leaf==1. 4 if leaf==0 */
282 : u16 maxLocal; /* Copy of Btree.maxLocal or Btree.maxLeaf */
283 : u16 minLocal; /* Copy of Btree.minLocal or Btree.minLeaf */
284 : u16 cellOffset; /* Index in aData of first cell pointer */
285 : u16 idxParent; /* Index in parent of this node */
286 : u16 nFree; /* Number of free bytes on the page */
287 : u16 nCell; /* Number of cells on this page, local and ovfl */
288 : struct _OvflCell { /* Cells that will not fit on aData[] */
289 : u8 *pCell; /* Pointers to the body of the overflow cell */
290 : u16 idx; /* Insert this cell before idx-th non-overflow cell */
291 : } aOvfl[5];
292 : BtShared *pBt; /* Pointer back to BTree structure */
293 : u8 *aData; /* Pointer back to the start of the page */
294 : DbPage *pDbPage; /* Pager page handle */
295 : Pgno pgno; /* Page number for this page */
296 : MemPage *pParent; /* The parent of this page. NULL for root */
297 : };
298 :
299 : /*
300 : ** The in-memory image of a disk page has the auxiliary information appended
301 : ** to the end. EXTRA_SIZE is the number of bytes of space needed to hold
302 : ** that extra information.
303 : */
304 : #define EXTRA_SIZE sizeof(MemPage)
305 :
306 : /* Btree handle */
307 : struct Btree {
308 : sqlite3 *pSqlite;
309 : BtShared *pBt;
310 : u8 inTrans; /* TRANS_NONE, TRANS_READ or TRANS_WRITE */
311 : };
312 :
313 : /*
314 : ** Btree.inTrans may take one of the following values.
315 : **
316 : ** If the shared-data extension is enabled, there may be multiple users
317 : ** of the Btree structure. At most one of these may open a write transaction,
318 : ** but any number may have active read transactions. Variable Btree.pDb
319 : ** points to the handle that owns any current write-transaction.
320 : */
321 : #define TRANS_NONE 0
322 : #define TRANS_READ 1
323 : #define TRANS_WRITE 2
324 :
325 : /*
326 : ** Everything we need to know about an open database
327 : */
328 : struct BtShared {
329 : Pager *pPager; /* The page cache */
330 : BtCursor *pCursor; /* A list of all open cursors */
331 : MemPage *pPage1; /* First page of the database */
332 : u8 inStmt; /* True if we are in a statement subtransaction */
333 : u8 readOnly; /* True if the underlying file is readonly */
334 : u8 maxEmbedFrac; /* Maximum payload as % of total page size */
335 : u8 minEmbedFrac; /* Minimum payload as % of total page size */
336 : u8 minLeafFrac; /* Minimum leaf payload as % of total page size */
337 : u8 pageSizeFixed; /* True if the page size can no longer be changed */
338 : #ifndef SQLITE_OMIT_AUTOVACUUM
339 : u8 autoVacuum; /* True if database supports auto-vacuum */
340 : #endif
341 : u16 pageSize; /* Total number of bytes on a page */
342 : u16 usableSize; /* Number of usable bytes on each page */
343 : int maxLocal; /* Maximum local payload in non-LEAFDATA tables */
344 : int minLocal; /* Minimum local payload in non-LEAFDATA tables */
345 : int maxLeaf; /* Maximum local payload in a LEAFDATA table */
346 : int minLeaf; /* Minimum local payload in a LEAFDATA table */
347 : BusyHandler *pBusyHandler; /* Callback for when there is lock contention */
348 : u8 inTransaction; /* Transaction state */
349 : int nRef; /* Number of references to this structure */
350 : int nTransaction; /* Number of open transactions (read + write) */
351 : void *pSchema; /* Pointer to space allocated by sqlite3BtreeSchema() */
352 : void (*xFreeSchema)(void*); /* Destructor for BtShared.pSchema */
353 : #ifndef SQLITE_OMIT_SHARED_CACHE
354 : BtLock *pLock; /* List of locks held on this shared-btree struct */
355 : BtShared *pNext; /* Next in ThreadData.pBtree linked list */
356 : #endif
357 : };
358 :
359 : /*
360 : ** An instance of the following structure is used to hold information
361 : ** about a cell. The parseCellPtr() function fills in this structure
362 : ** based on information extract from the raw disk page.
363 : */
364 : typedef struct CellInfo CellInfo;
365 : struct CellInfo {
366 : u8 *pCell; /* Pointer to the start of cell content */
367 : i64 nKey; /* The key for INTKEY tables, or number of bytes in key */
368 : u32 nData; /* Number of bytes of data */
369 : u32 nPayload; /* Total amount of payload */
370 : u16 nHeader; /* Size of the cell content header in bytes */
371 : u16 nLocal; /* Amount of payload held locally */
372 : u16 iOverflow; /* Offset to overflow page number. Zero if no overflow */
373 : u16 nSize; /* Size of the cell content on the main b-tree page */
374 : };
375 :
376 : /*
377 : ** A cursor is a pointer to a particular entry in the BTree.
378 : ** The entry is identified by its MemPage and the index in
379 : ** MemPage.aCell[] of the entry.
380 : */
381 : struct BtCursor {
382 : Btree *pBtree; /* The Btree to which this cursor belongs */
383 : BtCursor *pNext, *pPrev; /* Forms a linked list of all cursors */
384 : int (*xCompare)(void*,int,const void*,int,const void*); /* Key comp func */
385 : void *pArg; /* First arg to xCompare() */
386 : Pgno pgnoRoot; /* The root page of this tree */
387 : MemPage *pPage; /* Page that contains the entry */
388 : int idx; /* Index of the entry in pPage->aCell[] */
389 : CellInfo info; /* A parse of the cell we are pointing at */
390 : u8 wrFlag; /* True if writable */
391 : u8 eState; /* One of the CURSOR_XXX constants (see below) */
392 : void *pKey; /* Saved key that was cursor's last known position */
393 : i64 nKey; /* Size of pKey, or last integer key */
394 : int skip; /* (skip<0) -> Prev() is a no-op. (skip>0) -> Next() is */
395 : };
396 :
397 : /*
398 : ** Potential values for BtCursor.eState.
399 : **
400 : ** CURSOR_VALID:
401 : ** Cursor points to a valid entry. getPayload() etc. may be called.
402 : **
403 : ** CURSOR_INVALID:
404 : ** Cursor does not point to a valid entry. This can happen (for example)
405 : ** because the table is empty or because BtreeCursorFirst() has not been
406 : ** called.
407 : **
408 : ** CURSOR_REQUIRESEEK:
409 : ** The table that this cursor was opened on still exists, but has been
410 : ** modified since the cursor was last used. The cursor position is saved
411 : ** in variables BtCursor.pKey and BtCursor.nKey. When a cursor is in
412 : ** this state, restoreOrClearCursorPosition() can be called to attempt to
413 : ** seek the cursor to the saved position.
414 : */
415 : #define CURSOR_INVALID 0
416 : #define CURSOR_VALID 1
417 : #define CURSOR_REQUIRESEEK 2
418 :
419 : /*
420 : ** The TRACE macro will print high-level status information about the
421 : ** btree operation when the global variable sqlite3_btree_trace is
422 : ** enabled.
423 : */
424 : #if SQLITE_TEST
425 : # define TRACE(X) if( sqlite3_btree_trace )\
426 : /* { sqlite3DebugPrintf X; fflush(stdout); } */ \
427 : { printf X; fflush(stdout); }
428 : int sqlite3_btree_trace=0; /* True to enable tracing */
429 : #else
430 : # define TRACE(X)
431 : #endif
432 :
433 : /*
434 : ** Forward declaration
435 : */
436 : static int checkReadLocks(Btree*,Pgno,BtCursor*);
437 :
438 : /*
439 : ** Read or write a two- and four-byte big-endian integer values.
440 : */
441 5533 : static u32 get2byte(unsigned char *p){
442 5533 : return (p[0]<<8) | p[1];
443 : }
444 906 : static u32 get4byte(unsigned char *p){
445 906 : return (p[0]<<24) | (p[1]<<16) | (p[2]<<8) | p[3];
446 : }
447 1746 : static void put2byte(unsigned char *p, u32 v){
448 1746 : p[0] = v>>8;
449 1746 : p[1] = v;
450 1746 : }
451 172 : static void put4byte(unsigned char *p, u32 v){
452 172 : p[0] = v>>24;
453 172 : p[1] = v>>16;
454 172 : p[2] = v>>8;
455 172 : p[3] = v;
456 172 : }
457 :
458 : /*
459 : ** Routines to read and write variable-length integers. These used to
460 : ** be defined locally, but now we use the varint routines in the util.c
461 : ** file.
462 : */
463 : #define getVarint sqlite3GetVarint
464 : /* #define getVarint32 sqlite3GetVarint32 */
465 : #define getVarint32(A,B) ((*B=*(A))<=0x7f?1:sqlite3GetVarint32(A,B))
466 : #define putVarint sqlite3PutVarint
467 :
468 : /* The database page the PENDING_BYTE occupies. This page is never used.
469 : ** TODO: This macro is very similary to PAGER_MJ_PGNO() in pager.c. They
470 : ** should possibly be consolidated (presumably in pager.h).
471 : **
472 : ** If disk I/O is omitted (meaning that the database is stored purely
473 : ** in memory) then there is no pending byte.
474 : */
475 : #ifdef SQLITE_OMIT_DISKIO
476 : # define PENDING_BYTE_PAGE(pBt) 0x7fffffff
477 : #else
478 : # define PENDING_BYTE_PAGE(pBt) ((PENDING_BYTE/(pBt)->pageSize)+1)
479 : #endif
480 :
481 : /*
482 : ** A linked list of the following structures is stored at BtShared.pLock.
483 : ** Locks are added (or upgraded from READ_LOCK to WRITE_LOCK) when a cursor
484 : ** is opened on the table with root page BtShared.iTable. Locks are removed
485 : ** from this list when a transaction is committed or rolled back, or when
486 : ** a btree handle is closed.
487 : */
488 : struct BtLock {
489 : Btree *pBtree; /* Btree handle holding this lock */
490 : Pgno iTable; /* Root page of table */
491 : u8 eLock; /* READ_LOCK or WRITE_LOCK */
492 : BtLock *pNext; /* Next in BtShared.pLock list */
493 : };
494 :
495 : /* Candidate values for BtLock.eLock */
496 : #define READ_LOCK 1
497 : #define WRITE_LOCK 2
498 :
499 : #ifdef SQLITE_OMIT_SHARED_CACHE
500 : /*
501 : ** The functions queryTableLock(), lockTable() and unlockAllTables()
502 : ** manipulate entries in the BtShared.pLock linked list used to store
503 : ** shared-cache table level locks. If the library is compiled with the
504 : ** shared-cache feature disabled, then there is only ever one user
505 : ** of each BtShared structure and so this locking is not necessary.
506 : ** So define the lock related functions as no-ops.
507 : */
508 : #define queryTableLock(a,b,c) SQLITE_OK
509 : #define lockTable(a,b,c) SQLITE_OK
510 : #define unlockAllTables(a)
511 : #else
512 :
513 :
514 : /*
515 : ** Query to see if btree handle p may obtain a lock of type eLock
516 : ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return
517 : ** SQLITE_OK if the lock may be obtained (by calling lockTable()), or
518 : ** SQLITE_LOCKED if not.
519 : */
520 1431 : static int queryTableLock(Btree *p, Pgno iTab, u8 eLock){
521 1431 : BtShared *pBt = p->pBt;
522 : BtLock *pIter;
523 :
524 : /* This is a no-op if the shared-cache is not enabled */
525 1431 : if( 0==sqlite3ThreadDataReadOnly()->useSharedData ){
526 1431 : return SQLITE_OK;
527 : }
528 :
529 : /* This (along with lockTable()) is where the ReadUncommitted flag is
530 : ** dealt with. If the caller is querying for a read-lock and the flag is
531 : ** set, it is unconditionally granted - even if there are write-locks
532 : ** on the table. If a write-lock is requested, the ReadUncommitted flag
533 : ** is not considered.
534 : **
535 : ** In function lockTable(), if a read-lock is demanded and the
536 : ** ReadUncommitted flag is set, no entry is added to the locks list
537 : ** (BtShared.pLock).
538 : **
539 : ** To summarize: If the ReadUncommitted flag is set, then read cursors do
540 : ** not create or respect table locks. The locking procedure for a
541 : ** write-cursor does not change.
542 : */
543 0 : if(
544 : !p->pSqlite ||
545 : 0==(p->pSqlite->flags&SQLITE_ReadUncommitted) ||
546 : eLock==WRITE_LOCK ||
547 : iTab==MASTER_ROOT
548 : ){
549 0 : for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
550 0 : if( pIter->pBtree!=p && pIter->iTable==iTab &&
551 : (pIter->eLock!=eLock || eLock!=READ_LOCK) ){
552 0 : return SQLITE_LOCKED;
553 : }
554 : }
555 : }
556 0 : return SQLITE_OK;
557 : }
558 :
559 : /*
560 : ** Add a lock on the table with root-page iTable to the shared-btree used
561 : ** by Btree handle p. Parameter eLock must be either READ_LOCK or
562 : ** WRITE_LOCK.
563 : **
564 : ** SQLITE_OK is returned if the lock is added successfully. SQLITE_BUSY and
565 : ** SQLITE_NOMEM may also be returned.
566 : */
567 501 : static int lockTable(Btree *p, Pgno iTable, u8 eLock){
568 501 : BtShared *pBt = p->pBt;
569 501 : BtLock *pLock = 0;
570 : BtLock *pIter;
571 :
572 : /* This is a no-op if the shared-cache is not enabled */
573 501 : if( 0==sqlite3ThreadDataReadOnly()->useSharedData ){
574 501 : return SQLITE_OK;
575 : }
576 :
577 : assert( SQLITE_OK==queryTableLock(p, iTable, eLock) );
578 :
579 : /* If the read-uncommitted flag is set and a read-lock is requested,
580 : ** return early without adding an entry to the BtShared.pLock list. See
581 : ** comment in function queryTableLock() for more info on handling
582 : ** the ReadUncommitted flag.
583 : */
584 0 : if(
585 : (p->pSqlite) &&
586 : (p->pSqlite->flags&SQLITE_ReadUncommitted) &&
587 : (eLock==READ_LOCK) &&
588 : iTable!=MASTER_ROOT
589 : ){
590 0 : return SQLITE_OK;
591 : }
592 :
593 : /* First search the list for an existing lock on this table. */
594 0 : for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
595 0 : if( pIter->iTable==iTable && pIter->pBtree==p ){
596 0 : pLock = pIter;
597 0 : break;
598 : }
599 : }
600 :
601 : /* If the above search did not find a BtLock struct associating Btree p
602 : ** with table iTable, allocate one and link it into the list.
603 : */
604 0 : if( !pLock ){
605 0 : pLock = (BtLock *)sqliteMalloc(sizeof(BtLock));
606 0 : if( !pLock ){
607 0 : return SQLITE_NOMEM;
608 : }
609 0 : pLock->iTable = iTable;
610 0 : pLock->pBtree = p;
611 0 : pLock->pNext = pBt->pLock;
612 0 : pBt->pLock = pLock;
613 : }
614 :
615 : /* Set the BtLock.eLock variable to the maximum of the current lock
616 : ** and the requested lock. This means if a write-lock was already held
617 : ** and a read-lock requested, we don't incorrectly downgrade the lock.
618 : */
619 : assert( WRITE_LOCK>READ_LOCK );
620 0 : if( eLock>pLock->eLock ){
621 0 : pLock->eLock = eLock;
622 : }
623 :
624 0 : return SQLITE_OK;
625 : }
626 :
627 : /*
628 : ** Release all the table locks (locks obtained via calls to the lockTable()
629 : ** procedure) held by Btree handle p.
630 : */
631 694 : static void unlockAllTables(Btree *p){
632 694 : BtLock **ppIter = &p->pBt->pLock;
633 :
634 : /* If the shared-cache extension is not enabled, there should be no
635 : ** locks in the BtShared.pLock list, making this procedure a no-op. Assert
636 : ** that this is the case.
637 : */
638 : assert( sqlite3ThreadDataReadOnly()->useSharedData || 0==*ppIter );
639 :
640 1388 : while( *ppIter ){
641 0 : BtLock *pLock = *ppIter;
642 0 : if( pLock->pBtree==p ){
643 0 : *ppIter = pLock->pNext;
644 0 : sqliteFree(pLock);
645 : }else{
646 0 : ppIter = &pLock->pNext;
647 : }
648 : }
649 694 : }
650 : #endif /* SQLITE_OMIT_SHARED_CACHE */
651 :
652 : static void releasePage(MemPage *pPage); /* Forward reference */
653 :
654 : /*
655 : ** Save the current cursor position in the variables BtCursor.nKey
656 : ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.
657 : */
658 0 : static int saveCursorPosition(BtCursor *pCur){
659 : int rc;
660 :
661 : assert( CURSOR_VALID==pCur->eState );
662 : assert( 0==pCur->pKey );
663 :
664 0 : rc = sqlite3BtreeKeySize(pCur, &pCur->nKey);
665 :
666 : /* If this is an intKey table, then the above call to BtreeKeySize()
667 : ** stores the integer key in pCur->nKey. In this case this value is
668 : ** all that is required. Otherwise, if pCur is not open on an intKey
669 : ** table, then malloc space for and store the pCur->nKey bytes of key
670 : ** data.
671 : */
672 0 : if( rc==SQLITE_OK && 0==pCur->pPage->intKey){
673 0 : void *pKey = sqliteMalloc(pCur->nKey);
674 0 : if( pKey ){
675 0 : rc = sqlite3BtreeKey(pCur, 0, pCur->nKey, pKey);
676 0 : if( rc==SQLITE_OK ){
677 0 : pCur->pKey = pKey;
678 : }else{
679 0 : sqliteFree(pKey);
680 : }
681 : }else{
682 0 : rc = SQLITE_NOMEM;
683 : }
684 : }
685 : assert( !pCur->pPage->intKey || !pCur->pKey );
686 :
687 0 : if( rc==SQLITE_OK ){
688 0 : releasePage(pCur->pPage);
689 0 : pCur->pPage = 0;
690 0 : pCur->eState = CURSOR_REQUIRESEEK;
691 : }
692 :
693 0 : return rc;
694 : }
695 :
696 : /*
697 : ** Save the positions of all cursors except pExcept open on the table
698 : ** with root-page iRoot. Usually, this is called just before cursor
699 : ** pExcept is used to modify the table (BtreeDelete() or BtreeInsert()).
700 : */
701 567 : static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){
702 : BtCursor *p;
703 1254 : for(p=pBt->pCursor; p; p=p->pNext){
704 687 : if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) &&
705 : p->eState==CURSOR_VALID ){
706 0 : int rc = saveCursorPosition(p);
707 0 : if( SQLITE_OK!=rc ){
708 0 : return rc;
709 : }
710 : }
711 : }
712 567 : return SQLITE_OK;
713 : }
714 :
715 : /*
716 : ** Clear the current cursor position.
717 : */
718 1155 : static void clearCursorPosition(BtCursor *pCur){
719 1155 : sqliteFree(pCur->pKey);
720 1155 : pCur->pKey = 0;
721 1155 : pCur->eState = CURSOR_INVALID;
722 1155 : }
723 :
724 : /*
725 : ** Restore the cursor to the position it was in (or as close to as possible)
726 : ** when saveCursorPosition() was called. Note that this call deletes the
727 : ** saved position info stored by saveCursorPosition(), so there can be
728 : ** at most one effective restoreOrClearCursorPosition() call after each
729 : ** saveCursorPosition().
730 : **
731 : ** If the second argument argument - doSeek - is false, then instead of
732 : ** returning the cursor to it's saved position, any saved position is deleted
733 : ** and the cursor state set to CURSOR_INVALID.
734 : */
735 0 : static int restoreOrClearCursorPositionX(BtCursor *pCur){
736 : int rc;
737 : assert( pCur->eState==CURSOR_REQUIRESEEK );
738 0 : pCur->eState = CURSOR_INVALID;
739 0 : rc = sqlite3BtreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &pCur->skip);
740 0 : if( rc==SQLITE_OK ){
741 0 : sqliteFree(pCur->pKey);
742 0 : pCur->pKey = 0;
743 : assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID );
744 : }
745 0 : return rc;
746 : }
747 :
748 : #define restoreOrClearCursorPosition(p) \
749 : (p->eState==CURSOR_REQUIRESEEK?restoreOrClearCursorPositionX(p):SQLITE_OK)
750 :
751 : #ifndef SQLITE_OMIT_AUTOVACUUM
752 : /*
753 : ** These macros define the location of the pointer-map entry for a
754 : ** database page. The first argument to each is the number of usable
755 : ** bytes on each page of the database (often 1024). The second is the
756 : ** page number to look up in the pointer map.
757 : **
758 : ** PTRMAP_PAGENO returns the database page number of the pointer-map
759 : ** page that stores the required pointer. PTRMAP_PTROFFSET returns
760 : ** the offset of the requested map entry.
761 : **
762 : ** If the pgno argument passed to PTRMAP_PAGENO is a pointer-map page,
763 : ** then pgno is returned. So (pgno==PTRMAP_PAGENO(pgsz, pgno)) can be
764 : ** used to test if pgno is a pointer-map page. PTRMAP_ISPAGE implements
765 : ** this test.
766 : */
767 : #define PTRMAP_PAGENO(pBt, pgno) ptrmapPageno(pBt, pgno)
768 : #define PTRMAP_PTROFFSET(pBt, pgno) (5*(pgno-ptrmapPageno(pBt, pgno)-1))
769 : #define PTRMAP_ISPAGE(pBt, pgno) (PTRMAP_PAGENO((pBt),(pgno))==(pgno))
770 :
771 0 : static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){
772 0 : int nPagesPerMapPage = (pBt->usableSize/5)+1;
773 0 : int iPtrMap = (pgno-2)/nPagesPerMapPage;
774 0 : int ret = (iPtrMap*nPagesPerMapPage) + 2;
775 0 : if( ret==PENDING_BYTE_PAGE(pBt) ){
776 0 : ret++;
777 : }
778 0 : return ret;
779 : }
780 :
781 : /*
782 : ** The pointer map is a lookup table that identifies the parent page for
783 : ** each child page in the database file. The parent page is the page that
784 : ** contains a pointer to the child. Every page in the database contains
785 : ** 0 or 1 parent pages. (In this context 'database page' refers
786 : ** to any page that is not part of the pointer map itself.) Each pointer map
787 : ** entry consists of a single byte 'type' and a 4 byte parent page number.
788 : ** The PTRMAP_XXX identifiers below are the valid types.
789 : **
790 : ** The purpose of the pointer map is to facility moving pages from one
791 : ** position in the file to another as part of autovacuum. When a page
792 : ** is moved, the pointer in its parent must be updated to point to the
793 : ** new location. The pointer map is used to locate the parent page quickly.
794 : **
795 : ** PTRMAP_ROOTPAGE: The database page is a root-page. The page-number is not
796 : ** used in this case.
797 : **
798 : ** PTRMAP_FREEPAGE: The database page is an unused (free) page. The page-number
799 : ** is not used in this case.
800 : **
801 : ** PTRMAP_OVERFLOW1: The database page is the first page in a list of
802 : ** overflow pages. The page number identifies the page that
803 : ** contains the cell with a pointer to this overflow page.
804 : **
805 : ** PTRMAP_OVERFLOW2: The database page is the second or later page in a list of
806 : ** overflow pages. The page-number identifies the previous
807 : ** page in the overflow page list.
808 : **
809 : ** PTRMAP_BTREE: The database page is a non-root btree page. The page number
810 : ** identifies the parent page in the btree.
811 : */
812 : #define PTRMAP_ROOTPAGE 1
813 : #define PTRMAP_FREEPAGE 2
814 : #define PTRMAP_OVERFLOW1 3
815 : #define PTRMAP_OVERFLOW2 4
816 : #define PTRMAP_BTREE 5
817 :
818 : /*
819 : ** Write an entry into the pointer map.
820 : **
821 : ** This routine updates the pointer map entry for page number 'key'
822 : ** so that it maps to type 'eType' and parent page number 'pgno'.
823 : ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
824 : */
825 0 : static int ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent){
826 : DbPage *pDbPage; /* The pointer map page */
827 : u8 *pPtrmap; /* The pointer map data */
828 : Pgno iPtrmap; /* The pointer map page number */
829 : int offset; /* Offset in pointer map page */
830 : int rc;
831 :
832 : /* The master-journal page number must never be used as a pointer map page */
833 : assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );
834 :
835 : assert( pBt->autoVacuum );
836 0 : if( key==0 ){
837 0 : return SQLITE_CORRUPT_BKPT;
838 : }
839 0 : iPtrmap = PTRMAP_PAGENO(pBt, key);
840 0 : rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
841 0 : if( rc!=SQLITE_OK ){
842 0 : return rc;
843 : }
844 0 : offset = PTRMAP_PTROFFSET(pBt, key);
845 0 : pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
846 :
847 0 : if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){
848 : TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
849 0 : rc = sqlite3PagerWrite(pDbPage);
850 0 : if( rc==SQLITE_OK ){
851 0 : pPtrmap[offset] = eType;
852 0 : put4byte(&pPtrmap[offset+1], parent);
853 : }
854 : }
855 :
856 0 : sqlite3PagerUnref(pDbPage);
857 0 : return rc;
858 : }
859 :
860 : /*
861 : ** Read an entry from the pointer map.
862 : **
863 : ** This routine retrieves the pointer map entry for page 'key', writing
864 : ** the type and parent page number to *pEType and *pPgno respectively.
865 : ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
866 : */
867 0 : static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){
868 : DbPage *pDbPage; /* The pointer map page */
869 : int iPtrmap; /* Pointer map page index */
870 : u8 *pPtrmap; /* Pointer map page data */
871 : int offset; /* Offset of entry in pointer map */
872 : int rc;
873 :
874 0 : iPtrmap = PTRMAP_PAGENO(pBt, key);
875 0 : rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
876 0 : if( rc!=0 ){
877 0 : return rc;
878 : }
879 0 : pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
880 :
881 0 : offset = PTRMAP_PTROFFSET(pBt, key);
882 : assert( pEType!=0 );
883 0 : *pEType = pPtrmap[offset];
884 0 : if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);
885 :
886 0 : sqlite3PagerUnref(pDbPage);
887 0 : if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_BKPT;
888 0 : return SQLITE_OK;
889 : }
890 :
891 : #endif /* SQLITE_OMIT_AUTOVACUUM */
892 :
893 : /*
894 : ** Given a btree page and a cell index (0 means the first cell on
895 : ** the page, 1 means the second cell, and so forth) return a pointer
896 : ** to the cell content.
897 : **
898 : ** This routine works only for pages that do not contain overflow cells.
899 : */
900 1497 : static u8 *findCell(MemPage *pPage, int iCell){
901 1497 : u8 *data = pPage->aData;
902 : assert( iCell>=0 );
903 : assert( iCell<get2byte(&data[pPage->hdrOffset+3]) );
904 1497 : return data + get2byte(&data[pPage->cellOffset+2*iCell]);
905 : }
906 :
907 : /*
908 : ** This a more complex version of findCell() that works for
909 : ** pages that do contain overflow cells. See insert
910 : */
911 0 : static u8 *findOverflowCell(MemPage *pPage, int iCell){
912 : int i;
913 0 : for(i=pPage->nOverflow-1; i>=0; i--){
914 : int k;
915 : struct _OvflCell *pOvfl;
916 0 : pOvfl = &pPage->aOvfl[i];
917 0 : k = pOvfl->idx;
918 0 : if( k<=iCell ){
919 0 : if( k==iCell ){
920 0 : return pOvfl->pCell;
921 : }
922 0 : iCell--;
923 : }
924 : }
925 0 : return findCell(pPage, iCell);
926 : }
927 :
928 : /*
929 : ** Parse a cell content block and fill in the CellInfo structure. There
930 : ** are two versions of this function. parseCell() takes a cell index
931 : ** as the second argument and parseCellPtr() takes a pointer to the
932 : ** body of the cell as its second argument.
933 : */
934 : static void parseCellPtr(
935 : MemPage *pPage, /* Page containing the cell */
936 : u8 *pCell, /* Pointer to the cell text. */
937 : CellInfo *pInfo /* Fill in this structure */
938 1544 : ){
939 : int n; /* Number bytes in cell content header */
940 : u32 nPayload; /* Number of bytes of cell payload */
941 :
942 1544 : pInfo->pCell = pCell;
943 : assert( pPage->leaf==0 || pPage->leaf==1 );
944 1544 : n = pPage->childPtrSize;
945 : assert( n==4-4*pPage->leaf );
946 1544 : if( pPage->hasData ){
947 993 : n += getVarint32(&pCell[n], &nPayload);
948 : }else{
949 551 : nPayload = 0;
950 : }
951 1544 : pInfo->nData = nPayload;
952 1544 : if( pPage->intKey ){
953 993 : n += getVarint(&pCell[n], (u64 *)&pInfo->nKey);
954 : }else{
955 : u32 x;
956 551 : n += getVarint32(&pCell[n], &x);
957 551 : pInfo->nKey = x;
958 551 : nPayload += x;
959 : }
960 1544 : pInfo->nPayload = nPayload;
961 1544 : pInfo->nHeader = n;
962 1544 : if( nPayload<=pPage->maxLocal ){
963 : /* This is the (easy) common case where the entire payload fits
964 : ** on the local page. No overflow is required.
965 : */
966 : int nSize; /* Total size of cell content in bytes */
967 1544 : pInfo->nLocal = nPayload;
968 1544 : pInfo->iOverflow = 0;
969 1544 : nSize = nPayload + n;
970 1544 : if( nSize<4 ){
971 207 : nSize = 4; /* Minimum cell size is 4 */
972 : }
973 1544 : pInfo->nSize = nSize;
974 : }else{
975 : /* If the payload will not fit completely on the local page, we have
976 : ** to decide how much to store locally and how much to spill onto
977 : ** overflow pages. The strategy is to minimize the amount of unused
978 : ** space on overflow pages while keeping the amount of local storage
979 : ** in between minLocal and maxLocal.
980 : **
981 : ** Warning: changing the way overflow payload is distributed in any
982 : ** way will result in an incompatible file format.
983 : */
984 : int minLocal; /* Minimum amount of payload held locally */
985 : int maxLocal; /* Maximum amount of payload held locally */
986 : int surplus; /* Overflow payload available for local storage */
987 :
988 0 : minLocal = pPage->minLocal;
989 0 : maxLocal = pPage->maxLocal;
990 0 : surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize - 4);
991 0 : if( surplus <= maxLocal ){
992 0 : pInfo->nLocal = surplus;
993 : }else{
994 0 : pInfo->nLocal = minLocal;
995 : }
996 0 : pInfo->iOverflow = pInfo->nLocal + n;
997 0 : pInfo->nSize = pInfo->iOverflow + 4;
998 : }
999 1544 : }
1000 : static void parseCell(
1001 : MemPage *pPage, /* Page containing the cell */
1002 : int iCell, /* The cell index. First cell is 0 */
1003 : CellInfo *pInfo /* Fill in this structure */
1004 977 : ){
1005 977 : parseCellPtr(pPage, findCell(pPage, iCell), pInfo);
1006 977 : }
1007 :
1008 : /*
1009 : ** Compute the total number of bytes that a Cell needs in the cell
1010 : ** data area of the btree-page. The return number includes the cell
1011 : ** data header and the local payload, but not any overflow page or
1012 : ** the space used by the cell pointer.
1013 : */
1014 : #ifndef NDEBUG
1015 : static int cellSize(MemPage *pPage, int iCell){
1016 : CellInfo info;
1017 : parseCell(pPage, iCell, &info);
1018 : return info.nSize;
1019 : }
1020 : #endif
1021 63 : static int cellSizePtr(MemPage *pPage, u8 *pCell){
1022 : CellInfo info;
1023 63 : parseCellPtr(pPage, pCell, &info);
1024 63 : return info.nSize;
1025 : }
1026 :
1027 : #ifndef SQLITE_OMIT_AUTOVACUUM
1028 : /*
1029 : ** If the cell pCell, part of page pPage contains a pointer
1030 : ** to an overflow page, insert an entry into the pointer-map
1031 : ** for the overflow page.
1032 : */
1033 0 : static int ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell){
1034 0 : if( pCell ){
1035 : CellInfo info;
1036 0 : parseCellPtr(pPage, pCell, &info);
1037 : assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload );
1038 0 : if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){
1039 0 : Pgno ovfl = get4byte(&pCell[info.iOverflow]);
1040 0 : return ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno);
1041 : }
1042 : }
1043 0 : return SQLITE_OK;
1044 : }
1045 : /*
1046 : ** If the cell with index iCell on page pPage contains a pointer
1047 : ** to an overflow page, insert an entry into the pointer-map
1048 : ** for the overflow page.
1049 : */
1050 0 : static int ptrmapPutOvfl(MemPage *pPage, int iCell){
1051 : u8 *pCell;
1052 0 : pCell = findOverflowCell(pPage, iCell);
1053 0 : return ptrmapPutOvflPtr(pPage, pCell);
1054 : }
1055 : #endif
1056 :
1057 :
1058 : /* A bunch of assert() statements to check the transaction state variables
1059 : ** of handle p (type Btree*) are internally consistent.
1060 : */
1061 : #define btreeIntegrity(p) \
1062 : assert( p->inTrans!=TRANS_NONE || p->pBt->nTransaction<p->pBt->nRef ); \
1063 : assert( p->pBt->nTransaction<=p->pBt->nRef ); \
1064 : assert( p->pBt->inTransaction!=TRANS_NONE || p->pBt->nTransaction==0 ); \
1065 : assert( p->pBt->inTransaction>=p->inTrans );
1066 :
1067 : /*
1068 : ** Defragment the page given. All Cells are moved to the
1069 : ** end of the page and all free space is collected into one
1070 : ** big FreeBlk that occurs in between the header and cell
1071 : ** pointer array and the cell content area.
1072 : */
1073 0 : static int defragmentPage(MemPage *pPage){
1074 : int i; /* Loop counter */
1075 : int pc; /* Address of a i-th cell */
1076 : int addr; /* Offset of first byte after cell pointer array */
1077 : int hdr; /* Offset to the page header */
1078 : int size; /* Size of a cell */
1079 : int usableSize; /* Number of usable bytes on a page */
1080 : int cellOffset; /* Offset to the cell pointer array */
1081 : int brk; /* Offset to the cell content area */
1082 : int nCell; /* Number of cells on the page */
1083 : unsigned char *data; /* The page data */
1084 : unsigned char *temp; /* Temp area for cell content */
1085 :
1086 : assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1087 : assert( pPage->pBt!=0 );
1088 : assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
1089 : assert( pPage->nOverflow==0 );
1090 0 : temp = sqliteMalloc( pPage->pBt->pageSize );
1091 0 : if( temp==0 ) return SQLITE_NOMEM;
1092 0 : data = pPage->aData;
1093 0 : hdr = pPage->hdrOffset;
1094 0 : cellOffset = pPage->cellOffset;
1095 0 : nCell = pPage->nCell;
1096 : assert( nCell==get2byte(&data[hdr+3]) );
1097 0 : usableSize = pPage->pBt->usableSize;
1098 0 : brk = get2byte(&data[hdr+5]);
1099 0 : memcpy(&temp[brk], &data[brk], usableSize - brk);
1100 0 : brk = usableSize;
1101 0 : for(i=0; i<nCell; i++){
1102 : u8 *pAddr; /* The i-th cell pointer */
1103 0 : pAddr = &data[cellOffset + i*2];
1104 0 : pc = get2byte(pAddr);
1105 : assert( pc<pPage->pBt->usableSize );
1106 0 : size = cellSizePtr(pPage, &temp[pc]);
1107 0 : brk -= size;
1108 0 : memcpy(&data[brk], &temp[pc], size);
1109 0 : put2byte(pAddr, brk);
1110 : }
1111 : assert( brk>=cellOffset+2*nCell );
1112 0 : put2byte(&data[hdr+5], brk);
1113 0 : data[hdr+1] = 0;
1114 0 : data[hdr+2] = 0;
1115 0 : data[hdr+7] = 0;
1116 0 : addr = cellOffset+2*nCell;
1117 0 : memset(&data[addr], 0, brk-addr);
1118 0 : sqliteFree(temp);
1119 0 : return SQLITE_OK;
1120 : }
1121 :
1122 : /*
1123 : ** Allocate nByte bytes of space on a page.
1124 : **
1125 : ** Return the index into pPage->aData[] of the first byte of
1126 : ** the new allocation. Or return 0 if there is not enough free
1127 : ** space on the page to satisfy the allocation request.
1128 : **
1129 : ** If the page contains nBytes of free space but does not contain
1130 : ** nBytes of contiguous free space, then this routine automatically
1131 : ** calls defragementPage() to consolidate all free space before
1132 : ** allocating the new chunk.
1133 : */
1134 422 : static int allocateSpace(MemPage *pPage, int nByte){
1135 : int addr, pc, hdr;
1136 : int size;
1137 : int nFrag;
1138 : int top;
1139 : int nCell;
1140 : int cellOffset;
1141 : unsigned char *data;
1142 :
1143 422 : data = pPage->aData;
1144 : assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1145 : assert( pPage->pBt );
1146 422 : if( nByte<4 ) nByte = 4;
1147 422 : if( pPage->nFree<nByte || pPage->nOverflow>0 ) return 0;
1148 422 : pPage->nFree -= nByte;
1149 422 : hdr = pPage->hdrOffset;
1150 :
1151 422 : nFrag = data[hdr+7];
1152 422 : if( nFrag<60 ){
1153 : /* Search the freelist looking for a slot big enough to satisfy the
1154 : ** space request. */
1155 422 : addr = hdr+1;
1156 880 : while( (pc = get2byte(&data[addr]))>0 ){
1157 39 : size = get2byte(&data[pc+2]);
1158 39 : if( size>=nByte ){
1159 3 : if( size<nByte+4 ){
1160 3 : memcpy(&data[addr], &data[pc], 2);
1161 3 : data[hdr+7] = nFrag + size - nByte;
1162 3 : return pc;
1163 : }else{
1164 0 : put2byte(&data[pc+2], size-nByte);
1165 0 : return pc + size - nByte;
1166 : }
1167 : }
1168 36 : addr = pc;
1169 : }
1170 : }
1171 :
1172 : /* Allocate memory from the gap in between the cell pointer array
1173 : ** and the cell content area.
1174 : */
1175 419 : top = get2byte(&data[hdr+5]);
1176 419 : nCell = get2byte(&data[hdr+3]);
1177 419 : cellOffset = pPage->cellOffset;
1178 419 : if( nFrag>=60 || cellOffset + 2*nCell > top - nByte ){
1179 0 : if( defragmentPage(pPage) ) return 0;
1180 0 : top = get2byte(&data[hdr+5]);
1181 : }
1182 419 : top -= nByte;
1183 : assert( cellOffset + 2*nCell <= top );
1184 419 : put2byte(&data[hdr+5], top);
1185 419 : return top;
1186 : }
1187 :
1188 : /*
1189 : ** Return a section of the pPage->aData to the freelist.
1190 : ** The first byte of the new free block is pPage->aDisk[start]
1191 : ** and the size of the block is "size" bytes.
1192 : **
1193 : ** Most of the effort here is involved in coalesing adjacent
1194 : ** free blocks into a single big free block.
1195 : */
1196 63 : static void freeSpace(MemPage *pPage, int start, int size){
1197 : int addr, pbegin, hdr;
1198 63 : unsigned char *data = pPage->aData;
1199 :
1200 : assert( pPage->pBt!=0 );
1201 : assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1202 : assert( start>=pPage->hdrOffset+6+(pPage->leaf?0:4) );
1203 : assert( (start + size)<=pPage->pBt->usableSize );
1204 63 : if( size<4 ) size = 4;
1205 :
1206 : #ifdef SQLITE_SECURE_DELETE
1207 : /* Overwrite deleted information with zeros when the SECURE_DELETE
1208 : ** option is enabled at compile-time */
1209 : memset(&data[start], 0, size);
1210 : #endif
1211 :
1212 : /* Add the space back into the linked list of freeblocks */
1213 63 : hdr = pPage->hdrOffset;
1214 63 : addr = hdr + 1;
1215 126 : while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){
1216 : assert( pbegin<=pPage->pBt->usableSize-4 );
1217 : assert( pbegin>addr );
1218 0 : addr = pbegin;
1219 : }
1220 : assert( pbegin<=pPage->pBt->usableSize-4 );
1221 : assert( pbegin>addr || pbegin==0 );
1222 63 : put2byte(&data[addr], start);
1223 63 : put2byte(&data[start], pbegin);
1224 63 : put2byte(&data[start+2], size);
1225 63 : pPage->nFree += size;
1226 :
1227 : /* Coalesce adjacent free blocks */
1228 63 : addr = pPage->hdrOffset + 1;
1229 189 : while( (pbegin = get2byte(&data[addr]))>0 ){
1230 : int pnext, psize;
1231 : assert( pbegin>addr );
1232 : assert( pbegin<=pPage->pBt->usableSize-4 );
1233 63 : pnext = get2byte(&data[pbegin]);
1234 63 : psize = get2byte(&data[pbegin+2]);
1235 63 : if( pbegin + psize + 3 >= pnext && pnext>0 ){
1236 0 : int frag = pnext - (pbegin+psize);
1237 : assert( frag<=data[pPage->hdrOffset+7] );
1238 0 : data[pPage->hdrOffset+7] -= frag;
1239 0 : put2byte(&data[pbegin], get2byte(&data[pnext]));
1240 0 : put2byte(&data[pbegin+2], pnext+get2byte(&data[pnext+2])-pbegin);
1241 : }else{
1242 63 : addr = pbegin;
1243 : }
1244 : }
1245 :
1246 : /* If the cell content area begins with a freeblock, remove it. */
1247 63 : if( data[hdr+1]==data[hdr+5] && data[hdr+2]==data[hdr+6] ){
1248 : int top;
1249 27 : pbegin = get2byte(&data[hdr+1]);
1250 27 : memcpy(&data[hdr+1], &data[pbegin], 2);
1251 27 : top = get2byte(&data[hdr+5]);
1252 27 : put2byte(&data[hdr+5], top + get2byte(&data[pbegin+2]));
1253 : }
1254 63 : }
1255 :
1256 : /*
1257 : ** Decode the flags byte (the first byte of the header) for a page
1258 : ** and initialize fields of the MemPage structure accordingly.
1259 : */
1260 618 : static void decodeFlags(MemPage *pPage, int flagByte){
1261 : BtShared *pBt; /* A copy of pPage->pBt */
1262 :
1263 : assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
1264 618 : pPage->intKey = (flagByte & (PTF_INTKEY|PTF_LEAFDATA))!=0;
1265 618 : pPage->zeroData = (flagByte & PTF_ZERODATA)!=0;
1266 618 : pPage->leaf = (flagByte & PTF_LEAF)!=0;
1267 618 : pPage->childPtrSize = 4*(pPage->leaf==0);
1268 618 : pBt = pPage->pBt;
1269 618 : if( flagByte & PTF_LEAFDATA ){
1270 420 : pPage->leafData = 1;
1271 420 : pPage->maxLocal = pBt->maxLeaf;
1272 420 : pPage->minLocal = pBt->minLeaf;
1273 : }else{
1274 198 : pPage->leafData = 0;
1275 198 : pPage->maxLocal = pBt->maxLocal;
1276 198 : pPage->minLocal = pBt->minLocal;
1277 : }
1278 618 : pPage->hasData = !(pPage->zeroData || (!pPage->leaf && pPage->leafData));
1279 618 : }
1280 :
1281 : /*
1282 : ** Initialize the auxiliary information for a disk block.
1283 : **
1284 : ** The pParent parameter must be a pointer to the MemPage which
1285 : ** is the parent of the page being initialized. The root of a
1286 : ** BTree has no parent and so for that page, pParent==NULL.
1287 : **
1288 : ** Return SQLITE_OK on success. If we see that the page does
1289 : ** not contain a well-formed database page, then return
1290 : ** SQLITE_CORRUPT. Note that a return of SQLITE_OK does not
1291 : ** guarantee that the page is well-formed. It only shows that
1292 : ** we failed to detect any corruption.
1293 : */
1294 : static int initPage(
1295 : MemPage *pPage, /* The page to be initialized */
1296 : MemPage *pParent /* The parent. Might be NULL */
1297 464 : ){
1298 : int pc; /* Address of a freeblock within pPage->aData[] */
1299 : int hdr; /* Offset to beginning of page header */
1300 : u8 *data; /* Equal to pPage->aData */
1301 : BtShared *pBt; /* The main btree structure */
1302 : int usableSize; /* Amount of usable space on each page */
1303 : int cellOffset; /* Offset from start of page to first cell pointer */
1304 : int nFree; /* Number of unused bytes on the page */
1305 : int top; /* First byte of the cell content area */
1306 :
1307 464 : pBt = pPage->pBt;
1308 : assert( pBt!=0 );
1309 : assert( pParent==0 || pParent->pBt==pBt );
1310 : assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
1311 : assert( pPage->aData == &((unsigned char*)pPage)[-pBt->pageSize] );
1312 464 : if( pPage->pParent!=pParent && (pPage->pParent!=0 || pPage->isInit) ){
1313 : /* The parent page should never change unless the file is corrupt */
1314 0 : return SQLITE_CORRUPT_BKPT;
1315 : }
1316 464 : if( pPage->isInit ) return SQLITE_OK;
1317 464 : if( pPage->pParent==0 && pParent!=0 ){
1318 0 : pPage->pParent = pParent;
1319 0 : sqlite3PagerRef(pParent->pDbPage);
1320 : }
1321 464 : hdr = pPage->hdrOffset;
1322 464 : data = pPage->aData;
1323 464 : decodeFlags(pPage, data[hdr]);
1324 464 : pPage->nOverflow = 0;
1325 464 : pPage->idxShift = 0;
1326 464 : usableSize = pBt->usableSize;
1327 464 : pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf;
1328 464 : top = get2byte(&data[hdr+5]);
1329 464 : pPage->nCell = get2byte(&data[hdr+3]);
1330 464 : if( pPage->nCell>MX_CELL(pBt) ){
1331 : /* To many cells for a single page. The page must be corrupt */
1332 0 : return SQLITE_CORRUPT_BKPT;
1333 : }
1334 464 : if( pPage->nCell==0 && pParent!=0 && pParent->pgno!=1 ){
1335 : /* All pages must have at least one cell, except for root pages */
1336 0 : return SQLITE_CORRUPT_BKPT;
1337 : }
1338 :
1339 : /* Compute the total free space on the page */
1340 464 : pc = get2byte(&data[hdr+1]);
1341 464 : nFree = data[hdr+7] + top - (cellOffset + 2*pPage->nCell);
1342 931 : while( pc>0 ){
1343 : int next, size;
1344 3 : if( pc>usableSize-4 ){
1345 : /* Free block is off the page */
1346 0 : return SQLITE_CORRUPT_BKPT;
1347 : }
1348 3 : next = get2byte(&data[pc]);
1349 3 : size = get2byte(&data[pc+2]);
1350 3 : if( next>0 && next<=pc+size+3 ){
1351 : /* Free blocks must be in accending order */
1352 0 : return SQLITE_CORRUPT_BKPT;
1353 : }
1354 3 : nFree += size;
1355 3 : pc = next;
1356 : }
1357 464 : pPage->nFree = nFree;
1358 464 : if( nFree>=usableSize ){
1359 : /* Free space cannot exceed total page size */
1360 0 : return SQLITE_CORRUPT_BKPT;
1361 : }
1362 :
1363 464 : pPage->isInit = 1;
1364 464 : return SQLITE_OK;
1365 : }
1366 :
1367 : /*
1368 : ** Set up a raw page so that it looks like a database page holding
1369 : ** no entries.
1370 : */
1371 154 : static void zeroPage(MemPage *pPage, int flags){
1372 154 : unsigned char *data = pPage->aData;
1373 154 : BtShared *pBt = pPage->pBt;
1374 154 : int hdr = pPage->hdrOffset;
1375 : int first;
1376 :
1377 : assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno );
1378 : assert( &data[pBt->pageSize] == (unsigned char*)pPage );
1379 : assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1380 154 : memset(&data[hdr], 0, pBt->usableSize - hdr);
1381 154 : data[hdr] = flags;
1382 154 : first = hdr + 8 + 4*((flags&PTF_LEAF)==0);
1383 154 : memset(&data[hdr+1], 0, 4);
1384 154 : data[hdr+7] = 0;
1385 154 : put2byte(&data[hdr+5], pBt->usableSize);
1386 154 : pPage->nFree = pBt->usableSize - first;
1387 154 : decodeFlags(pPage, flags);
1388 154 : pPage->hdrOffset = hdr;
1389 154 : pPage->cellOffset = first;
1390 154 : pPage->nOverflow = 0;
1391 154 : pPage->idxShift = 0;
1392 154 : pPage->nCell = 0;
1393 154 : pPage->isInit = 1;
1394 154 : }
1395 :
1396 : /*
1397 : ** Get a page from the pager. Initialize the MemPage.pBt and
1398 : ** MemPage.aData elements if needed.
1399 : **
1400 : ** If the noContent flag is set, it means that we do not care about
1401 : ** the content of the page at this time. So do not go to the disk
1402 : ** to fetch the content. Just fill in the content with zeros for now.
1403 : ** If in the future we call sqlite3PagerWrite() on this page, that
1404 : ** means we have started to be concerned about content and the disk
1405 : ** read should occur at that point.
1406 : */
1407 1599 : static int getPage(BtShared *pBt, Pgno pgno, MemPage **ppPage, int noContent){
1408 : int rc;
1409 : MemPage *pPage;
1410 : DbPage *pDbPage;
1411 :
1412 1599 : rc = sqlite3PagerAcquire(pBt->pPager, pgno, (DbPage**)&pDbPage, noContent);
1413 1599 : if( rc ) return rc;
1414 1599 : pPage = (MemPage *)sqlite3PagerGetExtra(pDbPage);
1415 1599 : pPage->aData = sqlite3PagerGetData(pDbPage);
1416 1599 : pPage->pDbPage = pDbPage;
1417 1599 : pPage->pBt = pBt;
1418 1599 : pPage->pgno = pgno;
1419 1599 : pPage->hdrOffset = pPage->pgno==1 ? 100 : 0;
1420 1599 : *ppPage = pPage;
1421 1599 : return SQLITE_OK;
1422 : }
1423 :
1424 : /*
1425 : ** Get a page from the pager and initialize it. This routine
1426 : ** is just a convenience wrapper around separate calls to
1427 : ** getPage() and initPage().
1428 : */
1429 : static int getAndInitPage(
1430 : BtShared *pBt, /* The database file */
1431 : Pgno pgno, /* Number of the page to get */
1432 : MemPage **ppPage, /* Write the page pointer here */
1433 : MemPage *pParent /* Parent of the page */
1434 742 : ){
1435 : int rc;
1436 742 : if( pgno==0 ){
1437 0 : return SQLITE_CORRUPT_BKPT;
1438 : }
1439 742 : rc = getPage(pBt, pgno, ppPage, 0);
1440 742 : if( rc==SQLITE_OK && (*ppPage)->isInit==0 ){
1441 464 : rc = initPage(*ppPage, pParent);
1442 : }
1443 742 : return rc;
1444 : }
1445 :
1446 : /*
1447 : ** Release a MemPage. This should be called once for each prior
1448 : ** call to getPage.
1449 : */
1450 2535 : static void releasePage(MemPage *pPage){
1451 2535 : if( pPage ){
1452 : assert( pPage->aData );
1453 : assert( pPage->pBt );
1454 : assert( &pPage->aData[pPage->pBt->pageSize]==(unsigned char*)pPage );
1455 1501 : sqlite3PagerUnref(pPage->pDbPage);
1456 : }
1457 2535 : }
1458 :
1459 : /*
1460 : ** This routine is called when the reference count for a page
1461 : ** reaches zero. We need to unref the pParent pointer when that
1462 : ** happens.
1463 : */
1464 1298 : static void pageDestructor(DbPage *pData, int pageSize){
1465 : MemPage *pPage;
1466 : assert( (pageSize & 7)==0 );
1467 1298 : pPage = (MemPage *)sqlite3PagerGetExtra(pData);
1468 1298 : if( pPage->pParent ){
1469 0 : MemPage *pParent = pPage->pParent;
1470 0 : pPage->pParent = 0;
1471 0 : releasePage(pParent);
1472 : }
1473 1298 : pPage->isInit = 0;
1474 1298 : }
1475 :
1476 : /*
1477 : ** During a rollback, when the pager reloads information into the cache
1478 : ** so that the cache is restored to its original state at the start of
1479 : ** the transaction, for each page restored this routine is called.
1480 : **
1481 : ** This routine needs to reset the extra data section at the end of the
1482 : ** page to agree with the restored data.
1483 : */
1484 3 : static void pageReinit(DbPage *pData, int pageSize){
1485 : MemPage *pPage;
1486 : assert( (pageSize & 7)==0 );
1487 3 : pPage = (MemPage *)sqlite3PagerGetExtra(pData);
1488 3 : if( pPage->isInit ){
1489 0 : pPage->isInit = 0;
1490 0 : initPage(pPage, pPage->pParent);
1491 : }
1492 3 : }
1493 :
1494 : /*
1495 : ** Open a database file.
1496 : **
1497 : ** zFilename is the name of the database file. If zFilename is NULL
1498 : ** a new database with a random name is created. This randomly named
1499 : ** database file will be deleted when sqlite3BtreeClose() is called.
1500 : */
1501 : int sqlite3BtreeOpen(
1502 : const char *zFilename, /* Name of the file containing the BTree database */
1503 : sqlite3 *pSqlite, /* Associated database handle */
1504 : Btree **ppBtree, /* Pointer to new Btree object written here */
1505 : int flags /* Options */
1506 130 : ){
1507 : BtShared *pBt; /* Shared part of btree structure */
1508 : Btree *p; /* Handle to return */
1509 : int rc;
1510 : int nReserve;
1511 : unsigned char zDbHeader[100];
1512 : #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1513 : const ThreadData *pTsdro;
1514 : #endif
1515 :
1516 : /* Set the variable isMemdb to true for an in-memory database, or
1517 : ** false for a file-based database. This symbol is only required if
1518 : ** either of the shared-data or autovacuum features are compiled
1519 : ** into the library.
1520 : */
1521 : #if !defined(SQLITE_OMIT_SHARED_CACHE) || !defined(SQLITE_OMIT_AUTOVACUUM)
1522 : #ifdef SQLITE_OMIT_MEMORYDB
1523 : const int isMemdb = 0;
1524 : #else
1525 130 : const int isMemdb = zFilename && !strcmp(zFilename, ":memory:");
1526 : #endif
1527 : #endif
1528 :
1529 130 : p = sqliteMalloc(sizeof(Btree));
1530 130 : if( !p ){
1531 0 : return SQLITE_NOMEM;
1532 : }
1533 130 : p->inTrans = TRANS_NONE;
1534 130 : p->pSqlite = pSqlite;
1535 :
1536 : /* Try to find an existing Btree structure opened on zFilename. */
1537 : #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1538 130 : pTsdro = sqlite3ThreadDataReadOnly();
1539 130 : if( pTsdro->useSharedData && zFilename && !isMemdb ){
1540 0 : char *zFullPathname = sqlite3OsFullPathname(zFilename);
1541 0 : if( !zFullPathname ){
1542 0 : sqliteFree(p);
1543 0 : return SQLITE_NOMEM;
1544 : }
1545 0 : for(pBt=pTsdro->pBtree; pBt; pBt=pBt->pNext){
1546 : assert( pBt->nRef>0 );
1547 0 : if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager)) ){
1548 0 : p->pBt = pBt;
1549 0 : *ppBtree = p;
1550 0 : pBt->nRef++;
1551 0 : sqliteFree(zFullPathname);
1552 0 : return SQLITE_OK;
1553 : }
1554 : }
1555 0 : sqliteFree(zFullPathname);
1556 : }
1557 : #endif
1558 :
1559 : /*
1560 : ** The following asserts make sure that structures used by the btree are
1561 : ** the right size. This is to guard against size changes that result
1562 : ** when compiling on a different architecture.
1563 : */
1564 : assert( sizeof(i64)==8 || sizeof(i64)==4 );
1565 : assert( sizeof(u64)==8 || sizeof(u64)==4 );
1566 : assert( sizeof(u32)==4 );
1567 : assert( sizeof(u16)==2 );
1568 : assert( sizeof(Pgno)==4 );
1569 :
1570 130 : pBt = sqliteMalloc( sizeof(*pBt) );
1571 130 : if( pBt==0 ){
1572 0 : *ppBtree = 0;
1573 0 : sqliteFree(p);
1574 0 : return SQLITE_NOMEM;
1575 : }
1576 130 : rc = sqlite3PagerOpen(&pBt->pPager, zFilename, EXTRA_SIZE, flags);
1577 130 : if( rc==SQLITE_OK ){
1578 130 : rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);
1579 : }
1580 130 : if( rc!=SQLITE_OK ){
1581 0 : if( pBt->pPager ){
1582 0 : sqlite3PagerClose(pBt->pPager);
1583 : }
1584 0 : sqliteFree(pBt);
1585 0 : sqliteFree(p);
1586 0 : *ppBtree = 0;
1587 0 : return rc;
1588 : }
1589 130 : p->pBt = pBt;
1590 :
1591 130 : sqlite3PagerSetDestructor(pBt->pPager, pageDestructor);
1592 130 : sqlite3PagerSetReiniter(pBt->pPager, pageReinit);
1593 130 : pBt->pCursor = 0;
1594 130 : pBt->pPage1 = 0;
1595 130 : pBt->readOnly = sqlite3PagerIsreadonly(pBt->pPager);
1596 130 : pBt->pageSize = get2byte(&zDbHeader[16]);
1597 253 : if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
1598 : || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
1599 123 : pBt->pageSize = SQLITE_DEFAULT_PAGE_SIZE;
1600 123 : pBt->maxEmbedFrac = 64; /* 25% */
1601 123 : pBt->minEmbedFrac = 32; /* 12.5% */
1602 123 : pBt->minLeafFrac = 32; /* 12.5% */
1603 : #ifndef SQLITE_OMIT_AUTOVACUUM
1604 : /* If the magic name ":memory:" will create an in-memory database, then
1605 : ** do not set the auto-vacuum flag, even if SQLITE_DEFAULT_AUTOVACUUM
1606 : ** is true. On the other hand, if SQLITE_OMIT_MEMORYDB has been defined,
1607 : ** then ":memory:" is just a regular file-name. Respect the auto-vacuum
1608 : ** default in this case.
1609 : */
1610 123 : if( zFilename && !isMemdb ){
1611 6 : pBt->autoVacuum = SQLITE_DEFAULT_AUTOVACUUM;
1612 : }
1613 : #endif
1614 123 : nReserve = 0;
1615 : }else{
1616 7 : nReserve = zDbHeader[20];
1617 7 : pBt->maxEmbedFrac = zDbHeader[21];
1618 7 : pBt->minEmbedFrac = zDbHeader[22];
1619 7 : pBt->minLeafFrac = zDbHeader[23];
1620 7 : pBt->pageSizeFixed = 1;
1621 : #ifndef SQLITE_OMIT_AUTOVACUUM
1622 7 : pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
1623 : #endif
1624 : }
1625 130 : pBt->usableSize = pBt->pageSize - nReserve;
1626 : assert( (pBt->pageSize & 7)==0 ); /* 8-byte alignment of pageSize */
1627 130 : sqlite3PagerSetPagesize(pBt->pPager, pBt->pageSize);
1628 :
1629 : #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1630 : /* Add the new btree to the linked list starting at ThreadData.pBtree.
1631 : ** There is no chance that a malloc() may fail inside of the
1632 : ** sqlite3ThreadData() call, as the ThreadData structure must have already
1633 : ** been allocated for pTsdro->useSharedData to be non-zero.
1634 : */
1635 130 : if( pTsdro->useSharedData && zFilename && !isMemdb ){
1636 0 : pBt->pNext = pTsdro->pBtree;
1637 0 : sqlite3ThreadData()->pBtree = pBt;
1638 : }
1639 : #endif
1640 130 : pBt->nRef = 1;
1641 130 : *ppBtree = p;
1642 130 : return SQLITE_OK;
1643 : }
1644 :
1645 : /*
1646 : ** Close an open database and invalidate all cursors.
1647 : */
1648 129 : int sqlite3BtreeClose(Btree *p){
1649 129 : BtShared *pBt = p->pBt;
1650 : BtCursor *pCur;
1651 :
1652 : #ifndef SQLITE_OMIT_SHARED_CACHE
1653 : ThreadData *pTsd;
1654 : #endif
1655 :
1656 : /* Close all cursors opened via this handle. */
1657 129 : pCur = pBt->pCursor;
1658 258 : while( pCur ){
1659 0 : BtCursor *pTmp = pCur;
1660 0 : pCur = pCur->pNext;
1661 0 : if( pTmp->pBtree==p ){
1662 0 : sqlite3BtreeCloseCursor(pTmp);
1663 : }
1664 : }
1665 :
1666 : /* Rollback any active transaction and free the handle structure.
1667 : ** The call to sqlite3BtreeRollback() drops any table-locks held by
1668 : ** this handle.
1669 : */
1670 129 : sqlite3BtreeRollback(p);
1671 129 : sqliteFree(p);
1672 :
1673 : #ifndef SQLITE_OMIT_SHARED_CACHE
1674 : /* If there are still other outstanding references to the shared-btree
1675 : ** structure, return now. The remainder of this procedure cleans
1676 : ** up the shared-btree.
1677 : */
1678 : assert( pBt->nRef>0 );
1679 129 : pBt->nRef--;
1680 129 : if( pBt->nRef ){
1681 0 : return SQLITE_OK;
1682 : }
1683 :
1684 : /* Remove the shared-btree from the thread wide list. Call
1685 : ** ThreadDataReadOnly() and then cast away the const property of the
1686 : ** pointer to avoid allocating thread data if it is not really required.
1687 : */
1688 129 : pTsd = (ThreadData *)sqlite3ThreadDataReadOnly();
1689 129 : if( pTsd->pBtree==pBt ){
1690 : assert( pTsd==sqlite3ThreadData() );
1691 0 : pTsd->pBtree = pBt->pNext;
1692 : }else{
1693 : BtShared *pPrev;
1694 129 : for(pPrev=pTsd->pBtree; pPrev && pPrev->pNext!=pBt; pPrev=pPrev->pNext){}
1695 129 : if( pPrev ){
1696 : assert( pTsd==sqlite3ThreadData() );
1697 0 : pPrev->pNext = pBt->pNext;
1698 : }
1699 : }
1700 : #endif
1701 :
1702 : /* Close the pager and free the shared-btree structure */
1703 : assert( !pBt->pCursor );
1704 129 : sqlite3PagerClose(pBt->pPager);
1705 129 : if( pBt->xFreeSchema && pBt->pSchema ){
1706 129 : pBt->xFreeSchema(pBt->pSchema);
1707 : }
1708 129 : sqliteFree(pBt->pSchema);
1709 129 : sqliteFree(pBt);
1710 129 : return SQLITE_OK;
1711 : }
1712 :
1713 : /*
1714 : ** Change the busy handler callback function.
1715 : */
1716 130 : int sqlite3BtreeSetBusyHandler(Btree *p, BusyHandler *pHandler){
1717 130 : BtShared *pBt = p->pBt;
1718 130 : pBt->pBusyHandler = pHandler;
1719 130 : sqlite3PagerSetBusyhandler(pBt->pPager, pHandler);
1720 130 : return SQLITE_OK;
1721 : }
1722 :
1723 : /*
1724 : ** Change the limit on the number of pages allowed in the cache.
1725 : **
1726 : ** The maximum number of cache pages is set to the absolute
1727 : ** value of mxPage. If mxPage is negative, the pager will
1728 : ** operate asynchronously - it will not stop to do fsync()s
1729 : ** to insure data is written to the disk surface before
1730 : ** continuing. Transactions still work if synchronous is off,
1731 : ** and the database cannot be corrupted if this program
1732 : ** crashes. But if the operating system crashes or there is
1733 : ** an abrupt power failure when synchronous is off, the database
1734 : ** could be left in an inconsistent and unrecoverable state.
1735 : ** Synchronous is on by default so database corruption is not
1736 : ** normally a worry.
1737 : */
1738 238 : int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){
1739 238 : BtShared *pBt = p->pBt;
1740 238 : sqlite3PagerSetCachesize(pBt->pPager, mxPage);
1741 238 : return SQLITE_OK;
1742 : }
1743 :
1744 : /*
1745 : ** Change the way data is synced to disk in order to increase or decrease
1746 : ** how well the database resists damage due to OS crashes and power
1747 : ** failures. Level 1 is the same as asynchronous (no syncs() occur and
1748 : ** there is a high probability of damage) Level 2 is the default. There
1749 : ** is a very low but non-zero probability of damage. Level 3 reduces the
1750 : ** probability of damage to near zero but with a write performance reduction.
1751 : */
1752 : #ifndef SQLITE_OMIT_PAGER_PRAGMAS
1753 0 : int sqlite3BtreeSetSafetyLevel(Btree *p, int level, int fullSync){
1754 0 : BtShared *pBt = p->pBt;
1755 0 : sqlite3PagerSetSafetyLevel(pBt->pPager, level, fullSync);
1756 0 : return SQLITE_OK;
1757 : }
1758 : #endif
1759 :
1760 : /*
1761 : ** Return TRUE if the given btree is set to safety level 1. In other
1762 : ** words, return TRUE if no sync() occurs on the disk files.
1763 : */
1764 0 : int sqlite3BtreeSyncDisabled(Btree *p){
1765 0 : BtShared *pBt = p->pBt;
1766 : assert( pBt && pBt->pPager );
1767 0 : return sqlite3PagerNosync(pBt->pPager);
1768 : }
1769 :
1770 : #if !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM)
1771 : /*
1772 : ** Change the default pages size and the number of reserved bytes per page.
1773 : **
1774 : ** The page size must be a power of 2 between 512 and 65536. If the page
1775 : ** size supplied does not meet this constraint then the page size is not
1776 : ** changed.
1777 : **
1778 : ** Page sizes are constrained to be a power of two so that the region
1779 : ** of the database file used for locking (beginning at PENDING_BYTE,
1780 : ** the first byte past the 1GB boundary, 0x40000000) needs to occur
1781 : ** at the beginning of a page.
1782 : **
1783 : ** If parameter nReserve is less than zero, then the number of reserved
1784 : ** bytes per page is left unchanged.
1785 : */
1786 0 : int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve){
1787 0 : BtShared *pBt = p->pBt;
1788 0 : if( pBt->pageSizeFixed ){
1789 0 : return SQLITE_READONLY;
1790 : }
1791 0 : if( nReserve<0 ){
1792 0 : nReserve = pBt->pageSize - pBt->usableSize;
1793 : }
1794 0 : if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&
1795 : ((pageSize-1)&pageSize)==0 ){
1796 : assert( (pageSize & 7)==0 );
1797 : assert( !pBt->pPage1 && !pBt->pCursor );
1798 0 : pBt->pageSize = sqlite3PagerSetPagesize(pBt->pPager, pageSize);
1799 : }
1800 0 : pBt->usableSize = pBt->pageSize - nReserve;
1801 0 : return SQLITE_OK;
1802 : }
1803 :
1804 : /*
1805 : ** Return the currently defined page size
1806 : */
1807 0 : int sqlite3BtreeGetPageSize(Btree *p){
1808 0 : return p->pBt->pageSize;
1809 : }
1810 0 : int sqlite3BtreeGetReserve(Btree *p){
1811 0 : return p->pBt->pageSize - p->pBt->usableSize;
1812 : }
1813 : #endif /* !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) */
1814 :
1815 : /*
1816 : ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'
1817 : ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it
1818 : ** is disabled. The default value for the auto-vacuum property is
1819 : ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.
1820 : */
1821 0 : int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){
1822 0 : BtShared *pBt = p->pBt;;
1823 : #ifdef SQLITE_OMIT_AUTOVACUUM
1824 : return SQLITE_READONLY;
1825 : #else
1826 0 : if( pBt->pageSizeFixed ){
1827 0 : return SQLITE_READONLY;
1828 : }
1829 0 : pBt->autoVacuum = (autoVacuum?1:0);
1830 0 : return SQLITE_OK;
1831 : #endif
1832 : }
1833 :
1834 : /*
1835 : ** Return the value of the 'auto-vacuum' property. If auto-vacuum is
1836 : ** enabled 1 is returned. Otherwise 0.
1837 : */
1838 0 : int sqlite3BtreeGetAutoVacuum(Btree *p){
1839 : #ifdef SQLITE_OMIT_AUTOVACUUM
1840 : return 0;
1841 : #else
1842 0 : return p->pBt->autoVacuum;
1843 : #endif
1844 : }
1845 :
1846 :
1847 : /*
1848 : ** Get a reference to pPage1 of the database file. This will
1849 : ** also acquire a readlock on that file.
1850 : **
1851 : ** SQLITE_OK is returned on success. If the file is not a
1852 : ** well-formed database file, then SQLITE_CORRUPT is returned.
1853 : ** SQLITE_BUSY is returned if the database is locked. SQLITE_NOMEM
1854 : ** is returned if we run out of memory.
1855 : */
1856 755 : static int lockBtree(BtShared *pBt){
1857 : int rc, pageSize;
1858 : MemPage *pPage1;
1859 755 : if( pBt->pPage1 ) return SQLITE_OK;
1860 755 : rc = getPage(pBt, 1, &pPage1, 0);
1861 755 : if( rc!=SQLITE_OK ) return rc;
1862 :
1863 :
1864 : /* Do some checking to help insure the file we opened really is
1865 : ** a valid database file.
1866 : */
1867 755 : rc = SQLITE_NOTADB;
1868 755 : if( sqlite3PagerPagecount(pBt->pPager)>0 ){
1869 292 : u8 *page1 = pPage1->aData;
1870 292 : if( memcmp(page1, zMagicHeader, 16)!=0 ){
1871 0 : goto page1_init_failed;
1872 : }
1873 292 : if( page1[18]>1 ){
1874 0 : pBt->readOnly = 1;
1875 : }
1876 292 : if( page1[19]>1 ){
1877 0 : goto page1_init_failed;
1878 : }
1879 292 : pageSize = get2byte(&page1[16]);
1880 292 : if( ((pageSize-1)&pageSize)!=0 || pageSize<512 ){
1881 : goto page1_init_failed;
1882 : }
1883 : assert( (pageSize & 7)==0 );
1884 292 : pBt->pageSize = pageSize;
1885 292 : pBt->usableSize = pageSize - page1[20];
1886 292 : if( pBt->usableSize<500 ){
1887 0 : goto page1_init_failed;
1888 : }
1889 292 : pBt->maxEmbedFrac = page1[21];
1890 292 : pBt->minEmbedFrac = page1[22];
1891 292 : pBt->minLeafFrac = page1[23];
1892 : #ifndef SQLITE_OMIT_AUTOVACUUM
1893 292 : pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
1894 : #endif
1895 : }
1896 :
1897 : /* maxLocal is the maximum amount of payload to store locally for
1898 : ** a cell. Make sure it is small enough so that at least minFanout
1899 : ** cells can will fit on one page. We assume a 10-byte page header.
1900 : ** Besides the payload, the cell must store:
1901 : ** 2-byte pointer to the cell
1902 : ** 4-byte child pointer
1903 : ** 9-byte nKey value
1904 : ** 4-byte nData value
1905 : ** 4-byte overflow page pointer
1906 : ** So a cell consists of a 2-byte poiner, a header which is as much as
1907 : ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
1908 : ** page pointer.
1909 : */
1910 755 : pBt->maxLocal = (pBt->usableSize-12)*pBt->maxEmbedFrac/255 - 23;
1911 755 : pBt->minLocal = (pBt->usableSize-12)*pBt->minEmbedFrac/255 - 23;
1912 755 : pBt->maxLeaf = pBt->usableSize - 35;
1913 755 : pBt->minLeaf = (pBt->usableSize-12)*pBt->minLeafFrac/255 - 23;
1914 755 : if( pBt->minLocal>pBt->maxLocal || pBt->maxLocal<0 ){
1915 : goto page1_init_failed;
1916 : }
1917 : assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
1918 755 : pBt->pPage1 = pPage1;
1919 755 : return SQLITE_OK;
1920 :
1921 0 : page1_init_failed:
1922 0 : releasePage(pPage1);
1923 0 : pBt->pPage1 = 0;
1924 0 : return rc;
1925 : }
1926 :
1927 : /*
1928 : ** This routine works like lockBtree() except that it also invokes the
1929 : ** busy callback if there is lock contention.
1930 : */
1931 417 : static int lockBtreeWithRetry(Btree *pRef){
1932 417 : int rc = SQLITE_OK;
1933 417 : if( pRef->inTrans==TRANS_NONE ){
1934 417 : u8 inTransaction = pRef->pBt->inTransaction;
1935 : btreeIntegrity(pRef);
1936 417 : rc = sqlite3BtreeBeginTrans(pRef, 0);
1937 417 : pRef->pBt->inTransaction = inTransaction;
1938 417 : pRef->inTrans = TRANS_NONE;
1939 417 : if( rc==SQLITE_OK ){
1940 417 : pRef->pBt->nTransaction--;
1941 : }
1942 : btreeIntegrity(pRef);
1943 : }
1944 417 : return rc;
1945 : }
1946 :
1947 :
1948 : /*
1949 : ** If there are no outstanding cursors and we are not in the middle
1950 : ** of a transaction but there is a read lock on the database, then
1951 : ** this routine unrefs the first page of the database file which
1952 : ** has the effect of releasing the read lock.
1953 : **
1954 : ** If there are any outstanding cursors, this routine is a no-op.
1955 : **
1956 : ** If there is a transaction in progress, this routine is a no-op.
1957 : */
1958 1840 : static void unlockBtreeIfUnused(BtShared *pBt){
1959 1840 : if( pBt->inTransaction==TRANS_NONE && pBt->pCursor==0 && pBt->pPage1!=0 ){
1960 754 : if( sqlite3PagerRefcount(pBt->pPager)>=1 ){
1961 754 : if( pBt->pPage1->aData==0 ){
1962 0 : MemPage *pPage = pBt->pPage1;
1963 0 : pPage->aData = &((u8*)pPage)[-pBt->pageSize];
1964 0 : pPage->pBt = pBt;
1965 0 : pPage->pgno = 1;
1966 : }
1967 754 : releasePage(pBt->pPage1);
1968 : }
1969 754 : pBt->pPage1 = 0;
1970 754 : pBt->inStmt = 0;
1971 : }
1972 1840 : }
1973 :
1974 : /*
1975 : ** Create a new database by initializing the first page of the
1976 : ** file.
1977 : */
1978 211 : static int newDatabase(BtShared *pBt){
1979 : MemPage *pP1;
1980 : unsigned char *data;
1981 : int rc;
1982 211 : if( sqlite3PagerPagecount(pBt->pPager)>0 ) return SQLITE_OK;
1983 50 : pP1 = pBt->pPage1;
1984 : assert( pP1!=0 );
1985 50 : data = pP1->aData;
1986 50 : rc = sqlite3PagerWrite(pP1->pDbPage);
1987 50 : if( rc ) return rc;
1988 50 : memcpy(data, zMagicHeader, sizeof(zMagicHeader));
1989 : assert( sizeof(zMagicHeader)==16 );
1990 50 : put2byte(&data[16], pBt->pageSize);
1991 50 : data[18] = 1;
1992 50 : data[19] = 1;
1993 50 : data[20] = pBt->pageSize - pBt->usableSize;
1994 50 : data[21] = pBt->maxEmbedFrac;
1995 50 : data[22] = pBt->minEmbedFrac;
1996 50 : data[23] = pBt->minLeafFrac;
1997 50 : memset(&data[24], 0, 100-24);
1998 50 : zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
1999 50 : pBt->pageSizeFixed = 1;
2000 : #ifndef SQLITE_OMIT_AUTOVACUUM
2001 50 : if( pBt->autoVacuum ){
2002 0 : put4byte(&data[36 + 4*4], 1);
2003 : }
2004 : #endif
2005 50 : return SQLITE_OK;
2006 : }
2007 :
2008 : /*
2009 : ** Attempt to start a new transaction. A write-transaction
2010 : ** is started if the second argument is nonzero, otherwise a read-
2011 : ** transaction. If the second argument is 2 or more and exclusive
2012 : ** transaction is started, meaning that no other process is allowed
2013 : ** to access the database. A preexisting transaction may not be
2014 : ** upgraded to exclusive by calling this routine a second time - the
2015 : ** exclusivity flag only works for a new transaction.
2016 : **
2017 : ** A write-transaction must be started before attempting any
2018 : ** changes to the database. None of the following routines
2019 : ** will work unless a transaction is started first:
2020 : **
2021 : ** sqlite3BtreeCreateTable()
2022 : ** sqlite3BtreeCreateIndex()
2023 : ** sqlite3BtreeClearTable()
2024 : ** sqlite3BtreeDropTable()
2025 : ** sqlite3BtreeInsert()
2026 : ** sqlite3BtreeDelete()
2027 : ** sqlite3BtreeUpdateMeta()
2028 : **
2029 : ** If an initial attempt to acquire the lock fails because of lock contention
2030 : ** and the database was previously unlocked, then invoke the busy handler
2031 : ** if there is one. But if there was previously a read-lock, do not
2032 : ** invoke the busy handler - just return SQLITE_BUSY. SQLITE_BUSY is
2033 : ** returned when there is already a read-lock in order to avoid a deadlock.
2034 : **
2035 : ** Suppose there are two processes A and B. A has a read lock and B has
2036 : ** a reserved lock. B tries to promote to exclusive but is blocked because
2037 : ** of A's read lock. A tries to promote to reserved but is blocked by B.
2038 : ** One or the other of the two processes must give way or there can be
2039 : ** no progress. By returning SQLITE_BUSY and not invoking the busy callback
2040 : ** when A already has a read lock, we encourage A to give up and let B
2041 : ** proceed.
2042 : */
2043 821 : int sqlite3BtreeBeginTrans(Btree *p, int wrflag){
2044 821 : BtShared *pBt = p->pBt;
2045 821 : int rc = SQLITE_OK;
2046 :
2047 : btreeIntegrity(p);
2048 :
2049 : /* If the btree is already in a write-transaction, or it
2050 : ** is already in a read-transaction and a read-transaction
2051 : ** is requested, this is a no-op.
2052 : */
2053 821 : if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){
2054 60 : return SQLITE_OK;
2055 : }
2056 :
2057 : /* Write transactions are not possible on a read-only database */
2058 761 : if( pBt->readOnly && wrflag ){
2059 0 : return SQLITE_READONLY;
2060 : }
2061 :
2062 : /* If another database handle has already opened a write transaction
2063 : ** on this shared-btree structure and a second write transaction is
2064 : ** requested, return SQLITE_BUSY.
2065 : */
2066 761 : if( pBt->inTransaction==TRANS_WRITE && wrflag ){
2067 0 : return SQLITE_BUSY;
2068 : }
2069 :
2070 : do {
2071 761 : if( pBt->pPage1==0 ){
2072 755 : rc = lockBtree(pBt);
2073 : }
2074 :
2075 761 : if( rc==SQLITE_OK && wrflag ){
2076 211 : if( pBt->readOnly ){
2077 0 : rc = SQLITE_READONLY;
2078 : }else{
2079 211 : rc = sqlite3PagerBegin(pBt->pPage1->pDbPage, wrflag>1);
2080 211 : if( rc==SQLITE_OK ){
2081 211 : rc = newDatabase(pBt);
2082 : }
2083 : }
2084 : }
2085 :
2086 761 : if( rc==SQLITE_OK ){
2087 761 : if( wrflag ) pBt->inStmt = 0;
2088 : }else{
2089 0 : unlockBtreeIfUnused(pBt);
2090 : }
2091 : }while( rc==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
2092 761 : sqlite3InvokeBusyHandler(pBt->pBusyHandler) );
2093 :
2094 761 : if( rc==SQLITE_OK ){
2095 761 : if( p->inTrans==TRANS_NONE ){
2096 759 : pBt->nTransaction++;
2097 : }
2098 761 : p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
2099 761 : if( p->inTrans>pBt->inTransaction ){
2100 761 : pBt->inTransaction = p->inTrans;
2101 : }
2102 : }
2103 :
2104 : btreeIntegrity(p);
2105 761 : return rc;
2106 : }
2107 :
2108 : #ifndef SQLITE_OMIT_AUTOVACUUM
2109 :
2110 : /*
2111 : ** Set the pointer-map entries for all children of page pPage. Also, if
2112 : ** pPage contains cells that point to overflow pages, set the pointer
2113 : ** map entries for the overflow pages as well.
2114 : */
2115 0 : static int setChildPtrmaps(MemPage *pPage){
2116 : int i; /* Counter variable */
2117 : int nCell; /* Number of cells in page pPage */
2118 0 : int rc = SQLITE_OK; /* Return code */
2119 0 : BtShared *pBt = pPage->pBt;
2120 0 : int isInitOrig = pPage->isInit;
2121 0 : Pgno pgno = pPage->pgno;
2122 :
2123 0 : initPage(pPage, 0);
2124 0 : nCell = pPage->nCell;
2125 :
2126 0 : for(i=0; i<nCell; i++){
2127 0 : u8 *pCell = findCell(pPage, i);
2128 :
2129 0 : rc = ptrmapPutOvflPtr(pPage, pCell);
2130 0 : if( rc!=SQLITE_OK ){
2131 0 : goto set_child_ptrmaps_out;
2132 : }
2133 :
2134 0 : if( !pPage->leaf ){
2135 0 : Pgno childPgno = get4byte(pCell);
2136 0 : rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno);
2137 0 : if( rc!=SQLITE_OK ) goto set_child_ptrmaps_out;
2138 : }
2139 : }
2140 :
2141 0 : if( !pPage->leaf ){
2142 0 : Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
2143 0 : rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno);
2144 : }
2145 :
2146 0 : set_child_ptrmaps_out:
2147 0 : pPage->isInit = isInitOrig;
2148 0 : return rc;
2149 : }
2150 :
2151 : /*
2152 : ** Somewhere on pPage, which is guarenteed to be a btree page, not an overflow
2153 : ** page, is a pointer to page iFrom. Modify this pointer so that it points to
2154 : ** iTo. Parameter eType describes the type of pointer to be modified, as
2155 : ** follows:
2156 : **
2157 : ** PTRMAP_BTREE: pPage is a btree-page. The pointer points at a child
2158 : ** page of pPage.
2159 : **
2160 : ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow
2161 : ** page pointed to by one of the cells on pPage.
2162 : **
2163 : ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next
2164 : ** overflow page in the list.
2165 : */
2166 0 : static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){
2167 0 : if( eType==PTRMAP_OVERFLOW2 ){
2168 : /* The pointer is always the first 4 bytes of the page in this case. */
2169 0 : if( get4byte(pPage->aData)!=iFrom ){
2170 0 : return SQLITE_CORRUPT_BKPT;
2171 : }
2172 0 : put4byte(pPage->aData, iTo);
2173 : }else{
2174 0 : int isInitOrig = pPage->isInit;
2175 : int i;
2176 : int nCell;
2177 :
2178 0 : initPage(pPage, 0);
2179 0 : nCell = pPage->nCell;
2180 :
2181 0 : for(i=0; i<nCell; i++){
2182 0 : u8 *pCell = findCell(pPage, i);
2183 0 : if( eType==PTRMAP_OVERFLOW1 ){
2184 : CellInfo info;
2185 0 : parseCellPtr(pPage, pCell, &info);
2186 0 : if( info.iOverflow ){
2187 0 : if( iFrom==get4byte(&pCell[info.iOverflow]) ){
2188 0 : put4byte(&pCell[info.iOverflow], iTo);
2189 0 : break;
2190 : }
2191 : }
2192 : }else{
2193 0 : if( get4byte(pCell)==iFrom ){
2194 0 : put4byte(pCell, iTo);
2195 0 : break;
2196 : }
2197 : }
2198 : }
2199 :
2200 0 : if( i==nCell ){
2201 0 : if( eType!=PTRMAP_BTREE ||
2202 : get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){
2203 0 : return SQLITE_CORRUPT_BKPT;
2204 : }
2205 0 : put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);
2206 : }
2207 :
2208 0 : pPage->isInit = isInitOrig;
2209 : }
2210 0 : return SQLITE_OK;
2211 : }
2212 :
2213 :
2214 : /*
2215 : ** Move the open database page pDbPage to location iFreePage in the
2216 : ** database. The pDbPage reference remains valid.
2217 : */
2218 : static int relocatePage(
2219 : BtShared *pBt, /* Btree */
2220 : MemPage *pDbPage, /* Open page to move */
2221 : u8 eType, /* Pointer map 'type' entry for pDbPage */
2222 : Pgno iPtrPage, /* Pointer map 'page-no' entry for pDbPage */
2223 : Pgno iFreePage /* The location to move pDbPage to */
2224 0 : ){
2225 : MemPage *pPtrPage; /* The page that contains a pointer to pDbPage */
2226 0 : Pgno iDbPage = pDbPage->pgno;
2227 0 : Pager *pPager = pBt->pPager;
2228 : int rc;
2229 :
2230 : assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 ||
2231 : eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE );
2232 :
2233 : /* Move page iDbPage from it's current location to page number iFreePage */
2234 : TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n",
2235 : iDbPage, iFreePage, iPtrPage, eType));
2236 0 : rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage);
2237 0 : if( rc!=SQLITE_OK ){
2238 0 : return rc;
2239 : }
2240 0 : pDbPage->pgno = iFreePage;
2241 :
2242 : /* If pDbPage was a btree-page, then it may have child pages and/or cells
2243 : ** that point to overflow pages. The pointer map entries for all these
2244 : ** pages need to be changed.
2245 : **
2246 : ** If pDbPage is an overflow page, then the first 4 bytes may store a
2247 : ** pointer to a subsequent overflow page. If this is the case, then
2248 : ** the pointer map needs to be updated for the subsequent overflow page.
2249 : */
2250 0 : if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){
2251 0 : rc = setChildPtrmaps(pDbPage);
2252 0 : if( rc!=SQLITE_OK ){
2253 0 : return rc;
2254 : }
2255 : }else{
2256 0 : Pgno nextOvfl = get4byte(pDbPage->aData);
2257 0 : if( nextOvfl!=0 ){
2258 0 : rc = ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage);
2259 0 : if( rc!=SQLITE_OK ){
2260 0 : return rc;
2261 : }
2262 : }
2263 : }
2264 :
2265 : /* Fix the database pointer on page iPtrPage that pointed at iDbPage so
2266 : ** that it points at iFreePage. Also fix the pointer map entry for
2267 : ** iPtrPage.
2268 : */
2269 0 : if( eType!=PTRMAP_ROOTPAGE ){
2270 0 : rc = getPage(pBt, iPtrPage, &pPtrPage, 0);
2271 0 : if( rc!=SQLITE_OK ){
2272 0 : return rc;
2273 : }
2274 0 : rc = sqlite3PagerWrite(pPtrPage->pDbPage);
2275 0 : if( rc!=SQLITE_OK ){
2276 0 : releasePage(pPtrPage);
2277 0 : return rc;
2278 : }
2279 0 : rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);
2280 0 : releasePage(pPtrPage);
2281 0 : if( rc==SQLITE_OK ){
2282 0 : rc = ptrmapPut(pBt, iFreePage, eType, iPtrPage);
2283 : }
2284 : }
2285 0 : return rc;
2286 : }
2287 :
2288 : /* Forward declaration required by autoVacuumCommit(). */
2289 : static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8);
2290 :
2291 : /*
2292 : ** This routine is called prior to sqlite3PagerCommit when a transaction
2293 : ** is commited for an auto-vacuum database.
2294 : **
2295 : ** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages
2296 : ** the database file should be truncated to during the commit process.
2297 : ** i.e. the database has been reorganized so that only the first *pnTrunc
2298 : ** pages are in use.
2299 : */
2300 0 : static int autoVacuumCommit(BtShared *pBt, Pgno *pnTrunc){
2301 0 : Pager *pPager = pBt->pPager;
2302 : Pgno nFreeList; /* Number of pages remaining on the free-list. */
2303 : int nPtrMap; /* Number of pointer-map pages deallocated */
2304 : Pgno origSize; /* Pages in the database file */
2305 : Pgno finSize; /* Pages in the database file after truncation */
2306 : int rc; /* Return code */
2307 : u8 eType;
2308 0 : int pgsz = pBt->pageSize; /* Page size for this database */
2309 : Pgno iDbPage; /* The database page to move */
2310 0 : MemPage *pDbMemPage = 0; /* "" */
2311 : Pgno iPtrPage; /* The page that contains a pointer to iDbPage */
2312 : Pgno iFreePage; /* The free-list page to move iDbPage to */
2313 0 : MemPage *pFreeMemPage = 0; /* "" */
2314 :
2315 : #ifndef NDEBUG
2316 : int nRef = sqlite3PagerRefcount(pPager);
2317 : #endif
2318 :
2319 : assert( pBt->autoVacuum );
2320 0 : if( PTRMAP_ISPAGE(pBt, sqlite3PagerPagecount(pPager)) ){
2321 0 : return SQLITE_CORRUPT_BKPT;
2322 : }
2323 :
2324 : /* Figure out how many free-pages are in the database. If there are no
2325 : ** free pages, then auto-vacuum is a no-op.
2326 : */
2327 0 : nFreeList = get4byte(&pBt->pPage1->aData[36]);
2328 0 : if( nFreeList==0 ){
2329 0 : *pnTrunc = 0;
2330 0 : return SQLITE_OK;
2331 : }
2332 :
2333 : /* This block figures out how many pages there are in the database
2334 : ** now (variable origSize), and how many there will be after the
2335 : ** truncation (variable finSize).
2336 : **
2337 : ** The final size is the original size, less the number of free pages
2338 : ** in the database, less any pointer-map pages that will no longer
2339 : ** be required, less 1 if the pending-byte page was part of the database
2340 : ** but is not after the truncation.
2341 : **/
2342 0 : origSize = sqlite3PagerPagecount(pPager);
2343 0 : if( origSize==PENDING_BYTE_PAGE(pBt) ){
2344 0 : origSize--;
2345 : }
2346 0 : nPtrMap = (nFreeList-origSize+PTRMAP_PAGENO(pBt, origSize)+pgsz/5)/(pgsz/5);
2347 0 : finSize = origSize - nFreeList - nPtrMap;
2348 0 : if( origSize>PENDING_BYTE_PAGE(pBt) && finSize<=PENDING_BYTE_PAGE(pBt) ){
2349 0 : finSize--;
2350 : }
2351 0 : while( PTRMAP_ISPAGE(pBt, finSize) || finSize==PENDING_BYTE_PAGE(pBt) ){
2352 0 : finSize--;
2353 : }
2354 : TRACE(("AUTOVACUUM: Begin (db size %d->%d)\n", origSize, finSize));
2355 :
2356 : /* Variable 'finSize' will be the size of the file in pages after
2357 : ** the auto-vacuum has completed (the current file size minus the number
2358 : ** of pages on the free list). Loop through the pages that lie beyond
2359 : ** this mark, and if they are not already on the free list, move them
2360 : ** to a free page earlier in the file (somewhere before finSize).
2361 : */
2362 0 : for( iDbPage=finSize+1; iDbPage<=origSize; iDbPage++ ){
2363 : /* If iDbPage is a pointer map page, or the pending-byte page, skip it. */
2364 0 : if( PTRMAP_ISPAGE(pBt, iDbPage) || iDbPage==PENDING_BYTE_PAGE(pBt) ){
2365 : continue;
2366 : }
2367 :
2368 0 : rc = ptrmapGet(pBt, iDbPage, &eType, &iPtrPage);
2369 0 : if( rc!=SQLITE_OK ) goto autovacuum_out;
2370 0 : if( eType==PTRMAP_ROOTPAGE ){
2371 0 : rc = SQLITE_CORRUPT_BKPT;
2372 0 : goto autovacuum_out;
2373 : }
2374 :
2375 : /* If iDbPage is free, do not swap it. */
2376 0 : if( eType==PTRMAP_FREEPAGE ){
2377 0 : continue;
2378 : }
2379 0 : rc = getPage(pBt, iDbPage, &pDbMemPage, 0);
2380 0 : if( rc!=SQLITE_OK ) goto autovacuum_out;
2381 :
2382 : /* Find the next page in the free-list that is not already at the end
2383 : ** of the file. A page can be pulled off the free list using the
2384 : ** allocateBtreePage() routine.
2385 : */
2386 : do{
2387 0 : if( pFreeMemPage ){
2388 0 : releasePage(pFreeMemPage);
2389 0 : pFreeMemPage = 0;
2390 : }
2391 0 : rc = allocateBtreePage(pBt, &pFreeMemPage, &iFreePage, 0, 0);
2392 0 : if( rc!=SQLITE_OK ){
2393 0 : releasePage(pDbMemPage);
2394 0 : goto autovacuum_out;
2395 : }
2396 : assert( iFreePage<=origSize );
2397 0 : }while( iFreePage>finSize );
2398 0 : releasePage(pFreeMemPage);
2399 0 : pFreeMemPage = 0;
2400 :
2401 : /* Relocate the page into the body of the file. Note that although the
2402 : ** page has moved within the database file, the pDbMemPage pointer
2403 : ** remains valid. This means that this function can run without
2404 : ** invalidating cursors open on the btree. This is important in
2405 : ** shared-cache mode.
2406 : */
2407 0 : rc = relocatePage(pBt, pDbMemPage, eType, iPtrPage, iFreePage);
2408 0 : releasePage(pDbMemPage);
2409 0 : if( rc!=SQLITE_OK ) goto autovacuum_out;
2410 : }
2411 :
2412 : /* The entire free-list has been swapped to the end of the file. So
2413 : ** truncate the database file to finSize pages and consider the
2414 : ** free-list empty.
2415 : */
2416 0 : rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
2417 0 : if( rc!=SQLITE_OK ) goto autovacuum_out;
2418 0 : put4byte(&pBt->pPage1->aData[32], 0);
2419 0 : put4byte(&pBt->pPage1->aData[36], 0);
2420 0 : *pnTrunc = finSize;
2421 : assert( finSize!=PENDING_BYTE_PAGE(pBt) );
2422 :
2423 0 : autovacuum_out:
2424 : assert( nRef==sqlite3PagerRefcount(pPager) );
2425 0 : if( rc!=SQLITE_OK ){
2426 0 : sqlite3PagerRollback(pPager);
2427 : }
2428 0 : return rc;
2429 : }
2430 : #endif
2431 :
2432 : /*
2433 : ** This routine does the first phase of a two-phase commit. This routine
2434 : ** causes a rollback journal to be created (if it does not already exist)
2435 : ** and populated with enough information so that if a power loss occurs
2436 : ** the database can be restored to its original state by playing back
2437 : ** the journal. Then the contents of the journal are flushed out to
2438 : ** the disk. After the journal is safely on oxide, the changes to the
2439 : ** database are written into the database file and flushed to oxide.
2440 : ** At the end of this call, the rollback journal still exists on the
2441 : ** disk and we are still holding all locks, so the transaction has not
2442 : ** committed. See sqlite3BtreeCommit() for the second phase of the
2443 : ** commit process.
2444 : **
2445 : ** This call is a no-op if no write-transaction is currently active on pBt.
2446 : **
2447 : ** Otherwise, sync the database file for the btree pBt. zMaster points to
2448 : ** the name of a master journal file that should be written into the
2449 : ** individual journal file, or is NULL, indicating no master journal file
2450 : ** (single database transaction).
2451 : **
2452 : ** When this is called, the master journal should already have been
2453 : ** created, populated with this journal pointer and synced to disk.
2454 : **
2455 : ** Once this is routine has returned, the only thing required to commit
2456 : ** the write-transaction for this database file is to delete the journal.
2457 : */
2458 562 : int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){
2459 562 : int rc = SQLITE_OK;
2460 562 : if( p->inTrans==TRANS_WRITE ){
2461 209 : BtShared *pBt = p->pBt;
2462 209 : Pgno nTrunc = 0;
2463 : #ifndef SQLITE_OMIT_AUTOVACUUM
2464 209 : if( pBt->autoVacuum ){
2465 0 : rc = autoVacuumCommit(pBt, &nTrunc);
2466 0 : if( rc!=SQLITE_OK ){
2467 0 : return rc;
2468 : }
2469 : }
2470 : #endif
2471 209 : rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, nTrunc);
2472 : }
2473 562 : return rc;
2474 : }
2475 :
2476 : /*
2477 : ** Commit the transaction currently in progress.
2478 : **
2479 : ** This routine implements the second phase of a 2-phase commit. The
2480 : ** sqlite3BtreeSync() routine does the first phase and should be invoked
2481 : ** prior to calling this routine. The sqlite3BtreeSync() routine did
2482 : ** all the work of writing information out to disk and flushing the
2483 : ** contents so that they are written onto the disk platter. All this
2484 : ** routine has to do is delete or truncate the rollback journal
2485 : ** (which causes the transaction to commit) and drop locks.
2486 : **
2487 : ** This will release the write lock on the database file. If there
2488 : ** are no active cursors, it also releases the read lock.
2489 : */
2490 562 : int sqlite3BtreeCommitPhaseTwo(Btree *p){
2491 562 : BtShared *pBt = p->pBt;
2492 :
2493 : btreeIntegrity(p);
2494 :
2495 : /* If the handle has a write-transaction open, commit the shared-btrees
2496 : ** transaction and set the shared state to TRANS_READ.
2497 : */
2498 562 : if( p->inTrans==TRANS_WRITE ){
2499 : int rc;
2500 : assert( pBt->inTransaction==TRANS_WRITE );
2501 : assert( pBt->nTransaction>0 );
2502 209 : rc = sqlite3PagerCommitPhaseTwo(pBt->pPager);
2503 209 : if( rc!=SQLITE_OK ){
2504 0 : return rc;
2505 : }
2506 209 : pBt->inTransaction = TRANS_READ;
2507 209 : pBt->inStmt = 0;
2508 : }
2509 562 : unlockAllTables(p);
2510 :
2511 : /* If the handle has any kind of transaction open, decrement the transaction
2512 : ** count of the shared btree. If the transaction count reaches 0, set
2513 : ** the shared state to TRANS_NONE. The unlockBtreeIfUnused() call below
2514 : ** will unlock the pager.
2515 : */
2516 562 : if( p->inTrans!=TRANS_NONE ){
2517 339 : pBt->nTransaction--;
2518 339 : if( 0==pBt->nTransaction ){
2519 339 : pBt->inTransaction = TRANS_NONE;
2520 : }
2521 : }
2522 :
2523 : /* Set the handles current transaction state to TRANS_NONE and unlock
2524 : ** the pager if this call closed the only read or write transaction.
2525 : */
2526 562 : p->inTrans = TRANS_NONE;
2527 562 : unlockBtreeIfUnused(pBt);
2528 :
2529 : btreeIntegrity(p);
2530 562 : return SQLITE_OK;
2531 : }
2532 :
2533 : /*
2534 : ** Do both phases of a commit.
2535 : */
2536 0 : int sqlite3BtreeCommit(Btree *p){
2537 : int rc;
2538 0 : rc = sqlite3BtreeCommitPhaseOne(p, 0);
2539 0 : if( rc==SQLITE_OK ){
2540 0 : rc = sqlite3BtreeCommitPhaseTwo(p);
2541 : }
2542 0 : return rc;
2543 : }
2544 :
2545 : #ifndef NDEBUG
2546 : /*
2547 : ** Return the number of write-cursors open on this handle. This is for use
2548 : ** in assert() expressions, so it is only compiled if NDEBUG is not
2549 : ** defined.
2550 : */
2551 : static int countWriteCursors(BtShared *pBt){
2552 : BtCursor *pCur;
2553 : int r = 0;
2554 : for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
2555 : if( pCur->wrFlag ) r++;
2556 : }
2557 : return r;
2558 : }
2559 : #endif
2560 :
2561 : #if defined(SQLITE_TEST) || defined(SQLITE_DEBUG)
2562 : /*
2563 : ** Print debugging information about all cursors to standard output.
2564 : */
2565 : void sqlite3BtreeCursorList(Btree *p){
2566 : BtCursor *pCur;
2567 : BtShared *pBt = p->pBt;
2568 : for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
2569 : MemPage *pPage = pCur->pPage;
2570 : char *zMode = pCur->wrFlag ? "rw" : "ro";
2571 : sqlite3DebugPrintf("CURSOR %p rooted at %4d(%s) currently at %d.%d%s\n",
2572 : pCur, pCur->pgnoRoot, zMode,
2573 : pPage ? pPage->pgno : 0, pCur->idx,
2574 : (pCur->eState==CURSOR_VALID) ? "" : " eof"
2575 : );
2576 : }
2577 : }
2578 : #endif
2579 :
2580 : /*
2581 : ** Rollback the transaction in progress. All cursors will be
2582 : ** invalided by this operation. Any attempt to use a cursor
2583 : ** that was open at the beginning of this operation will result
2584 : ** in an error.
2585 : **
2586 : ** This will release the write lock on the database file. If there
2587 : ** are no active cursors, it also releases the read lock.
2588 : */
2589 132 : int sqlite3BtreeRollback(Btree *p){
2590 : int rc;
2591 132 : BtShared *pBt = p->pBt;
2592 : MemPage *pPage1;
2593 :
2594 132 : rc = saveAllCursors(pBt, 0, 0);
2595 : #ifndef SQLITE_OMIT_SHARED_CACHE
2596 132 : if( rc!=SQLITE_OK ){
2597 : /* This is a horrible situation. An IO or malloc() error occured whilst
2598 : ** trying to save cursor positions. If this is an automatic rollback (as
2599 : ** the result of a constraint, malloc() failure or IO error) then
2600 : ** the cache may be internally inconsistent (not contain valid trees) so
2601 : ** we cannot simply return the error to the caller. Instead, abort
2602 : ** all queries that may be using any of the cursors that failed to save.
2603 : */
2604 0 : while( pBt->pCursor ){
2605 0 : sqlite3 *db = pBt->pCursor->pBtree->pSqlite;
2606 0 : if( db ){
2607 0 : sqlite3AbortOtherActiveVdbes(db, 0);
2608 : }
2609 : }
2610 : }
2611 : #endif
2612 : btreeIntegrity(p);
2613 132 : unlockAllTables(p);
2614 :
2615 132 : if( p->inTrans==TRANS_WRITE ){
2616 : int rc2;
2617 :
2618 : assert( TRANS_WRITE==pBt->inTransaction );
2619 2 : rc2 = sqlite3PagerRollback(pBt->pPager);
2620 2 : if( rc2!=SQLITE_OK ){
2621 0 : rc = rc2;
2622 : }
2623 :
2624 : /* The rollback may have destroyed the pPage1->aData value. So
2625 : ** call getPage() on page 1 again to make sure pPage1->aData is
2626 : ** set correctly. */
2627 2 : if( getPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){
2628 2 : releasePage(pPage1);
2629 : }
2630 : assert( countWriteCursors(pBt)==0 );
2631 2 : pBt->inTransaction = TRANS_READ;
2632 : }
2633 :
2634 132 : if( p->inTrans!=TRANS_NONE ){
2635 : assert( pBt->nTransaction>0 );
2636 2 : pBt->nTransaction--;
2637 2 : if( 0==pBt->nTransaction ){
2638 2 : pBt->inTransaction = TRANS_NONE;
2639 : }
2640 : }
2641 :
2642 132 : p->inTrans = TRANS_NONE;
2643 132 : pBt->inStmt = 0;
2644 132 : unlockBtreeIfUnused(pBt);
2645 :
2646 : btreeIntegrity(p);
2647 132 : return rc;
2648 : }
2649 :
2650 : /*
2651 : ** Start a statement subtransaction. The subtransaction can
2652 : ** can be rolled back independently of the main transaction.
2653 : ** You must start a transaction before starting a subtransaction.
2654 : ** The subtransaction is ended automatically if the main transaction
2655 : ** commits or rolls back.
2656 : **
2657 : ** Only one subtransaction may be active at a time. It is an error to try
2658 : ** to start a new subtransaction if another subtransaction is already active.
2659 : **
2660 : ** Statement subtransactions are used around individual SQL statements
2661 : ** that are contained within a BEGIN...COMMIT block. If a constraint
2662 : ** error occurs within the statement, the effect of that one statement
2663 : ** can be rolled back without having to rollback the entire transaction.
2664 : */
2665 0 : int sqlite3BtreeBeginStmt(Btree *p){
2666 : int rc;
2667 0 : BtShared *pBt = p->pBt;
2668 0 : if( (p->inTrans!=TRANS_WRITE) || pBt->inStmt ){
2669 0 : return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
2670 : }
2671 : assert( pBt->inTransaction==TRANS_WRITE );
2672 0 : rc = pBt->readOnly ? SQLITE_OK : sqlite3PagerStmtBegin(pBt->pPager);
2673 0 : pBt->inStmt = 1;
2674 0 : return rc;
2675 : }
2676 :
2677 :
2678 : /*
2679 : ** Commit the statment subtransaction currently in progress. If no
2680 : ** subtransaction is active, this is a no-op.
2681 : */
2682 131 : int sqlite3BtreeCommitStmt(Btree *p){
2683 : int rc;
2684 131 : BtShared *pBt = p->pBt;
2685 131 : if( pBt->inStmt && !pBt->readOnly ){
2686 0 : rc = sqlite3PagerStmtCommit(pBt->pPager);
2687 : }else{
2688 131 : rc = SQLITE_OK;
2689 : }
2690 131 : pBt->inStmt = 0;
2691 131 : return rc;
2692 : }
2693 :
2694 : /*
2695 : ** Rollback the active statement subtransaction. If no subtransaction
2696 : ** is active this routine is a no-op.
2697 : **
2698 : ** All cursors will be invalidated by this operation. Any attempt
2699 : ** to use a cursor that was open at the beginning of this operation
2700 : ** will result in an error.
2701 : */
2702 1 : int sqlite3BtreeRollbackStmt(Btree *p){
2703 1 : int rc = SQLITE_OK;
2704 1 : BtShared *pBt = p->pBt;
2705 : sqlite3MallocDisallow();
2706 1 : if( pBt->inStmt && !pBt->readOnly ){
2707 0 : rc = sqlite3PagerStmtRollback(pBt->pPager);
2708 : assert( countWriteCursors(pBt)==0 );
2709 0 : pBt->inStmt = 0;
2710 : }
2711 : sqlite3MallocAllow();
2712 1 : return rc;
2713 : }
2714 :
2715 : /*
2716 : ** Default key comparison function to be used if no comparison function
2717 : ** is specified on the sqlite3BtreeCursor() call.
2718 : */
2719 : static int dfltCompare(
2720 : void *NotUsed, /* User data is not used */
2721 : int n1, const void *p1, /* First key to compare */
2722 : int n2, const void *p2 /* Second key to compare */
2723 0 : ){
2724 : int c;
2725 0 : c = memcmp(p1, p2, n1<n2 ? n1 : n2);
2726 0 : if( c==0 ){
2727 0 : c = n1 - n2;
2728 : }
2729 0 : return c;
2730 : }
2731 :
2732 : /*
2733 : ** Create a new cursor for the BTree whose root is on the page
2734 : ** iTable. The act of acquiring a cursor gets a read lock on
2735 : ** the database file.
2736 : **
2737 : ** If wrFlag==0, then the cursor can only be used for reading.
2738 : ** If wrFlag==1, then the cursor can be used for reading or for
2739 : ** writing if other conditions for writing are also met. These
2740 : ** are the conditions that must be met in order for writing to
2741 : ** be allowed:
2742 : **
2743 : ** 1: The cursor must have been opened with wrFlag==1
2744 : **
2745 : ** 2: Other database connections that share the same pager cache
2746 : ** but which are not in the READ_UNCOMMITTED state may not have
2747 : ** cursors open with wrFlag==0 on the same table. Otherwise
2748 : ** the changes made by this write cursor would be visible to
2749 : ** the read cursors in the other database connection.
2750 : **
2751 : ** 3: The database must be writable (not on read-only media)
2752 : **
2753 : ** 4: There must be an active transaction.
2754 : **
2755 : ** No checking is done to make sure that page iTable really is the
2756 : ** root page of a b-tree. If it is not, then the cursor acquired
2757 : ** will not work correctly.
2758 : **
2759 : ** The comparison function must be logically the same for every cursor
2760 : ** on a particular table. Changing the comparison function will result
2761 : ** in incorrect operations. If the comparison function is NULL, a
2762 : ** default comparison function is used. The comparison function is
2763 : ** always ignored for INTKEY tables.
2764 : */
2765 : int sqlite3BtreeCursor(
2766 : Btree *p, /* The btree */
2767 : int iTable, /* Root page of table to open */
2768 : int wrFlag, /* 1 to write. 0 read-only */
2769 : int (*xCmp)(void*,int,const void*,int,const void*), /* Key Comparison func */
2770 : void *pArg, /* First arg to xCompare() */
2771 : BtCursor **ppCur /* Write new cursor here */
2772 1147 : ){
2773 : int rc;
2774 : BtCursor *pCur;
2775 1147 : BtShared *pBt = p->pBt;
2776 :
2777 1147 : *ppCur = 0;
2778 1147 : if( wrFlag ){
2779 431 : if( pBt->readOnly ){
2780 0 : return SQLITE_READONLY;
2781 : }
2782 431 : if( checkReadLocks(p, iTable, 0) ){
2783 0 : return SQLITE_LOCKED;
2784 : }
2785 : }
2786 :
2787 1147 : if( pBt->pPage1==0 ){
2788 417 : rc = lockBtreeWithRetry(p);
2789 417 : if( rc!=SQLITE_OK ){
2790 0 : return rc;
2791 : }
2792 417 : if( pBt->readOnly && wrFlag ){
2793 0 : return SQLITE_READONLY;
2794 : }
2795 : }
2796 1147 : pCur = sqliteMalloc( sizeof(*pCur) );
2797 1147 : if( pCur==0 ){
2798 0 : rc = SQLITE_NOMEM;
2799 0 : goto create_cursor_exception;
2800 : }
2801 1147 : pCur->pgnoRoot = (Pgno)iTable;
2802 1147 : if( iTable==1 && sqlite3PagerPagecount(pBt->pPager)==0 ){
2803 413 : rc = SQLITE_EMPTY;
2804 413 : goto create_cursor_exception;
2805 : }
2806 734 : rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->pPage, 0);
2807 734 : if( rc!=SQLITE_OK ){
2808 0 : goto create_cursor_exception;
2809 : }
2810 :
2811 : /* Now that no other errors can occur, finish filling in the BtCursor
2812 : ** variables, link the cursor into the BtShared list and set *ppCur (the
2813 : ** output argument to this function).
2814 : */
2815 734 : pCur->xCompare = xCmp ? xCmp : dfltCompare;
2816 734 : pCur->pArg = pArg;
2817 734 : pCur->pBtree = p;
2818 734 : pCur->wrFlag = wrFlag;
2819 734 : pCur->pNext = pBt->pCursor;
2820 734 : if( pCur->pNext ){
2821 165 : pCur->pNext->pPrev = pCur;
2822 : }
2823 734 : pBt->pCursor = pCur;
2824 734 : pCur->eState = CURSOR_INVALID;
2825 734 : *ppCur = pCur;
2826 :
2827 734 : return SQLITE_OK;
2828 413 : create_cursor_exception:
2829 413 : if( pCur ){
2830 413 : releasePage(pCur->pPage);
2831 413 : sqliteFree(pCur);
2832 : }
2833 413 : unlockBtreeIfUnused(pBt);
2834 413 : return rc;
2835 : }
2836 :
2837 : #if 0 /* Not Used */
2838 : /*
2839 : ** Change the value of the comparison function used by a cursor.
2840 : */
2841 : void sqlite3BtreeSetCompare(
2842 : BtCursor *pCur, /* The cursor to whose comparison function is changed */
2843 : int(*xCmp)(void*,int,const void*,int,const void*), /* New comparison func */
2844 : void *pArg /* First argument to xCmp() */
2845 : ){
2846 : pCur->xCompare = xCmp ? xCmp : dfltCompare;
2847 : pCur->pArg = pArg;
2848 : }
2849 : #endif
2850 :
2851 : /*
2852 : ** Close a cursor. The read lock on the database file is released
2853 : ** when the last cursor is closed.
2854 : */
2855 733 : int sqlite3BtreeCloseCursor(BtCursor *pCur){
2856 733 : BtShared *pBt = pCur->pBtree->pBt;
2857 733 : clearCursorPosition(pCur);
2858 733 : if( pCur->pPrev ){
2859 161 : pCur->pPrev->pNext = pCur->pNext;
2860 : }else{
2861 572 : pBt->pCursor = pCur->pNext;
2862 : }
2863 733 : if( pCur->pNext ){
2864 4 : pCur->pNext->pPrev = pCur->pPrev;
2865 : }
2866 733 : releasePage(pCur->pPage);
2867 733 : unlockBtreeIfUnused(pBt);
2868 733 : sqliteFree(pCur);
2869 733 : return SQLITE_OK;
2870 : }
2871 :
2872 : /*
2873 : ** Make a temporary cursor by filling in the fields of pTempCur.
2874 : ** The temporary cursor is not on the cursor list for the Btree.
2875 : */
2876 0 : static void getTempCursor(BtCursor *pCur, BtCursor *pTempCur){
2877 0 : memcpy(pTempCur, pCur, sizeof(*pCur));
2878 0 : pTempCur->pNext = 0;
2879 0 : pTempCur->pPrev = 0;
2880 0 : if( pTempCur->pPage ){
2881 0 : sqlite3PagerRef(pTempCur->pPage->pDbPage);
2882 : }
2883 0 : }
2884 :
2885 : /*
2886 : ** Delete a temporary cursor such as was made by the CreateTemporaryCursor()
2887 : ** function above.
2888 : */
2889 0 : static void releaseTempCursor(BtCursor *pCur){
2890 0 : if( pCur->pPage ){
2891 0 : sqlite3PagerUnref(pCur->pPage->pDbPage);
2892 : }
2893 0 : }
2894 :
2895 : /*
2896 : ** Make sure the BtCursor.info field of the given cursor is valid.
2897 : ** If it is not already valid, call parseCell() to fill it in.
2898 : **
2899 : ** BtCursor.info is a cache of the information in the current cell.
2900 : ** Using this cache reduces the number of calls to parseCell().
2901 : */
2902 1610 : static void getCellInfo(BtCursor *pCur){
2903 1610 : if( pCur->info.nSize==0 ){
2904 977 : parseCell(pCur->pPage, pCur->idx, &pCur->info);
2905 : }else{
2906 : #ifndef NDEBUG
2907 : CellInfo info;
2908 : memset(&info, 0, sizeof(info));
2909 : parseCell(pCur->pPage, pCur->idx, &info);
2910 : assert( memcmp(&info, &pCur->info, sizeof(info))==0 );
2911 : #endif
2912 : }
2913 1610 : }
2914 :
2915 : /*
2916 : ** Set *pSize to the size of the buffer needed to hold the value of
2917 : ** the key for the current entry. If the cursor is not pointing
2918 : ** to a valid entry, *pSize is set to 0.
2919 : **
2920 : ** For a table with the INTKEY flag set, this routine returns the key
2921 : ** itself, not the number of bytes in the key.
2922 : */
2923 272 : int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){
2924 272 : int rc = restoreOrClearCursorPosition(pCur);
2925 272 : if( rc==SQLITE_OK ){
2926 : assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
2927 272 : if( pCur->eState==CURSOR_INVALID ){
2928 0 : *pSize = 0;
2929 : }else{
2930 272 : getCellInfo(pCur);
2931 272 : *pSize = pCur->info.nKey;
2932 : }
2933 : }
2934 272 : return rc;
2935 : }
2936 :
2937 : /*
2938 : ** Set *pSize to the number of bytes of data in the entry the
2939 : ** cursor currently points to. Always return SQLITE_OK.
2940 : ** Failure is not possible. If the cursor is not currently
2941 : ** pointing to an entry (which can happen, for example, if
2942 : ** the database is empty) then *pSize is set to 0.
2943 : */
2944 420 : int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){
2945 420 : int rc = restoreOrClearCursorPosition(pCur);
2946 420 : if( rc==SQLITE_OK ){
2947 : assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
2948 420 : if( pCur->eState==CURSOR_INVALID ){
2949 : /* Not pointing at a valid entry - set *pSize to 0. */
2950 0 : *pSize = 0;
2951 : }else{
2952 420 : getCellInfo(pCur);
2953 420 : *pSize = pCur->info.nData;
2954 : }
2955 : }
2956 420 : return rc;
2957 : }
2958 :
2959 : /*
2960 : ** Read payload information from the entry that the pCur cursor is
2961 : ** pointing to. Begin reading the payload at "offset" and read
2962 : ** a total of "amt" bytes. Put the result in zBuf.
2963 : **
2964 : ** This routine does not make a distinction between key and data.
2965 : ** It just reads bytes from the payload area. Data might appear
2966 : ** on the main page or be scattered out on multiple overflow pages.
2967 : */
2968 : static int getPayload(
2969 : BtCursor *pCur, /* Cursor pointing to entry to read from */
2970 : int offset, /* Begin reading this far into payload */
2971 : int amt, /* Read this many bytes */
2972 : unsigned char *pBuf, /* Write the bytes into this buffer */
2973 : int skipKey /* offset begins at data if this is true */
2974 0 : ){
2975 : unsigned char *aPayload;
2976 : Pgno nextPage;
2977 : int rc;
2978 : MemPage *pPage;
2979 : BtShared *pBt;
2980 : int ovflSize;
2981 : u32 nKey;
2982 :
2983 : assert( pCur!=0 && pCur->pPage!=0 );
2984 : assert( pCur->eState==CURSOR_VALID );
2985 0 : pBt = pCur->pBtree->pBt;
2986 0 : pPage = pCur->pPage;
2987 : assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
2988 0 : getCellInfo(pCur);
2989 0 : aPayload = pCur->info.pCell + pCur->info.nHeader;
2990 0 : if( pPage->intKey ){
2991 0 : nKey = 0;
2992 : }else{
2993 0 : nKey = pCur->info.nKey;
2994 : }
2995 : assert( offset>=0 );
2996 0 : if( skipKey ){
2997 0 : offset += nKey;
2998 : }
2999 0 : if( offset+amt > nKey+pCur->info.nData ){
3000 0 : return SQLITE_ERROR;
3001 : }
3002 0 : if( offset<pCur->info.nLocal ){
3003 0 : int a = amt;
3004 0 : if( a+offset>pCur->info.nLocal ){
3005 0 : a = pCur->info.nLocal - offset;
3006 : }
3007 0 : memcpy(pBuf, &aPayload[offset], a);
3008 0 : if( a==amt ){
3009 0 : return SQLITE_OK;
3010 : }
3011 0 : offset = 0;
3012 0 : pBuf += a;
3013 0 : amt -= a;
3014 : }else{
3015 0 : offset -= pCur->info.nLocal;
3016 : }
3017 0 : ovflSize = pBt->usableSize - 4;
3018 0 : if( amt>0 ){
3019 0 : nextPage = get4byte(&aPayload[pCur->info.nLocal]);
3020 0 : while( amt>0 && nextPage ){
3021 : DbPage *pDbPage;
3022 0 : rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage);
3023 0 : if( rc!=0 ){
3024 0 : return rc;
3025 : }
3026 0 : aPayload = sqlite3PagerGetData(pDbPage);
3027 0 : nextPage = get4byte(aPayload);
3028 0 : if( offset<ovflSize ){
3029 0 : int a = amt;
3030 0 : if( a + offset > ovflSize ){
3031 0 : a = ovflSize - offset;
3032 : }
3033 0 : memcpy(pBuf, &aPayload[offset+4], a);
3034 0 : offset = 0;
3035 0 : amt -= a;
3036 0 : pBuf += a;
3037 : }else{
3038 0 : offset -= ovflSize;
3039 : }
3040 0 : sqlite3PagerUnref(pDbPage);
3041 : }
3042 : }
3043 :
3044 0 : if( amt>0 ){
3045 0 : return SQLITE_CORRUPT_BKPT;
3046 : }
3047 0 : return SQLITE_OK;
3048 : }
3049 :
3050 : /*
3051 : ** Read part of the key associated with cursor pCur. Exactly
3052 : ** "amt" bytes will be transfered into pBuf[]. The transfer
3053 : ** begins at "offset".
3054 : **
3055 : ** Return SQLITE_OK on success or an error code if anything goes
3056 : ** wrong. An error is returned if "offset+amt" is larger than
3057 : ** the available payload.
3058 : */
3059 0 : int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
3060 0 : int rc = restoreOrClearCursorPosition(pCur);
3061 0 : if( rc==SQLITE_OK ){
3062 : assert( pCur->eState==CURSOR_VALID );
3063 : assert( pCur->pPage!=0 );
3064 0 : if( pCur->pPage->intKey ){
3065 0 : return SQLITE_CORRUPT_BKPT;
3066 : }
3067 : assert( pCur->pPage->intKey==0 );
3068 : assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
3069 0 : rc = getPayload(pCur, offset, amt, (unsigned char*)pBuf, 0);
3070 : }
3071 0 : return rc;
3072 : }
3073 :
3074 : /*
3075 : ** Read part of the data associated with cursor pCur. Exactly
3076 : ** "amt" bytes will be transfered into pBuf[]. The transfer
3077 : ** begins at "offset".
3078 : **
3079 : ** Return SQLITE_OK on success or an error code if anything goes
3080 : ** wrong. An error is returned if "offset+amt" is larger than
3081 : ** the available payload.
3082 : */
3083 0 : int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
3084 0 : int rc = restoreOrClearCursorPosition(pCur);
3085 0 : if( rc==SQLITE_OK ){
3086 : assert( pCur->eState==CURSOR_VALID );
3087 : assert( pCur->pPage!=0 );
3088 : assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
3089 0 : rc = getPayload(pCur, offset, amt, pBuf, 1);
3090 : }
3091 0 : return rc;
3092 : }
3093 :
3094 : /*
3095 : ** Return a pointer to payload information from the entry that the
3096 : ** pCur cursor is pointing to. The pointer is to the beginning of
3097 : ** the key if skipKey==0 and it points to the beginning of data if
3098 : ** skipKey==1. The number of bytes of available key/data is written
3099 : ** into *pAmt. If *pAmt==0, then the value returned will not be
3100 : ** a valid pointer.
3101 : **
3102 : ** This routine is an optimization. It is common for the entire key
3103 : ** and data to fit on the local page and for there to be no overflow
3104 : ** pages. When that is so, this routine can be used to access the
3105 : ** key and data without making a copy. If the key and/or data spills
3106 : ** onto overflow pages, then getPayload() must be used to reassembly
3107 : ** the key/data and copy it into a preallocated buffer.
3108 : **
3109 : ** The pointer returned by this routine looks directly into the cached
3110 : ** page of the database. The data might change or move the next time
3111 : ** any btree routine is called.
3112 : */
3113 : static const unsigned char *fetchPayload(
3114 : BtCursor *pCur, /* Cursor pointing to entry to read from */
3115 : int *pAmt, /* Write the number of available bytes here */
3116 : int skipKey /* read beginning at data if this is true */
3117 918 : ){
3118 : unsigned char *aPayload;
3119 : MemPage *pPage;
3120 : u32 nKey;
3121 : int nLocal;
3122 :
3123 : assert( pCur!=0 && pCur->pPage!=0 );
3124 : assert( pCur->eState==CURSOR_VALID );
3125 918 : pPage = pCur->pPage;
3126 : assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
3127 918 : getCellInfo(pCur);
3128 918 : aPayload = pCur->info.pCell;
3129 918 : aPayload += pCur->info.nHeader;
3130 918 : if( pPage->intKey ){
3131 420 : nKey = 0;
3132 : }else{
3133 498 : nKey = pCur->info.nKey;
3134 : }
3135 918 : if( skipKey ){
3136 420 : aPayload += nKey;
3137 420 : nLocal = pCur->info.nLocal - nKey;
3138 : }else{
3139 498 : nLocal = pCur->info.nLocal;
3140 498 : if( nLocal>nKey ){
3141 0 : nLocal = nKey;
3142 : }
3143 : }
3144 918 : *pAmt = nLocal;
3145 918 : return aPayload;
3146 : }
3147 :
3148 :
3149 : /*
3150 : ** For the entry that cursor pCur is point to, return as
3151 : ** many bytes of the key or data as are available on the local
3152 : ** b-tree page. Write the number of available bytes into *pAmt.
3153 : **
3154 : ** The pointer returned is ephemeral. The key/data may move
3155 : ** or be destroyed on the next call to any Btree routine.
3156 : **
3157 : ** These routines is used to get quick access to key and data
3158 : ** in the common case where no overflow pages are used.
3159 : */
3160 131 : const void *sqlite3BtreeKeyFetch(BtCursor *pCur, int *pAmt){
3161 131 : if( pCur->eState==CURSOR_VALID ){
3162 131 : return (const void*)fetchPayload(pCur, pAmt, 0);
3163 : }
3164 0 : return 0;
3165 : }
3166 420 : const void *sqlite3BtreeDataFetch(BtCursor *pCur, int *pAmt){
3167 420 : if( pCur->eState==CURSOR_VALID ){
3168 420 : return (const void*)fetchPayload(pCur, pAmt, 1);
3169 : }
3170 0 : return 0;
3171 : }
3172 :
3173 :
3174 : /*
3175 : ** Move the cursor down to a new child page. The newPgno argument is the
3176 : ** page number of the child page to move to.
3177 : */
3178 0 : static int moveToChild(BtCursor *pCur, u32 newPgno){
3179 : int rc;
3180 : MemPage *pNewPage;
3181 : MemPage *pOldPage;
3182 0 : BtShared *pBt = pCur->pBtree->pBt;
3183 :
3184 : assert( pCur->eState==CURSOR_VALID );
3185 0 : rc = getAndInitPage(pBt, newPgno, &pNewPage, pCur->pPage);
3186 0 : if( rc ) return rc;
3187 0 : pNewPage->idxParent = pCur->idx;
3188 0 : pOldPage = pCur->pPage;
3189 0 : pOldPage->idxShift = 0;
3190 0 : releasePage(pOldPage);
3191 0 : pCur->pPage = pNewPage;
3192 0 : pCur->idx = 0;
3193 0 : pCur->info.nSize = 0;
3194 0 : if( pNewPage->nCell<1 ){
3195 0 : return SQLITE_CORRUPT_BKPT;
3196 : }
3197 0 : return SQLITE_OK;
3198 : }
3199 :
3200 : /*
3201 : ** Return true if the page is the virtual root of its table.
3202 : **
3203 : ** The virtual root page is the root page for most tables. But
3204 : ** for the table rooted on page 1, sometime the real root page
3205 : ** is empty except for the right-pointer. In such cases the
3206 : ** virtual root page is the page that the right-pointer of page
3207 : ** 1 is pointing to.
3208 : */
3209 229 : static int isRootPage(MemPage *pPage){
3210 229 : MemPage *pParent = pPage->pParent;
3211 229 : if( pParent==0 ) return 1;
3212 0 : if( pParent->pgno>1 ) return 0;
3213 0 : if( get2byte(&pParent->aData[pParent->hdrOffset+3])==0 ) return 1;
3214 0 : return 0;
3215 : }
3216 :
3217 : /*
3218 : ** Move the cursor up to the parent page.
3219 : **
3220 : ** pCur->idx is set to the cell index that contains the pointer
3221 : ** to the page we are coming from. If we are coming from the
3222 : ** right-most child page then pCur->idx is set to one more than
3223 : ** the largest cell index.
3224 : */
3225 0 : static void moveToParent(BtCursor *pCur){
3226 : MemPage *pParent;
3227 : MemPage *pPage;
3228 : int idxParent;
3229 :
3230 : assert( pCur->eState==CURSOR_VALID );
3231 0 : pPage = pCur->pPage;
3232 : assert( pPage!=0 );
3233 : assert( !isRootPage(pPage) );
3234 0 : pParent = pPage->pParent;
3235 : assert( pParent!=0 );
3236 0 : idxParent = pPage->idxParent;
3237 0 : sqlite3PagerRef(pParent->pDbPage);
3238 0 : releasePage(pPage);
3239 0 : pCur->pPage = pParent;
3240 0 : pCur->info.nSize = 0;
3241 : assert( pParent->idxShift==0 );
3242 0 : pCur->idx = idxParent;
3243 0 : }
3244 :
3245 : /*
3246 : ** Move the cursor to the root page
3247 : */
3248 1624 : static int moveToRoot(BtCursor *pCur){
3249 : MemPage *pRoot;
3250 1624 : int rc = SQLITE_OK;
3251 1624 : BtShared *pBt = pCur->pBtree->pBt;
3252 :
3253 1624 : if( pCur->eState==CURSOR_REQUIRESEEK ){
3254 0 : clearCursorPosition(pCur);
3255 : }
3256 1624 : pRoot = pCur->pPage;
3257 1624 : if( pRoot && pRoot->pgno==pCur->pgnoRoot ){
3258 : assert( pRoot->isInit );
3259 : }else{
3260 0 : if(
3261 0 : SQLITE_OK!=(rc = getAndInitPage(pBt, pCur->pgnoRoot, &pRoot, 0))
3262 : ){
3263 0 : pCur->eState = CURSOR_INVALID;
3264 0 : return rc;
3265 : }
3266 0 : releasePage(pCur->pPage);
3267 0 : pCur->pPage = pRoot;
3268 : }
3269 1624 : pCur->idx = 0;
3270 1624 : pCur->info.nSize = 0;
3271 1624 : if( pRoot->nCell==0 && !pRoot->leaf ){
3272 : Pgno subpage;
3273 : assert( pRoot->pgno==1 );
3274 0 : subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
3275 : assert( subpage>0 );
3276 0 : pCur->eState = CURSOR_VALID;
3277 0 : rc = moveToChild(pCur, subpage);
3278 : }
3279 1624 : pCur->eState = ((pCur->pPage->nCell>0)?CURSOR_VALID:CURSOR_INVALID);
3280 1624 : return rc;
3281 : }
3282 :
3283 : /*
3284 : ** Move the cursor down to the left-most leaf entry beneath the
3285 : ** entry to which it is currently pointing.
3286 : **
3287 : ** The left-most leaf is the one with the smallest key - the first
3288 : ** in ascending order.
3289 : */
3290 171 : static int moveToLeftmost(BtCursor *pCur){
3291 : Pgno pgno;
3292 : int rc;
3293 : MemPage *pPage;
3294 :
3295 : assert( pCur->eState==CURSOR_VALID );
3296 342 : while( !(pPage = pCur->pPage)->leaf ){
3297 : assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
3298 0 : pgno = get4byte(findCell(pPage, pCur->idx));
3299 0 : rc = moveToChild(pCur, pgno);
3300 0 : if( rc ) return rc;
3301 : }
3302 171 : return SQLITE_OK;
3303 : }
3304 :
3305 : /*
3306 : ** Move the cursor down to the right-most leaf entry beneath the
3307 : ** page to which it is currently pointing. Notice the difference
3308 : ** between moveToLeftmost() and moveToRightmost(). moveToLeftmost()
3309 : ** finds the left-most entry beneath the *entry* whereas moveToRightmost()
3310 : ** finds the right-most entry beneath the *page*.
3311 : **
3312 : ** The right-most entry is the one with the largest key - the last
3313 : ** key in ascending order.
3314 : */
3315 132 : static int moveToRightmost(BtCursor *pCur){
3316 : Pgno pgno;
3317 : int rc;
3318 : MemPage *pPage;
3319 :
3320 : assert( pCur->eState==CURSOR_VALID );
3321 264 : while( !(pPage = pCur->pPage)->leaf ){
3322 0 : pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
3323 0 : pCur->idx = pPage->nCell;
3324 0 : rc = moveToChild(pCur, pgno);
3325 0 : if( rc ) return rc;
3326 : }
3327 132 : pCur->idx = pPage->nCell - 1;
3328 132 : pCur->info.nSize = 0;
3329 132 : return SQLITE_OK;
3330 : }
3331 :
3332 : /* Move the cursor to the first entry in the table. Return SQLITE_OK
3333 : ** on success. Set *pRes to 0 if the cursor actually points to something
3334 : ** or set *pRes to 1 if the table is empty.
3335 : */
3336 183 : int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
3337 : int rc;
3338 183 : rc = moveToRoot(pCur);
3339 183 : if( rc ) return rc;
3340 183 : if( pCur->eState==CURSOR_INVALID ){
3341 : assert( pCur->pPage->nCell==0 );
3342 12 : *pRes = 1;
3343 12 : return SQLITE_OK;
3344 : }
3345 : assert( pCur->pPage->nCell>0 );
3346 171 : *pRes = 0;
3347 171 : rc = moveToLeftmost(pCur);
3348 171 : return rc;
3349 : }
3350 :
3351 : /* Move the cursor to the last entry in the table. Return SQLITE_OK
3352 : ** on success. Set *pRes to 0 if the cursor actually points to something
3353 : ** or set *pRes to 1 if the table is empty.
3354 : */
3355 240 : int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
3356 : int rc;
3357 240 : rc = moveToRoot(pCur);
3358 240 : if( rc ) return rc;
3359 240 : if( CURSOR_INVALID==pCur->eState ){
3360 : assert( pCur->pPage->nCell==0 );
3361 108 : *pRes = 1;
3362 108 : return SQLITE_OK;
3363 : }
3364 : assert( pCur->eState==CURSOR_VALID );
3365 132 : *pRes = 0;
3366 132 : rc = moveToRightmost(pCur);
3367 132 : return rc;
3368 : }
3369 :
3370 : /* Move the cursor so that it points to an entry near pKey/nKey.
3371 : ** Return a success code.
3372 : **
3373 : ** For INTKEY tables, only the nKey parameter is used. pKey is
3374 : ** ignored. For other tables, nKey is the number of bytes of data
3375 : ** in pKey. The comparison function specified when the cursor was
3376 : ** created is used to compare keys.
3377 : **
3378 : ** If an exact match is not found, then the cursor is always
3379 : ** left pointing at a leaf page which would hold the entry if it
3380 : ** were present. The cursor might point to an entry that comes
3381 : ** before or after the key.
3382 : **
3383 : ** The result of comparing the key with the entry to which the
3384 : ** cursor is written to *pRes if pRes!=NULL. The meaning of
3385 : ** this value is as follows:
3386 : **
3387 : ** *pRes<0 The cursor is left pointing at an entry that
3388 : ** is smaller than pKey or if the table is empty
3389 : ** and the cursor is therefore left point to nothing.
3390 : **
3391 : ** *pRes==0 The cursor is left pointing at an entry that
3392 : ** exactly matches pKey.
3393 : **
3394 : ** *pRes>0 The cursor is left pointing at an entry that
3395 : ** is larger than pKey.
3396 : */
3397 : int sqlite3BtreeMoveto(
3398 : BtCursor *pCur, /* The cursor to be moved */
3399 : const void *pKey, /* The key content for indices. Not used by tables */
3400 : i64 nKey, /* Size of pKey. Or the key for tables */
3401 : int biasRight, /* If true, bias the search to the high end */
3402 : int *pRes /* Search result flag */
3403 774 : ){
3404 : int rc;
3405 774 : rc = moveToRoot(pCur);
3406 774 : if( rc ) return rc;
3407 : assert( pCur->pPage );
3408 : assert( pCur->pPage->isInit );
3409 774 : if( pCur->eState==CURSOR_INVALID ){
3410 192 : *pRes = -1;
3411 : assert( pCur->pPage->nCell==0 );
3412 192 : return SQLITE_OK;
3413 : }
3414 : for(;;){
3415 : int lwr, upr;
3416 : Pgno chldPg;
3417 582 : MemPage *pPage = pCur->pPage;
3418 582 : int c = -1; /* pRes return if table is empty must be -1 */
3419 582 : lwr = 0;
3420 582 : upr = pPage->nCell-1;
3421 582 : if( !pPage->intKey && pKey==0 ){
3422 0 : return SQLITE_CORRUPT_BKPT;
3423 : }
3424 582 : if( biasRight ){
3425 132 : pCur->idx = upr;
3426 : }else{
3427 450 : pCur->idx = (upr+lwr)/2;
3428 : }
3429 582 : if( lwr<=upr ) for(;;){
3430 : void *pCellKey;
3431 : i64 nCellKey;
3432 805 : pCur->info.nSize = 0;
3433 805 : if( pPage->intKey ){
3434 : u8 *pCell;
3435 438 : pCell = findCell(pPage, pCur->idx) + pPage->childPtrSize;
3436 438 : if( pPage->hasData ){
3437 : u32 dummy;
3438 438 : pCell += getVarint32(pCell, &dummy);
3439 : }
3440 438 : getVarint(pCell, (u64 *)&nCellKey);
3441 438 : if( nCellKey<nKey ){
3442 165 : c = -1;
3443 273 : }else if( nCellKey>nKey ){
3444 30 : c = +1;
3445 : }else{
3446 243 : c = 0;
3447 : }
3448 : }else{
3449 : int available;
3450 367 : pCellKey = (void *)fetchPayload(pCur, &available, 0);
3451 367 : nCellKey = pCur->info.nKey;
3452 367 : if( available>=nCellKey ){
3453 367 : c = pCur->xCompare(pCur->pArg, nCellKey, pCellKey, nKey, pKey);
3454 : }else{
3455 0 : pCellKey = sqliteMallocRaw( nCellKey );
3456 0 : if( pCellKey==0 ) return SQLITE_NOMEM;
3457 0 : rc = sqlite3BtreeKey(pCur, 0, nCellKey, (void *)pCellKey);
3458 0 : c = pCur->xCompare(pCur->pArg, nCellKey, pCellKey, nKey, pKey);
3459 0 : sqliteFree(pCellKey);
3460 0 : if( rc ) return rc;
3461 : }
3462 : }
3463 805 : if( c==0 ){
3464 243 : if( pPage->leafData && !pPage->leaf ){
3465 0 : lwr = pCur->idx;
3466 0 : upr = lwr - 1;
3467 0 : break;
3468 : }else{
3469 243 : if( pRes ) *pRes = 0;
3470 243 : return SQLITE_OK;
3471 : }
3472 : }
3473 562 : if( c<0 ){
3474 456 : lwr = pCur->idx+1;
3475 : }else{
3476 106 : upr = pCur->idx-1;
3477 : }
3478 562 : if( lwr>upr ){
3479 339 : break;
3480 : }
3481 223 : pCur->idx = (lwr+upr)/2;
3482 223 : }
3483 : assert( lwr==upr+1 );
3484 : assert( pPage->isInit );
3485 339 : if( pPage->leaf ){
3486 339 : chldPg = 0;
3487 0 : }else if( lwr>=pPage->nCell ){
3488 0 : chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
3489 : }else{
3490 0 : chldPg = get4byte(findCell(pPage, lwr));
3491 : }
3492 339 : if( chldPg==0 ){
3493 : assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
3494 339 : if( pRes ) *pRes = c;
3495 339 : return SQLITE_OK;
3496 : }
3497 0 : pCur->idx = lwr;
3498 0 : pCur->info.nSize = 0;
3499 0 : rc = moveToChild(pCur, chldPg);
3500 0 : if( rc ){
3501 0 : return rc;
3502 : }
3503 0 : }
3504 : /* NOT REACHED */
3505 : }
3506 :
3507 : /*
3508 : ** Return TRUE if the cursor is not pointing at an entry of the table.
3509 : **
3510 : ** TRUE will be returned after a call to sqlite3BtreeNext() moves
3511 : ** past the last entry in the table or sqlite3BtreePrev() moves past
3512 : ** the first entry. TRUE is also returned if the table is empty.
3513 : */
3514 0 : int sqlite3BtreeEof(BtCursor *pCur){
3515 : /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries
3516 : ** have been deleted? This API will need to change to return an error code
3517 : ** as well as the boolean result value.
3518 : */
3519 0 : return (CURSOR_VALID!=pCur->eState);
3520 : }
3521 :
3522 : /*
3523 : ** Advance the cursor to the next entry in the database. If
3524 : ** successful then set *pRes=0. If the cursor
3525 : ** was already pointing to the last entry in the database before
3526 : ** this routine was called, then set *pRes=1.
3527 : */
3528 526 : int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
3529 : int rc;
3530 : MemPage *pPage;
3531 :
3532 526 : rc = restoreOrClearCursorPosition(pCur);
3533 526 : if( rc!=SQLITE_OK ){
3534 0 : return rc;
3535 : }
3536 : assert( pRes!=0 );
3537 526 : pPage = pCur->pPage;
3538 526 : if( CURSOR_INVALID==pCur->eState ){
3539 39 : *pRes = 1;
3540 39 : return SQLITE_OK;
3541 : }
3542 487 : if( pCur->skip>0 ){
3543 0 : pCur->skip = 0;
3544 0 : *pRes = 0;
3545 0 : return SQLITE_OK;
3546 : }
3547 487 : pCur->skip = 0;
3548 :
3549 : assert( pPage->isInit );
3550 : assert( pCur->idx<pPage->nCell );
3551 :
3552 487 : pCur->idx++;
3553 487 : pCur->info.nSize = 0;
3554 487 : if( pCur->idx>=pPage->nCell ){
3555 229 : if( !pPage->leaf ){
3556 0 : rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
3557 0 : if( rc ) return rc;
3558 0 : rc = moveToLeftmost(pCur);
3559 0 : *pRes = 0;
3560 0 : return rc;
3561 : }
3562 : do{
3563 229 : if( isRootPage(pPage) ){
3564 229 : *pRes = 1;
3565 229 : pCur->eState = CURSOR_INVALID;
3566 229 : return SQLITE_OK;
3567 : }
3568 0 : moveToParent(pCur);
3569 0 : pPage = pCur->pPage;
3570 0 : }while( pCur->idx>=pPage->nCell );
3571 0 : *pRes = 0;
3572 0 : if( pPage->leafData ){
3573 0 : rc = sqlite3BtreeNext(pCur, pRes);
3574 : }else{
3575 0 : rc = SQLITE_OK;
3576 : }
3577 0 : return rc;
3578 : }
3579 258 : *pRes = 0;
3580 258 : if( pPage->leaf ){
3581 258 : return SQLITE_OK;
3582 : }
3583 0 : rc = moveToLeftmost(pCur);
3584 0 : return rc;
3585 : }
3586 :
3587 : /*
3588 : ** Step the cursor to the back to the previous entry in the database. If
3589 : ** successful then set *pRes=0. If the cursor
3590 : ** was already pointing to the first entry in the database before
3591 : ** this routine was called, then set *pRes=1.
3592 : */
3593 0 : int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
3594 : int rc;
3595 : Pgno pgno;
3596 : MemPage *pPage;
3597 :
3598 0 : rc = restoreOrClearCursorPosition(pCur);
3599 0 : if( rc!=SQLITE_OK ){
3600 0 : return rc;
3601 : }
3602 0 : if( CURSOR_INVALID==pCur->eState ){
3603 0 : *pRes = 1;
3604 0 : return SQLITE_OK;
3605 : }
3606 0 : if( pCur->skip<0 ){
3607 0 : pCur->skip = 0;
3608 0 : *pRes = 0;
3609 0 : return SQLITE_OK;
3610 : }
3611 0 : pCur->skip = 0;
3612 :
3613 0 : pPage = pCur->pPage;
3614 : assert( pPage->isInit );
3615 : assert( pCur->idx>=0 );
3616 0 : if( !pPage->leaf ){
3617 0 : pgno = get4byte( findCell(pPage, pCur->idx) );
3618 0 : rc = moveToChild(pCur, pgno);
3619 0 : if( rc ) return rc;
3620 0 : rc = moveToRightmost(pCur);
3621 : }else{
3622 0 : while( pCur->idx==0 ){
3623 0 : if( isRootPage(pPage) ){
3624 0 : pCur->eState = CURSOR_INVALID;
3625 0 : *pRes = 1;
3626 0 : return SQLITE_OK;
3627 : }
3628 0 : moveToParent(pCur);
3629 0 : pPage = pCur->pPage;
3630 : }
3631 0 : pCur->idx--;
3632 0 : pCur->info.nSize = 0;
3633 0 : if( pPage->leafData && !pPage->leaf ){
3634 0 : rc = sqlite3BtreePrevious(pCur, pRes);
3635 : }else{
3636 0 : rc = SQLITE_OK;
3637 : }
3638 : }
3639 0 : *pRes = 0;
3640 0 : return rc;
3641 : }
3642 :
3643 : /*
3644 : ** Allocate a new page from the database file.
3645 : **
3646 : ** The new page is marked as dirty. (In other words, sqlite3PagerWrite()
3647 : ** has already been called on the new page.) The new page has also
3648 : ** been referenced and the calling routine is responsible for calling
3649 : ** sqlite3PagerUnref() on the new page when it is done.
3650 : **
3651 : ** SQLITE_OK is returned on success. Any other return value indicates
3652 : ** an error. *ppPage and *pPgno are undefined in the event of an error.
3653 : ** Do not invoke sqlite3PagerUnref() on *ppPage if an error is returned.
3654 : **
3655 : ** If the "nearby" parameter is not 0, then a (feeble) effort is made to
3656 : ** locate a page close to the page number "nearby". This can be used in an
3657 : ** attempt to keep related pages close to each other in the database file,
3658 : ** which in turn can make database access faster.
3659 : **
3660 : ** If the "exact" parameter is not 0, and the page-number nearby exists
3661 : ** anywhere on the free-list, then it is guarenteed to be returned. This
3662 : ** is only used by auto-vacuum databases when allocating a new table.
3663 : */
3664 : static int allocateBtreePage(
3665 : BtShared *pBt,
3666 : MemPage **ppPage,
3667 : Pgno *pPgno,
3668 : Pgno nearby,
3669 : u8 exact
3670 96 : ){
3671 : MemPage *pPage1;
3672 : int rc;
3673 : int n; /* Number of pages on the freelist */
3674 : int k; /* Number of leaves on the trunk of the freelist */
3675 96 : MemPage *pTrunk = 0;
3676 96 : MemPage *pPrevTrunk = 0;
3677 :
3678 96 : pPage1 = pBt->pPage1;
3679 96 : n = get4byte(&pPage1->aData[36]);
3680 96 : if( n>0 ){
3681 : /* There are pages on the freelist. Reuse one of those pages. */
3682 : Pgno iTrunk;
3683 3 : u8 searchList = 0; /* If the free-list must be searched for 'nearby' */
3684 :
3685 : /* If the 'exact' parameter was true and a query of the pointer-map
3686 : ** shows that the page 'nearby' is somewhere on the free-list, then
3687 : ** the entire-list will be searched for that page.
3688 : */
3689 : #ifndef SQLITE_OMIT_AUTOVACUUM
3690 3 : if( exact ){
3691 : u8 eType;
3692 : assert( nearby>0 );
3693 : assert( pBt->autoVacuum );
3694 0 : rc = ptrmapGet(pBt, nearby, &eType, 0);
3695 0 : if( rc ) return rc;
3696 0 : if( eType==PTRMAP_FREEPAGE ){
3697 0 : searchList = 1;
3698 : }
3699 0 : *pPgno = nearby;
3700 : }
3701 : #endif
3702 :
3703 : /* Decrement the free-list count by 1. Set iTrunk to the index of the
3704 : ** first free-list trunk page. iPrevTrunk is initially 1.
3705 : */
3706 3 : rc = sqlite3PagerWrite(pPage1->pDbPage);
3707 3 : if( rc ) return rc;
3708 3 : put4byte(&pPage1->aData[36], n-1);
3709 :
3710 : /* The code within this loop is run only once if the 'searchList' variable
3711 : ** is not true. Otherwise, it runs once for each trunk-page on the
3712 : ** free-list until the page 'nearby' is located.
3713 : */
3714 : do {
3715 3 : pPrevTrunk = pTrunk;
3716 3 : if( pPrevTrunk ){
3717 0 : iTrunk = get4byte(&pPrevTrunk->aData[0]);
3718 : }else{
3719 3 : iTrunk = get4byte(&pPage1->aData[32]);
3720 : }
3721 3 : rc = getPage(pBt, iTrunk, &pTrunk, 0);
3722 3 : if( rc ){
3723 0 : pTrunk = 0;
3724 0 : goto end_allocate_page;
3725 : }
3726 :
3727 3 : k = get4byte(&pTrunk->aData[4]);
3728 6 : if( k==0 && !searchList ){
3729 : /* The trunk has no leaves and the list is not being searched.
3730 : ** So extract the trunk page itself and use it as the newly
3731 : ** allocated page */
3732 : assert( pPrevTrunk==0 );
3733 3 : rc = sqlite3PagerWrite(pTrunk->pDbPage);
3734 3 : if( rc ){
3735 0 : goto end_allocate_page;
3736 : }
3737 3 : *pPgno = iTrunk;
3738 3 : memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
3739 3 : *ppPage = pTrunk;
3740 3 : pTrunk = 0;
3741 : TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
3742 0 : }else if( k>pBt->usableSize/4 - 8 ){
3743 : /* Value of k is out of range. Database corruption */
3744 0 : rc = SQLITE_CORRUPT_BKPT;
3745 0 : goto end_allocate_page;
3746 : #ifndef SQLITE_OMIT_AUTOVACUUM
3747 0 : }else if( searchList && nearby==iTrunk ){
3748 : /* The list is being searched and this trunk page is the page
3749 : ** to allocate, regardless of whether it has leaves.
3750 : */
3751 : assert( *pPgno==iTrunk );
3752 0 : *ppPage = pTrunk;
3753 0 : searchList = 0;
3754 0 : rc = sqlite3PagerWrite(pTrunk->pDbPage);
3755 0 : if( rc ){
3756 0 : goto end_allocate_page;
3757 : }
3758 0 : if( k==0 ){
3759 0 : if( !pPrevTrunk ){
3760 0 : memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
3761 : }else{
3762 0 : memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);
3763 : }
3764 : }else{
3765 : /* The trunk page is required by the caller but it contains
3766 : ** pointers to free-list leaves. The first leaf becomes a trunk
3767 : ** page in this case.
3768 : */
3769 : MemPage *pNewTrunk;
3770 0 : Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);
3771 0 : rc = getPage(pBt, iNewTrunk, &pNewTrunk, 0);
3772 0 : if( rc!=SQLITE_OK ){
3773 0 : goto end_allocate_page;
3774 : }
3775 0 : rc = sqlite3PagerWrite(pNewTrunk->pDbPage);
3776 0 : if( rc!=SQLITE_OK ){
3777 0 : releasePage(pNewTrunk);
3778 0 : goto end_allocate_page;
3779 : }
3780 0 : memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);
3781 0 : put4byte(&pNewTrunk->aData[4], k-1);
3782 0 : memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);
3783 0 : releasePage(pNewTrunk);
3784 0 : if( !pPrevTrunk ){
3785 0 : put4byte(&pPage1->aData[32], iNewTrunk);
3786 : }else{
3787 0 : rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
3788 0 : if( rc ){
3789 0 : goto end_allocate_page;
3790 : }
3791 0 : put4byte(&pPrevTrunk->aData[0], iNewTrunk);
3792 : }
3793 : }
3794 0 : pTrunk = 0;
3795 : TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
3796 : #endif
3797 : }else{
3798 : /* Extract a leaf from the trunk */
3799 : int closest;
3800 : Pgno iPage;
3801 0 : unsigned char *aData = pTrunk->aData;
3802 0 : rc = sqlite3PagerWrite(pTrunk->pDbPage);
3803 0 : if( rc ){
3804 0 : goto end_allocate_page;
3805 : }
3806 0 : if( nearby>0 ){
3807 : int i, dist;
3808 0 : closest = 0;
3809 0 : dist = get4byte(&aData[8]) - nearby;
3810 0 : if( dist<0 ) dist = -dist;
3811 0 : for(i=1; i<k; i++){
3812 0 : int d2 = get4byte(&aData[8+i*4]) - nearby;
3813 0 : if( d2<0 ) d2 = -d2;
3814 0 : if( d2<dist ){
3815 0 : closest = i;
3816 0 : dist = d2;
3817 : }
3818 : }
3819 : }else{
3820 0 : closest = 0;
3821 : }
3822 :
3823 0 : iPage = get4byte(&aData[8+closest*4]);
3824 0 : if( !searchList || iPage==nearby ){
3825 0 : *pPgno = iPage;
3826 0 : if( *pPgno>sqlite3PagerPagecount(pBt->pPager) ){
3827 : /* Free page off the end of the file */
3828 0 : return SQLITE_CORRUPT_BKPT;
3829 : }
3830 : TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"
3831 : ": %d more free pages\n",
3832 : *pPgno, closest+1, k, pTrunk->pgno, n-1));
3833 0 : if( closest<k-1 ){
3834 0 : memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
3835 : }
3836 0 : put4byte(&aData[4], k-1);
3837 0 : rc = getPage(pBt, *pPgno, ppPage, 1);
3838 0 : if( rc==SQLITE_OK ){
3839 0 : sqlite3PagerDontRollback((*ppPage)->pDbPage);
3840 0 : rc = sqlite3PagerWrite((*ppPage)->pDbPage);
3841 0 : if( rc!=SQLITE_OK ){
3842 0 : releasePage(*ppPage);
3843 : }
3844 : }
3845 0 : searchList = 0;
3846 : }
3847 : }
3848 3 : releasePage(pPrevTrunk);
3849 3 : pPrevTrunk = 0;
3850 3 : }while( searchList );
3851 : }else{
3852 : /* There are no pages on the freelist, so create a new page at the
3853 : ** end of the file */
3854 93 : *pPgno = sqlite3PagerPagecount(pBt->pPager) + 1;
3855 :
3856 : #ifndef SQLITE_OMIT_AUTOVACUUM
3857 93 : if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, *pPgno) ){
3858 : /* If *pPgno refers to a pointer-map page, allocate two new pages
3859 : ** at the end of the file instead of one. The first allocated page
3860 : ** becomes a new pointer-map page, the second is used by the caller.
3861 : */
3862 : TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", *pPgno));
3863 : assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
3864 0 : (*pPgno)++;
3865 : }
3866 : #endif
3867 :
3868 : assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
3869 93 : rc = getPage(pBt, *pPgno, ppPage, 0);
3870 93 : if( rc ) return rc;
3871 93 : rc = sqlite3PagerWrite((*ppPage)->pDbPage);
3872 93 : if( rc!=SQLITE_OK ){
3873 0 : releasePage(*ppPage);
3874 : }
3875 : TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
3876 : }
3877 :
3878 : assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
3879 :
3880 96 : end_allocate_page:
3881 96 : releasePage(pTrunk);
3882 96 : releasePage(pPrevTrunk);
3883 96 : return rc;
3884 : }
3885 :
3886 : /*
3887 : ** Add a page of the database file to the freelist.
3888 : **
3889 : ** sqlite3PagerUnref() is NOT called for pPage.
3890 : */
3891 4 : static int freePage(MemPage *pPage){
3892 4 : BtShared *pBt = pPage->pBt;
3893 4 : MemPage *pPage1 = pBt->pPage1;
3894 : int rc, n, k;
3895 :
3896 : /* Prepare the page for freeing */
3897 : assert( pPage->pgno>1 );
3898 4 : pPage->isInit = 0;
3899 4 : releasePage(pPage->pParent);
3900 4 : pPage->pParent = 0;
3901 :
3902 : /* Increment the free page count on pPage1 */
3903 4 : rc = sqlite3PagerWrite(pPage1->pDbPage);
3904 4 : if( rc ) return rc;
3905 4 : n = get4byte(&pPage1->aData[36]);
3906 4 : put4byte(&pPage1->aData[36], n+1);
3907 :
3908 : #ifdef SQLITE_SECURE_DELETE
3909 : /* If the SQLITE_SECURE_DELETE compile-time option is enabled, then
3910 : ** always fully overwrite deleted information with zeros.
3911 : */
3912 : rc = sqlite3PagerWrite(pPage->pDbPage);
3913 : if( rc ) return rc;
3914 : memset(pPage->aData, 0, pPage->pBt->pageSize);
3915 : #endif
3916 :
3917 : #ifndef SQLITE_OMIT_AUTOVACUUM
3918 : /* If the database supports auto-vacuum, write an entry in the pointer-map
3919 : ** to indicate that the page is free.
3920 : */
3921 4 : if( pBt->autoVacuum ){
3922 0 : rc = ptrmapPut(pBt, pPage->pgno, PTRMAP_FREEPAGE, 0);
3923 0 : if( rc ) return rc;
3924 : }
3925 : #endif
3926 :
3927 4 : if( n==0 ){
3928 : /* This is the first free page */
3929 4 : rc = sqlite3PagerWrite(pPage->pDbPage);
3930 4 : if( rc ) return rc;
3931 4 : memset(pPage->aData, 0, 8);
3932 4 : put4byte(&pPage1->aData[32], pPage->pgno);
3933 : TRACE(("FREE-PAGE: %d first\n", pPage->pgno));
3934 : }else{
3935 : /* Other free pages already exist. Retrive the first trunk page
3936 : ** of the freelist and find out how many leaves it has. */
3937 : MemPage *pTrunk;
3938 0 : rc = getPage(pBt, get4byte(&pPage1->aData[32]), &pTrunk, 0);
3939 0 : if( rc ) return rc;
3940 0 : k = get4byte(&pTrunk->aData[4]);
3941 0 : if( k>=pBt->usableSize/4 - 8 ){
3942 : /* The trunk is full. Turn the page being freed into a new
3943 : ** trunk page with no leaves. */
3944 0 : rc = sqlite3PagerWrite(pPage->pDbPage);
3945 0 : if( rc ) return rc;
3946 0 : put4byte(pPage->aData, pTrunk->pgno);
3947 0 : put4byte(&pPage->aData[4], 0);
3948 0 : put4byte(&pPage1->aData[32], pPage->pgno);
3949 : TRACE(("FREE-PAGE: %d new trunk page replacing %d\n",
3950 : pPage->pgno, pTrunk->pgno));
3951 : }else{
3952 : /* Add the newly freed page as a leaf on the current trunk */
3953 0 : rc = sqlite3PagerWrite(pTrunk->pDbPage);
3954 0 : if( rc==SQLITE_OK ){
3955 0 : put4byte(&pTrunk->aData[4], k+1);
3956 0 : put4byte(&pTrunk->aData[8+k*4], pPage->pgno);
3957 : #ifndef SQLITE_SECURE_DELETE
3958 0 : sqlite3PagerDontWrite(pPage->pDbPage);
3959 : #endif
3960 : }
3961 : TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
3962 : }
3963 0 : releasePage(pTrunk);
3964 : }
3965 4 : return rc;
3966 : }
3967 :
3968 : /*
3969 : ** Free any overflow pages associated with the given Cell.
3970 : */
3971 82 : static int clearCell(MemPage *pPage, unsigned char *pCell){
3972 82 : BtShared *pBt = pPage->pBt;
3973 : CellInfo info;
3974 : Pgno ovflPgno;
3975 : int rc;
3976 : int nOvfl;
3977 : int ovflPageSize;
3978 :
3979 82 : parseCellPtr(pPage, pCell, &info);
3980 82 : if( info.iOverflow==0 ){
3981 82 : return SQLITE_OK; /* No overflow pages. Return without doing anything */
3982 : }
3983 0 : ovflPgno = get4byte(&pCell[info.iOverflow]);
3984 0 : ovflPageSize = pBt->usableSize - 4;
3985 0 : nOvfl = (info.nPayload - info.nLocal + ovflPageSize - 1)/ovflPageSize;
3986 : assert( ovflPgno==0 || nOvfl>0 );
3987 0 : while( nOvfl-- ){
3988 : MemPage *pOvfl;
3989 0 : if( ovflPgno==0 || ovflPgno>sqlite3PagerPagecount(pBt->pPager) ){
3990 0 : return SQLITE_CORRUPT_BKPT;
3991 : }
3992 0 : rc = getPage(pBt, ovflPgno, &pOvfl, nOvfl==0);
3993 0 : if( rc ) return rc;
3994 0 : if( nOvfl ){
3995 0 : ovflPgno = get4byte(pOvfl->aData);
3996 : }
3997 0 : rc = freePage(pOvfl);
3998 0 : sqlite3PagerUnref(pOvfl->pDbPage);
3999 0 : if( rc ) return rc;
4000 : }
4001 0 : return SQLITE_OK;
4002 : }
4003 :
4004 : /*
4005 : ** Create the byte sequence used to represent a cell on page pPage
4006 : ** and write that byte sequence into pCell[]. Overflow pages are
4007 : ** allocated and filled in as necessary. The calling procedure
4008 : ** is responsible for making sure sufficient space has been allocated
4009 : ** for pCell[].
4010 : **
4011 : ** Note that pCell does not necessary need to point to the pPage->aData
4012 : ** area. pCell might point to some temporary storage. The cell will
4013 : ** be constructed in this temporary area then copied into pPage->aData
4014 : ** later.
4015 : */
4016 : static int fillInCell(
4017 : MemPage *pPage, /* The page that contains the cell */
4018 : unsigned char *pCell, /* Complete text of the cell */
4019 : const void *pKey, i64 nKey, /* The key */
4020 : const void *pData,int nData, /* The data */
4021 : int *pnSize /* Write cell size here */
4022 422 : ){
4023 : int nPayload;
4024 : const u8 *pSrc;
4025 : int nSrc, n, rc;
4026 : int spaceLeft;
4027 422 : MemPage *pOvfl = 0;
4028 422 : MemPage *pToRelease = 0;
4029 : unsigned char *pPrior;
4030 : unsigned char *pPayload;
4031 422 : BtShared *pBt = pPage->pBt;
4032 422 : Pgno pgnoOvfl = 0;
4033 : int nHeader;
4034 : CellInfo info;
4035 :
4036 : /* Fill in the header. */
4037 422 : nHeader = 0;
4038 422 : if( !pPage->leaf ){
4039 0 : nHeader += 4;
4040 : }
4041 422 : if( pPage->hasData ){
4042 301 : nHeader += putVarint(&pCell[nHeader], nData);
4043 : }else{
4044 121 : nData = 0;
4045 : }
4046 422 : nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey);
4047 422 : parseCellPtr(pPage, pCell, &info);
4048 : assert( info.nHeader==nHeader );
4049 : assert( info.nKey==nKey );
4050 : assert( info.nData==nData );
4051 :
4052 : /* Fill in the payload */
4053 422 : nPayload = nData;
4054 422 : if( pPage->intKey ){
4055 301 : pSrc = pData;
4056 301 : nSrc = nData;
4057 301 : nData = 0;
4058 : }else{
4059 121 : nPayload += nKey;
4060 121 : pSrc = pKey;
4061 121 : nSrc = nKey;
4062 : }
4063 422 : *pnSize = info.nSize;
4064 422 : spaceLeft = info.nLocal;
4065 422 : pPayload = &pCell[nHeader];
4066 422 : pPrior = &pCell[info.iOverflow];
4067 :
4068 1209 : while( nPayload>0 ){
4069 365 : if( spaceLeft==0 ){
4070 : #ifndef SQLITE_OMIT_AUTOVACUUM
4071 0 : Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
4072 : #endif
4073 0 : rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0);
4074 : #ifndef SQLITE_OMIT_AUTOVACUUM
4075 : /* If the database supports auto-vacuum, and the second or subsequent
4076 : ** overflow page is being allocated, add an entry to the pointer-map
4077 : ** for that page now. The entry for the first overflow page will be
4078 : ** added later, by the insertCell() routine.
4079 : */
4080 0 : if( pBt->autoVacuum && pgnoPtrmap!=0 && rc==SQLITE_OK ){
4081 0 : rc = ptrmapPut(pBt, pgnoOvfl, PTRMAP_OVERFLOW2, pgnoPtrmap);
4082 : }
4083 : #endif
4084 0 : if( rc ){
4085 0 : releasePage(pToRelease);
4086 0 : return rc;
4087 : }
4088 0 : put4byte(pPrior, pgnoOvfl);
4089 0 : releasePage(pToRelease);
4090 0 : pToRelease = pOvfl;
4091 0 : pPrior = pOvfl->aData;
4092 0 : put4byte(pPrior, 0);
4093 0 : pPayload = &pOvfl->aData[4];
4094 0 : spaceLeft = pBt->usableSize - 4;
4095 : }
4096 365 : n = nPayload;
4097 365 : if( n>spaceLeft ) n = spaceLeft;
4098 365 : if( n>nSrc ) n = nSrc;
4099 : assert( pSrc );
4100 365 : memcpy(pPayload, pSrc, n);
4101 365 : nPayload -= n;
4102 365 : pPayload += n;
4103 365 : pSrc += n;
4104 365 : nSrc -= n;
4105 365 : spaceLeft -= n;
4106 365 : if( nSrc==0 ){
4107 365 : nSrc = nData;
4108 365 : pSrc = pData;
4109 : }
4110 : }
4111 422 : releasePage(pToRelease);
4112 422 : return SQLITE_OK;
4113 : }
4114 :
4115 : /*
4116 : ** Change the MemPage.pParent pointer on the page whose number is
4117 : ** given in the second argument so that MemPage.pParent holds the
4118 : ** pointer in the third argument.
4119 : */
4120 0 : static int reparentPage(BtShared *pBt, Pgno pgno, MemPage *pNewParent, int idx){
4121 : MemPage *pThis;
4122 : DbPage *pDbPage;
4123 :
4124 : assert( pNewParent!=0 );
4125 0 : if( pgno==0 ) return SQLITE_OK;
4126 : assert( pBt->pPager!=0 );
4127 0 : pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
4128 0 : if( pDbPage ){
4129 0 : pThis = (MemPage *)sqlite3PagerGetExtra(pDbPage);
4130 0 : if( pThis->isInit ){
4131 : assert( pThis->aData==(sqlite3PagerGetData(pDbPage)) );
4132 0 : if( pThis->pParent!=pNewParent ){
4133 0 : if( pThis->pParent ) sqlite3PagerUnref(pThis->pParent->pDbPage);
4134 0 : pThis->pParent = pNewParent;
4135 0 : sqlite3PagerRef(pNewParent->pDbPage);
4136 : }
4137 0 : pThis->idxParent = idx;
4138 : }
4139 0 : sqlite3PagerUnref(pDbPage);
4140 : }
4141 :
4142 : #ifndef SQLITE_OMIT_AUTOVACUUM
4143 0 : if( pBt->autoVacuum ){
4144 0 : return ptrmapPut(pBt, pgno, PTRMAP_BTREE, pNewParent->pgno);
4145 : }
4146 : #endif
4147 0 : return SQLITE_OK;
4148 : }
4149 :
4150 :
4151 :
4152 : /*
4153 : ** Change the pParent pointer of all children of pPage to point back
4154 : ** to pPage.
4155 : **
4156 : ** In other words, for every child of pPage, invoke reparentPage()
4157 : ** to make sure that each child knows that pPage is its parent.
4158 : **
4159 : ** This routine gets called after you memcpy() one page into
4160 : ** another.
4161 : */
4162 0 : static int reparentChildPages(MemPage *pPage){
4163 : int i;
4164 0 : BtShared *pBt = pPage->pBt;
4165 0 : int rc = SQLITE_OK;
4166 :
4167 0 : if( pPage->leaf ) return SQLITE_OK;
4168 :
4169 0 : for(i=0; i<pPage->nCell; i++){
4170 0 : u8 *pCell = findCell(pPage, i);
4171 0 : if( !pPage->leaf ){
4172 0 : rc = reparentPage(pBt, get4byte(pCell), pPage, i);
4173 0 : if( rc!=SQLITE_OK ) return rc;
4174 : }
4175 : }
4176 0 : if( !pPage->leaf ){
4177 0 : rc = reparentPage(pBt, get4byte(&pPage->aData[pPage->hdrOffset+8]),
4178 : pPage, i);
4179 0 : pPage->idxShift = 0;
4180 : }
4181 0 : return rc;
4182 : }
4183 :
4184 : /*
4185 : ** Remove the i-th cell from pPage. This routine effects pPage only.
4186 : ** The cell content is not freed or deallocated. It is assumed that
4187 : ** the cell content has been copied someplace else. This routine just
4188 : ** removes the reference to the cell from pPage.
4189 : **
4190 : ** "sz" must be the number of bytes in the cell.
4191 : */
4192 63 : static void dropCell(MemPage *pPage, int idx, int sz){
4193 : int i; /* Loop counter */
4194 : int pc; /* Offset to cell content of cell being deleted */
4195 : u8 *data; /* pPage->aData */
4196 : u8 *ptr; /* Used to move bytes around within data[] */
4197 :
4198 : assert( idx>=0 && idx<pPage->nCell );
4199 : assert( sz==cellSize(pPage, idx) );
4200 : assert( sqlite3PagerIswriteable(pPage->pDbPage) );
4201 63 : data = pPage->aData;
4202 63 : ptr = &data[pPage->cellOffset + 2*idx];
4203 63 : pc = get2byte(ptr);
4204 : assert( pc>10 && pc+sz<=pPage->pBt->usableSize );
4205 63 : freeSpace(pPage, pc, sz);
4206 102 : for(i=idx+1; i<pPage->nCell; i++, ptr+=2){
4207 39 : ptr[0] = ptr[2];
4208 39 : ptr[1] = ptr[3];
4209 : }
4210 63 : pPage->nCell--;
4211 63 : put2byte(&data[pPage->hdrOffset+3], pPage->nCell);
4212 63 : pPage->nFree += 2;
4213 63 : pPage->idxShift = 1;
4214 63 : }
4215 :
4216 : /*
4217 : ** Insert a new cell on pPage at cell index "i". pCell points to the
4218 : ** content of the cell.
4219 : **
4220 : ** If the cell content will fit on the page, then put it there. If it
4221 : ** will not fit, then make a copy of the cell content into pTemp if
4222 : ** pTemp is not null. Regardless of pTemp, allocate a new entry
4223 : ** in pPage->aOvfl[] and make it point to the cell content (either
4224 : ** in pTemp or the original pCell) and also record its index.
4225 : ** Allocating a new entry in pPage->aCell[] implies that
4226 : ** pPage->nOverflow is incremented.
4227 : **
4228 : ** If nSkip is non-zero, then do not copy the first nSkip bytes of the
4229 : ** cell. The caller will overwrite them after this function returns. If
4230 : ** nSkip is non-zero, then pCell may not point to an invalid memory location
4231 : ** (but pCell+nSkip is always valid).
4232 : */
4233 : static int insertCell(
4234 : MemPage *pPage, /* Page into which we are copying */
4235 : int i, /* New cell becomes the i-th cell of the page */
4236 : u8 *pCell, /* Content of the new cell */
4237 : int sz, /* Bytes of content in pCell */
4238 : u8 *pTemp, /* Temp storage space for pCell, if needed */
4239 : u8 nSkip /* Do not write the first nSkip bytes of the cell */
4240 422 : ){
4241 : int idx; /* Where to write new cell content in data[] */
4242 : int j; /* Loop counter */
4243 : int top; /* First byte of content for any cell in data[] */
4244 : int end; /* First byte past the last cell pointer in data[] */
4245 : int ins; /* Index in data[] where new cell pointer is inserted */
4246 : int hdr; /* Offset into data[] of the page header */
4247 : int cellOffset; /* Address of first cell pointer in data[] */
4248 : u8 *data; /* The content of the whole page */
4249 : u8 *ptr; /* Used for moving information around in data[] */
4250 :
4251 : assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
4252 : assert( sz==cellSizePtr(pPage, pCell) );
4253 : assert( sqlite3PagerIswriteable(pPage->pDbPage) );
4254 422 : if( pPage->nOverflow || sz+2>pPage->nFree ){
4255 0 : if( pTemp ){
4256 0 : memcpy(pTemp+nSkip, pCell+nSkip, sz-nSkip);
4257 0 : pCell = pTemp;
4258 : }
4259 0 : j = pPage->nOverflow++;
4260 : assert( j<sizeof(pPage->aOvfl)/sizeof(pPage->aOvfl[0]) );
4261 0 : pPage->aOvfl[j].pCell = pCell;
4262 0 : pPage->aOvfl[j].idx = i;
4263 0 : pPage->nFree = 0;
4264 : }else{
4265 422 : data = pPage->aData;
4266 422 : hdr = pPage->hdrOffset;
4267 422 : top = get2byte(&data[hdr+5]);
4268 422 : cellOffset = pPage->cellOffset;
4269 422 : end = cellOffset + 2*pPage->nCell + 2;
4270 422 : ins = cellOffset + 2*i;
4271 422 : if( end > top - sz ){
4272 0 : int rc = defragmentPage(pPage);
4273 0 : if( rc!=SQLITE_OK ) return rc;
4274 0 : top = get2byte(&data[hdr+5]);
4275 : assert( end + sz <= top );
4276 : }
4277 422 : idx = allocateSpace(pPage, sz);
4278 : assert( idx>0 );
4279 : assert( end <= get2byte(&data[hdr+5]) );
4280 422 : pPage->nCell++;
4281 422 : pPage->nFree -= 2;
4282 422 : memcpy(&data[idx+nSkip], pCell+nSkip, sz-nSkip);
4283 482 : for(j=end-2, ptr=&data[j]; j>ins; j-=2, ptr-=2){
4284 60 : ptr[0] = ptr[-2];
4285 60 : ptr[1] = ptr[-1];
4286 : }
4287 422 : put2byte(&data[ins], idx);
4288 422 : put2byte(&data[hdr+3], pPage->nCell);
4289 422 : pPage->idxShift = 1;
4290 : #ifndef SQLITE_OMIT_AUTOVACUUM
4291 422 : if( pPage->pBt->autoVacuum ){
4292 : /* The cell may contain a pointer to an overflow page. If so, write
4293 : ** the entry for the overflow page into the pointer map.
4294 : */
4295 : CellInfo info;
4296 0 : parseCellPtr(pPage, pCell, &info);
4297 : assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload );
4298 0 : if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){
4299 0 : Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
4300 0 : int rc = ptrmapPut(pPage->pBt, pgnoOvfl, PTRMAP_OVERFLOW1, pPage->pgno);
4301 0 : if( rc!=SQLITE_OK ) return rc;
4302 : }
4303 : }
4304 : #endif
4305 : }
4306 :
4307 422 : return SQLITE_OK;
4308 : }
4309 :
4310 : /*
4311 : ** Add a list of cells to a page. The page should be initially empty.
4312 : ** The cells are guaranteed to fit on the page.
4313 : */
4314 : static void assemblePage(
4315 : MemPage *pPage, /* The page to be assemblied */
4316 : int nCell, /* The number of cells to add to this page */
4317 : u8 **apCell, /* Pointers to cell bodies */
4318 : int *aSize /* Sizes of the cells */
4319 0 : ){
4320 : int i; /* Loop counter */
4321 : int totalSize; /* Total size of all cells */
4322 : int hdr; /* Index of page header */
4323 : int cellptr; /* Address of next cell pointer */
4324 : int cellbody; /* Address of next cell body */
4325 : u8 *data; /* Data for the page */
4326 :
4327 : assert( pPage->nOverflow==0 );
4328 0 : totalSize = 0;
4329 0 : for(i=0; i<nCell; i++){
4330 0 : totalSize += aSize[i];
4331 : }
4332 : assert( totalSize+2*nCell<=pPage->nFree );
4333 : assert( pPage->nCell==0 );
4334 0 : cellptr = pPage->cellOffset;
4335 0 : data = pPage->aData;
4336 0 : hdr = pPage->hdrOffset;
4337 0 : put2byte(&data[hdr+3], nCell);
4338 0 : if( nCell ){
4339 0 : cellbody = allocateSpace(pPage, totalSize);
4340 : assert( cellbody>0 );
4341 : assert( pPage->nFree >= 2*nCell );
4342 0 : pPage->nFree -= 2*nCell;
4343 0 : for(i=0; i<nCell; i++){
4344 0 : put2byte(&data[cellptr], cellbody);
4345 0 : memcpy(&data[cellbody], apCell[i], aSize[i]);
4346 0 : cellptr += 2;
4347 0 : cellbody += aSize[i];
4348 : }
4349 : assert( cellbody==pPage->pBt->usableSize );
4350 : }
4351 0 : pPage->nCell = nCell;
4352 0 : }
4353 :
4354 : /*
4355 : ** The following parameters determine how many adjacent pages get involved
4356 : ** in a balancing operation. NN is the number of neighbors on either side
4357 : ** of the page that participate in the balancing operation. NB is the
4358 : ** total number of pages that participate, including the target page and
4359 : ** NN neighbors on either side.
4360 : **
4361 : ** The minimum value of NN is 1 (of course). Increasing NN above 1
4362 : ** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
4363 : ** in exchange for a larger degradation in INSERT and UPDATE performance.
4364 : ** The value of NN appears to give the best results overall.
4365 : */
4366 : #define NN 1 /* Number of neighbors on either side of pPage */
4367 : #define NB (NN*2+1) /* Total pages involved in the balance */
4368 :
4369 : /* Forward reference */
4370 : static int balance(MemPage*, int);
4371 :
4372 : #ifndef SQLITE_OMIT_QUICKBALANCE
4373 : /*
4374 : ** This version of balance() handles the common special case where
4375 : ** a new entry is being inserted on the extreme right-end of the
4376 : ** tree, in other words, when the new entry will become the largest
4377 : ** entry in the tree.
4378 : **
4379 : ** Instead of trying balance the 3 right-most leaf pages, just add
4380 : ** a new page to the right-hand side and put the one new entry in
4381 : ** that page. This leaves the right side of the tree somewhat
4382 : ** unbalanced. But odds are that we will be inserting new entries
4383 : ** at the end soon afterwards so the nearly empty page will quickly
4384 : ** fill up. On average.
4385 : **
4386 : ** pPage is the leaf page which is the right-most page in the tree.
4387 : ** pParent is its parent. pPage must have a single overflow entry
4388 : ** which is also the right-most entry on the page.
4389 : */
4390 0 : static int balance_quick(MemPage *pPage, MemPage *pParent){
4391 : int rc;
4392 : MemPage *pNew;
4393 : Pgno pgnoNew;
4394 : u8 *pCell;
4395 : int szCell;
4396 : CellInfo info;
4397 0 : BtShared *pBt = pPage->pBt;
4398 0 : int parentIdx = pParent->nCell; /* pParent new divider cell index */
4399 : int parentSize; /* Size of new divider cell */
4400 : u8 parentCell[64]; /* Space for the new divider cell */
4401 :
4402 : /* Allocate a new page. Insert the overflow cell from pPage
4403 : ** into it. Then remove the overflow cell from pPage.
4404 : */
4405 0 : rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);
4406 0 : if( rc!=SQLITE_OK ){
4407 0 : return rc;
4408 : }
4409 0 : pCell = pPage->aOvfl[0].pCell;
4410 0 : szCell = cellSizePtr(pPage, pCell);
4411 0 : zeroPage(pNew, pPage->aData[0]);
4412 0 : assemblePage(pNew, 1, &pCell, &szCell);
4413 0 : pPage->nOverflow = 0;
4414 :
4415 : /* Set the parent of the newly allocated page to pParent. */
4416 0 : pNew->pParent = pParent;
4417 0 : sqlite3PagerRef(pParent->pDbPage);
4418 :
4419 : /* pPage is currently the right-child of pParent. Change this
4420 : ** so that the right-child is the new page allocated above and
4421 : ** pPage is the next-to-right child.
4422 : */
4423 : assert( pPage->nCell>0 );
4424 0 : parseCellPtr(pPage, findCell(pPage, pPage->nCell-1), &info);
4425 0 : rc = fillInCell(pParent, parentCell, 0, info.nKey, 0, 0, &parentSize);
4426 0 : if( rc!=SQLITE_OK ){
4427 0 : return rc;
4428 : }
4429 : assert( parentSize<64 );
4430 0 : rc = insertCell(pParent, parentIdx, parentCell, parentSize, 0, 4);
4431 0 : if( rc!=SQLITE_OK ){
4432 0 : return rc;
4433 : }
4434 0 : put4byte(findOverflowCell(pParent,parentIdx), pPage->pgno);
4435 0 : put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
4436 :
4437 : #ifndef SQLITE_OMIT_AUTOVACUUM
4438 : /* If this is an auto-vacuum database, update the pointer map
4439 : ** with entries for the new page, and any pointer from the
4440 : ** cell on the page to an overflow page.
4441 : */
4442 0 : if( pBt->autoVacuum ){
4443 0 : rc = ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno);
4444 0 : if( rc!=SQLITE_OK ){
4445 0 : return rc;
4446 : }
4447 0 : rc = ptrmapPutOvfl(pNew, 0);
4448 0 : if( rc!=SQLITE_OK ){
4449 0 : return rc;
4450 : }
4451 : }
4452 : #endif
4453 :
4454 : /* Release the reference to the new page and balance the parent page,
4455 : ** in case the divider cell inserted caused it to become overfull.
4456 : */
4457 0 : releasePage(pNew);
4458 0 : return balance(pParent, 0);
4459 : }
4460 : #endif /* SQLITE_OMIT_QUICKBALANCE */
4461 :
4462 : /*
4463 : ** The ISAUTOVACUUM macro is used within balance_nonroot() to determine
4464 : ** if the database supports auto-vacuum or not. Because it is used
4465 : ** within an expression that is an argument to another macro
4466 : ** (sqliteMallocRaw), it is not possible to use conditional compilation.
4467 : ** So, this macro is defined instead.
4468 : */
4469 : #ifndef SQLITE_OMIT_AUTOVACUUM
4470 : #define ISAUTOVACUUM (pBt->autoVacuum)
4471 : #else
4472 : #define ISAUTOVACUUM 0
4473 : #endif
4474 :
4475 : /*
4476 : ** This routine redistributes Cells on pPage and up to NN*2 siblings
4477 : ** of pPage so that all pages have about the same amount of free space.
4478 : ** Usually NN siblings on either side of pPage is used in the balancing,
4479 : ** though more siblings might come from one side if pPage is the first
4480 : ** or last child of its parent. If pPage has fewer than 2*NN siblings
4481 : ** (something which can only happen if pPage is the root page or a
4482 : ** child of root) then all available siblings participate in the balancing.
4483 : **
4484 : ** The number of siblings of pPage might be increased or decreased by one or
4485 : ** two in an effort to keep pages nearly full but not over full. The root page
4486 : ** is special and is allowed to be nearly empty. If pPage is
4487 : ** the root page, then the depth of the tree might be increased
4488 : ** or decreased by one, as necessary, to keep the root page from being
4489 : ** overfull or completely empty.
4490 : **
4491 : ** Note that when this routine is called, some of the Cells on pPage
4492 : ** might not actually be stored in pPage->aData[]. This can happen
4493 : ** if the page is overfull. Part of the job of this routine is to
4494 : ** make sure all Cells for pPage once again fit in pPage->aData[].
4495 : **
4496 : ** In the course of balancing the siblings of pPage, the parent of pPage
4497 : ** might become overfull or underfull. If that happens, then this routine
4498 : ** is called recursively on the parent.
4499 : **
4500 : ** If this routine fails for any reason, it might leave the database
4501 : ** in a corrupted state. So if this routine fails, the database should
4502 : ** be rolled back.
4503 : */
4504 0 : static int balance_nonroot(MemPage *pPage){
4505 : MemPage *pParent; /* The parent of pPage */
4506 : BtShared *pBt; /* The whole database */
4507 0 : int nCell = 0; /* Number of cells in apCell[] */
4508 0 : int nMaxCells = 0; /* Allocated size of apCell, szCell, aFrom. */
4509 : int nOld; /* Number of pages in apOld[] */
4510 : int nNew; /* Number of pages in apNew[] */
4511 : int nDiv; /* Number of cells in apDiv[] */
4512 : int i, j, k; /* Loop counters */
4513 : int idx; /* Index of pPage in pParent->aCell[] */
4514 : int nxDiv; /* Next divider slot in pParent->aCell[] */
4515 : int rc; /* The return code */
4516 : int leafCorrection; /* 4 if pPage is a leaf. 0 if not */
4517 : int leafData; /* True if pPage is a leaf of a LEAFDATA tree */
4518 : int usableSpace; /* Bytes in pPage beyond the header */
4519 : int pageFlags; /* Value of pPage->aData[0] */
4520 : int subtotal; /* Subtotal of bytes in cells on one page */
4521 0 : int iSpace = 0; /* First unused byte of aSpace[] */
4522 : MemPage *apOld[NB]; /* pPage and up to two siblings */
4523 : Pgno pgnoOld[NB]; /* Page numbers for each page in apOld[] */
4524 : MemPage *apCopy[NB]; /* Private copies of apOld[] pages */
4525 : MemPage *apNew[NB+2]; /* pPage and up to NB siblings after balancing */
4526 : Pgno pgnoNew[NB+2]; /* Page numbers for each page in apNew[] */
4527 : u8 *apDiv[NB]; /* Divider cells in pParent */
4528 : int cntNew[NB+2]; /* Index in aCell[] of cell after i-th page */
4529 : int szNew[NB+2]; /* Combined size of cells place on i-th page */
4530 0 : u8 **apCell = 0; /* All cells begin balanced */
4531 : int *szCell; /* Local size of all cells in apCell[] */
4532 : u8 *aCopy[NB]; /* Space for holding data of apCopy[] */
4533 : u8 *aSpace; /* Space to hold copies of dividers cells */
4534 : #ifndef SQLITE_OMIT_AUTOVACUUM
4535 0 : u8 *aFrom = 0;
4536 : #endif
4537 :
4538 : /*
4539 : ** Find the parent page.
4540 : */
4541 : assert( pPage->isInit );
4542 : assert( sqlite3PagerIswriteable(pPage->pDbPage) );
4543 0 : pBt = pPage->pBt;
4544 0 : pParent = pPage->pParent;
4545 : assert( pParent );
4546 0 : if( SQLITE_OK!=(rc = sqlite3PagerWrite(pParent->pDbPage)) ){
4547 0 : return rc;
4548 : }
4549 : TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));
4550 :
4551 : #ifndef SQLITE_OMIT_QUICKBALANCE
4552 : /*
4553 : ** A special case: If a new entry has just been inserted into a
4554 : ** table (that is, a btree with integer keys and all data at the leaves)
4555 : ** and the new entry is the right-most entry in the tree (it has the
4556 : ** largest key) then use the special balance_quick() routine for
4557 : ** balancing. balance_quick() is much faster and results in a tighter
4558 : ** packing of data in the common case.
4559 : */
4560 0 : if( pPage->leaf &&
4561 : pPage->intKey &&
4562 : pPage->leafData &&
4563 : pPage->nOverflow==1 &&
4564 : pPage->aOvfl[0].idx==pPage->nCell &&
4565 : pPage->pParent->pgno!=1 &&
4566 : get4byte(&pParent->aData[pParent->hdrOffset+8])==pPage->pgno
4567 : ){
4568 : /*
4569 : ** TODO: Check the siblings to the left of pPage. It may be that
4570 : ** they are not full and no new page is required.
4571 : */
4572 0 : return balance_quick(pPage, pParent);
4573 : }
4574 : #endif
4575 :
4576 : /*
4577 : ** Find the cell in the parent page whose left child points back
4578 : ** to pPage. The "idx" variable is the index of that cell. If pPage
4579 : ** is the rightmost child of pParent then set idx to pParent->nCell
4580 : */
4581 0 : if( pParent->idxShift ){
4582 : Pgno pgno;
4583 0 : pgno = pPage->pgno;
4584 : assert( pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
4585 0 : for(idx=0; idx<pParent->nCell; idx++){
4586 0 : if( get4byte(findCell(pParent, idx))==pgno ){
4587 0 : break;
4588 : }
4589 : }
4590 : assert( idx<pParent->nCell
4591 : || get4byte(&pParent->aData[pParent->hdrOffset+8])==pgno );
4592 : }else{
4593 0 : idx = pPage->idxParent;
4594 : }
4595 :
4596 : /*
4597 : ** Initialize variables so that it will be safe to jump
4598 : ** directly to balance_cleanup at any moment.
4599 : */
4600 0 : nOld = nNew = 0;
4601 0 : sqlite3PagerRef(pParent->pDbPage);
4602 :
4603 : /*
4604 : ** Find sibling pages to pPage and the cells in pParent that divide
4605 : ** the siblings. An attempt is made to find NN siblings on either
4606 : ** side of pPage. More siblings are taken from one side, however, if
4607 : ** pPage there are fewer than NN siblings on the other side. If pParent
4608 : ** has NB or fewer children then all children of pParent are taken.
4609 : */
4610 0 : nxDiv = idx - NN;
4611 0 : if( nxDiv + NB > pParent->nCell ){
4612 0 : nxDiv = pParent->nCell - NB + 1;
4613 : }
4614 0 : if( nxDiv<0 ){
4615 0 : nxDiv = 0;
4616 : }
4617 0 : nDiv = 0;
4618 0 : for(i=0, k=nxDiv; i<NB; i++, k++){
4619 0 : if( k<pParent->nCell ){
4620 0 : apDiv[i] = findCell(pParent, k);
4621 0 : nDiv++;
4622 : assert( !pParent->leaf );
4623 0 : pgnoOld[i] = get4byte(apDiv[i]);
4624 0 : }else if( k==pParent->nCell ){
4625 0 : pgnoOld[i] = get4byte(&pParent->aData[pParent->hdrOffset+8]);
4626 : }else{
4627 0 : break;
4628 : }
4629 0 : rc = getAndInitPage(pBt, pgnoOld[i], &apOld[i], pParent);
4630 0 : if( rc ) goto balance_cleanup;
4631 0 : apOld[i]->idxParent = k;
4632 0 : apCopy[i] = 0;
4633 : assert( i==nOld );
4634 0 : nOld++;
4635 0 : nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;
4636 : }
4637 :
4638 : /* Make nMaxCells a multiple of 2 in order to preserve 8-byte
4639 : ** alignment */
4640 0 : nMaxCells = (nMaxCells + 1)&~1;
4641 :
4642 : /*
4643 : ** Allocate space for memory structures
4644 : */
4645 0 : apCell = sqliteMallocRaw(
4646 : nMaxCells*sizeof(u8*) /* apCell */
4647 : + nMaxCells*sizeof(int) /* szCell */
4648 : + ROUND8(sizeof(MemPage))*NB /* aCopy */
4649 : + pBt->pageSize*(5+NB) /* aSpace */
4650 : + (ISAUTOVACUUM ? nMaxCells : 0) /* aFrom */
4651 : );
4652 0 : if( apCell==0 ){
4653 0 : rc = SQLITE_NOMEM;
4654 0 : goto balance_cleanup;
4655 : }
4656 0 : szCell = (int*)&apCell[nMaxCells];
4657 0 : aCopy[0] = (u8*)&szCell[nMaxCells];
4658 : assert( ((aCopy[0] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
4659 0 : for(i=1; i<NB; i++){
4660 0 : aCopy[i] = &aCopy[i-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
4661 : assert( ((aCopy[i] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
4662 : }
4663 0 : aSpace = &aCopy[NB-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
4664 : assert( ((aSpace - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
4665 : #ifndef SQLITE_OMIT_AUTOVACUUM
4666 0 : if( pBt->autoVacuum ){
4667 0 : aFrom = &aSpace[5*pBt->pageSize];
4668 : }
4669 : #endif
4670 :
4671 : /*
4672 : ** Make copies of the content of pPage and its siblings into aOld[].
4673 : ** The rest of this function will use data from the copies rather
4674 : ** that the original pages since the original pages will be in the
4675 : ** process of being overwritten.
4676 : */
4677 0 : for(i=0; i<nOld; i++){
4678 0 : MemPage *p = apCopy[i] = (MemPage*)&aCopy[i][pBt->pageSize];
4679 0 : p->aData = &((u8*)p)[-pBt->pageSize];
4680 0 : memcpy(p->aData, apOld[i]->aData, pBt->pageSize + sizeof(MemPage));
4681 : /* The memcpy() above changes the value of p->aData so we have to
4682 : ** set it again. */
4683 0 : p->aData = &((u8*)p)[-pBt->pageSize];
4684 : }
4685 :
4686 : /*
4687 : ** Load pointers to all cells on sibling pages and the divider cells
4688 : ** into the local apCell[] array. Make copies of the divider cells
4689 : ** into space obtained form aSpace[] and remove the the divider Cells
4690 : ** from pParent.
4691 : **
4692 : ** If the siblings are on leaf pages, then the child pointers of the
4693 : ** divider cells are stripped from the cells before they are copied
4694 : ** into aSpace[]. In this way, all cells in apCell[] are without
4695 : ** child pointers. If siblings are not leaves, then all cell in
4696 : ** apCell[] include child pointers. Either way, all cells in apCell[]
4697 : ** are alike.
4698 : **
4699 : ** leafCorrection: 4 if pPage is a leaf. 0 if pPage is not a leaf.
4700 : ** leafData: 1 if pPage holds key+data and pParent holds only keys.
4701 : */
4702 0 : nCell = 0;
4703 0 : leafCorrection = pPage->leaf*4;
4704 0 : leafData = pPage->leafData && pPage->leaf;
4705 0 : for(i=0; i<nOld; i++){
4706 0 : MemPage *pOld = apCopy[i];
4707 0 : int limit = pOld->nCell+pOld->nOverflow;
4708 0 : for(j=0; j<limit; j++){
4709 : assert( nCell<nMaxCells );
4710 0 : apCell[nCell] = findOverflowCell(pOld, j);
4711 0 : szCell[nCell] = cellSizePtr(pOld, apCell[nCell]);
4712 : #ifndef SQLITE_OMIT_AUTOVACUUM
4713 0 : if( pBt->autoVacuum ){
4714 : int a;
4715 0 : aFrom[nCell] = i;
4716 0 : for(a=0; a<pOld->nOverflow; a++){
4717 0 : if( pOld->aOvfl[a].pCell==apCell[nCell] ){
4718 0 : aFrom[nCell] = 0xFF;
4719 0 : break;
4720 : }
4721 : }
4722 : }
4723 : #endif
4724 0 : nCell++;
4725 : }
4726 0 : if( i<nOld-1 ){
4727 0 : int sz = cellSizePtr(pParent, apDiv[i]);
4728 0 : if( leafData ){
4729 : /* With the LEAFDATA flag, pParent cells hold only INTKEYs that
4730 : ** are duplicates of keys on the child pages. We need to remove
4731 : ** the divider cells from pParent, but the dividers cells are not
4732 : ** added to apCell[] because they are duplicates of child cells.
4733 : */
4734 0 : dropCell(pParent, nxDiv, sz);
4735 : }else{
4736 : u8 *pTemp;
4737 : assert( nCell<nMaxCells );
4738 0 : szCell[nCell] = sz;
4739 0 : pTemp = &aSpace[iSpace];
4740 0 : iSpace += sz;
4741 : assert( iSpace<=pBt->pageSize*5 );
4742 0 : memcpy(pTemp, apDiv[i], sz);
4743 0 : apCell[nCell] = pTemp+leafCorrection;
4744 : #ifndef SQLITE_OMIT_AUTOVACUUM
4745 0 : if( pBt->autoVacuum ){
4746 0 : aFrom[nCell] = 0xFF;
4747 : }
4748 : #endif
4749 0 : dropCell(pParent, nxDiv, sz);
4750 0 : szCell[nCell] -= leafCorrection;
4751 : assert( get4byte(pTemp)==pgnoOld[i] );
4752 0 : if( !pOld->leaf ){
4753 : assert( leafCorrection==0 );
4754 : /* The right pointer of the child page pOld becomes the left
4755 : ** pointer of the divider cell */
4756 0 : memcpy(apCell[nCell], &pOld->aData[pOld->hdrOffset+8], 4);
4757 : }else{
4758 : assert( leafCorrection==4 );
4759 : }
4760 0 : nCell++;
4761 : }
4762 : }
4763 : }
4764 :
4765 : /*
4766 : ** Figure out the number of pages needed to hold all nCell cells.
4767 : ** Store this number in "k". Also compute szNew[] which is the total
4768 : ** size of all cells on the i-th page and cntNew[] which is the index
4769 : ** in apCell[] of the cell that divides page i from page i+1.
4770 : ** cntNew[k] should equal nCell.
4771 : **
4772 : ** Values computed by this block:
4773 : **
4774 : ** k: The total number of sibling pages
4775 : ** szNew[i]: Spaced used on the i-th sibling page.
4776 : ** cntNew[i]: Index in apCell[] and szCell[] for the first cell to
4777 : ** the right of the i-th sibling page.
4778 : ** usableSpace: Number of bytes of space available on each sibling.
4779 : **
4780 : */
4781 0 : usableSpace = pBt->usableSize - 12 + leafCorrection;
4782 0 : for(subtotal=k=i=0; i<nCell; i++){
4783 : assert( i<nMaxCells );
4784 0 : subtotal += szCell[i] + 2;
4785 0 : if( subtotal > usableSpace ){
4786 0 : szNew[k] = subtotal - szCell[i];
4787 0 : cntNew[k] = i;
4788 0 : if( leafData ){ i--; }
4789 0 : subtotal = 0;
4790 0 : k++;
4791 : }
4792 : }
4793 0 : szNew[k] = subtotal;
4794 0 : cntNew[k] = nCell;
4795 0 : k++;
4796 :
4797 : /*
4798 : ** The packing computed by the previous block is biased toward the siblings
4799 : ** on the left side. The left siblings are always nearly full, while the
4800 : ** right-most sibling might be nearly empty. This block of code attempts
4801 : ** to adjust the packing of siblings to get a better balance.
4802 : **
4803 : ** This adjustment is more than an optimization. The packing above might
4804 : ** be so out of balance as to be illegal. For example, the right-most
4805 : ** sibling might be completely empty. This adjustment is not optional.
4806 : */
4807 0 : for(i=k-1; i>0; i--){
4808 0 : int szRight = szNew[i]; /* Size of sibling on the right */
4809 0 : int szLeft = szNew[i-1]; /* Size of sibling on the left */
4810 : int r; /* Index of right-most cell in left sibling */
4811 : int d; /* Index of first cell to the left of right sibling */
4812 :
4813 0 : r = cntNew[i-1] - 1;
4814 0 : d = r + 1 - leafData;
4815 : assert( d<nMaxCells );
4816 : assert( r<nMaxCells );
4817 0 : while( szRight==0 || szRight+szCell[d]+2<=szLeft-(szCell[r]+2) ){
4818 0 : szRight += szCell[d] + 2;
4819 0 : szLeft -= szCell[r] + 2;
4820 0 : cntNew[i-1]--;
4821 0 : r = cntNew[i-1] - 1;
4822 0 : d = r + 1 - leafData;
4823 : }
4824 0 : szNew[i] = szRight;
4825 0 : szNew[i-1] = szLeft;
4826 : }
4827 :
4828 : /* Either we found one or more cells (cntnew[0])>0) or we are the
4829 : ** a virtual root page. A virtual root page is when the real root
4830 : ** page is page 1 and we are the only child of that page.
4831 : */
4832 : assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) );
4833 :
4834 : /*
4835 : ** Allocate k new pages. Reuse old pages where possible.
4836 : */
4837 : assert( pPage->pgno>1 );
4838 0 : pageFlags = pPage->aData[0];
4839 0 : for(i=0; i<k; i++){
4840 : MemPage *pNew;
4841 0 : if( i<nOld ){
4842 0 : pNew = apNew[i] = apOld[i];
4843 0 : pgnoNew[i] = pgnoOld[i];
4844 0 : apOld[i] = 0;
4845 0 : rc = sqlite3PagerWrite(pNew->pDbPage);
4846 0 : nNew++;
4847 0 : if( rc ) goto balance_cleanup;
4848 : }else{
4849 : assert( i>0 );
4850 0 : rc = allocateBtreePage(pBt, &pNew, &pgnoNew[i], pgnoNew[i-1], 0);
4851 0 : if( rc ) goto balance_cleanup;
4852 0 : apNew[i] = pNew;
4853 0 : nNew++;
4854 : }
4855 0 : zeroPage(pNew, pageFlags);
4856 : }
4857 :
4858 : /* Free any old pages that were not reused as new pages.
4859 : */
4860 0 : while( i<nOld ){
4861 0 : rc = freePage(apOld[i]);
4862 0 : if( rc ) goto balance_cleanup;
4863 0 : releasePage(apOld[i]);
4864 0 : apOld[i] = 0;
4865 0 : i++;
4866 : }
4867 :
4868 : /*
4869 : ** Put the new pages in accending order. This helps to
4870 : ** keep entries in the disk file in order so that a scan
4871 : ** of the table is a linear scan through the file. That
4872 : ** in turn helps the operating system to deliver pages
4873 : ** from the disk more rapidly.
4874 : **
4875 : ** An O(n^2) insertion sort algorithm is used, but since
4876 : ** n is never more than NB (a small constant), that should
4877 : ** not be a problem.
4878 : **
4879 : ** When NB==3, this one optimization makes the database
4880 : ** about 25% faster for large insertions and deletions.
4881 : */
4882 0 : for(i=0; i<k-1; i++){
4883 0 : int minV = pgnoNew[i];
4884 0 : int minI = i;
4885 0 : for(j=i+1; j<k; j++){
4886 0 : if( pgnoNew[j]<(unsigned)minV ){
4887 0 : minI = j;
4888 0 : minV = pgnoNew[j];
4889 : }
4890 : }
4891 0 : if( minI>i ){
4892 : int t;
4893 : MemPage *pT;
4894 0 : t = pgnoNew[i];
4895 0 : pT = apNew[i];
4896 0 : pgnoNew[i] = pgnoNew[minI];
4897 0 : apNew[i] = apNew[minI];
4898 0 : pgnoNew[minI] = t;
4899 0 : apNew[minI] = pT;
4900 : }
4901 : }
4902 : TRACE(("BALANCE: old: %d %d %d new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n",
4903 : pgnoOld[0],
4904 : nOld>=2 ? pgnoOld[1] : 0,
4905 : nOld>=3 ? pgnoOld[2] : 0,
4906 : pgnoNew[0], szNew[0],
4907 : nNew>=2 ? pgnoNew[1] : 0, nNew>=2 ? szNew[1] : 0,
4908 : nNew>=3 ? pgnoNew[2] : 0, nNew>=3 ? szNew[2] : 0,
4909 : nNew>=4 ? pgnoNew[3] : 0, nNew>=4 ? szNew[3] : 0,
4910 : nNew>=5 ? pgnoNew[4] : 0, nNew>=5 ? szNew[4] : 0));
4911 :
4912 : /*
4913 : ** Evenly distribute the data in apCell[] across the new pages.
4914 : ** Insert divider cells into pParent as necessary.
4915 : */
4916 0 : j = 0;
4917 0 : for(i=0; i<nNew; i++){
4918 : /* Assemble the new sibling page. */
4919 0 : MemPage *pNew = apNew[i];
4920 : assert( j<nMaxCells );
4921 : assert( pNew->pgno==pgnoNew[i] );
4922 0 : assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]);
4923 : assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) );
4924 : assert( pNew->nOverflow==0 );
4925 :
4926 : #ifndef SQLITE_OMIT_AUTOVACUUM
4927 : /* If this is an auto-vacuum database, update the pointer map entries
4928 : ** that point to the siblings that were rearranged. These can be: left
4929 : ** children of cells, the right-child of the page, or overflow pages
4930 : ** pointed to by cells.
4931 : */
4932 0 : if( pBt->autoVacuum ){
4933 0 : for(k=j; k<cntNew[i]; k++){
4934 : assert( k<nMaxCells );
4935 0 : if( aFrom[k]==0xFF || apCopy[aFrom[k]]->pgno!=pNew->pgno ){
4936 0 : rc = ptrmapPutOvfl(pNew, k-j);
4937 0 : if( rc!=SQLITE_OK ){
4938 0 : goto balance_cleanup;
4939 : }
4940 : }
4941 : }
4942 : }
4943 : #endif
4944 :
4945 0 : j = cntNew[i];
4946 :
4947 : /* If the sibling page assembled above was not the right-most sibling,
4948 : ** insert a divider cell into the parent page.
4949 : */
4950 0 : if( i<nNew-1 && j<nCell ){
4951 : u8 *pCell;
4952 : u8 *pTemp;
4953 : int sz;
4954 :
4955 : assert( j<nMaxCells );
4956 0 : pCell = apCell[j];
4957 0 : sz = szCell[j] + leafCorrection;
4958 0 : if( !pNew->leaf ){
4959 0 : memcpy(&pNew->aData[8], pCell, 4);
4960 0 : pTemp = 0;
4961 0 : }else if( leafData ){
4962 : /* If the tree is a leaf-data tree, and the siblings are leaves,
4963 : ** then there is no divider cell in apCell[]. Instead, the divider
4964 : ** cell consists of the integer key for the right-most cell of
4965 : ** the sibling-page assembled above only.
4966 : */
4967 : CellInfo info;
4968 0 : j--;
4969 0 : parseCellPtr(pNew, apCell[j], &info);
4970 0 : pCell = &aSpace[iSpace];
4971 0 : fillInCell(pParent, pCell, 0, info.nKey, 0, 0, &sz);
4972 0 : iSpace += sz;
4973 : assert( iSpace<=pBt->pageSize*5 );
4974 0 : pTemp = 0;
4975 : }else{
4976 0 : pCell -= 4;
4977 0 : pTemp = &aSpace[iSpace];
4978 0 : iSpace += sz;
4979 : assert( iSpace<=pBt->pageSize*5 );
4980 : }
4981 0 : rc = insertCell(pParent, nxDiv, pCell, sz, pTemp, 4);
4982 0 : if( rc!=SQLITE_OK ) goto balance_cleanup;
4983 0 : put4byte(findOverflowCell(pParent,nxDiv), pNew->pgno);
4984 : #ifndef SQLITE_OMIT_AUTOVACUUM
4985 : /* If this is an auto-vacuum database, and not a leaf-data tree,
4986 : ** then update the pointer map with an entry for the overflow page
4987 : ** that the cell just inserted points to (if any).
4988 : */
4989 0 : if( pBt->autoVacuum && !leafData ){
4990 0 : rc = ptrmapPutOvfl(pParent, nxDiv);
4991 0 : if( rc!=SQLITE_OK ){
4992 0 : goto balance_cleanup;
4993 : }
4994 : }
4995 : #endif
4996 0 : j++;
4997 0 : nxDiv++;
4998 : }
4999 : }
5000 : assert( j==nCell );
5001 : assert( nOld>0 );
5002 : assert( nNew>0 );
5003 0 : if( (pageFlags & PTF_LEAF)==0 ){
5004 0 : memcpy(&apNew[nNew-1]->aData[8], &apCopy[nOld-1]->aData[8], 4);
5005 : }
5006 0 : if( nxDiv==pParent->nCell+pParent->nOverflow ){
5007 : /* Right-most sibling is the right-most child of pParent */
5008 0 : put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew[nNew-1]);
5009 : }else{
5010 : /* Right-most sibling is the left child of the first entry in pParent
5011 : ** past the right-most divider entry */
5012 0 : put4byte(findOverflowCell(pParent, nxDiv), pgnoNew[nNew-1]);
5013 : }
5014 :
5015 : /*
5016 : ** Reparent children of all cells.
5017 : */
5018 0 : for(i=0; i<nNew; i++){
5019 0 : rc = reparentChildPages(apNew[i]);
5020 0 : if( rc!=SQLITE_OK ) goto balance_cleanup;
5021 : }
5022 0 : rc = reparentChildPages(pParent);
5023 0 : if( rc!=SQLITE_OK ) goto balance_cleanup;
5024 :
5025 : /*
5026 : ** Balance the parent page. Note that the current page (pPage) might
5027 : ** have been added to the freelist so it might no longer be initialized.
5028 : ** But the parent page will always be initialized.
5029 : */
5030 : assert( pParent->isInit );
5031 0 : rc = balance(pParent, 0);
5032 :
5033 : /*
5034 : ** Cleanup before returning.
5035 : */
5036 0 : balance_cleanup:
5037 0 : sqliteFree(apCell);
5038 0 : for(i=0; i<nOld; i++){
5039 0 : releasePage(apOld[i]);
5040 : }
5041 0 : for(i=0; i<nNew; i++){
5042 0 : releasePage(apNew[i]);
5043 : }
5044 0 : releasePage(pParent);
5045 : TRACE(("BALANCE: finished with %d: old=%d new=%d cells=%d\n",
5046 : pPage->pgno, nOld, nNew, nCell));
5047 0 : return rc;
5048 : }
5049 :
5050 : /*
5051 : ** This routine is called for the root page of a btree when the root
5052 : ** page contains no cells. This is an opportunity to make the tree
5053 : ** shallower by one level.
5054 : */
5055 5 : static int balance_shallower(MemPage *pPage){
5056 : MemPage *pChild; /* The only child page of pPage */
5057 : Pgno pgnoChild; /* Page number for pChild */
5058 5 : int rc = SQLITE_OK; /* Return code from subprocedures */
5059 : BtShared *pBt; /* The main BTree structure */
5060 : int mxCellPerPage; /* Maximum number of cells per page */
5061 : u8 **apCell; /* All cells from pages being balanced */
5062 : int *szCell; /* Local size of all cells */
5063 :
5064 : assert( pPage->pParent==0 );
5065 : assert( pPage->nCell==0 );
5066 5 : pBt = pPage->pBt;
5067 5 : mxCellPerPage = MX_CELL(pBt);
5068 5 : apCell = sqliteMallocRaw( mxCellPerPage*(sizeof(u8*)+sizeof(int)) );
5069 5 : if( apCell==0 ) return SQLITE_NOMEM;
5070 5 : szCell = (int*)&apCell[mxCellPerPage];
5071 5 : if( pPage->leaf ){
5072 : /* The table is completely empty */
5073 : TRACE(("BALANCE: empty table %d\n", pPage->pgno));
5074 : }else{
5075 : /* The root page is empty but has one child. Transfer the
5076 : ** information from that one child into the root page if it
5077 : ** will fit. This reduces the depth of the tree by one.
5078 : **
5079 : ** If the root page is page 1, it has less space available than
5080 : ** its child (due to the 100 byte header that occurs at the beginning
5081 : ** of the database fle), so it might not be able to hold all of the
5082 : ** information currently contained in the child. If this is the
5083 : ** case, then do not do the transfer. Leave page 1 empty except
5084 : ** for the right-pointer to the child page. The child page becomes
5085 : ** the virtual root of the tree.
5086 : */
5087 0 : pgnoChild = get4byte(&pPage->aData[pPage->hdrOffset+8]);
5088 : assert( pgnoChild>0 );
5089 : assert( pgnoChild<=sqlite3PagerPagecount(pPage->pBt->pPager) );
5090 0 : rc = getPage(pPage->pBt, pgnoChild, &pChild, 0);
5091 0 : if( rc ) goto end_shallow_balance;
5092 0 : if( pPage->pgno==1 ){
5093 0 : rc = initPage(pChild, pPage);
5094 0 : if( rc ) goto end_shallow_balance;
5095 : assert( pChild->nOverflow==0 );
5096 0 : if( pChild->nFree>=100 ){
5097 : /* The child information will fit on the root page, so do the
5098 : ** copy */
5099 : int i;
5100 0 : zeroPage(pPage, pChild->aData[0]);
5101 0 : for(i=0; i<pChild->nCell; i++){
5102 0 : apCell[i] = findCell(pChild,i);
5103 0 : szCell[i] = cellSizePtr(pChild, apCell[i]);
5104 : }
5105 0 : assemblePage(pPage, pChild->nCell, apCell, szCell);
5106 : /* Copy the right-pointer of the child to the parent. */
5107 0 : put4byte(&pPage->aData[pPage->hdrOffset+8],
5108 : get4byte(&pChild->aData[pChild->hdrOffset+8]));
5109 0 : freePage(pChild);
5110 : TRACE(("BALANCE: child %d transfer to page 1\n", pChild->pgno));
5111 : }else{
5112 : /* The child has more information that will fit on the root.
5113 : ** The tree is already balanced. Do nothing. */
5114 : TRACE(("BALANCE: child %d will not fit on page 1\n", pChild->pgno));
5115 : }
5116 : }else{
5117 0 : memcpy(pPage->aData, pChild->aData, pPage->pBt->usableSize);
5118 0 : pPage->isInit = 0;
5119 0 : pPage->pParent = 0;
5120 0 : rc = initPage(pPage, 0);
5121 : assert( rc==SQLITE_OK );
5122 0 : freePage(pChild);
5123 : TRACE(("BALANCE: transfer child %d into root %d\n",
5124 : pChild->pgno, pPage->pgno));
5125 : }
5126 0 : rc = reparentChildPages(pPage);
5127 : assert( pPage->nOverflow==0 );
5128 : #ifndef SQLITE_OMIT_AUTOVACUUM
5129 0 : if( pBt->autoVacuum ){
5130 : int i;
5131 0 : for(i=0; i<pPage->nCell; i++){
5132 0 : rc = ptrmapPutOvfl(pPage, i);
5133 0 : if( rc!=SQLITE_OK ){
5134 0 : goto end_shallow_balance;
5135 : }
5136 : }
5137 : }
5138 : #endif
5139 0 : if( rc!=SQLITE_OK ) goto end_shallow_balance;
5140 0 : releasePage(pChild);
5141 : }
5142 5 : end_shallow_balance:
5143 5 : sqliteFree(apCell);
5144 5 : return rc;
5145 : }
5146 :
5147 :
5148 : /*
5149 : ** The root page is overfull
5150 : **
5151 : ** When this happens, Create a new child page and copy the
5152 : ** contents of the root into the child. Then make the root
5153 : ** page an empty page with rightChild pointing to the new
5154 : ** child. Finally, call balance_internal() on the new child
5155 : ** to cause it to split.
5156 : */
5157 0 : static int balance_deeper(MemPage * |