📄 btree.c
字号:
/*** 2001 September 15**** The author disclaims copyright to this source code. In place of** a legal notice, here is a blessing:**** May you do good and not evil.** May you find forgiveness for yourself and forgive others.** May you share freely, never taking more than you give.***************************************************************************** $Id: btree.c,v 1.103 2004/03/10 13:42:38 drh Exp $**** This file implements a external (disk-based) database using BTrees.** For a detailed discussion of BTrees, refer to**** Donald E. Knuth, THE ART OF COMPUTER PROGRAMMING, Volume 3:** "Sorting And Searching", pages 473-480. Addison-Wesley** Publishing Company, Reading, Massachusetts.**** The basic idea is that each page of the file contains N database** entries and N+1 pointers to subpages.**** ----------------------------------------------------------------** | Ptr(0) | Key(0) | Ptr(1) | Key(1) | ... | Key(N) | Ptr(N+1) |** ----------------------------------------------------------------**** All of the keys on the page that Ptr(0) points to have values less** than Key(0). All of the keys on page Ptr(1) and its subpages have** values greater than Key(0) and less than Key(1). All of the keys** on Ptr(N+1) and its subpages have values greater than Key(N). And** so forth.**** Finding a particular key requires reading O(log(M)) pages from the ** disk where M is the number of entries in the tree.**** In this implementation, a single file can hold one or more separate ** BTrees. Each BTree is identified by the index of its root page. The** key and data for any entry are combined to form the "payload". Up to** MX_LOCAL_PAYLOAD bytes of payload can be carried directly on the** database page. If the payload is larger than MX_LOCAL_PAYLOAD bytes** then surplus bytes are stored on overflow pages. The payload for an** entry and the preceding pointer are combined to form a "Cell". Each ** page has a small header which contains the Ptr(N+1) pointer.**** The first page of the file contains a magic string used to verify that** the file really is a valid BTree database, a pointer to a list of unused** pages in the file, and some meta information. The root of the first** BTree begins on page 2 of the file. (Pages are numbered beginning with** 1, not 0.) Thus a minimum database contains 2 pages.*/#include "sqliteInt.h"#include "pager.h"#include "btree.h"#include <assert.h>/* Forward declarations */static BtOps sqliteBtreeOps;static BtCursorOps sqliteBtreeCursorOps;/*** Macros used for byteswapping. B is a pointer to the Btree** structure. This is needed to access the Btree.needSwab boolean** in order to tell if byte swapping is needed or not.** X is an unsigned integer. SWAB16 byte swaps a 16-bit integer.** SWAB32 byteswaps a 32-bit integer.*/#define SWAB16(B,X) ((B)->needSwab? swab16((u16)X) : ((u16)X))#define SWAB32(B,X) ((B)->needSwab? swab32(X) : (X))#define SWAB_ADD(B,X,A) \ if((B)->needSwab){ X=swab32(swab32(X)+A); }else{ X += (A); }/*** The following global variable - available only if SQLITE_TEST is** defined - is used to determine whether new databases are created in** native byte order or in non-native byte order. Non-native byte order** databases are created for testing purposes only. Under normal operation,** only native byte-order databases should be created, but we should be** able to read or write existing databases regardless of the byteorder.*/#ifdef SQLITE_TESTint btree_native_byte_order = 1;#else# define btree_native_byte_order 1#endif/*** Forward declarations of structures used only in this file.*/typedef struct PageOne PageOne;typedef struct MemPage MemPage;typedef struct PageHdr PageHdr;typedef struct Cell Cell;typedef struct CellHdr CellHdr;typedef struct FreeBlk FreeBlk;typedef struct OverflowPage OverflowPage;typedef struct FreelistInfo FreelistInfo;/*** All structures on a database page are aligned to 4-byte boundries.** This routine rounds up a number of bytes to the next multiple of 4.**** This might need to change for computer architectures that require** and 8-byte alignment boundry for structures.*/#define ROUNDUP(X) ((X+3) & ~3)/*** This is a magic string that appears at the beginning of every** SQLite database in order to identify the file as a real database.*/static const char zMagicHeader[] = "** This file contains an SQLite 2.1 database **";#define MAGIC_SIZE (sizeof(zMagicHeader))/*** This is a magic integer also used to test the integrity of the database** file. This integer is used in addition to the string above so that** if the file is written on a little-endian architecture and read** on a big-endian architectures (or vice versa) we can detect the** problem.**** The number used was obtained at random and has no special** significance other than the fact that it represents a different** integer on little-endian and big-endian machines.*/#define MAGIC 0xdae37528/*** The first page of the database file contains a magic header string** to identify the file as an SQLite database file. It also contains** a pointer to the first free page of the file. Page 2 contains the** root of the principle BTree. The file might contain other BTrees** rooted on pages above 2.**** The first page also contains SQLITE_N_BTREE_META integers that** can be used by higher-level routines.**** Remember that pages are numbered beginning with 1. (See pager.c** for additional information.) Page 0 does not exist and a page** number of 0 is used to mean "no such page".*/struct PageOne { char zMagic[MAGIC_SIZE]; /* String that identifies the file as a database */ int iMagic; /* Integer to verify correct byte order */ Pgno freeList; /* First free page in a list of all free pages */ int nFree; /* Number of pages on the free list */ int aMeta[SQLITE_N_BTREE_META-1]; /* User defined integers */};/*** Each database page has a header that is an instance of this** structure.**** PageHdr.firstFree is 0 if there is no free space on this page.** Otherwise, PageHdr.firstFree is the index in MemPage.u.aDisk[] of a ** FreeBlk structure that describes the first block of free space. ** All free space is defined by a linked list of FreeBlk structures.**** Data is stored in a linked list of Cell structures. PageHdr.firstCell** is the index into MemPage.u.aDisk[] of the first cell on the page. The** Cells are kept in sorted order.**** A Cell contains all information about a database entry and a pointer** to a child page that contains other entries less than itself. In** other words, the i-th Cell contains both Ptr(i) and Key(i). The** right-most pointer of the page is contained in PageHdr.rightChild.*/struct PageHdr { Pgno rightChild; /* Child page that comes after all cells on this page */ u16 firstCell; /* Index in MemPage.u.aDisk[] of the first cell */ u16 firstFree; /* Index in MemPage.u.aDisk[] of the first free block */};/*** Entries on a page of the database are called "Cells". Each Cell** has a header and data. This structure defines the header. The** key and data (collectively the "payload") follow this header on** the database page.**** A definition of the complete Cell structure is given below. The** header for the cell must be defined first in order to do some** of the sizing #defines that follow.*/struct CellHdr { Pgno leftChild; /* Child page that comes before this cell */ u16 nKey; /* Number of bytes in the key */ u16 iNext; /* Index in MemPage.u.aDisk[] of next cell in sorted order */ u8 nKeyHi; /* Upper 8 bits of key size for keys larger than 64K bytes */ u8 nDataHi; /* Upper 8 bits of data size when the size is more than 64K */ u16 nData; /* Number of bytes of data */};/*** The key and data size are split into a lower 16-bit segment and an** upper 8-bit segment in order to pack them together into a smaller** space. The following macros reassembly a key or data size back** into an integer.*/#define NKEY(b,h) (SWAB16(b,h.nKey) + h.nKeyHi*65536)#define NDATA(b,h) (SWAB16(b,h.nData) + h.nDataHi*65536)/*** The minimum size of a complete Cell. The Cell must contain a header** and at least 4 bytes of payload.*/#define MIN_CELL_SIZE (sizeof(CellHdr)+4)/*** The maximum number of database entries that can be held in a single** page of the database. */#define MX_CELL ((SQLITE_USABLE_SIZE-sizeof(PageHdr))/MIN_CELL_SIZE)/*** The amount of usable space on a single page of the BTree. This is the** page size minus the overhead of the page header.*/#define USABLE_SPACE (SQLITE_USABLE_SIZE - sizeof(PageHdr))/*** The maximum amount of payload (in bytes) that can be stored locally for** a database entry. If the entry contains more data than this, the** extra goes onto overflow pages.**** This number is chosen so that at least 4 cells will fit on every page.*/#define MX_LOCAL_PAYLOAD ((USABLE_SPACE/4-(sizeof(CellHdr)+sizeof(Pgno)))&~3)/*** Data on a database page is stored as a linked list of Cell structures.** Both the key and the data are stored in aPayload[]. The key always comes** first. The aPayload[] field grows as necessary to hold the key and data,** up to a maximum of MX_LOCAL_PAYLOAD bytes. If the size of the key and** data combined exceeds MX_LOCAL_PAYLOAD bytes, then Cell.ovfl is the** page number of the first overflow page.**** Though this structure is fixed in size, the Cell on the database** page varies in size. Every cell has a CellHdr and at least 4 bytes** of payload space. Additional payload bytes (up to the maximum of** MX_LOCAL_PAYLOAD) and the Cell.ovfl value are allocated only as** needed.*/struct Cell { CellHdr h; /* The cell header */ char aPayload[MX_LOCAL_PAYLOAD]; /* Key and data */ Pgno ovfl; /* The first overflow page */};/*** Free space on a page is remembered using a linked list of the FreeBlk** structures. Space on a database page is allocated in increments of** at least 4 bytes and is always aligned to a 4-byte boundry. The** linked list of FreeBlks is always kept in order by address.*/struct FreeBlk { u16 iSize; /* Number of bytes in this block of free space */ u16 iNext; /* Index in MemPage.u.aDisk[] of the next free block */};/*** The number of bytes of payload that will fit on a single overflow page.*/#define OVERFLOW_SIZE (SQLITE_USABLE_SIZE-sizeof(Pgno))/*** When the key and data for a single entry in the BTree will not fit in** the MX_LOCAL_PAYLOAD bytes of space available on the database page,** then all extra bytes are written to a linked list of overflow pages.** Each overflow page is an instance of the following structure.**** Unused pages in the database are also represented by instances of** the OverflowPage structure. The PageOne.freeList field is the** page number of the first page in a linked list of unused database** pages.*/struct OverflowPage { Pgno iNext; char aPayload[OVERFLOW_SIZE];};/*** The PageOne.freeList field points to a linked list of overflow pages** hold information about free pages. The aPayload section of each** overflow page contains an instance of the following structure. The** aFree[] array holds the page number of nFree unused pages in the disk** file.*/struct FreelistInfo { int nFree; Pgno aFree[(OVERFLOW_SIZE-sizeof(int))/sizeof(Pgno)];};/*** For every page in the database file, an instance of the following structure** is stored in memory. The u.aDisk[] array contains the raw bits read from** the disk. The rest is auxiliary information held in memory only. The** auxiliary info is only valid for regular database pages - it is not** used for overflow pages and pages on the freelist.**** Of particular interest in the auxiliary info is the apCell[] entry. Each** apCell[] entry is a pointer to a Cell structure in u.aDisk[]. The cells are** put in this array so that they can be accessed in constant time, rather** than in linear time which would be needed if we had to walk the linked ** list on every access.**** Note that apCell[] contains enough space to hold up to two more Cells** than can possibly fit on one page. In the steady state, every apCell[]** points to memory inside u.aDisk[]. But in the middle of an insert** operation, some apCell[] entries may temporarily point to data space** outside of u.aDisk[]. This is a transient situation that is quickly** resolved. But while it is happening, it is possible for a database** page to hold as many as two more cells than it might otherwise hold.** The extra two entries in apCell[] are an allowance for this situation.**** The pParent field points back to the parent page. This allows us to** walk up the BTree from any leaf to the root. Care must be taken to** unref() the parent page pointer when this page is no longer referenced.** The pageDestructor() routine handles that chore.*/struct MemPage { union u_page_data { char aDisk[SQLITE_PAGE_SIZE]; /* Page data stored on disk */ PageHdr hdr; /* Overlay page header */ } u; u8 isInit; /* True if auxiliary data is initialized */ u8 idxShift; /* True if apCell[] indices have changed */ u8 isOverfull; /* Some apCell[] points outside u.aDisk[] */ MemPage *pParent; /* The parent of this page. NULL for root */ int idxParent; /* Index in pParent->apCell[] of this node */ int nFree; /* Number of free bytes in u.aDisk[] */ int nCell; /* Number of entries on this page */ Cell *apCell[MX_CELL+2]; /* All data entires in sorted order */};/*** The in-memory image of a disk page has the auxiliary information appended** to the end. EXTRA_SIZE is the number of bytes of space needed to hold** that extra information.*/#define EXTRA_SIZE (sizeof(MemPage)-sizeof(union u_page_data))/*** Everything we need to know about an open database*/struct Btree { BtOps *pOps; /* Function table */ Pager *pPager; /* The page cache */ BtCursor *pCursor; /* A list of all open cursors */ PageOne *page1; /* First page of the database */ u8 inTrans; /* True if a transaction is in progress */ u8 inCkpt; /* True if there is a checkpoint on the transaction */ u8 readOnly; /* True if the underlying file is readonly */ u8 needSwab; /* Need to byte-swapping */};typedef Btree Bt;/*** A cursor is a pointer to a particular entry in the BTree.** The entry is identified by its MemPage and the index in** MemPage.apCell[] of the entry.*/struct BtCursor { BtCursorOps *pOps; /* Function table */ Btree *pBt; /* The Btree to which this cursor belongs */ BtCursor *pNext, *pPrev; /* Forms a linked list of all cursors */ BtCursor *pShared; /* Loop of cursors with the same root page */ Pgno pgnoRoot; /* The root page of this tree */ MemPage *pPage; /* Page that contains the entry */ int idx; /* Index of the entry in pPage->apCell[] */ u8 wrFlag; /* True if writable */ u8 eSkip; /* Determines if next step operation is a no-op */ u8 iMatch; /* compare result from last sqliteBtreeMoveto() */};/*** Legal values for BtCursor.eSkip.*/#define SKIP_NONE 0 /* Always step the cursor */#define SKIP_NEXT 1 /* The next sqliteBtreeNext() is a no-op */#define SKIP_PREV 2 /* The next sqliteBtreePrevious() is a no-op */#define SKIP_INVALID 3 /* Calls to Next() and Previous() are invalid *//* Forward declarations */static int fileBtreeCloseCursor(BtCursor *pCur);/*** Routines for byte swapping.*/u16 swab16(u16 x){ return ((x & 0xff)<<8) | ((x>>8)&0xff);}u32 swab32(u32 x){ return ((x & 0xff)<<24) | ((x & 0xff00)<<8) | ((x>>8) & 0xff00) | ((x>>24)&0xff);}/*** Compute the total number of bytes that a Cell needs on the main** database page. The number returned includes the Cell header,** local payload storage, and the pointer to overflow pages (if** applicable). Additional space allocated on overflow pages** is NOT included in the value returned from this routine.*/static int cellSize(Btree *pBt, Cell *pCell){ int n = NKEY(pBt, pCell->h) + NDATA(pBt, pCell->h); if( n>MX_LOCAL_PAYLOAD ){ n = MX_LOCAL_PAYLOAD + sizeof(Pgno); }else{ n = ROUNDUP(n); } n += sizeof(CellHdr); return n;}/*** Defragment the page given. All Cells are moved to the** beginning of the page and all free space is collected
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -