📄 btree.c
字号:
/*
** This file implements a external (disk-based) database using BTrees.
** For a detailed discussion of BTrees, refer to
**
** Donald E. Knuth, THE ART OF COMPUTER PROGRAMMING, Volume 3:
** "Sorting And Searching", pages 473-480. Addison-Wesley
** Publishing Company, Reading, Massachusetts.
**
** The basic idea is that each page of the file contains N database
** entries and N+1 pointers to subpages.
**
** ----------------------------------------------------------------
** | Ptr(0) | Key(0) | Ptr(1) | Key(1) | ... | Key(N) | Ptr(N+1) |
** ----------------------------------------------------------------
**
** All of the keys on the page that Ptr(0) points to have values less
** than Key(0). All of the keys on page Ptr(1) and its subpages have
** values greater than Key(0) and less than Key(1). All of the keys
** on Ptr(N+1) and its subpages have values greater than Key(N). And
** so forth.
**
** Finding a particular key requires reading O(log(M)) pages from the
** disk where M is the number of entries in the tree.
**
** In this implementation, a single file can hold one or more separate
** BTrees. Each BTree is identified by the index of its root page. The
** key and data for any entry are combined to form the "payload". Up to
** MX_LOCAL_PAYLOAD bytes of payload can be carried directly on the
** database page. If the payload is larger than MX_LOCAL_PAYLOAD bytes
** then surplus bytes are stored on overflow pages. The payload for an
** entry and the preceding pointer are combined to form a "Cell". Each
** page has a small header which contains the Ptr(N+1) pointer.
**
** The first page of the file contains a magic string used to verify that
** the file really is a valid BTree database, a pointer to a list of unused
** pages in the file, and some meta information. The root of the first
** BTree begins on page 2 of the file. (Pages are numbered beginning with
** 1, not 0.) Thus a minimum database contains 2 pages.
*/
#include "eDbInit.h"
/* Forward declarations */
static BtOps eDbBtreeOps;
static BtCursorOps eDbBtreeCursorOps;
/*
** Macros used for byteswapping. B is a pointer to the Btree
** structure. This is needed to access the Btree.needSwab boolean
** in order to tell if byte swapping is needed or not.
** X is an unsigned integer. SWAB16 byte swaps a 16-bit integer.
** SWAB32 byteswaps a 32-bit integer.
*/
#define SWAB16(B,X) ((B)->needSwab? swab16((u16)X) : ((u16)X))
#define SWAB32(B,X) ((B)->needSwab? swab32(X) : (X))
#define SWAB_ADD(B,X,A) \
if((B)->needSwab){ X=swab32(swab32(X)+A); }else{ X += (A); }
/*
** The following global variable - available only if eDb_TEST is
** defined - is used to determine whether new databases are created in
** native byte order or in non-native byte order. Non-native byte order
** databases are created for testing purposes only. Under normal operation,
** only native byte-order databases should be created, but we should be
** able to read or write existing databases regardless of the byteorder.
*/
#ifdef eDb_TEST
int btree_native_byte_order = 1;
#else
# define btree_native_byte_order 1
#endif
/*
** Forward declarations of structures used only in this file.
*/
typedef struct PageOne PageOne;
typedef struct MemPage MemPage;
typedef struct PageHdr PageHdr;
typedef struct Cell Cell;
typedef struct CellHdr CellHdr;
typedef struct FreeBlk FreeBlk;
typedef struct OverflowPage OverflowPage;
typedef struct FreelistInfo FreelistInfo;
/*
** All structures on a database page are aligned to 4-byte boundries.
** This routine rounds up a number of bytes to the next multiple of 4.
**
** This might need to change for computer architectures that require
** and 8-byte alignment boundry for structures.
*/
#define ROUNDUP(X) ((X+3) & ~3)
/*
** This is a magic string that appears at the beginning of every
** eDb database in order to identify the file as a real database.
*/
static const char zMagicHeader[] =
"** This file contains an eDb 1.0 database **";
#define MAGIC_SIZE (sizeof(zMagicHeader))
/*
** This is a magic integer also used to test the integrity of the database
** file. This integer is used in addition to the string above so that
** if the file is written on a little-endian architecture and read
** on a big-endian architectures (or vice versa) we can detect the
** problem.
**
** The number used was obtained at random and has no special
** significance other than the fact that it represents a different
** integer on little-endian and big-endian machines.
*/
#define MAGIC 0xdae37528
/*
** The first page of the database file contains a magic header string
** to identify the file as an eDb database file. It also contains
** a pointer to the first free page of the file. Page 2 contains the
** root of the principle BTree. The file might contain other BTrees
** rooted on pages above 2.
**
** The first page also contains eDb_N_BTREE_META integers that
** can be used by higher-level routines.
**
** Remember that pages are numbered beginning with 1. (See pager.c
** for additional information.) Page 0 does not exist and a page
** number of 0 is used to mean "no such page".
*/
struct PageOne {
char zMagic[MAGIC_SIZE]; /* String that identifies the file as a database */
int iMagic; /* Integer to verify correct byte order */
Pgno freeList; /* First free page in a list of all free pages */
int nFree; /* Number of pages on the free list */
int aMeta[eDb_N_BTREE_META-1]; /* User defined integers */
};
/*
** Each database page has a header that is an instance of this
** structure.
**
** PageHdr.firstFree is 0 if there is no free space on this page.
** Otherwise, PageHdr.firstFree is the index in MemPage.u.aDisk[] of a
** FreeBlk structure that describes the first block of free space.
** All free space is defined by a linked list of FreeBlk structures.
**
** Data is stored in a linked list of Cell structures. PageHdr.firstCell
** is the index into MemPage.u.aDisk[] of the first cell on the page. The
** Cells are kept in sorted order.
**
** A Cell contains all information about a database entry and a pointer
** to a child page that contains other entries less than itself. In
** other words, the i-th Cell contains both Ptr(i) and Key(i). The
** right-most pointer of the page is contained in PageHdr.rightChild.
*/
struct PageHdr {
Pgno rightChild; /* Child page that comes after all cells on this page */
u16 firstCell; /* Index in MemPage.u.aDisk[] of the first cell */
u16 firstFree; /* Index in MemPage.u.aDisk[] of the first free block */
};
/*
** Entries on a page of the database are called "Cells". Each Cell
** has a header and data. This structure defines the header. The
** key and data (collectively the "payload") follow this header on
** the database page.
**
** A definition of the complete Cell structure is given below. The
** header for the cell must be defined first in order to do some
** of the sizing #defines that follow.
*/
struct CellHdr {
Pgno leftChild; /* Child page that comes before this cell */
u16 nKey; /* Number of bytes in the key */
u16 iNext; /* Index in MemPage.u.aDisk[] of next cell in sorted order */
u8 nKeyHi; /* Upper 8 bits of key size for keys larger than 64K bytes */
u8 nDataHi; /* Upper 8 bits of data size when the size is more than 64K */
u16 nData; /* Number of bytes of data */
};
/*
** The key and data size are split into a lower 16-bit segment and an
** upper 8-bit segment in order to pack them together into a smaller
** space. The following macros reassembly a key or data size back
** into an integer.
*/
#define NKEY(b,h) (SWAB16(b,h.nKey) + h.nKeyHi*65536)
#define NDATA(b,h) (SWAB16(b,h.nData) + h.nDataHi*65536)
/*
** The minimum size of a complete Cell. The Cell must contain a header
** and at least 4 bytes of payload.
*/
#define MIN_CELL_SIZE (sizeof(CellHdr)+4)
/*
** The maximum number of database entries that can be held in a single
** page of the database.
*/
#define MX_CELL ((eDb_USABLE_SIZE-sizeof(PageHdr))/MIN_CELL_SIZE)
/*
** The amount of usable space on a single page of the BTree. This is the
** page size minus the overhead of the page header.
*/
#define USABLE_SPACE (eDb_USABLE_SIZE - sizeof(PageHdr))
/*
** The maximum amount of payload (in bytes) that can be stored locally for
** a database entry. If the entry contains more data than this, the
** extra goes onto overflow pages.
**
** This number is chosen so that at least 4 cells will fit on every page.
*/
#define MX_LOCAL_PAYLOAD ((USABLE_SPACE/4-(sizeof(CellHdr)+sizeof(Pgno)))&~3)
/*
** Data on a database page is stored as a linked list of Cell structures.
** Both the key and the data are stored in aPayload[]. The key always comes
** first. The aPayload[] field grows as necessary to hold the key and data,
** up to a maximum of MX_LOCAL_PAYLOAD bytes. If the size of the key and
** data combined exceeds MX_LOCAL_PAYLOAD bytes, then Cell.ovfl is the
** page number of the first overflow page.
**
** Though this structure is fixed in size, the Cell on the database
** page varies in size. Every cell has a CellHdr and at least 4 bytes
** of payload space. Additional payload bytes (up to the maximum of
** MX_LOCAL_PAYLOAD) and the Cell.ovfl value are allocated only as
** needed.
*/
struct Cell {
CellHdr h; /* The cell header */
char aPayload[MX_LOCAL_PAYLOAD]; /* Key and data */
Pgno ovfl; /* The first overflow page */
};
/*
** Free space on a page is remembered using a linked list of the FreeBlk
** structures. Space on a database page is allocated in increments of
** at least 4 bytes and is always aligned to a 4-byte boundry. The
** linked list of FreeBlks is always kept in order by address.
*/
struct FreeBlk {
u16 iSize; /* Number of bytes in this block of free space */
u16 iNext; /* Index in MemPage.u.aDisk[] of the next free block */
};
/*
** The number of bytes of payload that will fit on a single overflow page.
*/
#define OVERFLOW_SIZE (eDb_USABLE_SIZE-sizeof(Pgno))
/*
** When the key and data for a single entry in the BTree will not fit in
** the MX_LOCAL_PAYLOAD bytes of space available on the database page,
** then all extra bytes are written to a linked list of overflow pages.
** Each overflow page is an instance of the following structure.
**
** Unused pages in the database are also represented by instances of
** the OverflowPage structure. The PageOne.freeList field is the
** page number of the first page in a linked list of unused database
** pages.
*/
struct OverflowPage {
Pgno iNext;
char aPayload[OVERFLOW_SIZE];
};
/*
** The PageOne.freeList field points to a linked list of overflow pages
** hold information about free pages. The aPayload section of each
** overflow page contains an instance of the following structure. The
** aFree[] array holds the page number of nFree unused pages in the disk
** file.
*/
struct FreelistInfo {
int nFree;
Pgno aFree[(OVERFLOW_SIZE-sizeof(int))/sizeof(Pgno)];
};
/*
** For every page in the database file, an instance of the following structure
** is stored in memory. The u.aDisk[] array contains the raw bits read from
** the disk. The rest is auxiliary information held in memory only. The
** auxiliary info is only valid for regular database pages - it is not
** used for overflow pages and pages on the freelist.
**
** Of particular interest in the auxiliary info is the apCell[] entry. Each
** apCell[] entry is a pointer to a Cell structure in u.aDisk[]. The cells are
** put in this array so that they can be accessed in constant time, rather
** than in linear time which would be needed if we had to walk the linked
** list on every access.
**
** Note that apCell[] contains enough space to hold up to two more Cells
** than can possibly fit on one page. In the steady state, every apCell[]
** points to memory inside u.aDisk[]. But in the middle of an insert
** operation, some apCell[] entries may temporarily point to data space
** outside of u.aDisk[]. This is a transient situation that is quickly
** resolved. But while it is happening, it is possible for a database
** page to hold as many as two more cells than it might otherwise hold.
** The extra two entries in apCell[] are an allowance for this situation.
**
** The pParent field points back to the parent page. This allows us to
** walk up the BTree from any leaf to the root. Care must be taken to
** unref() the parent page pointer when this page is no longer referenced.
** The pageDestructor() routine handles that chore.
*/
struct MemPage {
union u_page_data {
char aDisk[eDb_PAGE_SIZE]; /* Page data stored on disk */
PageHdr hdr; /* Overlay page header */
} u;
u8 isInit; /* True if auxiliary data is initialized */
u8 idxShift; /* True if apCell[] indices have changed */
u8 isOverfull; /* Some apCell[] points outside u.aDisk[] */
MemPage *pParent; /* The parent of this page. NULL for root */
int idxParent; /* Index in pParent->apCell[] of this node */
int nFree; /* Number of free bytes in u.aDisk[] */
int nCell; /* Number of entries on this page */
Cell *apCell[MX_CELL+2]; /* All data entires in sorted order */
};
/*
** The in-memory image of a disk page has the auxiliary information appended
** to the end. EXTRA_SIZE is the number of bytes of space needed to hold
** that extra information.
*/
#define EXTRA_SIZE (sizeof(MemPage)-sizeof(union u_page_data))
/*
** Everything we need to know about an open database
*/
struct Btree {
BtOps *pOps; /* Function table */
Pager *pPager; /* The page cache */
BtCursor *pCursor; /* A list of all open cursors */
PageOne *page1; /* First page of the database */
u8 inTrans; /* True if a transaction is in progress */
u8 inCkpt; /* True if there is a checkpoint on the transaction */
u8 readOnly; /* True if the underlying file is readonly */
u8 needSwab; /* Need to byte-swapping */
};
typedef Btree Bt;
/*
** A cursor is a pointer to a particular entry in the BTree.
** The entry is identified by its MemPage and the index in
** MemPage.apCell[] of the entry.
*/
struct BtCursor {
BtCursorOps *pOps; /* Function table */
Btree *pBt; /* The Btree to which this cursor belongs */
BtCursor *pNext, *pPrev; /* Forms a linked list of all cursors */
BtCursor *pShared; /* Loop of cursors with the same root page */
Pgno pgnoRoot; /* The root page of this tree */
MemPage *pPage; /* Page that contains the entry */
int idx; /* Index of the entry in pPage->apCell[] */
u8 wrFlag; /* True if writable */
u8 eSkip; /* Determines if next step operation is a no-op */
u8 iMatch; /* compare result from last eDbBtreeMoveto() */
};
/*
** Legal values for BtCursor.eSkip.
*/
#define SKIP_NONE 0 /* Always step the cursor */
#define SKIP_NEXT 1 /* The next eDbBtreeNext() is a no-op */
#define SKIP_PREV 2 /* The next eDbBtreePrevious() is a no-op */
#define SKIP_INVALID 3 /* Calls to Next() and Previous() are invalid */
/* Forward declarations */
static int fileBtreeCloseCursor(BtCursor *pCur);
/*
** Routines for byte swapping.
*/
u16 swab16(u16 x){
return ((x & 0xff)<<8) | ((x>>8)&0xff);
}
u32 swab32(u32 x){
return ((x & 0xff)<<24) | ((x & 0xff00)<<8) |
((x>>8) & 0xff00) | ((x>>24)&0xff);
}
/*
** Compute the total number of bytes that a Cell needs on the main
** database page. The number returned includes the Cell header,
** local payload storage, and the pointer to overflow pages (if
** applicable). Additional space allocated on overflow pages
** is NOT included in the value returned from this routine.
*/
static int cellSize(Btree *pBt, Cell *pCell){
int n = NKEY(pBt, pCell->h) + NDATA(pBt, pCell->h);
if( n>MX_LOCAL_PAYLOAD ){
n = MX_LOCAL_PAYLOAD + sizeof(Pgno);
}else{
n = ROUNDUP(n);
}
n += sizeof(CellHdr);
return n;
}
/*
** Defragment the page given. All Cells are moved to the
** beginning of the page and all free space is collected
** into one big FreeBlk at the end of the page.
*/
static void defragmentPage(Btree *pBt, MemPage *pPage){
int pc, i, n;
FreeBlk *pFBlk;
char newPage[eDb_USABLE_SIZE];
assert( eDbpager_iswriteable(pPage) );
assert( pPage->isInit );
pc = sizeof(PageHdr);
pPage->u.hdr.firstCell = SWAB16(pBt, pc);
memcpy(newPage, pPage->u.aDisk, pc);
for(i=0; i<pPage->nCell; i++){
Cell *pCell = pPage->apCell[i];
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -