📄 pager.c
字号:
/*
** This is the implementation of the page cache subsystem or "pager".
**
** The pager is used to access a database disk file. It implements
** atomic commit and rollback through the use of a journal file that
** is separate from the database file. The pager also implements file
** locking to prevent two processes from writing the same database
** file simultaneously, or one process from reading the database while
** another is writing.
*/
#include "eDbInit.h"
/*
** Macros for troubleshooting. Normally turned off.
**
** If the "#if 0" below is changed to "#if 1":
**
**   SET_PAGER(X)  remembers X in mainPager, but only if no pager is
**                 currently remembered.
**   CLR_PAGER(X)  forgets the remembered pager if it is X.
**   TRACEn(...)   prints to stderr using fprintf(), but only when the
**                 variable named "pPager" at the point of use equals
**                 mainPager.  X is the format string; Y and Z are its
**                 arguments.
**
** In the default (disabled) configuration every macro expands to nothing.
**
** NOTE(review): the enabled forms are bare "if" statements, so placing one
** directly in front of an "else" would capture that "else".
*/
#if 0
static Pager *mainPager = 0;
#define SET_PAGER(X) if( mainPager==0 ) mainPager = (X)
#define CLR_PAGER(X) if( mainPager==(X) ) mainPager = 0
#define TRACE1(X) if( pPager==mainPager ) fprintf(stderr,X)
#define TRACE2(X,Y) if( pPager==mainPager ) fprintf(stderr,X,Y)
#define TRACE3(X,Y,Z) if( pPager==mainPager ) fprintf(stderr,X,Y,Z)
#else
#define SET_PAGER(X)
#define CLR_PAGER(X)
#define TRACE1(X)
#define TRACE2(X,Y)
#define TRACE3(X,Y,Z)
#endif
/*
** The page cache as a whole is always in one of the following
** states:
**
** eDb_UNLOCK The page cache is not currently reading or
** writing the database file. There is no
** data held in memory. This is the initial
** state.
**
** eDb_READLOCK The page cache is reading the database.
** Writing is not permitted. There can be
** multiple readers accessing the same database
** file at the same time.
**
** eDb_WRITELOCK The page cache is writing the database.
** Access is exclusive. No other processes or
** threads can be reading or writing while one
** process is writing.
**
** The page cache comes up in eDb_UNLOCK. The first time a
** eDb_page_get() occurs, the state transitions to eDb_READLOCK.
** After all pages have been released using eDb_page_unref(),
** the state transitions back to eDb_UNLOCK. The first time
** that eDb_page_write() is called, the state transitions to
** eDb_WRITELOCK. (Note that eDb_page_write() can only be
** called on an outstanding page which means that the pager must
** be in eDb_READLOCK before it transitions to eDb_WRITELOCK.)
** The eDb_page_rollback() and eDb_page_commit() functions
** transition the state from eDb_WRITELOCK back to eDb_READLOCK.
*/
#define eDb_UNLOCK 0 /* Not reading or writing; no data held in memory */
#define eDb_READLOCK 1 /* Reading the database; writing is not permitted */
#define eDb_WRITELOCK 2 /* Writing the database; access is exclusive */
/*
** Each in-memory image of a page begins with the following header.
** This header is only visible to this pager module. The client
** code that calls pager sees only the data that follows the header.
**
** Client code should call eDbpager_write() on a page prior to making
** any modifications to that page. The first time eDbpager_write()
** is called, the original page contents are written into the rollback
** journal and PgHdr.inJournal and PgHdr.needSync are set. Later, once
** the journal page has made it onto the disk surface, PgHdr.needSync
** is cleared. The modified page cannot be written back into the original
** database file until the journal pages have been synced to disk and
** PgHdr.needSync has been cleared.
**
** The PgHdr.dirty flag is set when eDbpager_write() is called and
** is cleared again when the page content is written back to the original
** database file.
*/
typedef struct PgHdr PgHdr;
/*
** Per-page bookkeeping header.  The page content is not a member of this
** structure: it is located immediately after the header in memory (see
** PGHDR_TO_DATA(), which yields the address of the byte just past the
** header), followed by the per-page extra area.
*/
struct PgHdr {
Pager *pPager; /* The pager to which this page belongs */
Pgno pgno; /* The page number for this page */
PgHdr *pNextHash, *pPrevHash; /* Hash collision chain for PgHdr.pgno */
int nRef; /* Number of users of this page */
PgHdr *pNextFree, *pPrevFree; /* Freelist of pages where nRef==0 */
PgHdr *pNextAll, *pPrevAll; /* A list of all pages */
PgHdr *pNextCkpt, *pPrevCkpt; /* List of pages in the checkpoint journal */
u8 inJournal; /* TRUE if has been written to journal */
u8 inCkpt; /* TRUE if written to the checkpoint journal */
u8 dirty; /* TRUE if we need to write back changes */
u8 needSync; /* Sync journal before writing this page */
u8 alwaysRollback; /* Disable dont_rollback() for this page */
/* NOTE(review): pDirty is deliberately the last member.  The comment on
** store32bits() says that routine writes "right before the page data" and
** overwrites PgHdr.pDirty, so presumably no member may be added after it
** and it must not be moved -- confirm before changing the layout. */
PgHdr *pDirty; /* Dirty pages sorted by PgHdr.pgno */
/* eDb_PAGE_SIZE bytes of page data follow this header */
/* Pager.nExtra bytes of local data follow the page data */
};
/*
** A macro used for invoking the codec if there is one.
**
**   P  Pointer to the Pager whose xCodec callback should be used.
**   D  Pointer to the data buffer handed to the codec.
**   N  Page number handed to the codec.
**   X  Integer passed through unchanged as the last codec argument.
**
** When eDb_HAS_CODEC is not defined the macro expands to nothing.  When it
** is defined, the callback is invoked only if P->xCodec is non-NULL, with
** P->pCodecArg as its first argument.
**
** Every parameter is parenthesized in the expansion so that expression
** arguments (for example "&pager" or "p+1") parse as intended.  P is
** evaluated more than once, so it must not have side effects.
*/
#ifdef eDb_HAS_CODEC
# define CODEC(P,D,N,X) \
   if( (P)->xCodec ){ (P)->xCodec((P)->pCodecArg,(D),(N),(X)); }
#else
# define CODEC(P,D,N,X)
#endif
/*
** Convert a pointer to a PgHdr into a pointer to its data
** and back again.
*/
/* Address of the first byte after header P, i.e. the start of page data. */
#define PGHDR_TO_DATA(P) ((void*)(&(P)[1]))
/* Inverse of PGHDR_TO_DATA(): step back one PgHdr from data pointer D. */
#define DATA_TO_PGHDR(D) (&((PgHdr*)(D))[-1])
/* Address eDb_PAGE_SIZE bytes past the start of the data for header P,
** which is where the per-page extra area begins. */
#define PGHDR_TO_EXTRA(P) ((void*)&((char*)(&(P)[1]))[eDb_PAGE_SIZE])
/*
** How big to make the hash table used for locating in-memory pages
** by page number.  This is the number of buckets in Pager.aHash[].
*/
#define N_PG_HASH MAX_PAGES
/*
** Hash a page number: keep only the bits of PN selected by the mask
** (N_PG_HASH-1).
**
** NOTE(review): the mask only reaches every bucket when N_PG_HASH is a
** power of two.  MAX_PAGES is defined outside this file section -- confirm
** that it satisfies this.
*/
#define pager_hash(PN) ((PN)&(N_PG_HASH-1))
/*
** An open page cache is an instance of the following structure.
**
** The hash table aHash[] is embedded directly in the structure, so the
** size of a Pager grows with N_PG_HASH.
*/
struct Pager {
char *zFilename; /* Name of the database file */
char *zJournal; /* Name of the journal file */
char *zDirectory; /* Directory holding the database and journal files */
OsFile fd, jfd; /* File descriptors for database and journal */
OsFile cpfd; /* File descriptor for the checkpoint journal */
int dbSize; /* Number of pages in the file */
int origDbSize; /* dbSize before the current change */
int ckptSize; /* Size of database (in pages) at ckpt_begin() */
off_t ckptJSize; /* Size of journal at ckpt_begin() */
int nRec; /* Number of pages written to the journal */
u32 cksumInit; /* Quasi-random value added to every checksum */
int ckptNRec; /* Number of records in the checkpoint journal */
int nExtra; /* Add this many bytes to each in-memory page */
void (*xDestructor)(void*); /* Call this routine when freeing pages */
int nPage; /* Total number of in-memory pages */
int nRef; /* Number of in-memory pages with PgHdr.nRef>0 */
int mxPage; /* Maximum number of pages to hold in cache */
int nHit, nMiss, nOvfl; /* Cache hits, misses, and LRU overflows */
void (*xCodec)(void*,void*,Pgno,int); /* Routine for en/decoding data */
void *pCodecArg; /* First argument to xCodec() */
u8 journalOpen; /* True if the journal file descriptor is valid */
u8 journalStarted; /* True if header of journal is synced */
u8 useJournal; /* Use a rollback journal on this file */
u8 ckptOpen; /* True if the checkpoint journal is open */
u8 ckptInUse; /* True if we are in a checkpoint */
u8 ckptAutoopen; /* Open ckpt journal when main journal is opened */
u8 noSync; /* Do not sync the journal if true */
u8 fullSync; /* Do extra syncs of the journal for robustness */
u8 state; /* eDb_UNLOCK, _READLOCK or _WRITELOCK */
u8 errMask; /* Bitwise OR of PAGER_ERR_* flags */
u8 tempFile; /* zFilename is a temporary file */
u8 readOnly; /* True for a read-only database */
u8 needSync; /* True if an fsync() is needed on the journal */
u8 dirtyFile; /* True if database file has changed in any way */
u8 alwaysRollback; /* Disable dont_rollback() for all pages */
u8 *aInJournal; /* One bit for each page in the database file */
u8 *aInCkpt; /* One bit for each page in the database */
PgHdr *pFirst, *pLast; /* List of free pages */
PgHdr *pFirstSynced; /* First free page with PgHdr.needSync==0 */
PgHdr *pAll; /* List of all pages */
PgHdr *pCkpt; /* List of pages in the checkpoint journal */
PgHdr *aHash[N_PG_HASH]; /* Hash table to map page number to PgHdr */
};
/*
** These are bits that can be set in Pager.errMask.  Each constant is a
** distinct bit, so several may be set at the same time.  pager_errcode()
** converts the mask into a single return code; when more than one bit is
** set it reports, in order of preference: CORRUPT, MEM, FULL, DISK, LOCK.
*/
#define PAGER_ERR_FULL 0x01 /* a write() failed */
#define PAGER_ERR_MEM 0x02 /* malloc() failed */
#define PAGER_ERR_LOCK 0x04 /* error in the locking protocol */
#define PAGER_ERR_CORRUPT 0x08 /* database or journal corruption */
#define PAGER_ERR_DISK 0x10 /* general disk I/O error - bad hard drive? */
/*
** The journal file contains page records in the following
** format.
**
** Actually, this structure is the complete page record for pager
** formats less than 3. Beginning with format 3, this record is surrounded
** by two checksums.
*/
typedef struct PageRecord PageRecord;
/* One journal entry: a page number followed by that page's saved content. */
struct PageRecord {
Pgno pgno; /* The page number */
char aData[eDb_PAGE_SIZE]; /* Original data for page pgno */
};
/*
** Journal files begin with the following magic string. The data
** was obtained from /dev/random. It is used only as a sanity check.
**
** There are three journal formats (so far). The 1st journal format writes
** 32-bit integers in the byte-order of the host machine. Newer
** formats write integers as big-endian. All new journals use the
** new format, but we have to be able to read an older journal in order
** to rollback journals created by older versions of the library.
**
** The 3rd journal format (added for 2.8.0) adds additional sanity
** checking information to the journal. If the power fails while the
** journal is being written, semi-random garbage data might appear in
** the journal file after power is restored. If an attempt is then made
** to roll the journal back, the database could be corrupted. The additional
** sanity checking data is an attempt to discover the garbage in the
** journal and ignore it.
**
** The sanity checking information for the 3rd journal format consists
** of a 32-bit checksum on each page of data. The checksum covers both
** the page number and the eDb_PAGE_SIZE bytes of data for the page.
** This cksum is initialized to a 32-bit random value that appears in the
** journal file right after the header. The random initializer is important,
** because garbage data that appears at the end of a journal is likely
** data that was once in other files that have now been deleted. If the
** garbage data came from an obsolete journal file, the checksums might
** be correct. But by initializing the checksum to random value which
** is different for every journal, we minimize that risk.
*/
/*
** The three magic strings are all 8 bytes long and differ only in their
** final byte (0xd4, 0xd5, 0xd6).
** NOTE(review): presumably aJournalMagicN marks a journal written in
** JOURNAL_FORMAT_N; the code that compares them is not in this section.
*/
static const unsigned char aJournalMagic1[] = {
0xd9, 0xd5, 0x05, 0xf9, 0x20, 0xa1, 0x63, 0xd4,
};
static const unsigned char aJournalMagic2[] = {
0xd9, 0xd5, 0x05, 0xf9, 0x20, 0xa1, 0x63, 0xd5,
};
static const unsigned char aJournalMagic3[] = {
0xd9, 0xd5, 0x05, 0xf9, 0x20, 0xa1, 0x63, 0xd6,
};
/* Numeric identifiers for the three journal formats described above. */
#define JOURNAL_FORMAT_1 1
#define JOURNAL_FORMAT_2 2
#define JOURNAL_FORMAT_3 3
/*
** The following integer determines what format to use when creating
** new primary journal files. By default we always use format 3.
** When testing, we can set this value to older journal formats in order to
** make sure that newer versions of the library are able to rollback older
** journal files.
**
** Note that checkpoint journals always use format 2 and omit the header.
*/
#ifdef eDb_TEST
/* Test builds: a real, externally visible variable so that it can be set
** to an older journal format at run time. */
int journal_format = 3;
#else
/* Normal builds: a compile-time constant, so comparisons against it are
** constant expressions. */
# define journal_format 3
#endif
/*
** The size of the header and of each page in the journal varies according
** to which journal format is being used. The following macros figure out
** the sizes based on format numbers.
*/
/* Journal header size for format X: the magic string plus one Pgno, plus
** two additional u32 values when X is 3 or greater.  sizeof(aJournalMagic1)
** is used for every format; all three magic strings have the same length. */
#define JOURNAL_HDR_SZ(X) \
(sizeof(aJournalMagic1) + sizeof(Pgno) + ((X)>=3)*2*sizeof(u32))
/* Size of one page entry in the journal for format X: the page data plus
** its Pgno, plus one u32 (the checksum) when X is 3 or greater. */
#define JOURNAL_PG_SZ(X) \
(eDb_PAGE_SIZE + sizeof(Pgno) + ((X)>=3)*sizeof(u32))
/*
** Enable reference count tracking here:
**
** This facility exists only when eDb_TEST is defined.  Setting
** pager_refinfo_enable to a non-zero value makes every REFINFO(p) call
** print the page number, the address of the page data, and the current
** reference count of page p to standard output.  Without eDb_TEST,
** REFINFO() expands to nothing.
**
** The data address is printed with %p.  The previous code cast the pointer
** to int and printed it with %x, which discards the upper half of the
** address on platforms where pointers are wider than int.
*/
#ifdef eDb_TEST
int pager_refinfo_enable = 0;
static void pager_refinfo(PgHdr *p){
  static int cnt = 0;
  if( !pager_refinfo_enable ) return;
  printf(
    "REFCNT: %4d addr=%p nRef=%d\n",
    p->pgno, PGHDR_TO_DATA(p), p->nRef
  );
  cnt++; /* Something to set a breakpoint on */
}
# define REFINFO(X) pager_refinfo(X)
#else
# define REFINFO(X)
#endif
/*
** Read a 32-bit integer from the given file descriptor. Store the integer
** that is read in *pRes. Return eDb_OK if everything worked, or an
** error code if something goes wrong.
**
** If the journal format is 2 or 3, read a big-endian integer. If the
** journal format is 1, read an integer in the native byte-order of the
** host machine.
**
** Parameters:
**   format  Journal format number (JOURNAL_FORMAT_1..3) of the file.
**   fd      File to read from; 4 bytes are consumed at its current position.
**   pRes    OUT: the decoded value.  Always written, but only meaningful
**           when the return value is eDb_OK.
**
** The local buffer starts at zero so that *pRes never receives an
** uninitialized value when the read fails.  Each byte is widened to u32
** before shifting: shifting a promoted (signed) int left by 24 is
** undefined when the byte is 0x80 or larger.
*/
static int read32bits(int format, OsFile *fd, u32 *pRes){
  u32 res = 0;
  int rc;
  rc = eDbOsRead(fd, &res, sizeof(res));
  if( rc==eDb_OK && format>JOURNAL_FORMAT_1 ){
    unsigned char ac[4];
    memcpy(ac, &res, 4);
    res = ((u32)ac[0]<<24) | ((u32)ac[1]<<16) | ((u32)ac[2]<<8) | (u32)ac[3];
  }
  *pRes = res;
  return rc;
}
/*
** Append one 32-bit integer to the file fd and return whatever
** eDbOsWrite() returns (eDb_OK on success, an error code otherwise).
**
** The encoding depends on journal_format: for format 1 (or lower) the
** four bytes of val are written exactly as they sit in memory; for every
** later format val is serialized most-significant byte first.  In normal
** operation only formats 2 and 3 are used; format 1 exists for testing.
*/
static int write32bits(OsFile *fd, u32 val){
  unsigned char aBuf[4];
  u32 rest;
  int i;

  if( journal_format<=1 ){
    /* Native byte order: hand the value to the OS layer untouched. */
    return eDbOsWrite(fd, &val, 4);
  }

  /* Big-endian: peel bytes off the low end and fill the buffer backwards. */
  rest = val;
  for(i=3; i>=0; i--){
    aBuf[i] = rest & 0xff;
    rest >>= 8;
  }
  return eDbOsWrite(fd, aBuf, 4);
}
/*
** Store the 32-bit integer val in memory at "offset" bytes from the start
** of the data area of page p (the address given by PGHDR_TO_DATA(p)).
**
** This routine is used to write a 32-bit integer into a page header right
** before the page data, which overwrites the PgHdr.pDirty pointer.
**
** Byte order follows journal_format: native byte order for format 1 (or
** lower), big-endian for formats 2 and 3.
*/
static void store32bits(u32 val, PgHdr *p, int offset){
  unsigned char *zDest = (unsigned char*)PGHDR_TO_DATA(p) + offset;

  if( journal_format<=1 ){
    /* Native layout: copy the in-memory representation of val. */
    memcpy(zDest, &val, 4);
    return;
  }

  /* Big-endian layout, written one byte at a time. */
  zDest[3] = val & 0xff;
  zDest[2] = (val>>8) & 0xff;
  zDest[1] = (val>>16) & 0xff;
  zDest[0] = (val>>24) & 0xff;
}
/*
** Translate the error bits in pPager->errMask into a single return code.
**
** Returns eDb_OK when no recognised bit is set.  When several bits are
** set, the code reported is chosen in this order of preference:
**
**   PAGER_ERR_CORRUPT -> eDb_CORRUPT
**   PAGER_ERR_MEM     -> eDb_NOMEM
**   PAGER_ERR_FULL    -> eDb_FULL
**   PAGER_ERR_DISK    -> eDb_IOERR
**   PAGER_ERR_LOCK    -> eDb_PROTOCOL
*/
static int pager_errcode(Pager *pPager){
  const int mask = pPager->errMask;

  if( mask & PAGER_ERR_CORRUPT ){
    return eDb_CORRUPT;
  }
  if( mask & PAGER_ERR_MEM ){
    return eDb_NOMEM;
  }
  if( mask & PAGER_ERR_FULL ){
    return eDb_FULL;
  }
  if( mask & PAGER_ERR_DISK ){
    return eDb_IOERR;
  }
  if( mask & PAGER_ERR_LOCK ){
    return eDb_PROTOCOL;
  }
  return eDb_OK;
}
/*
** Add or remove a page from the list of all pages that are in the
** checkpoint journal.
**
** The Pager keeps a separate, doubly-linked list (Pager.pCkpt, threaded
** through PgHdr.pNextCkpt / PgHdr.pPrevCkpt) of the pages currently in
** the checkpoint journal. This helps the eDbpager_ckpt_commit()
** routine run MUCH faster for the common case where there are many
** pages in memory but only a few are in the checkpoint journal.
**
** page_add_to_ckpt_list() pushes pPg onto the front of that list and sets
** pPg->inCkpt.  It does nothing if pPg->inCkpt is already set.
*/
static void page_add_to_ckpt_list(PgHdr *pPg){
  Pager *pOwner;
  PgHdr *pOldHead;

  if( pPg->inCkpt ){
    return;   /* Already on the list */
  }
  assert( pPg->pPrevCkpt==0 && pPg->pNextCkpt==0 );

  pOwner = pPg->pPager;
  pOldHead = pOwner->pCkpt;

  /* Link the page in ahead of the current head (if there is one). */
  pPg->pPrevCkpt = 0;
  pPg->pNextCkpt = pOldHead;
  if( pOldHead ){
    pOldHead->pPrevCkpt = pPg;
  }
  pOwner->pCkpt = pPg;
  pPg->inCkpt = 1;
}
static void page_remove_from_ckpt_list(PgHdr *pPg){
if( !pPg->inCkpt ) return;
if( pPg->pPrevCkpt ){
assert( pPg->pPrevCkpt->pNextCkpt==pPg );
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -