📄 pager.c
字号:
pPg->pPrevCkpt->pNextCkpt = pPg->pNextCkpt;
}else{
assert( pPg->pPager->pCkpt==pPg );
pPg->pPager->pCkpt = pPg->pNextCkpt;
}
if( pPg->pNextCkpt ){
assert( pPg->pNextCkpt->pPrevCkpt==pPg );
pPg->pNextCkpt->pPrevCkpt = pPg->pPrevCkpt;
}
pPg->pNextCkpt = 0;
pPg->pPrevCkpt = 0;
pPg->inCkpt = 0;
}
/*
** Search the pager's hash table for the page whose number is pgno.
** The bucket is chosen with pager_hash() and its collision chain is
** followed through the pNextHash links.
**
** Returns the matching page header, or NULL when no page with that
** number is present in the table.
*/
static PgHdr *pager_lookup(Pager *pPager, Pgno pgno){
  PgHdr *pHit;
  for(pHit=pPager->aHash[pager_hash(pgno)]; pHit; pHit=pHit->pNextHash){
    if( pHit->pgno==pgno ) break;
  }
  return pHit;
}
/*
** Discard every cached page, release the database lock, and put the
** pager back into the condition it was in when first opened.  If a
** write lock is held when this runs, eDbpager_rollback() is invoked
** before the lock is dropped.
**
** Every page header on the pAll list is freed here, so any pointer a
** caller still holds to one of those pages is invalid afterwards and
** using it will likely result in a coredump.
*/
static void pager_reset(Pager *pPager){
  PgHdr *pCur = pPager->pAll;
  while( pCur ){
    PgHdr *pFollow = pCur->pNextAll;   /* Grab the link before freeing */
    eDbFree(pCur);
    pCur = pFollow;
  }

  /* Forget every list head and hash bucket that pointed at freed pages */
  pPager->pAll = 0;
  pPager->pFirst = 0;
  pPager->pFirstSynced = 0;
  pPager->pLast = 0;
  pPager->nPage = 0;
  memset(pPager->aHash, 0, sizeof(pPager->aHash));

  if( pPager->state>=eDb_WRITELOCK ){
    eDbpager_rollback(pPager);
  }
  eDbOsUnlock(&pPager->fd);
  pPager->state = eDb_UNLOCK;
  pPager->dbSize = -1;
  pPager->nRef = 0;
  assert( pPager->journalOpen==0 );
}
/*
** Give up the write lock on the database and acquire a read lock in its
** place, closing and deleting the journal file along the way.
**
** If the pager holds less than a write lock this routine does nothing
** and returns eDb_OK.  Otherwise eDbpager_ckpt_commit() is called, the
** checkpoint file is closed if it is open, and an open journal is closed
** and deleted.  Once the journal is gone, every cached page has its
** inJournal, dirty and needSync flags cleared.
**
** The return value is the result of acquiring the read lock.  If that
** fails the pager is left in the eDb_UNLOCK state.
**
** TODO: Consider keeping the journal file open for temporary databases.
** This might give a performance improvement on windows where opening
** a file is an expensive operation.
*/
static int pager_unwritelock(Pager *pPager){
  int rc;

  if( pPager->state<eDb_WRITELOCK ){
    return eDb_OK;
  }
  eDbpager_ckpt_commit(pPager);
  if( pPager->ckptOpen ){
    eDbOsClose(&pPager->cpfd);
    pPager->ckptOpen = 0;
  }
  if( !pPager->journalOpen ){
    assert( pPager->dirtyFile==0 || pPager->useJournal==0 );
  }else{
    PgHdr *pCur;
    eDbOsClose(&pPager->jfd);
    pPager->journalOpen = 0;
    eDbOsDelete(pPager->zJournal);
    eDbFree( pPager->aInJournal );
    pPager->aInJournal = 0;
    pCur = pPager->pAll;
    while( pCur ){
      pCur->inJournal = 0;
      pCur->dirty = 0;
      pCur->needSync = 0;
      pCur = pCur->pNextAll;
    }
  }

  rc = eDbOsReadLock(&pPager->fd);
  /* A failure can only happen if a process does a BEGIN, then forks and
  ** the child process does the COMMIT.  Because of the semantics of unix
  ** file locking, the unlock will fail.  In that case the pager is
  ** recorded as holding no lock at all.
  */
  pPager->state = rc==eDb_OK ? eDb_READLOCK : eDb_UNLOCK;
  return rc;
}
/*
** Return the sanity-check value that accompanies a journal page record.
**
** Despite the name, this is not a checksum of the page contents.  The
** result is just the pager's initial value (pPager->cksumInit) plus the
** page number; the aData argument is accepted but never read.  Doing a
** real checksum of the data was considered, but that was found to be
** too slow.
*/
static u32 pager_cksum(Pager *pPager, Pgno pgno, const char *aData){
  return pPager->cksumInit + pgno;
}
/*
** Read a single page record from the journal file opened on file
** descriptor jfd and play it back into the database file.
**
** A record is a 4-byte page number followed by the page data and, in
** format 3 journals only, a 4-byte checksum.  There are three different
** journal formats; the format parameter says which one jfd uses.
**
** Return values:
**   eDb_OK    The page was written back to the database, or the record
**             names a page beyond the current database size and was
**             skipped.
**   eDb_DONE  The record has page number 0 or its checksum does not
**             match, so the record is ignored.
**   other     An error code from reading the journal, or from seeking
**             or writing the database file.
*/
static int pager_playback_one_page(Pager *pPager, OsFile *jfd, int format){
  int rc;
  PgHdr *pPg;              /* An existing page in the cache */
  PageRecord pgRec;        /* Page number and data read from the journal */
  u32 cksum;               /* Checksum stored in a format 3 record */

  rc = read32bits(format, jfd, &pgRec.pgno);
  if( rc!=eDb_OK ) return rc;
  rc = eDbOsRead(jfd, &pgRec.aData, sizeof(pgRec.aData));
  if( rc!=eDb_OK ) return rc;

  /* Sanity checking on the page.  This is more important than I originally
  ** thought.  If a power failure occurs while the journal is being written,
  ** it could cause invalid data to be written into the journal.  We need to
  ** detect this invalid data (with high probability) and ignore it.
  */
  if( pgRec.pgno==0 ){
    return eDb_DONE;
  }
  if( pgRec.pgno>(unsigned)pPager->dbSize ){
    return eDb_OK;
  }
  if( format>=JOURNAL_FORMAT_3 ){
    rc = read32bits(format, jfd, &cksum);
    if( rc ) return rc;
    if( pager_cksum(pPager, pgRec.pgno, pgRec.aData)!=cksum ){
      return eDb_DONE;
    }
  }

  /* Playback the page.  Update the in-memory copy of the page
  ** at the same time, if there is one.
  **
  ** The write is attempted only when the seek succeeds.  Writing after
  ** a failed seek would put the page image at whatever offset the file
  ** happens to be positioned at.
  */
  pPg = pager_lookup(pPager, pgRec.pgno);
  TRACE2("PLAYBACK %d\n", pgRec.pgno);
  rc = eDbOsSeek(&pPager->fd, (pgRec.pgno-1)*(off_t)eDb_PAGE_SIZE);
  if( rc==eDb_OK ){
    rc = eDbOsWrite(&pPager->fd, pgRec.aData, eDb_PAGE_SIZE);
  }
  if( pPg ){
    /* No page should ever be rolled back that is in use, except for page
    ** 1 which is held in use in order to keep the lock on the database
    ** active.
    */
    assert( pPg->nRef==0 || pPg->pgno==1 );
    memcpy(PGHDR_TO_DATA(pPg), pgRec.aData, eDb_PAGE_SIZE);
    memset(PGHDR_TO_EXTRA(pPg), 0, pPager->nExtra);
    pPg->dirty = 0;
    pPg->needSync = 0;
    CODEC(pPager, PGHDR_TO_DATA(pPg), pPg->pgno, 3);
  }
  return rc;
}
/*
** Playback the journal and thus restore the database file to
** the state it was in before we started making changes.
**
** The journal file format is as follows:
**
** * 8 byte prefix. One of the aJournalMagic123 vectors defined
** above. The format of the journal file is determined by which
** of the three prefix vectors is seen.
** * 4 byte big-endian integer which is the number of valid page records
** in the journal. If this value is 0xffffffff, then compute the
** number of page records from the journal size. This field appears
** in format 3 only.
** * 4 byte big-endian integer which is the initial value for the
** sanity checksum. This field appears in format 3 only.
** * 4 byte integer which is the number of pages to truncate the
** database to during a rollback.
** * Zero or more page instances, each as follows:
** + 4 byte page number.
** + eDb_PAGE_SIZE bytes of data.
** + 4 byte checksum (format 3 only)
**
** When we speak of the journal header, we mean the first 4 bullets above.
** Each entry in the journal is an instance of the 5th bullet. Note that
** bullets 2 and 3 only appear in format-3 journals.
**
** Call the value from the second bullet "nRec". nRec is the number of
** valid page entries in the journal. In most cases, you can compute the
** value of nRec from the size of the journal file. But if a power
** failure occurred while the journal was being written, it could be the
** case that the size of the journal file had already been increased but
** the extra entries had not yet made it safely to disk. In such a case,
** the value of nRec computed from the file size would be too large. For
** that reason, we always use the nRec value in the header.
**
** If the nRec value is 0xffffffff it means that nRec should be computed
** from the file size. This value is used when the user selects the
** no-sync option for the journal. A power failure could lead to corruption
** in this case. But for things like temporary table (which will be
** deleted when the power is restored) we don't care.
**
** Journal formats 1 and 2 do not have an nRec value in the header so we
** have to compute nRec from the file size. This has risks (as described
** above) which is why all persistent tables have been changed to use
** format 3.
**
** If the file opened as the journal file is not a well-formed
** journal file then the database will likely already be
** corrupted, so the PAGER_ERR_CORRUPT bit is set in pPager->errMask
** and eDb_CORRUPT is returned. If it all works, then this routine
** returns eDb_OK.
*/
static int pager_playback(Pager *pPager, int useJournalSize){
/* Parameters:
**   pPager          Pager whose open journal (pPager->jfd) is replayed
**                   into the database file (pPager->fd).
**   useJournalSize  When non-zero, the record count of a format 3 journal
**                   is computed from the journal file size even if the
**                   header carries an explicit count.
**
** Every exit goes through end_playback, which always calls
** pager_unwritelock().  Any non-OK result reached there (including the
** eDb_PROTOCOL values set below) is reported to the caller as eDb_CORRUPT.
*/
off_t szJ; /* Size of the journal file in bytes */
int nRec; /* Number of Records in the journal */
int i; /* Loop counter */
Pgno mxPg = 0; /* Size of the original file in pages */
int format; /* Format of the journal file. */
unsigned char aMagic[sizeof(aJournalMagic1)];
int rc;
/* Figure out how many records are in the journal. Abort early if
** the journal is empty.
*/
assert( pPager->journalOpen );
eDbOsSeek(&pPager->jfd, 0);
rc = eDbOsFileSize(&pPager->jfd, &szJ);
if( rc!=eDb_OK ){
goto end_playback;
}
/* If the journal file is too small to contain a complete header,
** it must mean that the process that created the journal was just
** beginning to write the journal file when it died. In that case,
** the database file should have still been completely unchanged.
** Nothing needs to be rolled back. We can safely ignore this journal.
** (rc is still eDb_OK here, so this path is not treated as an error.)
*/
if( szJ < sizeof(aMagic)+sizeof(Pgno) ){
goto end_playback;
}
/* Read the beginning of the journal and truncate the
** database file back to its original size.
*/
rc = eDbOsRead(&pPager->jfd, aMagic, sizeof(aMagic));
if( rc!=eDb_OK ){
rc = eDb_PROTOCOL;
goto end_playback;
}
/* Identify the journal format from the magic prefix. An unrecognized
** prefix is an error.
*/
if( memcmp(aMagic, aJournalMagic3, sizeof(aMagic))==0 ){
format = JOURNAL_FORMAT_3;
}else if( memcmp(aMagic, aJournalMagic2, sizeof(aMagic))==0 ){
format = JOURNAL_FORMAT_2;
}else if( memcmp(aMagic, aJournalMagic1, sizeof(aMagic))==0 ){
format = JOURNAL_FORMAT_1;
}else{
rc = eDb_PROTOCOL;
goto end_playback;
}
if( format>=JOURNAL_FORMAT_3 ){
if( szJ < sizeof(aMagic) + 3*sizeof(u32) ){
/* Ignore the journal if it is too small to contain a complete
** header. We already did this test once above, but at the prior
** test, we did not know the journal format and so we had to assume
** the smallest possible header. Now we know the header is bigger
** than the minimum so we test again.
*/
goto end_playback;
}
/* Format 3 header: record count, then the checksum initial value. */
rc = read32bits(format, &pPager->jfd, (u32*)&nRec);
if( rc ) goto end_playback;
rc = read32bits(format, &pPager->jfd, &pPager->cksumInit);
if( rc ) goto end_playback;
/* A count of 0xffffffff, or an explicit request from the caller, means
** the record count must be derived from the journal file size.
*/
if( nRec==0xffffffff || useJournalSize ){
nRec = (int)((szJ - JOURNAL_HDR_SZ(3))/JOURNAL_PG_SZ(3));
}
}else{
/* Formats 1 and 2 have no count in the header; use the file size. */
nRec = (int)((szJ - JOURNAL_HDR_SZ(2))/JOURNAL_PG_SZ(2));
assert( nRec*JOURNAL_PG_SZ(2)+JOURNAL_HDR_SZ(2)==szJ );
}
/* The last header field is the original database size in pages. */
rc = read32bits(format, &pPager->jfd, &mxPg);
if( rc!=eDb_OK ){
goto end_playback;
}
assert( pPager->origDbSize==0 || pPager->origDbSize==(int)mxPg );
rc = eDbOsTruncate(&pPager->fd, eDb_PAGE_SIZE*(off_t)mxPg);
if( rc!=eDb_OK ){
goto end_playback;
}
pPager->dbSize = mxPg;
/* Copy original pages out of the journal and back into the database file.
** A result of eDb_DONE from a record stops the playback early but is
** treated as success; any other non-OK result stops it as an error.
*/
for(i=0; i<nRec; i++){
rc = pager_playback_one_page(pPager, &pPager->jfd, format);
if( rc!=eDb_OK ){
if( rc==eDb_DONE ){
rc = eDb_OK;
}
break;
}
}
/* Pages that have been written to the journal but never synced
** were not restored by the loop above. We have to restore those
** pages by reading them back from the original database.
*/
if( rc==eDb_OK ){
PgHdr *pPg;
for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
char zBuf[eDb_PAGE_SIZE];
if( !pPg->dirty ) continue;
/* Pages within the original database size are re-read from the file;
** pages beyond it are replaced with zeros.
*/
if( (int)pPg->pgno <= pPager->origDbSize ){
eDbOsSeek(&pPager->fd, eDb_PAGE_SIZE*(off_t)(pPg->pgno-1));
rc = eDbOsRead(&pPager->fd, zBuf, eDb_PAGE_SIZE);
TRACE2("REFETCH %d\n", pPg->pgno);
CODEC(pPager, zBuf, pPg->pgno, 2);
if( rc ) break;
}else{
memset(zBuf, 0, eDb_PAGE_SIZE);
}
/* Overwrite the cached copy (and clear its extra area) when the page is
** unreferenced or its content differs from what was just fetched.
*/
if( pPg->nRef==0 || memcmp(zBuf, PGHDR_TO_DATA(pPg), eDb_PAGE_SIZE) ){
memcpy(PGHDR_TO_DATA(pPg), zBuf, eDb_PAGE_SIZE);
memset(PGHDR_TO_EXTRA(pPg), 0, pPager->nExtra);
}
pPg->needSync = 0;
pPg->dirty = 0;
}
}
end_playback:
/* On failure the lock is still downgraded, but the result of that call
** is discarded: the pager is flagged corrupt and eDb_CORRUPT is returned.
** On success the return value is whatever pager_unwritelock() reports.
*/
if( rc!=eDb_OK ){
pager_unwritelock(pPager);
pPager->errMask |= PAGER_ERR_CORRUPT;
rc = eDb_CORRUPT;
}else{
rc = pager_unwritelock(pPager);
}
return rc;
}
/*
** Playback the checkpoint journal.
**
** This is similar to playing back the transaction journal but with
** a few extra twists.
**
** (1) The number of pages in the database file at the start of
** the checkpoint is stored in pPager->ckptSize, not in the
** journal file itself.
**
** (2) In addition to playing back the checkpoint journal, also
** playback all pages of the transaction journal beginning
** at offset pPager->ckptJSize.
*/
static int pager_ckpt_playback(Pager *pPager){
off_t szJ; /* Size of the full journal */
int nRec; /* Number of Records */
int i; /* Loop counter */
int rc;
/* Truncate the database back to its original size.
*/
rc = eDbOsTruncate(&pPager->fd, eDb_PAGE_SIZE*(off_t)pPager->ckptSize);
pPager->dbSize = pPager->ckptSize;
/* Figure out how many records are in the checkpoint journal.
*/
assert( pPager->ckptInUse && pPager->journalOpen );
eDbOsSeek(&pPager->cpfd, 0);
nRec = pPager->ckptNRec;
/* Copy original pages out of the checkpoint journal and back into the
** database file. Note that the checkpoint journal always uses format
** 2 instead of format 3 since it does not need to be concerned with
** power failures corrupting the journal and can thus omit the checksums.
*/
for(i=nRec-1; i>=0; i--){
rc = pager_playback_one_page(pPager, &pPager->cpfd, 2);
assert( rc!=eDb_DONE );
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -