📄 buf0flu.c
字号:
/******************************************************
The database buffer buf_pool flush algorithm

(c) 1995-2001 Innobase Oy

Created 11/11/1995 Heikki Tuuri
*******************************************************/

#include "buf0flu.h"

#ifdef UNIV_NONINL
#include "buf0flu.ic"
#include "trx0sys.h"
#endif

#include "ut0byte.h"
#include "ut0lst.h"
#include "page0page.h"
#include "fil0fil.h"
#include "buf0buf.h"
#include "buf0lru.h"
#include "buf0rea.h"
#include "ibuf0ibuf.h"
#include "log0log.h"
#include "os0file.h"
#include "trx0sys.h"
#include "srv0srv.h"

/* When flushed, dirty blocks are searched in neighborhoods of this size, and
flushed along with the original page. The size is the smaller of
BUF_READ_AHEAD_AREA and one sixteenth of buf_pool->curr_size. */

#define BUF_FLUSH_AREA		ut_min(BUF_READ_AHEAD_AREA,\
					buf_pool->curr_size / 16)

/**********************************************************************
Validates the flush list. */
static
ibool
buf_flush_validate_low(void);
/*========================*/
		/* out: TRUE if ok */

/************************************************************************
Inserts a modified block into the flush list. The block is always added as
the first element of buf_pool->flush_list. */

void
buf_flush_insert_into_flush_list(
/*=============================*/
	buf_block_t*	block)	/* in: block which is modified */
{
#ifdef UNIV_SYNC_DEBUG
	ut_ad(mutex_own(&(buf_pool->mutex)));
#endif /* UNIV_SYNC_DEBUG */

	ut_a(block->state == BUF_BLOCK_FILE_PAGE);

	/* Debug check: the block currently at the head of the list, if any,
	must not have a larger oldest_modification than the block which is
	about to be added in front of it. */

	ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL)
	      || (ut_dulint_cmp(
			(UT_LIST_GET_FIRST(buf_pool->flush_list))
						->oldest_modification,
			block->oldest_modification) <= 0));

	UT_LIST_ADD_FIRST(flush_list, buf_pool->flush_list, block);

	ut_ad(buf_flush_validate_low());
}

/************************************************************************
Inserts a modified block into the flush list in the right sorted position.
This function is used by recovery, because there the modifications do not
necessarily come in the order of lsn's.
*/voidbuf_flush_insert_sorted_into_flush_list(/*====================================*/ buf_block_t* block) /* in: block which is modified */{ buf_block_t* prev_b; buf_block_t* b; #ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&(buf_pool->mutex)));#endif /* UNIV_SYNC_DEBUG */ prev_b = NULL; b = UT_LIST_GET_FIRST(buf_pool->flush_list); while (b && (ut_dulint_cmp(b->oldest_modification, block->oldest_modification) > 0)) { prev_b = b; b = UT_LIST_GET_NEXT(flush_list, b); } if (prev_b == NULL) { UT_LIST_ADD_FIRST(flush_list, buf_pool->flush_list, block); } else { UT_LIST_INSERT_AFTER(flush_list, buf_pool->flush_list, prev_b, block); } ut_ad(buf_flush_validate_low());}/************************************************************************Returns TRUE if the file page block is immediately suitable for replacement,i.e., the transition FILE_PAGE => NOT_USED allowed. */iboolbuf_flush_ready_for_replace(/*========================*/ /* out: TRUE if can replace immediately */ buf_block_t* block) /* in: buffer control block, must be in state BUF_BLOCK_FILE_PAGE and in the LRU list */{#ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&(buf_pool->mutex)));#endif /* UNIV_SYNC_DEBUG */ if (block->state != BUF_BLOCK_FILE_PAGE) { ut_print_timestamp(stderr); fprintf(stderr," InnoDB: Error: buffer block state %lu in the LRU list!\n", (ulong)block->state); ut_print_buf(stderr, (byte*)block, sizeof(buf_block_t)); return(FALSE); } if ((ut_dulint_cmp(block->oldest_modification, ut_dulint_zero) > 0) || (block->buf_fix_count != 0) || (block->io_fix != 0)) { return(FALSE); } return(TRUE);}/************************************************************************Returns TRUE if the block is modified and ready for flushing. 
*/UNIV_INLINEiboolbuf_flush_ready_for_flush(/*======================*/ /* out: TRUE if can flush immediately */ buf_block_t* block, /* in: buffer control block, must be in state BUF_BLOCK_FILE_PAGE */ ulint flush_type)/* in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */{#ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&(buf_pool->mutex)));#endif /* UNIV_SYNC_DEBUG */ ut_a(block->state == BUF_BLOCK_FILE_PAGE); if ((ut_dulint_cmp(block->oldest_modification, ut_dulint_zero) > 0) && (block->io_fix == 0)) { if (flush_type != BUF_FLUSH_LRU) { return(TRUE); } else if (block->buf_fix_count == 0) { /* If we are flushing the LRU list, to avoid deadlocks we require the block not to be bufferfixed, and hence not latched. */ return(TRUE); } } return(FALSE);}/************************************************************************Updates the flush system data structures when a write is completed. */voidbuf_flush_write_complete(/*=====================*/ buf_block_t* block) /* in: pointer to the block in question */{ ut_ad(block);#ifdef UNIV_SYNC_DEBUG ut_ad(mutex_own(&(buf_pool->mutex)));#endif /* UNIV_SYNC_DEBUG */ ut_a(block->state == BUF_BLOCK_FILE_PAGE); block->oldest_modification = ut_dulint_zero; UT_LIST_REMOVE(flush_list, buf_pool->flush_list, block); ut_d(UT_LIST_VALIDATE(flush_list, buf_block_t, buf_pool->flush_list)); (buf_pool->n_flush[block->flush_type])--; if (block->flush_type == BUF_FLUSH_LRU) { /* Put the block to the end of the LRU list to wait to be moved to the free list */ buf_LRU_make_block_old(block); buf_pool->LRU_flush_ended++; } /* fprintf(stderr, "n pending flush %lu\n", buf_pool->n_flush[block->flush_type]); */ if ((buf_pool->n_flush[block->flush_type] == 0) && (buf_pool->init_flush[block->flush_type] == FALSE)) { /* The running flush batch has ended */ os_event_set(buf_pool->no_flush[block->flush_type]); }}/************************************************************************Flushes possible buffered writes from the doublewrite memory buffer to disk,and also wakes up 
the aio thread if simulated aio is used. It is veryimportant to call this function after a batch of writes has been posted,and also when we may have to wait for a page latch! Otherwise a deadlockof threads can occur. */staticvoidbuf_flush_buffered_writes(void)/*===========================*/{ buf_block_t* block; byte* write_buf; ulint len; ulint len2; ulint i; if (!srv_use_doublewrite_buf || trx_doublewrite == NULL) { os_aio_simulated_wake_handler_threads(); return; } mutex_enter(&(trx_doublewrite->mutex)); /* Write first to doublewrite buffer blocks. We use synchronous aio and thus know that file write has been completed when the control returns. */ if (trx_doublewrite->first_free == 0) { mutex_exit(&(trx_doublewrite->mutex)); return; } for (i = 0; i < trx_doublewrite->first_free; i++) { block = trx_doublewrite->buf_block_arr[i]; ut_a(block->state == BUF_BLOCK_FILE_PAGE); if (mach_read_from_4(block->frame + FIL_PAGE_LSN + 4) != mach_read_from_4(block->frame + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM + 4)) { ut_print_timestamp(stderr); fprintf(stderr," InnoDB: ERROR: The page to be written seems corrupt!\n""InnoDB: The lsn fields do not match! Noticed in the buffer pool\n""InnoDB: before posting to the doublewrite buffer.\n"); } if (block->check_index_page_at_flush && !page_simple_validate(block->frame)) { buf_page_print(block->frame); ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: Apparent corruption of an index page n:o %lu in space %lu\n" "InnoDB: to be written to data file. 
We intentionally crash server\n" "InnoDB: to prevent corrupt data from ending up in data\n" "InnoDB: files.\n", (ulong) block->offset, (ulong) block->space); ut_error; } } /* increment the doublewrite flushed pages counter */ srv_dblwr_pages_written+= trx_doublewrite->first_free; srv_dblwr_writes++; if (trx_doublewrite->first_free > TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { len = TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE; } else { len = trx_doublewrite->first_free * UNIV_PAGE_SIZE; } fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, trx_doublewrite->block1, 0, len, (void*)trx_doublewrite->write_buf, NULL); write_buf = trx_doublewrite->write_buf; for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len; len2 += UNIV_PAGE_SIZE) { if (mach_read_from_4(write_buf + len2 + FIL_PAGE_LSN + 4) != mach_read_from_4(write_buf + len2 + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM + 4)) { ut_print_timestamp(stderr); fprintf(stderr," InnoDB: ERROR: The page to be written seems corrupt!\n""InnoDB: The lsn fields do not match! Noticed in the doublewrite block1.\n"); } } if (trx_doublewrite->first_free > TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { len = (trx_doublewrite->first_free - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) * UNIV_PAGE_SIZE; fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, trx_doublewrite->block2, 0, len, (void*)(trx_doublewrite->write_buf + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE), NULL); write_buf = trx_doublewrite->write_buf + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE; for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len; len2 += UNIV_PAGE_SIZE) { if (mach_read_from_4(write_buf + len2 + FIL_PAGE_LSN + 4) != mach_read_from_4(write_buf + len2 + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM + 4)) { ut_print_timestamp(stderr); fprintf(stderr," InnoDB: ERROR: The page to be written seems corrupt!\n""InnoDB: The lsn fields do not match! 
Noticed in the doublewrite block2.\n"); } } } /* Now flush the doublewrite buffer data to disk */ fil_flush(TRX_SYS_SPACE); /* We know that the writes have been flushed to disk now and in recovery we will find them in the doublewrite buffer blocks. Next do the writes to the intended positions. */ for (i = 0; i < trx_doublewrite->first_free; i++) { block = trx_doublewrite->buf_block_arr[i]; if (mach_read_from_4(block->frame + FIL_PAGE_LSN + 4) != mach_read_from_4(block->frame + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM + 4)) { ut_print_timestamp(stderr); fprintf(stderr," InnoDB: ERROR: The page to be written seems corrupt!\n""InnoDB: The lsn fields do not match! Noticed in the buffer pool\n""InnoDB: after posting and flushing the doublewrite buffer.\n""InnoDB: Page buf fix count %lu, io fix %lu, state %lu\n", (ulong)block->buf_fix_count, (ulong)block->io_fix, (ulong)block->state); } ut_a(block->state == BUF_BLOCK_FILE_PAGE); fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER, FALSE, block->space, block->offset, 0, UNIV_PAGE_SIZE, (void*)block->frame, (void*)block); } /* Wake possible simulated aio thread to actually post the writes to the operating system */ os_aio_simulated_wake_handler_threads(); /* Wait that all async writes to tablespaces have been posted to the OS */ os_aio_wait_until_no_pending_writes(); /* Now we flush the data to disk (for example, with fsync) */ fil_flush_file_spaces(FIL_TABLESPACE); /* We can now reuse the doublewrite memory buffer: */ trx_doublewrite->first_free = 0; mutex_exit(&(trx_doublewrite->mutex)); }/************************************************************************Posts a buffer page for writing. If the doublewrite memory buffer isfull, calls buf_flush_buffered_writes and waits for for free space toappear. 
*/staticvoidbuf_flush_post_to_doublewrite_buf(/*==============================*/ buf_block_t* block) /* in: buffer block to write */{try_again: mutex_enter(&(trx_doublewrite->mutex)); ut_a(block->state == BUF_BLOCK_FILE_PAGE); if (trx_doublewrite->first_free >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { mutex_exit(&(trx_doublewrite->mutex)); buf_flush_buffered_writes(); goto try_again; } ut_memcpy(trx_doublewrite->write_buf + UNIV_PAGE_SIZE * trx_doublewrite->first_free, block->frame, UNIV_PAGE_SIZE); trx_doublewrite->buf_block_arr[trx_doublewrite->first_free] = block; trx_doublewrite->first_free++; if (trx_doublewrite->first_free >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { mutex_exit(&(trx_doublewrite->mutex)); buf_flush_buffered_writes(); return; } mutex_exit(&(trx_doublewrite->mutex));}/************************************************************************Initializes a page for writing to the tablespace. */voidbuf_flush_init_for_writing(/*=======================*/ byte* page, /* in: page */ dulint newest_lsn, /* in: newest modification lsn to the page */ ulint space, /* in: space id */ ulint page_no) /* in: page number */{ /* Write the newest modification lsn to the page header and trailer */ mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn); mach_write_to_8(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM, newest_lsn); /* Write the page number and the space id */ mach_write_to_4(page + FIL_PAGE_OFFSET, page_no); mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, space); /* Store the new formula checksum */ mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, srv_use_checksums ? buf_calc_page_new_checksum(page) : BUF_NO_CHECKSUM_MAGIC); /* We overwrite the first 4 bytes of the end lsn field to store the old formula checksum. Since it depends also on the field FIL_PAGE_SPACE_OR_CHKSUM, it has to be calculated after storing the new formula checksum. */ mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM, srv_use_checksums ? 
buf_calc_page_old_checksum(page) : BUF_NO_CHECKSUM_MAGIC);}/************************************************************************Does an asynchronous write of a buffer page. NOTE: in simulated aio andalso when the doublewrite buffer is used, we must callbuf_flush_buffered_writes after we have posted a batch of writes! */staticvoidbuf_flush_write_block_low(/*======================*/ buf_block_t* block) /* in: buffer block to write */{#ifdef UNIV_LOG_DEBUG static ibool univ_log_debug_warned;#endif /* UNIV_LOG_DEBUG */ ut_a(block->state == BUF_BLOCK_FILE_PAGE);#ifdef UNIV_IBUF_DEBUG ut_a(ibuf_count_get(block->space, block->offset) == 0);#endif ut_ad(!ut_dulint_is_zero(block->newest_modification));#ifdef UNIV_LOG_DEBUG if (!univ_log_debug_warned) { univ_log_debug_warned = TRUE; fputs( "Warning: cannot force log to disk if UNIV_LOG_DEBUG is defined!\n" "Crash recovery will not work!\n", stderr); }#else /* Force the log to the disk before writing the modified block */ log_write_up_to(block->newest_modification, LOG_WAIT_ALL_GROUPS, TRUE);#endif buf_flush_init_for_writing(block->frame, block->newest_modification, block->space, block->offset); if (!srv_use_doublewrite_buf || !trx_doublewrite) { fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER, FALSE, block->space, block->offset, 0, UNIV_PAGE_SIZE, (void*)block->frame, (void*)block); } else { buf_flush_post_to_doublewrite_buf(block); }}/************************************************************************Writes a page asynchronously from the buffer buf_pool to a file, if it can befound in the buf_pool and it is in a flushable state. NOTE: in simulated aiowe must call os_aio_simulated_wake_handler_threads after we have posted a batchof writes! 
*/staticulintbuf_flush_try_page(/*===============*/ /* out: 1 if a page was flushed, 0 otherwise */ ulint space, /* in: space id */ ulint offset, /* in: page offset */ ulint flush_type) /* in: BUF_FLUSH_LRU, BUF_FLUSH_LIST, or BUF_FLUSH_SINGLE_PAGE */{ buf_block_t* block; ibool locked; ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -