📄 page0page.c
字号:
/******************************************************
Index page routines

(c) 1994-1996 Innobase Oy

Created 2/2/1994 Heikki Tuuri
*******************************************************/

#define THIS_MODULE
#include "page0page.h"
#ifdef UNIV_NONINL
#include "page0page.ic"
#endif
#undef THIS_MODULE

#include "page0cur.h"
#include "lock0lock.h"
#include "fut0lst.h"
#include "btr0sea.h"
#include "buf0buf.h"
#include "srv0srv.h"
#include "btr0btr.h"

/*			THE INDEX PAGE
			==============

The index page consists of a page header which contains the page's
id and other information. On top of it are the index records
in a heap linked into a one way linear list according to alphabetic order.

Just below page end is an array of pointers which we call page directory,
to about every sixth record in the list. The pointers are placed in
the directory in the alphabetical order of the records pointed to,
enabling us to make binary search using the array. Each slot n:o I
in the directory points to a record, where a 4-bit field contains a count
of those records which are in the linear list between pointer I and
the pointer I - 1 in the directory, including the record
pointed to by pointer I and not including the record pointed to by I - 1.
We say that the record pointed to by slot I, or that slot I, owns
these records. The count is always kept in the range 4 to 8, with
the exception that it is 1 for the first slot, and 1--8 for the second slot.
(page_dir_slot_check() below applies the relaxed 1..max bound to the last
slot of the directory, which is the second slot on a freshly created page.)

An essentially binary search can be performed in the list of index
records, like we could do if we had pointer to every record in the
page directory. The data structure is, however, more efficient when
we are doing inserts, because most inserts are just pushed on a heap.
Only every 8th insert requires block move in the directory pointer
table, which itself is quite small. A record is deleted from the page
by just taking it off the linear list and updating the number of owned
records-field of the record which owns it, and updating the page directory,
if necessary. A special case is the one when the record owns itself.
Because the overhead of inserts is so small, we may also increase the
page size from the projected default of 8 kB to 64 kB without too
much loss of efficiency in inserts. Bigger page becomes actual
when the disk transfer rate compared to seek and latency time rises.
On the present system, the page size is set so that the page transfer
time (3 ms) is 20 % of the disk random access time (15 ms).

When the page is split, merged, or becomes full but contains deleted
records, we have to reorganize the page.

Assuming a page size of 8 kB, a typical index page of a secondary
index contains 300 index entries, and the size of the page directory
is 50 x 4 bytes = 200 bytes. */

/*******************************************************************
Looks for the directory slot which owns the given record.

The search has two phases: first the record list is followed from rec
until a record with a nonzero n_owned field is reached, then the page
directory is scanned, starting from the last slot, for the slot whose
stored 2-byte value equals the encoded page offset of that record. If
slot 0 is reached without a match, diagnostics and the page are printed
and ut_error is invoked. */

ulint
page_dir_find_owner_slot(
/*=====================*/
				/* out: the directory slot number */
	rec_t*	rec)		/* in: the physical record */
{
	page_t*				page;
	register uint16			rec_offs_bytes;
	register page_dir_slot_t*	slot;
	register const page_dir_slot_t*	first_slot;
	register rec_t*			r = rec;

	ut_ad(page_rec_check(rec));

	page = buf_frame_align(rec);
	first_slot = page_dir_get_nth_slot(page, 0);
	/* The directory scan below starts from the last slot */
	slot = page_dir_get_nth_slot(page, page_dir_get_n_slots(page) - 1);

	/* Phase 1: follow the next-record offsets until a record with
	n_owned != 0 is found. The two branches differ only in the format
	flag passed to the accessors and in the lower bound that is
	asserted for the new position. */

	if (page_is_comp(page)) {
		while (rec_get_n_owned(r, TRUE) == 0) {
			r = page + rec_get_next_offs(r, TRUE);
			ut_ad(r >= page + PAGE_NEW_SUPREMUM);
			ut_ad(r < page + (UNIV_PAGE_SIZE - PAGE_DIR));
		}
	} else {
		while (rec_get_n_owned(r, FALSE) == 0) {
			r = page + rec_get_next_offs(r, FALSE);
			ut_ad(r >= page + PAGE_OLD_SUPREMUM);
			ut_ad(r < page + (UNIV_PAGE_SIZE - PAGE_DIR));
		}
	}

	/* Encode the page offset of the owning record once, so that it
	can be compared directly against the raw 2 bytes stored in each
	slot (presumably mach_encode_2 yields the on-page byte order;
	confirm in mach0data) */

	rec_offs_bytes = mach_encode_2(r - page);

	/* Phase 2: scan the directory. The slot pointer is advanced by
	PAGE_DIR_SLOT_SIZE on every step until the slot contents match. */

	while (UNIV_LIKELY(*(uint16*) slot != rec_offs_bytes)) {

		if (UNIV_UNLIKELY(slot == first_slot)) {
			/* Slot 0 did not match either: no slot points to
			the owning record */
			fprintf(stderr,
				"InnoDB: Probable data corruption on page %lu\n"
				"InnoDB: Original record ",
				(ulong) buf_frame_get_page_no(page));

			/* Only old-style records are printed; for compact
			pages a fixed placeholder text is written instead */
			if (page_is_comp(page)) {
				fputs("(compact record)", stderr);
			} else {
				rec_print_old(stderr, rec);
			}

			fputs("\n"
			      "InnoDB: on that page.\n"
			      "InnoDB: Cannot find the dir slot for record ",
			      stderr);
			if (page_is_comp(page)) {
				fputs("(compact record)", stderr);
			} else {
				rec_print_old(stderr, page + mach_decode_2(rec_offs_bytes));
			}
			fputs("\n"
			      "InnoDB: on that page!\n", stderr);

			buf_page_print(page);

			ut_error;
		}

		slot += PAGE_DIR_SLOT_SIZE;
	}

	/* The slot number is the byte distance from slot 0 to the matching
	slot, divided by the size of one slot */

	return(((ulint) (first_slot - slot)) / PAGE_DIR_SLOT_SIZE);
}

/******************************************************************
Used to check the consistency of a directory slot. Every check is made
with ut_a; the only value this function ever returns is TRUE. */
static
ibool
page_dir_slot_check(
/*================*/
					/* out: TRUE if succeed */
	page_dir_slot_t*	slot)	/* in: slot */
{
	page_t*	page;
	ulint	n_slots;
	ulint	n_owned;

	ut_a(slot);

	page = buf_frame_align(slot);

	n_slots = page_dir_get_n_slots(page);

	/* The slot address must lie between the address of the last slot
	and the address of slot 0, inclusive */

	ut_a(slot <= page_dir_get_nth_slot(page, 0));
	ut_a(slot >= page_dir_get_nth_slot(page, n_slots - 1));

	ut_a(page_rec_check(page_dir_slot_get_rec(slot)));

	n_owned = rec_get_n_owned(page_dir_slot_get_rec(slot), page_is_comp(page));

	/* Allowed n_owned values: exactly 1 for slot 0, 1..max for the
	last slot, min..max for all slots in between */

	if (slot == page_dir_get_nth_slot(page, 0)) {
		ut_a(n_owned == 1);
	} else if (slot == page_dir_get_nth_slot(page, n_slots - 1)) {
		ut_a(n_owned >= 1);
		ut_a(n_owned <= PAGE_DIR_SLOT_MAX_N_OWNED);
	} else {
		ut_a(n_owned >= PAGE_DIR_SLOT_MIN_N_OWNED);
		ut_a(n_owned <= PAGE_DIR_SLOT_MAX_N_OWNED);
	}

	return(TRUE);
}

/*****************************************************************
Sets the max trx id field value. If the buffer block of the page has
is_hashed set, btr_search_latch is held in x-mode around the write. */

void
page_set_max_trx_id(
/*================*/
	page_t*	page,	/* in: page */
	dulint	trx_id)	/* in: transaction id */
{
	buf_block_t*	block;

	ut_ad(page);

	block = buf_block_align(page);

	if (block->is_hashed) {
		rw_lock_x_lock(&btr_search_latch);
	}

	/* It is not necessary to write this change to the redo log, as
	during a database recovery we assume that the max trx id of every
	page is the maximum trx id assigned before the crash. */

	mach_write_to_8(page + PAGE_HEADER + PAGE_MAX_TRX_ID, trx_id);

	/* NOTE(review): is_hashed is read a second time here; this pairs
	with the lock above only if the flag cannot change in between --
	confirm against the latching rules of the callers */

	if (block->is_hashed) {
		rw_lock_x_unlock(&btr_search_latch);
	}
}

/****************************************************************
Allocates a block of memory from an index page. The first record of the
PAGE_FREE list is tried first; if it is absent or too small, the space is
taken from the top of the record heap, if enough is available there. */

byte*
page_mem_alloc(
/*===========*/
				/* out: pointer to start of allocated
				buffer, or NULL if allocation fails */
	page_t*		page,	/* in: index page */
	ulint		need,	/* in: number of bytes needed */
	dict_index_t*	index,	/* in: record descriptor */
	ulint*		heap_no)/* out: this contains the heap number
				of the allocated record if allocation
				succeeds */
{
	rec_t*	rec;
	byte*	block;
	ulint	avl_space;
	ulint	garbage;

	ut_ad(page && heap_no);

	/* If there are records in the free list, look if the first is
	big enough */

	rec = page_header_get_ptr(page, PAGE_FREE);

	if (rec) {
		mem_heap_t*	heap		= NULL;
		ulint		offsets_[REC_OFFS_NORMAL_SIZE];
		ulint*		offsets		= offsets_;
		/* The first element of the stack array is set to the number
		of elements in the array (presumably how rec_get_offsets
		learns the capacity of the buffer; confirm in rem0rec) */
		*offsets_ = (sizeof offsets_) / sizeof *offsets_;

		/* rec_get_offsets is handed &heap; when heap is non-NULL
		afterwards it is freed on both ways out of this branch */
		offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);

		if (rec_offs_size(offsets) >= need) {
			/* Unlink the record from the head of the free list */
			page_header_set_ptr(page, PAGE_FREE, page_rec_get_next(rec));

			garbage = page_header_get_field(page, PAGE_GARBAGE);
			ut_ad(garbage >= need);

			/* PAGE_GARBAGE is reduced by the requested size,
			not by the size of the reused record */
			page_header_set_field(page, PAGE_GARBAGE, garbage - need);

			/* The reused space keeps the heap number of the
			freed record */
			*heap_no = rec_get_heap_no(rec, page_is_comp(page));

			block = rec_get_start(rec, offsets);
			if (UNIV_LIKELY_NULL(heap)) {
				mem_heap_free(heap);
			}
			return(block);
		}

		if (UNIV_LIKELY_NULL(heap)) {
			mem_heap_free(heap);
		}
	}

	/* Could not find space from the free list, try top of heap */

	avl_space = page_get_max_insert_size(page, 1);

	if (avl_space >= need) {
		block = page_header_get_ptr(page, PAGE_HEAP_TOP);

		page_header_set_ptr(page, PAGE_HEAP_TOP, block + need);
		/* The new record gets the current heap count as its heap
		number, and the count is then incremented by one */
		*heap_no = page_dir_get_n_heap(page);

		page_dir_set_n_heap(page, 1 + *heap_no);

		return(block);
	}

	return(NULL);
}

/**************************************************************
Writes a log record of page creation. 
*/
UNIV_INLINE
void
page_create_write_log(
/*==================*/
	buf_frame_t*	frame,	/* in: a buffer frame where the page is
				created */
	mtr_t*		mtr,	/* in: mini-transaction handle */
	ulint		comp)	/* in: nonzero=compact page format */
{
	/* Only the initial log record is written; its type depends on
	the page format */
	mlog_write_initial_log_record(frame,
			comp ? MLOG_COMP_PAGE_CREATE : MLOG_PAGE_CREATE, mtr);
}

/***************************************************************
Parses a redo log record of creating a page. The record has no body, so
the input pointer is returned unchanged; if a page is given, it is
(re)created with page_create(). */

byte*
page_parse_create(
/*==============*/
			/* out: end of log record or NULL */
	byte*	ptr,	/* in: buffer */
	byte*	end_ptr __attribute__((unused)), /* in: buffer end */
	ulint	comp,	/* in: nonzero=compact page format */
	page_t*	page,	/* in: page or NULL */
	mtr_t*	mtr)	/* in: mtr or NULL */
{
	/* end_ptr is referenced only by this debug assertion, hence the
	unused attribute on the parameter */
	ut_ad(ptr && end_ptr);

	/* The record is empty, except for the record initial part */

	if (page) {
		page_create(page, mtr, comp);
	}

	return(ptr);
}

/**************************************************************
The index page creation function. Writes the page creation log record,
builds the infimum and supremum records at the start of the record heap,
initializes the page header fields, zero-fills the area between the heap
top and the two initial directory slots, and links infimum to supremum. */

page_t* 
page_create(
/*========*/
					/* out: pointer to the page */
	buf_frame_t*	frame,		/* in: a buffer frame where the page is
					created */
	mtr_t*		mtr,		/* in: mini-transaction handle */
	ulint		comp)		/* in: nonzero=compact page format */
{
	page_dir_slot_t* slot;
	mem_heap_t*	heap;
	dtuple_t*	tuple;	
	dfield_t*	field;
	byte*		heap_top;
	rec_t*		infimum_rec;
	rec_t*		supremum_rec;
	page_t*		page;
	dict_index_t*	index;
	ulint*		offsets;

	/* A dummy index of the matching record format is used for
	converting the infimum and supremum tuples to physical records */
	index = comp ? srv_sys->dummy_ind2 : srv_sys->dummy_ind1;
	
	ut_ad(frame && mtr);
	ut_ad(PAGE_BTR_IBUF_FREE_LIST + FLST_BASE_NODE_SIZE <= PAGE_DATA);
	ut_ad(PAGE_BTR_IBUF_FREE_LIST_NODE + FLST_NODE_SIZE <= PAGE_DATA);

	/* 1. INCREMENT MODIFY CLOCK */
	buf_frame_modify_clock_inc(frame);

	/* 2. WRITE LOG INFORMATION */
	page_create_write_log(frame, mtr, comp);
	
	page = frame;

	fil_page_set_type(page, FIL_PAGE_INDEX);

	/* Temporary heap for the tuples and offsets arrays; freed at the
	end of step 3 */
	heap = mem_heap_create(200);
		
	/* 3. CREATE THE INFIMUM AND SUPREMUM RECORDS */

	/* Create first a data tuple for infimum record */
	tuple = dtuple_create(heap, 1);
	dtuple_set_info_bits(tuple, REC_STATUS_INFIMUM);
	field = dtuple_get_nth_field(tuple, 0);

	/* Length 8 = the 7 characters of "infimum" plus the terminating
	NUL of the literal */
	dfield_set_data(field, "infimum", 8);
	dtype_set(dfield_get_type(field), DATA_VARCHAR, DATA_ENGLISH | DATA_NOT_NULL, 8, 0);
	/* Set the corresponding physical record to its place in the page
	record heap */

	heap_top = page + PAGE_DATA;
	
	infimum_rec = rec_convert_dtuple_to_rec(heap_top, index, tuple);

	ut_a(infimum_rec == page + (comp ? PAGE_NEW_INFIMUM : PAGE_OLD_INFIMUM));

	rec_set_n_owned(infimum_rec, comp, 1);
	rec_set_heap_no(infimum_rec, comp, 0);
	offsets = rec_get_offsets(infimum_rec, index, NULL, ULINT_UNDEFINED, &heap);

	/* The supremum record is placed right after the end of infimum */
	heap_top = rec_get_end(infimum_rec, offsets);

	/* Create then a tuple for supremum */

	tuple = dtuple_create(heap, 1);
	dtuple_set_info_bits(tuple, REC_STATUS_SUPREMUM);
	field = dtuple_get_nth_field(tuple, 0);

	/* "supremum" is 8 characters: in the compact format only those 8
	bytes are stored, in the old format also the terminating NUL (9) */
	dfield_set_data(field, "supremum", comp ? 8 : 9);
	dtype_set(dfield_get_type(field), DATA_VARCHAR, DATA_ENGLISH | DATA_NOT_NULL, comp ? 8 : 9, 0);

	supremum_rec = rec_convert_dtuple_to_rec(heap_top, index, tuple);

	ut_a(supremum_rec == page + (comp ? PAGE_NEW_SUPREMUM : PAGE_OLD_SUPREMUM));

	rec_set_n_owned(supremum_rec, comp, 1);
	rec_set_heap_no(supremum_rec, comp, 1);

	offsets = rec_get_offsets(supremum_rec, index, offsets, ULINT_UNDEFINED, &heap);
	heap_top = rec_get_end(supremum_rec, offsets);

	ut_ad(heap_top == page + (comp ? PAGE_NEW_SUPREMUM_END : PAGE_OLD_SUPREMUM_END));

	/* heap_top points into the page, not into heap, so it stays
	usable after the temporary heap is freed */
	mem_heap_free(heap);

	/* 4. INITIALIZE THE PAGE */

	page_header_set_field(page, PAGE_N_DIR_SLOTS, 2);
	page_header_set_ptr(page, PAGE_HEAP_TOP, heap_top);
	/* Heap count 2 = infimum + supremum. For compact pages the value
	written also has bit 0x8000 set (presumably the format flag read
	by page_is_comp(); confirm in page0page.h) */
	page_header_set_field(page, PAGE_N_HEAP, comp ? 0x8002 : 2);
	page_header_set_ptr(page, PAGE_FREE, NULL);
	page_header_set_field(page, PAGE_GARBAGE, 0);
	page_header_set_ptr(page, PAGE_LAST_INSERT, NULL);
	page_header_set_field(page, PAGE_DIRECTION, PAGE_NO_DIRECTION);
	page_header_set_field(page, PAGE_N_DIRECTION, 0);
	page_header_set_field(page, PAGE_N_RECS, 0);
	page_set_max_trx_id(page, ut_dulint_zero);
	/* Zero-fill from the heap top up to page offset
	UNIV_PAGE_SIZE - PAGE_EMPTY_DIR_START */
	memset(heap_top, 0, UNIV_PAGE_SIZE - PAGE_EMPTY_DIR_START - (heap_top - page));

	/* 5. SET POINTERS IN RECORDS AND DIR SLOTS */

	/* Set the slots to point to infimum and supremum. */

	slot = page_dir_get_nth_slot(page, 0);
	page_dir_slot_set_rec(slot, infimum_rec);

	slot = page_dir_get_nth_slot(page, 1);
	page_dir_slot_set_rec(slot, supremum_rec);

	/* Set the next pointers in infimum and supremum */

	rec_set_next_offs(infimum_rec, comp, (ulint)(supremum_rec - page));
	rec_set_next_offs(supremum_rec, comp, 0);

	return(page);
}

/*****************************************************************
Differs from page_copy_rec_list_end, because this function does not
touch the lock table and max trx id on page. */

void
page_copy_rec_list_end_no_locks(
/*============================*/
	page_t*		new_page,	/* in: index page to copy to */
	page_t*		page,		/* in: index page */
	rec_t*		rec,		/* in: record on page */
	dict_index_t*	index,		/* in: record descriptor */
	mtr_t*		mtr)		/* in: mtr */
{
	page_cur_t	cur1;
	page_cur_t	cur2;
	rec_t*		sup;
	mem_heap_t*	heap		= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets		= offsets_;
	*offsets_ = (sizeof offsets_) / sizeof *offsets_;

	page_cur_position(rec, &cur1);

	if (page_cur_is_before_first(&cur1)) {

		page_cur_move_to_next(&cur1);
	}

	ut_a((ibool)!!page_is_comp(new_page) == index->table->comp);
	ut_a(page_is_comp(new_page) == page_is_comp(page));
	ut_a(mach_read_from_2(new_page + UNIV_PAGE_SIZE - 10) == (ulint)
		(page_is_comp(new_page)
		? PAGE_NEW_INFIMUM : PAGE_OLD_INFIMUM));

	page_cur_set_before_first(new_page, &cur2);

	/* Copy records from the original page to the new page */

	sup = page_get_supremum_rec(page);

	for (;;) {
		rec_t*	cur1_rec = page_cur_get_rec(&cur1);
		if (cur1_rec == sup) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -