📄 bt_split.c
字号:
/*- * See the file LICENSE for redistribution information. * * Copyright (c) 1996-2004 * Sleepycat Software. All rights reserved. *//* * Copyright (c) 1990, 1993, 1994, 1995, 1996 * Keith Bostic. All rights reserved. *//* * Copyright (c) 1990, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $Id: bt_split.c,v 11.66 2004/10/01 13:00:21 bostic Exp $ */#include "db_config.h"#ifndef NO_SYSTEM_INCLUDES#include <sys/types.h>#include <string.h>#endif#include "db_int.h"#include "dbinc/db_page.h"#include "dbinc/db_shash.h"#include "dbinc/lock.h"#include "dbinc/mp.h"#include "dbinc/btree.h"static int __bam_broot __P((DBC *, PAGE *, PAGE *, PAGE *));static int __bam_page __P((DBC *, EPG *, EPG *));static int __bam_pinsert __P((DBC *, EPG *, PAGE *, PAGE *, int));static int __bam_psplit __P((DBC *, EPG *, PAGE *, PAGE *, db_indx_t *));static int __bam_root __P((DBC *, EPG *));static int __ram_root __P((DBC *, PAGE *, PAGE *, PAGE *));/* * __bam_split -- * Split a page. * * PUBLIC: int __bam_split __P((DBC *, void *, db_pgno_t *)); */int__bam_split(dbc, arg, root_pgnop) DBC *dbc; void *arg; db_pgno_t *root_pgnop;{ BTREE_CURSOR *cp; enum { UP, DOWN } dir; db_pgno_t root_pgno; int exact, level, ret; cp = (BTREE_CURSOR *)dbc->internal; root_pgno = cp->root; /* * The locking protocol we use to avoid deadlock to acquire locks by * walking down the tree, but we do it as lazily as possible, locking * the root only as a last resort. We expect all stack pages to have * been discarded before we're called; we discard all short-term locks. * * When __bam_split is first called, we know that a leaf page was too * full for an insert. We don't know what leaf page it was, but we * have the key/recno that caused the problem. We call XX_search to * reacquire the leaf page, but this time get both the leaf page and * its parent, locked. We then split the leaf page and see if the new * internal key will fit into the parent page. If it will, we're done. * * If it won't, we discard our current locks and repeat the process, * only this time acquiring the parent page and its parent, locked. * This process repeats until we succeed in the split, splitting the * root page as the final resort. The entire process then repeats, * as necessary, until we split a leaf page. * * XXX * A traditional method of speeding this up is to maintain a stack of * the pages traversed in the original search. You can detect if the * stack is correct by storing the page's LSN when it was searched and * comparing that LSN with the current one when it's locked during the * split. This would be an easy change for this code, but I have no * numbers that indicate it's worthwhile. */ for (dir = UP, level = LEAFLEVEL;; dir == UP ? ++level : --level) { /* * Acquire a page and its parent, locked. */ if ((ret = (dbc->dbtype == DB_BTREE ? __bam_search(dbc, PGNO_INVALID, arg, S_WRPAIR, level, NULL, &exact) : __bam_rsearch(dbc, (db_recno_t *)arg, S_WRPAIR, level, &exact))) != 0) break; if (root_pgnop != NULL) *root_pgnop = cp->csp[0].page->pgno == root_pgno ? root_pgno : cp->csp[-1].page->pgno; /* * Split the page if it still needs it (it's possible another * thread of control has already split the page). If we are * guaranteed that two items will fit on the page, the split * is no longer necessary. */ if (2 * B_MAXSIZEONPAGE(cp->ovflsize) <= (db_indx_t)P_FREESPACE(dbc->dbp, cp->csp[0].page)) { __bam_stkrel(dbc, STK_NOLOCK); break; } ret = cp->csp[0].page->pgno == root_pgno ? __bam_root(dbc, &cp->csp[0]) : __bam_page(dbc, &cp->csp[-1], &cp->csp[0]); BT_STK_CLR(cp); switch (ret) { case 0: /* Once we've split the leaf page, we're done. */ if (level == LEAFLEVEL) return (0); /* Switch directions. */ if (dir == UP) dir = DOWN; break; case DB_NEEDSPLIT: /* * It's possible to fail to split repeatedly, as other * threads may be modifying the tree, or the page usage * is sufficiently bad that we don't get enough space * the first time. */ if (dir == DOWN) dir = UP; break; default: goto err; } }err: if (root_pgnop != NULL) *root_pgnop = cp->root; return (ret);}/* * __bam_root -- * Split the root page of a btree. */static int__bam_root(dbc, cp) DBC *dbc; EPG *cp;{ DB *dbp; DBT log_dbt; DB_LSN log_lsn; DB_MPOOLFILE *mpf; PAGE *lp, *rp; db_indx_t split; u_int32_t opflags; int ret, t_ret; dbp = dbc->dbp; mpf = dbp->mpf; lp = rp = NULL; /* Yeah, right. */ if (cp->page->level >= MAXBTREELEVEL) { __db_err(dbp->dbenv, "Too many btree levels: %d", cp->page->level); ret = ENOSPC; goto err; } /* Create new left and right pages for the split. */ if ((ret = __db_new(dbc, TYPE(cp->page), &lp)) != 0 || (ret = __db_new(dbc, TYPE(cp->page), &rp)) != 0) goto err; P_INIT(lp, dbp->pgsize, lp->pgno, PGNO_INVALID, ISINTERNAL(cp->page) ? PGNO_INVALID : rp->pgno, cp->page->level, TYPE(cp->page)); P_INIT(rp, dbp->pgsize, rp->pgno, ISINTERNAL(cp->page) ? PGNO_INVALID : lp->pgno, PGNO_INVALID, cp->page->level, TYPE(cp->page)); /* Split the page. */ if ((ret = __bam_psplit(dbc, cp, lp, rp, &split)) != 0) goto err; /* Log the change. */ if (DBC_LOGGING(dbc)) { memset(&log_dbt, 0, sizeof(log_dbt)); log_dbt.data = cp->page; log_dbt.size = dbp->pgsize; ZERO_LSN(log_lsn); opflags = F_ISSET( (BTREE_CURSOR *)dbc->internal, C_RECNUM) ? SPL_NRECS : 0; if ((ret = __bam_split_log(dbp, dbc->txn, &LSN(cp->page), 0, PGNO(lp), &LSN(lp), PGNO(rp), &LSN(rp), (u_int32_t)NUM_ENT(lp), 0, &log_lsn, dbc->internal->root, &log_dbt, opflags)) != 0) goto err; } else LSN_NOT_LOGGED(LSN(cp->page)); LSN(lp) = LSN(cp->page); LSN(rp) = LSN(cp->page); /* Clean up the new root page. */ if ((ret = (dbc->dbtype == DB_RECNO ? __ram_root(dbc, cp->page, lp, rp) : __bam_broot(dbc, cp->page, lp, rp))) != 0) goto err; /* Adjust any cursors. */ ret = __bam_ca_split(dbc, cp->page->pgno, lp->pgno, rp->pgno, split, 1); /* Success or error: release pages and locks. */err: if ((t_ret = __memp_fput(mpf, cp->page, DB_MPOOL_DIRTY)) != 0 && ret == 0) ret = t_ret; if ((t_ret = __TLPUT(dbc, cp->lock)) != 0 && ret == 0) ret = t_ret; if (lp != NULL && (t_ret = __memp_fput(mpf, lp, DB_MPOOL_DIRTY)) != 0 && ret == 0) ret = t_ret; if (rp != NULL && (t_ret = __memp_fput(mpf, rp, DB_MPOOL_DIRTY)) != 0 && ret == 0) ret = t_ret; return (ret);}/* * __bam_page -- * Split the non-root page of a btree. */static int__bam_page(dbc, pp, cp) DBC *dbc; EPG *pp, *cp;{ BTREE_CURSOR *bc; DBT log_dbt; DB_LSN log_lsn; DB *dbp; DB_LOCK rplock, tplock; DB_MPOOLFILE *mpf; DB_LSN save_lsn; PAGE *lp, *rp, *alloc_rp, *tp; db_indx_t split; u_int32_t opflags; int ret, t_ret; dbp = dbc->dbp; mpf = dbp->mpf; alloc_rp = lp = rp = tp = NULL; LOCK_INIT(rplock); LOCK_INIT(tplock); ret = -1; /* * Create a new right page for the split, and fill in everything * except its LSN and page number. * * We malloc space for both the left and right pages, so we don't get * a new page from the underlying buffer pool until we know the split * is going to succeed. The reason is that we can't release locks * acquired during the get-a-new-page process because metadata page * locks can't be discarded on failure since we may have modified the * free list. So, if you assume that we're holding a write lock on the * leaf page which ran out of space and started this split (e.g., we * have already written records to the page, or we retrieved a record * from it with the DB_RMW flag set), failing in a split with both a * leaf page locked and the metadata page locked can potentially lock * up the tree badly, because we've violated the rule of always locking * down the tree, and never up. */ if ((ret = __os_malloc(dbp->dbenv, dbp->pgsize, &rp)) != 0) goto err; P_INIT(rp, dbp->pgsize, 0, ISINTERNAL(cp->page) ? PGNO_INVALID : PGNO(cp->page), ISINTERNAL(cp->page) ? PGNO_INVALID : NEXT_PGNO(cp->page), cp->page->level, TYPE(cp->page)); /* * Create new left page for the split, and fill in everything * except its LSN and next-page page number. */ if ((ret = __os_malloc(dbp->dbenv, dbp->pgsize, &lp)) != 0) goto err; P_INIT(lp, dbp->pgsize, PGNO(cp->page), ISINTERNAL(cp->page) ? PGNO_INVALID : PREV_PGNO(cp->page), ISINTERNAL(cp->page) ? PGNO_INVALID : 0, cp->page->level, TYPE(cp->page)); /* * Split right. * * Only the indices are sorted on the page, i.e., the key/data pairs * aren't, so it's simpler to copy the data from the split page onto * two new pages instead of copying half the data to a new right page * and compacting the left page in place. Since the left page can't * change, we swap the original and the allocated left page after the * split. */ if ((ret = __bam_psplit(dbc, cp, lp, rp, &split)) != 0) goto err; /* * Test to see if we are going to be able to insert the new pages into * the parent page. The interesting failure here is that the parent * page can't hold the new keys, and has to be split in turn, in which * case we want to release all the locks we can. */ if ((ret = __bam_pinsert(dbc, pp, lp, rp, 1)) != 0) goto err; /* * Fix up the previous pointer of any leaf page following the split * page. * * There's interesting deadlock situations here as we try to write-lock * a page that's not in our direct ancestry. Consider a cursor walking * backward through the leaf pages, that has our following page locked, * and is waiting on a lock for the page we're splitting. In that case * we're going to deadlock here . It's probably OK, stepping backward * through the tree isn't a common operation. */ if (ISLEAF(cp->page) && NEXT_PGNO(cp->page) != PGNO_INVALID) { if ((ret = __db_lget(dbc, 0, NEXT_PGNO(cp->page), DB_LOCK_WRITE, 0, &tplock)) != 0) goto err; if ((ret = __memp_fget(mpf, &NEXT_PGNO(cp->page), 0, &tp)) != 0) goto err; } /* * We've got everything locked down we need, and we know the split * is going to succeed. Go and get the additional page we'll need. */ if ((ret = __db_new(dbc, TYPE(cp->page), &alloc_rp)) != 0) goto err; /* * Lock the new page. We need to do this for two reasons: first, the * fast-lookup code might have a reference to this page in bt_lpgno if * the page was recently deleted from the tree, and that code doesn't * walk the tree and so won't encounter the parent's page lock. * Second, a dirty reader could get to this page via the parent or old * page after the split is done but before the transaction is committed * or aborted. */ if ((ret = __db_lget(dbc, 0, PGNO(alloc_rp), DB_LOCK_WRITE, 0, &rplock)) != 0) goto err; /* * Fix up the page numbers we didn't have before. We have to do this * before calling __bam_pinsert because it may copy a page number onto * the parent page and it takes the page number from its page argument. */ PGNO(rp) = NEXT_PGNO(lp) = PGNO(alloc_rp); /* Actually update the parent page. */ if ((ret = __bam_pinsert(dbc, pp, lp, rp, 0)) != 0) goto err; bc = (BTREE_CURSOR *)dbc->internal; /* Log the change. */ if (DBC_LOGGING(dbc)) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -