📄 bt_split.c
字号:
if (F_ISSET(dbc, DBC_OPD)) { if (dbp->dup_compare == __bam_defcmp) func = __bam_defpfx; else func = NULL; } else func = t->bt_prefix; nbytes = BINTERNAL_PSIZE(child_bk->len); nksize = child_bk->len; if (func == NULL) goto noprefix; if (ppage->prev_pgno == PGNO_INVALID && off <= 1) goto noprefix; tmp_bk = GET_BKEYDATA(dbp, lchild, NUM_ENT(lchild) - (TYPE(lchild) == P_LDUP ? O_INDX : P_INDX)); if (B_TYPE(tmp_bk->type) != B_KEYDATA) goto noprefix; memset(&a, 0, sizeof(a)); a.size = tmp_bk->len; a.data = tmp_bk->data; memset(&b, 0, sizeof(b)); b.size = child_bk->len; b.data = child_bk->data; nksize = (u_int32_t)func(dbp, &a, &b); if ((n = BINTERNAL_PSIZE(nksize)) < nbytes) nbytes = n; elsenoprefix: nksize = child_bk->len; if (P_FREESPACE(dbp, ppage) < nbytes) return (DB_NEEDSPLIT); if (space_check) return (0); memset(&bi, 0, sizeof(bi)); bi.len = nksize; B_TSET(bi.type, child_bk->type, 0); bi.pgno = rchild->pgno; bi.nrecs = nrecs; memset(&hdr, 0, sizeof(hdr)); hdr.data = &bi; hdr.size = SSZA(BINTERNAL, data); memset(&data, 0, sizeof(data)); data.data = child_bk->data; data.size = nksize; if ((ret = __db_pitem(dbc, ppage, off, BINTERNAL_SIZE(nksize), &hdr, &data)) != 0) return (ret); break; case B_DUPLICATE: case B_OVERFLOW: nbytes = BINTERNAL_PSIZE(BOVERFLOW_SIZE); if (P_FREESPACE(dbp, ppage) < nbytes) return (DB_NEEDSPLIT); if (space_check) return (0); memset(&bi, 0, sizeof(bi)); bi.len = BOVERFLOW_SIZE; B_TSET(bi.type, child_bk->type, 0); bi.pgno = rchild->pgno; bi.nrecs = nrecs; memset(&hdr, 0, sizeof(hdr)); hdr.data = &bi; hdr.size = SSZA(BINTERNAL, data); memset(&data, 0, sizeof(data)); data.data = child_bk; data.size = BOVERFLOW_SIZE; if ((ret = __db_pitem(dbc, ppage, off, BINTERNAL_SIZE(BOVERFLOW_SIZE), &hdr, &data)) != 0) return (ret); /* Increment the overflow ref count. */ if (B_TYPE(child_bk->type) == B_OVERFLOW) if ((ret = __db_ovref(dbc, ((BOVERFLOW *)child_bk)->pgno, 1)) != 0) return (ret); break; default: return (__db_pgfmt(dbp->dbenv, rchild->pgno)); } break; case P_IRECNO: case P_LRECNO: nbytes = RINTERNAL_PSIZE; if (P_FREESPACE(dbp, ppage) < nbytes) return (DB_NEEDSPLIT); if (space_check) return (0); /* Add a new record for the right page. */ memset(&hdr, 0, sizeof(hdr)); hdr.data = &ri; hdr.size = RINTERNAL_SIZE; ri.pgno = rchild->pgno; ri.nrecs = nrecs; if ((ret = __db_pitem(dbc, ppage, off, RINTERNAL_SIZE, &hdr, NULL)) != 0) return (ret); break; default: return (__db_pgfmt(dbp->dbenv, rchild->pgno)); } /* * If a Recno or Btree with record numbers AM page, or an off-page * duplicates tree, adjust the parent page's left page record count. */ if (F_ISSET(cp, C_RECNUM)) { /* Log the change. */ if (DBC_LOGGING(dbc)) { if ((ret = __bam_cadjust_log(dbp, dbc->txn, &LSN(ppage), 0, PGNO(ppage), &LSN(ppage), parent->indx, -(int32_t)nrecs, 0)) != 0) return (ret); } else LSN_NOT_LOGGED(LSN(ppage)); /* Update the left page count. */ if (dbc->dbtype == DB_RECNO) GET_RINTERNAL(dbp, ppage, parent->indx)->nrecs -= nrecs; else GET_BINTERNAL(dbp, ppage, parent->indx)->nrecs -= nrecs; } return (0);}/* * __bam_psplit -- * Do the real work of splitting the page. */static int__bam_psplit(dbc, cp, lp, rp, splitret) DBC *dbc; EPG *cp; PAGE *lp, *rp; db_indx_t *splitret;{ DB *dbp; PAGE *pp; db_indx_t half, *inp, nbytes, off, splitp, top; int adjust, cnt, iflag, isbigkey, ret; dbp = dbc->dbp; pp = cp->page; inp = P_INP(dbp, pp); adjust = TYPE(pp) == P_LBTREE ? P_INDX : O_INDX; /* * If we're splitting the first (last) page on a level because we're * inserting (appending) a key to it, it's likely that the data is * sorted. Moving a single item to the new page is less work and can * push the fill factor higher than normal. This is trivial when we * are splitting a new page before the beginning of the tree, all of * the interesting tests are against values of 0. * * Catching appends to the tree is harder. In a simple append, we're * inserting an item that sorts past the end of the tree; the cursor * will point past the last element on the page. But, in trees with * duplicates, the cursor may point to the last entry on the page -- * in this case, the entry will also be the last element of a duplicate * set (the last because the search call specified the S_DUPLAST flag). * The only way to differentiate between an insert immediately before * the last item in a tree or an append after a duplicate set which is * also the last item in the tree is to call the comparison function. * When splitting internal pages during an append, the search code * guarantees the cursor always points to the largest page item less * than the new internal entry. To summarize, we want to catch three * possible index values: * * NUM_ENT(page) Btree/Recno leaf insert past end-of-tree * NUM_ENT(page) - O_INDX Btree or Recno internal insert past EOT * NUM_ENT(page) - P_INDX Btree leaf insert past EOT after a set * of duplicates * * two of which, (NUM_ENT(page) - O_INDX or P_INDX) might be an insert * near the end of the tree, and not after the end of the tree at all. * Do a simple test which might be wrong because calling the comparison * functions is expensive. Regardless, it's not a big deal if we're * wrong, we'll do the split the right way next time. */ off = 0; if (NEXT_PGNO(pp) == PGNO_INVALID && cp->indx >= NUM_ENT(pp) - adjust) off = NUM_ENT(pp) - adjust; else if (PREV_PGNO(pp) == PGNO_INVALID && cp->indx == 0) off = adjust; if (off != 0) goto sort; /* * Split the data to the left and right pages. Try not to split on * an overflow key. (Overflow keys on internal pages will slow down * searches.) Refuse to split in the middle of a set of duplicates. * * First, find the optimum place to split. * * It's possible to try and split past the last record on the page if * there's a very large record at the end of the page. Make sure this * doesn't happen by bounding the check at the next-to-last entry on * the page. * * Note, we try and split half the data present on the page. This is * because another process may have already split the page and left * it half empty. We don't try and skip the split -- we don't know * how much space we're going to need on the page, and we may need up * to half the page for a big item, so there's no easy test to decide * if we need to split or not. Besides, if two threads are inserting * data into the same place in the database, we're probably going to * need more space soon anyway. */ top = NUM_ENT(pp) - adjust; half = (dbp->pgsize - HOFFSET(pp)) / 2; for (nbytes = 0, off = 0; off < top && nbytes < half; ++off) switch (TYPE(pp)) { case P_IBTREE: if (B_TYPE( GET_BINTERNAL(dbp, pp, off)->type) == B_KEYDATA) nbytes += BINTERNAL_SIZE( GET_BINTERNAL(dbp, pp, off)->len); else nbytes += BINTERNAL_SIZE(BOVERFLOW_SIZE); break; case P_LBTREE: if (B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) == B_KEYDATA) nbytes += BKEYDATA_SIZE(GET_BKEYDATA(dbp, pp, off)->len); else nbytes += BOVERFLOW_SIZE; ++off; /* FALLTHROUGH */ case P_LDUP: case P_LRECNO: if (B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) == B_KEYDATA) nbytes += BKEYDATA_SIZE(GET_BKEYDATA(dbp, pp, off)->len); else nbytes += BOVERFLOW_SIZE; break; case P_IRECNO: nbytes += RINTERNAL_SIZE; break; default: return (__db_pgfmt(dbp->dbenv, pp->pgno)); }sort: splitp = off; /* * Splitp is either at or just past the optimum split point. If the * tree type is such that we're going to promote a key to an internal * page, and our current choice is an overflow key, look for something * close by that's smaller. */ switch (TYPE(pp)) { case P_IBTREE: iflag = 1; isbigkey = B_TYPE(GET_BINTERNAL(dbp, pp, off)->type) != B_KEYDATA; break; case P_LBTREE: case P_LDUP: iflag = 0; isbigkey = B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) != B_KEYDATA; break; default: iflag = isbigkey = 0; } if (isbigkey) for (cnt = 1; cnt <= 3; ++cnt) { off = splitp + cnt * adjust; if (off < (db_indx_t)NUM_ENT(pp) && ((iflag && B_TYPE( GET_BINTERNAL(dbp, pp,off)->type) == B_KEYDATA) || B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) == B_KEYDATA)) { splitp = off; break; } if (splitp <= (db_indx_t)(cnt * adjust)) continue; off = splitp - cnt * adjust; if (iflag ? B_TYPE( GET_BINTERNAL(dbp, pp, off)->type) == B_KEYDATA : B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) == B_KEYDATA) { splitp = off; break; } } /* * We can't split in the middle a set of duplicates. We know that * no duplicate set can take up more than about 25% of the page, * because that's the point where we push it off onto a duplicate * page set. So, this loop can't be unbounded. */ if (TYPE(pp) == P_LBTREE && inp[splitp] == inp[splitp - adjust]) for (cnt = 1;; ++cnt) { off = splitp + cnt * adjust; if (off < NUM_ENT(pp) && inp[splitp] != inp[off]) { splitp = off; break; } if (splitp <= (db_indx_t)(cnt * adjust)) continue; off = splitp - cnt * adjust; if (inp[splitp] != inp[off]) { splitp = off + adjust; break; } } /* We're going to split at splitp. */ if ((ret = __bam_copy(dbp, pp, lp, 0, splitp)) != 0) return (ret); if ((ret = __bam_copy(dbp, pp, rp, splitp, NUM_ENT(pp))) != 0) return (ret); *splitret = splitp; return (0);}/* * __bam_copy -- * Copy a set of records from one page to another. * * PUBLIC: int __bam_copy __P((DB *, PAGE *, PAGE *, u_int32_t, u_int32_t)); */int__bam_copy(dbp, pp, cp, nxt, stop) DB *dbp; PAGE *pp, *cp; u_int32_t nxt, stop;{ db_indx_t *cinp, nbytes, off, *pinp; cinp = P_INP(dbp, cp); pinp = P_INP(dbp, pp); /* * Nxt is the offset of the next record to be placed on the target page. */ for (off = 0; nxt < stop; ++nxt, ++NUM_ENT(cp), ++off) { switch (TYPE(pp)) { case P_IBTREE: if (B_TYPE( GET_BINTERNAL(dbp, pp, nxt)->type) == B_KEYDATA) nbytes = BINTERNAL_SIZE( GET_BINTERNAL(dbp, pp, nxt)->len); else nbytes = BINTERNAL_SIZE(BOVERFLOW_SIZE); break; case P_LBTREE: /* * If we're on a key and it's a duplicate, just copy * the offset. */ if (off != 0 && (nxt % P_INDX) == 0 && pinp[nxt] == pinp[nxt - P_INDX]) { cinp[off] = cinp[off - P_INDX]; continue; } /* FALLTHROUGH */ case P_LDUP: case P_LRECNO: if (B_TYPE(GET_BKEYDATA(dbp, pp, nxt)->type) == B_KEYDATA) nbytes = BKEYDATA_SIZE(GET_BKEYDATA(dbp, pp, nxt)->len); else nbytes = BOVERFLOW_SIZE; break; case P_IRECNO: nbytes = RINTERNAL_SIZE; break; default: return (__db_pgfmt(dbp->dbenv, pp->pgno)); } cinp[off] = HOFFSET(cp) -= nbytes; memcpy(P_ENTRY(dbp, cp, off), P_ENTRY(dbp, pp, nxt), nbytes); } return (0);}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -