📄 bt_split.c

📁 File system using stacked.
💻 C
📖 第 1 页 / 共 3 页
字号:
上一页 1 23
			if (F_ISSET(dbc, DBC_OPD)) {				if (dbp->dup_compare == __bam_defcmp)					func = __bam_defpfx;				else					func = NULL;			} else				func = t->bt_prefix;			nbytes = BINTERNAL_PSIZE(child_bk->len);			nksize = child_bk->len;			if (func == NULL)				goto noprefix;			if (ppage->prev_pgno == PGNO_INVALID && off <= 1)				goto noprefix;			tmp_bk = GET_BKEYDATA(dbp, lchild, NUM_ENT(lchild) -			    (TYPE(lchild) == P_LDUP ? O_INDX : P_INDX));			if (B_TYPE(tmp_bk->type) != B_KEYDATA)				goto noprefix;			memset(&a, 0, sizeof(a));			a.size = tmp_bk->len;			a.data = tmp_bk->data;			memset(&b, 0, sizeof(b));			b.size = child_bk->len;			b.data = child_bk->data;			nksize = (u_int32_t)func(dbp, &a, &b);			if ((n = BINTERNAL_PSIZE(nksize)) < nbytes)				nbytes = n;			elsenoprefix:			nksize = child_bk->len;			if (P_FREESPACE(dbp, ppage) < nbytes)				return (DB_NEEDSPLIT);			if (space_check)				return (0);			memset(&bi, 0, sizeof(bi));			bi.len = nksize;			B_TSET(bi.type, child_bk->type, 0);			bi.pgno = rchild->pgno;			bi.nrecs = nrecs;			memset(&hdr, 0, sizeof(hdr));			hdr.data = &bi;			hdr.size = SSZA(BINTERNAL, data);			memset(&data, 0, sizeof(data));			data.data = child_bk->data;			data.size = nksize;			if ((ret = __db_pitem(dbc, ppage, off,			    BINTERNAL_SIZE(nksize), &hdr, &data)) != 0)				return (ret);			break;		case B_DUPLICATE:		case B_OVERFLOW:			nbytes = BINTERNAL_PSIZE(BOVERFLOW_SIZE);			if (P_FREESPACE(dbp, ppage) < nbytes)				return (DB_NEEDSPLIT);			if (space_check)				return (0);			memset(&bi, 0, sizeof(bi));			bi.len = BOVERFLOW_SIZE;			B_TSET(bi.type, child_bk->type, 0);			bi.pgno = rchild->pgno;			bi.nrecs = nrecs;			memset(&hdr, 0, sizeof(hdr));			hdr.data = &bi;			hdr.size = SSZA(BINTERNAL, data);			memset(&data, 0, sizeof(data));			data.data = child_bk;			data.size = BOVERFLOW_SIZE;			if ((ret = __db_pitem(dbc, ppage, off,			    BINTERNAL_SIZE(BOVERFLOW_SIZE), &hdr, &data)) != 0)				return (ret);			/* Increment the overflow ref count. */			if (B_TYPE(child_bk->type) == B_OVERFLOW)				if ((ret = __db_ovref(dbc,				    ((BOVERFLOW *)child_bk)->pgno, 1)) != 0)					return (ret);			break;		default:			return (__db_pgfmt(dbp->dbenv, rchild->pgno));		}		break;	case P_IRECNO:	case P_LRECNO:		nbytes = RINTERNAL_PSIZE;		if (P_FREESPACE(dbp, ppage) < nbytes)			return (DB_NEEDSPLIT);		if (space_check)			return (0);		/* Add a new record for the right page. */		memset(&hdr, 0, sizeof(hdr));		hdr.data = &ri;		hdr.size = RINTERNAL_SIZE;		ri.pgno = rchild->pgno;		ri.nrecs = nrecs;		if ((ret = __db_pitem(dbc,		    ppage, off, RINTERNAL_SIZE, &hdr, NULL)) != 0)			return (ret);		break;	default:		return (__db_pgfmt(dbp->dbenv, rchild->pgno));	}	/*	 * If a Recno or Btree with record numbers AM page, or an off-page	 * duplicates tree, adjust the parent page's left page record count.	 */	if (F_ISSET(cp, C_RECNUM)) {		/* Log the change. */		if (DBC_LOGGING(dbc)) {		if ((ret = __bam_cadjust_log(dbp, dbc->txn,		    &LSN(ppage), 0, PGNO(ppage),		    &LSN(ppage), parent->indx, -(int32_t)nrecs, 0)) != 0)			return (ret);		} else			LSN_NOT_LOGGED(LSN(ppage));		/* Update the left page count. */		if (dbc->dbtype == DB_RECNO)			GET_RINTERNAL(dbp, ppage, parent->indx)->nrecs -= nrecs;		else			GET_BINTERNAL(dbp, ppage, parent->indx)->nrecs -= nrecs;	}	return (0);}/* * __bam_psplit -- *	Do the real work of splitting the page. */static int__bam_psplit(dbc, cp, lp, rp, splitret)	DBC *dbc;	EPG *cp;	PAGE *lp, *rp;	db_indx_t *splitret;{	DB *dbp;	PAGE *pp;	db_indx_t half, *inp, nbytes, off, splitp, top;	int adjust, cnt, iflag, isbigkey, ret;	dbp = dbc->dbp;	pp = cp->page;	inp = P_INP(dbp, pp);	adjust = TYPE(pp) == P_LBTREE ? P_INDX : O_INDX;	/*	 * If we're splitting the first (last) page on a level because we're	 * inserting (appending) a key to it, it's likely that the data is	 * sorted.  Moving a single item to the new page is less work and can	 * push the fill factor higher than normal.  This is trivial when we	 * are splitting a new page before the beginning of the tree, all of	 * the interesting tests are against values of 0.	 *	 * Catching appends to the tree is harder.  In a simple append, we're	 * inserting an item that sorts past the end of the tree; the cursor	 * will point past the last element on the page.  But, in trees with	 * duplicates, the cursor may point to the last entry on the page --	 * in this case, the entry will also be the last element of a duplicate	 * set (the last because the search call specified the S_DUPLAST flag).	 * The only way to differentiate between an insert immediately before	 * the last item in a tree or an append after a duplicate set which is	 * also the last item in the tree is to call the comparison function.	 * When splitting internal pages during an append, the search code	 * guarantees the cursor always points to the largest page item less	 * than the new internal entry.  To summarize, we want to catch three	 * possible index values:	 *	 *	NUM_ENT(page)		Btree/Recno leaf insert past end-of-tree	 *	NUM_ENT(page) - O_INDX	Btree or Recno internal insert past EOT	 *	NUM_ENT(page) - P_INDX	Btree leaf insert past EOT after a set	 *				    of duplicates	 *	 * two of which, (NUM_ENT(page) - O_INDX or P_INDX) might be an insert	 * near the end of the tree, and not after the end of the tree at all.	 * Do a simple test which might be wrong because calling the comparison	 * functions is expensive.  Regardless, it's not a big deal if we're	 * wrong, we'll do the split the right way next time.	 */	off = 0;	if (NEXT_PGNO(pp) == PGNO_INVALID && cp->indx >= NUM_ENT(pp) - adjust)		off = NUM_ENT(pp) - adjust;	else if (PREV_PGNO(pp) == PGNO_INVALID && cp->indx == 0)		off = adjust;	if (off != 0)		goto sort;	/*	 * Split the data to the left and right pages.  Try not to split on	 * an overflow key.  (Overflow keys on internal pages will slow down	 * searches.)  Refuse to split in the middle of a set of duplicates.	 *	 * First, find the optimum place to split.	 *	 * It's possible to try and split past the last record on the page if	 * there's a very large record at the end of the page.  Make sure this	 * doesn't happen by bounding the check at the next-to-last entry on	 * the page.	 *	 * Note, we try and split half the data present on the page.  This is	 * because another process may have already split the page and left	 * it half empty.  We don't try and skip the split -- we don't know	 * how much space we're going to need on the page, and we may need up	 * to half the page for a big item, so there's no easy test to decide	 * if we need to split or not.  Besides, if two threads are inserting	 * data into the same place in the database, we're probably going to	 * need more space soon anyway.	 */	top = NUM_ENT(pp) - adjust;	half = (dbp->pgsize - HOFFSET(pp)) / 2;	for (nbytes = 0, off = 0; off < top && nbytes < half; ++off)		switch (TYPE(pp)) {		case P_IBTREE:			if (B_TYPE(			    GET_BINTERNAL(dbp, pp, off)->type) == B_KEYDATA)				nbytes += BINTERNAL_SIZE(				   GET_BINTERNAL(dbp, pp, off)->len);			else				nbytes += BINTERNAL_SIZE(BOVERFLOW_SIZE);			break;		case P_LBTREE:			if (B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) ==			    B_KEYDATA)				nbytes += BKEYDATA_SIZE(GET_BKEYDATA(dbp,				    pp, off)->len);			else				nbytes += BOVERFLOW_SIZE;			++off;			/* FALLTHROUGH */		case P_LDUP:		case P_LRECNO:			if (B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) ==			    B_KEYDATA)				nbytes += BKEYDATA_SIZE(GET_BKEYDATA(dbp,				    pp, off)->len);			else				nbytes += BOVERFLOW_SIZE;			break;		case P_IRECNO:			nbytes += RINTERNAL_SIZE;			break;		default:			return (__db_pgfmt(dbp->dbenv, pp->pgno));		}sort:	splitp = off;	/*	 * Splitp is either at or just past the optimum split point.  If the	 * tree type is such that we're going to promote a key to an internal	 * page, and our current choice is an overflow key, look for something	 * close by that's smaller.	 */	switch (TYPE(pp)) {	case P_IBTREE:		iflag = 1;		isbigkey =		    B_TYPE(GET_BINTERNAL(dbp, pp, off)->type) != B_KEYDATA;		break;	case P_LBTREE:	case P_LDUP:		iflag = 0;		isbigkey = B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) !=		    B_KEYDATA;		break;	default:		iflag = isbigkey = 0;	}	if (isbigkey)		for (cnt = 1; cnt <= 3; ++cnt) {			off = splitp + cnt * adjust;			if (off < (db_indx_t)NUM_ENT(pp) &&			    ((iflag && B_TYPE(			    GET_BINTERNAL(dbp, pp,off)->type) == B_KEYDATA) ||			    B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) ==			    B_KEYDATA)) {				splitp = off;				break;			}			if (splitp <= (db_indx_t)(cnt * adjust))				continue;			off = splitp - cnt * adjust;			if (iflag ? B_TYPE(			    GET_BINTERNAL(dbp, pp, off)->type) == B_KEYDATA :			    B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) ==			    B_KEYDATA) {				splitp = off;				break;			}		}	/*	 * We can't split in the middle a set of duplicates.  We know that	 * no duplicate set can take up more than about 25% of the page,	 * because that's the point where we push it off onto a duplicate	 * page set.  So, this loop can't be unbounded.	 */	if (TYPE(pp) == P_LBTREE &&	    inp[splitp] == inp[splitp - adjust])		for (cnt = 1;; ++cnt) {			off = splitp + cnt * adjust;			if (off < NUM_ENT(pp) &&			    inp[splitp] != inp[off]) {				splitp = off;				break;			}			if (splitp <= (db_indx_t)(cnt * adjust))				continue;			off = splitp - cnt * adjust;			if (inp[splitp] != inp[off]) {				splitp = off + adjust;				break;			}		}	/* We're going to split at splitp. */	if ((ret = __bam_copy(dbp, pp, lp, 0, splitp)) != 0)		return (ret);	if ((ret = __bam_copy(dbp, pp, rp, splitp, NUM_ENT(pp))) != 0)		return (ret);	*splitret = splitp;	return (0);}/* * __bam_copy -- *	Copy a set of records from one page to another. * * PUBLIC: int __bam_copy __P((DB *, PAGE *, PAGE *, u_int32_t, u_int32_t)); */int__bam_copy(dbp, pp, cp, nxt, stop)	DB *dbp;	PAGE *pp, *cp;	u_int32_t nxt, stop;{	db_indx_t *cinp, nbytes, off, *pinp;	cinp = P_INP(dbp, cp);	pinp = P_INP(dbp, pp);	/*	 * Nxt is the offset of the next record to be placed on the target page.	 */	for (off = 0; nxt < stop; ++nxt, ++NUM_ENT(cp), ++off) {		switch (TYPE(pp)) {		case P_IBTREE:			if (B_TYPE(			    GET_BINTERNAL(dbp, pp, nxt)->type) == B_KEYDATA)				nbytes = BINTERNAL_SIZE(				    GET_BINTERNAL(dbp, pp, nxt)->len);			else				nbytes = BINTERNAL_SIZE(BOVERFLOW_SIZE);			break;		case P_LBTREE:			/*			 * If we're on a key and it's a duplicate, just copy			 * the offset.			 */			if (off != 0 && (nxt % P_INDX) == 0 &&			    pinp[nxt] == pinp[nxt - P_INDX]) {				cinp[off] = cinp[off - P_INDX];				continue;			}			/* FALLTHROUGH */		case P_LDUP:		case P_LRECNO:			if (B_TYPE(GET_BKEYDATA(dbp, pp, nxt)->type) ==			    B_KEYDATA)				nbytes = BKEYDATA_SIZE(GET_BKEYDATA(dbp,				    pp, nxt)->len);			else				nbytes = BOVERFLOW_SIZE;			break;		case P_IRECNO:			nbytes = RINTERNAL_SIZE;			break;		default:			return (__db_pgfmt(dbp->dbenv, pp->pgno));		}		cinp[off] = HOFFSET(cp) -= nbytes;		memcpy(P_ENTRY(dbp, cp, off), P_ENTRY(dbp, pp, nxt), nbytes);	}	return (0);}
上一页 1 23
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -