📄 vfs_cluster.c
字号:
/*- * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*
 *	@(#)vfs_cluster.c	8.7 (Berkeley) 2/13/94
 */

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <libkern/libkern.h>

/*
 * With DEBUG defined, doreallocblks is a run-time variable that is also
 * referenced from the ctldebug entry debug13; otherwise it is compiled in
 * as the constant 1.
 */
#ifdef DEBUG
#include <vm/vm.h>
#include <sys/sysctl.h>
int doreallocblks = 1;
struct ctldebug debug13 = { "doreallocblks", &doreallocblks };
#else
/* XXX for cluster_write */
#define doreallocblks 1
#endif

/*
 * Local declarations
 */
struct buf *cluster_newbuf __P((struct vnode *, struct buf *, long, daddr_t,
	    daddr_t, long, int));
struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *,
	    daddr_t, daddr_t, long, int, long));
void	    cluster_wbuild __P((struct vnode *, struct buf *, long,
	    daddr_t, int, daddr_t));
struct cluster_save *cluster_collectbufs __P((struct vnode *, struct buf *));

#ifdef DIAGNOSTIC
/*
 * Set to 1 if reads of block zero should cause readahead to be done.
 * Set to 0 treats a read of block zero as a non-sequential read.
 *
 * Setting to one assumes that most reads of block zero of files are due to
 * sequential passes over the files (e.g. cat, sum) where additional blocks
 * will soon be needed.  Setting to zero assumes that the majority are
 * surgical strikes to get particular info (e.g. size, file) where readahead
 * blocks will not be used and, in fact, push out other potentially useful
 * blocks from the cache.  The former seems intuitive, but some quick tests
 * showed that the latter performed better from a system-wide point of view.
 */
int	doclusterraz = 0;
/*
 * ISSEQREAD(vp, blk) is true when blk equals the vnode's v_lastr or
 * v_lastr + 1.  Block 0 never counts as sequential unless DIAGNOSTIC is
 * defined and doclusterraz is nonzero.
 */
#define ISSEQREAD(vp, blk) \
	(((blk) != 0 || doclusterraz) && \
	 ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
#else
#define ISSEQREAD(vp, blk) \
	((blk) != 0 && ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
#endif

/*
 * This replaces bread.  If this is a bread at the beginning of a file and
 * lastr is 0, we assume this is the first read and we'll read up to two
 * blocks if they are sequential.
After that, we'll do regular read ahead * in clustered chunks. * * There are 4 or 5 cases depending on how you count: * Desired block is in the cache: * 1 Not sequential access (0 I/Os). * 2 Access is sequential, do read-ahead (1 ASYNC). * Desired block is not in cache: * 3 Not sequential access (1 SYNC). * 4 Sequential access, next block is contiguous (1 SYNC). * 5 Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC) * * There are potentially two buffers that require I/O. * bp is the block requested. * rbp is the read-ahead block. * If either is NULL, then you don't have to do the I/O. */cluster_read(vp, filesize, lblkno, size, cred, bpp) struct vnode *vp; u_quad_t filesize; daddr_t lblkno; long size; struct ucred *cred; struct buf **bpp;{ struct buf *bp, *rbp; daddr_t blkno, ioblkno; long flags; int error, num_ra, alreadyincore;#ifdef DIAGNOSTIC if (size == 0) panic("cluster_read: size = 0");#endif error = 0; flags = B_READ; *bpp = bp = getblk(vp, lblkno, size, 0, 0); if (bp->b_flags & B_CACHE) { /* * Desired block is in cache; do any readahead ASYNC. * Case 1, 2. */ trace(TR_BREADHIT, pack(vp, size), lblkno); flags |= B_ASYNC; ioblkno = lblkno + (vp->v_ralen ? vp->v_ralen : 1); alreadyincore = (int)incore(vp, ioblkno); bp = NULL; } else { /* Block wasn't in cache, case 3, 4, 5. */ trace(TR_BREADMISS, pack(vp, size), lblkno); bp->b_flags |= B_READ; ioblkno = lblkno; alreadyincore = 0; curproc->p_stats->p_ru.ru_inblock++; /* XXX */ } /* * XXX * Replace 1 with a window size based on some permutation of * maxcontig and rot_delay. This will let you figure out how * many blocks you should read-ahead (case 2, 4, 5). * * If the access isn't sequential, reset the window to 1. * Note that a read to the same block is considered sequential. * This catches the case where the file is being read sequentially, * but at smaller than the filesystem block size. 
*/ rbp = NULL; if (!ISSEQREAD(vp, lblkno)) { vp->v_ralen = 0; vp->v_maxra = lblkno; } else if ((ioblkno + 1) * size <= filesize && !alreadyincore && !(error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra)) && blkno != -1) { /* * Reading sequentially, and the next block is not in the * cache. We are going to try reading ahead. */ if (num_ra) { /* * If our desired readahead block had been read * in a previous readahead but is no longer in * core, then we may be reading ahead too far * or are not using our readahead very rapidly. * In this case we scale back the window. */ if (!alreadyincore && ioblkno <= vp->v_maxra) vp->v_ralen = max(vp->v_ralen >> 1, 1); /* * There are more sequential blocks than our current * window allows, scale up. Ideally we want to get * in sync with the filesystem maxcontig value. */ else if (num_ra > vp->v_ralen && lblkno != vp->v_lastr) vp->v_ralen = vp->v_ralen ? min(num_ra, vp->v_ralen << 1) : 1; if (num_ra > vp->v_ralen) num_ra = vp->v_ralen; } if (num_ra) /* case 2, 4 */ rbp = cluster_rbuild(vp, filesize, bp, ioblkno, blkno, size, num_ra, flags); else if (ioblkno == lblkno) { bp->b_blkno = blkno; /* Case 5: check how many blocks to read ahead */ ++ioblkno; if ((ioblkno + 1) * size > filesize || incore(vp, ioblkno) || (error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra)) || blkno == -1) goto skip_readahead; /* * Adjust readahead as above */ if (num_ra) { if (!alreadyincore && ioblkno <= vp->v_maxra) vp->v_ralen = max(vp->v_ralen >> 1, 1); else if (num_ra > vp->v_ralen && lblkno != vp->v_lastr) vp->v_ralen = vp->v_ralen ? 
min(num_ra,vp->v_ralen<<1) : 1; if (num_ra > vp->v_ralen) num_ra = vp->v_ralen; } flags |= B_ASYNC; if (num_ra) rbp = cluster_rbuild(vp, filesize, NULL, ioblkno, blkno, size, num_ra, flags); else { rbp = getblk(vp, ioblkno, size, 0, 0); rbp->b_flags |= flags; rbp->b_blkno = blkno; } } else { /* case 2; read ahead single block */ rbp = getblk(vp, ioblkno, size, 0, 0); rbp->b_flags |= flags; rbp->b_blkno = blkno; } if (rbp == bp) /* case 4 */ rbp = NULL; else if (rbp) { /* case 2, 5 */ trace(TR_BREADMISSRA, pack(vp, (num_ra + 1) * size), ioblkno); curproc->p_stats->p_ru.ru_inblock++; /* XXX */ } } /* XXX Kirk, do we need to make sure the bp has creds? */skip_readahead: if (bp) if (bp->b_flags & (B_DONE | B_DELWRI)) panic("cluster_read: DONE bp"); else error = VOP_STRATEGY(bp); if (rbp) if (error || rbp->b_flags & (B_DONE | B_DELWRI)) { rbp->b_flags &= ~(B_ASYNC | B_READ); brelse(rbp); } else (void) VOP_STRATEGY(rbp); /* * Recalculate our maximum readahead */ if (rbp == NULL) rbp = bp; if (rbp) vp->v_maxra = rbp->b_lblkno + (rbp->b_bufsize / size) - 1; if (bp) return(biowait(bp)); return(error);}/* * If blocks are contiguous on disk, use this to provide clustered * read ahead. We will read as many blocks as possible sequentially * and then parcel them up into logical blocks in the buffer hash table. 
*/struct buf *cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags) struct vnode *vp; u_quad_t filesize; struct buf *bp; daddr_t lbn; daddr_t blkno; long size; int run; long flags;{ struct cluster_save *b_save; struct buf *tbp; daddr_t bn; int i, inc;#ifdef DIAGNOSTIC if (size != vp->v_mount->mnt_stat.f_iosize) panic("cluster_rbuild: size %d != filesize %d\n", size, vp->v_mount->mnt_stat.f_iosize);#endif if (size * (lbn + run + 1) > filesize) --run; if (run == 0) { if (!bp) { bp = getblk(vp, lbn, size, 0, 0); bp->b_blkno = blkno; bp->b_flags |= flags; } return(bp); } bp = cluster_newbuf(vp, bp, flags, blkno, lbn, size, run + 1); if (bp->b_flags & (B_DONE | B_DELWRI)) return (bp); b_save = malloc(sizeof(struct buf *) * run + sizeof(struct cluster_save), M_SEGMENT, M_WAITOK); b_save->bs_bufsize = b_save->bs_bcount = size; b_save->bs_nchildren = 0; b_save->bs_children = (struct buf **)(b_save + 1); b_save->bs_saveaddr = bp->b_saveaddr; bp->b_saveaddr = (caddr_t) b_save; inc = btodb(size); for (bn = blkno + inc, i = 1; i <= run; ++i, bn += inc) { if (incore(vp, lbn + i)) { if (i == 1) { bp->b_saveaddr = b_save->bs_saveaddr; bp->b_flags &= ~B_CALL; bp->b_iodone = NULL; allocbuf(bp, size); free(b_save, M_SEGMENT); } else allocbuf(bp, size * i); break; } tbp = getblk(vp, lbn + i, 0, 0, 0); /* * getblk may return some memory in the buffer if there were * no empty buffers to shed it to. If there is currently * memory in the buffer, we move it down size bytes to make * room for the valid pages that cluster_callback will insert. * We do this now so we don't have to do it at interrupt time * in the callback routine. */ if (tbp->b_bufsize != 0) { caddr_t bdata = (char *)tbp->b_data; if (tbp->b_bufsize + size > MAXBSIZE) panic("cluster_rbuild: too much memory"); if (tbp->b_bufsize > size) { /* * XXX if the source and destination regions * overlap we have to copy backward to avoid * clobbering any valid pages (i.e. 
pagemove * implementations typically can't handle * overlap). */ bdata += tbp->b_bufsize; while (bdata > (char *)tbp->b_data) { bdata -= CLBYTES; pagemove(bdata, bdata + size, CLBYTES); } } else pagemove(bdata, bdata + size, tbp->b_bufsize); } tbp->b_blkno = bn; tbp->b_flags |= flags | B_READ | B_ASYNC; ++b_save->bs_nchildren; b_save->bs_children[i - 1] = tbp; } return(bp);}/* * Either get a new buffer or grow the existing one. */struct buf *cluster_newbuf(vp, bp, flags, blkno, lblkno, size, run)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -