📄 extent_map.c
/* -*- mode: c; c-basic-offset: 8; -*-
 * vim: noexpandtab sw=8 ts=8 sts=0:
 *
 * extent_map.c
 *
 * In-memory extent map for OCFS2.  Man, this code was prettier in
 * the library.
 *
 * Copyright (C) 2004 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License, version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include <linux/fs.h>
#include <linux/init.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/rbtree.h>

#define MLOG_MASK_PREFIX ML_EXTENT_MAP
#include <cluster/masklog.h>

#include "ocfs2.h"

#include "extent_map.h"
#include "inode.h"
#include "super.h"

#include "buffer_head_io.h"

/*
 * SUCK SUCK SUCK
 * Our headers are so bad that struct ocfs2_extent_map is in ocfs.h
 */

struct ocfs2_extent_map_entry {
        struct rb_node e_node;
        int e_tree_depth;
        struct ocfs2_extent_rec e_rec;
};

struct ocfs2_em_insert_context {
        int need_left;
        int need_right;
        struct ocfs2_extent_map_entry *new_ent;
        struct ocfs2_extent_map_entry *old_ent;
        struct ocfs2_extent_map_entry *left_ent;
        struct ocfs2_extent_map_entry *right_ent;
};

static kmem_cache_t *ocfs2_em_ent_cachep = NULL;

static struct ocfs2_extent_map_entry *
ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
                        u32 cpos, u32 clusters,
                        struct rb_node ***ret_p,
                        struct rb_node **ret_parent);
static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
                                         struct ocfs2_extent_map_entry *ent);
static int ocfs2_extent_map_find_leaf(struct inode *inode,
                                      u32 cpos, u32 clusters,
                                      struct ocfs2_extent_list *el);
static int ocfs2_extent_map_lookup_read(struct inode *inode,
                                        u32 cpos, u32 clusters,
                                        struct ocfs2_extent_map_entry **ret_ent);
static int ocfs2_extent_map_try_insert(struct inode *inode,
                                       struct ocfs2_extent_rec *rec,
                                       int tree_depth,
                                       struct ocfs2_em_insert_context *ctxt);

/*
 * Returns 1 only if the rec contains all the given clusters -- that is,
 * rec's cpos is <= the cluster's cpos and the rec endpoint (cpos +
 * clusters) is >= the argument's endpoint.
 */
static int ocfs2_extent_rec_contains_clusters(struct ocfs2_extent_rec *rec,
                                              u32 cpos, u32 clusters)
{
        if (le32_to_cpu(rec->e_cpos) > cpos)
                return 0;
        if (cpos + clusters > le32_to_cpu(rec->e_cpos) +
                              le32_to_cpu(rec->e_clusters))
                return 0;
        return 1;
}
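/*
 * Illustrative example, not part of the original extent_map.c: a record
 * with e_cpos = 100 and e_clusters = 50 covers the half-open cluster
 * range [100, 150).  A query of cpos = 120, clusters = 10, i.e.
 * [120, 130), is fully contained, so the helper above returns 1; a
 * query of cpos = 140, clusters = 20, i.e. [140, 160), runs past the
 * record's endpoint, so it returns 0.
 */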
/*
 * Find an entry in the tree that intersects the region passed in.
 * Note that this will find straddled intervals, it is up to the
 * callers to enforce any boundary conditions.
 *
 * Callers must hold ip_lock.  This lookup is not guaranteed to return
 * a tree_depth 0 match, and as such can race inserts if the lock
 * were not held.
 *
 * The rb_node garbage lets insertion share the search.  Trivial
 * callers pass NULL.
 */
static struct ocfs2_extent_map_entry *
ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
                        u32 cpos, u32 clusters,
                        struct rb_node ***ret_p,
                        struct rb_node **ret_parent)
{
        struct rb_node **p = &em->em_extents.rb_node;
        struct rb_node *parent = NULL;
        struct ocfs2_extent_map_entry *ent = NULL;

        while (*p) {
                parent = *p;
                ent = rb_entry(parent, struct ocfs2_extent_map_entry,
                               e_node);
                if ((cpos + clusters) <= le32_to_cpu(ent->e_rec.e_cpos)) {
                        p = &(*p)->rb_left;
                        ent = NULL;
                } else if (cpos >= (le32_to_cpu(ent->e_rec.e_cpos) +
                                    le32_to_cpu(ent->e_rec.e_clusters))) {
                        p = &(*p)->rb_right;
                        ent = NULL;
                } else
                        break;
        }

        if (ret_p != NULL)
                *ret_p = p;
        if (ret_parent != NULL)
                *ret_parent = parent;
        return ent;
}
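/*
 * Illustrative sketch, not part of the original file: the rbtree walk
 * in ocfs2_extent_map_lookup() above is an interval-overlap search.
 * In CPU-endian terms, the per-node decision reduces to the
 * hypothetical helper below, where -1 means "descend left", 1 means
 * "descend right", and 0 means the entry overlaps [cpos, cpos + clusters).
 */
static inline int em_search_direction(u32 ent_cpos, u32 ent_clusters,
                                      u32 cpos, u32 clusters)
{
        if ((cpos + clusters) <= ent_cpos)
                return -1;      /* query ends at or before the entry starts */
        if (cpos >= (ent_cpos + ent_clusters))
                return 1;       /* query starts at or after the entry ends */
        return 0;               /* the half-open intervals overlap */
}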
/*
 * Find the leaf containing the interval we want.  While we're on our
 * way down the tree, fill in every record we see at any depth, because
 * we might want it later.
 *
 * Note that this code is run without ip_lock.  That's because it
 * sleeps while reading.  If someone is also filling the extent list at
 * the same time we are, we might have to restart.
 */
static int ocfs2_extent_map_find_leaf(struct inode *inode,
                                      u32 cpos, u32 clusters,
                                      struct ocfs2_extent_list *el)
{
        int i, ret;
        struct buffer_head *eb_bh = NULL;
        u64 blkno;
        u32 rec_end;
        struct ocfs2_extent_block *eb;
        struct ocfs2_extent_rec *rec;

        /*
         * The bh data containing the el cannot change here, because
         * we hold alloc_sem.  So we can do this without other
         * locks.
         */
        while (el->l_tree_depth) {
                blkno = 0;
                for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
                        rec = &el->l_recs[i];
                        rec_end = (le32_to_cpu(rec->e_cpos) +
                                   le32_to_cpu(rec->e_clusters));

                        ret = -EBADR;
                        if (rec_end > OCFS2_I(inode)->ip_clusters) {
                                mlog_errno(ret);
                                ocfs2_error(inode->i_sb,
                                            "Extent %d at e_blkno %"MLFu64" of inode %"MLFu64" goes past ip_clusters of %u\n",
                                            i,
                                            le64_to_cpu(rec->e_blkno),
                                            OCFS2_I(inode)->ip_blkno,
                                            OCFS2_I(inode)->ip_clusters);
                                goto out_free;
                        }

                        if (rec_end <= cpos) {
                                ret = ocfs2_extent_map_insert(inode, rec,
                                                le16_to_cpu(el->l_tree_depth));
                                if (ret && (ret != -EEXIST)) {
                                        mlog_errno(ret);
                                        goto out_free;
                                }
                                continue;
                        }
                        if ((cpos + clusters) <= le32_to_cpu(rec->e_cpos)) {
                                ret = ocfs2_extent_map_insert(inode, rec,
                                                le16_to_cpu(el->l_tree_depth));
                                if (ret && (ret != -EEXIST)) {
                                        mlog_errno(ret);
                                        goto out_free;
                                }
                                continue;
                        }

                        /*
                         * We've found a record that matches our
                         * interval.  We don't insert it because we're
                         * about to traverse it.
                         */

                        /* Check to see if we're straddling */
                        ret = -ESRCH;
                        if (!ocfs2_extent_rec_contains_clusters(rec,
                                                                cpos,
                                                                clusters)) {
                                mlog_errno(ret);
                                goto out_free;
                        }

                        /*
                         * If we've already found a record, the el has
                         * two records covering the same interval.
                         * EEEK!
                         */
                        ret = -EBADR;
                        if (blkno) {
                                mlog_errno(ret);
                                ocfs2_error(inode->i_sb,
                                            "Multiple extents for (cpos = %u, clusters = %u) on inode %"MLFu64"; e_blkno %"MLFu64" and rec %d at e_blkno %"MLFu64"\n",
                                            cpos, clusters,
                                            OCFS2_I(inode)->ip_blkno,
                                            blkno, i,
                                            le64_to_cpu(rec->e_blkno));
                                goto out_free;
                        }

                        blkno = le64_to_cpu(rec->e_blkno);
                }

                /*
                 * We don't support holes, and we're still up
                 * in the branches, so we'd better have found someone
                 */
                ret = -EBADR;
                if (!blkno) {
                        ocfs2_error(inode->i_sb,
                                    "No record found for (cpos = %u, clusters = %u) on inode %"MLFu64"\n",
                                    cpos, clusters,
                                    OCFS2_I(inode)->ip_blkno);
                        mlog_errno(ret);
                        goto out_free;
                }

                if (eb_bh) {
                        brelse(eb_bh);
                        eb_bh = NULL;
                }
                ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
                                       blkno, &eb_bh, OCFS2_BH_CACHED,
                                       inode);
                if (ret) {
                        mlog_errno(ret);
                        goto out_free;
                }
                eb = (struct ocfs2_extent_block *)eb_bh->b_data;
                if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
                        OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
                        ret = -EIO;
                        goto out_free;
                }
                el = &eb->h_list;
        }

        if (el->l_tree_depth)
                BUG();

        for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
                rec = &el->l_recs[i];

                if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) >
                    OCFS2_I(inode)->ip_clusters) {
                        ret = -EBADR;
                        mlog_errno(ret);
                        ocfs2_error(inode->i_sb,
                                    "Extent %d at e_blkno %"MLFu64" of inode %"MLFu64" goes past ip_clusters of %u\n",
                                    i,
                                    le64_to_cpu(rec->e_blkno),
                                    OCFS2_I(inode)->ip_blkno,
                                    OCFS2_I(inode)->ip_clusters);
                        return ret;
                }

                ret = ocfs2_extent_map_insert(inode, rec,
                                              le16_to_cpu(el->l_tree_depth));
                if (ret && (ret != -EEXIST)) {
                        mlog_errno(ret);
                        goto out_free;
                }
        }

        ret = 0;

out_free:
        if (eb_bh)
                brelse(eb_bh);

        return ret;
}
/*
 * This lookup actually will read from disk.  It has one invariant:
 * It will never re-traverse blocks.  This means that all inserts should
 * be new regions or more granular regions (both allowed by insert).
 */
static int ocfs2_extent_map_lookup_read(struct inode *inode,
                                        u32 cpos, u32 clusters,
                                        struct ocfs2_extent_map_entry **ret_ent)
{
        int ret;
        u64 blkno;
        struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
        struct ocfs2_extent_map_entry *ent;
        struct buffer_head *bh = NULL;
        struct ocfs2_extent_block *eb;
        struct ocfs2_dinode *di;
        struct ocfs2_extent_list *el;

        spin_lock(&OCFS2_I(inode)->ip_lock);
        ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
        if (ent) {
                if (!ent->e_tree_depth) {
                        spin_unlock(&OCFS2_I(inode)->ip_lock);
                        *ret_ent = ent;
                        return 0;
                }
                blkno = le64_to_cpu(ent->e_rec.e_blkno);
                spin_unlock(&OCFS2_I(inode)->ip_lock);

                ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, &bh,
                                       OCFS2_BH_CACHED, inode);
                if (ret) {
                        mlog_errno(ret);
                        if (bh)
                                brelse(bh);
                        return ret;
                }
                eb = (struct ocfs2_extent_block *)bh->b_data;
                if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
                        OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
                        brelse(bh);
                        return -EIO;
                }
                el = &eb->h_list;
        } else {
                spin_unlock(&OCFS2_I(inode)->ip_lock);

                ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
                                       OCFS2_I(inode)->ip_blkno, &bh,
                                       OCFS2_BH_CACHED, inode);
                if (ret) {
                        mlog_errno(ret);
                        if (bh)
                                brelse(bh);
                        return ret;
                }
                di = (struct ocfs2_dinode *)bh->b_data;
                if (!OCFS2_IS_VALID_DINODE(di)) {
                        brelse(bh);
                        OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, di);
                        return -EIO;
                }
                el = &di->id2.i_list;
        }

        ret = ocfs2_extent_map_find_leaf(inode, cpos, clusters, el);
        brelse(bh);
        if (ret) {
                mlog_errno(ret);
                return ret;
        }

        ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
        if (!ent) {
                ret = -ESRCH;
                mlog_errno(ret);
                return ret;
        }

        if (ent->e_tree_depth)
                BUG();  /* FIXME: Make sure this isn't a corruption */

        *ret_ent = ent;

        return 0;
}

/*
 * Callers must hold ip_lock.  This can insert pieces of the tree,
 * thus racing lookup if the lock weren't held.
 */
static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
                                         struct ocfs2_extent_map_entry *ent)
{
        struct rb_node **p, *parent;
        struct ocfs2_extent_map_entry *old_ent;

        old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(ent->e_rec.e_cpos),
                                          le32_to_cpu(ent->e_rec.e_clusters),
                                          &p, &parent);
        if (old_ent)
                return -EEXIST;

        rb_link_node(&ent->e_node, parent, p);
        rb_insert_color(&ent->e_node, &em->em_extents);

        return 0;
}
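/*
 * Illustrative sketch, not part of the original file: when
 * ocfs2_extent_map_try_insert() below finds an existing entry with a
 * greater e_tree_depth (a less precise record from higher in the btree)
 * overlapping the new record, it may have to carve the old entry into a
 * left and/or right remainder around the new one.  In CPU-endian terms
 * the arithmetic it performs is:
 *
 *      left.e_cpos      = old.e_cpos
 *      left.e_clusters  = new.e_cpos - old.e_cpos
 *                                      (needed only if new.e_cpos > old.e_cpos)
 *      right.e_cpos     = new.e_cpos + new.e_clusters
 *      right.e_clusters = (old.e_cpos + old.e_clusters) - right.e_cpos
 *                                      (needed only if the old end > the new end)
 *
 * For example, splitting an old entry covering [100, 200) around a new
 * record covering [120, 150) leaves a left piece [100, 120) and a right
 * piece [150, 200).
 */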
/*
 * Simple rule: on any return code other than -EAGAIN, anything left
 * in the insert_context will be freed.
 *
 * Simple rule #2: A return code of -EEXIST from this function or
 * its calls to ocfs2_extent_map_insert_entry() signifies that another
 * thread beat us to the insert.  It is not an actual error, but it
 * tells the caller we have no more work to do.
 */
static int ocfs2_extent_map_try_insert(struct inode *inode,
                                       struct ocfs2_extent_rec *rec,
                                       int tree_depth,
                                       struct ocfs2_em_insert_context *ctxt)
{
        int ret;
        struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
        struct ocfs2_extent_map_entry *old_ent;

        ctxt->need_left = 0;
        ctxt->need_right = 0;
        ctxt->old_ent = NULL;

        spin_lock(&OCFS2_I(inode)->ip_lock);
        ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
        if (!ret) {
                ctxt->new_ent = NULL;
                goto out_unlock;
        }

        /* Since insert_entry failed, the map MUST have old_ent */
        old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos),
                                          le32_to_cpu(rec->e_clusters),
                                          NULL, NULL);
        if (!old_ent)
                BUG();

        if (old_ent->e_tree_depth < tree_depth) {
                /* Another thread beat us to the lower tree_depth */
                ret = -EEXIST;
                goto out_unlock;
        }

        if (old_ent->e_tree_depth == tree_depth) {
                /*
                 * Another thread beat us to this tree_depth.
                 * Let's make sure we agree with that thread (the
                 * extent_rec should be identical).
                 */
                if (!memcmp(rec, &old_ent->e_rec,
                            sizeof(struct ocfs2_extent_rec)))
                        ret = 0;
                else
                        /* FIXME: Should this be ESRCH/EBADR??? */
                        ret = -EEXIST;

                goto out_unlock;
        }

        /*
         * We do it in this order specifically so that no actual tree
         * changes occur until we have all the pieces we need.  We
         * don't want malloc failures to leave an inconsistent tree.
         * Whenever we drop the lock, another process could be
         * inserting.  Also note that, if another process just beat us
         * to an insert, we might not need the same pieces we needed
         * the first go round.  In the end, the pieces we need will
         * be used, and the pieces we don't will be freed.
         */
        ctxt->need_left = !!(le32_to_cpu(rec->e_cpos) >
                             le32_to_cpu(old_ent->e_rec.e_cpos));
        ctxt->need_right = !!((le32_to_cpu(old_ent->e_rec.e_cpos) +
                               le32_to_cpu(old_ent->e_rec.e_clusters)) >
                              (le32_to_cpu(rec->e_cpos) +
                               le32_to_cpu(rec->e_clusters)));
        ret = -EAGAIN;
        if (ctxt->need_left) {
                if (!ctxt->left_ent)
                        goto out_unlock;
                *(ctxt->left_ent) = *old_ent;
                ctxt->left_ent->e_rec.e_clusters =
                        cpu_to_le32(le32_to_cpu(rec->e_cpos) -
                                    le32_to_cpu(ctxt->left_ent->e_rec.e_cpos));
        }
        if (ctxt->need_right) {
                if (!ctxt->right_ent)
                        goto out_unlock;
                *(ctxt->right_ent) = *old_ent;
                ctxt->right_ent->e_rec.e_cpos =
                        cpu_to_le32(le32_to_cpu(rec->e_cpos) +
                                    le32_to_cpu(rec->e_clusters));
                ctxt->right_ent->e_rec.e_clusters =
                        cpu_to_le32((le32_to_cpu(old_ent->e_rec.e_cpos) +
                                     le32_to_cpu(old_ent->e_rec.e_clusters)) -
                                    le32_to_cpu(ctxt->right_ent->e_rec.e_cpos));