📄 vote.c
字号:
/* -*- mode: c; c-basic-offset: 8; -*- * vim: noexpandtab sw=8 ts=8 sts=0: * * vote.c * * description here * * Copyright (C) 2003, 2004 Oracle. All rights reserved. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this program; if not, write to the * Free Software Foundation, Inc., 59 Temple Place - Suite 330, * Boston, MA 021110-1307, USA. */#include <linux/types.h>#include <linux/slab.h>#include <linux/highmem.h>#include <linux/smp_lock.h>#include <linux/kthread.h>#include <cluster/heartbeat.h>#include <cluster/nodemanager.h>#include <cluster/tcp.h>#include <dlm/dlmapi.h>#define MLOG_MASK_PREFIX ML_VOTE#include <cluster/masklog.h>#include "ocfs2.h"#include "alloc.h"#include "dlmglue.h"#include "extent_map.h"#include "heartbeat.h"#include "inode.h"#include "journal.h"#include "slot_map.h"#include "vote.h"#include "buffer_head_io.h"#define OCFS2_MESSAGE_TYPE_VOTE (0x1)#define OCFS2_MESSAGE_TYPE_RESPONSE (0x2)struct ocfs2_msg_hdr{ __be32 h_response_id; /* used to lookup message handle on sending * node. */ __be32 h_request; __be64 h_blkno; __be32 h_generation; __be32 h_node_num; /* node sending this particular message. */};/* OCFS2_MAX_FILENAME_LEN is 255 characters, but we want to align this * for the network. */#define OCFS2_VOTE_FILENAME_LEN 256struct ocfs2_vote_msg{ struct ocfs2_msg_hdr v_hdr; union { __be32 v_generic1; __be32 v_orphaned_slot; /* Used during delete votes */ __be32 v_nlink; /* Used during unlink votes */ } md1; /* Message type dependant 1 */ __be32 v_unlink_namelen; __be64 v_unlink_parent; u8 v_unlink_dirent[OCFS2_VOTE_FILENAME_LEN];};/* Responses are given these values to maintain backwards * compatibility with older ocfs2 versions */#define OCFS2_RESPONSE_OK (0)#define OCFS2_RESPONSE_BUSY (-16)#define OCFS2_RESPONSE_BAD_MSG (-22)struct ocfs2_response_msg{ struct ocfs2_msg_hdr r_hdr; __be32 r_response; __be32 r_orphaned_slot;};struct ocfs2_vote_work { struct list_head w_list; struct ocfs2_vote_msg w_msg;};enum ocfs2_vote_request { OCFS2_VOTE_REQ_INVALID = 0, OCFS2_VOTE_REQ_DELETE, OCFS2_VOTE_REQ_UNLINK, OCFS2_VOTE_REQ_RENAME, OCFS2_VOTE_REQ_MOUNT, OCFS2_VOTE_REQ_UMOUNT, OCFS2_VOTE_REQ_LAST};static inline int ocfs2_is_valid_vote_request(int request){ return OCFS2_VOTE_REQ_INVALID < request && request < OCFS2_VOTE_REQ_LAST;}typedef void (*ocfs2_net_response_callback)(void *priv, struct ocfs2_response_msg *resp);struct ocfs2_net_response_cb { ocfs2_net_response_callback rc_cb; void *rc_priv;};struct ocfs2_net_wait_ctxt { struct list_head n_list; u32 n_response_id; wait_queue_head_t n_event; struct ocfs2_node_map n_node_map; int n_response; /* an agreggate response. 0 if * all nodes are go, < 0 on any * negative response from any * node or network error. */ struct ocfs2_net_response_cb *n_callback;};static void ocfs2_process_mount_request(struct ocfs2_super *osb, unsigned int node_num){ mlog(0, "MOUNT vote from node %u\n", node_num); /* The other node only sends us this message when he has an EX * on the superblock, so our recovery threads (if having been * launched) are waiting on it.*/ ocfs2_recovery_map_clear(osb, node_num); ocfs2_node_map_set_bit(osb, &osb->mounted_map, node_num); /* We clear the umount map here because a node may have been * previously mounted, safely unmounted but never stopped * heartbeating - in which case we'd have a stale entry. */ ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);}static void ocfs2_process_umount_request(struct ocfs2_super *osb, unsigned int node_num){ mlog(0, "UMOUNT vote from node %u\n", node_num); ocfs2_node_map_clear_bit(osb, &osb->mounted_map, node_num); ocfs2_node_map_set_bit(osb, &osb->umount_map, node_num);}void ocfs2_mark_inode_remotely_deleted(struct inode *inode, int deleting_node){ struct ocfs2_inode_info *oi = OCFS2_I(inode); assert_spin_locked(&oi->ip_lock); /* We set the SKIP_DELETE flag on the inode so we don't try to * delete it in delete_inode ourselves, thus avoiding * unecessary lock pinging. If the other node failed to wipe * the inode as a result of a crash, then recovery will pick * up the slack. */ oi->ip_flags |= OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE; if (deleting_node != O2NM_INVALID_NODE_NUM) oi->ip_deleting_node = deleting_node;}static int ocfs2_process_delete_request(struct inode *inode, int *orphaned_slot, int deleting_node){ int response = OCFS2_RESPONSE_BUSY; mlog(0, "DELETE vote on inode %lu, read lnk_cnt = %u, slot = %d\n", inode->i_ino, inode->i_nlink, *orphaned_slot); spin_lock(&OCFS2_I(inode)->ip_lock); /* Whatever our vote response is, we want to make sure that * the orphaned slot is recorded properly on this node *and* * on the requesting node. Technically, if the requesting node * did not know which slot the inode is orphaned in but we * respond with BUSY he doesn't actually need the orphaned * slot, but it doesn't hurt to do it here anyway. */ if ((*orphaned_slot) != OCFS2_INVALID_SLOT) { mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot != OCFS2_INVALID_SLOT && OCFS2_I(inode)->ip_orphaned_slot != (*orphaned_slot), "Inode %"MLFu64": This node thinks it's " "orphaned in slot %d, messaged it's in %d\n", OCFS2_I(inode)->ip_blkno, OCFS2_I(inode)->ip_orphaned_slot, *orphaned_slot); mlog(0, "Setting orphaned slot for inode %"MLFu64" to %d\n", OCFS2_I(inode)->ip_blkno, *orphaned_slot); OCFS2_I(inode)->ip_orphaned_slot = *orphaned_slot; } else { mlog(0, "Sending back orphaned slot %d for inode %"MLFu64"\n", OCFS2_I(inode)->ip_orphaned_slot, OCFS2_I(inode)->ip_blkno); *orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot; } /* vote no if the file is still open. */ if (OCFS2_I(inode)->ip_open_count) { mlog(0, "open count = %u\n", OCFS2_I(inode)->ip_open_count); spin_unlock(&OCFS2_I(inode)->ip_lock); goto done; } spin_unlock(&OCFS2_I(inode)->ip_lock); /* directories are a bit ugly... What if someone is sitting in * it? We want to make sure the inode is removed completely as * a result of the iput in process_vote. */ if (S_ISDIR(inode->i_mode) && (atomic_read(&inode->i_count) != 1)) { mlog(0, "i_count = %u\n", atomic_read(&inode->i_count)); goto done; } if (filemap_fdatawrite(inode->i_mapping)) { mlog(ML_ERROR, "Could not sync inode %"MLFu64" for delete!\n", OCFS2_I(inode)->ip_blkno); goto done; } sync_mapping_buffers(inode->i_mapping); truncate_inode_pages(inode->i_mapping, 0); ocfs2_extent_map_trunc(inode, 0); spin_lock(&OCFS2_I(inode)->ip_lock); /* double check open count - someone might have raced this * thread into ocfs2_file_open while we were writing out * data. If we're to allow a wipe of this inode now, we *must* * hold the spinlock until we've marked it. */ if (OCFS2_I(inode)->ip_open_count) { mlog(0, "Raced to wipe! open count = %u\n", OCFS2_I(inode)->ip_open_count); spin_unlock(&OCFS2_I(inode)->ip_lock); goto done; } /* Mark the inode as being wiped from disk. */ ocfs2_mark_inode_remotely_deleted(inode, deleting_node); spin_unlock(&OCFS2_I(inode)->ip_lock); /* Not sure this is necessary anymore. */ d_prune_aliases(inode); /* If we get here, then we're voting 'yes', so commit the * delete on our side. */ response = OCFS2_RESPONSE_OK;done: return response;}static int ocfs2_match_dentry(struct dentry *dentry, u64 parent_blkno, unsigned int namelen, const char *name){ struct inode *parent; if (!dentry->d_parent) { mlog(0, "Detached from parent.\n"); return 0; } parent = dentry->d_parent->d_inode; /* Negative parent dentry? */ if (!parent) return 0; /* Name is in a different directory. */ if (OCFS2_I(parent)->ip_blkno != parent_blkno) return 0; if (dentry->d_name.len != namelen) return 0; /* comparison above guarantees this is safe. */ if (memcmp(dentry->d_name.name, name, namelen)) return 0; return 1;}static void ocfs2_process_dentry_request(struct inode *inode, int rename, unsigned int new_nlink, u64 parent_blkno, unsigned int namelen, const char *name){ struct dentry *dentry = NULL; struct list_head *p; struct ocfs2_inode_info *oi = OCFS2_I(inode); mlog(0, "parent %"MLFu64", namelen = %u, name = %.*s\n", parent_blkno, namelen, namelen, name); spin_lock(&dcache_lock); /* Another node is removing this name from the system. It is * up to us to find the corresponding dentry and if it exists, * unhash it from the dcache. */ list_for_each(p, &inode->i_dentry) { dentry = list_entry(p, struct dentry, d_alias); if (ocfs2_match_dentry(dentry, parent_blkno, namelen, name)) { mlog(0, "dentry found: %.*s\n", dentry->d_name.len, dentry->d_name.name); dget_locked(dentry); break; } dentry = NULL; } spin_unlock(&dcache_lock); if (dentry) { d_delete(dentry); dput(dentry); } /* rename votes don't send link counts */ if (!rename) { mlog(0, "new_nlink = %u\n", new_nlink); /* We don't have the proper locks here to directly * change i_nlink and besides, the vote is sent * *before* the operation so it may have failed on the * other node. This passes a hint to ocfs2_drop_inode * to force ocfs2_delete_inode, who will take the * proper cluster locks to sort things out. */ if (new_nlink == 0) { spin_lock(&oi->ip_lock); oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;#ifdef OCFS2_DELETE_INODE_WORKAROUND /* Do a sync now as we can't be sure whether * the inode will actually be orphaned or * not. We condition this on the open count as * otherwise, ocfs2_file_release will handle * it for us. */ if (!oi->ip_open_count) { spin_unlock(&oi->ip_lock); write_inode_now(inode, 1); /* strange indentation past the * 'else', but I want to keep the non * hack code purty :) */ } else#endif spin_unlock(&OCFS2_I(inode)->ip_lock); } }}static void ocfs2_process_vote(struct ocfs2_super *osb, struct ocfs2_vote_msg *msg){ int net_status, vote_response; int orphaned_slot = 0; int deleting_node = O2NM_INVALID_NODE_NUM; int rename = 0; unsigned int node_num, generation, new_nlink, namelen; u64 blkno, parent_blkno; enum ocfs2_vote_request request; struct inode *inode = NULL; struct ocfs2_msg_hdr *hdr = &msg->v_hdr; struct ocfs2_response_msg response; /* decode the network mumbo jumbo into local variables. */ request = be32_to_cpu(hdr->h_request); blkno = be64_to_cpu(hdr->h_blkno); generation = be32_to_cpu(hdr->h_generation); node_num = be32_to_cpu(hdr->h_node_num); if (request == OCFS2_VOTE_REQ_DELETE) { orphaned_slot = be32_to_cpu(msg->md1.v_orphaned_slot); deleting_node = be32_to_cpu(msg->v_hdr.h_node_num); } mlog(0, "processing vote: request = %u, blkno = %"MLFu64", " "generation = %u, node_num = %u, priv1 = %u\n", request, blkno, generation, node_num, be32_to_cpu(msg->md1.v_generic1)); if (!ocfs2_is_valid_vote_request(request)) { mlog(ML_ERROR, "Invalid vote request %d from node %u\n", request, node_num); vote_response = OCFS2_RESPONSE_BAD_MSG; goto respond; } vote_response = OCFS2_RESPONSE_OK; switch (request) { case OCFS2_VOTE_REQ_UMOUNT: ocfs2_process_umount_request(osb, node_num);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -