📄 ctdb_recoverd.c
字号:
/* force the start of the election process */static void force_election(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx, uint32_t vnn, struct ctdb_node_map *nodemap){ int ret; struct ctdb_context *ctdb = rec->ctdb; /* set all nodes to recovery mode to stop all internode traffic */ ret = set_recovery_mode(ctdb, nodemap, CTDB_RECOVERY_ACTIVE); if (ret!=0) { DEBUG(0, (__location__ " Unable to set recovery mode to active on cluster\n")); return; } ret = send_election_request(rec, mem_ctx, vnn); if (ret!=0) { DEBUG(0, (__location__ " failed to initiate recmaster election")); return; } /* wait for a few seconds to collect all responses */ ctdb_wait_timeout(ctdb, ctdb->tunable.election_timeout);}/* handler for when a node changes its flags*/static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid, TDB_DATA data, void *private_data){ int ret; struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr; struct ctdb_node_map *nodemap=NULL; TALLOC_CTX *tmp_ctx; int i; if (data.dsize != sizeof(*c)) { DEBUG(0,(__location__ "Invalid data in ctdb_node_flag_change\n")); return; } tmp_ctx = talloc_new(ctdb); CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx); ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap); for (i=0;i<nodemap->num;i++) { if (nodemap->nodes[i].vnn == c->vnn) break; } if (i == nodemap->num) { DEBUG(0,(__location__ "Flag change for non-existant node %u\n", c->vnn)); talloc_free(tmp_ctx); return; } /* Dont let messages from remote nodes change the DISCONNECTED flag. This flag is handled locally based on whether the local node can communicate with the node or not. */ c->flags &= ~NODE_FLAGS_DISCONNECTED; if (nodemap->nodes[i].flags&NODE_FLAGS_DISCONNECTED) { c->flags |= NODE_FLAGS_DISCONNECTED; } if (nodemap->nodes[i].flags != c->flags) { DEBUG(0,("Node %u has changed flags - now 0x%x\n", c->vnn, c->flags)); } nodemap->nodes[i].flags = c->flags; ret = ctdb_ctrl_getrecmaster(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_master); if (ret == 0) { ret = ctdb_ctrl_getrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode); } if (ret == 0 && ctdb->recovery_master == ctdb->vnn && ctdb->recovery_mode == CTDB_RECOVERY_NORMAL && ctdb->takeover.enabled) { ret = ctdb_takeover_run(ctdb, nodemap); if (ret != 0) { DEBUG(0, (__location__ " Unable to setup public takeover addresses\n")); } } talloc_free(tmp_ctx);}/* the main monitoring loop */static void monitor_cluster(struct ctdb_context *ctdb){ uint32_t vnn, num_active, recmode, recmaster; TALLOC_CTX *mem_ctx=NULL; struct ctdb_node_map *nodemap=NULL; struct ctdb_node_map *remote_nodemap=NULL; struct ctdb_vnn_map *vnnmap=NULL; struct ctdb_vnn_map *remote_vnnmap=NULL; int i, j, ret; bool need_takeover_run; struct ctdb_recoverd *rec; rec = talloc_zero(ctdb, struct ctdb_recoverd); CTDB_NO_MEMORY_FATAL(ctdb, rec); rec->ctdb = ctdb; rec->banned_nodes = talloc_zero_array(rec, struct ban_state *, ctdb->num_nodes); CTDB_NO_MEMORY_FATAL(ctdb, rec->banned_nodes); rec->priority_time = timeval_current(); /* register a message port for recovery elections */ ctdb_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec); /* and one for when nodes are disabled/enabled */ ctdb_set_message_handler(ctdb, CTDB_SRVID_NODE_FLAGS_CHANGED, monitor_handler, rec); /* and one for when nodes are banned */ ctdb_set_message_handler(ctdb, CTDB_SRVID_BAN_NODE, ban_handler, rec); /* and one for when nodes are unbanned */ ctdb_set_message_handler(ctdb, CTDB_SRVID_UNBAN_NODE, unban_handler, rec); again: need_takeover_run = false; if (mem_ctx) { talloc_free(mem_ctx); mem_ctx = NULL; } mem_ctx = talloc_new(ctdb); if (!mem_ctx) { DEBUG(0,("Failed to create temporary context\n")); exit(-1); } /* we only check for recovery once every second */ ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval); /* get relevant tunables */ ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable); if (ret != 0) { DEBUG(0,("Failed to get tunables - retrying\n")); goto again; } vnn = ctdb_ctrl_getvnn(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE); if (vnn == (uint32_t)-1) { DEBUG(0,("Failed to get local vnn - retrying\n")); goto again; } /* get the vnnmap */ ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), vnn, mem_ctx, &vnnmap); if (ret != 0) { DEBUG(0, (__location__ " Unable to get vnnmap from node %u\n", vnn)); goto again; } /* get number of nodes */ ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), vnn, mem_ctx, &nodemap); if (ret != 0) { DEBUG(0, (__location__ " Unable to get nodemap from node %u\n", vnn)); goto again; } /* count how many active nodes there are */ num_active = 0; for (i=0; i<nodemap->num; i++) { if (rec->banned_nodes[nodemap->nodes[i].vnn] != NULL) { nodemap->nodes[i].flags |= NODE_FLAGS_BANNED; } else { nodemap->nodes[i].flags &= ~NODE_FLAGS_BANNED; } if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) { num_active++; } } /* check which node is the recovery master */ ret = ctdb_ctrl_getrecmaster(ctdb, CONTROL_TIMEOUT(), vnn, &recmaster); if (ret != 0) { DEBUG(0, (__location__ " Unable to get recmaster from node %u\n", vnn)); goto again; } if (recmaster == (uint32_t)-1) { DEBUG(0,(__location__ " Initial recovery master set - forcing election\n")); force_election(rec, mem_ctx, vnn, nodemap); goto again; } /* verify that the recmaster node is still active */ for (j=0; j<nodemap->num; j++) { if (nodemap->nodes[j].vnn==recmaster) { break; } } if (j == nodemap->num) { DEBUG(0, ("Recmaster node %u not in list. Force reelection\n", recmaster)); force_election(rec, mem_ctx, vnn, nodemap); goto again; } if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) { DEBUG(0, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].vnn)); force_election(rec, mem_ctx, vnn, nodemap); goto again; } /* if we are not the recmaster then we do not need to check if recovery is needed */ if (vnn!=recmaster) { goto again; } /* verify that all active nodes agree that we are the recmaster */ for (j=0; j<nodemap->num; j++) { if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) { continue; } if (nodemap->nodes[j].vnn == vnn) { continue; } ret = ctdb_ctrl_getrecmaster(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].vnn, &recmaster); if (ret != 0) { DEBUG(0, (__location__ " Unable to get recmaster from node %u\n", vnn)); goto again; } if (recmaster!=vnn) { DEBUG(0, ("Node %u does not agree we are the recmaster. Force reelection\n", nodemap->nodes[j].vnn)); force_election(rec, mem_ctx, vnn, nodemap); goto again; } } /* verify that all active nodes are in normal mode and not in recovery mode */ for (j=0; j<nodemap->num; j++) { if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) { continue; } ret = ctdb_ctrl_getrecmode(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].vnn, &recmode); if (ret != 0) { DEBUG(0, ("Unable to get recmode from node %u\n", vnn)); goto again; } if (recmode != CTDB_RECOVERY_NORMAL) { DEBUG(0, (__location__ " Node:%u was in recovery mode. Restart recovery process\n", nodemap->nodes[j].vnn)); do_recovery(rec, mem_ctx, vnn, num_active, nodemap, vnnmap, nodemap->nodes[j].vnn); goto again; } } /* get the nodemap for all active remote nodes and verify they are the same as for this node */ for (j=0; j<nodemap->num; j++) { if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) { continue; } if (nodemap->nodes[j].vnn == vnn) { continue; } ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].vnn, mem_ctx, &remote_nodemap); if (ret != 0) { DEBUG(0, (__location__ " Unable to get nodemap from remote node %u\n", nodemap->nodes[j].vnn)); goto again; } /* if the nodes disagree on how many nodes there are then this is a good reason to try recovery */ if (remote_nodemap->num != nodemap->num) { DEBUG(0, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n", nodemap->nodes[j].vnn, remote_nodemap->num, nodemap->num)); do_recovery(rec, mem_ctx, vnn, num_active, nodemap, vnnmap, nodemap->nodes[j].vnn); goto again; } /* if the nodes disagree on which nodes exist and are active, then that is also a good reason to do recovery */ for (i=0;i<nodemap->num;i++) { if (remote_nodemap->nodes[i].vnn != nodemap->nodes[i].vnn) { DEBUG(0, (__location__ " Remote node:%u has different nodemap vnn for %d (%u vs %u).\n", nodemap->nodes[j].vnn, i, remote_nodemap->nodes[i].vnn, nodemap->nodes[i].vnn)); do_recovery(rec, mem_ctx, vnn, num_active, nodemap, vnnmap, nodemap->nodes[j].vnn); goto again; } if ((remote_nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) != (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) { DEBUG(0, (__location__ " Remote node:%u has different nodemap flag for %d (0x%x vs 0x%x)\n", nodemap->nodes[j].vnn, i, remote_nodemap->nodes[i].flags, nodemap->nodes[i].flags)); do_recovery(rec, mem_ctx, vnn, num_active, nodemap, vnnmap, nodemap->nodes[j].vnn); goto again; } } /* update our nodemap flags according to the other server - this gets the NODE_FLAGS_DISABLED flag. Note that the remote node is authoritative for its flags (except CONNECTED, which we know matches in this code) */ if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) { nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags; need_takeover_run = true; } } /* there better be the same number of lmasters in the vnn map as there are active nodes or we will have to do a recovery */ if (vnnmap->size != num_active) { DEBUG(0, (__location__ " The vnnmap count is different from the number of active nodes. %u vs %u\n", vnnmap->size, num_active)); do_recovery(rec, mem_ctx, vnn, num_active, nodemap, vnnmap, ctdb->vnn); goto again; } /* verify that all active nodes in the nodemap also exist in the vnnmap. */ for (j=0; j<nodemap->num; j++) { if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) { continue; } if (nodemap->nodes[j].vnn == vnn) { continue; } for (i=0; i<vnnmap->size; i++) { if (vnnmap->map[i] == nodemap->nodes[j].vnn) { break; } } if (i == vnnmap->size) { DEBUG(0, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n", nodemap->nodes[j].vnn)); do_recovery(rec, mem_ctx, vnn, num_active, nodemap, vnnmap, nodemap->nodes[j].vnn); goto again; } } /* verify that all other nodes have the same vnnmap and are from the same generation */ for (j=0; j<nodemap->num; j++) { if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) { continue; } if (nodemap->nodes[j].vnn == vnn) { continue; } ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].vnn, mem_ctx, &remote_vnnmap); if (ret != 0) { DEBUG(0, (__location__ " Unable to get vnnmap from remote node %u\n", nodemap->nodes[j].vnn)); goto again; } /* verify the vnnmap generation is the same */ if (vnnmap->generation != remote_vnnmap->generation) { DEBUG(0, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n", nodemap->nodes[j].vnn, remote_vnnmap->generation, vnnmap->generation)); do_recovery(rec, mem_ctx, vnn, num_active, nodemap, vnnmap, nodemap->nodes[j].vnn); goto again; } /* verify the vnnmap size is the same */ if (vnnmap->size != remote_vnnmap->size) { DEBUG(0, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n", nodemap->nodes[j].vnn, remote_vnnmap->size, vnnmap->size)); do_recovery(rec, mem_ctx, vnn, num_active, nodemap, vnnmap, nodemap->nodes[j].vnn); goto again; } /* verify the vnnmap is the same */ for (i=0;i<vnnmap->size;i++) { if (remote_vnnmap->map[i] != vnnmap->map[i]) { DEBUG(0, (__location__ " Remote node %u has different vnnmap.\n", nodemap->nodes[j].vnn)); do_recovery(rec, mem_ctx, vnn, num_active, nodemap, vnnmap, nodemap->nodes[j].vnn); goto again; } } } /* we might need to change who has what IP assigned */ if (need_takeover_run && ctdb->takeover.enabled) { ret = ctdb_takeover_run(ctdb, nodemap); if (ret != 0) { DEBUG(0, (__location__ " Unable to setup public takeover addresses\n")); } } goto again;}/* event handler for when the main ctdbd dies */static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde, uint16_t flags, void *private_data){ DEBUG(0,("recovery daemon parent died - exiting\n")); _exit(1);}/* startup the recovery daemon as a child of the main ctdb daemon */int ctdb_start_recoverd(struct ctdb_context *ctdb){ int ret; int fd[2]; pid_t child; if (pipe(fd) != 0) { return -1; } child = fork(); if (child == -1) { return -1; } if (child != 0) { close(fd[0]); return 0; } close(fd[1]); /* shutdown the transport */ ctdb->methods->shutdown(ctdb); /* get a new event context */ talloc_free(ctdb->ev); ctdb->ev = event_context_init(ctdb); event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ|EVENT_FD_AUTOCLOSE, ctdb_recoverd_parent, &fd[0]); close(ctdb->daemon.sd); ctdb->daemon.sd = -1; srandom(getpid() ^ time(NULL)); /* initialise ctdb */ ret = ctdb_socket_connect(ctdb); if (ret != 0) { DEBUG(0, (__location__ " Failed to init ctdb\n")); exit(1); } monitor_cluster(ctdb); DEBUG(0,("ERROR: ctdb_recoverd finished!?\n")); return -1;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -