📄 checkpointserverfile.c
字号:
/*
  MPICH-V2
  Copyright (C) 2002, 2003 Groupe Cluster et Grid, LRI, Universite de Paris Sud

  This file is part of MPICH-V2.

  MPICH-V2 is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  MPICH-V2 is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with MPICH-V2; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

  $Id: checkpointServerFile.c,v 1.6 2004/04/09 18:58:08 bouteill Exp $
*/

/*
 * File-based storage for checkpoint images.
 *
 * A checkpoint is first written to a temporary "<name>.bak" file
 * (putopenCheckpointFile), then either promoted to its final name
 * (confirmCheckpointFile) or thrown away (discardCheckpointFile).
 *
 * On-disk layout, as written by the functions below:
 *   int group, int rank, int seq,      written by putopenCheckpointFile
 *   int protosize, int datasize,       written by datasizewriteCheckpointFile
 *   protosize bytes of protocol data,  written by protowriteCheckpointFile
 *   image data                         written by datawriteCheckpointFile
 */

#include "config.h"
#include "debug.h"
#include "checkpointServerFile.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <dirent.h>

#ifndef DEF_PERM
#define DEF_PERM (S_IREAD | S_IWRITE | S_IRGRP | S_IWGRP)
#endif

enum {
    CKPT_NAME_MAX    = 64, /* size of every file name buffer in this file */
    CKPT_ID_INTS     = 3,  /* group, rank, seq */
    CKPT_HEADER_INTS = 5   /* group, rank, seq, protosize, datasize */
};

/* Suffix of the temporary file a checkpoint is written to before confirmation. */
#define CKPT_TMP_SUFFIX   ".bak"
/* Byte offset of the protosize field (just after group, rank, seq). */
#define CKPT_ID_BYTES     ((off_t)(CKPT_ID_INTS * sizeof(int)))
/* Byte offset of the protocol data area (just after the five header ints). */
#define CKPT_HEADER_BYTES ((off_t)(CKPT_HEADER_INTS * sizeof(int)))

static int nfill_filename(char *buff, int n, int group, int rank, int seq);

/*
 * Build the checkpoint file name for (group, rank, seq) followed by `suffix`
 * into buff[0..n-1].
 *
 * Returns 0 on success, -1 (after logging a warning) if the name does not
 * fit in the buffer.
 */
static int build_name(char *buff, int n, int group, int rank, int seq,
                      const char *suffix)
{
    int len;
    int ext;

    len = nfill_filename(buff, n, group, rank, seq);
    if (len >= 0 && len < n) {
        ext = snprintf(buff + len, (size_t)(n - len), "%s", suffix);
        if (ext >= 0 && ext < n - len)
            return 0;
    }
    printw("Checkpoint file name for %d:%d#%d does not fit in %d bytes\n",
           group, rank, seq, n);
    return -1;
}

/*
 * Apply an advisory lock of the given type (F_RDLCK, F_WRLCK or F_UNLCK) to
 * the whole file, blocking until it is granted.
 *
 * Only EINTR is retried; any other error is permanent and retrying it would
 * spin forever. Returns 0 on success, -1 with errno set by fcntl otherwise.
 */
static int set_whole_file_lock(int fd, short type)
{
    struct flock lock;

    memset(&lock, 0, sizeof lock);
    lock.l_type = type;
    lock.l_whence = SEEK_SET;
    lock.l_start = 0;
    lock.l_len = 0; /* 0 = from l_start to the end of the file */

    while (fcntl(fd, F_SETLKW, &lock) == -1) {
        if (errno != EINTR)
            return -1;
    }
    return 0;
}

/*
 * Open the confirmed checkpoint image of (group, rank, seq) for reading and
 * take a shared (read) lock on it.
 *
 * Returns the open file descriptor, or -1 if the file cannot be opened.
 * A failure to lock is logged but the descriptor is still returned, as in
 * the original implementation.
 */
int getopenCheckpointFile(int group, int rank, int seq)
{
    int f;
    char name[CKPT_NAME_MAX];

    if (build_name(name, CKPT_NAME_MAX, group, rank, seq, "") < 0)
        return -1;

    if ((f = open(name, O_RDONLY)) < 0) {
        printe("Opening checkpoint image file %s for reading", name);
        return -1;
    }

    printi("CS_file", "Locking file for read/write %s\n", name);
    if (set_whole_file_lock(f, F_RDLCK) == -1)
        printe("Lock %s", name);
    else
        printi("CS_file", "File %s locked for read/write\n", name);

    return f;
}

/*
 * Create the temporary ".bak" checkpoint file of (group, rank, seq), take an
 * exclusive lock on it and write the three identification ints
 * (group, rank, seq) at its start.
 *
 * Returns the open file descriptor, or -1 on failure. On failure the
 * descriptor is closed; the temporary file itself is left in place.
 */
int putopenCheckpointFile(int group, int rank, int seq)
{
    int f;
    int id[CKPT_ID_INTS];
    char tmpname[CKPT_NAME_MAX];

    if (build_name(tmpname, CKPT_NAME_MAX, group, rank, seq, CKPT_TMP_SUFFIX) < 0)
        return -1;

    if ((f = creat(tmpname, DEF_PERM)) == -1) {
        printe("Creating checkpoint file %s", tmpname);
        return -1;
    }

    printi("CS_file", "Locking file for read/write %s\n", tmpname);
    /* A write lock is exclusive; OR-ing lock types is not a valid l_type. */
    if (set_whole_file_lock(f, F_WRLCK) == -1) {
        printe("Lock %s", tmpname);
        close(f);
        return -1;
    }
    printi("CS_file", "File %s locked for read/write\n", tmpname);

    id[0] = group;
    id[1] = rank;
    id[2] = seq;

    printi("CS_file", "Writting initial data to Checkpoint file %d:%d#%d", group, rank, seq);
    if (write(f, id, sizeof id) != (ssize_t)sizeof id) {
        printe("Writting initial data to Checkpoint file %d:%d#%d", group, rank, seq);
        close(f); /* also releases the lock */
        return -1;
    }

    return f;
}

/*
 * Abandon a checkpoint opened with putopenCheckpointFile: unlock and close
 * fd, then remove the temporary ".bak" file of (group, rank, seq).
 *
 * Returns 0, or -1 if the temporary file name cannot be built (fd is closed
 * in both cases).
 */
int discardCheckpointFile(int fd, int group, int rank, int seq)
{
    char tmpname[CKPT_NAME_MAX];

    printi("CS_file", "Discarding checkpoint file for %d %d %d\n", group, rank, seq);

    if (build_name(tmpname, CKPT_NAME_MAX, group, rank, seq, CKPT_TMP_SUFFIX) < 0) {
        if (close(fd) < 0)
            printe("close");
        return -1;
    }

    printi("CS_file", "Unlocking file %s for read/write\n", tmpname);
    /* F_UNLCK is a lock type, not an fcntl command. */
    if (set_whole_file_lock(fd, F_UNLCK) == -1)
        printe("Unlock %s", tmpname);
    if (close(fd) < 0)
        printe("close");
    printi("CS_file", "file %s unlocked for read/write\n", tmpname);

    printi("CS_file", "removing temporary file %s\n", tmpname);
    if (unlink(tmpname) < 0)
        printe("Removing temporary file %s", tmpname);

    return 0;
}

/*
 * Delete the confirmed checkpoint image of exactly (group, rank, seq).
 *
 * Returns 1 if a file was deleted, 0 if it did not exist, -1 on any other
 * error.
 */
int delCheckpointFileExact(int group, int rank, int seq)
{
    char name[CKPT_NAME_MAX];

    if (build_name(name, CKPT_NAME_MAX, group, rank, seq, "") < 0)
        return -1;

    printi("CS_file", "deleting %s", name);
    if (unlink(name) == 0)
        return 1;

    if (errno == ENOENT) {
        printi("CS_file", "Request to delete non existent file %s", name);
        return 0;
    }

    printe("Cannot delete requested file %s", name);
    return -1;
}

/*
 * Delete every confirmed checkpoint image of (group, rank) found in the
 * current directory whose sequence number is <= seq.
 *
 * Returns the number of files actually deleted.
 */
int delCheckpointFileUpto(int group, int rank, int seq)
{
    DIR *dir;
    struct dirent *file;
    int fgroup, frank, fseq;
    int ret = 0;
    char expected[CKPT_NAME_MAX];
    int len;

    if ((dir = opendir(".")) == NULL) {
        qerror("DEL Request failed %d:%d upto %d, opendir", group, rank, seq);
        /* NOTE(review): qerror is declared outside this file and presumably
         * does not return; this guards against readdir(NULL) if it does. */
        return -1;
    }

    while ((file = readdir(dir)) != NULL) {
        /* Skip entries that are not checkpoint images: without this check the
         * three fields would be uninitialized or left over from a previous
         * entry. */
        if (sscanf(file->d_name, "ckptimg-g%d-r%d-s%d", &fgroup, &frank, &fseq) != 3)
            continue;
        if (fgroup != group || frank != rank || fseq > seq)
            continue;

        /* Only accept the canonical name, so that ".bak" temporaries and
         * other names with trailing text are not mistaken for the image. */
        len = nfill_filename(expected, CKPT_NAME_MAX, fgroup, frank, fseq);
        if (len < 0 || len >= CKPT_NAME_MAX || strcmp(expected, file->d_name) != 0)
            continue;

        if (delCheckpointFileExact(group, rank, fseq) == 1)
            ret++;
    }
    closedir(dir);

    printi("CS_file", "DEL Request %d:%d upto %d deleted %d files", group, rank, seq, ret);
    return ret;
}

/*
 * Promote a checkpoint opened with putopenCheckpointFile: unlock and close
 * fd, then rename the temporary ".bak" file of (group, rank, seq) to the
 * confirmed image name.
 *
 * The previous confirmed image is not unlinked first: rename() replaces it,
 * so it survives if the rename fails.
 *
 * Returns 0 on success, -1 if the names cannot be built or the rename fails
 * (fd is closed in all cases).
 */
int confirmCheckpointFile(int fd, int group, int rank, int seq)
{
    char newname[CKPT_NAME_MAX];
    char tmpname[CKPT_NAME_MAX];

    printi("CS_file", "Confirming checkpoint file for %d:%d(%d)\n", group, rank, seq);

    if (build_name(newname, CKPT_NAME_MAX, group, rank, seq, "") < 0 ||
        build_name(tmpname, CKPT_NAME_MAX, group, rank, seq, CKPT_TMP_SUFFIX) < 0) {
        if (close(fd) < 0)
            printe("close");
        return -1;
    }

    printi("CS_file", "Unlocking file %s for read/write\n", tmpname);
    /* F_UNLCK is a lock type, not an fcntl command. */
    if (set_whole_file_lock(fd, F_UNLCK) == -1)
        printe("Unlock %s", tmpname);
    if (close(fd) < 0)
        printe("close");
    printi("CS_file", "file %s unlocked for read/write\n", tmpname);

    printi("CS_file", "Moving temporary checkpoint file %s to confirmed checkpoint file %s\n", tmpname, newname);
    if (rename(tmpname, newname) == -1) {
        printe("<CRITICAL> Confirmation was recieved but temporary checkpoint %s cannot be moved to confirmed checkpoint %s", tmpname, newname);
        return -1;
    }

    return 0;
}

/*
 * Format the confirmed checkpoint file name of (group, rank, seq) into
 * buff[0..n-1]. A seq of -1 yields a name without the "-s<seq>" part.
 *
 * Returns the value of snprintf: the length the full name needs, which is
 * >= n when the name was truncated.
 */
static int nfill_filename(char *buff, int n, int group, int rank, int seq)
{
    if (seq == -1)
        return snprintf(buff, n, "ckptimg-g%d-r%d", group, rank);

    return snprintf(buff, n, "ckptimg-g%d-r%d-s%d", group, rank, seq);
}

/*
 * Write `size` bytes from buff into the protocol data area of the checkpoint
 * file, `where` bytes after the start of that area. The write must fit within
 * the protosize bytes reserved for protocol data.
 *
 * Returns the value returned by write (== size on success), or -1 if the
 * range is invalid or the seek fails.
 */
int protowriteCheckpointFile(int fd, int protosize, int where, void *buff, int size)
{
    int ret;
    off_t pos;

    if (protosize < 0 || where < 0 || size < 0) {
        printw("Invalid protocol data range: protosize=%d where=%d size=%d\n", protosize, where, size);
        return -1;
    }
    if (size > protosize - where) {
        printw("Implementation protocol data is to large, waited %d recieved %d\n", protosize, where + size);
        return -1;
    }

    pos = CKPT_HEADER_BYTES + where;
    if (lseek(fd, pos, SEEK_SET) < 0) {
        printe("Cannot seek in checkpoint file to the specified position in protocol data area %d (upper bound=%d)\n",
               (int)pos, (int)(CKPT_HEADER_BYTES + protosize));
        return -1;
    }

    if ((ret = write(fd, buff, size)) != size) {
        printe("Cannot write all %d data to checkpoint file, write returned %d", size, ret);
    }
    return ret;
}

/*
 * Write `size` bytes from buff into the image data area of the checkpoint
 * file, `where` bytes after the start of that area (which begins right after
 * the header and the protosize bytes of protocol data).
 *
 * Returns the value returned by write (== size on success), or -1 if the
 * range is invalid or the seek fails.
 */
int datawriteCheckpointFile(int fd, int protosize, int where, void *buff, int size)
{
    int ret;
    off_t base;
    off_t pos;

    if (protosize < 0 || where < 0 || size < 0) {
        printw("Invalid image data range: protosize=%d where=%d size=%d\n", protosize, where, size);
        return -1;
    }

    base = CKPT_HEADER_BYTES + protosize;
    pos = base + where;
    if (lseek(fd, pos, SEEK_SET) < 0) {
        printe("Cannot seek in checkpoint file to the specified position in image data area %d (lower bound=%d)\n",
               (int)pos, (int)base);
        return -1;
    }

    if ((ret = write(fd, buff, size)) != size) {
        printe("Cannot write all %d data to checkpoint file, write returned %d", size, ret);
    }
    return ret;
}

/*
 * Record the sizes of the protocol data area and of the image data in the
 * checkpoint file header, right after the (group, rank, seq) ints.
 *
 * Returns the total number of bytes written (2 * sizeof(int)) on success,
 * or -1 if the seek or either write fails or is short.
 */
int datasizewriteCheckpointFile(int fd, int protosize, int datasize)
{
    int ret1;
    int ret2;

    if (lseek(fd, CKPT_ID_BYTES, SEEK_SET) < 0) {
        printe("Cannot seek in checkpoint file to the specified position in protocol area (%d)", (int)CKPT_ID_BYTES);
        return -1;
    }

    if ((ret1 = write(fd, &protosize, sizeof(int))) != (int)sizeof(int)) {
        printe("Cannot write protosize=%d to checkpoint file, write returned %d", protosize, ret1);
        /* datasize would land at the wrong offset after a failed write */
        return -1;
    }
    if ((ret2 = write(fd, &datasize, sizeof(int))) != (int)sizeof(int)) {
        printe("Cannot write datasize=%d to checkpoint file, write returned %d", datasize, ret2);
        return -1;
    }

    return ret1 + ret2;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -