mom_mach.c
来自「OpenPBS」· C语言 代码 · 共 2,037 行 · 第 1/3 页
C
2,037 行
if (retval != PBSE_NONE) return (error(pname, retval)); if ((mem_limit == 0) || (value < mem_limit)) mem_limit = value; } else if (strcmp(pname, "pvmem") == 0) { /* set */ if (set_mode == SET_LIMIT_SET) { retval = getsize(pres, &value); if (retval != PBSE_NONE) return (error(pname, retval)); if (value > INT_MAX) return (error(pname, PBSE_BADATVAL)); if ((mem_limit == 0) || (value < mem_limit)) mem_limit = value; } } else if (strcmp(pname, "pmem") == 0) { /* set */ if (set_mode == SET_LIMIT_SET) { retval = getsize(pres, &value); if (retval != PBSE_NONE) return (error(pname, retval)); reslim.rlim_cur = reslim.rlim_max = value; if (setrlimit(RLIMIT_RSS, &reslim) < 0) return (error("RLIMIT_RSS", PBSE_SYSTEM)); } } else if (strcmp(pname, "walltime") == 0) { /* Check */ retval = gettime(pres, &value); if (retval != PBSE_NONE) return (error(pname, retval)); } else if (strcmp(pname, "nice") == 0) { /* set nice */ if (set_mode == SET_LIMIT_SET) { errno = 0; if ((nice((int)pres->rs_value.at_val.at_long) == -1) && (errno != 0)) return (error(pname, PBSE_BADATVAL)); } } else if ((pres->rs_defin->rs_flags & ATR_DFLAG_RMOMIG) == 0) /* don't recognize and not marked as ignore by mom */ return (error(pname, PBSE_UNKRESC)); pres = (resource *)GET_NEXT(pres->rs_link); } if (set_mode == SET_LIMIT_SET) { /* if either of vmem or pvmem was given, set sys limit to lesser */ if (mem_limit != 0) { reslim.rlim_cur = reslim.rlim_max = mem_limit; if (setrlimit(RLIMIT_DATA, &reslim) < 0) return (error("RLIMIT_DATA", PBSE_SYSTEM)); if (setrlimit(RLIMIT_STACK, &reslim) < 0) return (error("RLIMIT_STACK", PBSE_SYSTEM)); } } return (PBSE_NONE);}/* * State whether MOM main loop has to poll this job to determine if some * limits are being exceeded. * * Sets flag TRUE if polling is necessary, FALSE otherwise. Actual * polling is done using the mom_over_limit machine-dependent function. 
*/int mom_do_poll(pjob) job *pjob;{ char *id = "mom_do_poll"; char *pname; resource *pres; DBPRT(("%s: entered\n", id)) assert(pjob != NULL); assert(pjob->ji_wattr[(int)JOB_ATR_resource].at_type == ATR_TYPE_RESC); pres = (resource *) GET_NEXT(pjob->ji_wattr[(int)JOB_ATR_resource].at_val.at_list); while (pres != NULL) { assert(pres->rs_defin != NULL); pname = pres->rs_defin->rs_name; assert(pname != NULL); assert(*pname != '\0'); if (strcmp(pname, "walltime") == 0 || strcmp(pname, "cput") == 0 || strcmp(pname, "pvmem") == 0 || strcmp(pname, "vmem") == 0) return (TRUE); pres = (resource *)GET_NEXT(pres->rs_link); } return (FALSE);}/* * Setup for polling. * */int mom_open_poll(){ char *id = "mom_open_poll"; DBPRT(("%s: entered\n", id)) proc_tbl = malloc(ASIZE*sizeof(struct procsinfo)); proctot = ASIZE; return (PBSE_NONE);}/* * Declare start of polling loop. * * Until the next call to mom_get_sample, all mom_over_limit calls will * use the same data. Returns a PBS error code. */int mom_get_sample(){ char *id = "mom_get_sample"; struct procsinfo *pp; int num, addnum; pid_t pid; DBPRT(("%s: entered\n", id)) addnum = proctot; nproc = 0; pid = 0; pp = proc_tbl; while ((num = getprocs(pp, sizeof(struct procsinfo), NULL, sizeof(struct fdsinfo), &pid, addnum)) > 0) { DBPRT(("%s: loop start: got %d\n", id, num)) nproc += num; if (num < addnum) break; proctot += ASIZE; addnum = ASIZE; proc_tbl = realloc(proc_tbl, proctot*sizeof(struct procsinfo)); pp = &proc_tbl[nproc]; } if (num == -1) { log_err(errno, id, "getprocs"); return PBSE_SYSTEM; } DBPRT(("%s: nproc = %d\n", id, nproc)) return (PBSE_NONE);}/* * Measure job resource usage and compare with its limits. * * If it has exceeded any well-formed polled limit return TRUE. * Otherwise, return FALSE. 
*/
int
mom_over_limit(pjob)
    job *pjob;
{
    char            *id = "mom_over_limit";
    char            *pname;
    int             retval;
    unsigned long   value, num;
    resource        *pres;

    assert(pjob != NULL);
    assert(pjob->ji_wattr[(int)JOB_ATR_resource].at_type == ATR_TYPE_RESC);

    DBPRT(("%s: entered\n", id))

    pres = (resource *)
        GET_NEXT(pjob->ji_wattr[(int)JOB_ATR_resource].at_val.at_list);
    for ( ; pres != NULL; pres = (resource *)GET_NEXT(pres->rs_link)) {
        assert(pres->rs_defin != NULL);
        pname = pres->rs_defin->rs_name;
        assert(pname != NULL);
        assert(*pname != '\0');

        /*
         * A limit whose value cannot be decoded is skipped rather
         * than treated as exceeded.  When a limit is exceeded, a
         * description is left in log_buffer for the caller.
         */
        if (strcmp(pname, "cput") == 0) {
            retval = gettime(pres, &value);
            if (retval != PBSE_NONE)
                continue;
            if ((num = cput_sum(pjob)) > value) {
                sprintf(log_buffer,
                    "cput %lu exceeded limit %lu",
                    num, value);
                return (TRUE);
            }
        } else if (strcmp(pname, "vmem") == 0) {
            retval = getsize(pres, &value);
            if (retval != PBSE_NONE)
                continue;
            if ((num = mem_sum(pjob)) > value) {
                sprintf(log_buffer,
                    "vmem %lu exceeded limit %lu",
                    num, value);
                return (TRUE);
            }
        } else if (strcmp(pname, "pvmem") == 0) {
            retval = getsize(pres, &value);
            if (retval != PBSE_NONE)
                continue;
            if (overmem_proc(pjob, value)) {
                sprintf(log_buffer,
                    "pvmem exceeded limit %lu",
                    value);
                return (TRUE);
            }
        } else if (strcmp(pname, "walltime") == 0) {
            retval = gettime(pres, &value);
            if (retval != PBSE_NONE)
                continue;
            /* elapsed time since job start, scaled by wallfactor */
            num = (unsigned long)((double)(time_now -
                    pjob->ji_qs.ji_stime) * wallfactor);
            if (num > value) {
                /* both operands are unsigned long: use %lu */
                sprintf(log_buffer,
                    "walltime %lu exceeded limit %lu",
                    num, value);
                return (TRUE);
            }
        }
    }

    return (FALSE);
}

/*
 * Update the job attribute for resources used.
 *
 * The first time this is called for a job, set up resource entries for
 * each resource that can be reported for this machine.  Fill in the
 * correct values.  Return an error code.
 *
 * Assumes that the session ID attribute has already been set.
*/int mom_set_use(pjob) job *pjob;{ char *id = "mom_set_use"; resource *pres; attribute *at; resource_def *rd; unsigned long *lp, lnum; DBPRT(("%s: entered\n", id)) assert(pjob != NULL); at = &pjob->ji_wattr[(int)JOB_ATR_resc_used]; assert(at->at_type == ATR_TYPE_RESC); at->at_flags |= ATR_VFLAG_MODIFY; if ((at->at_flags & ATR_VFLAG_SET) == 0) { at->at_flags |= ATR_VFLAG_SET; rd = find_resc_def(svr_resc_def, "cput", svr_resc_size); assert(rd != NULL); pres = add_resource_entry(at, rd); assert(pres != NULL); pres->rs_value.at_flags |= ATR_VFLAG_SET; pres->rs_value.at_type = ATR_TYPE_LONG; pres->rs_value.at_val.at_long = 0; rd = find_resc_def(svr_resc_def, "vmem", svr_resc_size); assert(rd != NULL); pres = add_resource_entry(at, rd); assert(pres != NULL); pres->rs_value.at_flags |= ATR_VFLAG_SET; pres->rs_value.at_type = ATR_TYPE_SIZE; pres->rs_value.at_val.at_size.atsv_shift = 10; /* KB */ pres->rs_value.at_val.at_size.atsv_units = ATR_SV_BYTESZ; pres->rs_value.at_val.at_size.atsv_num = 0; rd = find_resc_def(svr_resc_def, "walltime", svr_resc_size); assert(rd != NULL); pres = add_resource_entry(at, rd); assert(pres != NULL); pres->rs_value.at_flags |= ATR_VFLAG_SET; pres->rs_value.at_type = ATR_TYPE_LONG; pres->rs_value.at_val.at_long = 0; rd = find_resc_def(svr_resc_def, "mem", svr_resc_size); assert(rd != NULL); pres = add_resource_entry(at, rd); assert(pres != NULL); pres->rs_value.at_flags |= ATR_VFLAG_SET; pres->rs_value.at_type = ATR_TYPE_SIZE; pres->rs_value.at_val.at_size.atsv_shift = 10; /* KB */ pres->rs_value.at_val.at_size.atsv_units = ATR_SV_BYTESZ; pres->rs_value.at_val.at_size.atsv_num = 0; } rd = find_resc_def(svr_resc_def, "cput", svr_resc_size); assert(rd != NULL); pres = find_resc_entry(at, rd); assert(pres != NULL); lp = (unsigned long *)&pres->rs_value.at_val.at_long; lnum = cput_sum(pjob); *lp = MAX(*lp, lnum); rd = find_resc_def(svr_resc_def, "vmem", svr_resc_size); assert(rd != NULL); pres = find_resc_entry(at, rd); assert(pres != NULL); lp = 
&pres->rs_value.at_val.at_size.atsv_num; lnum = (mem_sum(pjob) + 1023) >> 10; /* as KB */ *lp = MAX(*lp, lnum); rd = find_resc_def(svr_resc_def, "walltime", svr_resc_size); assert(rd != NULL); pres = find_resc_entry(at, rd); assert(pres != NULL); pres->rs_value.at_val.at_long = (long)((double)(time_now - pjob->ji_qs.ji_stime) * wallfactor); rd = find_resc_def(svr_resc_def, "mem", svr_resc_size); assert(rd != NULL); pres = find_resc_entry(at, rd); assert(pres != NULL); lp = &pres->rs_value.at_val.at_size.atsv_num; lnum = (resi_sum(pjob) + 1023) >> 10; /* as KB */ *lp = MAX(*lp, lnum); return (PBSE_NONE);}/* * Kill a task session. * Call with the job pointer and a signal number. */int kill_task(ptask, sig) task *ptask; int sig;{ int ct = 0; int i, err; int sesid; sesid = ptask->ti_qs.ti_sid; if (sesid <= 1) return 0; if ((err = mom_get_sample()) != PBSE_NONE) return 0; for (i=0; i<nproc; i++) { register struct procsinfo *pp = &proc_tbl[i]; if (pp->pi_state == SNONE) continue; if (sesid != pp->pi_sid) continue; DBPRT(("kill_task: send signal %d to pid %d\n", sig, pp->pi_pid)) (void)kill(pp->pi_pid, sig); ++ct; } return ct;}/* * Clean up everything related to polling. * */int mom_close_poll(){ DBPRT(("mom_close_poll entered\n")) if (proc_tbl) { free(proc_tbl); proc_tbl = NULL; } return (PBSE_NONE);}/* * mom_does_chkpnt - return 1 if mom supports checkpoint * 0 if not */int mom_does_chkpnt(){ return (0);}/* * Checkpoint the job. * * If abort is true, kill it too. */int mach_checkpoint(ptask, file, abort) task *ptask; char *file; int abort;{ return (-1);}/* * Restart the job from the checkpoint file. * * Return a -1 on error or sid. 
*/long mach_restart(ptask, file) task *ptask; char *file;{ return (-1);}intkvm_read(fd, addr, buf, size) int fd; long addr; char *buf; int size;{ int ret; if (lseek(fd, addr, SEEK_SET) != addr) return -1; if ((ret = read(fd, buf, size)) == -1) return -1; return ret;}intgetproctab(){ static uint lastproc = 0; char *id = "getproctab"; if (lastproc == reqnum) /* don't need new proc table */ return nproc; if (mom_get_sample() != PBSE_NONE) return 0; lastproc = reqnum; return(nproc);}doubledsecs(val)struct timeval *val;{ return ( (double)val->tv_sec + (double)val->tv_usec*1e-6 );}char *cput_job(jobid)pid_t jobid;{ char *id = "cput_job"; int i, nproc; int found = 0; double cputime, addtime; if ((nproc = getproctab()) == 0) { rm_errno = RM_ERR_SYSTEM; return NULL; } cputime = 0; for (i=0; i<nproc; i++) { register struct procsinfo *pp = &proc_tbl[i]; if (pp->pi_state == SNONE) continue; if (jobid != pp->pi_sid) continue; found = 1; DBPRT(("%s: pid=%d", id, pp->pi_pid)) if (pp->pi_state == SZOMB) { DBPRT((" (zombie)")) addtime = dsecs(&pp->pi_utime) + dsecs(&pp->pi_stime); } else { DBPRT((" (active)")) addtime = dsecs(&pp->pi_ru.ru_utime) + dsecs(&pp->pi_ru.ru_stime) + dsecs(&pp->pi_cru.ru_utime) + dsecs(&pp->pi_cru.ru_stime); } cputime += addtime; DBPRT((" %.2f total=%.2f\n", addtime, cputime)) } if (found) { sprintf(ret_string, "%.2f", cputime * cputfactor); return ret_string; } rm_errno = RM_ERR_EXIST; return NULL;}char *cput_proc(pid)pid_t pid;{ char *id = "cput_proc"; int i, nproc; int found = 0; double cputime; if ((nproc = getproctab()) == 0) { rm_errno = RM_ERR_SYSTEM; return NULL; } for (i=0; i<nproc; i++) { register struct procsinfo *pp = &proc_tbl[i]; if (pp->pi_state == SNONE) continue; if (pid != pp->pi_pid) continue; DBPRT(("%s: pid=%d", id, pp->pi_pid)) if (pp->pi_state == SZOMB) { DBPRT((" (zombie)")) cputime = dsecs(&pp->pi_utime) + dsecs(&pp->pi_stime); } else { DBPRT((" (active)")) cputime = dsecs(&pp->pi_ru.ru_utime) + dsecs(&pp->pi_ru.ru_stime) + 
dsecs(&pp->pi_cru.ru_utime) + dsecs(&pp->pi_cru.ru_stime); } DBPRT((" %.2f\n", cputime)) found = 1; break; } if (found) { sprintf(ret_string, "%.2f", cputime * cputfactor); return ret_string; } rm_errno = RM_ERR_EXIST; return NULL;}char *cput(attrib)struct rm_attribute *attrib;{ char *id = "cput"; int value; if (attrib == NULL) { log_err(-1, id, no_parm); rm_errno = RM_ERR_NOPARAM; return NULL; } if ((value = atoi(attrib->a_value)) == 0) { sprintf(log_buffer, "bad param: %s", attrib->a_value); log_err(-1, id, log_buffer); rm_errno = RM_ERR_BADPARAM; return NULL; } if (momgetattr(NULL)) { log_err(-1, id, extra_parm); rm_errno = RM_ERR_BADPARAM; return NULL; } if (strcmp(attrib->a_qualifier, "session") == 0) return (cput_job((pid_t)value)); else if (strcmp(attrib->a_qualifier, "proc") == 0) return (cput_proc((pid_t)value)); else { rm_errno = RM_ERR_BADPARAM; return NULL; }}char *mem_job(jobid)pid_t jobid;{ char *id = "mem_job"; int i, nproc; int memsize; int found = 0; if ((nproc = getproctab()) == 0) { rm_errno = RM_ERR_SYSTEM; return NULL; } memsize = 0; for (i=0; i<nproc; i++) { register struct procsinfo *pp = &proc_tbl[i]; if (pp->pi_state == SNONE) continue; if (jobid != pp->pi_sid) continue; found = 1; memsize += pp->pi_size; DBPRT(("%s: pid %d memsize %d pi_size %d\n", id, pp->pi_pid, memsize, pp->pi_size)) } if (found) { sprintf(ret_string, "%ukb", ctob(memsize) >> 10); /* KB */ return ret_string; } rm_errno = RM_ERR_EXIST; return NULL;}char *mem_proc(pid)pid_t pid;{ char *id = "mem_proc"; int i, nproc; int memsize; int found = 0; if ((nproc = getproctab()) == 0) { rm_errno = RM_ERR_SYSTEM; return NULL; } memsize = 0; for (i=0; i<nproc; i++) { register struct procsinfo *pp = &proc_tbl[i]; if (pp->pi_state == SNONE) continue; if (pid != pp->pi_pid) continue; found = 1; memsize = pp->pi_size; break; } if (found) { sprintf(ret_string, "%ukb", ctob(memsize) >> 10); /* KB */ return ret_string; } rm_errno = RM_ERR_EXIST; return NULL;}char *mem(attrib)struct 
rm_attribute *attrib;{ char *id = "mem"; int value; if (attrib == NULL) { log_err(-1, id, no_parm); rm_errno = RM_ERR_NOPARAM; return NULL; } if ((value = atoi(attrib->a_value)) == 0) { sprintf(log_buffer, "bad param: %s", attrib->a_value); log_err(-1, id, log_buffer); rm_errno = RM_ERR_BADPARAM; return NULL; } if (momgetattr(NULL)) { log_err(-1, id, extra_parm); rm_errno = RM_ERR_BADPARAM;
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?