📄 mom_mach.c
字号:
* * If it has exceeded any well-formed polled limit return TRUE. * Otherwise, return FALSE. */int mom_over_limit(pjob) job *pjob;{ char *pname; int retval; rlim64_t sizeval; unsigned long value, num; rlim64_t num64; resource *pres; assert(pjob != NULL); assert(pjob->ji_wattr[(int)JOB_ATR_resource].at_type == ATR_TYPE_RESC); pres = (resource *) GET_NEXT(pjob->ji_wattr[(int)JOB_ATR_resource].at_val.at_list); for ( ; pres != NULL; pres = (resource *)GET_NEXT(pres->rs_link)) { assert(pres->rs_defin != NULL); pname = pres->rs_defin->rs_name; assert(pname != NULL); assert(*pname != '\0'); if (strcmp(pname, "ncpus") == 0) { attribute *at; resource_def *rd; resource *prescpup; retval = getlong(pres, &value); if (retval != PBSE_NONE) continue; at = &pjob->ji_wattr[(int)JOB_ATR_resc_used]; assert(at->at_type == ATR_TYPE_RESC); rd = find_resc_def(svr_resc_def, "cpupercent", svr_resc_size); assert(rd != NULL); prescpup = find_resc_entry(at, rd); assert(prescpup != NULL); num = prescpup->rs_value.at_val.at_long; if (num > (value*100+10)) { sprintf(log_buffer, "ncpus %.2f exceeded limit %lu", (float)num/100.0, value);#if !defined(SGI_ZOMBIE_WRONG) return (TRUE);#else LOG_EVENT(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer);#endif /* SGI_ZOMBIE_WRONG */ } } else if (strcmp(pname, "cput") == 0) { retval = getlong(pres, &value); if (retval != PBSE_NONE) continue; if ((num = cput_sum(pjob)) > value) { sprintf(log_buffer, "cput %lu exceeded limit %lu", num, value); return (TRUE); } } else if (strcmp(pname, "pcput") == 0) { retval = getlong(pres, &value); if (retval != PBSE_NONE) continue; if (overcpu_proc(pjob, value)) { sprintf(log_buffer, "pcput exceeded limit %lu", value); return (TRUE); } } else if (strcmp(pname, "vmem") == 0) { retval = getsize(pres, &sizeval); if (retval != PBSE_NONE) continue; if ((num64 = mem_sum(pjob)) > sizeval) { sprintf(log_buffer, "vmem %llu exceeded limit %llu", num64, sizeval); return (TRUE); } } else if (strcmp(pname, "pvmem") == 0) { retval = getsize(pres, &sizeval); if (retval != PBSE_NONE) continue; if (overmem_proc(pjob, sizeval)) { sprintf(log_buffer, "pvmem exceeded limit %llukb", sizeval); return (TRUE); } } else if (strcmp(pname, "mem") == 0) { retval = getsize(pres, &sizeval); if (retval != PBSE_NONE) continue; if ((num64 = resi_sum(pjob)) > sizeval) { sprintf(log_buffer, "mem %llu exceeded limit %llu", num64, sizeval); return (TRUE); } } else if (strcmp(pname, "walltime") == 0) { retval = getlong(pres, &value); if (retval != PBSE_NONE) continue; num = (unsigned long)((double)(time_now - pjob->ji_qs.ji_stime) * wallfactor); if (num > value) { sprintf(log_buffer, "walltime %lu exceeded limit %lu", num, value); return (TRUE); } } } return (FALSE);}/* * Update the job attribute for resources used. * * The first time this is called for a job, set up resource entries for * each resource that can be reported for this machine. Fill in the * correct values. Return an error code. */int mom_set_use(pjob) job *pjob;{ resource *pres; attribute *at; resource_def *rd; unsigned long *lp, lnum, newcpu, oldcpu; long dur; unsigned long percent; assert(pjob != NULL); at = &pjob->ji_wattr[(int)JOB_ATR_resc_used]; assert(at->at_type == ATR_TYPE_RESC); at->at_flags |= ATR_VFLAG_MODIFY; if ((at->at_flags & ATR_VFLAG_SET) == 0) { at->at_flags |= ATR_VFLAG_SET; rd = find_resc_def(svr_resc_def, "cput", svr_resc_size); assert(rd != NULL); pres = add_resource_entry(at, rd); pres->rs_value.at_flags |= ATR_VFLAG_SET; pres->rs_value.at_type = ATR_TYPE_LONG; pres->rs_value.at_val.at_long = 0; rd = find_resc_def(svr_resc_def, "cpupercent", svr_resc_size); assert(rd != NULL); pres = add_resource_entry(at, rd); pres->rs_value.at_flags |= ATR_VFLAG_SET; pres->rs_value.at_type = ATR_TYPE_LONG; pres->rs_value.at_val.at_long = 0; rd = find_resc_def(svr_resc_def, "vmem", svr_resc_size); assert(rd != NULL); pres = add_resource_entry(at, rd); pres->rs_value.at_flags |= ATR_VFLAG_SET; pres->rs_value.at_type = ATR_TYPE_SIZE; pres->rs_value.at_val.at_size.atsv_shift = 10; /* in KB */ pres->rs_value.at_val.at_size.atsv_units = ATR_SV_BYTESZ; rd = find_resc_def(svr_resc_def, "walltime", svr_resc_size); assert(rd != NULL); pres = add_resource_entry(at, rd); pres->rs_value.at_flags |= ATR_VFLAG_SET; pres->rs_value.at_type = ATR_TYPE_LONG; rd = find_resc_def(svr_resc_def, "mem", svr_resc_size); assert(rd != NULL); pres = add_resource_entry(at, rd); pres->rs_value.at_flags |= ATR_VFLAG_SET; pres->rs_value.at_type = ATR_TYPE_SIZE; pres->rs_value.at_val.at_size.atsv_shift = 10; /* in KB */ pres->rs_value.at_val.at_size.atsv_units = ATR_SV_BYTESZ; } rd = find_resc_def(svr_resc_def, "cput", svr_resc_size); assert(rd != NULL); pres = find_resc_entry(at, rd); assert(pres != NULL); lp = (unsigned long *)&pres->rs_value.at_val.at_long; oldcpu = *lp; lnum = cput_sum(pjob); if (lnum > *lp) { *lp = lnum; if ( (dur = sampletime - pjob->ji_sampletim) > 10) { newcpu = *lp; /* save new cput */ rd = find_resc_def(svr_resc_def, "cpupercent", svr_resc_size); assert(rd != NULL); pres = find_resc_entry(at, rd); assert(pres != NULL); lp = (unsigned long *)&pres->rs_value.at_val.at_long; percent = (newcpu - oldcpu)*100 / dur; *lp = MAX(*lp, percent); DBPRT(("cpu %% : ses %ld (new %lu - old %lu)/delta %ld = %lu%%\n", pjob->ji_wattr[(int)JOB_ATR_session_id].at_val.at_long, newcpu, oldcpu, dur, percent)) } pjob->ji_sampletim = sampletime; } rd = find_resc_def(svr_resc_def, "vmem", svr_resc_size); assert(rd != NULL); pres = find_resc_entry(at, rd); assert(pres != NULL); lp = &pres->rs_value.at_val.at_size.atsv_num; lnum = (mem_sum(pjob) + 1023) >> 10; /* as KB */ *lp = MAX(*lp, lnum); rd = find_resc_def(svr_resc_def, "walltime", svr_resc_size); assert(rd != NULL); pres = find_resc_entry(at, rd); assert(pres != NULL); pres->rs_value.at_val.at_long = (long)((double)(time_now - pjob->ji_qs.ji_stime) * wallfactor); rd = find_resc_def(svr_resc_def, "mem", svr_resc_size); assert(rd != NULL); pres = find_resc_entry(at, rd); assert(pres != NULL); lp = &pres->rs_value.at_val.at_size.atsv_num; lnum = (resi_sum(pjob) + 1023) >> 10; /* in KB */ *lp = MAX(*lp, lnum); return (PBSE_NONE);}/* * Kill a task session. * Call with the task pointer and a signal number. */int kill_task(ptask, sig) task *ptask; int sig;{ char *id = "kill_task"; ash_t ash; int ct = 0; int np; struct startjob_rtn sgid; aspidlist_t *taskpids = 0; extern aserror_t aserrorcode; if (ptask->ti_job->ji_globid != NULL) { sscanf(ptask->ti_job->ji_globid, "%llx", &ash); } else { ash = asashofpid(ptask->ti_qs.ti_sid); sgid.sj_ash = ash; set_globid(ptask->ti_job, &sgid); } if ((ash != 0LL) && (ash != -1LL)) { taskpids = aspidsinash_local(ash); if (taskpids) { for (np=0; np<taskpids->numpids; ++np) { (void)kill(taskpids->pids[np], sig); ++ct; } } else { sprintf(log_buffer, "no pids in ash %lld in %s",ash,id); log_err(aserrorcode, id, log_buffer); } } return ct;}/* * Clean up everything related to polling. */int mom_close_poll(){ char *id = "mom_close_poll"; int i; DBPRT(("%s: entered\n", id)) if (proc_array) {#if COMPLEX_MEM_CALC==1 for(i=0; i<max_proc; i++) { struct proc_info *pi = &proc_array[i]; if (pi->map) free(pi->map); }#endif /* COMPLEX_MEM_CALC */ free(proc_array); } if (pdir) { if (closedir(pdir) != 0) { log_err(errno, id, "closedir"); return (PBSE_SYSTEM); } } return (PBSE_NONE);}/* * mom_does_chkpnt - return 1 if mom supports checkpoint * 0 if not */int mom_does_chkpnt(){#if MOM_CHECKPOINT == 1 return (1);#else /* MOM_CHECKPOINT */ return (0);#endif /* MOM_CHECKPOINT */}/* * Checkpoint the task. * * If abort is true, kill it too. */int mach_checkpoint(ptask, file, abort) task *ptask; char *file; int abort;{#if MOM_CHECKPOINT == 1 ash_t ash; sscanf(ptask->ti_job->ji_globid, "%llx", &ash); /* ckpt_setup(0, 0); Does nothing so why have it */ if (abort) cpr_flags = CKPT_CHECKPOINT_KILL | CKPT_NQE; else cpr_flags = CKPT_CHECKPOINT_CONT | CKPT_NQE; return ( ckpt_create(file, ash, P_ASH, 0, 0) ); /* return ( ckpt_create(file, ptask->ti_qs.ti_sid, P_SID, 0, 0) ); */#else /* MOM_CHECKPOINT */ return (-1);#endif /* MOM_CHECKPOINT */}/* * Restart the task from the checkpoint file. * * Return -1 on error or sid if okay. */long mach_restart(ptask, file) task *ptask; char *file;{#if MOM_CHECKPOINT == 1 ckpt_id_t rc; ash_t momash; ash_t oldash = 0; char cvtbuf[20]; cpr_flags = CKPT_NQE; /* KLUDGE to work-around SGI problem, for some reason ckpt_restart() */ /* passes open file descriptor to /proc to restarted process */ if (pdir) closedir(pdir); /* To restart the job with its old ASH, Mom must be in that ASH */ /* When she does the restart. However, before changing to that */ /* ASH, Mom must put herself in a new ASH all by herself, otherwise */ /* she will take other system daemons with her into the job's ASH */ momash = getash(); newarraysess(); /* isolate Mom in a ASH by herself */ if (ptask->ti_job->ji_globid != NULL) { /* now get job's old ASH and set it */ sscanf(ptask->ti_job->ji_globid, "%llx", &oldash); if (setash(oldash) == -1) { DBPRT(("setash failed before restart, errno = %d", errno)) } } rc = ckpt_restart(file, (struct ckpt_args **)0, 0); if ((ptask->ti_job->ji_globid == NULL) && (rc > 0)) { (void)sprintf(cvtbuf, "%llx", rc); ptask->ti_job->ji_globid = strdup(cvtbuf); } newarraysess(); /* again, isolate Mom into ASH by herself */ if (setash(momash) == -1) { /* put Mom back to her old ASH */ DBPRT(("setash failed after restart, errno = %d", errno)) } /* KLUDGE TO work-around SGI problem, ckpt_restart sets the uid of */ /* the calling process (me) to that of the restarted process */ (void)setuid(0); if ((pdir = opendir(procfs)) == NULL) { log_err(errno, "mach_restart", "opendir"); } return ((int)rc);#else /* MOM_CHECKPOINT */ return (-1);#endif /* MOM_CHECKPOINT */}/*** Return 1 if proc table can be read, 0 otherwise.*/intgetprocs(){ static unsigned int lastproc = 0; if (lastproc == reqnum) /* don't need new proc table */ return 1; if (mom_get_sample() != PBSE_NONE) return 0; lastproc = reqnum; return 1;}char *cput(attrib)struct rm_attribute *attrib;{ rm_errno = RM_ERR_UNKNOWN; return NULL;}char *mem(attrib)struct rm_attribute *attrib;{ rm_errno = RM_ERR_UNKNOWN; return NULL;}char *sessions(attrib)struct rm_attribute *attrib;{ rm_errno = RM_ERR_UNKNOWN; return NULL;}char *pids(attrib)struct rm_attribute *attrib;{ rm_errno = RM_ERR_UNKNOWN; return NULL;}char *nsessions(attrib)struct rm_attribute *attrib;{ rm_errno = RM_ERR_UNKNOWN; return NULL;}char *nusers(attrib)struct rm_attribute *attrib;{ rm_errno = RM_ERR_UNKNOWN; return NULL;}static char *totmem(attrib)struct rm_attribute *attrib;{ static char id[] = "totmem"; struct statfs fsbuf; if (attrib) { log_err(-1, id, extra_parm); rm_errno = RM_ERR_BADPARAM; return NULL; } if (statfs(procfs, &fsbuf, sizeof(struct statfs), 0) == -1) { log_err(errno, id, "statfs"); rm_errno = RM_ERR_SYSTEM; return NULL; } DBPRT(("%s: bsize=%ld blocks=%lld\n", id, fsbuf.f_bsize, fsbuf.f_blocks)) sprintf(ret_string, "%llukb", ((rlim64_t)fsbuf.f_bsize * (rlim64_t)fsbuf.f_blocks) >> 10); return ret_string;}/* * availmem() - return amount of available memory in system in KB as string */static char *
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -