📄 mom_mach.c
字号:
assert(*pname != '\0');
        if (strcmp(pname, "ncpus") == 0)
            return (TRUE);
        if (strcmp(pname, "walltime") == 0)
            return (TRUE);
        pres = (resource *)GET_NEXT(pres->rs_link);
    }
    return (FALSE);
}

/*
 * Setup for polling.
 *
 * No kernel device is opened here.  The visible work is a check of the
 * scratch directories:
 *
 * With SRFS, for each of TMPDIR, BIGDIR and FASTDIR that var_value()
 * resolves, chk_file_sec() is run on the directory and
 * quotactl(SRFS_INFO) is asked for the file system index, which is
 * stored in srfs_tmp_dev, srfs_big_dev or srfs_fast_dev respectively.
 * A quotactl failure is only logged.
 *
 * Without SRFS, chk_file_sec() is run on TMP_DIR.
 *
 * A non-zero chk_file_sec() result turns into PBSE_PERM only when
 * neither DEBUG nor NO_SECURITY_CHECK is defined.  Otherwise the
 * function returns PBSE_NONE.
 */
int
mom_open_poll()
{
    static char *id = "mom_open_poll";

    DBPRT(("%s: entered\n", id))

#if SRFS
/*
 * SETDEV expands inside the block below and relies on its locals
 * i, dir and srfsinfo as well as on id.
 * NOTE(review): the macro is never #undef'ed, so it stays defined for
 * the rest of the translation unit.
 */
#define SETDEV(name, var) \
    if ((dir = var_value(name)) != NULL) { \
        i |= chk_file_sec(dir, 1, 1, S_IWGRP|S_IWOTH, 1); \
        if (quotactl(dir, SRFS_INFO, (caddr_t)&srfsinfo) == -1) \
            log_err(errno, id, dir); \
        else { \
            var = srfsinfo.index; \
            DBPRT(("%s: got %d for %s %s\n", id, var, name, dir)) \
        } \
    }

    {
        int i;                      /* OR of all chk_file_sec() results */
        struct fsres_s srfsinfo;
        char *dir, *var_value();    /* local K&R declaration of var_value */

        var_init();
        i = 0;
        SETDEV("TMPDIR", srfs_tmp_dev)
        SETDEV("BIGDIR", srfs_big_dev)
        SETDEV("FASTDIR", srfs_fast_dev)

#if !defined(DEBUG) && !defined(NO_SECURITY_CHECK)
        if (i)
            return (PBSE_PERM);
#endif /* NO_SECURITY_CHECK */
    }
#else
#if !defined(DEBUG) && !defined(NO_SECURITY_CHECK)
    if (chk_file_sec(TMP_DIR, 1, 1, S_IWGRP|S_IWOTH, 1))
        return (PBSE_PERM);
#endif /* NO_SECURITY_CHECK */
#endif /* SRFS */
    return (PBSE_NONE);
}

/*
 * Declare start of polling loop.
 *
 * Until the next call to mom_get_sample, all mom_over_limit calls will
 * use the same data. Returns a PBS error code.
*/int mom_get_sample(){ static char *id = "mom_get_sample"; struct tbs info; struct proc *pp; struct pcomm *pc; int i, pbase; DBPRT(("%s: entered\n", id)) if (session_table != NULL) free(session_table); if (tabinfo (SESS, &info) == -1) return (PBSE_SYSTEM); session_table_size = info.ent * info.len; session_table = (struct sess*)malloc (session_table_size); if (session_table == NULL) return (PBSE_SYSTEM); if (tabread(SESS, (char *) session_table, session_table_size, info.head) == -1) return (PBSE_SYSTEM); session_table_size = info.ent; if (process_table != NULL) free(process_table); if (tabinfo(PROCTAB, &info) == -1 ) return (PBSE_SYSTEM); process_table_size = info.ent * info.len; process_table = (struct proc*)malloc (process_table_size); if (process_table == NULL) return (PBSE_SYSTEM); if (tabread(PROCTAB, (char *) process_table, process_table_size, info.head) == -1) return (PBSE_SYSTEM); process_table_size = info.ent; pbase = (int)info.addr; for (pp=process_table,i=0; i<process_table_size; pp++,i++) { if (pp->p_stat == 0) continue; if ((pc = pp->p_pc) != NULL) { pp->p_pc = (struct pcomm *)((int)pc - pbase + (int)process_table); } } return (PBSE_NONE);}/* * Measure job resource usage and compare with its limits. * * If it has exceeded any well-formed polled limit return TRUE. * Otherwise, return FALSE. 
*/int mom_over_limit(pjob) job *pjob;{ static char *id = "mom_over_limit"; char *pname; int retval; unsigned long value; resource *pres; int num; assert(pjob != NULL); assert(pjob->ji_wattr[(int)JOB_ATR_resource].at_type == ATR_TYPE_RESC); pres = (resource *) GET_NEXT(pjob->ji_wattr[(int)JOB_ATR_resource].at_val.at_list); DBPRT(("%s: entered %s\n", id, pjob->ji_qs.ji_jobid)) for ( ; pres != NULL; pres = (resource *)GET_NEXT(pres->rs_link)) { assert(pres->rs_defin != NULL); pname = pres->rs_defin->rs_name; assert(pname != NULL); assert(*pname != '\0'); if (strcmp(pname, "ncpus") == 0) { retval = getlong(pres, &value); if (retval != PBSE_NONE) continue; if ((num = cpus_sum(pjob)) > value) { sprintf(log_buffer, "ncpus %d exceeded limit %d", num, value); return (TRUE); } } if (strcmp(pname, "walltime") == 0) { retval = gettime(pres, &value); if (retval != PBSE_NONE) continue; num = (unsigned long)((double)(time_now - pjob->ji_qs.ji_stime) * wallfactor); if (num > value) { sprintf(log_buffer, "walltime %d exceeded limit %d", num, value); return (TRUE); } } } return (FALSE);}/* * Update the job attribute for resources used. * * The first time this is called for a job, set up resource entries for * each resource that can be reported for this machine. Fill in the * correct values. Return an error code. 
*/int mom_set_use(pjob) job *pjob;{ static char *id = "mom_set_use"; resource *pres; attribute *at; resource_def *rd; unsigned long *ulp, unum; long *lp, num; assert(pjob != NULL); at = &pjob->ji_wattr[(int)JOB_ATR_resc_used]; assert(at->at_type == ATR_TYPE_RESC); if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_Suspend) != 0) return (PBSE_NONE); /* job suspended, don't track it */ DBPRT(("%s: entered %s\n", id, pjob->ji_qs.ji_jobid)) at->at_flags |= ATR_VFLAG_MODIFY; if ((at->at_flags & ATR_VFLAG_SET) == 0) { at->at_flags |= ATR_VFLAG_SET;#if SRFS rd = find_resc_def(svr_resc_def, "srfs_tmp", svr_resc_size); assert(rd != NULL); pres = add_resource_entry(at, rd); pres->rs_value.at_flags |= ATR_VFLAG_SET; pres->rs_value.at_type = ATR_TYPE_SIZE; pres->rs_value.at_val.at_size.atsv_shift = 10; /* KB */ pres->rs_value.at_val.at_size.atsv_units = ATR_SV_BYTESZ; rd = find_resc_def(svr_resc_def, "srfs_big", svr_resc_size); assert(rd != NULL); pres = add_resource_entry(at, rd); pres->rs_value.at_flags |= ATR_VFLAG_SET; pres->rs_value.at_type = ATR_TYPE_SIZE; pres->rs_value.at_val.at_size.atsv_shift = 10; /* KB */ pres->rs_value.at_val.at_size.atsv_units = ATR_SV_BYTESZ; rd = find_resc_def(svr_resc_def, "srfs_fast", svr_resc_size); assert(rd != NULL); pres = add_resource_entry(at, rd); pres->rs_value.at_flags |= ATR_VFLAG_SET; pres->rs_value.at_type = ATR_TYPE_SIZE; pres->rs_value.at_val.at_size.atsv_shift = 10; /* KB */ pres->rs_value.at_val.at_size.atsv_units = ATR_SV_BYTESZ;#endif /* SRFS */ rd = find_resc_def(svr_resc_def, "ncpus", svr_resc_size); assert(rd != NULL); pres = add_resource_entry(at, rd); pres->rs_value.at_flags |= ATR_VFLAG_SET; pres->rs_value.at_type = ATR_TYPE_LONG; rd = find_resc_def(svr_resc_def, "cput", svr_resc_size); assert(rd != NULL); pres = add_resource_entry(at, rd); pres->rs_value.at_flags |= ATR_VFLAG_SET; pres->rs_value.at_type = ATR_TYPE_LONG; rd = find_resc_def(svr_resc_def, "mem", svr_resc_size); assert(rd != NULL); pres = add_resource_entry(at, 
rd); pres->rs_value.at_flags |= ATR_VFLAG_SET; pres->rs_value.at_type = ATR_TYPE_SIZE; pres->rs_value.at_val.at_size.atsv_shift = 10; /* KB */ pres->rs_value.at_val.at_size.atsv_units = ATR_SV_BYTESZ; rd = find_resc_def(svr_resc_def, "pf", svr_resc_size); assert(rd != NULL); pres = add_resource_entry(at, rd); pres->rs_value.at_flags |= ATR_VFLAG_SET; pres->rs_value.at_type = ATR_TYPE_SIZE; pres->rs_value.at_val.at_size.atsv_shift = 10; /* KB */ pres->rs_value.at_val.at_size.atsv_units = ATR_SV_BYTESZ; rd = find_resc_def(svr_resc_def, "sds", svr_resc_size); assert(rd != NULL); pres = add_resource_entry(at, rd); pres->rs_value.at_flags |= ATR_VFLAG_SET; pres->rs_value.at_type = ATR_TYPE_SIZE; pres->rs_value.at_val.at_size.atsv_shift = 10; /* KB */ pres->rs_value.at_val.at_size.atsv_units = ATR_SV_BYTESZ; rd = find_resc_def(svr_resc_def, "procs", svr_resc_size); assert(rd != NULL); pres = add_resource_entry(at, rd); pres->rs_value.at_flags |= ATR_VFLAG_SET; pres->rs_value.at_type = ATR_TYPE_LONG; rd = find_resc_def(svr_resc_def, "mppt", svr_resc_size); assert(rd != NULL); pres = add_resource_entry(at, rd); pres->rs_value.at_flags |= ATR_VFLAG_SET; pres->rs_value.at_type = ATR_TYPE_LONG; rd = find_resc_def(svr_resc_def, "walltime", svr_resc_size); assert(rd != NULL); pres = add_resource_entry(at, rd); pres->rs_value.at_flags |= ATR_VFLAG_SET; pres->rs_value.at_type = ATR_TYPE_LONG; } rd = find_resc_def(svr_resc_def, "ncpus", svr_resc_size); assert(rd != NULL); pres = find_resc_entry(at, rd); assert(pres != NULL); lp = &pres->rs_value.at_val.at_long; num = cpus_sum(pjob); *lp = max(*lp, num); rd = find_resc_def(svr_resc_def, "cput", svr_resc_size); assert(rd != NULL); pres = find_resc_entry(at, rd); assert(pres != NULL); lp = &pres->rs_value.at_val.at_long; num = cput_sum(pjob); *lp = max(*lp, num); rd = find_resc_def(svr_resc_def, "mem", svr_resc_size); assert(rd != NULL); pres = find_resc_entry(at, rd); assert(pres != NULL); ulp = 
&pres->rs_value.at_val.at_size.atsv_num; unum = (mem_sum(pjob) + 1023) >> 10; /* KB */ *ulp = max(*ulp, unum); rd = find_resc_def(svr_resc_def, "pf", svr_resc_size); assert(rd != NULL); pres = find_resc_entry(at, rd); assert(pres != NULL); lp = (long *)&pres->rs_value.at_val.at_size.atsv_num; num = (pf_sum(pjob) + 1023) >> 10; /* KB */ *lp = max(*lp, num); rd = find_resc_def(svr_resc_def, "sds", svr_resc_size); assert(rd != NULL); pres = find_resc_entry(at, rd); assert(pres != NULL); ulp = &pres->rs_value.at_val.at_size.atsv_num; unum = (sds_sum(pjob) + 1023)>> 10; /* KB */ *ulp = max(*ulp, unum);#if SRFS rd = find_resc_def(svr_resc_def, "srfs_tmp", svr_resc_size); assert(rd != NULL); pres = find_resc_entry(at, rd); assert(pres != NULL); ulp = &pres->rs_value.at_val.at_size.atsv_num; unum = (srfs_sum(pjob, srfs_tmp_dev) + 1023) >> 10; /* KB */ *ulp = max(*ulp, unum); rd = find_resc_def(svr_resc_def, "srfs_big", svr_resc_size); assert(rd != NULL); pres = find_resc_entry(at, rd); assert(pres != NULL); ulp = &pres->rs_value.at_val.at_size.atsv_num; unum = (srfs_sum(pjob, srfs_big_dev) + 1023) >> 10; /* KB */ *ulp = max(*ulp, unum); rd = find_resc_def(svr_resc_def, "srfs_fast", svr_resc_size); assert(rd != NULL); pres = find_resc_entry(at, rd); assert(pres != NULL); ulp = &pres->rs_value.at_val.at_size.atsv_num; unum = (srfs_sum(pjob, srfs_fast_dev) + 1023) >> 10; /* KB */ *ulp = max(*ulp, unum);#endif /* SRFS */ rd = find_resc_def(svr_resc_def, "procs", svr_resc_size); assert(rd != NULL); pres = find_resc_entry(at, rd); assert(pres != NULL); lp = &pres->rs_value.at_val.at_long; num = proc_cnt(pjob); *lp = max(*lp, num); rd = find_resc_def(svr_resc_def, "mppt", svr_resc_size); assert(rd != NULL); pres = find_resc_entry(at, rd); assert(pres != NULL); lp = &pres->rs_value.at_val.at_long; num = mppt_sum(pjob); *lp = max(*lp, num); rd = find_resc_def(svr_resc_def, "walltime", svr_resc_size); assert(rd != NULL); pres = find_resc_entry(at, rd); assert(pres != NULL); 
pres->rs_value.at_val.at_long = (long)((double)(time_now - pjob->ji_qs.ji_stime) * wallfactor); return (PBSE_NONE);}/* * Kill a task session. * Call with the task pointer and a signal number. */int kill_task(ptask, sig) task *ptask; int sig;{ static char *id = "kill_task"; int ct = 1; int sesid; sesid = ptask->ti_qs.ti_sid; if (sesid > 1) { if (killm(C_JOB, sesid, sig) == -1) { if (errno != ESRCH) { sprintf(log_buffer, "killm: sid=%d sig=%d", sesid, sig); log_err(errno, id, log_buffer); } else { ct = 0; sprintf(log_buffer, "killm: sid=%d sig=%d", sesid, sig); log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, ptask->ti_job->ji_qs.ji_jobid, log_buffer); } } } return ct;}/* * Clean up everything related to polling. * * In the case of the sun, close the kernal if it is open. */int mom_close_poll(){ static char *id = "mom_close_poll"; DBPRT(("%s: entered\n", id)) return (PBSE_NONE);}/* * mom_does_chkpnt - return 1 if mom supports checkpoint * 0 if not */int mom_does_chkpnt(){ return (1);}/* * Checkpoint the job. * * If abort is TRUE, kill it too. */int mach_checkpoint(ptask, path, abort) task *ptask; char *path; int abort;{ int cprtn; long flags = 0; if (abort) flags = CHKPNT_KILL; cprtn = chkpnt( C_SESS, ptask->ti_qs.ti_sid, path, flags ); return cprtn;}/* * Restart the job from the checkpoint file. 
* * Return the session/job id */long mach_restart(ptask, path) task *ptask; char *path;{ int sid; sid = restart(path, 0); return sid;}intgetprocs(){ static unsigned int lastproc = 0; if (lastproc == reqnum) /* don't need new proc table */ return process_table_size; if (mom_get_sample() != PBSE_NONE) return 0; lastproc = reqnum; return(process_table_size);}char *cput_job(jobid)pid_t jobid;{ char *id = "cput_job"; int i, nproc; int found = 0; time_t addtime; double cputime; if ((nproc = getprocs()) == 0) { rm_errno = RM_ERR_SYSTEM; return NULL; } cputime = 0.0; for (i=0; i<nproc; i++) { register struct proc *pp = &process_table[i]; register struct pcomm *pc; if (pp->p_stat==0) continue; if ((pc = pp->p_pc) == NULL) continue; if (jobid != pc->pc_sid) continue; found = 1; addtime = pp->p_utime + pp->p_stime + pp->p_sctime + pc->pc_cutime + pc->pc_cstime;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -