mom_mach.c

来自「OpenPBS」· C语言 代码 · 共 2,037 行 · 第 1/3 页

C
2,037
字号
			if (retval != PBSE_NONE)			        return (error(pname, retval));			if ((mem_limit == 0) || (value < mem_limit))				mem_limit = value;		} else if (strcmp(pname, "pvmem") == 0) {	/* set */			if (set_mode == SET_LIMIT_SET)  {			    retval = getsize(pres, &value);			    if (retval != PBSE_NONE)			        return (error(pname, retval));			    if (value > INT_MAX)			        return (error(pname, PBSE_BADATVAL));			if ((mem_limit == 0) || (value < mem_limit))				mem_limit = value;			}		} else if (strcmp(pname, "pmem") == 0) {	/* set */			if (set_mode == SET_LIMIT_SET)  {			    retval = getsize(pres, &value);			    if (retval != PBSE_NONE)			        return (error(pname, retval));			    reslim.rlim_cur = reslim.rlim_max = value;			    if (setrlimit(RLIMIT_RSS, &reslim) < 0)	        		return (error("RLIMIT_RSS", PBSE_SYSTEM));			}		} else if (strcmp(pname, "walltime") == 0) {	/* Check */			retval = gettime(pres, &value);			if (retval != PBSE_NONE)			        return (error(pname, retval));		} else if (strcmp(pname, "nice") == 0) {	/* set nice */			if (set_mode == SET_LIMIT_SET)  {			    errno = 0;			    if ((nice((int)pres->rs_value.at_val.at_long) == -1)			        && (errno != 0))				return (error(pname, PBSE_BADATVAL));			}		} else if ((pres->rs_defin->rs_flags & ATR_DFLAG_RMOMIG) == 0)			/* don't recognize and not marked as ignore by mom */			return (error(pname, PBSE_UNKRESC));		pres = (resource *)GET_NEXT(pres->rs_link);	}	if (set_mode == SET_LIMIT_SET)  {	    /* if either of vmem or pvmem was given, set sys limit to lesser */	    if (mem_limit != 0) {		reslim.rlim_cur = reslim.rlim_max = mem_limit;		if (setrlimit(RLIMIT_DATA, &reslim) < 0)	        	return (error("RLIMIT_DATA", PBSE_SYSTEM));		if (setrlimit(RLIMIT_STACK, &reslim) < 0)	        	return (error("RLIMIT_STACK", PBSE_SYSTEM));	    }	}	return (PBSE_NONE);}/* * State whether MOM main loop has to poll this job to determine if some * limits are being exceeded. 
* *	Sets flag TRUE if polling is necessary, FALSE otherwise.  Actual *	polling is done using the mom_over_limit machine-dependent function. */int mom_do_poll(pjob)    job			*pjob;{	char		*id = "mom_do_poll";	char		*pname;	resource	*pres;	DBPRT(("%s: entered\n", id))	assert(pjob != NULL);	assert(pjob->ji_wattr[(int)JOB_ATR_resource].at_type == ATR_TYPE_RESC);	pres = (resource *)	    GET_NEXT(pjob->ji_wattr[(int)JOB_ATR_resource].at_val.at_list);	while (pres != NULL) {		assert(pres->rs_defin != NULL);		pname = pres->rs_defin->rs_name;		assert(pname != NULL);		assert(*pname != '\0');		if (strcmp(pname, "walltime") == 0 ||		    strcmp(pname, "cput") == 0 ||		    strcmp(pname, "pvmem") == 0 ||		    strcmp(pname, "vmem") == 0)			return (TRUE);		pres = (resource *)GET_NEXT(pres->rs_link);	}	return (FALSE);}/* * Setup for polling. * */int mom_open_poll(){	char		*id = "mom_open_poll";	DBPRT(("%s: entered\n", id))	proc_tbl = malloc(ASIZE*sizeof(struct procsinfo));	proctot = ASIZE;	return (PBSE_NONE);}/* * Declare start of polling loop. * *	Until the next call to mom_get_sample, all mom_over_limit calls will *	use the same data.  Returns a PBS error code. */int mom_get_sample(){	char		*id = "mom_get_sample";	struct	procsinfo	*pp;	int		num, addnum;	pid_t		pid;	DBPRT(("%s: entered\n", id))	addnum = proctot;	nproc = 0;	pid = 0;	pp = proc_tbl;	while ((num = getprocs(pp, sizeof(struct procsinfo),			NULL, sizeof(struct fdsinfo),			&pid, addnum)) > 0) {		DBPRT(("%s: loop start: got %d\n", id, num))		nproc += num;		if (num < addnum)			break;		proctot += ASIZE;		addnum = ASIZE;		proc_tbl = realloc(proc_tbl, proctot*sizeof(struct procsinfo));		pp = &proc_tbl[nproc];	}	if (num == -1) {		log_err(errno, id, "getprocs");		return PBSE_SYSTEM;	}	DBPRT(("%s: nproc = %d\n", id, nproc))	return (PBSE_NONE);}/* * Measure job resource usage and compare with its limits. * *	If it has exceeded any well-formed polled limit return TRUE. *	Otherwise, return FALSE. 
*/

/*
 * mom_over_limit - check a job's usage against its polled limits.
 *
 *	Walks the job's resource list.  For "cput", "vmem", "pvmem" and
 *	"walltime" the limit is decoded with gettime()/getsize(); an entry
 *	that fails to decode is skipped.  When a limit is exceeded a
 *	description is written to log_buffer and TRUE is returned.
 *	All other resource names are ignored here.
 */
int mom_over_limit(pjob)
    job			*pjob;
{
	char		*id = "mom_over_limit";
	char		*pname;
	int		retval;
	unsigned long	value, num;
	resource	*pres;

	assert(pjob != NULL);
	assert(pjob->ji_wattr[(int)JOB_ATR_resource].at_type == ATR_TYPE_RESC);

	DBPRT(("%s: entered\n", id))

	pres = (resource *)
	    GET_NEXT(pjob->ji_wattr[(int)JOB_ATR_resource].at_val.at_list);
	for ( ; pres != NULL; pres = (resource *)GET_NEXT(pres->rs_link)) {
		assert(pres->rs_defin != NULL);
		pname = pres->rs_defin->rs_name;
		assert(pname != NULL);
		assert(*pname != '\0');

		if (strcmp(pname, "cput") == 0) {
			retval = gettime(pres, &value);
			if (retval != PBSE_NONE)
				continue;
			if ((num = cput_sum(pjob)) > value) {
				sprintf(log_buffer,
					"cput %lu exceeded limit %lu",
					num, value);
				return (TRUE);
			}
		} else if (strcmp(pname, "vmem") == 0) {
			retval = getsize(pres, &value);
			if (retval != PBSE_NONE)
				continue;
			if ((num = mem_sum(pjob)) > value) {
				sprintf(log_buffer,
					"vmem %lu exceeded limit %lu",
					num, value);
				return (TRUE);
			}
		} else if (strcmp(pname, "pvmem") == 0) {
			retval = getsize(pres, &value);
			if (retval != PBSE_NONE)
				continue;
			if (overmem_proc(pjob, value)) {
				sprintf(log_buffer,
					"pvmem exceeded limit %lu", value);
				return (TRUE);
			}
		} else if (strcmp(pname, "walltime") == 0) {
			retval = gettime(pres, &value);
			if (retval != PBSE_NONE)
				continue;
			/* elapsed wall clock time scaled by wallfactor */
			num = (unsigned long)((double)(time_now - pjob->ji_qs.ji_stime) * wallfactor);
			if (num > value) {
				/* both operands are unsigned long: use %lu */
				sprintf(log_buffer,
					"walltime %lu exceeded limit %lu",
					num, value);
				return (TRUE);
			}
		}
	}

	return (FALSE);
}

/*
 * Update the job attribute for resources used.
 *
 *	The first time this is called for a job, set up resource entries for
 *	each resource that can be reported for this machine.  Fill in the
 *	correct values.  Return an error code.
 *
 *	Assumes that the session ID attribute has already been set.
*/int mom_set_use(pjob)    job			*pjob;{	char			*id = "mom_set_use";	resource		*pres;	attribute		*at;	resource_def		*rd;	unsigned long		*lp, lnum;	DBPRT(("%s: entered\n", id))	assert(pjob != NULL);	at = &pjob->ji_wattr[(int)JOB_ATR_resc_used];	assert(at->at_type == ATR_TYPE_RESC);	at->at_flags |= ATR_VFLAG_MODIFY;	if ((at->at_flags & ATR_VFLAG_SET) == 0) {		at->at_flags |= ATR_VFLAG_SET;		rd = find_resc_def(svr_resc_def, "cput", svr_resc_size);		assert(rd != NULL);		pres = add_resource_entry(at, rd);		assert(pres != NULL);		pres->rs_value.at_flags |= ATR_VFLAG_SET;		pres->rs_value.at_type = ATR_TYPE_LONG;		pres->rs_value.at_val.at_long = 0;		rd = find_resc_def(svr_resc_def, "vmem", svr_resc_size);		assert(rd != NULL);		pres = add_resource_entry(at, rd);		assert(pres != NULL);		pres->rs_value.at_flags |= ATR_VFLAG_SET;		pres->rs_value.at_type = ATR_TYPE_SIZE;		pres->rs_value.at_val.at_size.atsv_shift = 10; /* KB */		pres->rs_value.at_val.at_size.atsv_units = ATR_SV_BYTESZ;		pres->rs_value.at_val.at_size.atsv_num = 0;		rd = find_resc_def(svr_resc_def, "walltime", svr_resc_size);		assert(rd != NULL);		pres = add_resource_entry(at, rd);		assert(pres != NULL);		pres->rs_value.at_flags |= ATR_VFLAG_SET;		pres->rs_value.at_type = ATR_TYPE_LONG;		pres->rs_value.at_val.at_long = 0;		rd = find_resc_def(svr_resc_def, "mem", svr_resc_size);		assert(rd != NULL);		pres = add_resource_entry(at, rd);		assert(pres != NULL);		pres->rs_value.at_flags |= ATR_VFLAG_SET;		pres->rs_value.at_type = ATR_TYPE_SIZE;		pres->rs_value.at_val.at_size.atsv_shift = 10; /* KB */		pres->rs_value.at_val.at_size.atsv_units = ATR_SV_BYTESZ;		pres->rs_value.at_val.at_size.atsv_num = 0;	}	rd = find_resc_def(svr_resc_def, "cput", svr_resc_size);	assert(rd != NULL);	pres = find_resc_entry(at, rd);	assert(pres != NULL);	lp = (unsigned long *)&pres->rs_value.at_val.at_long;	lnum = cput_sum(pjob);	*lp = MAX(*lp, lnum);	rd = find_resc_def(svr_resc_def, "vmem", svr_resc_size);	assert(rd != NULL);	pres = 
find_resc_entry(at, rd);	assert(pres != NULL);	lp = &pres->rs_value.at_val.at_size.atsv_num;	lnum = (mem_sum(pjob) + 1023) >> 10;	/* as KB */	*lp = MAX(*lp, lnum);	rd = find_resc_def(svr_resc_def, "walltime", svr_resc_size);	assert(rd != NULL);	pres = find_resc_entry(at, rd);	assert(pres != NULL);	pres->rs_value.at_val.at_long = (long)((double)(time_now - pjob->ji_qs.ji_stime) * wallfactor);	rd = find_resc_def(svr_resc_def, "mem", svr_resc_size);	assert(rd != NULL);	pres = find_resc_entry(at, rd);	assert(pres != NULL);	lp = &pres->rs_value.at_val.at_size.atsv_num;	lnum = (resi_sum(pjob) + 1023) >> 10;	/* as KB */	*lp = MAX(*lp, lnum);	return (PBSE_NONE);}/* *	Kill a task session. *	Call with the job pointer and a signal number. */int kill_task(ptask, sig)    task	*ptask;    int  	sig;{	int	ct = 0;	int	i, err;	int	sesid;	sesid = ptask->ti_qs.ti_sid;	if (sesid <= 1)		return 0;	if ((err = mom_get_sample()) != PBSE_NONE)		return 0;	for (i=0; i<nproc; i++) {		register struct procsinfo	*pp = &proc_tbl[i];		if (pp->pi_state == SNONE)			continue;		if (sesid != pp->pi_sid)			continue;		DBPRT(("kill_task: send signal %d to pid %d\n",			sig, pp->pi_pid))		(void)kill(pp->pi_pid, sig);		++ct;	}	return ct;}/* * Clean up everything related to polling. * */int mom_close_poll(){	DBPRT(("mom_close_poll entered\n"))	if (proc_tbl) {		free(proc_tbl);		proc_tbl = NULL;	}	return (PBSE_NONE);}/* * mom_does_chkpnt - return 1 if mom supports checkpoint *			    0 if not */int mom_does_chkpnt(){	return (0);}/* * Checkpoint the job. * *	If abort is true, kill it too. */int mach_checkpoint(ptask, file, abort)    task	*ptask;    char	*file;    int		abort;{       	return (-1);}/* * Restart the job from the checkpoint file. * *	Return a -1 on error or sid. 
*/long mach_restart(ptask, file)    task	*ptask;    char	*file;{	return (-1);}intkvm_read(fd, addr, buf, size)    int		fd;    long	addr;    char	*buf;    int		size;{	int	ret;	if (lseek(fd, addr, SEEK_SET) != addr)		return -1;	if ((ret = read(fd, buf, size)) == -1)		return -1;	return ret;}intgetproctab(){	static	uint	lastproc = 0;	char		*id = "getproctab";	if (lastproc == reqnum)		/* don't need new proc table */		return nproc;	if (mom_get_sample() != PBSE_NONE)		return 0;	lastproc = reqnum;	return(nproc);}doubledsecs(val)struct	timeval	*val;{        return ( (double)val->tv_sec + (double)val->tv_usec*1e-6 );}char	*cput_job(jobid)pid_t	jobid;{	char		*id = "cput_job";	int		i, nproc;	int		found = 0;	double		cputime, addtime;	if ((nproc = getproctab()) == 0) {		rm_errno = RM_ERR_SYSTEM;		return NULL;	}	cputime = 0;	for (i=0; i<nproc; i++) {		register struct procsinfo	*pp = &proc_tbl[i];		if (pp->pi_state == SNONE)			continue;		if (jobid != pp->pi_sid)			continue;		found = 1;		DBPRT(("%s: pid=%d", id, pp->pi_pid))		if (pp->pi_state == SZOMB) {			DBPRT((" (zombie)"))			addtime = dsecs(&pp->pi_utime) +				dsecs(&pp->pi_stime);		}		else {			DBPRT((" (active)"))			addtime = dsecs(&pp->pi_ru.ru_utime) +				dsecs(&pp->pi_ru.ru_stime) +				dsecs(&pp->pi_cru.ru_utime) +				dsecs(&pp->pi_cru.ru_stime);		}		cputime += addtime;		DBPRT((" %.2f total=%.2f\n", addtime, cputime))	}	if (found) {		sprintf(ret_string, "%.2f", cputime * cputfactor);		return ret_string;	}	rm_errno = RM_ERR_EXIST;	return NULL;}char	*cput_proc(pid)pid_t	pid;{	char			*id = "cput_proc";	int			i, nproc;	int			found = 0;	double			cputime;	if ((nproc = getproctab()) == 0) {		rm_errno = RM_ERR_SYSTEM;		return NULL;	}	for (i=0; i<nproc; i++) {		register struct procsinfo	*pp = &proc_tbl[i];		if (pp->pi_state == SNONE)			continue;		if (pid != pp->pi_pid)			continue;		DBPRT(("%s: pid=%d", id, pp->pi_pid))		if (pp->pi_state == SZOMB) {			DBPRT((" (zombie)"))			cputime = dsecs(&pp->pi_utime) +				dsecs(&pp->pi_stime);		}	
	else {			DBPRT((" (active)"))			cputime = dsecs(&pp->pi_ru.ru_utime) +				dsecs(&pp->pi_ru.ru_stime) +				dsecs(&pp->pi_cru.ru_utime) +				dsecs(&pp->pi_cru.ru_stime);		}		DBPRT((" %.2f\n", cputime))		found = 1;		break;	}	if (found) {		sprintf(ret_string, "%.2f", cputime * cputfactor);		return ret_string;	}	rm_errno = RM_ERR_EXIST;	return NULL;}char	*cput(attrib)struct	rm_attribute	*attrib;{	char			*id = "cput";	int			value;	if (attrib == NULL) {		log_err(-1, id, no_parm);		rm_errno = RM_ERR_NOPARAM;		return NULL;	}	if ((value = atoi(attrib->a_value)) == 0) {		sprintf(log_buffer, "bad param: %s", attrib->a_value);		log_err(-1, id, log_buffer);		rm_errno = RM_ERR_BADPARAM;		return NULL;	}	if (momgetattr(NULL)) {		log_err(-1, id, extra_parm);		rm_errno = RM_ERR_BADPARAM;		return NULL;	}	if (strcmp(attrib->a_qualifier, "session") == 0)		return (cput_job((pid_t)value));	else if (strcmp(attrib->a_qualifier, "proc") == 0)		return (cput_proc((pid_t)value));	else {		rm_errno = RM_ERR_BADPARAM;		return NULL;	}}char	*mem_job(jobid)pid_t	jobid;{	char			*id = "mem_job";	int			i, nproc;	int			memsize;	int			found = 0;	if ((nproc = getproctab()) == 0) {		rm_errno = RM_ERR_SYSTEM;		return NULL;	}	memsize = 0;	for (i=0; i<nproc; i++) {		register struct procsinfo	*pp = &proc_tbl[i];		if (pp->pi_state == SNONE)			continue;		if (jobid != pp->pi_sid)			continue;		found = 1;		memsize += pp->pi_size;		DBPRT(("%s: pid %d memsize %d pi_size %d\n", id, pp->pi_pid,				memsize, pp->pi_size))	}	if (found) {		sprintf(ret_string, "%ukb", ctob(memsize) >> 10); /* KB */		return ret_string;	}	rm_errno = RM_ERR_EXIST;	return NULL;}char	*mem_proc(pid)pid_t	pid;{	char			*id = "mem_proc";	int			i, nproc;	int			memsize;	int			found = 0;	if ((nproc = getproctab()) == 0) {		rm_errno = RM_ERR_SYSTEM;		return NULL;	}	memsize = 0;	for (i=0; i<nproc; i++) {		register struct procsinfo	*pp = &proc_tbl[i];		if (pp->pi_state == SNONE)			continue;		if (pid != pp->pi_pid)			continue;		found = 1;		memsize = 
pp->pi_size;		break;	}	if (found) {		sprintf(ret_string, "%ukb", ctob(memsize) >> 10); /* KB */		return ret_string;	}	rm_errno = RM_ERR_EXIST;	return NULL;}char	*mem(attrib)struct	rm_attribute	*attrib;{	char			*id = "mem";	int			value;	if (attrib == NULL) {		log_err(-1, id, no_parm);		rm_errno = RM_ERR_NOPARAM;		return NULL;	}	if ((value = atoi(attrib->a_value)) == 0) {		sprintf(log_buffer, "bad param: %s", attrib->a_value);		log_err(-1, id, log_buffer);		rm_errno = RM_ERR_BADPARAM;		return NULL;	}	if (momgetattr(NULL)) {		log_err(-1, id, extra_parm);		rm_errno = RM_ERR_BADPARAM;

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?