📄 schedule.c

📁 openPBS的开放源代码
💻 C
📖 第 1 页 / 共 3 页
字号:
     * Allocation and usage information are updated at [roughly] 2:00 AM      * (Eastern time).  Since they may have been updated, attempt to fetch      * them again in the middle of the night.     */    if (schd_NeedToGetDecayInfo)	    schd_decay_info("r");    /* get users' recent past usage */    if (schd_ENFORCE_ALLOCATION && schd_TimeNow >= schd_ENFORCE_ALLOCATION) {	/* 	 * If the allocations file has already been loaded, consult the file	 * timestamp to determine if it has changed.  If so, flag that it	 * needs to be reloaded.	 */	if (!schd_NeedToGetAllocInfo && schd_AllocFilename)	    schd_NeedToGetAllocInfo = 		schd_file_has_changed(schd_AllocFilename, 1);	if (!schd_NeedToGetYTDInfo && schd_CurrentFilename)	    schd_NeedToGetYTDInfo = 		schd_file_has_changed(schd_CurrentFilename, 1);	/* If either file needs to be [re]loaded, do so. */	if (schd_NeedToGetAllocInfo || schd_NeedToGetYTDInfo)	    schd_alloc_info();    }    /*      * We need to save the past usage data periodically, so that a restart      * of pbs_sched doesn't lose it ...     */    if (schd_save_decay())	/* is it time yet ? */	schd_decay_info("w");	/* yep, so do it */    if (schd_SubmitQueue->queue->jobs && 	!(schd_SubmitQueue->queue->flags & (QFLAGS_DISABLED | QFLAGS_STOPPED)))    {	/*	 * Test each job against the set of execution queues.  If it can	 * never be run in any queue, reject it immediately.  This saves	 * the user having to wait for the scheduler to get around to being	 * able to run it.	 */	jobs = reject_unrunnables(schd_SubmitQueue->queue->jobs);	/*	 * Look for queues whose execution hosts are in dedicated time.  If	 * any are found, note that fact and continue.  Otherwise, add them	 * to the normalQs list, which will be scheduled normally.  If the	 * flag is set indicating that one or more hosts is in dedtime, they	 * will be scheduled after everything else is done.	 */	for (qptr = schd_BatchQueues; qptr != NULL; qptr = qptr->next) {	    if (schd_ENFORCE_DEDTIME && schd_TimeNow >= schd_ENFORCE_DEDTIME)		outages = schd_host_outage(qptr->queue->exechost, 0);	    else		outages = NULL;	    /* 	     * Is there a scheduled outage right now for this host?  If so,	     * note that fact and continue to the next queue.  All of this	     * information is cached, so this isn't as expensive as it seems.	     */	    if (outages != NULL) {		if ((outages->beg_time <= schd_TimeNow) &&		    (outages->end_time > schd_TimeNow))		{		    DBPRT(("%s: Host %s is in dedtime (from %s:%s to %s:%s)\n",			id, outages->exechost, 			outages->beg_datestr, outages->beg_timestr,			outages->end_datestr, outages->end_timestr));		    DBPRT(("%s: Queue %s@%s will not be scheduled.\n", id,			qptr->queue->qname, qptr->queue->exechost));		    /* This exechost is in dedicated time, ignore the queue. */		    hosts_in_dedtime ++;		    continue;		} else if (outages->beg_time > schd_TimeNow) {		    /* Upcoming dedtime, but not yet.  Schedule the queue. */		    DBPRT(("%s: Host %s upcoming dedtime (at %s:%s to %s:%s)\n",			id, outages->exechost, 			outages->beg_datestr, outages->beg_timestr,			outages->end_datestr, outages->end_timestr));		}	    }	    /* 	     * This host is not currently in dedicated time.  Add it to the 	     * tail of the list of queues to be scheduled.	     */	    newqlp = (QueueList *)malloc(sizeof(QueueList));	    if (newqlp == NULL) {		(void)sprintf(log_buffer, "malloc(QueueList) for %s@%s failed",		    qptr->queue->qname, qptr->queue->exechost);		log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, 			log_buffer);		DBPRT(("%s: %s\n", id, log_buffer));		if (normalQs)		    schd_free_qlist(normalQs);		return (1);	    }	    newqlp->queue = qptr->queue;	    if (normalQtail)		normalQtail->next = newqlp;	    else		normalQs = newqlp;	    normalQtail = newqlp;	    newqlp->next = NULL;	}	DBPRT(("%s: calling schedule_jobs(", id));	if (normalQs) {	    for (qptr = normalQs; qptr != NULL; qptr = qptr->next)		DBPRT(("%s@%s%s", qptr->queue->qname, qptr->queue->exechost,		    qptr->next ? ", " : ""));	} else {	    DBPRT(("<no batch queues>"));	}	DBPRT((")\n"));	total_ran += ran = schedule_jobs(normalQs, jobs, reason);	if (ran < 0) {	    DBPRT(("Could not run any jobs!\n"));	} else {	    DBPRT(("RAN %d jobs.\n", ran));	}	if (normalQs)	    schd_free_qlist(normalQs);	normalQs = normalQtail = NULL;    }    /*     * If there are any externally-routed queues, schedule any jobs      * that are enqueued in them.     */    for (qptr = schd_ExternQueues; qptr != NULL; qptr = qptr->next) {	if (qptr->queue->queued == 0)	    continue;	(void)sprintf(log_buffer, "Scheduling external queue %s@%s ...",	    qptr->queue->qname, qptr->queue->exechost);	log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, 		log_buffer);	DBPRT(("%s: %s\n", id, log_buffer));	/* 	 * Keep track of the next pointer.  Zero it so that each queue	 * looks like a single queue to schd_pack_queues().	 */	next = qptr->next;	qptr->next = NULL;	ran = schd_pack_queues(qptr->queue->jobs, qptr, reason);	if (ran < 0) {	    (void)sprintf(log_buffer, "sched_pack_queues(%s@%s) failed!",		qptr->queue->qname, qptr->queue->exechost);	    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, 		log_buffer);	    DBPRT(("%s: %s\n", id, log_buffer));	} else {	    DBPRT(("RAN %d jobs on %s@%s.\n", ran, qptr->queue->qname,		qptr->queue->exechost));	    total_ran += ran;	}	/* Replace the zero'd next pointer to rechain the list. */	qptr->next = next;    }    /*     * Now check the dedtime queues with queued jobs for hosts that are      * in dedicated time.  If any are found, comment the jobs appropriately     * and/or schedule them.     */    for (qptr = schd_DedQueues; qptr != NULL; qptr = qptr->next) {	if (qptr->queue->queued == 0)	    continue;	DBPRT(("%s: schd_handle_dedicated_time(%s)\n", id, qptr->queue->qname));	/* 	 * Keep track of the next pointer, and zero the queue's next ptr so	 * it looks like a single queue.	 */	next = qptr->next; 	qptr->next = NULL;	ran = schd_handle_dedicated_time(qptr->queue);	if (ran < 0) {	    (void)sprintf(log_buffer, 		"schd_handle_dedicated_time(%s@%s) failed!", 		qptr->queue->qname, qptr->queue->exechost);	    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, 		log_buffer);	    DBPRT(("%s: %s\n", id, log_buffer));	} else {	    DBPRT(("RAN %d jobs on %s@%s.\n", ran, qptr->queue->qname,		qptr->queue->exechost));	    total_ran += ran;	}	/* Replace the zero'd next pointer to rechain the list. */	qptr->next = next;    }    /*     * Attempt to revoke any unused HPM counters that are still in user     * mode.  Returns number of errors encountered.  This should be zero     * for a healthy system.     */    if (schd_MANAGE_HPM) {	if (schd_revoke_hpm()) {	    (void)sprintf(log_buffer, "Failed to revoke unused HPM counters!");	    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);	    DBPRT(("%s\n", log_buffer));	}    }    if (total_ran > 0) {	(void)sprintf(log_buffer, "System resources after scheduling:");	log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);	schd_dump_rsrclist();    }    (void)sprintf(log_buffer, ">>>  End Scheduling Cycle (ran %d jobs)  <<<",	total_ran);    log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);    DBPRT(("%s\n", log_buffer));    return (1);}static intschedule_jobs(QueueList *queues, Job *jobs, char *reason){    char *id = "schedule_jobs";    int    numran;    Job   *job;    Queue *shortest;    int    priority_to_1st = 1;    /*     * Since the sorting code has provided an order in which the jobs should     * be run, attempt to honor that order by treating the first job on the     * list as our first priority.  This amounts to draining the queue in     * order to run that job, if necessary.     *      * If the job has been waiting too long, find the smallest queue in which     * the job will fit, and consider its expected run time.  If the waiting     * job cannot run when the queue has emptied, then go on to the next.       * However, if there are jobs running on the queue, it is possible that     * this queue could support the waiting job if it were started draining     * now.  When enough jobs had exited, the waiting job would be runnable.     * In order to determine if this is true, walk through the list of jobs,     * which are sorted in order of completion (from soonest to last), and     * find how many resources would be available after that job finished.     * If there is space, calculate what time it will be when that many jobs     * have completed, and see if the primetime limits apply at that time.     * If the job fits in the primetime limits at that time, then start the     * queue draining.  If it will not fit after all jobs have been tested,     * then give up on this queue and go on to the next.     *      * If a queue was found that requires draining, mark it for draining.     *      * After the waiting job handling has completed, collect a list of     * all the available execution queues, and place it into the pointer     * given to this function by the caller.     */    for (job = jobs; job != NULL; job = job->next) {	if (job->state != 'Q')	    continue;	if (!priority_to_1st && !(job->flags & JFLAGS_WAITING))	    continue;	DBPRT(("%s: job %s is %s (eligible for %s, needs %d nodes)\n", id,	    job->jobid, 	    priority_to_1st ? "FIRSTJOB" : 		(job->flags & JFLAGS_PRIORITY) ? "SPECIAL" : "WAITING",	    schd_sec2val(job->eligible), job->nodes));	/*	 * Find the smallest, shortest-wait queue in which this job will	 * fit.  If it is empty, great.  If not, mark it to be drained,	 * in anticipation of the job being run soon.  Note that the queue	 * drain_by time should only be shortened - it doesn't make sense	 * to push it out.	 */	shortest = schd_find_drain(queues, job);	if (shortest) {	    /*	     * If there are no jobs running in the queue, then unset the	     * draining flag (if present), so that the queue will be	     * available for this job.	     *	     * If there are running jobs, set the draining flag, and	     * adjust the empty_by value to be the expected time when	     * the job will first become runnable.	     */	    if (shortest->running == 0) {		shortest->flags &= ~QFLAGS_DRAINING;	    } else {		/* If running jobs, empty_by should be non-zero. */		if (shortest->drain_by <= shortest->empty_by) {		    shortest->flags |= QFLAGS_DRAINING; 		    DBPRT(("%s:	shortest queue %s now draining, drain_by %s",			id, shortest->qname, ctime(&shortest->drain_by)));		}	    }	}	/*	 * We have looked at (and possibly arranged for special treatment	 * of) the first job on the list.  Now only look for special or	 * waiting jobs.	 */	priority_to_1st = 0;    }    numran = schd_pack_queues(jobs, queues, reason);    if (numran < 0) {	(void)sprintf(log_buffer, 	    "sched_pack_queues() failed!");	log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, id, log_buffer);	DBPRT(("%s: %s\n", id, log_buffer));    }    return (numran);}/* * Get information about each of the queues in the list of lists.  If * schd_get_queue_limits() fails, return the error condition.  It may * be a transient or a hard failure, which the caller may want to deal * with.  If all queues are successful, return '0'. */static intget_all_queue_info(int numqlists, ...){    va_list ap;    int    count = 0, ret;    QueueList *list;    QueueList *qptr;        va_start(ap, numqlists);    while (count < numqlists) {	list = va_arg(ap, QueueList *);	for (qptr = list; qptr != NULL; qptr = qptr->next) {	    /*	     * Get the limits, current resources, and any jobs for this	     * queue.	     */	    if ((ret = schd_get_queue_limits(qptr->queue)) != 0) {		DBPRT(("get_all_queue_info: get_queue_limits for %s failed.\n", 		    qptr->queue->qname));		va_end(ap);		return (ret);	    }	    /*	     * Set the queue flags if limits are exceeded.  Don't bother	     * getting a reason string.	     */	    schd_check_queue_limits(qptr->queue, NULL);	}	count ++;    }    va_end(ap);    return (0);}/* * Jobs queued on the special queue should be treated as highest priority. * They are sorted onto the top of the list of jobs that is created in the * usersort.c code.  That sorted list is then split out onto each of the * queues, so that each queue has a list of the jobs it "owns".  The jobs * then carry a backpointer to their owner queue. * * This works really nicely, since all the information about each queue * (including the list of jobs queued/running/etc on it) lives right on  * the Queue structure.  It is a clean, elegant and fully general solution. * * An unfortunate side effect of this "demultiplexing" is that jobs that * were marked "special" end up claimed by the SpecialQueue.  The scheduler * looks for jobs only on the SubmitQueue queue, so it never notices that  * there are special jobs enqueued.  In order to address this, any jobs on * the SpecialQueue are marked "waiting/high priority", and placed at the * head of the list of jobs in the SubmitQueue. * * This seems like an evil hack at first, and it arguably is.  However, if * there are multiple submission queues, it is relatively simple to support * them by simply causing them to be inserted in the submit queue's list. */static intfixup_special(void){    char   *id = "fixup_special";    Job    *job, *submitjobs, *nextjob, *specialtail;    Queue  *queue;    char    comment[MAX_TXT + 1];    int     fixedup = 0, justcomment = 0;    /*     * Sanity check -- this function shouldn't be called if there is no     * valid SpecialQueue.     */    if (schd_SpecialQueue == NULL || schd_SpecialQueue->queue == NULL) {	DBPRT(("%s: special code called but no special queue defined!\n", id));	return (-1);    }    queue = schd_SpecialQueue->queue;    if (queue->jobs == NULL) {	DBPRT(("%s: no jobs on special queue '%s'.  Ignoring.\n", id, 	    queue->qname));	return (0);    }    /*     * See if the special queue has anything to do, and if it will allow     * anything to be done to it.     */    if (queue->flags & QFLAGS_DISABLED) {	(void)sprintf(comment, "Queue %s not enabled", queue->qname);	justcomment ++;    }    if (queue->flags & QFLAGS_STOPPED) {	(void)sprintf(comment, "Queue %s not started", queue->qname);	justcomment ++;    }    /*      * If the jobs on the speical queue should just be commented, do so and     * return 0 -- no jobs were fixed up.     */    if (justcomment) {	for (job = queue->jobs; job != NULL; job = job->next)	    schd_comment_job(job, comment, JOB_COMMENT_REQUIRED);	return 0;    }    /*     * Detach the list of jobs from the SubmitQueue.  They will be tacked     * back onto the end of the list once the special jobs have been moved     * to the head.     */    submitjobs = schd_SubmitQueue->queue->jobs;
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -